1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 #include "amdgpu_cs.h"
29
30 #include "util/os_time.h"
31 #include "util/u_hash_table.h"
32 #include "state_tracker/drm_driver.h"
33 #include <amdgpu_drm.h>
34 #include <xf86drm.h>
35 #include <stdio.h>
36 #include <inttypes.h>
37
38 #ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID
39 #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6)
40 #endif
41
42 #ifndef AMDGPU_VA_RANGE_HIGH
43 #define AMDGPU_VA_RANGE_HIGH 0x2
44 #endif
45
46 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
47 #define DEBUG_SPARSE_COMMITS 0
48
49 struct amdgpu_sparse_backing_chunk {
50 uint32_t begin, end;
51 };
52
53 static struct pb_buffer *
54 amdgpu_bo_create(struct radeon_winsys *rws,
55 uint64_t size,
56 unsigned alignment,
57 enum radeon_bo_domain domain,
58 enum radeon_bo_flag flags);
59 static void amdgpu_bo_unmap(struct pb_buffer *buf);
60
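/* Return true if the buffer is idle for the requested usage, waiting up to
 * "timeout" nanoseconds. A timeout of 0 only polls: pending ioctls or
 * unsignalled fences make it return false immediately. Shared buffers fall
 * back to amdgpu_bo_wait_for_idle, because user fences only cover this
 * process.
 */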
61 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
62 enum radeon_bo_usage usage)
63 {
64 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
65 struct amdgpu_winsys *ws = bo->ws;
66 int64_t abs_timeout;
67
68 if (timeout == 0) {
69 if (p_atomic_read(&bo->num_active_ioctls))
70 return false;
71
72 } else {
73 abs_timeout = os_time_get_absolute_timeout(timeout);
74
75 /* Wait if any ioctl is being submitted with this buffer. */
76 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
77 return false;
78 }
79
80 if (bo->is_shared) {
81 /* We can't use user fences for shared buffers, because user fences
82 * are local to this process only. If we want to wait for all buffer
83 * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
84 */
85 bool buffer_busy = true;
86 int r;
87
88 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
89 if (r)
90 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
91 r);
92 return !buffer_busy;
93 }
94
95 if (timeout == 0) {
96 unsigned idle_fences;
97 bool buffer_idle;
98
99 simple_mtx_lock(&ws->bo_fence_lock);
100
101 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
102 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
103 break;
104 }
105
106 /* Release the idle fences to avoid checking them again later. */
107 for (unsigned i = 0; i < idle_fences; ++i)
108 amdgpu_fence_reference(&bo->fences[i], NULL);
109
110 memmove(&bo->fences[0], &bo->fences[idle_fences],
111 (bo->num_fences - idle_fences) * sizeof(*bo->fences));
112 bo->num_fences -= idle_fences;
113
114 buffer_idle = !bo->num_fences;
115 simple_mtx_unlock(&ws->bo_fence_lock);
116
117 return buffer_idle;
118 } else {
119 bool buffer_idle = true;
120
121 simple_mtx_lock(&ws->bo_fence_lock);
122 while (bo->num_fences && buffer_idle) {
123 struct pipe_fence_handle *fence = NULL;
124 bool fence_idle = false;
125
126 amdgpu_fence_reference(&fence, bo->fences[0]);
127
128 /* Wait for the fence. */
129 simple_mtx_unlock(&ws->bo_fence_lock);
130 if (amdgpu_fence_wait(fence, abs_timeout, true))
131 fence_idle = true;
132 else
133 buffer_idle = false;
134 simple_mtx_lock(&ws->bo_fence_lock);
135
136 /* Release an idle fence to avoid checking it again later, keeping in
137 * mind that the fence array may have been modified by other threads.
138 */
139 if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
140 amdgpu_fence_reference(&bo->fences[0], NULL);
141 memmove(&bo->fences[0], &bo->fences[1],
142 (bo->num_fences - 1) * sizeof(*bo->fences));
143 bo->num_fences--;
144 }
145
146 amdgpu_fence_reference(&fence, NULL);
147 }
148 simple_mtx_unlock(&ws->bo_fence_lock);
149
150 return buffer_idle;
151 }
152 }
153
154 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
155 struct pb_buffer *buf)
156 {
157 return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
158 }
159
160 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
161 {
162 for (unsigned i = 0; i < bo->num_fences; ++i)
163 amdgpu_fence_reference(&bo->fences[i], NULL);
164
165 FREE(bo->fences);
166 bo->num_fences = 0;
167 bo->max_fences = 0;
168 }
169
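/* Destroy a real (non-slab, non-sparse) buffer: drop the CPU mapping if we
 * own it, remove the buffer from the global list and the export table, unmap
 * and free its VA range, release the kernel handle, and update the memory
 * accounting.
 */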
170 void amdgpu_bo_destroy(struct pb_buffer *_buf)
171 {
172 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
173 struct amdgpu_winsys *ws = bo->ws;
174
175 assert(bo->bo && "must not be called for slab entries");
176
177 if (!bo->is_user_ptr && bo->cpu_ptr) {
178 bo->cpu_ptr = NULL;
179 amdgpu_bo_unmap(&bo->base);
180 }
181 assert(bo->is_user_ptr || bo->u.real.map_count == 0);
182
183 if (ws->debug_all_bos) {
184 simple_mtx_lock(&ws->global_bo_list_lock);
185 LIST_DEL(&bo->u.real.global_list_item);
186 ws->num_buffers--;
187 simple_mtx_unlock(&ws->global_bo_list_lock);
188 }
189
190 simple_mtx_lock(&ws->bo_export_table_lock);
191 util_hash_table_remove(ws->bo_export_table, bo->bo);
192 simple_mtx_unlock(&ws->bo_export_table_lock);
193
194 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
195 amdgpu_va_range_free(bo->u.real.va_handle);
196 amdgpu_bo_free(bo->bo);
197
198 amdgpu_bo_remove_fences(bo);
199
200 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
201 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
202 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
203 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
204
205 simple_mtx_destroy(&bo->lock);
206 FREE(bo);
207 }
208
209 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
210 {
211 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
212
213 assert(bo->bo); /* slab buffers have a separate vtbl */
214
215 if (bo->u.real.use_reusable_pool)
216 pb_cache_add_buffer(&bo->u.real.cache_entry);
217 else
218 amdgpu_bo_destroy(_buf);
219 }
220
221 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
222 {
223 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
224 pb_slabs_reclaim(&ws->bo_slabs[i]);
225
226 pb_cache_release_all_buffers(&ws->bo_cache);
227 }
228
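/* Perform the actual CPU mapping of a real buffer. On failure, the buffer
 * managers are flushed once and the mapping is retried. The first successful
 * map of a buffer updates the mapped_vram/mapped_gtt statistics.
 */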
229 static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
230 {
231 assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
232 int r = amdgpu_bo_cpu_map(bo->bo, cpu);
233 if (r) {
234 /* Clean up buffer managers and try again. */
235 amdgpu_clean_up_buffer_managers(bo->ws);
236 r = amdgpu_bo_cpu_map(bo->bo, cpu);
237 if (r)
238 return false;
239 }
240
241 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
242 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
243 bo->ws->mapped_vram += bo->base.size;
244 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
245 bo->ws->mapped_gtt += bo->base.size;
246 bo->ws->num_mapped_buffers++;
247 }
248
249 return true;
250 }
251
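/* Map a buffer for CPU access. Synchronization overview:
 *  - PIPE_TRANSFER_UNSYNCHRONIZED: map right away without waiting.
 *  - PIPE_TRANSFER_DONTBLOCK: if the buffer is busy, kick off an async flush
 *    and return NULL instead of waiting.
 *  - otherwise: flush the CS if it references the buffer and wait for the GPU
 *    (only for writers when mapping for read, for all usage when mapping for
 *    write).
 * Slab entries are mapped through their parent buffer at the proper offset.
 */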
252 static void *amdgpu_bo_map(struct pb_buffer *buf,
253 struct radeon_cmdbuf *rcs,
254 enum pipe_transfer_usage usage)
255 {
256 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
257 struct amdgpu_winsys_bo *real;
258 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
259
260 assert(!bo->sparse);
261
262 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
263 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
264 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
265 if (usage & PIPE_TRANSFER_DONTBLOCK) {
266 if (!(usage & PIPE_TRANSFER_WRITE)) {
267 /* Mapping for read.
268 *
269 * Since we are mapping for read, we don't need to wait
270 * if the GPU is using the buffer for read too
271 * (neither one is changing it).
272 *
273 * Only check whether the buffer is being used for write. */
274 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
275 RADEON_USAGE_WRITE)) {
276 cs->flush_cs(cs->flush_data,
277 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
278 return NULL;
279 }
280
281 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
282 RADEON_USAGE_WRITE)) {
283 return NULL;
284 }
285 } else {
286 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
287 cs->flush_cs(cs->flush_data,
288 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
289 return NULL;
290 }
291
292 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
293 RADEON_USAGE_READWRITE)) {
294 return NULL;
295 }
296 }
297 } else {
298 uint64_t time = os_time_get_nano();
299
300 if (!(usage & PIPE_TRANSFER_WRITE)) {
301 /* Mapping for read.
302 *
303 * Since we are mapping for read, we don't need to wait
304 * if the GPU is using the buffer for read too
305 * (neither one is changing it).
306 *
307 * Only check whether the buffer is being used for write. */
308 if (cs) {
309 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
310 RADEON_USAGE_WRITE)) {
311 cs->flush_cs(cs->flush_data,
312 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
313 } else {
314 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
315 if (p_atomic_read(&bo->num_active_ioctls))
316 amdgpu_cs_sync_flush(rcs);
317 }
318 }
319
320 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
321 RADEON_USAGE_WRITE);
322 } else {
323 /* Mapping for write. */
324 if (cs) {
325 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
326 cs->flush_cs(cs->flush_data,
327 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
328 } else {
329 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
330 if (p_atomic_read(&bo->num_active_ioctls))
331 amdgpu_cs_sync_flush(rcs);
332 }
333 }
334
335 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
336 RADEON_USAGE_READWRITE);
337 }
338
339 bo->ws->buffer_wait_time += os_time_get_nano() - time;
340 }
341 }
342
343 /* Buffer synchronization has been checked, now actually map the buffer. */
344 void *cpu = NULL;
345 uint64_t offset = 0;
346
347 if (bo->bo) {
348 real = bo;
349 } else {
350 real = bo->u.slab.real;
351 offset = bo->va - real->va;
352 }
353
354 if (usage & RADEON_TRANSFER_TEMPORARY) {
355 if (real->is_user_ptr) {
356 cpu = real->cpu_ptr;
357 } else {
358 if (!amdgpu_bo_do_map(real, &cpu))
359 return NULL;
360 }
361 } else {
362 cpu = p_atomic_read(&real->cpu_ptr);
363 if (!cpu) {
364 simple_mtx_lock(&real->lock);
365 /* Must re-check due to the possibility of a race. Re-check need not
366 * be atomic thanks to the lock. */
367 cpu = real->cpu_ptr;
368 if (!cpu) {
369 if (!amdgpu_bo_do_map(real, &cpu)) {
370 simple_mtx_unlock(&real->lock);
371 return NULL;
372 }
373 p_atomic_set(&real->cpu_ptr, cpu);
374 }
375 simple_mtx_unlock(&real->lock);
376 }
377 }
378
379 return (uint8_t*)cpu + offset;
380 }
381
382 static void amdgpu_bo_unmap(struct pb_buffer *buf)
383 {
384 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
385 struct amdgpu_winsys_bo *real;
386
387 assert(!bo->sparse);
388
389 if (bo->is_user_ptr)
390 return;
391
392 real = bo->bo ? bo : bo->u.slab.real;
393 assert(real->u.real.map_count != 0 && "too many unmaps");
394 if (p_atomic_dec_zero(&real->u.real.map_count)) {
395 assert(!real->cpu_ptr &&
396 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
397
398 if (real->initial_domain & RADEON_DOMAIN_VRAM)
399 real->ws->mapped_vram -= real->base.size;
400 else if (real->initial_domain & RADEON_DOMAIN_GTT)
401 real->ws->mapped_gtt -= real->base.size;
402 real->ws->num_mapped_buffers--;
403 }
404
405 amdgpu_bo_cpu_unmap(real->bo);
406 }
407
408 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
409 amdgpu_bo_destroy_or_cache
410 /* other functions are never called */
411 };
412
413 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
414 {
415 struct amdgpu_winsys *ws = bo->ws;
416
417 assert(bo->bo);
418
419 if (ws->debug_all_bos) {
420 simple_mtx_lock(&ws->global_bo_list_lock);
421 LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list);
422 ws->num_buffers++;
423 simple_mtx_unlock(&ws->global_bo_list_lock);
424 }
425 }
426
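/* Allocate a new real buffer: create the kernel BO, allocate a GPU VA range
 * (padded by a gap when VM checking is enabled), map the VA, and update the
 * VRAM/GTT accounting. "heap" >= 0 links the buffer to the pb_cache bucket it
 * can later be returned to.
 */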
427 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
428 uint64_t size,
429 unsigned alignment,
430 enum radeon_bo_domain initial_domain,
431 unsigned flags,
432 int heap)
433 {
434 struct amdgpu_bo_alloc_request request = {0};
435 amdgpu_bo_handle buf_handle;
436 uint64_t va = 0;
437 struct amdgpu_winsys_bo *bo;
438 amdgpu_va_handle va_handle;
439 unsigned va_gap_size;
440 int r;
441
442 /* VRAM or GTT must be specified, but not both at the same time. */
443 assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);
444
445 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
446 if (!bo) {
447 return NULL;
448 }
449
450 if (heap >= 0) {
451 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
452 heap);
453 }
454 request.alloc_size = size;
455 request.phys_alignment = alignment;
456
457 if (initial_domain & RADEON_DOMAIN_VRAM)
458 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
459 if (initial_domain & RADEON_DOMAIN_GTT)
460 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
461
462 /* VRAM and GTT have almost the same performance on APUs, so we could just
463 * set GTT. However, to reduce GTT (system RAM) usage, which is shared with
464 * the OS, allow VRAM placements too. The point is not that VRAM is faster
465 * here, but that carve-out VRAM would otherwise sit unused and wasted.
466 */
467 if (!ws->info.has_dedicated_vram)
468 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
469
470 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
471 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
472 if (flags & RADEON_FLAG_GTT_WC)
473 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
474 if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
475 ws->info.has_local_buffers)
476 request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
477 if (ws->zero_all_vram_allocs &&
478 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
479 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
480
481 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
482 if (r) {
483 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
484 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
485 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
486 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
487 goto error_bo_alloc;
488 }
489
490 va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
491 if (size > ws->info.pte_fragment_size)
492 alignment = MAX2(alignment, ws->info.pte_fragment_size);
493 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
494 size + va_gap_size, alignment, 0, &va, &va_handle,
495 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
496 AMDGPU_VA_RANGE_HIGH);
497 if (r)
498 goto error_va_alloc;
499
500 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
501 AMDGPU_VM_PAGE_EXECUTABLE;
502
503 if (!(flags & RADEON_FLAG_READ_ONLY))
504 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
505
506 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
507 AMDGPU_VA_OP_MAP);
508 if (r)
509 goto error_va_map;
510
511 simple_mtx_init(&bo->lock, mtx_plain);
512 pipe_reference_init(&bo->base.reference, 1);
513 bo->base.alignment = alignment;
514 bo->base.usage = 0;
515 bo->base.size = size;
516 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
517 bo->ws = ws;
518 bo->bo = buf_handle;
519 bo->va = va;
520 bo->u.real.va_handle = va_handle;
521 bo->initial_domain = initial_domain;
522 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
523 bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID);
524
525 if (initial_domain & RADEON_DOMAIN_VRAM)
526 ws->allocated_vram += align64(size, ws->info.gart_page_size);
527 else if (initial_domain & RADEON_DOMAIN_GTT)
528 ws->allocated_gtt += align64(size, ws->info.gart_page_size);
529
530 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
531
532 amdgpu_add_buffer_to_global_list(bo);
533
534 return bo;
535
536 error_va_map:
537 amdgpu_va_range_free(va_handle);
538
539 error_va_alloc:
540 amdgpu_bo_free(buf_handle);
541
542 error_bo_alloc:
543 FREE(bo);
544 return NULL;
545 }
546
547 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
548 {
549 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
550
551 if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
552 return false;
553 }
554
555 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
556 }
557
558 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
559 {
560 struct amdgpu_winsys_bo *bo = NULL; /* dummy pointer for container_of */
561 bo = container_of(entry, bo, u.slab.entry);
562
563 return amdgpu_bo_can_reclaim(&bo->base);
564 }
565
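/* Pick the slab allocator responsible for the given entry size: the first one
 * whose largest order still fits "size". For example, with min_order = 8 and
 * num_orders = 4 (hypothetical values), the allocator covers entries up to
 * 1 << (8 + 4 - 1) = 2048 bytes.
 */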
566 static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
567 {
568 /* Find the correct slab allocator for the given size. */
569 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
570 struct pb_slabs *slabs = &ws->bo_slabs[i];
571
572 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
573 return slabs;
574 }
575
576 assert(0);
577 return NULL;
578 }
579
580 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
581 {
582 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
583
584 assert(!bo->bo);
585
586 pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry);
587 }
588
589 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
590 amdgpu_bo_slab_destroy
591 /* other functions are never called */
592 };
593
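/* Create a new slab for the given heap. The backing buffer is sized at twice
 * the largest possible entry size (see the loop below) and is then carved
 * into fixed-size entries that alias the parent buffer's storage and VA
 * range.
 */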
594 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
595 unsigned entry_size,
596 unsigned group_index)
597 {
598 struct amdgpu_winsys *ws = priv;
599 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
600 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
601 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
602 uint32_t base_id;
603 unsigned slab_size = 0;
604
605 if (!slab)
606 return NULL;
607
608 /* Determine the slab buffer size. */
609 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
610 struct pb_slabs *slabs = &ws->bo_slabs[i];
611 unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1);
612
613 if (entry_size <= max_entry_size) {
614 /* The slab size is twice the size of the largest possible entry. */
615 slab_size = max_entry_size * 2;
616 }
617 }
618 assert(slab_size != 0);
619
620 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
621 slab_size, slab_size,
622 domains, flags));
623 if (!slab->buffer)
624 goto fail;
625
626 slab->base.num_entries = slab->buffer->base.size / entry_size;
627 slab->base.num_free = slab->base.num_entries;
628 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
629 if (!slab->entries)
630 goto fail_buffer;
631
632 LIST_INITHEAD(&slab->base.free);
633
634 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
635
636 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
637 struct amdgpu_winsys_bo *bo = &slab->entries[i];
638
639 simple_mtx_init(&bo->lock, mtx_plain);
640 bo->base.alignment = entry_size;
641 bo->base.usage = slab->buffer->base.usage;
642 bo->base.size = entry_size;
643 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
644 bo->ws = ws;
645 bo->va = slab->buffer->va + i * entry_size;
646 bo->initial_domain = domains;
647 bo->unique_id = base_id + i;
648 bo->u.slab.entry.slab = &slab->base;
649 bo->u.slab.entry.group_index = group_index;
650
651 if (slab->buffer->bo) {
652 /* The slab is not suballocated. */
653 bo->u.slab.real = slab->buffer;
654 } else {
655 /* The slab is allocated out of a bigger slab. */
656 bo->u.slab.real = slab->buffer->u.slab.real;
657 assert(bo->u.slab.real->bo);
658 }
659
660 LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
661 }
662
663 return &slab->base;
664
665 fail_buffer:
666 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
667 fail:
668 FREE(slab);
669 return NULL;
670 }
671
672 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
673 {
674 struct amdgpu_slab *slab = amdgpu_slab(pslab);
675
676 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
677 amdgpu_bo_remove_fences(&slab->entries[i]);
678 simple_mtx_destroy(&slab->entries[i].lock);
679 }
680
681 FREE(slab->entries);
682 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
683 FREE(slab);
684 }
685
686 #if DEBUG_SPARSE_COMMITS
687 static void
688 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
689 {
690 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
691 "Commitments:\n",
692 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
693
694 struct amdgpu_sparse_backing *span_backing = NULL;
695 uint32_t span_first_backing_page = 0;
696 uint32_t span_first_va_page = 0;
697 uint32_t va_page = 0;
698
699 for (;;) {
700 struct amdgpu_sparse_backing *backing = 0;
701 uint32_t backing_page = 0;
702
703 if (va_page < bo->u.sparse.num_va_pages) {
704 backing = bo->u.sparse.commitments[va_page].backing;
705 backing_page = bo->u.sparse.commitments[va_page].page;
706 }
707
708 if (span_backing &&
709 (backing != span_backing ||
710 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
711 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
712 span_first_va_page, va_page - 1, span_backing,
713 span_first_backing_page,
714 span_first_backing_page + (va_page - span_first_va_page) - 1);
715
716 span_backing = NULL;
717 }
718
719 if (va_page >= bo->u.sparse.num_va_pages)
720 break;
721
722 if (backing && !span_backing) {
723 span_backing = backing;
724 span_first_backing_page = backing_page;
725 span_first_va_page = va_page;
726 }
727
728 va_page++;
729 }
730
731 fprintf(stderr, "Backing:\n");
732
733 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
734 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
735 for (unsigned i = 0; i < backing->num_chunks; ++i)
736 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
737 }
738 }
739 #endif
740
741 /*
742 * Attempt to allocate the given number of backing pages. Fewer pages may be
743 * allocated (depending on the fragmentation of existing backing buffers),
744 * which will be reflected by a change to *pnum_pages.
745 */
746 static struct amdgpu_sparse_backing *
747 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
748 {
749 struct amdgpu_sparse_backing *best_backing;
750 unsigned best_idx;
751 uint32_t best_num_pages;
752
753 best_backing = NULL;
754 best_idx = 0;
755 best_num_pages = 0;
756
757 /* This is a very simple and inefficient best-fit algorithm. */
758 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
759 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
760 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
761 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
762 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
763 best_backing = backing;
764 best_idx = idx;
765 best_num_pages = cur_num_pages;
766 }
767 }
768 }
769
770 /* Allocate a new backing buffer if necessary. */
771 if (!best_backing) {
772 struct pb_buffer *buf;
773 uint64_t size;
774 uint32_t pages;
775
776 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
777 if (!best_backing)
778 return NULL;
779
780 best_backing->max_chunks = 4;
781 best_backing->chunks = CALLOC(best_backing->max_chunks,
782 sizeof(*best_backing->chunks));
783 if (!best_backing->chunks) {
784 FREE(best_backing);
785 return NULL;
786 }
787
788 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
789
790 size = MIN3(bo->base.size / 16,
791 8 * 1024 * 1024,
792 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
793 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
794
795 buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
796 bo->initial_domain,
797 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
798 if (!buf) {
799 FREE(best_backing->chunks);
800 FREE(best_backing);
801 return NULL;
802 }
803
804 /* We might have gotten a bigger buffer than requested via caching. */
805 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
806
807 best_backing->bo = amdgpu_winsys_bo(buf);
808 best_backing->num_chunks = 1;
809 best_backing->chunks[0].begin = 0;
810 best_backing->chunks[0].end = pages;
811
812 list_add(&best_backing->list, &bo->u.sparse.backing);
813 bo->u.sparse.num_backing_pages += pages;
814
815 best_idx = 0;
816 best_num_pages = pages;
817 }
818
819 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
820 *pstart_page = best_backing->chunks[best_idx].begin;
821 best_backing->chunks[best_idx].begin += *pnum_pages;
822
823 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
824 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
825 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
826 best_backing->num_chunks--;
827 }
828
829 return best_backing;
830 }
831
832 static void
833 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
834 struct amdgpu_sparse_backing *backing)
835 {
836 struct amdgpu_winsys *ws = backing->bo->ws;
837
838 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
839
840 simple_mtx_lock(&ws->bo_fence_lock);
841 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
842 simple_mtx_unlock(&ws->bo_fence_lock);
843
844 list_del(&backing->list);
845 amdgpu_winsys_bo_reference(&backing->bo, NULL);
846 FREE(backing->chunks);
847 FREE(backing);
848 }
849
850 /*
851 * Return a range of pages from the given backing buffer back into the
852 * free structure.
853 */
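/* The chunk list is kept sorted and non-overlapping. The freed range is
 * merged into the preceding chunk, merged into the following chunk (possibly
 * joining both), or inserted as a new chunk, growing the chunk array if
 * needed. A backing buffer whose single chunk covers all of its pages is
 * freed entirely.
 */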
854 static bool
855 sparse_backing_free(struct amdgpu_winsys_bo *bo,
856 struct amdgpu_sparse_backing *backing,
857 uint32_t start_page, uint32_t num_pages)
858 {
859 uint32_t end_page = start_page + num_pages;
860 unsigned low = 0;
861 unsigned high = backing->num_chunks;
862
863 /* Find the first chunk with begin >= start_page. */
864 while (low < high) {
865 unsigned mid = low + (high - low) / 2;
866
867 if (backing->chunks[mid].begin >= start_page)
868 high = mid;
869 else
870 low = mid + 1;
871 }
872
873 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
874 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
875
876 if (low > 0 && backing->chunks[low - 1].end == start_page) {
877 backing->chunks[low - 1].end = end_page;
878
879 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
880 backing->chunks[low - 1].end = backing->chunks[low].end;
881 memmove(&backing->chunks[low], &backing->chunks[low + 1],
882 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
883 backing->num_chunks--;
884 }
885 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
886 backing->chunks[low].begin = start_page;
887 } else {
888 if (backing->num_chunks >= backing->max_chunks) {
889 unsigned new_max_chunks = 2 * backing->max_chunks;
890 struct amdgpu_sparse_backing_chunk *new_chunks =
891 REALLOC(backing->chunks,
892 sizeof(*backing->chunks) * backing->max_chunks,
893 sizeof(*backing->chunks) * new_max_chunks);
894 if (!new_chunks)
895 return false;
896
897 backing->max_chunks = new_max_chunks;
898 backing->chunks = new_chunks;
899 }
900
901 memmove(&backing->chunks[low + 1], &backing->chunks[low],
902 sizeof(*backing->chunks) * (backing->num_chunks - low));
903 backing->chunks[low].begin = start_page;
904 backing->chunks[low].end = end_page;
905 backing->num_chunks++;
906 }
907
908 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
909 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
910 sparse_free_backing_buffer(bo, backing);
911
912 return true;
913 }
914
915 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
916 {
917 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
918 int r;
919
920 assert(!bo->bo && bo->sparse);
921
922 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
923 (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
924 bo->va, 0, AMDGPU_VA_OP_CLEAR);
925 if (r) {
926 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
927 }
928
929 while (!list_empty(&bo->u.sparse.backing)) {
930 struct amdgpu_sparse_backing *dummy = NULL; /* dummy pointer for container_of */
931 sparse_free_backing_buffer(bo,
932 container_of(bo->u.sparse.backing.next,
933 dummy, list));
934 }
935
936 amdgpu_va_range_free(bo->u.sparse.va_handle);
937 FREE(bo->u.sparse.commitments);
938 simple_mtx_destroy(&bo->lock);
939 FREE(bo);
940 }
941
942 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
943 amdgpu_bo_sparse_destroy
944 /* other functions are never called */
945 };
946
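/* Create a sparse buffer. Only a PRT-mapped VA range is reserved here; no
 * memory is committed yet. Physical backing pages are attached and detached
 * later through the buffer_commit hook (amdgpu_bo_sparse_commit).
 */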
947 static struct pb_buffer *
948 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
949 enum radeon_bo_domain domain,
950 enum radeon_bo_flag flags)
951 {
952 struct amdgpu_winsys_bo *bo;
953 uint64_t map_size;
954 uint64_t va_gap_size;
955 int r;
956
957 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
958 * that exceed this limit. This is not really a restriction: we don't have
959 * that much virtual address space anyway.
960 */
961 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
962 return NULL;
963
964 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
965 if (!bo)
966 return NULL;
967
968 simple_mtx_init(&bo->lock, mtx_plain);
969 pipe_reference_init(&bo->base.reference, 1);
970 bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
971 bo->base.size = size;
972 bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
973 bo->ws = ws;
974 bo->initial_domain = domain;
975 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
976 bo->sparse = true;
977 bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
978
979 bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
980 bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
981 sizeof(*bo->u.sparse.commitments));
982 if (!bo->u.sparse.commitments)
983 goto error_alloc_commitments;
984
985 LIST_INITHEAD(&bo->u.sparse.backing);
986
987 /* For simplicity, we always map a multiple of the page size. */
988 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
989 va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
990 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
991 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
992 0, &bo->va, &bo->u.sparse.va_handle,
993 AMDGPU_VA_RANGE_HIGH);
994 if (r)
995 goto error_va_alloc;
996
997 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
998 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
999 if (r)
1000 goto error_va_map;
1001
1002 return &bo->base;
1003
1004 error_va_map:
1005 amdgpu_va_range_free(bo->u.sparse.va_handle);
1006 error_va_alloc:
1007 FREE(bo->u.sparse.commitments);
1008 error_alloc_commitments:
1009 simple_mtx_destroy(&bo->lock);
1010 FREE(bo);
1011 return NULL;
1012 }
1013
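/* Commit or decommit a page-aligned range of a sparse buffer. Committing
 * fills uncommitted spans with chunks of backing memory and maps them with
 * AMDGPU_VA_OP_REPLACE; decommitting replaces the range with a plain PRT
 * mapping and returns the backing pages to their free lists.
 */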
1014 static bool
1015 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
1016 bool commit)
1017 {
1018 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1019 struct amdgpu_sparse_commitment *comm;
1020 uint32_t va_page, end_va_page;
1021 bool ok = true;
1022 int r;
1023
1024 assert(bo->sparse);
1025 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1026 assert(offset <= bo->base.size);
1027 assert(size <= bo->base.size - offset);
1028 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
1029
1030 comm = bo->u.sparse.commitments;
1031 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1032 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1033
1034 simple_mtx_lock(&bo->lock);
1035
1036 #if DEBUG_SPARSE_COMMITS
1037 sparse_dump(bo, __func__);
1038 #endif
1039
1040 if (commit) {
1041 while (va_page < end_va_page) {
1042 uint32_t span_va_page;
1043
1044 /* Skip pages that are already committed. */
1045 if (comm[va_page].backing) {
1046 va_page++;
1047 continue;
1048 }
1049
1050 /* Determine length of uncommitted span. */
1051 span_va_page = va_page;
1052 while (va_page < end_va_page && !comm[va_page].backing)
1053 va_page++;
1054
1055 /* Fill the uncommitted span with chunks of backing memory. */
1056 while (span_va_page < va_page) {
1057 struct amdgpu_sparse_backing *backing;
1058 uint32_t backing_start, backing_size;
1059
1060 backing_size = va_page - span_va_page;
1061 backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
1062 if (!backing) {
1063 ok = false;
1064 goto out;
1065 }
1066
1067 r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
1068 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1069 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1070 bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1071 AMDGPU_VM_PAGE_READABLE |
1072 AMDGPU_VM_PAGE_WRITEABLE |
1073 AMDGPU_VM_PAGE_EXECUTABLE,
1074 AMDGPU_VA_OP_REPLACE);
1075 if (r) {
1076 ok = sparse_backing_free(bo, backing, backing_start, backing_size);
1077 assert(ok && "sufficient memory should already be allocated");
1078
1079 ok = false;
1080 goto out;
1081 }
1082
1083 while (backing_size) {
1084 comm[span_va_page].backing = backing;
1085 comm[span_va_page].page = backing_start;
1086 span_va_page++;
1087 backing_start++;
1088 backing_size--;
1089 }
1090 }
1091 }
1092 } else {
1093 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1094 (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1095 bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1096 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1097 if (r) {
1098 ok = false;
1099 goto out;
1100 }
1101
1102 while (va_page < end_va_page) {
1103 struct amdgpu_sparse_backing *backing;
1104 uint32_t backing_start;
1105 uint32_t span_pages;
1106
1107 /* Skip pages that are already uncommitted. */
1108 if (!comm[va_page].backing) {
1109 va_page++;
1110 continue;
1111 }
1112
1113 /* Group contiguous spans of pages. */
1114 backing = comm[va_page].backing;
1115 backing_start = comm[va_page].page;
1116 comm[va_page].backing = NULL;
1117
1118 span_pages = 1;
1119 va_page++;
1120
1121 while (va_page < end_va_page &&
1122 comm[va_page].backing == backing &&
1123 comm[va_page].page == backing_start + span_pages) {
1124 comm[va_page].backing = NULL;
1125 va_page++;
1126 span_pages++;
1127 }
1128
1129 if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
1130 /* Couldn't allocate the chunk tracking structures, so the backing memory has to leak. */
1131 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1132 ok = false;
1133 }
1134 }
1135 }
1136 out:
1137
1138 simple_mtx_unlock(&bo->lock);
1139
1140 return ok;
1141 }
1142
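/* Convert the 3-bit TILE_SPLIT field used in the tiling flags to a byte count
 * (64..4096) and back (eg_tile_split_rev).
 */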
1143 static unsigned eg_tile_split(unsigned tile_split)
1144 {
1145 switch (tile_split) {
1146 case 0: tile_split = 64; break;
1147 case 1: tile_split = 128; break;
1148 case 2: tile_split = 256; break;
1149 case 3: tile_split = 512; break;
1150 default:
1151 case 4: tile_split = 1024; break;
1152 case 5: tile_split = 2048; break;
1153 case 6: tile_split = 4096; break;
1154 }
1155 return tile_split;
1156 }
1157
1158 static unsigned eg_tile_split_rev(unsigned eg_tile_split)
1159 {
1160 switch (eg_tile_split) {
1161 case 64: return 0;
1162 case 128: return 1;
1163 case 256: return 2;
1164 case 512: return 3;
1165 default:
1166 case 1024: return 4;
1167 case 2048: return 5;
1168 case 4096: return 6;
1169 }
1170 }
1171
1172 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
1173 struct radeon_bo_metadata *md)
1174 {
1175 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1176 struct amdgpu_bo_info info = {0};
1177 uint64_t tiling_flags;
1178 int r;
1179
1180 assert(bo->bo && "must not be called for slab entries");
1181
1182 r = amdgpu_bo_query_info(bo->bo, &info);
1183 if (r)
1184 return;
1185
1186 tiling_flags = info.metadata.tiling_info;
1187
1188 if (bo->ws->info.chip_class >= GFX9) {
1189 md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
1190 } else {
1191 md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
1192 md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;
1193
1194 if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
1195 md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
1196 else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
1197 md->u.legacy.microtile = RADEON_LAYOUT_TILED;
1198
1199 md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
1200 md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
1201 md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
1202 md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
1203 md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
1204 md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
1205 md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
1206 }
1207
1208 md->size_metadata = info.metadata.size_metadata;
1209 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1210 }
1211
1212 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
1213 struct radeon_bo_metadata *md)
1214 {
1215 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1216 struct amdgpu_bo_metadata metadata = {0};
1217 uint64_t tiling_flags = 0;
1218
1219 assert(bo->bo && "must not be called for slab entries");
1220
1221 if (bo->ws->info.chip_class >= GFX9) {
1222 tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
1223 } else {
1224 if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
1225 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
1226 else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
1227 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
1228 else
1229 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
1230
1231 tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
1232 tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
1233 tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
1234 if (md->u.legacy.tile_split)
1235 tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split));
1236 tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
1237 tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);
1238
1239 if (md->u.legacy.scanout)
1240 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
1241 else
1242 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
1243 }
1244
1245 metadata.tiling_info = tiling_flags;
1246 metadata.size_metadata = md->size_metadata;
1247 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1248
1249 amdgpu_bo_set_metadata(bo->bo, &metadata);
1250 }
1251
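/* Main buffer allocation entry point (radeon_winsys::buffer_create).
 * Small buffers are sub-allocated from slabs, RADEON_FLAG_SPARSE buffers get
 * a PRT VA range, and everything else is served from the reusable cache or a
 * fresh amdgpu_create_bo() call.
 *
 * Illustrative call (the exact flags depend on the caller):
 *
 *    struct pb_buffer *buf =
 *       ws->base.buffer_create(&ws->base, 64 * 1024, 4096,
 *                              RADEON_DOMAIN_VRAM,
 *                              RADEON_FLAG_GTT_WC |
 *                              RADEON_FLAG_NO_INTERPROCESS_SHARING);
 */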
1252 static struct pb_buffer *
1253 amdgpu_bo_create(struct radeon_winsys *rws,
1254 uint64_t size,
1255 unsigned alignment,
1256 enum radeon_bo_domain domain,
1257 enum radeon_bo_flag flags)
1258 {
1259 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1260 struct amdgpu_winsys_bo *bo;
1261 int heap = -1;
1262
1263 /* VRAM implies WC. This is not optional. */
1264 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
1265
1266 /* NO_CPU_ACCESS is valid with VRAM only. */
1267 assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
1268
1269 /* Sparse buffers must have NO_CPU_ACCESS set. */
1270 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
1271
1272 struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
1273 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
1274
1275 /* Sub-allocate small buffers from slabs. */
1276 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
1277 size <= max_slab_entry_size &&
1278 /* The alignment must be at most the size of the smallest slab entry or
1279 * the next power of two. */
1280 alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) {
1281 struct pb_slab_entry *entry;
1282 int heap = radeon_get_heap_index(domain, flags);
1283
1284 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
1285 goto no_slab;
1286
1287 struct pb_slabs *slabs = get_slabs(ws, size);
1288 entry = pb_slab_alloc(slabs, size, heap);
1289 if (!entry) {
1290 /* Clean up buffer managers and try again. */
1291 amdgpu_clean_up_buffer_managers(ws);
1292
1293 entry = pb_slab_alloc(slabs, size, heap);
1294 }
1295 if (!entry)
1296 return NULL;
1297
1298 bo = NULL;
1299 bo = container_of(entry, bo, u.slab.entry);
1300
1301 pipe_reference_init(&bo->base.reference, 1);
1302
1303 return &bo->base;
1304 }
1305 no_slab:
1306
1307 if (flags & RADEON_FLAG_SPARSE) {
1308 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1309
1310 return amdgpu_bo_sparse_create(ws, size, domain, flags);
1311 }
1312
1313 /* This flag is irrelevant for the cache. */
1314 flags &= ~RADEON_FLAG_NO_SUBALLOC;
1315
1316 /* Align the size to the page size. This is the minimum alignment for normal
1317 * BOs and it helps the cached buffer manager; small BOs in particular, such
1318 * as constant/uniform buffers, benefit from the improved reuse.
1319 */
1320 size = align64(size, ws->info.gart_page_size);
1321 alignment = align(alignment, ws->info.gart_page_size);
1322
1323 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
1324
1325 if (use_reusable_pool) {
1326 heap = radeon_get_heap_index(domain, flags);
1327 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
1328
1329 /* Get a buffer from the cache. */
1330 bo = (struct amdgpu_winsys_bo*)
1331 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
1332 if (bo)
1333 return &bo->base;
1334 }
1335
1336 /* Create a new one. */
1337 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1338 if (!bo) {
1339 /* Clean up buffer managers and try again. */
1340 amdgpu_clean_up_buffer_managers(ws);
1341
1342 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1343 if (!bo)
1344 return NULL;
1345 }
1346
1347 bo->u.real.use_reusable_pool = use_reusable_pool;
1348 return &bo->base;
1349 }
1350
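/* Import a buffer shared by another process or driver, either from a GEM
 * flink name or a dma-buf fd. If the underlying kernel BO was imported
 * before, the existing amdgpu_winsys_bo is found in bo_export_table and its
 * reference count is bumped instead of creating a duplicate.
 */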
1351 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1352 struct winsys_handle *whandle,
1353 unsigned *stride,
1354 unsigned *offset)
1355 {
1356 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1357 struct amdgpu_winsys_bo *bo = NULL;
1358 enum amdgpu_bo_handle_type type;
1359 struct amdgpu_bo_import_result result = {0};
1360 uint64_t va;
1361 amdgpu_va_handle va_handle = NULL;
1362 struct amdgpu_bo_info info = {0};
1363 enum radeon_bo_domain initial = 0;
1364 int r;
1365
1366 switch (whandle->type) {
1367 case WINSYS_HANDLE_TYPE_SHARED:
1368 type = amdgpu_bo_handle_type_gem_flink_name;
1369 break;
1370 case WINSYS_HANDLE_TYPE_FD:
1371 type = amdgpu_bo_handle_type_dma_buf_fd;
1372 break;
1373 default:
1374 return NULL;
1375 }
1376
1377 if (stride)
1378 *stride = whandle->stride;
1379 if (offset)
1380 *offset = whandle->offset;
1381
1382 r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
1383 if (r)
1384 return NULL;
1385
1386 simple_mtx_lock(&ws->bo_export_table_lock);
1387 bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
1388
1389 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1390 * counter and return it.
1391 */
1392 if (bo) {
1393 p_atomic_inc(&bo->base.reference.count);
1394 simple_mtx_unlock(&ws->bo_export_table_lock);
1395
1396 /* Release the buffer handle, because we don't need it anymore.
1397 * This function is returning an existing buffer, which has its own
1398 * handle.
1399 */
1400 amdgpu_bo_free(result.buf_handle);
1401 return &bo->base;
1402 }
1403
1404 /* Get initial domains. */
1405 r = amdgpu_bo_query_info(result.buf_handle, &info);
1406 if (r)
1407 goto error;
1408
1409 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1410 result.alloc_size, 1 << 20, 0, &va, &va_handle,
1411 AMDGPU_VA_RANGE_HIGH);
1412 if (r)
1413 goto error;
1414
1415 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1416 if (!bo)
1417 goto error;
1418
1419 r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
1420 if (r)
1421 goto error;
1422
1423 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1424 initial |= RADEON_DOMAIN_VRAM;
1425 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1426 initial |= RADEON_DOMAIN_GTT;
1427
1428 /* Initialize the structure. */
1429 simple_mtx_init(&bo->lock, mtx_plain);
1430 pipe_reference_init(&bo->base.reference, 1);
1431 bo->base.alignment = info.phys_alignment;
1432 bo->bo = result.buf_handle;
1433 bo->base.size = result.alloc_size;
1434 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1435 bo->ws = ws;
1436 bo->va = va;
1437 bo->u.real.va_handle = va_handle;
1438 bo->initial_domain = initial;
1439 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1440 bo->is_shared = true;
1441
1442 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
1443 ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
1444 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
1445 ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
1446
1447 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1448
1449 amdgpu_add_buffer_to_global_list(bo);
1450
1451 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1452 simple_mtx_unlock(&ws->bo_export_table_lock);
1453
1454 return &bo->base;
1455
1456 error:
1457 simple_mtx_unlock(&ws->bo_export_table_lock);
1458 if (bo)
1459 FREE(bo);
1460 if (va_handle)
1461 amdgpu_va_range_free(va_handle);
1462 amdgpu_bo_free(result.buf_handle);
1463 return NULL;
1464 }
1465
1466 static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
1467 unsigned stride, unsigned offset,
1468 unsigned slice_size,
1469 struct winsys_handle *whandle)
1470 {
1471 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
1472 struct amdgpu_winsys *ws = bo->ws;
1473 enum amdgpu_bo_handle_type type;
1474 int r;
1475
1476 /* Don't allow exports of slab entries and sparse buffers. */
1477 if (!bo->bo)
1478 return false;
1479
1480 bo->u.real.use_reusable_pool = false;
1481
1482 switch (whandle->type) {
1483 case WINSYS_HANDLE_TYPE_SHARED:
1484 type = amdgpu_bo_handle_type_gem_flink_name;
1485 break;
1486 case WINSYS_HANDLE_TYPE_FD:
1487 type = amdgpu_bo_handle_type_dma_buf_fd;
1488 break;
1489 case WINSYS_HANDLE_TYPE_KMS:
1490 type = amdgpu_bo_handle_type_kms;
1491 break;
1492 default:
1493 return false;
1494 }
1495
1496 r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
1497 if (r)
1498 return false;
1499
1500 simple_mtx_lock(&ws->bo_export_table_lock);
1501 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1502 simple_mtx_unlock(&ws->bo_export_table_lock);
1503
1504 whandle->stride = stride;
1505 whandle->offset = offset;
1506 whandle->offset += slice_size * whandle->layer;
1507 bo->is_shared = true;
1508 return true;
1509 }
1510
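/* Wrap an application-provided pointer as a GTT buffer (userptr). The size is
 * rounded up to the page size for the kernel calls, but the pb_buffer keeps
 * the original size.
 */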
1511 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1512 void *pointer, uint64_t size)
1513 {
1514 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1515 amdgpu_bo_handle buf_handle;
1516 struct amdgpu_winsys_bo *bo;
1517 uint64_t va;
1518 amdgpu_va_handle va_handle;
1519 /* Avoid failure when the size is not page aligned */
1520 uint64_t aligned_size = align64(size, ws->info.gart_page_size);
1521
1522 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1523 if (!bo)
1524 return NULL;
1525
1526 if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
1527 aligned_size, &buf_handle))
1528 goto error;
1529
1530 if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1531 aligned_size, 1 << 12, 0, &va, &va_handle,
1532 AMDGPU_VA_RANGE_HIGH))
1533 goto error_va_alloc;
1534
1535 if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
1536 goto error_va_map;
1537
1538 /* Initialize it. */
1539 bo->is_user_ptr = true;
1540 pipe_reference_init(&bo->base.reference, 1);
1541 simple_mtx_init(&bo->lock, mtx_plain);
1542 bo->bo = buf_handle;
1543 bo->base.alignment = 0;
1544 bo->base.size = size;
1545 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1546 bo->ws = ws;
1547 bo->cpu_ptr = pointer;
1548 bo->va = va;
1549 bo->u.real.va_handle = va_handle;
1550 bo->initial_domain = RADEON_DOMAIN_GTT;
1551 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1552
1553 ws->allocated_gtt += aligned_size;
1554
1555 amdgpu_add_buffer_to_global_list(bo);
1556
1557 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1558
1559 return (struct pb_buffer*)bo;
1560
1561 error_va_map:
1562 amdgpu_va_range_free(va_handle);
1563
1564 error_va_alloc:
1565 amdgpu_bo_free(buf_handle);
1566
1567 error:
1568 FREE(bo);
1569 return NULL;
1570 }
1571
1572 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
1573 {
1574 return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
1575 }
1576
1577 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
1578 {
1579 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1580
1581 return !bo->bo && !bo->sparse;
1582 }
1583
1584 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
1585 {
1586 return ((struct amdgpu_winsys_bo*)buf)->va;
1587 }
1588
1589 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
1590 {
1591 ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1592 ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1593 ws->base.buffer_map = amdgpu_bo_map;
1594 ws->base.buffer_unmap = amdgpu_bo_unmap;
1595 ws->base.buffer_wait = amdgpu_bo_wait;
1596 ws->base.buffer_create = amdgpu_bo_create;
1597 ws->base.buffer_from_handle = amdgpu_bo_from_handle;
1598 ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1599 ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1600 ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1601 ws->base.buffer_get_handle = amdgpu_bo_get_handle;
1602 ws->base.buffer_commit = amdgpu_bo_sparse_commit;
1603 ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1604 ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1605 }