winsys/amdgpu: don't set GTT with GDS & OA placements on APUs
[mesa.git] src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 #include "amdgpu_cs.h"
29
30 #include "util/os_time.h"
31 #include "util/u_hash_table.h"
32 #include "state_tracker/drm_driver.h"
33 #include <amdgpu_drm.h>
34 #include <xf86drm.h>
35 #include <stdio.h>
36 #include <inttypes.h>
37
38 #ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID
39 #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6)
40 #endif
41
42 #ifndef AMDGPU_VA_RANGE_HIGH
43 #define AMDGPU_VA_RANGE_HIGH 0x2
44 #endif
45
46 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
47 #define DEBUG_SPARSE_COMMITS 0
48
49 struct amdgpu_sparse_backing_chunk {
50 uint32_t begin, end;
51 };
52
53 static struct pb_buffer *
54 amdgpu_bo_create(struct radeon_winsys *rws,
55 uint64_t size,
56 unsigned alignment,
57 enum radeon_bo_domain domain,
58 enum radeon_bo_flag flags);
59 static void amdgpu_bo_unmap(struct pb_buffer *buf);
60
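/* Return true if the buffer is idle, false if it is still busy when the
 * (relative, nanosecond) timeout expires. A timeout of 0 is a non-blocking
 * poll. Shared buffers fall back to the kernel wait, because user fences
 * only cover submissions from this process.
 */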
61 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
62 enum radeon_bo_usage usage)
63 {
64 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
65 struct amdgpu_winsys *ws = bo->ws;
66 int64_t abs_timeout;
67
68 if (timeout == 0) {
69 if (p_atomic_read(&bo->num_active_ioctls))
70 return false;
71
72 } else {
73 abs_timeout = os_time_get_absolute_timeout(timeout);
74
75 /* Wait if any ioctl is being submitted with this buffer. */
76 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
77 return false;
78 }
79
80 if (bo->is_shared) {
81 /* We can't use user fences for shared buffers, because user fences
82 * are local to this process only. If we want to wait for all buffer
83 * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
84 */
85 bool buffer_busy = true;
86 int r;
87
88 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
89 if (r)
90 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
91 r);
92 return !buffer_busy;
93 }
94
95 if (timeout == 0) {
96 unsigned idle_fences;
97 bool buffer_idle;
98
99 simple_mtx_lock(&ws->bo_fence_lock);
100
101 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
102 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
103 break;
104 }
105
106 /* Release the idle fences to avoid checking them again later. */
107 for (unsigned i = 0; i < idle_fences; ++i)
108 amdgpu_fence_reference(&bo->fences[i], NULL);
109
110 memmove(&bo->fences[0], &bo->fences[idle_fences],
111 (bo->num_fences - idle_fences) * sizeof(*bo->fences));
112 bo->num_fences -= idle_fences;
113
114 buffer_idle = !bo->num_fences;
115 simple_mtx_unlock(&ws->bo_fence_lock);
116
117 return buffer_idle;
118 } else {
119 bool buffer_idle = true;
120
121 simple_mtx_lock(&ws->bo_fence_lock);
122 while (bo->num_fences && buffer_idle) {
123 struct pipe_fence_handle *fence = NULL;
124 bool fence_idle = false;
125
126 amdgpu_fence_reference(&fence, bo->fences[0]);
127
128 /* Wait for the fence. */
129 simple_mtx_unlock(&ws->bo_fence_lock);
130 if (amdgpu_fence_wait(fence, abs_timeout, true))
131 fence_idle = true;
132 else
133 buffer_idle = false;
134 simple_mtx_lock(&ws->bo_fence_lock);
135
136 /* Release an idle fence to avoid checking it again later, keeping in
137 * mind that the fence array may have been modified by other threads.
138 */
139 if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
140 amdgpu_fence_reference(&bo->fences[0], NULL);
141 memmove(&bo->fences[0], &bo->fences[1],
142 (bo->num_fences - 1) * sizeof(*bo->fences));
143 bo->num_fences--;
144 }
145
146 amdgpu_fence_reference(&fence, NULL);
147 }
148 simple_mtx_unlock(&ws->bo_fence_lock);
149
150 return buffer_idle;
151 }
152 }
153
154 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
155 struct pb_buffer *buf)
156 {
157 return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
158 }
159
160 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
161 {
162 for (unsigned i = 0; i < bo->num_fences; ++i)
163 amdgpu_fence_reference(&bo->fences[i], NULL);
164
165 FREE(bo->fences);
166 bo->num_fences = 0;
167 bo->max_fences = 0;
168 }
169
170 void amdgpu_bo_destroy(struct pb_buffer *_buf)
171 {
172 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
173 struct amdgpu_winsys *ws = bo->ws;
174
175 assert(bo->bo && "must not be called for slab entries");
176
177 if (!bo->is_user_ptr && bo->cpu_ptr) {
178 bo->cpu_ptr = NULL;
179 amdgpu_bo_unmap(&bo->base);
180 }
181 assert(bo->is_user_ptr || bo->u.real.map_count == 0);
182
183 if (ws->debug_all_bos) {
184 simple_mtx_lock(&ws->global_bo_list_lock);
185 LIST_DEL(&bo->u.real.global_list_item);
186 ws->num_buffers--;
187 simple_mtx_unlock(&ws->global_bo_list_lock);
188 }
189
190 simple_mtx_lock(&ws->bo_export_table_lock);
191 util_hash_table_remove(ws->bo_export_table, bo->bo);
192 simple_mtx_unlock(&ws->bo_export_table_lock);
193
194 if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) {
195 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
196 amdgpu_va_range_free(bo->u.real.va_handle);
197 }
198 amdgpu_bo_free(bo->bo);
199
200 amdgpu_bo_remove_fences(bo);
201
202 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
203 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
204 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
205 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
206
207 simple_mtx_destroy(&bo->lock);
208 FREE(bo);
209 }
210
211 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
212 {
213 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
214
215 assert(bo->bo); /* slab buffers have a separate vtbl */
216
217 if (bo->u.real.use_reusable_pool)
218 pb_cache_add_buffer(&bo->u.real.cache_entry);
219 else
220 amdgpu_bo_destroy(_buf);
221 }
222
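/* Drop reclaimable slab entries and all buffers held by the reuse cache.
 * Called when an allocation or CPU mapping fails so that the caller can
 * retry once with the freed-up memory and address space.
 */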
223 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
224 {
225 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
226 pb_slabs_reclaim(&ws->bo_slabs[i]);
227
228 pb_cache_release_all_buffers(&ws->bo_cache);
229 }
230
231 static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
232 {
233 assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
234 int r = amdgpu_bo_cpu_map(bo->bo, cpu);
235 if (r) {
236 /* Clean up buffer managers and try again. */
237 amdgpu_clean_up_buffer_managers(bo->ws);
238 r = amdgpu_bo_cpu_map(bo->bo, cpu);
239 if (r)
240 return false;
241 }
242
243 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
244 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
245 bo->ws->mapped_vram += bo->base.size;
246 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
247 bo->ws->mapped_gtt += bo->base.size;
248 bo->ws->num_mapped_buffers++;
249 }
250
251 return true;
252 }
253
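/* CPU-map a buffer. Unless PIPE_TRANSFER_UNSYNCHRONIZED is set, this first
 * flushes and/or waits so the CPU does not race pending GPU work. The
 * mapping itself is refcounted; RADEON_TRANSFER_TEMPORARY requests a
 * mapping that does not become the buffer's cached persistent cpu_ptr.
 */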
254 static void *amdgpu_bo_map(struct pb_buffer *buf,
255 struct radeon_cmdbuf *rcs,
256 enum pipe_transfer_usage usage)
257 {
258 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
259 struct amdgpu_winsys_bo *real;
260 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
261
262 assert(!bo->sparse);
263
264 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
265 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
266 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
267 if (usage & PIPE_TRANSFER_DONTBLOCK) {
268 if (!(usage & PIPE_TRANSFER_WRITE)) {
269 /* Mapping for read.
270 *
271 * Since we are mapping for read, we don't need to wait
272 * if the GPU is using the buffer for read too
273 * (neither one is changing it).
274 *
275 * Only check whether the buffer is being used for write. */
276 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
277 RADEON_USAGE_WRITE)) {
278 cs->flush_cs(cs->flush_data,
279 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
280 return NULL;
281 }
282
283 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
284 RADEON_USAGE_WRITE)) {
285 return NULL;
286 }
287 } else {
288 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
289 cs->flush_cs(cs->flush_data,
290 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
291 return NULL;
292 }
293
294 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
295 RADEON_USAGE_READWRITE)) {
296 return NULL;
297 }
298 }
299 } else {
300 uint64_t time = os_time_get_nano();
301
302 if (!(usage & PIPE_TRANSFER_WRITE)) {
303 /* Mapping for read.
304 *
305 * Since we are mapping for read, we don't need to wait
306 * if the GPU is using the buffer for read too
307 * (neither one is changing it).
308 *
309 * Only check whether the buffer is being used for write. */
310 if (cs) {
311 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
312 RADEON_USAGE_WRITE)) {
313 cs->flush_cs(cs->flush_data,
314 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
315 } else {
316 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
317 if (p_atomic_read(&bo->num_active_ioctls))
318 amdgpu_cs_sync_flush(rcs);
319 }
320 }
321
322 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
323 RADEON_USAGE_WRITE);
324 } else {
325 /* Mapping for write. */
326 if (cs) {
327 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
328 cs->flush_cs(cs->flush_data,
329 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
330 } else {
331 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
332 if (p_atomic_read(&bo->num_active_ioctls))
333 amdgpu_cs_sync_flush(rcs);
334 }
335 }
336
337 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
338 RADEON_USAGE_READWRITE);
339 }
340
341 bo->ws->buffer_wait_time += os_time_get_nano() - time;
342 }
343 }
344
345 /* Buffer synchronization has been checked, now actually map the buffer. */
346 void *cpu = NULL;
347 uint64_t offset = 0;
348
349 if (bo->bo) {
350 real = bo;
351 } else {
352 real = bo->u.slab.real;
353 offset = bo->va - real->va;
354 }
355
356 if (usage & RADEON_TRANSFER_TEMPORARY) {
357 if (real->is_user_ptr) {
358 cpu = real->cpu_ptr;
359 } else {
360 if (!amdgpu_bo_do_map(real, &cpu))
361 return NULL;
362 }
363 } else {
364 cpu = p_atomic_read(&real->cpu_ptr);
365 if (!cpu) {
366 simple_mtx_lock(&real->lock);
367 /* Must re-check due to the possibility of a race. Re-check need not
368 * be atomic thanks to the lock. */
369 cpu = real->cpu_ptr;
370 if (!cpu) {
371 if (!amdgpu_bo_do_map(real, &cpu)) {
372 simple_mtx_unlock(&real->lock);
373 return NULL;
374 }
375 p_atomic_set(&real->cpu_ptr, cpu);
376 }
377 simple_mtx_unlock(&real->lock);
378 }
379 }
380
381 return (uint8_t*)cpu + offset;
382 }
383
384 static void amdgpu_bo_unmap(struct pb_buffer *buf)
385 {
386 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
387 struct amdgpu_winsys_bo *real;
388
389 assert(!bo->sparse);
390
391 if (bo->is_user_ptr)
392 return;
393
394 real = bo->bo ? bo : bo->u.slab.real;
395 assert(real->u.real.map_count != 0 && "too many unmaps");
396 if (p_atomic_dec_zero(&real->u.real.map_count)) {
397 assert(!real->cpu_ptr &&
398 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
399
400 if (real->initial_domain & RADEON_DOMAIN_VRAM)
401 real->ws->mapped_vram -= real->base.size;
402 else if (real->initial_domain & RADEON_DOMAIN_GTT)
403 real->ws->mapped_gtt -= real->base.size;
404 real->ws->num_mapped_buffers--;
405 }
406
407 amdgpu_bo_cpu_unmap(real->bo);
408 }
409
410 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
411 amdgpu_bo_destroy_or_cache
412 /* other functions are never called */
413 };
414
415 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
416 {
417 struct amdgpu_winsys *ws = bo->ws;
418
419 assert(bo->bo);
420
421 if (ws->debug_all_bos) {
422 simple_mtx_lock(&ws->global_bo_list_lock);
423 LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list);
424 ws->num_buffers++;
425 simple_mtx_unlock(&ws->global_bo_list_lock);
426 }
427 }
428
429 static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws,
430 uint64_t size, unsigned alignment)
431 {
432 uint64_t vm_alignment = alignment;
433
434 /* Increase the VM alignment for faster address translation. */
435 if (size >= ws->info.pte_fragment_size)
436 vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);
437
438 /* Gfx9: Increase the VM alignment to the most significant bit set
439 * in the size for faster address translation.
440 */
441 if (ws->info.chip_class >= GFX9) {
442 unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
443 uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;
444
445 vm_alignment = MAX2(vm_alignment, msb_alignment);
446 }
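/* Example: a 5 MiB buffer on GFX9 has util_last_bit64(size) == 23, so the
 * VA gets 4 MiB alignment here (on top of the PTE fragment rounding above).
 */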
447 return vm_alignment;
448 }
449
450 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
451 uint64_t size,
452 unsigned alignment,
453 enum radeon_bo_domain initial_domain,
454 unsigned flags,
455 int heap)
456 {
457 struct amdgpu_bo_alloc_request request = {0};
458 amdgpu_bo_handle buf_handle;
459 uint64_t va = 0;
460 struct amdgpu_winsys_bo *bo;
461 amdgpu_va_handle va_handle;
462 int r;
463
464 /* Exactly one of VRAM, GTT, GDS or OA must be specified. */
465 assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
466 RADEON_DOMAIN_GDS |
467 RADEON_DOMAIN_OA)) == 1);
468
469 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
470 if (!bo) {
471 return NULL;
472 }
473
474 if (heap >= 0) {
475 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
476 heap);
477 }
478 request.alloc_size = size;
479 request.phys_alignment = alignment;
480
481 if (initial_domain & RADEON_DOMAIN_VRAM) {
482 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
483
484 /* Since VRAM and GTT have almost the same performance on APUs, we could
485 * just set GTT. However, in order to decrease GTT(RAM) usage, which is
486 * shared with the OS, allow VRAM placements too. The point is not that
487 * VRAM is faster here, but that it would otherwise sit unused and wasted.
488 */
489 if (!ws->info.has_dedicated_vram)
490 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
491 }
492
493 if (initial_domain & RADEON_DOMAIN_GTT)
494 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
495 if (initial_domain & RADEON_DOMAIN_GDS)
496 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
497 if (initial_domain & RADEON_DOMAIN_OA)
498 request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
499
500 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
501 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
502 if (flags & RADEON_FLAG_GTT_WC)
503 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
504 if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
505 ws->info.has_local_buffers)
506 request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
507 if (ws->zero_all_vram_allocs &&
508 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
509 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
510
511 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
512 if (r) {
513 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
514 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
515 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
516 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
517 goto error_bo_alloc;
518 }
519
520 if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
521 unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
522
523 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
524 size + va_gap_size,
525 amdgpu_get_optimal_vm_alignment(ws, size, alignment),
526 0, &va, &va_handle,
527 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
528 AMDGPU_VA_RANGE_HIGH);
529 if (r)
530 goto error_va_alloc;
531
532 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
533 AMDGPU_VM_PAGE_EXECUTABLE;
534
535 if (!(flags & RADEON_FLAG_READ_ONLY))
536 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
537
538 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
539 AMDGPU_VA_OP_MAP);
540 if (r)
541 goto error_va_map;
542 }
543
544 simple_mtx_init(&bo->lock, mtx_plain);
545 pipe_reference_init(&bo->base.reference, 1);
546 bo->base.alignment = alignment;
547 bo->base.usage = 0;
548 bo->base.size = size;
549 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
550 bo->ws = ws;
551 bo->bo = buf_handle;
552 bo->va = va;
553 bo->u.real.va_handle = va_handle;
554 bo->initial_domain = initial_domain;
555 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
556 bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID);
557
558 if (initial_domain & RADEON_DOMAIN_VRAM)
559 ws->allocated_vram += align64(size, ws->info.gart_page_size);
560 else if (initial_domain & RADEON_DOMAIN_GTT)
561 ws->allocated_gtt += align64(size, ws->info.gart_page_size);
562
563 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
564
565 amdgpu_add_buffer_to_global_list(bo);
566
567 return bo;
568
569 error_va_map:
570 amdgpu_va_range_free(va_handle);
571
572 error_va_alloc:
573 amdgpu_bo_free(buf_handle);
574
575 error_bo_alloc:
576 FREE(bo);
577 return NULL;
578 }
579
580 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
581 {
582 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
583
584 if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
585 return false;
586 }
587
588 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
589 }
590
591 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
592 {
593 struct amdgpu_winsys_bo *bo = NULL; /* NULL sample pointer; container_of only uses it for offset arithmetic */
594 bo = container_of(entry, bo, u.slab.entry);
595
596 return amdgpu_bo_can_reclaim(&bo->base);
597 }
598
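/* Return the slab allocator whose largest entry size can hold "size".
 * Callers must only pass sizes that fit into the last (largest) allocator.
 */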
599 static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
600 {
601 /* Find the correct slab allocator for the given size. */
602 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
603 struct pb_slabs *slabs = &ws->bo_slabs[i];
604
605 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
606 return slabs;
607 }
608
609 assert(0);
610 return NULL;
611 }
612
613 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
614 {
615 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
616
617 assert(!bo->bo);
618
619 pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry);
620 }
621
622 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
623 amdgpu_bo_slab_destroy
624 /* other functions are never called */
625 };
626
627 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
628 unsigned entry_size,
629 unsigned group_index)
630 {
631 struct amdgpu_winsys *ws = priv;
632 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
633 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
634 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
635 uint32_t base_id;
636 unsigned slab_size = 0;
637
638 if (!slab)
639 return NULL;
640
641 /* Determine the slab buffer size. */
642 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
643 struct pb_slabs *slabs = &ws->bo_slabs[i];
644 unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1);
645
646 if (entry_size <= max_entry_size) {
647 /* The slab size is twice the size of the largest possible entry. */
648 slab_size = max_entry_size * 2;
649
650 /* The largest slab should have the same size as the PTE fragment
651 * size to get faster address translation.
652 */
653 if (i == NUM_SLAB_ALLOCATORS - 1 &&
654 slab_size < ws->info.pte_fragment_size)
655 slab_size = ws->info.pte_fragment_size;
656 break;
657 }
658 }
659 assert(slab_size != 0);
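/* Illustration (values are made up, not the ones configured in the winsys):
 * an allocator with min_order 8 and num_orders 5 serves entries up to
 * 1 << 12 = 4096 bytes, so its slab buffers are 8192 bytes, unless the PTE
 * fragment size bumps the largest allocator's slabs further.
 */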
660
661 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
662 slab_size, slab_size,
663 domains, flags));
664 if (!slab->buffer)
665 goto fail;
666
667 slab->base.num_entries = slab->buffer->base.size / entry_size;
668 slab->base.num_free = slab->base.num_entries;
669 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
670 if (!slab->entries)
671 goto fail_buffer;
672
673 LIST_INITHEAD(&slab->base.free);
674
675 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
676
677 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
678 struct amdgpu_winsys_bo *bo = &slab->entries[i];
679
680 simple_mtx_init(&bo->lock, mtx_plain);
681 bo->base.alignment = entry_size;
682 bo->base.usage = slab->buffer->base.usage;
683 bo->base.size = entry_size;
684 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
685 bo->ws = ws;
686 bo->va = slab->buffer->va + i * entry_size;
687 bo->initial_domain = domains;
688 bo->unique_id = base_id + i;
689 bo->u.slab.entry.slab = &slab->base;
690 bo->u.slab.entry.group_index = group_index;
691
692 if (slab->buffer->bo) {
693 /* The slab is not suballocated. */
694 bo->u.slab.real = slab->buffer;
695 } else {
696 /* The slab is allocated out of a bigger slab. */
697 bo->u.slab.real = slab->buffer->u.slab.real;
698 assert(bo->u.slab.real->bo);
699 }
700
701 LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
702 }
703
704 return &slab->base;
705
706 fail_buffer:
707 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
708 fail:
709 FREE(slab);
710 return NULL;
711 }
712
713 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
714 {
715 struct amdgpu_slab *slab = amdgpu_slab(pslab);
716
717 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
718 amdgpu_bo_remove_fences(&slab->entries[i]);
719 simple_mtx_destroy(&slab->entries[i].lock);
720 }
721
722 FREE(slab->entries);
723 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
724 FREE(slab);
725 }
726
727 #if DEBUG_SPARSE_COMMITS
728 static void
729 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
730 {
731 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
732 "Commitments:\n",
733 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
734
735 struct amdgpu_sparse_backing *span_backing = NULL;
736 uint32_t span_first_backing_page = 0;
737 uint32_t span_first_va_page = 0;
738 uint32_t va_page = 0;
739
740 for (;;) {
741 struct amdgpu_sparse_backing *backing = 0;
742 uint32_t backing_page = 0;
743
744 if (va_page < bo->u.sparse.num_va_pages) {
745 backing = bo->u.sparse.commitments[va_page].backing;
746 backing_page = bo->u.sparse.commitments[va_page].page;
747 }
748
749 if (span_backing &&
750 (backing != span_backing ||
751 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
752 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
753 span_first_va_page, va_page - 1, span_backing,
754 span_first_backing_page,
755 span_first_backing_page + (va_page - span_first_va_page) - 1);
756
757 span_backing = NULL;
758 }
759
760 if (va_page >= bo->u.sparse.num_va_pages)
761 break;
762
763 if (backing && !span_backing) {
764 span_backing = backing;
765 span_first_backing_page = backing_page;
766 span_first_va_page = va_page;
767 }
768
769 va_page++;
770 }
771
772 fprintf(stderr, "Backing:\n");
773
774 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
775 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
776 for (unsigned i = 0; i < backing->num_chunks; ++i)
777 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
778 }
779 }
780 #endif
781
782 /*
783 * Attempt to allocate the given number of backing pages. Fewer pages may be
784 * allocated (depending on the fragmentation of existing backing buffers),
785 * which will be reflected by a change to *pnum_pages.
786 */
787 static struct amdgpu_sparse_backing *
788 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
789 {
790 struct amdgpu_sparse_backing *best_backing;
791 unsigned best_idx;
792 uint32_t best_num_pages;
793
794 best_backing = NULL;
795 best_idx = 0;
796 best_num_pages = 0;
797
798 /* This is a very simple and inefficient best-fit algorithm. */
799 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
800 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
801 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
802 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
803 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
804 best_backing = backing;
805 best_idx = idx;
806 best_num_pages = cur_num_pages;
807 }
808 }
809 }
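/* The loop above nudges the choice toward the requested size: while the
 * best chunk seen so far is smaller than the request it prefers larger
 * chunks, and once it is larger it prefers smaller ones, so an exact fit,
 * once seen, is kept.
 */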
810
811 /* Allocate a new backing buffer if necessary. */
812 if (!best_backing) {
813 struct pb_buffer *buf;
814 uint64_t size;
815 uint32_t pages;
816
817 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
818 if (!best_backing)
819 return NULL;
820
821 best_backing->max_chunks = 4;
822 best_backing->chunks = CALLOC(best_backing->max_chunks,
823 sizeof(*best_backing->chunks));
824 if (!best_backing->chunks) {
825 FREE(best_backing);
826 return NULL;
827 }
828
829 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
830
831 size = MIN3(bo->base.size / 16,
832 8 * 1024 * 1024,
833 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
834 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
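/* e.g. a fresh 256 MiB sparse buffer gets an 8 MiB backing buffer here:
 * MIN3(256 MiB / 16, 8 MiB, 256 MiB) = 8 MiB.
 */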
835
836 buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
837 bo->initial_domain,
838 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
839 if (!buf) {
840 FREE(best_backing->chunks);
841 FREE(best_backing);
842 return NULL;
843 }
844
845 /* We might have gotten a bigger buffer than requested via caching. */
846 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
847
848 best_backing->bo = amdgpu_winsys_bo(buf);
849 best_backing->num_chunks = 1;
850 best_backing->chunks[0].begin = 0;
851 best_backing->chunks[0].end = pages;
852
853 list_add(&best_backing->list, &bo->u.sparse.backing);
854 bo->u.sparse.num_backing_pages += pages;
855
856 best_idx = 0;
857 best_num_pages = pages;
858 }
859
860 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
861 *pstart_page = best_backing->chunks[best_idx].begin;
862 best_backing->chunks[best_idx].begin += *pnum_pages;
863
864 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
865 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
866 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
867 best_backing->num_chunks--;
868 }
869
870 return best_backing;
871 }
872
873 static void
874 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
875 struct amdgpu_sparse_backing *backing)
876 {
877 struct amdgpu_winsys *ws = backing->bo->ws;
878
879 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
880
881 simple_mtx_lock(&ws->bo_fence_lock);
882 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
883 simple_mtx_unlock(&ws->bo_fence_lock);
884
885 list_del(&backing->list);
886 amdgpu_winsys_bo_reference(&backing->bo, NULL);
887 FREE(backing->chunks);
888 FREE(backing);
889 }
890
891 /*
892 * Return a range of pages from the given backing buffer back into the
893 * free structure.
894 */
895 static bool
896 sparse_backing_free(struct amdgpu_winsys_bo *bo,
897 struct amdgpu_sparse_backing *backing,
898 uint32_t start_page, uint32_t num_pages)
899 {
900 uint32_t end_page = start_page + num_pages;
901 unsigned low = 0;
902 unsigned high = backing->num_chunks;
903
904 /* Find the first chunk with begin >= start_page. */
905 while (low < high) {
906 unsigned mid = low + (high - low) / 2;
907
908 if (backing->chunks[mid].begin >= start_page)
909 high = mid;
910 else
911 low = mid + 1;
912 }
913
914 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
915 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
916
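/* Three cases follow: the freed range extends the previous chunk (possibly
 * bridging it with the next one), extends the next chunk, or becomes a new
 * chunk of its own, growing the chunk array if it is already full.
 */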
917 if (low > 0 && backing->chunks[low - 1].end == start_page) {
918 backing->chunks[low - 1].end = end_page;
919
920 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
921 backing->chunks[low - 1].end = backing->chunks[low].end;
922 memmove(&backing->chunks[low], &backing->chunks[low + 1],
923 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
924 backing->num_chunks--;
925 }
926 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
927 backing->chunks[low].begin = start_page;
928 } else {
929 if (backing->num_chunks >= backing->max_chunks) {
930 unsigned new_max_chunks = 2 * backing->max_chunks;
931 struct amdgpu_sparse_backing_chunk *new_chunks =
932 REALLOC(backing->chunks,
933 sizeof(*backing->chunks) * backing->max_chunks,
934 sizeof(*backing->chunks) * new_max_chunks);
935 if (!new_chunks)
936 return false;
937
938 backing->max_chunks = new_max_chunks;
939 backing->chunks = new_chunks;
940 }
941
942 memmove(&backing->chunks[low + 1], &backing->chunks[low],
943 sizeof(*backing->chunks) * (backing->num_chunks - low));
944 backing->chunks[low].begin = start_page;
945 backing->chunks[low].end = end_page;
946 backing->num_chunks++;
947 }
948
949 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
950 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
951 sparse_free_backing_buffer(bo, backing);
952
953 return true;
954 }
955
956 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
957 {
958 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
959 int r;
960
961 assert(!bo->bo && bo->sparse);
962
963 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
964 (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
965 bo->va, 0, AMDGPU_VA_OP_CLEAR);
966 if (r) {
967 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
968 }
969
970 while (!list_empty(&bo->u.sparse.backing)) {
971 struct amdgpu_sparse_backing *dummy = NULL;
972 sparse_free_backing_buffer(bo,
973 container_of(bo->u.sparse.backing.next,
974 dummy, list));
975 }
976
977 amdgpu_va_range_free(bo->u.sparse.va_handle);
978 FREE(bo->u.sparse.commitments);
979 simple_mtx_destroy(&bo->lock);
980 FREE(bo);
981 }
982
983 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
984 amdgpu_bo_sparse_destroy
985 /* other functions are never called */
986 };
987
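/* Create a sparse (PRT) buffer: reserve a VA range covering the whole size,
 * map it with AMDGPU_VM_PAGE_PRT and no backing memory, and let
 * amdgpu_bo_sparse_commit() attach or detach backing pages later.
 */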
988 static struct pb_buffer *
989 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
990 enum radeon_bo_domain domain,
991 enum radeon_bo_flag flags)
992 {
993 struct amdgpu_winsys_bo *bo;
994 uint64_t map_size;
995 uint64_t va_gap_size;
996 int r;
997
998 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
999 * that exceed this limit. This is not really a restriction: we don't have
1000 * that much virtual address space anyway.
1001 */
1002 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
1003 return NULL;
1004
1005 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1006 if (!bo)
1007 return NULL;
1008
1009 simple_mtx_init(&bo->lock, mtx_plain);
1010 pipe_reference_init(&bo->base.reference, 1);
1011 bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
1012 bo->base.size = size;
1013 bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
1014 bo->ws = ws;
1015 bo->initial_domain = domain;
1016 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1017 bo->sparse = true;
1018 bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
1019
1020 bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1021 bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
1022 sizeof(*bo->u.sparse.commitments));
1023 if (!bo->u.sparse.commitments)
1024 goto error_alloc_commitments;
1025
1026 LIST_INITHEAD(&bo->u.sparse.backing);
1027
1028 /* For simplicity, we always map a multiple of the page size. */
1029 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
1030 va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
1031 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1032 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
1033 0, &bo->va, &bo->u.sparse.va_handle,
1034 AMDGPU_VA_RANGE_HIGH);
1035 if (r)
1036 goto error_va_alloc;
1037
1038 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
1039 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
1040 if (r)
1041 goto error_va_map;
1042
1043 return &bo->base;
1044
1045 error_va_map:
1046 amdgpu_va_range_free(bo->u.sparse.va_handle);
1047 error_va_alloc:
1048 FREE(bo->u.sparse.commitments);
1049 error_alloc_commitments:
1050 simple_mtx_destroy(&bo->lock);
1051 FREE(bo);
1052 return NULL;
1053 }
1054
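/* Commit or uncommit the page-aligned range [offset, offset + size) of a
 * sparse buffer. Committing maps chunks of backing memory over the range;
 * uncommitting restores the PRT mapping and returns the pages to the
 * backing buffers' free lists.
 */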
1055 static bool
1056 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
1057 bool commit)
1058 {
1059 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1060 struct amdgpu_sparse_commitment *comm;
1061 uint32_t va_page, end_va_page;
1062 bool ok = true;
1063 int r;
1064
1065 assert(bo->sparse);
1066 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1067 assert(offset <= bo->base.size);
1068 assert(size <= bo->base.size - offset);
1069 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
1070
1071 comm = bo->u.sparse.commitments;
1072 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1073 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1074
1075 simple_mtx_lock(&bo->lock);
1076
1077 #if DEBUG_SPARSE_COMMITS
1078 sparse_dump(bo, __func__);
1079 #endif
1080
1081 if (commit) {
1082 while (va_page < end_va_page) {
1083 uint32_t span_va_page;
1084
1085 /* Skip pages that are already committed. */
1086 if (comm[va_page].backing) {
1087 va_page++;
1088 continue;
1089 }
1090
1091 /* Determine length of uncommitted span. */
1092 span_va_page = va_page;
1093 while (va_page < end_va_page && !comm[va_page].backing)
1094 va_page++;
1095
1096 /* Fill the uncommitted span with chunks of backing memory. */
1097 while (span_va_page < va_page) {
1098 struct amdgpu_sparse_backing *backing;
1099 uint32_t backing_start, backing_size;
1100
1101 backing_size = va_page - span_va_page;
1102 backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
1103 if (!backing) {
1104 ok = false;
1105 goto out;
1106 }
1107
1108 r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
1109 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1110 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1111 bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1112 AMDGPU_VM_PAGE_READABLE |
1113 AMDGPU_VM_PAGE_WRITEABLE |
1114 AMDGPU_VM_PAGE_EXECUTABLE,
1115 AMDGPU_VA_OP_REPLACE);
1116 if (r) {
1117 ok = sparse_backing_free(bo, backing, backing_start, backing_size);
1118 assert(ok && "sufficient memory should already be allocated");
1119
1120 ok = false;
1121 goto out;
1122 }
1123
1124 while (backing_size) {
1125 comm[span_va_page].backing = backing;
1126 comm[span_va_page].page = backing_start;
1127 span_va_page++;
1128 backing_start++;
1129 backing_size--;
1130 }
1131 }
1132 }
1133 } else {
1134 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1135 (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1136 bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1137 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1138 if (r) {
1139 ok = false;
1140 goto out;
1141 }
1142
1143 while (va_page < end_va_page) {
1144 struct amdgpu_sparse_backing *backing;
1145 uint32_t backing_start;
1146 uint32_t span_pages;
1147
1148 /* Skip pages that are already uncommitted. */
1149 if (!comm[va_page].backing) {
1150 va_page++;
1151 continue;
1152 }
1153
1154 /* Group contiguous spans of pages. */
1155 backing = comm[va_page].backing;
1156 backing_start = comm[va_page].page;
1157 comm[va_page].backing = NULL;
1158
1159 span_pages = 1;
1160 va_page++;
1161
1162 while (va_page < end_va_page &&
1163 comm[va_page].backing == backing &&
1164 comm[va_page].page == backing_start + span_pages) {
1165 comm[va_page].backing = NULL;
1166 va_page++;
1167 span_pages++;
1168 }
1169
1170 if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
1171 /* Couldn't allocate tracking data structures, so we have to leak */
1172 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1173 ok = false;
1174 }
1175 }
1176 }
1177 out:
1178
1179 simple_mtx_unlock(&bo->lock);
1180
1181 return ok;
1182 }
1183
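/* Translate the hardware TILE_SPLIT encoding (0..6) to bytes (64..4096);
 * eg_tile_split_rev() below is the inverse.
 */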
1184 static unsigned eg_tile_split(unsigned tile_split)
1185 {
1186 switch (tile_split) {
1187 case 0: tile_split = 64; break;
1188 case 1: tile_split = 128; break;
1189 case 2: tile_split = 256; break;
1190 case 3: tile_split = 512; break;
1191 default:
1192 case 4: tile_split = 1024; break;
1193 case 5: tile_split = 2048; break;
1194 case 6: tile_split = 4096; break;
1195 }
1196 return tile_split;
1197 }
1198
1199 static unsigned eg_tile_split_rev(unsigned eg_tile_split)
1200 {
1201 switch (eg_tile_split) {
1202 case 64: return 0;
1203 case 128: return 1;
1204 case 256: return 2;
1205 case 512: return 3;
1206 default:
1207 case 1024: return 4;
1208 case 2048: return 5;
1209 case 4096: return 6;
1210 }
1211 }
1212
1213 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
1214 struct radeon_bo_metadata *md)
1215 {
1216 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1217 struct amdgpu_bo_info info = {0};
1218 uint64_t tiling_flags;
1219 int r;
1220
1221 assert(bo->bo && "must not be called for slab entries");
1222
1223 r = amdgpu_bo_query_info(bo->bo, &info);
1224 if (r)
1225 return;
1226
1227 tiling_flags = info.metadata.tiling_info;
1228
1229 if (bo->ws->info.chip_class >= GFX9) {
1230 md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
1231
1232 md->u.gfx9.dcc_offset_256B = AMDGPU_TILING_GET(tiling_flags, DCC_OFFSET_256B);
1233 md->u.gfx9.dcc_pitch_max = AMDGPU_TILING_GET(tiling_flags, DCC_PITCH_MAX);
1234 md->u.gfx9.dcc_independent_64B = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B);
1235 } else {
1236 md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
1237 md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;
1238
1239 if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
1240 md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
1241 else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
1242 md->u.legacy.microtile = RADEON_LAYOUT_TILED;
1243
1244 md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
1245 md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
1246 md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
1247 md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
1248 md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
1249 md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
1250 md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
1251 }
1252
1253 md->size_metadata = info.metadata.size_metadata;
1254 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1255 }
1256
1257 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
1258 struct radeon_bo_metadata *md)
1259 {
1260 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1261 struct amdgpu_bo_metadata metadata = {0};
1262 uint64_t tiling_flags = 0;
1263
1264 assert(bo->bo && "must not be called for slab entries");
1265
1266 if (bo->ws->info.chip_class >= GFX9) {
1267 tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
1268
1269 tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256B);
1270 tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, md->u.gfx9.dcc_pitch_max);
1271 tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64B);
1272 } else {
1273 if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
1274 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
1275 else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
1276 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
1277 else
1278 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
1279
1280 tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
1281 tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
1282 tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
1283 if (md->u.legacy.tile_split)
1284 tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split));
1285 tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
1286 tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);
1287
1288 if (md->u.legacy.scanout)
1289 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
1290 else
1291 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
1292 }
1293
1294 metadata.tiling_info = tiling_flags;
1295 metadata.size_metadata = md->size_metadata;
1296 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1297
1298 amdgpu_bo_set_metadata(bo->bo, &metadata);
1299 }
1300
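/* Top-level buffer allocation: small buffers are sub-allocated from slabs,
 * sparse buffers take the PRT path, and everything else is served from the
 * reusable cache when possible or freshly created by amdgpu_create_bo().
 */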
1301 static struct pb_buffer *
1302 amdgpu_bo_create(struct radeon_winsys *rws,
1303 uint64_t size,
1304 unsigned alignment,
1305 enum radeon_bo_domain domain,
1306 enum radeon_bo_flag flags)
1307 {
1308 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1309 struct amdgpu_winsys_bo *bo;
1310 int heap = -1;
1311
1312 /* VRAM implies WC. This is not optional. */
1313 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
1314
1315 /* NO_CPU_ACCESS is valid with VRAM only. */
1316 assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
1317
1318 /* Sparse buffers must have NO_CPU_ACCESS set. */
1319 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
1320
1321 struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
1322 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
1323
1324 /* Sub-allocate small buffers from slabs. */
1325 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
1326 size <= max_slab_entry_size &&
1327 /* The alignment must be at most the smallest slab entry size or the next
1328 * power of two of the buffer size, whichever is larger. */
1329 alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) {
1330 struct pb_slab_entry *entry;
1331 int heap = radeon_get_heap_index(domain, flags);
1332
1333 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
1334 goto no_slab;
1335
1336 struct pb_slabs *slabs = get_slabs(ws, size);
1337 entry = pb_slab_alloc(slabs, size, heap);
1338 if (!entry) {
1339 /* Clean up buffer managers and try again. */
1340 amdgpu_clean_up_buffer_managers(ws);
1341
1342 entry = pb_slab_alloc(slabs, size, heap);
1343 }
1344 if (!entry)
1345 return NULL;
1346
1347 bo = NULL;
1348 bo = container_of(entry, bo, u.slab.entry);
1349
1350 pipe_reference_init(&bo->base.reference, 1);
1351
1352 return &bo->base;
1353 }
1354 no_slab:
1355
1356 if (flags & RADEON_FLAG_SPARSE) {
1357 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1358
1359 return amdgpu_bo_sparse_create(ws, size, domain, flags);
1360 }
1361
1362 /* This flag is irrelevant for the cache. */
1363 flags &= ~RADEON_FLAG_NO_SUBALLOC;
1364
1365 /* Align size to page size. This is the minimum alignment for normal
1366 * BOs. Aligning this here helps the cached bufmgr; small BOs in
1367 * particular, like constant/uniform buffers, benefit from increased reuse.
1368 */
1369 if (domain & RADEON_DOMAIN_VRAM_GTT) {
1370 size = align64(size, ws->info.gart_page_size);
1371 alignment = align(alignment, ws->info.gart_page_size);
1372 }
1373
1374 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
1375
1376 if (use_reusable_pool) {
1377 heap = radeon_get_heap_index(domain, flags);
1378 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
1379
1380 /* Get a buffer from the cache. */
1381 bo = (struct amdgpu_winsys_bo*)
1382 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
1383 if (bo)
1384 return &bo->base;
1385 }
1386
1387 /* Create a new one. */
1388 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1389 if (!bo) {
1390 /* Clean up buffer managers and try again. */
1391 amdgpu_clean_up_buffer_managers(ws);
1392
1393 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1394 if (!bo)
1395 return NULL;
1396 }
1397
1398 bo->u.real.use_reusable_pool = use_reusable_pool;
1399 return &bo->base;
1400 }
1401
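/* Import a buffer shared via a GEM flink name or dma-buf fd. The export
 * table guarantees that importing the same kernel BO twice yields the same
 * amdgpu_winsys_bo with its reference count bumped instead of a duplicate.
 */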
1402 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1403 struct winsys_handle *whandle,
1404 unsigned vm_alignment,
1405 unsigned *stride,
1406 unsigned *offset)
1407 {
1408 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1409 struct amdgpu_winsys_bo *bo = NULL;
1410 enum amdgpu_bo_handle_type type;
1411 struct amdgpu_bo_import_result result = {0};
1412 uint64_t va;
1413 amdgpu_va_handle va_handle = NULL;
1414 struct amdgpu_bo_info info = {0};
1415 enum radeon_bo_domain initial = 0;
1416 int r;
1417
1418 switch (whandle->type) {
1419 case WINSYS_HANDLE_TYPE_SHARED:
1420 type = amdgpu_bo_handle_type_gem_flink_name;
1421 break;
1422 case WINSYS_HANDLE_TYPE_FD:
1423 type = amdgpu_bo_handle_type_dma_buf_fd;
1424 break;
1425 default:
1426 return NULL;
1427 }
1428
1429 if (stride)
1430 *stride = whandle->stride;
1431 if (offset)
1432 *offset = whandle->offset;
1433
1434 r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
1435 if (r)
1436 return NULL;
1437
1438 simple_mtx_lock(&ws->bo_export_table_lock);
1439 bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
1440
1441 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1442 * counter and return it.
1443 */
1444 if (bo) {
1445 p_atomic_inc(&bo->base.reference.count);
1446 simple_mtx_unlock(&ws->bo_export_table_lock);
1447
1448 /* Release the buffer handle, because we don't need it anymore.
1449 * This function is returning an existing buffer, which has its own
1450 * handle.
1451 */
1452 amdgpu_bo_free(result.buf_handle);
1453 return &bo->base;
1454 }
1455
1456 /* Get initial domains. */
1457 r = amdgpu_bo_query_info(result.buf_handle, &info);
1458 if (r)
1459 goto error;
1460
1461 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1462 result.alloc_size,
1463 amdgpu_get_optimal_vm_alignment(ws, result.alloc_size,
1464 vm_alignment),
1465 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
1466 if (r)
1467 goto error;
1468
1469 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1470 if (!bo)
1471 goto error;
1472
1473 r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
1474 if (r)
1475 goto error;
1476
1477 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1478 initial |= RADEON_DOMAIN_VRAM;
1479 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1480 initial |= RADEON_DOMAIN_GTT;
1481
1482 /* Initialize the structure. */
1483 simple_mtx_init(&bo->lock, mtx_plain);
1484 pipe_reference_init(&bo->base.reference, 1);
1485 bo->base.alignment = info.phys_alignment;
1486 bo->bo = result.buf_handle;
1487 bo->base.size = result.alloc_size;
1488 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1489 bo->ws = ws;
1490 bo->va = va;
1491 bo->u.real.va_handle = va_handle;
1492 bo->initial_domain = initial;
1493 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1494 bo->is_shared = true;
1495
1496 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
1497 ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
1498 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
1499 ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
1500
1501 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1502
1503 amdgpu_add_buffer_to_global_list(bo);
1504
1505 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1506 simple_mtx_unlock(&ws->bo_export_table_lock);
1507
1508 return &bo->base;
1509
1510 error:
1511 simple_mtx_unlock(&ws->bo_export_table_lock);
1512 if (bo)
1513 FREE(bo);
1514 if (va_handle)
1515 amdgpu_va_range_free(va_handle);
1516 amdgpu_bo_free(result.buf_handle);
1517 return NULL;
1518 }
1519
1520 static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
1521 unsigned stride, unsigned offset,
1522 unsigned slice_size,
1523 struct winsys_handle *whandle)
1524 {
1525 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
1526 struct amdgpu_winsys *ws = bo->ws;
1527 enum amdgpu_bo_handle_type type;
1528 int r;
1529
1530 /* Don't allow exports of slab entries and sparse buffers. */
1531 if (!bo->bo)
1532 return false;
1533
1534 bo->u.real.use_reusable_pool = false;
1535
1536 switch (whandle->type) {
1537 case WINSYS_HANDLE_TYPE_SHARED:
1538 type = amdgpu_bo_handle_type_gem_flink_name;
1539 break;
1540 case WINSYS_HANDLE_TYPE_FD:
1541 type = amdgpu_bo_handle_type_dma_buf_fd;
1542 break;
1543 case WINSYS_HANDLE_TYPE_KMS:
1544 type = amdgpu_bo_handle_type_kms;
1545 break;
1546 default:
1547 return false;
1548 }
1549
1550 r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
1551 if (r)
1552 return false;
1553
1554 simple_mtx_lock(&ws->bo_export_table_lock);
1555 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1556 simple_mtx_unlock(&ws->bo_export_table_lock);
1557
1558 whandle->stride = stride;
1559 whandle->offset = offset;
1560 whandle->offset += slice_size * whandle->layer;
1561 bo->is_shared = true;
1562 return true;
1563 }
1564
1565 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1566 void *pointer, uint64_t size)
1567 {
1568 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1569 amdgpu_bo_handle buf_handle;
1570 struct amdgpu_winsys_bo *bo;
1571 uint64_t va;
1572 amdgpu_va_handle va_handle;
1573 /* Avoid failure when the size is not page aligned */
1574 uint64_t aligned_size = align64(size, ws->info.gart_page_size);
1575
1576 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1577 if (!bo)
1578 return NULL;
1579
1580 if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
1581 aligned_size, &buf_handle))
1582 goto error;
1583
1584 if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1585 aligned_size,
1586 amdgpu_get_optimal_vm_alignment(ws, aligned_size,
1587 ws->info.gart_page_size),
1588 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
1589 goto error_va_alloc;
1590
1591 if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
1592 goto error_va_map;
1593
1594 /* Initialize it. */
1595 bo->is_user_ptr = true;
1596 pipe_reference_init(&bo->base.reference, 1);
1597 simple_mtx_init(&bo->lock, mtx_plain);
1598 bo->bo = buf_handle;
1599 bo->base.alignment = 0;
1600 bo->base.size = size;
1601 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1602 bo->ws = ws;
1603 bo->cpu_ptr = pointer;
1604 bo->va = va;
1605 bo->u.real.va_handle = va_handle;
1606 bo->initial_domain = RADEON_DOMAIN_GTT;
1607 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1608
1609 ws->allocated_gtt += aligned_size;
1610
1611 amdgpu_add_buffer_to_global_list(bo);
1612
1613 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1614
1615 return (struct pb_buffer*)bo;
1616
1617 error_va_map:
1618 amdgpu_va_range_free(va_handle);
1619
1620 error_va_alloc:
1621 amdgpu_bo_free(buf_handle);
1622
1623 error:
1624 FREE(bo);
1625 return NULL;
1626 }
1627
1628 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
1629 {
1630 return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
1631 }
1632
1633 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
1634 {
1635 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1636
1637 return !bo->bo && !bo->sparse;
1638 }
1639
1640 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
1641 {
1642 return ((struct amdgpu_winsys_bo*)buf)->va;
1643 }
1644
1645 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
1646 {
1647 ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1648 ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1649 ws->base.buffer_map = amdgpu_bo_map;
1650 ws->base.buffer_unmap = amdgpu_bo_unmap;
1651 ws->base.buffer_wait = amdgpu_bo_wait;
1652 ws->base.buffer_create = amdgpu_bo_create;
1653 ws->base.buffer_from_handle = amdgpu_bo_from_handle;
1654 ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1655 ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1656 ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1657 ws->base.buffer_get_handle = amdgpu_bo_get_handle;
1658 ws->base.buffer_commit = amdgpu_bo_sparse_commit;
1659 ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1660 ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1661 }