winsys/amdgpu: make amdgpu_bo_unmap non-static
mesa.git: src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 #include "amdgpu_cs.h"
29
30 #include "util/hash_table.h"
31 #include "util/os_time.h"
32 #include "util/u_hash_table.h"
33 #include "frontend/drm_driver.h"
34 #include "drm-uapi/amdgpu_drm.h"
35 #include <xf86drm.h>
36 #include <stdio.h>
37 #include <inttypes.h>
38
39 #ifndef AMDGPU_VA_RANGE_HIGH
40 #define AMDGPU_VA_RANGE_HIGH 0x2
41 #endif
42
43 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
44 #define DEBUG_SPARSE_COMMITS 0
45
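/* A half-open range [begin, end) of currently free pages inside a sparse
 * backing buffer, in units of RADEON_SPARSE_PAGE_SIZE pages; see
 * sparse_backing_alloc() and sparse_backing_free() below.
 */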
46 struct amdgpu_sparse_backing_chunk {
47 uint32_t begin, end;
48 };
49
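/* Usage sketch (the buffer below is hypothetical): a timeout of 0 makes this
 * a non-blocking busy check, while PIPE_TIMEOUT_INFINITE blocks until the
 * buffer is idle:
 *
 *    if (!amdgpu_bo_wait(buf, 0, RADEON_USAGE_READWRITE))
 *       amdgpu_bo_wait(buf, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
 */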
50 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
51 enum radeon_bo_usage usage)
52 {
53 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
54 struct amdgpu_winsys *ws = bo->ws;
55 int64_t abs_timeout;
56
57 if (timeout == 0) {
58 if (p_atomic_read(&bo->num_active_ioctls))
59 return false;
60
61 } else {
62 abs_timeout = os_time_get_absolute_timeout(timeout);
63
64 /* Wait if any ioctl is being submitted with this buffer. */
65 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
66 return false;
67 }
68
69 if (bo->is_shared) {
70 /* We can't use user fences for shared buffers, because user fences
71 * are local to this process only. If we want to wait for all buffer
72 * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
73 */
74 bool buffer_busy = true;
75 int r;
76
77 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
78 if (r)
79 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
80 r);
81 return !buffer_busy;
82 }
83
84 if (timeout == 0) {
85 unsigned idle_fences;
86 bool buffer_idle;
87
88 simple_mtx_lock(&ws->bo_fence_lock);
89
90 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
91 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
92 break;
93 }
94
95 /* Release the idle fences to avoid checking them again later. */
96 for (unsigned i = 0; i < idle_fences; ++i)
97 amdgpu_fence_reference(&bo->fences[i], NULL);
98
99 memmove(&bo->fences[0], &bo->fences[idle_fences],
100 (bo->num_fences - idle_fences) * sizeof(*bo->fences));
101 bo->num_fences -= idle_fences;
102
103 buffer_idle = !bo->num_fences;
104 simple_mtx_unlock(&ws->bo_fence_lock);
105
106 return buffer_idle;
107 } else {
108 bool buffer_idle = true;
109
110 simple_mtx_lock(&ws->bo_fence_lock);
111 while (bo->num_fences && buffer_idle) {
112 struct pipe_fence_handle *fence = NULL;
113 bool fence_idle = false;
114
115 amdgpu_fence_reference(&fence, bo->fences[0]);
116
117 /* Wait for the fence. */
118 simple_mtx_unlock(&ws->bo_fence_lock);
119 if (amdgpu_fence_wait(fence, abs_timeout, true))
120 fence_idle = true;
121 else
122 buffer_idle = false;
123 simple_mtx_lock(&ws->bo_fence_lock);
124
125 /* Release an idle fence to avoid checking it again later, keeping in
126 * mind that the fence array may have been modified by other threads.
127 */
128 if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
129 amdgpu_fence_reference(&bo->fences[0], NULL);
130 memmove(&bo->fences[0], &bo->fences[1],
131 (bo->num_fences - 1) * sizeof(*bo->fences));
132 bo->num_fences--;
133 }
134
135 amdgpu_fence_reference(&fence, NULL);
136 }
137 simple_mtx_unlock(&ws->bo_fence_lock);
138
139 return buffer_idle;
140 }
141 }
142
143 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
144 struct pb_buffer *buf)
145 {
146 return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
147 }
148
149 static enum radeon_bo_flag amdgpu_bo_get_flags(
150 struct pb_buffer *buf)
151 {
152 return ((struct amdgpu_winsys_bo*)buf)->flags;
153 }
154
155 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
156 {
157 for (unsigned i = 0; i < bo->num_fences; ++i)
158 amdgpu_fence_reference(&bo->fences[i], NULL);
159
160 FREE(bo->fences);
161 bo->num_fences = 0;
162 bo->max_fences = 0;
163 }
164
165 void amdgpu_bo_destroy(struct pb_buffer *_buf)
166 {
167 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
168 struct amdgpu_screen_winsys *sws_iter;
169 struct amdgpu_winsys *ws = bo->ws;
170
171 assert(bo->bo && "must not be called for slab entries");
172
173 if (!bo->is_user_ptr && bo->cpu_ptr) {
174 bo->cpu_ptr = NULL;
175 amdgpu_bo_unmap(&bo->base);
176 }
177 assert(bo->is_user_ptr || bo->u.real.map_count == 0);
178
179 if (ws->debug_all_bos) {
180 simple_mtx_lock(&ws->global_bo_list_lock);
181 list_del(&bo->u.real.global_list_item);
182 ws->num_buffers--;
183 simple_mtx_unlock(&ws->global_bo_list_lock);
184 }
185
186 /* Close all KMS handles retrieved for other DRM file descriptions */
187 simple_mtx_lock(&ws->sws_list_lock);
188 for (sws_iter = ws->sws_list; sws_iter; sws_iter = sws_iter->next) {
189 struct hash_entry *entry;
190
191 if (!sws_iter->kms_handles)
192 continue;
193
194 entry = _mesa_hash_table_search(sws_iter->kms_handles, bo);
195 if (entry) {
196 struct drm_gem_close args = { .handle = (uintptr_t)entry->data };
197
198 drmIoctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args);
199 _mesa_hash_table_remove(sws_iter->kms_handles, entry);
200 }
201 }
202 simple_mtx_unlock(&ws->sws_list_lock);
203
204 simple_mtx_lock(&ws->bo_export_table_lock);
205 _mesa_hash_table_remove_key(ws->bo_export_table, bo->bo);
206 simple_mtx_unlock(&ws->bo_export_table_lock);
207
208 if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) {
209 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
210 amdgpu_va_range_free(bo->u.real.va_handle);
211 }
212 amdgpu_bo_free(bo->bo);
213
214 amdgpu_bo_remove_fences(bo);
215
216 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
217 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
218 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
219 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
220
221 simple_mtx_destroy(&bo->lock);
222 FREE(bo);
223 }
224
225 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
226 {
227 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
228
229 assert(bo->bo); /* slab buffers have a separate vtbl */
230
231 if (bo->u.real.use_reusable_pool)
232 pb_cache_add_buffer(&bo->u.real.cache_entry);
233 else
234 amdgpu_bo_destroy(_buf);
235 }
236
237 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
238 {
239 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
240 pb_slabs_reclaim(&ws->bo_slabs[i]);
241 if (ws->secure)
242 pb_slabs_reclaim(&ws->bo_slabs_encrypted[i]);
243 }
244
245 pb_cache_release_all_buffers(&ws->bo_cache);
246 }
247
248 static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
249 {
250 assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
251 int r = amdgpu_bo_cpu_map(bo->bo, cpu);
252 if (r) {
253 /* Clean up buffer managers and try again. */
254 amdgpu_clean_up_buffer_managers(bo->ws);
255 r = amdgpu_bo_cpu_map(bo->bo, cpu);
256 if (r)
257 return false;
258 }
259
260 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
261 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
262 bo->ws->mapped_vram += bo->base.size;
263 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
264 bo->ws->mapped_gtt += bo->base.size;
265 bo->ws->num_mapped_buffers++;
266 }
267
268 return true;
269 }
270
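/* Usage sketch (buf, data and size are hypothetical): a plain mapping is
 * cached in the buffer and stays mapped until the buffer is destroyed, while
 * RADEON_TRANSFER_TEMPORARY mappings must be balanced by an unmap:
 *
 *    void *ptr = amdgpu_bo_map(buf, NULL, PIPE_TRANSFER_WRITE |
 *                                         RADEON_TRANSFER_TEMPORARY);
 *    if (ptr) {
 *       memcpy(ptr, data, size);
 *       amdgpu_bo_unmap(buf);
 *    }
 */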
271 void *amdgpu_bo_map(struct pb_buffer *buf,
272 struct radeon_cmdbuf *rcs,
273 enum pipe_transfer_usage usage)
274 {
275 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
276 struct amdgpu_winsys_bo *real;
277 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
278
279 assert(!bo->sparse);
280
281 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
282 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
283 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
284 if (usage & PIPE_TRANSFER_DONTBLOCK) {
285 if (!(usage & PIPE_TRANSFER_WRITE)) {
286 /* Mapping for read.
287 *
288 * Since we are mapping for read, we don't need to wait
289 * if the GPU is using the buffer for read too
290 * (neither one is changing it).
291 *
292 * Only check whether the buffer is being used for write. */
293 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
294 RADEON_USAGE_WRITE)) {
295 cs->flush_cs(cs->flush_data,
296 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
297 return NULL;
298 }
299
300 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
301 RADEON_USAGE_WRITE)) {
302 return NULL;
303 }
304 } else {
305 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
306 cs->flush_cs(cs->flush_data,
307 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
308 return NULL;
309 }
310
311 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
312 RADEON_USAGE_READWRITE)) {
313 return NULL;
314 }
315 }
316 } else {
317 uint64_t time = os_time_get_nano();
318
319 if (!(usage & PIPE_TRANSFER_WRITE)) {
320 /* Mapping for read.
321 *
322 * Since we are mapping for read, we don't need to wait
323 * if the GPU is using the buffer for read too
324 * (neither one is changing it).
325 *
326 * Only check whether the buffer is being used for write. */
327 if (cs) {
328 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
329 RADEON_USAGE_WRITE)) {
330 cs->flush_cs(cs->flush_data,
331 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
332 } else {
333 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
334 if (p_atomic_read(&bo->num_active_ioctls))
335 amdgpu_cs_sync_flush(rcs);
336 }
337 }
338
339 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
340 RADEON_USAGE_WRITE);
341 } else {
342 /* Mapping for write. */
343 if (cs) {
344 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
345 cs->flush_cs(cs->flush_data,
346 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
347 } else {
348 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
349 if (p_atomic_read(&bo->num_active_ioctls))
350 amdgpu_cs_sync_flush(rcs);
351 }
352 }
353
354 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
355 RADEON_USAGE_READWRITE);
356 }
357
358 bo->ws->buffer_wait_time += os_time_get_nano() - time;
359 }
360 }
361
362 /* Buffer synchronization has been checked, now actually map the buffer. */
363 void *cpu = NULL;
364 uint64_t offset = 0;
365
366 if (bo->bo) {
367 real = bo;
368 } else {
369 real = bo->u.slab.real;
370 offset = bo->va - real->va;
371 }
372
373 if (usage & RADEON_TRANSFER_TEMPORARY) {
374 if (real->is_user_ptr) {
375 cpu = real->cpu_ptr;
376 } else {
377 if (!amdgpu_bo_do_map(real, &cpu))
378 return NULL;
379 }
380 } else {
381 cpu = p_atomic_read(&real->cpu_ptr);
382 if (!cpu) {
383 simple_mtx_lock(&real->lock);
384 /* Must re-check due to the possibility of a race. Re-check need not
385 * be atomic thanks to the lock. */
386 cpu = real->cpu_ptr;
387 if (!cpu) {
388 if (!amdgpu_bo_do_map(real, &cpu)) {
389 simple_mtx_unlock(&real->lock);
390 return NULL;
391 }
392 p_atomic_set(&real->cpu_ptr, cpu);
393 }
394 simple_mtx_unlock(&real->lock);
395 }
396 }
397
398 return (uint8_t*)cpu + offset;
399 }
400
401 void amdgpu_bo_unmap(struct pb_buffer *buf)
402 {
403 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
404 struct amdgpu_winsys_bo *real;
405
406 assert(!bo->sparse);
407
408 if (bo->is_user_ptr)
409 return;
410
411 real = bo->bo ? bo : bo->u.slab.real;
412 assert(real->u.real.map_count != 0 && "too many unmaps");
413 if (p_atomic_dec_zero(&real->u.real.map_count)) {
414 assert(!real->cpu_ptr &&
415 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
416
417 if (real->initial_domain & RADEON_DOMAIN_VRAM)
418 real->ws->mapped_vram -= real->base.size;
419 else if (real->initial_domain & RADEON_DOMAIN_GTT)
420 real->ws->mapped_gtt -= real->base.size;
421 real->ws->num_mapped_buffers--;
422 }
423
424 amdgpu_bo_cpu_unmap(real->bo);
425 }
426
427 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
428 amdgpu_bo_destroy_or_cache
429 /* other functions are never called */
430 };
431
432 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
433 {
434 struct amdgpu_winsys *ws = bo->ws;
435
436 assert(bo->bo);
437
438 if (ws->debug_all_bos) {
439 simple_mtx_lock(&ws->global_bo_list_lock);
440 list_addtail(&bo->u.real.global_list_item, &ws->global_bo_list);
441 ws->num_buffers++;
442 simple_mtx_unlock(&ws->global_bo_list_lock);
443 }
444 }
445
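/* Worked example (the numbers are illustrative): for a 192 KiB allocation
 * with 4 KiB alignment on GFX9, assuming a 2 MiB PTE fragment size, the size
 * is below the fragment size, so only the MSB rule applies:
 * util_last_bit64(0x30000) = 18, hence msb_alignment = 1 << 17 = 128 KiB,
 * which becomes the returned VM alignment.
 */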
446 static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws,
447 uint64_t size, unsigned alignment)
448 {
449 uint64_t vm_alignment = alignment;
450
451 /* Increase the VM alignment for faster address translation. */
452 if (size >= ws->info.pte_fragment_size)
453 vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);
454
455 /* Gfx9: Increase the VM alignment to the most significant bit set
456 * in the size for faster address translation.
457 */
458 if (ws->info.chip_class >= GFX9) {
459 unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
460 uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;
461
462 vm_alignment = MAX2(vm_alignment, msb_alignment);
463 }
464 return vm_alignment;
465 }
466
467 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
468 uint64_t size,
469 unsigned alignment,
470 enum radeon_bo_domain initial_domain,
471 unsigned flags,
472 int heap)
473 {
474 struct amdgpu_bo_alloc_request request = {0};
475 amdgpu_bo_handle buf_handle;
476 uint64_t va = 0;
477 struct amdgpu_winsys_bo *bo;
478 amdgpu_va_handle va_handle = NULL;
479 int r;
480
481 /* Exactly one of VRAM, GTT, GDS or OA must be specified. */
482 assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
483 RADEON_DOMAIN_GDS |
484 RADEON_DOMAIN_OA)) == 1);
485
486 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
487 if (!bo) {
488 return NULL;
489 }
490
491 if (heap >= 0) {
492 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
493 heap);
494 }
495 request.alloc_size = size;
496 request.phys_alignment = alignment;
497
498 if (initial_domain & RADEON_DOMAIN_VRAM) {
499 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
500
501 /* Since VRAM and GTT have almost the same performance on APUs, we could
502 * just set GTT. However, in order to decrease GTT(RAM) usage, which is
503 * shared with the OS, allow VRAM placements too. The point is not that
504 * VRAM is more useful here, but that it would otherwise sit unused and wasted.
505 */
506 if (!ws->info.has_dedicated_vram)
507 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
508 }
509
510 if (initial_domain & RADEON_DOMAIN_GTT)
511 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
512 if (initial_domain & RADEON_DOMAIN_GDS)
513 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
514 if (initial_domain & RADEON_DOMAIN_OA)
515 request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
516
517 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
518 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
519 if (flags & RADEON_FLAG_GTT_WC)
520 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
521 if (ws->zero_all_vram_allocs &&
522 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
523 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
524 if ((flags & RADEON_FLAG_ENCRYPTED) && ws->secure)
525 request.flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
526
527 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
528 if (r) {
529 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
530 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
531 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
532 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
533 fprintf(stderr, "amdgpu: flags : %" PRIx64 "\n", request.flags);
534 goto error_bo_alloc;
535 }
536
537 if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
538 unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
539
540 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
541 size + va_gap_size,
542 amdgpu_get_optimal_vm_alignment(ws, size, alignment),
543 0, &va, &va_handle,
544 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
545 AMDGPU_VA_RANGE_HIGH);
546 if (r)
547 goto error_va_alloc;
548
549 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
550 AMDGPU_VM_PAGE_EXECUTABLE;
551
552 if (!(flags & RADEON_FLAG_READ_ONLY))
553 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
554
555 if (flags & RADEON_FLAG_UNCACHED)
556 vm_flags |= AMDGPU_VM_MTYPE_UC;
557
558 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
559 AMDGPU_VA_OP_MAP);
560 if (r)
561 goto error_va_map;
562 }
563
564 simple_mtx_init(&bo->lock, mtx_plain);
565 pipe_reference_init(&bo->base.reference, 1);
566 bo->base.alignment = alignment;
567 bo->base.usage = 0;
568 bo->base.size = size;
569 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
570 bo->ws = ws;
571 bo->bo = buf_handle;
572 bo->va = va;
573 bo->u.real.va_handle = va_handle;
574 bo->initial_domain = initial_domain;
575 bo->flags = flags;
576 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
577
578 if (initial_domain & RADEON_DOMAIN_VRAM)
579 ws->allocated_vram += align64(size, ws->info.gart_page_size);
580 else if (initial_domain & RADEON_DOMAIN_GTT)
581 ws->allocated_gtt += align64(size, ws->info.gart_page_size);
582
583 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
584
585 amdgpu_add_buffer_to_global_list(bo);
586
587 return bo;
588
589 error_va_map:
590 amdgpu_va_range_free(va_handle);
591
592 error_va_alloc:
593 amdgpu_bo_free(buf_handle);
594
595 error_bo_alloc:
596 FREE(bo);
597 return NULL;
598 }
599
600 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
601 {
602 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
603
604 if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
605 return false;
606 }
607
608 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
609 }
610
611 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
612 {
613 struct amdgpu_winsys_bo *bo = NULL; /* NULL is fine: container_of only needs the pointer's type */
614 bo = container_of(entry, bo, u.slab.entry);
615
616 return amdgpu_bo_can_reclaim(&bo->base);
617 }
618
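/* Worked example (the orders are illustrative): a slab allocator with
 * min_order = 8 and num_orders = 5 serves entries up to
 * 1 << (8 + 5 - 1) = 4096 bytes, so a 3000-byte request is handled by it,
 * while a 5000-byte request falls through to the next, larger allocator.
 */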
619 static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size,
620 enum radeon_bo_flag flags)
621 {
622 struct pb_slabs *bo_slabs = ((flags & RADEON_FLAG_ENCRYPTED) && ws->secure) ?
623 ws->bo_slabs_encrypted : ws->bo_slabs;
624 /* Find the correct slab allocator for the given size. */
625 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
626 struct pb_slabs *slabs = &bo_slabs[i];
627
628 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
629 return slabs;
630 }
631
632 assert(0);
633 return NULL;
634 }
635
636 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
637 {
638 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
639
640 assert(!bo->bo);
641
642 if (bo->flags & RADEON_FLAG_ENCRYPTED)
643 pb_slab_free(get_slabs(bo->ws,
644 bo->base.size,
645 RADEON_FLAG_ENCRYPTED), &bo->u.slab.entry);
646 else
647 pb_slab_free(get_slabs(bo->ws,
648 bo->base.size,
649 0), &bo->u.slab.entry);
650 }
651
652 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
653 amdgpu_bo_slab_destroy
654 /* other functions are never called */
655 };
656
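/* Illustrative sizing: the slab buffer is twice the largest entry its
 * allocator can hand out (e.g. 8 KiB slabs for 4 KiB entries), except that
 * the last, largest allocator rounds its slabs up to the PTE fragment size
 * for faster address translation.
 */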
657 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
658 unsigned entry_size,
659 unsigned group_index,
660 bool encrypted)
661 {
662 struct amdgpu_winsys *ws = priv;
663 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
664 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
665 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
666 uint32_t base_id;
667 unsigned slab_size = 0;
668
669 if (!slab)
670 return NULL;
671
672 if (encrypted)
673 flags |= RADEON_FLAG_ENCRYPTED;
674
675 struct pb_slabs *slabs = (flags & RADEON_FLAG_ENCRYPTED && ws->secure) ?
676 ws->bo_slabs_encrypted : ws->bo_slabs;
677
678 /* Determine the slab buffer size. */
679 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
680 unsigned max_entry_size = 1 << (slabs[i].min_order + slabs[i].num_orders - 1);
681
682 if (entry_size <= max_entry_size) {
683 /* The slab size is twice the size of the largest possible entry. */
684 slab_size = max_entry_size * 2;
685
686 /* The largest slab should have the same size as the PTE fragment
687 * size to get faster address translation.
688 */
689 if (i == NUM_SLAB_ALLOCATORS - 1 &&
690 slab_size < ws->info.pte_fragment_size)
691 slab_size = ws->info.pte_fragment_size;
692 break;
693 }
694 }
695 assert(slab_size != 0);
696
697 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
698 slab_size, slab_size,
699 domains, flags));
700 if (!slab->buffer)
701 goto fail;
702
703 slab->base.num_entries = slab->buffer->base.size / entry_size;
704 slab->base.num_free = slab->base.num_entries;
705 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
706 if (!slab->entries)
707 goto fail_buffer;
708
709 list_inithead(&slab->base.free);
710
711 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
712
713 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
714 struct amdgpu_winsys_bo *bo = &slab->entries[i];
715
716 simple_mtx_init(&bo->lock, mtx_plain);
717 bo->base.alignment = entry_size;
718 bo->base.usage = slab->buffer->base.usage;
719 bo->base.size = entry_size;
720 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
721 bo->ws = ws;
722 bo->va = slab->buffer->va + i * entry_size;
723 bo->initial_domain = domains;
724 bo->unique_id = base_id + i;
725 bo->u.slab.entry.slab = &slab->base;
726 bo->u.slab.entry.group_index = group_index;
727
728 if (slab->buffer->bo) {
729 /* The slab is not suballocated. */
730 bo->u.slab.real = slab->buffer;
731 } else {
732 /* The slab is allocated out of a bigger slab. */
733 bo->u.slab.real = slab->buffer->u.slab.real;
734 assert(bo->u.slab.real->bo);
735 }
736
737 list_addtail(&bo->u.slab.entry.head, &slab->base.free);
738 }
739
740 return &slab->base;
741
742 fail_buffer:
743 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
744 fail:
745 FREE(slab);
746 return NULL;
747 }
748
749 struct pb_slab *amdgpu_bo_slab_alloc_encrypted(void *priv, unsigned heap,
750 unsigned entry_size,
751 unsigned group_index)
752 {
753 return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, true);
754 }
755
756 struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
757 unsigned entry_size,
758 unsigned group_index)
759 {
760 return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, false);
761 }
762
763 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
764 {
765 struct amdgpu_slab *slab = amdgpu_slab(pslab);
766
767 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
768 amdgpu_bo_remove_fences(&slab->entries[i]);
769 simple_mtx_destroy(&slab->entries[i].lock);
770 }
771
772 FREE(slab->entries);
773 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
774 FREE(slab);
775 }
776
777 #if DEBUG_SPARSE_COMMITS
778 static void
779 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
780 {
781 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
782 "Commitments:\n",
783 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
784
785 struct amdgpu_sparse_backing *span_backing = NULL;
786 uint32_t span_first_backing_page = 0;
787 uint32_t span_first_va_page = 0;
788 uint32_t va_page = 0;
789
790 for (;;) {
791 struct amdgpu_sparse_backing *backing = 0;
792 uint32_t backing_page = 0;
793
794 if (va_page < bo->u.sparse.num_va_pages) {
795 backing = bo->u.sparse.commitments[va_page].backing;
796 backing_page = bo->u.sparse.commitments[va_page].page;
797 }
798
799 if (span_backing &&
800 (backing != span_backing ||
801 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
802 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
803 span_first_va_page, va_page - 1, span_backing,
804 span_first_backing_page,
805 span_first_backing_page + (va_page - span_first_va_page) - 1);
806
807 span_backing = NULL;
808 }
809
810 if (va_page >= bo->u.sparse.num_va_pages)
811 break;
812
813 if (backing && !span_backing) {
814 span_backing = backing;
815 span_first_backing_page = backing_page;
816 span_first_va_page = va_page;
817 }
818
819 va_page++;
820 }
821
822 fprintf(stderr, "Backing:\n");
823
824 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
825 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
826 for (unsigned i = 0; i < backing->num_chunks; ++i)
827 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
828 }
829 }
830 #endif
831
832 /*
833 * Attempt to allocate the given number of backing pages. Fewer pages may be
834 * allocated (depending on the fragmentation of existing backing buffers),
835 * which will be reflected by a change to *pnum_pages.
836 */
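/* Worked example: if the caller asks for *pnum_pages = 300 but the best free
 * chunk only spans pages [0, 256), this returns that backing buffer with
 * *pstart_page = 0 and *pnum_pages clamped to 256; amdgpu_bo_sparse_commit
 * below simply keeps calling until the whole span is covered.
 */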
837 static struct amdgpu_sparse_backing *
838 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
839 {
840 struct amdgpu_sparse_backing *best_backing;
841 unsigned best_idx;
842 uint32_t best_num_pages;
843
844 best_backing = NULL;
845 best_idx = 0;
846 best_num_pages = 0;
847
848 /* This is a very simple and inefficient best-fit algorithm. */
849 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
850 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
851 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
852 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
853 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
854 best_backing = backing;
855 best_idx = idx;
856 best_num_pages = cur_num_pages;
857 }
858 }
859 }
860
861 /* Allocate a new backing buffer if necessary. */
862 if (!best_backing) {
863 struct pb_buffer *buf;
864 uint64_t size;
865 uint32_t pages;
866
867 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
868 if (!best_backing)
869 return NULL;
870
871 best_backing->max_chunks = 4;
872 best_backing->chunks = CALLOC(best_backing->max_chunks,
873 sizeof(*best_backing->chunks));
874 if (!best_backing->chunks) {
875 FREE(best_backing);
876 return NULL;
877 }
878
879 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
880
881 size = MIN3(bo->base.size / 16,
882 8 * 1024 * 1024,
883 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
884 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
885
886 buf = amdgpu_bo_create(bo->ws, size, RADEON_SPARSE_PAGE_SIZE,
887 bo->initial_domain,
888 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
889 if (!buf) {
890 FREE(best_backing->chunks);
891 FREE(best_backing);
892 return NULL;
893 }
894
895 /* We might have gotten a bigger buffer than requested via caching. */
896 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
897
898 best_backing->bo = amdgpu_winsys_bo(buf);
899 best_backing->num_chunks = 1;
900 best_backing->chunks[0].begin = 0;
901 best_backing->chunks[0].end = pages;
902
903 list_add(&best_backing->list, &bo->u.sparse.backing);
904 bo->u.sparse.num_backing_pages += pages;
905
906 best_idx = 0;
907 best_num_pages = pages;
908 }
909
910 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
911 *pstart_page = best_backing->chunks[best_idx].begin;
912 best_backing->chunks[best_idx].begin += *pnum_pages;
913
914 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
915 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
916 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
917 best_backing->num_chunks--;
918 }
919
920 return best_backing;
921 }
922
923 static void
924 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
925 struct amdgpu_sparse_backing *backing)
926 {
927 struct amdgpu_winsys *ws = backing->bo->ws;
928
929 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
930
931 simple_mtx_lock(&ws->bo_fence_lock);
932 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
933 simple_mtx_unlock(&ws->bo_fence_lock);
934
935 list_del(&backing->list);
936 amdgpu_winsys_bo_reference(&backing->bo, NULL);
937 FREE(backing->chunks);
938 FREE(backing);
939 }
940
941 /*
942 * Return a range of pages from the given backing buffer back into the
943 * free structure.
944 */
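/* Worked example: with free chunks [0, 4) and [8, 12), freeing pages [4, 8)
 * first extends the left chunk to [0, 8) and then merges it with the right
 * one into [0, 12); once all pages of the backing buffer are free again,
 * sparse_free_backing_buffer() releases the buffer itself.
 */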
945 static bool
946 sparse_backing_free(struct amdgpu_winsys_bo *bo,
947 struct amdgpu_sparse_backing *backing,
948 uint32_t start_page, uint32_t num_pages)
949 {
950 uint32_t end_page = start_page + num_pages;
951 unsigned low = 0;
952 unsigned high = backing->num_chunks;
953
954 /* Find the first chunk with begin >= start_page. */
955 while (low < high) {
956 unsigned mid = low + (high - low) / 2;
957
958 if (backing->chunks[mid].begin >= start_page)
959 high = mid;
960 else
961 low = mid + 1;
962 }
963
964 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
965 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
966
967 if (low > 0 && backing->chunks[low - 1].end == start_page) {
968 backing->chunks[low - 1].end = end_page;
969
970 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
971 backing->chunks[low - 1].end = backing->chunks[low].end;
972 memmove(&backing->chunks[low], &backing->chunks[low + 1],
973 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
974 backing->num_chunks--;
975 }
976 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
977 backing->chunks[low].begin = start_page;
978 } else {
979 if (backing->num_chunks >= backing->max_chunks) {
980 unsigned new_max_chunks = 2 * backing->max_chunks;
981 struct amdgpu_sparse_backing_chunk *new_chunks =
982 REALLOC(backing->chunks,
983 sizeof(*backing->chunks) * backing->max_chunks,
984 sizeof(*backing->chunks) * new_max_chunks);
985 if (!new_chunks)
986 return false;
987
988 backing->max_chunks = new_max_chunks;
989 backing->chunks = new_chunks;
990 }
991
992 memmove(&backing->chunks[low + 1], &backing->chunks[low],
993 sizeof(*backing->chunks) * (backing->num_chunks - low));
994 backing->chunks[low].begin = start_page;
995 backing->chunks[low].end = end_page;
996 backing->num_chunks++;
997 }
998
999 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
1000 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
1001 sparse_free_backing_buffer(bo, backing);
1002
1003 return true;
1004 }
1005
1006 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
1007 {
1008 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1009 int r;
1010
1011 assert(!bo->bo && bo->sparse);
1012
1013 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1014 (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
1015 bo->va, 0, AMDGPU_VA_OP_CLEAR);
1016 if (r) {
1017 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
1018 }
1019
1020 while (!list_is_empty(&bo->u.sparse.backing)) {
1021 struct amdgpu_sparse_backing *dummy = NULL;
1022 sparse_free_backing_buffer(bo,
1023 container_of(bo->u.sparse.backing.next,
1024 dummy, list));
1025 }
1026
1027 amdgpu_va_range_free(bo->u.sparse.va_handle);
1028 FREE(bo->u.sparse.commitments);
1029 simple_mtx_destroy(&bo->lock);
1030 FREE(bo);
1031 }
1032
1033 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
1034 amdgpu_bo_sparse_destroy
1035 /* other functions are never called */
1036 };
1037
1038 static struct pb_buffer *
1039 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
1040 enum radeon_bo_domain domain,
1041 enum radeon_bo_flag flags)
1042 {
1043 struct amdgpu_winsys_bo *bo;
1044 uint64_t map_size;
1045 uint64_t va_gap_size;
1046 int r;
1047
1048 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
1049 * that exceed this limit. This is not really a restriction: we don't have
1050 * that much virtual address space anyway.
1051 */
1052 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
1053 return NULL;
1054
1055 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1056 if (!bo)
1057 return NULL;
1058
1059 simple_mtx_init(&bo->lock, mtx_plain);
1060 pipe_reference_init(&bo->base.reference, 1);
1061 bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
1062 bo->base.size = size;
1063 bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
1064 bo->ws = ws;
1065 bo->initial_domain = domain;
1066 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1067 bo->sparse = true;
1068 bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
1069
1070 bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1071 bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
1072 sizeof(*bo->u.sparse.commitments));
1073 if (!bo->u.sparse.commitments)
1074 goto error_alloc_commitments;
1075
1076 list_inithead(&bo->u.sparse.backing);
1077
1078 /* For simplicity, we always map a multiple of the page size. */
1079 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
1080 va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
1081 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1082 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
1083 0, &bo->va, &bo->u.sparse.va_handle,
1084 AMDGPU_VA_RANGE_HIGH);
1085 if (r)
1086 goto error_va_alloc;
1087
1088 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
1089 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
1090 if (r)
1091 goto error_va_map;
1092
1093 return &bo->base;
1094
1095 error_va_map:
1096 amdgpu_va_range_free(bo->u.sparse.va_handle);
1097 error_va_alloc:
1098 FREE(bo->u.sparse.commitments);
1099 error_alloc_commitments:
1100 simple_mtx_destroy(&bo->lock);
1101 FREE(bo);
1102 return NULL;
1103 }
1104
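/* Usage sketch (rws, buf and the sizes are hypothetical): drivers reach this
 * function through the buffer_commit winsys hook to commit or decommit
 * page-aligned ranges of a sparse buffer:
 *
 *    struct pb_buffer *buf =
 *       rws->buffer_create(rws, 1024 * 1024 * 1024, RADEON_SPARSE_PAGE_SIZE,
 *                          RADEON_DOMAIN_VRAM,
 *                          RADEON_FLAG_SPARSE | RADEON_FLAG_NO_CPU_ACCESS |
 *                          RADEON_FLAG_GTT_WC);
 *    rws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, true);
 *    ...
 *    rws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, false);
 */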
1105 static bool
1106 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
1107 bool commit)
1108 {
1109 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1110 struct amdgpu_sparse_commitment *comm;
1111 uint32_t va_page, end_va_page;
1112 bool ok = true;
1113 int r;
1114
1115 assert(bo->sparse);
1116 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1117 assert(offset <= bo->base.size);
1118 assert(size <= bo->base.size - offset);
1119 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
1120
1121 comm = bo->u.sparse.commitments;
1122 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1123 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1124
1125 simple_mtx_lock(&bo->lock);
1126
1127 #if DEBUG_SPARSE_COMMITS
1128 sparse_dump(bo, __func__);
1129 #endif
1130
1131 if (commit) {
1132 while (va_page < end_va_page) {
1133 uint32_t span_va_page;
1134
1135 /* Skip pages that are already committed. */
1136 if (comm[va_page].backing) {
1137 va_page++;
1138 continue;
1139 }
1140
1141 /* Determine length of uncommitted span. */
1142 span_va_page = va_page;
1143 while (va_page < end_va_page && !comm[va_page].backing)
1144 va_page++;
1145
1146 /* Fill the uncommitted span with chunks of backing memory. */
1147 while (span_va_page < va_page) {
1148 struct amdgpu_sparse_backing *backing;
1149 uint32_t backing_start, backing_size;
1150
1151 backing_size = va_page - span_va_page;
1152 backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
1153 if (!backing) {
1154 ok = false;
1155 goto out;
1156 }
1157
1158 r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
1159 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1160 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1161 bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1162 AMDGPU_VM_PAGE_READABLE |
1163 AMDGPU_VM_PAGE_WRITEABLE |
1164 AMDGPU_VM_PAGE_EXECUTABLE,
1165 AMDGPU_VA_OP_REPLACE);
1166 if (r) {
1167 ok = sparse_backing_free(bo, backing, backing_start, backing_size);
1168 assert(ok && "sufficient memory should already be allocated");
1169
1170 ok = false;
1171 goto out;
1172 }
1173
1174 while (backing_size) {
1175 comm[span_va_page].backing = backing;
1176 comm[span_va_page].page = backing_start;
1177 span_va_page++;
1178 backing_start++;
1179 backing_size--;
1180 }
1181 }
1182 }
1183 } else {
1184 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1185 (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1186 bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1187 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1188 if (r) {
1189 ok = false;
1190 goto out;
1191 }
1192
1193 while (va_page < end_va_page) {
1194 struct amdgpu_sparse_backing *backing;
1195 uint32_t backing_start;
1196 uint32_t span_pages;
1197
1198 /* Skip pages that are already uncommitted. */
1199 if (!comm[va_page].backing) {
1200 va_page++;
1201 continue;
1202 }
1203
1204 /* Group contiguous spans of pages. */
1205 backing = comm[va_page].backing;
1206 backing_start = comm[va_page].page;
1207 comm[va_page].backing = NULL;
1208
1209 span_pages = 1;
1210 va_page++;
1211
1212 while (va_page < end_va_page &&
1213 comm[va_page].backing == backing &&
1214 comm[va_page].page == backing_start + span_pages) {
1215 comm[va_page].backing = NULL;
1216 va_page++;
1217 span_pages++;
1218 }
1219
1220 if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
1221 /* Couldn't allocate tracking data structures, so we have to leak */
1222 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1223 ok = false;
1224 }
1225 }
1226 }
1227 out:
1228
1229 simple_mtx_unlock(&bo->lock);
1230
1231 return ok;
1232 }
1233
1234 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
1235 struct radeon_bo_metadata *md,
1236 struct radeon_surf *surf)
1237 {
1238 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1239 struct amdgpu_bo_info info = {0};
1240 int r;
1241
1242 assert(bo->bo && "must not be called for slab entries");
1243
1244 r = amdgpu_bo_query_info(bo->bo, &info);
1245 if (r)
1246 return;
1247
1248 ac_surface_set_bo_metadata(&bo->ws->info, surf, info.metadata.tiling_info,
1249 &md->mode);
1250
1251 md->size_metadata = info.metadata.size_metadata;
1252 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1253 }
1254
1255 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
1256 struct radeon_bo_metadata *md,
1257 struct radeon_surf *surf)
1258 {
1259 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1260 struct amdgpu_bo_metadata metadata = {0};
1261
1262 assert(bo->bo && "must not be called for slab entries");
1263
1264 ac_surface_get_bo_metadata(&bo->ws->info, surf, &metadata.tiling_info);
1265
1266 metadata.size_metadata = md->size_metadata;
1267 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1268
1269 amdgpu_bo_set_metadata(bo->bo, &metadata);
1270 }
1271
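/* Allocation strategy, in order: suballocate small buffers from a slab when
 * allowed; create a sparse buffer when RADEON_FLAG_SPARSE is set; otherwise
 * try the reusable cache (only for RADEON_FLAG_NO_INTERPROCESS_SHARING
 * buffers) and finally allocate a fresh BO, cleaning up the buffer managers
 * and retrying once on failure.
 */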
1272 struct pb_buffer *
1273 amdgpu_bo_create(struct amdgpu_winsys *ws,
1274 uint64_t size,
1275 unsigned alignment,
1276 enum radeon_bo_domain domain,
1277 enum radeon_bo_flag flags)
1278 {
1279 struct amdgpu_winsys_bo *bo;
1280 int heap = -1;
1281
1282 if (domain & (RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA))
1283 flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_SUBALLOC;
1284
1285 /* VRAM implies WC. This is not optional. */
1286 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
1287
1288 /* NO_CPU_ACCESS is not valid with GTT. */
1289 assert(!(domain & RADEON_DOMAIN_GTT) || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
1290
1291 /* Sparse buffers must have NO_CPU_ACCESS set. */
1292 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
1293
1294 struct pb_slabs *slabs = (flags & RADEON_FLAG_ENCRYPTED && ws->secure) ?
1295 ws->bo_slabs_encrypted : ws->bo_slabs;
1296 struct pb_slabs *last_slab = &slabs[NUM_SLAB_ALLOCATORS - 1];
1297 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
1298
1299 /* Sub-allocate small buffers from slabs. */
1300 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
1301 size <= max_slab_entry_size &&
1302 /* The alignment must be at most the size of the smallest slab entry or
1303 * the next power of two of the buffer size. */
1304 alignment <= MAX2(1 << slabs[0].min_order, util_next_power_of_two(size))) {
1305 struct pb_slab_entry *entry;
1306 int heap = radeon_get_heap_index(domain, flags);
1307
1308 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
1309 goto no_slab;
1310
1311 struct pb_slabs *slabs = get_slabs(ws, size, flags);
1312 entry = pb_slab_alloc(slabs, size, heap);
1313 if (!entry) {
1314 /* Clean up buffer managers and try again. */
1315 amdgpu_clean_up_buffer_managers(ws);
1316
1317 entry = pb_slab_alloc(slabs, size, heap);
1318 }
1319 if (!entry)
1320 return NULL;
1321
1322 bo = NULL;
1323 bo = container_of(entry, bo, u.slab.entry);
1324
1325 pipe_reference_init(&bo->base.reference, 1);
1326
1327 return &bo->base;
1328 }
1329 no_slab:
1330
1331 if (flags & RADEON_FLAG_SPARSE) {
1332 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1333
1334 return amdgpu_bo_sparse_create(ws, size, domain, flags);
1335 }
1336
1337 /* This flag is irrelevant for the cache. */
1338 flags &= ~RADEON_FLAG_NO_SUBALLOC;
1339
1340 /* Align the size to the page size. This is the minimum alignment for
1341 * normal BOs. Doing it here helps the cached buffer manager: small BOs in
1342 * particular, such as constant/uniform buffers, benefit from more reuse.
1343 */
1344 if (domain & RADEON_DOMAIN_VRAM_GTT) {
1345 size = align64(size, ws->info.gart_page_size);
1346 alignment = align(alignment, ws->info.gart_page_size);
1347 }
1348
1349 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
1350
1351 if (use_reusable_pool) {
1352 heap = radeon_get_heap_index(domain, flags & ~RADEON_FLAG_ENCRYPTED);
1353 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
1354
1355 /* Get a buffer from the cache. */
1356 bo = (struct amdgpu_winsys_bo*)
1357 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
1358 if (bo)
1359 return &bo->base;
1360 }
1361
1362 /* Create a new one. */
1363 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1364 if (!bo) {
1365 /* Clean up buffer managers and try again. */
1366 amdgpu_clean_up_buffer_managers(ws);
1367
1368 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1369 if (!bo)
1370 return NULL;
1371 }
1372
1373 bo->u.real.use_reusable_pool = use_reusable_pool;
1374 return &bo->base;
1375 }
1376
1377 static struct pb_buffer *
1378 amdgpu_buffer_create(struct radeon_winsys *ws,
1379 uint64_t size,
1380 unsigned alignment,
1381 enum radeon_bo_domain domain,
1382 enum radeon_bo_flag flags)
1383 {
1384 struct pb_buffer * res = amdgpu_bo_create(amdgpu_winsys(ws), size, alignment, domain,
1385 flags);
1386 return res;
1387 }
1388
1389 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1390 struct winsys_handle *whandle,
1391 unsigned vm_alignment)
1392 {
1393 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1394 struct amdgpu_winsys_bo *bo = NULL;
1395 enum amdgpu_bo_handle_type type;
1396 struct amdgpu_bo_import_result result = {0};
1397 uint64_t va;
1398 amdgpu_va_handle va_handle = NULL;
1399 struct amdgpu_bo_info info = {0};
1400 enum radeon_bo_domain initial = 0;
1401 enum radeon_bo_flag flags = 0;
1402 int r;
1403
1404 switch (whandle->type) {
1405 case WINSYS_HANDLE_TYPE_SHARED:
1406 type = amdgpu_bo_handle_type_gem_flink_name;
1407 break;
1408 case WINSYS_HANDLE_TYPE_FD:
1409 type = amdgpu_bo_handle_type_dma_buf_fd;
1410 break;
1411 default:
1412 return NULL;
1413 }
1414
1415 r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
1416 if (r)
1417 return NULL;
1418
1419 simple_mtx_lock(&ws->bo_export_table_lock);
1420 bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
1421
1422 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1423 * counter and return it.
1424 */
1425 if (bo) {
1426 p_atomic_inc(&bo->base.reference.count);
1427 simple_mtx_unlock(&ws->bo_export_table_lock);
1428
1429 /* Release the buffer handle, because we don't need it anymore.
1430 * This function is returning an existing buffer, which has its own
1431 * handle.
1432 */
1433 amdgpu_bo_free(result.buf_handle);
1434 return &bo->base;
1435 }
1436
1437 /* Get initial domains. */
1438 r = amdgpu_bo_query_info(result.buf_handle, &info);
1439 if (r)
1440 goto error;
1441
1442 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1443 result.alloc_size,
1444 amdgpu_get_optimal_vm_alignment(ws, result.alloc_size,
1445 vm_alignment),
1446 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
1447 if (r)
1448 goto error;
1449
1450 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1451 if (!bo)
1452 goto error;
1453
1454 r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
1455 if (r)
1456 goto error;
1457
1458 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1459 initial |= RADEON_DOMAIN_VRAM;
1460 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1461 initial |= RADEON_DOMAIN_GTT;
1462 if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
1463 flags |= RADEON_FLAG_NO_CPU_ACCESS;
1464 if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
1465 flags |= RADEON_FLAG_GTT_WC;
1466 if (info.alloc_flags & AMDGPU_GEM_CREATE_ENCRYPTED)
1467 flags |= RADEON_FLAG_ENCRYPTED;
1468
1469 /* Initialize the structure. */
1470 simple_mtx_init(&bo->lock, mtx_plain);
1471 pipe_reference_init(&bo->base.reference, 1);
1472 bo->base.alignment = info.phys_alignment;
1473 bo->bo = result.buf_handle;
1474 bo->base.size = result.alloc_size;
1475 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1476 bo->ws = ws;
1477 bo->va = va;
1478 bo->u.real.va_handle = va_handle;
1479 bo->initial_domain = initial;
1480 bo->flags = flags;
1481 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1482 bo->is_shared = true;
1483
1484 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
1485 ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
1486 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
1487 ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
1488
1489 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1490
1491 amdgpu_add_buffer_to_global_list(bo);
1492
1493 _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
1494 simple_mtx_unlock(&ws->bo_export_table_lock);
1495
1496 return &bo->base;
1497
1498 error:
1499 simple_mtx_unlock(&ws->bo_export_table_lock);
1500 if (bo)
1501 FREE(bo);
1502 if (va_handle)
1503 amdgpu_va_range_free(va_handle);
1504 amdgpu_bo_free(result.buf_handle);
1505 return NULL;
1506 }
1507
1508 static bool amdgpu_bo_get_handle(struct radeon_winsys *rws,
1509 struct pb_buffer *buffer,
1510 struct winsys_handle *whandle)
1511 {
1512 struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws);
1513 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
1514 struct amdgpu_winsys *ws = bo->ws;
1515 enum amdgpu_bo_handle_type type;
1516 struct hash_entry *entry;
1517 int r;
1518
1519 /* Don't allow exports of slab entries and sparse buffers. */
1520 if (!bo->bo)
1521 return false;
1522
1523 bo->u.real.use_reusable_pool = false;
1524
1525 switch (whandle->type) {
1526 case WINSYS_HANDLE_TYPE_SHARED:
1527 type = amdgpu_bo_handle_type_gem_flink_name;
1528 break;
1529 case WINSYS_HANDLE_TYPE_KMS:
1530 if (sws->fd == ws->fd) {
1531 whandle->handle = bo->u.real.kms_handle;
1532
1533 if (bo->is_shared)
1534 return true;
1535
1536 goto hash_table_set;
1537 }
1538
1539 simple_mtx_lock(&ws->sws_list_lock);
1540 entry = _mesa_hash_table_search(sws->kms_handles, bo);
1541 simple_mtx_unlock(&ws->sws_list_lock);
1542 if (entry) {
1543 whandle->handle = (uintptr_t)entry->data;
1544 return true;
1545 }
1546 /* Fall through */
1547 case WINSYS_HANDLE_TYPE_FD:
1548 type = amdgpu_bo_handle_type_dma_buf_fd;
1549 break;
1550 default:
1551 return false;
1552 }
1553
1554 r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
1555 if (r)
1556 return false;
1557
1558 if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
1559 int dma_fd = whandle->handle;
1560
1561 r = drmPrimeFDToHandle(sws->fd, dma_fd, &whandle->handle);
1562 close(dma_fd);
1563
1564 if (r)
1565 return false;
1566
1567 simple_mtx_lock(&ws->sws_list_lock);
1568 _mesa_hash_table_insert_pre_hashed(sws->kms_handles,
1569 bo->u.real.kms_handle, bo,
1570 (void*)(uintptr_t)whandle->handle);
1571 simple_mtx_unlock(&ws->sws_list_lock);
1572 }
1573
1574 hash_table_set:
1575 simple_mtx_lock(&ws->bo_export_table_lock);
1576 _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
1577 simple_mtx_unlock(&ws->bo_export_table_lock);
1578
1579 bo->is_shared = true;
1580 return true;
1581 }
1582
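/* Usage sketch (ptr and size are hypothetical): the user pointer must remain
 * valid for the lifetime of the returned BO, and the mapping is rounded up
 * to a whole GART page internally:
 *
 *    struct pb_buffer *buf = rws->buffer_from_ptr(rws, ptr, size);
 */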
1583 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1584 void *pointer, uint64_t size)
1585 {
1586 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1587 amdgpu_bo_handle buf_handle;
1588 struct amdgpu_winsys_bo *bo;
1589 uint64_t va;
1590 amdgpu_va_handle va_handle;
1591 /* Avoid failure when the size is not page aligned */
1592 uint64_t aligned_size = align64(size, ws->info.gart_page_size);
1593
1594 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1595 if (!bo)
1596 return NULL;
1597
1598 if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
1599 aligned_size, &buf_handle))
1600 goto error;
1601
1602 if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1603 aligned_size,
1604 amdgpu_get_optimal_vm_alignment(ws, aligned_size,
1605 ws->info.gart_page_size),
1606 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
1607 goto error_va_alloc;
1608
1609 if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
1610 goto error_va_map;
1611
1612 /* Initialize it. */
1613 bo->is_user_ptr = true;
1614 pipe_reference_init(&bo->base.reference, 1);
1615 simple_mtx_init(&bo->lock, mtx_plain);
1616 bo->bo = buf_handle;
1617 bo->base.alignment = 0;
1618 bo->base.size = size;
1619 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1620 bo->ws = ws;
1621 bo->cpu_ptr = pointer;
1622 bo->va = va;
1623 bo->u.real.va_handle = va_handle;
1624 bo->initial_domain = RADEON_DOMAIN_GTT;
1625 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1626
1627 ws->allocated_gtt += aligned_size;
1628
1629 amdgpu_add_buffer_to_global_list(bo);
1630
1631 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1632
1633 return (struct pb_buffer*)bo;
1634
1635 error_va_map:
1636 amdgpu_va_range_free(va_handle);
1637
1638 error_va_alloc:
1639 amdgpu_bo_free(buf_handle);
1640
1641 error:
1642 FREE(bo);
1643 return NULL;
1644 }
1645
1646 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
1647 {
1648 return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
1649 }
1650
1651 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
1652 {
1653 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1654
1655 return !bo->bo && !bo->sparse;
1656 }
1657
1658 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
1659 {
1660 return ((struct amdgpu_winsys_bo*)buf)->va;
1661 }
1662
1663 void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *ws)
1664 {
1665 ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1666 ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1667 ws->base.buffer_map = amdgpu_bo_map;
1668 ws->base.buffer_unmap = amdgpu_bo_unmap;
1669 ws->base.buffer_wait = amdgpu_bo_wait;
1670 ws->base.buffer_create = amdgpu_buffer_create;
1671 ws->base.buffer_from_handle = amdgpu_bo_from_handle;
1672 ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1673 ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1674 ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1675 ws->base.buffer_get_handle = amdgpu_bo_get_handle;
1676 ws->base.buffer_commit = amdgpu_bo_sparse_commit;
1677 ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1678 ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1679 ws->base.buffer_get_flags = amdgpu_bo_get_flags;
1680 }