radv: remove unnecessary RADV_DEBUG=nobatchchain option
[mesa.git] / src / amd / vulkan / winsys / amdgpu / radv_amdgpu_cs.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #include <stdlib.h>
26 #include <amdgpu.h>
27 #include <amdgpu_drm.h>
28 #include <assert.h>
29 #include <pthread.h>
30 #include <errno.h>
31
32 #include "ac_debug.h"
33 #include "radv_radeon_winsys.h"
34 #include "radv_amdgpu_cs.h"
35 #include "radv_amdgpu_bo.h"
36 #include "sid.h"
37
38
39 enum {
40 VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024
41 };
42
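/*
 * A command stream. When the winsys uses IB buffer objects (use_ib_bos), the
 * commands are written directly into a GPU-visible BO and the stream grows by
 * chaining to a new BO with an INDIRECT_BUFFER packet; otherwise the commands
 * live in malloc'ed memory and are copied into BOs at submit time (see the
 * sysmem submit path below).
 */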
43 struct radv_amdgpu_cs {
44 struct radeon_cmdbuf base;
45 struct radv_amdgpu_winsys *ws;
46
47 struct amdgpu_cs_ib_info ib;
48
49 struct radeon_winsys_bo *ib_buffer;
50 uint8_t *ib_mapped;
51 unsigned max_num_buffers;
52 unsigned num_buffers;
53 struct drm_amdgpu_bo_list_entry *handles;
54
55 struct radeon_winsys_bo **old_ib_buffers;
56 unsigned num_old_ib_buffers;
57 unsigned max_num_old_ib_buffers;
58 unsigned *ib_size_ptr;
59 bool failed;
60 bool is_chained;
61
62 int buffer_hash_table[1024];
63 unsigned hw_ip;
64
65 unsigned num_virtual_buffers;
66 unsigned max_num_virtual_buffers;
67 struct radeon_winsys_bo **virtual_buffers;
68 int *virtual_buffer_hash_table;
69
70 /* For chips that don't support chaining. */
71 struct radeon_cmdbuf *old_cs_buffers;
72 unsigned num_old_cs_buffers;
73 };
74
75 static inline struct radv_amdgpu_cs *
76 radv_amdgpu_cs(struct radeon_cmdbuf *base)
77 {
78 return (struct radv_amdgpu_cs*)base;
79 }
80
81 static int ring_to_hw_ip(enum ring_type ring)
82 {
83 switch (ring) {
84 case RING_GFX:
85 return AMDGPU_HW_IP_GFX;
86 case RING_DMA:
87 return AMDGPU_HW_IP_DMA;
88 case RING_COMPUTE:
89 return AMDGPU_HW_IP_COMPUTE;
90 default:
91 unreachable("unsupported ring");
92 }
93 }
94
95 struct radv_amdgpu_cs_request {
96 /** Specify flags with additional information */
97 uint64_t flags;
98
99 /** Specify HW IP block type to which to send the IB. */
100 unsigned ip_type;
101
102 /** IP instance index if there are several IPs of the same type. */
103 unsigned ip_instance;
104
105 /**
106 * Specify ring index of the IP. We could have several rings
107 * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
108 */
109 uint32_t ring;
110
111 /**
112 * List handle with resources used by this request. This is a raw
113 * bo list handle used by the kernel.
114 */
115 uint32_t resources;
116
117 /**
118 * Number of dependencies this command submission needs to
119 * wait for before starting execution.
120 */
121 uint32_t number_of_dependencies;
122
123 /**
124 * Array of dependencies which need to be met before
125 * execution can start.
126 */
127 struct amdgpu_cs_fence *dependencies;
128
129 /** Number of IBs to submit in the field ibs. */
130 uint32_t number_of_ibs;
131
132 /**
133 * IBs to submit. These IBs will be submitted together as a single entity.
134 */
135 struct amdgpu_cs_ib_info *ibs;
136
137 /**
138 * The returned sequence number for the command submission
139 */
140 uint64_t seq_no;
141
142 /**
143 * The fence information
144 */
145 struct amdgpu_cs_fence_info fence_info;
146 };
147
148
149 static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
150 uint32_t ip_type,
151 uint32_t ring,
152 struct radv_winsys_sem_info *sem_info);
153 static int radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
154 struct radv_amdgpu_cs_request *request,
155 struct radv_winsys_sem_info *sem_info);
156
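/*
 * Fill a winsys fence from a completed submission request. user_ptr points at
 * the per-ring slot in the context's fence BO that the kernel updates, which
 * lets radv_amdgpu_fence_wait check for completion without an ioctl.
 */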
157 static void radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx,
158 struct radv_amdgpu_fence *fence,
159 struct radv_amdgpu_cs_request *req)
160 {
161 fence->fence.context = ctx->ctx;
162 fence->fence.ip_type = req->ip_type;
163 fence->fence.ip_instance = req->ip_instance;
164 fence->fence.ring = req->ring;
165 fence->fence.fence = req->seq_no;
166 fence->user_ptr = (volatile uint64_t*)(ctx->fence_map + (req->ip_type * MAX_RINGS_PER_TYPE + req->ring) * sizeof(uint64_t));
167 }
168
169 static struct radeon_winsys_fence *radv_amdgpu_create_fence(void)
170 {
171 struct radv_amdgpu_fence *fence = calloc(1, sizeof(struct radv_amdgpu_fence));
if (!fence)
return NULL;
172 fence->fence.fence = UINT64_MAX;
173 return (struct radeon_winsys_fence*)fence;
174 }
175
176 static void radv_amdgpu_destroy_fence(struct radeon_winsys_fence *_fence)
177 {
178 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
179 free(fence);
180 }
181
182 static void radv_amdgpu_reset_fence(struct radeon_winsys_fence *_fence)
183 {
184 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
185 fence->fence.fence = UINT64_MAX;
186 }
187
188 static void radv_amdgpu_signal_fence(struct radeon_winsys_fence *_fence)
189 {
190 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
191 fence->fence.fence = 0;
192 }
193
194 static bool radv_amdgpu_is_fence_waitable(struct radeon_winsys_fence *_fence)
195 {
196 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
197 return fence->fence.fence < UINT64_MAX;
198 }
199
200 static bool radv_amdgpu_fence_wait(struct radeon_winsys *_ws,
201 struct radeon_winsys_fence *_fence,
202 bool absolute,
203 uint64_t timeout)
204 {
205 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
206 unsigned flags = absolute ? AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE : 0;
207 int r;
208 uint32_t expired = 0;
209
210 /* Special-case 0 and UINT64_MAX so they work without user_ptr/fence.context. */
211 if (fence->fence.fence == UINT64_MAX)
212 return false;
213
214 if (fence->fence.fence == 0)
215 return true;
216
217 if (fence->user_ptr) {
218 if (*fence->user_ptr >= fence->fence.fence)
219 return true;
220 if (!absolute && !timeout)
221 return false;
222 }
223
224 /* Now use the libdrm query. */
225 r = amdgpu_cs_query_fence_status(&fence->fence,
226 timeout,
227 flags,
228 &expired);
229
230 if (r) {
231 fprintf(stderr, "amdgpu: radv_amdgpu_cs_query_fence_status failed.\n");
232 return false;
233 }
234
235 if (expired)
236 return true;
237
238 return false;
239 }
240
241
242 static bool radv_amdgpu_fences_wait(struct radeon_winsys *_ws,
243 struct radeon_winsys_fence *const *_fences,
244 uint32_t fence_count,
245 bool wait_all,
246 uint64_t timeout)
247 {
248 struct amdgpu_cs_fence *fences = malloc(sizeof(struct amdgpu_cs_fence) * fence_count);
249 int r;
250 uint32_t expired = 0, first = 0;
251
252 if (!fences)
253 return false;
254
255 for (uint32_t i = 0; i < fence_count; ++i)
256 fences[i] = ((struct radv_amdgpu_fence *)_fences[i])->fence;
257
258 /* Now use the libdrm query. */
259 r = amdgpu_cs_wait_fences(fences, fence_count, wait_all,
260 timeout, &expired, &first);
261
262 free(fences);
263 if (r) {
264 fprintf(stderr, "amdgpu: amdgpu_cs_wait_fences failed.\n");
265 return false;
266 }
267
268 if (expired)
269 return true;
270
271 return false;
272 }
273
274 static void radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
275 {
276 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
277
278 if (cs->ib_buffer)
279 cs->ws->base.buffer_destroy(cs->ib_buffer);
280 else
281 free(cs->base.buf);
282
283 for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
284 cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
285
286 for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
287 struct radeon_cmdbuf *rcs = &cs->old_cs_buffers[i];
288 free(rcs->buf);
289 }
290
291 free(cs->old_cs_buffers);
292 free(cs->old_ib_buffers);
293 free(cs->virtual_buffers);
294 free(cs->virtual_buffer_hash_table);
295 free(cs->handles);
296 free(cs);
297 }
298
299 static void radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs,
300 enum ring_type ring_type)
301 {
302 for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
303 cs->buffer_hash_table[i] = -1;
304
305 cs->hw_ip = ring_to_hw_ip(ring_type);
306 }
307
308 static struct radeon_cmdbuf *
309 radv_amdgpu_cs_create(struct radeon_winsys *ws,
310 enum ring_type ring_type)
311 {
312 struct radv_amdgpu_cs *cs;
313 uint32_t ib_size = 20 * 1024 * 4;
314 cs = calloc(1, sizeof(struct radv_amdgpu_cs));
315 if (!cs)
316 return NULL;
317
318 cs->ws = radv_amdgpu_winsys(ws);
319 radv_amdgpu_init_cs(cs, ring_type);
320
321 if (cs->ws->use_ib_bos) {
322 cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
323 RADEON_DOMAIN_GTT,
324 RADEON_FLAG_CPU_ACCESS |
325 RADEON_FLAG_NO_INTERPROCESS_SHARING |
326 RADEON_FLAG_READ_ONLY,
327 RADV_BO_PRIORITY_CS);
328 if (!cs->ib_buffer) {
329 free(cs);
330 return NULL;
331 }
332
333 cs->ib_mapped = ws->buffer_map(cs->ib_buffer);
334 if (!cs->ib_mapped) {
335 ws->buffer_destroy(cs->ib_buffer);
336 free(cs);
337 return NULL;
338 }
339
340 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
341 cs->base.buf = (uint32_t *)cs->ib_mapped;
342 cs->base.max_dw = ib_size / 4 - 4;
343 cs->ib_size_ptr = &cs->ib.size;
344 cs->ib.size = 0;
345
346 ws->cs_add_buffer(&cs->base, cs->ib_buffer);
347 } else {
348 cs->base.buf = malloc(16384);
349 cs->base.max_dw = 4096;
350 if (!cs->base.buf) {
351 free(cs);
352 return NULL;
353 }
354 }
355
356 return &cs->base;
357 }
358
359 static void radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
360 {
361 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
362
363 if (cs->failed) {
364 cs->base.cdw = 0;
365 return;
366 }
367
368 if (!cs->ws->use_ib_bos) {
369 const uint64_t limit_dws = 0xffff8;
370 uint64_t ib_dws = MAX2(cs->base.cdw + min_size,
371 MIN2(cs->base.max_dw * 2, limit_dws));
372
373 /* The total ib size cannot exceed limit_dws dwords. */
374 if (ib_dws > limit_dws)
375 {
376 /* The maximum size in dwords has been reached;
377 * store the current buffer and allocate a new one.
378 */
379 cs->old_cs_buffers =
380 realloc(cs->old_cs_buffers,
381 (cs->num_old_cs_buffers + 1) * sizeof(*cs->old_cs_buffers));
382 if (!cs->old_cs_buffers) {
383 cs->failed = true;
384 cs->base.cdw = 0;
385 return;
386 }
387
388 /* Store the current one for submitting it later. */
389 cs->old_cs_buffers[cs->num_old_cs_buffers].cdw = cs->base.cdw;
390 cs->old_cs_buffers[cs->num_old_cs_buffers].max_dw = cs->base.max_dw;
391 cs->old_cs_buffers[cs->num_old_cs_buffers].buf = cs->base.buf;
392 cs->num_old_cs_buffers++;
393
394 /* Reset the cs, it will be re-allocated below. */
395 cs->base.cdw = 0;
396 cs->base.buf = NULL;
397
398 /* Re-compute the number of dwords to allocate. */
399 ib_dws = MAX2(cs->base.cdw + min_size,
400 MIN2(cs->base.max_dw * 2, limit_dws));
401 if (ib_dws > limit_dws) {
402 fprintf(stderr, "amdgpu: Too many dwords "
403 "to allocate\n");
404 cs->failed = true;
405 return;
406 }
407 }
408
409 uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
410 if (new_buf) {
411 cs->base.buf = new_buf;
412 cs->base.max_dw = ib_dws;
413 } else {
414 cs->failed = true;
415 cs->base.cdw = 0;
416 }
417 return;
418 }
419
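/* Chained (IB BO) path: pad the current IB so that, counting the 4-dword
 * INDIRECT_BUFFER packet emitted below, its size is a multiple of 8 dwords,
 * patch the previous size through ib_size_ptr, then allocate a fresh IB BO
 * and chain to it. ib_size_ptr is left pointing at the chain packet's size
 * dword so it can be patched again on the next grow or at finalize.
 */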
420 uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
421
422 /* max that fits in the chain size field. */
423 ib_size = MIN2(ib_size, 0xfffff);
424
425 while (!cs->base.cdw || (cs->base.cdw & 7) != 4)
426 radeon_emit(&cs->base, 0xffff1000);
427
428 *cs->ib_size_ptr |= cs->base.cdw + 4;
429
430 if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) {
431 cs->max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2);
432 cs->old_ib_buffers = realloc(cs->old_ib_buffers,
433 cs->max_num_old_ib_buffers * sizeof(void*));
434 }
435
436 cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer;
437
438 cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
439 RADEON_DOMAIN_GTT,
440 RADEON_FLAG_CPU_ACCESS |
441 RADEON_FLAG_NO_INTERPROCESS_SHARING |
442 RADEON_FLAG_READ_ONLY,
443 RADV_BO_PRIORITY_CS);
444
445 if (!cs->ib_buffer) {
446 cs->base.cdw = 0;
447 cs->failed = true;
448 cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
449 }
450
451 cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
452 if (!cs->ib_mapped) {
453 cs->ws->base.buffer_destroy(cs->ib_buffer);
454 cs->base.cdw = 0;
455 cs->failed = true;
456 cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
457 }
458
459 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
460
461 radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
462 radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va);
463 radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32);
464 radeon_emit(&cs->base, S_3F2_CHAIN(1) | S_3F2_VALID(1));
465
466 cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
467
468 cs->base.buf = (uint32_t *)cs->ib_mapped;
469 cs->base.cdw = 0;
470 cs->base.max_dw = ib_size / 4 - 4;
471
472 }
473
474 static bool radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
475 {
476 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
477
478 if (cs->ws->use_ib_bos) {
479 while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
480 radeon_emit(&cs->base, 0xffff1000);
481
482 *cs->ib_size_ptr |= cs->base.cdw;
483
484 cs->is_chained = false;
485 }
486
487 return !cs->failed;
488 }
489
490 static void radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
491 {
492 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
493 cs->base.cdw = 0;
494 cs->failed = false;
495
496 for (unsigned i = 0; i < cs->num_buffers; ++i) {
497 unsigned hash = cs->handles[i].bo_handle &
498 (ARRAY_SIZE(cs->buffer_hash_table) - 1);
499 cs->buffer_hash_table[hash] = -1;
500 }
501
502 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
503 unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
504 cs->virtual_buffer_hash_table[hash] = -1;
505 }
506
507 cs->num_buffers = 0;
508 cs->num_virtual_buffers = 0;
509
510 if (cs->ws->use_ib_bos) {
511 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
512
513 for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
514 cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
515
516 cs->num_old_ib_buffers = 0;
517 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
518 cs->ib_size_ptr = &cs->ib.size;
519 cs->ib.size = 0;
520 } else {
521 for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
522 struct radeon_cmdbuf *rcs = &cs->old_cs_buffers[i];
523 free(rcs->buf);
524 }
525
526 free(cs->old_cs_buffers);
527 cs->old_cs_buffers = NULL;
528 cs->num_old_cs_buffers = 0;
529 }
530 }
531
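/*
 * Find a BO handle in the CS buffer list. buffer_hash_table is a small
 * direct-mapped cache from handle to list index; on a miss the list is
 * scanned linearly and the cache entry is refreshed.
 */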
532 static int radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs,
533 uint32_t bo)
534 {
535 unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
536 int index = cs->buffer_hash_table[hash];
537
538 if (index == -1)
539 return -1;
540
541 if (cs->handles[index].bo_handle == bo)
542 return index;
543
544 for (unsigned i = 0; i < cs->num_buffers; ++i) {
545 if (cs->handles[i].bo_handle == bo) {
546 cs->buffer_hash_table[hash] = i;
547 return i;
548 }
549 }
550
551 return -1;
552 }
553
554 static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
555 uint32_t bo, uint8_t priority)
556 {
557 unsigned hash;
558 int index = radv_amdgpu_cs_find_buffer(cs, bo);
559
560 if (index != -1)
561 return;
562
563 if (cs->num_buffers == cs->max_num_buffers) {
564 unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
565 cs->handles = realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
566 cs->max_num_buffers = new_count;
567 }
568
569 cs->handles[cs->num_buffers].bo_handle = bo;
570 cs->handles[cs->num_buffers].bo_priority = priority;
571
572 hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
573 cs->buffer_hash_table[hash] = cs->num_buffers;
574
575 ++cs->num_buffers;
576 }
577
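/*
 * Virtual (sparse) buffers are tracked in a separate list; they are expanded
 * into their backing BOs only when the kernel BO list is built at submit time
 * (see radv_amdgpu_create_bo_list).
 */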
578 static void radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs,
579 struct radeon_winsys_bo *bo)
580 {
581 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
582 unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
583
584
585 if (!cs->virtual_buffer_hash_table) {
586 cs->virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
587 for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
588 cs->virtual_buffer_hash_table[i] = -1;
589 }
590
591 if (cs->virtual_buffer_hash_table[hash] >= 0) {
592 int idx = cs->virtual_buffer_hash_table[hash];
593 if (cs->virtual_buffers[idx] == bo) {
594 return;
595 }
596 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
597 if (cs->virtual_buffers[i] == bo) {
598 cs->virtual_buffer_hash_table[hash] = i;
599 return;
600 }
601 }
602 }
603
604 if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
605 cs->max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
606 cs->virtual_buffers = realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo*) * cs->max_num_virtual_buffers);
607 }
608
609 cs->virtual_buffers[cs->num_virtual_buffers] = bo;
610
611 cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
612 ++cs->num_virtual_buffers;
613
614 }
615
616 static void radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs,
617 struct radeon_winsys_bo *_bo)
618 {
619 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
620 struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
621
622 if (bo->is_virtual) {
623 radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
624 return;
625 }
626
627 if (bo->base.is_local)
628 return;
629
630 radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
631 }
632
633 static void radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent,
634 struct radeon_cmdbuf *_child)
635 {
636 struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
637 struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
638
639 for (unsigned i = 0; i < child->num_buffers; ++i) {
640 radv_amdgpu_cs_add_buffer_internal(parent,
641 child->handles[i].bo_handle,
642 child->handles[i].bo_priority);
643 }
644
645 for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
646 radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
647 }
648
649 if (parent->ws->use_ib_bos) {
650 if (parent->base.cdw + 4 > parent->base.max_dw)
651 radv_amdgpu_cs_grow(&parent->base, 4);
652
653 radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
654 radeon_emit(&parent->base, child->ib.ib_mc_address);
655 radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
656 radeon_emit(&parent->base, child->ib.size);
657 } else {
658 if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
659 radv_amdgpu_cs_grow(&parent->base, child->base.cdw);
660
661 memcpy(parent->base.buf + parent->base.cdw, child->base.buf, 4 * child->base.cdw);
662 parent->base.cdw += child->base.cdw;
663 }
664 }
665
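/*
 * Build the raw kernel BO list for a submission. Three cases: with
 * debug_all_bos every BO in the winsys global list is added; a single CS with
 * no extras reuses its handle array directly; otherwise the handles of all
 * CSes, extra BOs, the preamble and the per-submit BO list are merged with
 * duplicates removed.
 */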
666 static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
667 struct radeon_cmdbuf **cs_array,
668 unsigned count,
669 struct radv_amdgpu_winsys_bo **extra_bo_array,
670 unsigned num_extra_bo,
671 struct radeon_cmdbuf *extra_cs,
672 const struct radv_winsys_bo_list *radv_bo_list,
673 uint32_t *bo_list)
674 {
675 int r = 0;
676
677 if (ws->debug_all_bos) {
678 struct radv_amdgpu_winsys_bo *bo;
679 struct drm_amdgpu_bo_list_entry *handles;
680 unsigned num = 0;
681
682 pthread_mutex_lock(&ws->global_bo_list_lock);
683
684 handles = malloc(sizeof(handles[0]) * ws->num_buffers);
685 if (!handles) {
686 pthread_mutex_unlock(&ws->global_bo_list_lock);
687 return -ENOMEM;
688 }
689
690 LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
691 assert(num < ws->num_buffers);
692 handles[num].bo_handle = bo->bo_handle;
693 handles[num].bo_priority = bo->priority;
694 num++;
695 }
696
697 r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers,
698 handles, bo_list);
699 free(handles);
700 pthread_mutex_unlock(&ws->global_bo_list_lock);
701 } else if (count == 1 && !num_extra_bo && !extra_cs && !radv_bo_list &&
702 !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
703 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
704 if (cs->num_buffers == 0) {
705 *bo_list = 0;
706 return 0;
707 }
708 r = amdgpu_bo_list_create_raw(ws->dev, cs->num_buffers, cs->handles,
709 bo_list);
710 } else {
711 unsigned total_buffer_count = num_extra_bo;
712 unsigned unique_bo_count = num_extra_bo;
713 for (unsigned i = 0; i < count; ++i) {
714 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
715 total_buffer_count += cs->num_buffers;
716 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
717 total_buffer_count += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
718 }
719
720 if (extra_cs) {
721 total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
722 }
723
724 if (radv_bo_list) {
725 total_buffer_count += radv_bo_list->count;
726 }
727
728 if (total_buffer_count == 0) {
729 *bo_list = 0;
730 return 0;
731 }
732 struct drm_amdgpu_bo_list_entry *handles = malloc(sizeof(struct drm_amdgpu_bo_list_entry) * total_buffer_count);
733 if (!handles) {
735 return -ENOMEM;
736 }
737
738 for (unsigned i = 0; i < num_extra_bo; i++) {
739 handles[i].bo_handle = extra_bo_array[i]->bo_handle;
740 handles[i].bo_priority = extra_bo_array[i]->priority;
741 }
742
743 for (unsigned i = 0; i < count + !!extra_cs; ++i) {
744 struct radv_amdgpu_cs *cs;
745
746 if (i == count)
747 cs = (struct radv_amdgpu_cs*)extra_cs;
748 else
749 cs = (struct radv_amdgpu_cs*)cs_array[i];
750
751 if (!cs->num_buffers)
752 continue;
753
754 if (unique_bo_count == 0 && !cs->num_virtual_buffers) {
755 memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
756 unique_bo_count = cs->num_buffers;
757 continue;
758 }
759 int unique_bo_so_far = unique_bo_count;
760 for (unsigned j = 0; j < cs->num_buffers; ++j) {
761 bool found = false;
762 for (unsigned k = 0; k < unique_bo_so_far; ++k) {
763 if (handles[k].bo_handle == cs->handles[j].bo_handle) {
764 found = true;
765 break;
766 }
767 }
768 if (!found) {
769 handles[unique_bo_count] = cs->handles[j];
770 ++unique_bo_count;
771 }
772 }
773 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
774 struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
775 for(unsigned k = 0; k < virtual_bo->bo_count; ++k) {
776 struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
777 bool found = false;
778 for (unsigned m = 0; m < unique_bo_count; ++m) {
779 if (handles[m].bo_handle == bo->bo_handle) {
780 found = true;
781 break;
782 }
783 }
784 if (!found) {
785 handles[unique_bo_count].bo_handle = bo->bo_handle;
786 handles[unique_bo_count].bo_priority = bo->priority;
787 ++unique_bo_count;
788 }
789 }
790 }
791 }
792
793 if (radv_bo_list) {
794 unsigned unique_bo_so_far = unique_bo_count;
795 for (unsigned i = 0; i < radv_bo_list->count; ++i) {
796 struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
797 bool found = false;
798 for (unsigned j = 0; j < unique_bo_so_far; ++j) {
799 if (bo->bo_handle == handles[j].bo_handle) {
800 found = true;
801 break;
802 }
803 }
804 if (!found) {
805 handles[unique_bo_count].bo_handle = bo->bo_handle;
806 handles[unique_bo_count].bo_priority = bo->priority;
807 ++unique_bo_count;
808 }
809 }
810 }
811
812 if (unique_bo_count > 0) {
813 r = amdgpu_bo_list_create_raw(ws->dev, unique_bo_count, handles,
814 bo_list);
815 } else {
816 *bo_list = 0;
817 }
818
819 free(handles);
820 }
821
822 return r;
823 }
824
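/*
 * Describe where the kernel should write the completion fence for this
 * (IP type, ring): the context's fence BO, at the same per-ring offset that
 * radv_amdgpu_request_to_fence exposes through user_ptr.
 */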
825 static struct amdgpu_cs_fence_info radv_set_cs_fence(struct radv_amdgpu_ctx *ctx, int ip_type, int ring)
826 {
827 struct amdgpu_cs_fence_info ret = {0};
828 if (ctx->fence_map) {
829 ret.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
830 ret.offset = (ip_type * MAX_RINGS_PER_TYPE + ring) * sizeof(uint64_t);
831 }
832 return ret;
833 }
834
835 static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
836 struct radv_amdgpu_cs_request *request)
837 {
838 radv_amdgpu_request_to_fence(ctx,
839 &ctx->last_submission[request->ip_type][request->ring],
840 request);
841 }
842
843 static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
844 int queue_idx,
845 struct radv_winsys_sem_info *sem_info,
846 const struct radv_winsys_bo_list *radv_bo_list,
847 struct radeon_cmdbuf **cs_array,
848 unsigned cs_count,
849 struct radeon_cmdbuf *initial_preamble_cs,
850 struct radeon_cmdbuf *continue_preamble_cs,
851 struct radeon_winsys_fence *_fence)
852 {
853 int r;
854 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
855 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
856 struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
857 uint32_t bo_list;
858 struct radv_amdgpu_cs_request request = {0};
859 struct amdgpu_cs_ib_info ibs[2];
860 unsigned number_of_ibs = 1;
861
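/* Walk the command streams in reverse and chain each one to its successor by
 * writing an INDIRECT_BUFFER packet after its last dword and growing the
 * recorded IB size by 4 dwords; chaining left over from a previous submission
 * is undone first.
 */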
862 for (unsigned i = cs_count; i--;) {
863 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
864
865 if (cs->is_chained) {
866 *cs->ib_size_ptr -= 4;
867 cs->is_chained = false;
868 }
869
870 if (i + 1 < cs_count) {
871 struct radv_amdgpu_cs *next = radv_amdgpu_cs(cs_array[i + 1]);
872 assert(cs->base.cdw + 4 <= cs->base.max_dw);
873
874 cs->is_chained = true;
875 *cs->ib_size_ptr += 4;
876
877 cs->base.buf[cs->base.cdw + 0] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
878 cs->base.buf[cs->base.cdw + 1] = next->ib.ib_mc_address;
879 cs->base.buf[cs->base.cdw + 2] = next->ib.ib_mc_address >> 32;
880 cs->base.buf[cs->base.cdw + 3] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | next->ib.size;
881 }
882 }
883
884 /* Create a buffer object list. */
885 r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, 0,
886 initial_preamble_cs, radv_bo_list,
887 &bo_list);
888 if (r) {
889 fprintf(stderr, "amdgpu: buffer list creation failed for the "
890 "chained submission(%d)\n", r);
891 return r;
892 }
893
894 /* Configure the CS request. */
895 if (initial_preamble_cs) {
896 ibs[0] = radv_amdgpu_cs(initial_preamble_cs)->ib;
897 ibs[1] = cs0->ib;
898 number_of_ibs++;
899 } else {
900 ibs[0] = cs0->ib;
901 }
902
903 request.ip_type = cs0->hw_ip;
904 request.ring = queue_idx;
905 request.number_of_ibs = number_of_ibs;
906 request.ibs = ibs;
907 request.resources = bo_list;
908 request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
909
910 /* Submit the CS. */
911 r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
912 if (r) {
913 if (r == -ENOMEM)
914 fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
915 else
916 fprintf(stderr, "amdgpu: The CS has been rejected, "
917 "see dmesg for more information.\n");
918 }
919
920 amdgpu_bo_list_destroy_raw(ctx->ws->dev, bo_list);
921
922 if (r)
923 return r;
924
925 if (fence)
926 radv_amdgpu_request_to_fence(ctx, fence, &request);
927
928 radv_assign_last_submit(ctx, &request);
929
930 return 0;
931 }
932
933 static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
934 int queue_idx,
935 struct radv_winsys_sem_info *sem_info,
936 const struct radv_winsys_bo_list *radv_bo_list,
937 struct radeon_cmdbuf **cs_array,
938 unsigned cs_count,
939 struct radeon_cmdbuf *initial_preamble_cs,
940 struct radeon_cmdbuf *continue_preamble_cs,
941 struct radeon_winsys_fence *_fence)
942 {
943 int r;
944 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
945 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
946 uint32_t bo_list;
947 struct radv_amdgpu_cs_request request = {};
948 struct amdgpu_cs_ib_info *ibs;
949 struct radv_amdgpu_cs *cs0;
950 unsigned number_of_ibs;
951
952 assert(cs_count);
953 cs0 = radv_amdgpu_cs(cs_array[0]);
954
955 /* Compute the number of IBs for this submit. */
956 number_of_ibs = cs_count + !!initial_preamble_cs;
957
958 /* Create a buffer object list. */
959 r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[0], cs_count, NULL, 0,
960 initial_preamble_cs, radv_bo_list,
961 &bo_list);
962 if (r) {
963 fprintf(stderr, "amdgpu: buffer list creation failed "
964 "for the fallback submission (%d)\n", r);
965 return r;
966 }
967
968 ibs = malloc(number_of_ibs * sizeof(*ibs));
969 if (!ibs) {
970 amdgpu_bo_list_destroy_raw(ctx->ws->dev, bo_list);
971 return -ENOMEM;
972 }
973
974 /* Configure the CS request. */
975 if (initial_preamble_cs)
976 ibs[0] = radv_amdgpu_cs(initial_preamble_cs)->ib;
977
978 for (unsigned i = 0; i < cs_count; i++) {
979 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
980
981 ibs[i + !!initial_preamble_cs] = cs->ib;
982
983 if (cs->is_chained) {
984 *cs->ib_size_ptr -= 4;
985 cs->is_chained = false;
986 }
987 }
988
989 request.ip_type = cs0->hw_ip;
990 request.ring = queue_idx;
991 request.resources = bo_list;
992 request.number_of_ibs = number_of_ibs;
993 request.ibs = ibs;
994 request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
995
996 /* Submit the CS. */
997 r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
998 if (r) {
999 if (r == -ENOMEM)
1000 fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
1001 else
1002 fprintf(stderr, "amdgpu: The CS has been rejected, "
1003 "see dmesg for more information.\n");
1004 }
1005
1006 amdgpu_bo_list_destroy_raw(ctx->ws->dev, bo_list);
1007 free(ibs);
1008
1009 if (r)
1010 return r;
1011
1012 if (fence)
1013 radv_amdgpu_request_to_fence(ctx, fence, &request);
1014
1015 radv_assign_last_submit(ctx, &request);
1016
1017 return 0;
1018 }
1019
1020 static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
1021 int queue_idx,
1022 struct radv_winsys_sem_info *sem_info,
1023 const struct radv_winsys_bo_list *radv_bo_list,
1024 struct radeon_cmdbuf **cs_array,
1025 unsigned cs_count,
1026 struct radeon_cmdbuf *initial_preamble_cs,
1027 struct radeon_cmdbuf *continue_preamble_cs,
1028 struct radeon_winsys_fence *_fence)
1029 {
1030 int r;
1031 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1032 struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
1033 struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
1034 struct radeon_winsys *ws = (struct radeon_winsys*)cs0->ws;
1035 uint32_t bo_list;
1036 struct radv_amdgpu_cs_request request;
1037 uint32_t pad_word = 0xffff1000U;
1038 bool emit_signal_sem = sem_info->cs_emit_signal;
1039
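/* IBs are padded to an 8-dword boundary with NOPs. 0xffff1000 encodes
 * PKT3(PKT3_NOP, 0x3fff, 0), which newer CPs accept as a one-dword filler;
 * GFX6 does not support that form, so type-2 NOPs (0x80000000) are used
 * there instead.
 */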
1040 if (radv_amdgpu_winsys(ws)->info.chip_class == GFX6)
1041 pad_word = 0x80000000;
1042
1043 assert(cs_count);
1044
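/* Without IB BOs the commands only exist in system memory, so each batch is
 * copied into a freshly allocated GPU-visible BO (preamble first, then the
 * command words, then NOP padding) before being submitted. A CS that
 * overflowed into old_cs_buffers is split across several IBs in one submit;
 * otherwise as many consecutive CSes as fit under the size limit are packed
 * into a single IB.
 */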
1045 for (unsigned i = 0; i < cs_count;) {
1046 struct amdgpu_cs_ib_info *ibs;
1047 struct radeon_winsys_bo **bos;
1048 struct radeon_cmdbuf *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
1049 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
1050 unsigned number_of_ibs;
1051 uint32_t *ptr;
1052 unsigned cnt = 0;
1053 unsigned size = 0;
1054 unsigned pad_words = 0;
1055
1056 /* Compute the number of IBs for this submit. */
1057 number_of_ibs = cs->num_old_cs_buffers + 1;
1058
1059 ibs = malloc(number_of_ibs * sizeof(*ibs));
1060 if (!ibs)
1061 return -ENOMEM;
1062
1063 bos = malloc(number_of_ibs * sizeof(*bos));
1064 if (!bos) {
1065 free(ibs);
1066 return -ENOMEM;
1067 }
1068
1069 if (number_of_ibs > 1) {
1070 /* Special path when the maximum size in dwords has
1071 * been reached because we need to handle more than one
1072 * IB per submit.
1073 */
1074 struct radeon_cmdbuf **new_cs_array;
1075 unsigned idx = 0;
1076
1077 new_cs_array = malloc(number_of_ibs *
1078 sizeof(*new_cs_array));
1079 assert(new_cs_array);
1080
1081 for (unsigned j = 0; j < cs->num_old_cs_buffers; j++)
1082 new_cs_array[idx++] = &cs->old_cs_buffers[j];
1083 new_cs_array[idx++] = cs_array[i];
1084
1085 for (unsigned j = 0; j < number_of_ibs; j++) {
1086 struct radeon_cmdbuf *rcs = new_cs_array[j];
1087 bool needs_preamble = preamble_cs && j == 0;
1088 unsigned size = 0;
pad_words = 0; /* padding is per-IB; reset the count carried over from the previous iteration */
1089
1090 if (needs_preamble)
1091 size += preamble_cs->cdw;
1092 size += rcs->cdw;
1093
1094 assert(size < 0xffff8);
1095
1096 while (!size || (size & 7)) {
1097 size++;
1098 pad_words++;
1099 }
1100
1101 bos[j] = ws->buffer_create(ws, 4 * size, 4096,
1102 RADEON_DOMAIN_GTT,
1103 RADEON_FLAG_CPU_ACCESS |
1104 RADEON_FLAG_NO_INTERPROCESS_SHARING |
1105 RADEON_FLAG_READ_ONLY,
1106 RADV_BO_PRIORITY_CS);
1107 ptr = ws->buffer_map(bos[j]);
1108
1109 if (needs_preamble) {
1110 memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
1111 ptr += preamble_cs->cdw;
1112 }
1113
1114 memcpy(ptr, rcs->buf, 4 * rcs->cdw);
1115 ptr += rcs->cdw;
1116
1117 for (unsigned k = 0; k < pad_words; ++k)
1118 *ptr++ = pad_word;
1119
1120 ibs[j].size = size;
1121 ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]);
1122 ibs[j].flags = 0;
1123 }
1124
1125 cnt++;
1126 free(new_cs_array);
1127 } else {
1128 if (preamble_cs)
1129 size += preamble_cs->cdw;
1130
1131 while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
1132 size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
1133 ++cnt;
1134 }
1135
1136 while (!size || (size & 7)) {
1137 size++;
1138 pad_words++;
1139 }
1140 assert(cnt);
1141
1142 bos[0] = ws->buffer_create(ws, 4 * size, 4096,
1143 RADEON_DOMAIN_GTT,
1144 RADEON_FLAG_CPU_ACCESS |
1145 RADEON_FLAG_NO_INTERPROCESS_SHARING |
1146 RADEON_FLAG_READ_ONLY,
1147 RADV_BO_PRIORITY_CS);
1148 ptr = ws->buffer_map(bos[0]);
1149
1150 if (preamble_cs) {
1151 memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
1152 ptr += preamble_cs->cdw;
1153 }
1154
1155 for (unsigned j = 0; j < cnt; ++j) {
1156 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
1157 memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
1158 ptr += cs->base.cdw;
1159
1160 }
1161
1162 for (unsigned j = 0; j < pad_words; ++j)
1163 *ptr++ = pad_word;
1164
1165 ibs[0].size = size;
1166 ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]);
1167 ibs[0].flags = 0;
1168 }
1169
1170 r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
1171 (struct radv_amdgpu_winsys_bo **)bos,
1172 number_of_ibs, preamble_cs,
1173 radv_bo_list, &bo_list);
1174 if (r) {
1175 fprintf(stderr, "amdgpu: buffer list creation failed "
1176 "for the sysmem submission (%d)\n", r);
1177 free(ibs);
1178 free(bos);
1179 return r;
1180 }
1181
1182 memset(&request, 0, sizeof(request));
1183
1184 request.ip_type = cs0->hw_ip;
1185 request.ring = queue_idx;
1186 request.resources = bo_list;
1187 request.number_of_ibs = number_of_ibs;
1188 request.ibs = ibs;
1189 request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
1190
1191 sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
1192 r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1193 if (r) {
1194 if (r == -ENOMEM)
1195 fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
1196 else
1197 fprintf(stderr, "amdgpu: The CS has been rejected, "
1198 "see dmesg for more information.\n");
1199 }
1200
1201 amdgpu_bo_list_destroy_raw(ctx->ws->dev, bo_list);
1202
1203 for (unsigned j = 0; j < number_of_ibs; j++) {
1204 ws->buffer_destroy(bos[j]);
1205 }
1206
1207 free(ibs);
1208 free(bos);
1209
1210 if (r)
1211 return r;
1212
1213 i += cnt;
1214 }
1215 if (fence)
1216 radv_amdgpu_request_to_fence(ctx, fence, &request);
1217
1218 radv_assign_last_submit(ctx, &request);
1219
1220 return 0;
1221 }
1222
1223 static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
1224 int queue_idx,
1225 struct radeon_cmdbuf **cs_array,
1226 unsigned cs_count,
1227 struct radeon_cmdbuf *initial_preamble_cs,
1228 struct radeon_cmdbuf *continue_preamble_cs,
1229 struct radv_winsys_sem_info *sem_info,
1230 const struct radv_winsys_bo_list *bo_list,
1231 bool can_patch,
1232 struct radeon_winsys_fence *_fence)
1233 {
1234 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]);
1235 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1236 int ret;
1237
1238 if (cs->ws->noop)
1239 abort();
1240
1241 assert(sem_info);
1242 if (!cs->ws->use_ib_bos) {
1243 ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
1244 cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
1245 } else if (can_patch) {
1246 ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
1247 cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
1248 } else {
1249 ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, bo_list, cs_array,
1250 cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
1251 }
1252
1253 radv_amdgpu_signal_sems(ctx, cs->hw_ip, queue_idx, sem_info);
1254 return ret;
1255 }
1256
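/*
 * Translate a GPU VA back to a CPU pointer for the IB parser used by cs_dump:
 * the address is looked up in the CS's current and old IB BOs and, with
 * debug_all_bos, in the winsys-wide BO list.
 */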
1257 static void *radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
1258 {
1259 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1260 void *ret = NULL;
1261
1262 if (!cs->ib_buffer)
1263 return NULL;
1264 for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
1265 struct radv_amdgpu_winsys_bo *bo;
1266
1267 bo = (struct radv_amdgpu_winsys_bo*)
1268 (i == cs->num_old_ib_buffers ? cs->ib_buffer : cs->old_ib_buffers[i]);
1269 if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1270 if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
1271 return (char *)ret + (addr - bo->base.va);
1272 }
1273 }
1274 if (cs->ws->debug_all_bos) {
1275 pthread_mutex_lock(&cs->ws->global_bo_list_lock);
1276 list_for_each_entry(struct radv_amdgpu_winsys_bo, bo,
1277 &cs->ws->global_bo_list, global_list_item) {
1278 if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
1279 if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0) {
1280 pthread_mutex_unlock(&cs->ws->global_bo_list_lock);
1281 return (char *)ret + (addr - bo->base.va);
1282 }
1283 }
1284 }
1285 pthread_mutex_unlock(&cs->ws->global_bo_list_lock);
1286 }
1287 return ret;
1288 }
1289
1290 static void radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs,
1291 FILE* file,
1292 const int *trace_ids, int trace_id_count)
1293 {
1294 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1295 void *ib = cs->base.buf;
1296 int num_dw = cs->base.cdw;
1297
1298 if (cs->ws->use_ib_bos) {
1299 ib = radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address);
1300 num_dw = cs->ib.size;
1301 }
1302 assert(ib);
1303 ac_parse_ib(file, ib, num_dw, trace_ids, trace_id_count, "main IB",
1304 cs->ws->info.chip_class, radv_amdgpu_winsys_get_cpu_addr, cs);
1305 }
1306
1307 static uint32_t radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1308 {
1309 switch (radv_priority) {
1310 case RADEON_CTX_PRIORITY_REALTIME:
1311 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1312 case RADEON_CTX_PRIORITY_HIGH:
1313 return AMDGPU_CTX_PRIORITY_HIGH;
1314 case RADEON_CTX_PRIORITY_MEDIUM:
1315 return AMDGPU_CTX_PRIORITY_NORMAL;
1316 case RADEON_CTX_PRIORITY_LOW:
1317 return AMDGPU_CTX_PRIORITY_LOW;
1318 default:
1319 unreachable("Invalid context priority");
1320 }
1321 }
1322
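/*
 * A context owns a small GTT BO holding one 64-bit fence value per
 * (IP type, ring) pair; submissions point the kernel at the matching slot via
 * the fence chunk so that fence waits can be answered from the CPU mapping.
 */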
1323 static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_ws,
1324 enum radeon_ctx_priority priority)
1325 {
1326 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1327 struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1328 uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1329 int r;
1330
1331 if (!ctx)
1332 return NULL;
1333
1334 r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx);
1335 if (r) {
1336 fprintf(stderr, "amdgpu: radv_amdgpu_cs_ctx_create2 failed. (%i)\n", r);
1337 goto error_create;
1338 }
1339 ctx->ws = ws;
1340
1341 assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096);
1342 ctx->fence_bo = ws->base.buffer_create(&ws->base, 4096, 8,
1343 RADEON_DOMAIN_GTT,
1344 RADEON_FLAG_CPU_ACCESS |
1345 RADEON_FLAG_NO_INTERPROCESS_SHARING,
1346 RADV_BO_PRIORITY_CS);
1347 if (ctx->fence_bo)
1348 ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo);
1349 if (ctx->fence_map)
1350 memset(ctx->fence_map, 0, 4096);
1351 return (struct radeon_winsys_ctx *)ctx;
1352 error_create:
1353 FREE(ctx);
1354 return NULL;
1355 }
1356
1357 static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1358 {
1359 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1360 ctx->ws->base.buffer_destroy(ctx->fence_bo);
1361 amdgpu_cs_ctx_free(ctx->ctx);
1362 FREE(ctx);
1363 }
1364
1365 static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx,
1366 enum ring_type ring_type, int ring_index)
1367 {
1368 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1369 int ip_type = ring_to_hw_ip(ring_type);
1370
1371 if (ctx->last_submission[ip_type][ring_index].fence.fence) {
1372 uint32_t expired;
1373 int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence,
1374 1000000000ull, 0, &expired);
1375
1376 if (ret || !expired)
1377 return false;
1378 }
1379
1380 return true;
1381 }
1382
1383 static struct radeon_winsys_sem *radv_amdgpu_create_sem(struct radeon_winsys *_ws)
1384 {
1385 struct amdgpu_cs_fence *sem = CALLOC_STRUCT(amdgpu_cs_fence);
1386 if (!sem)
1387 return NULL;
1388
1389 return (struct radeon_winsys_sem *)sem;
1390 }
1391
1392 static void radv_amdgpu_destroy_sem(struct radeon_winsys_sem *_sem)
1393 {
1394 struct amdgpu_cs_fence *sem = (struct amdgpu_cs_fence *)_sem;
1395 FREE(sem);
1396 }
1397
1398 static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
1399 uint32_t ip_type,
1400 uint32_t ring,
1401 struct radv_winsys_sem_info *sem_info)
1402 {
1403 for (unsigned i = 0; i < sem_info->signal.sem_count; i++) {
1404 struct amdgpu_cs_fence *sem = (struct amdgpu_cs_fence *)(sem_info->signal.sem)[i];
1405
1406 if (sem->context)
1407 return -EINVAL;
1408
1409 *sem = ctx->last_submission[ip_type][ring].fence;
1410 }
1411 return 0;
1412 }
1413
1414 static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
1415 struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1416 {
1417 struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
1418 if (!syncobj)
1419 return NULL;
1420
1421 for (unsigned i = 0; i < counts->syncobj_count; i++) {
1422 struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
1423 sem->handle = counts->syncobj[i];
1424 }
1425
1426 chunk->chunk_id = chunk_id;
1427 chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * counts->syncobj_count;
1428 chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1429 return syncobj;
1430 }
1431
1432 static int radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
1433 struct radv_amdgpu_cs_request *request,
1434 struct radv_winsys_sem_info *sem_info)
1435 {
1436 int r;
1437 int num_chunks;
1438 int size;
1439 bool user_fence;
1440 struct drm_amdgpu_cs_chunk *chunks;
1441 struct drm_amdgpu_cs_chunk_data *chunk_data;
1442 struct drm_amdgpu_cs_chunk_dep *sem_dependencies = NULL;
1443 struct drm_amdgpu_cs_chunk_sem *wait_syncobj = NULL, *signal_syncobj = NULL;
1444 int i;
1445 struct amdgpu_cs_fence *sem;
1446
1447 user_fence = (request->fence_info.handle != NULL);
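/* Worst-case chunk count for the alloca below: one chunk per IB, the optional
 * user-fence chunk, and up to three semaphore chunks (syncobj waits,
 * dependencies, syncobj signals). Unused slots are simply not filled in.
 */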
1448 size = request->number_of_ibs + (user_fence ? 2 : 1) + 3;
1449
1450 chunks = alloca(sizeof(struct drm_amdgpu_cs_chunk) * size);
1451
1452 size = request->number_of_ibs + (user_fence ? 1 : 0);
1453
1454 chunk_data = alloca(sizeof(struct drm_amdgpu_cs_chunk_data) * size);
1455
1456 num_chunks = request->number_of_ibs;
1457 for (i = 0; i < request->number_of_ibs; i++) {
1458 struct amdgpu_cs_ib_info *ib;
1459 chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
1460 chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1461 chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1462
1463 ib = &request->ibs[i];
1464
1465 chunk_data[i].ib_data._pad = 0;
1466 chunk_data[i].ib_data.va_start = ib->ib_mc_address;
1467 chunk_data[i].ib_data.ib_bytes = ib->size * 4;
1468 chunk_data[i].ib_data.ip_type = request->ip_type;
1469 chunk_data[i].ib_data.ip_instance = request->ip_instance;
1470 chunk_data[i].ib_data.ring = request->ring;
1471 chunk_data[i].ib_data.flags = ib->flags;
1472 }
1473
1474 if (user_fence) {
1475 i = num_chunks++;
1476
1477 chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1478 chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1479 chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1480
1481 amdgpu_cs_chunk_fence_info_to_data(&request->fence_info,
1482 &chunk_data[i]);
1483 }
1484
1485 if (sem_info->wait.syncobj_count && sem_info->cs_emit_wait) {
1486 wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait,
1487 &chunks[num_chunks],
1488 AMDGPU_CHUNK_ID_SYNCOBJ_IN);
1489 if (!wait_syncobj) {
1490 r = -ENOMEM;
1491 goto error_out;
1492 }
1493 num_chunks++;
1494
1495 if (sem_info->wait.sem_count == 0)
1496 sem_info->cs_emit_wait = false;
1497
1498 }
1499
1500 if (sem_info->wait.sem_count && sem_info->cs_emit_wait) {
1501 sem_dependencies = alloca(sizeof(struct drm_amdgpu_cs_chunk_dep) * sem_info->wait.sem_count);
1502 int sem_count = 0;
1503
1504 for (unsigned j = 0; j < sem_info->wait.sem_count; j++) {
1505 sem = (struct amdgpu_cs_fence *)sem_info->wait.sem[j];
1506 if (!sem->context)
1507 continue;
1508 struct drm_amdgpu_cs_chunk_dep *dep = &sem_dependencies[sem_count++];
1509
1510 amdgpu_cs_chunk_fence_to_dep(sem, dep);
1511
1512 sem->context = NULL;
1513 }
1514 i = num_chunks++;
1515
1516 /* dependencies chunk */
1517 chunks[i].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
1518 chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_dep) / 4 * sem_count;
1519 chunks[i].chunk_data = (uint64_t)(uintptr_t)sem_dependencies;
1520
1521 sem_info->cs_emit_wait = false;
1522 }
1523
1524 if (sem_info->signal.syncobj_count && sem_info->cs_emit_signal) {
1525 signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal,
1526 &chunks[num_chunks],
1527 AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
1528 if (!signal_syncobj) {
1529 r = -ENOMEM;
1530 goto error_out;
1531 }
1532 num_chunks++;
1533 }
1534
1535 r = amdgpu_cs_submit_raw2(ctx->ws->dev,
1536 ctx->ctx,
1537 request->resources,
1538 num_chunks,
1539 chunks,
1540 &request->seq_no);
1541 error_out:
1542 free(wait_syncobj);
1543 free(signal_syncobj);
1544 return r;
1545 }
1546
1547 static int radv_amdgpu_create_syncobj(struct radeon_winsys *_ws,
1548 uint32_t *handle)
1549 {
1550 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1551 return amdgpu_cs_create_syncobj(ws->dev, handle);
1552 }
1553
1554 static void radv_amdgpu_destroy_syncobj(struct radeon_winsys *_ws,
1555 uint32_t handle)
1556 {
1557 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1558 amdgpu_cs_destroy_syncobj(ws->dev, handle);
1559 }
1560
1561 static void radv_amdgpu_reset_syncobj(struct radeon_winsys *_ws,
1562 uint32_t handle)
1563 {
1564 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1565 amdgpu_cs_syncobj_reset(ws->dev, &handle, 1);
1566 }
1567
1568 static void radv_amdgpu_signal_syncobj(struct radeon_winsys *_ws,
1569 uint32_t handle)
1570 {
1571 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1572 amdgpu_cs_syncobj_signal(ws->dev, &handle, 1);
1573 }
1574
1575 static bool radv_amdgpu_wait_syncobj(struct radeon_winsys *_ws, const uint32_t *handles,
1576 uint32_t handle_count, bool wait_all, uint64_t timeout)
1577 {
1578 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1579 uint32_t tmp;
1580
1581 /* The kernel timeout is signed, while Vulkan timeouts are unsigned. */
1582 timeout = MIN2(timeout, INT64_MAX);
1583
1584 int ret = amdgpu_cs_syncobj_wait(ws->dev, (uint32_t*)handles, handle_count, timeout,
1585 DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
1586 (wait_all ? DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL : 0),
1587 &tmp);
1588 if (ret == 0) {
1589 return true;
1590 } else if (ret == -ETIME) {
1591 return false;
1592 } else {
1593 fprintf(stderr, "amdgpu: radv_amdgpu_wait_syncobj failed! (%d)\n", ret);
1594 return false;
1595 }
1596 }
1597
1598 static int radv_amdgpu_export_syncobj(struct radeon_winsys *_ws,
1599 uint32_t syncobj,
1600 int *fd)
1601 {
1602 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1603
1604 return amdgpu_cs_export_syncobj(ws->dev, syncobj, fd);
1605 }
1606
1607 static int radv_amdgpu_import_syncobj(struct radeon_winsys *_ws,
1608 int fd,
1609 uint32_t *syncobj)
1610 {
1611 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1612
1613 return amdgpu_cs_import_syncobj(ws->dev, fd, syncobj);
1614 }
1615
1616
1617 static int radv_amdgpu_export_syncobj_to_sync_file(struct radeon_winsys *_ws,
1618 uint32_t syncobj,
1619 int *fd)
1620 {
1621 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1622
1623 return amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, fd);
1624 }
1625
1626 static int radv_amdgpu_import_syncobj_from_sync_file(struct radeon_winsys *_ws,
1627 uint32_t syncobj,
1628 int fd)
1629 {
1630 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1631
1632 return amdgpu_cs_syncobj_import_sync_file(ws->dev, syncobj, fd);
1633 }
1634
1635 void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
1636 {
1637 ws->base.ctx_create = radv_amdgpu_ctx_create;
1638 ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
1639 ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
1640 ws->base.cs_create = radv_amdgpu_cs_create;
1641 ws->base.cs_destroy = radv_amdgpu_cs_destroy;
1642 ws->base.cs_grow = radv_amdgpu_cs_grow;
1643 ws->base.cs_finalize = radv_amdgpu_cs_finalize;
1644 ws->base.cs_reset = radv_amdgpu_cs_reset;
1645 ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
1646 ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
1647 ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
1648 ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
1649 ws->base.create_fence = radv_amdgpu_create_fence;
1650 ws->base.destroy_fence = radv_amdgpu_destroy_fence;
1651 ws->base.reset_fence = radv_amdgpu_reset_fence;
1652 ws->base.signal_fence = radv_amdgpu_signal_fence;
1653 ws->base.is_fence_waitable = radv_amdgpu_is_fence_waitable;
1654 ws->base.create_sem = radv_amdgpu_create_sem;
1655 ws->base.destroy_sem = radv_amdgpu_destroy_sem;
1656 ws->base.create_syncobj = radv_amdgpu_create_syncobj;
1657 ws->base.destroy_syncobj = radv_amdgpu_destroy_syncobj;
1658 ws->base.reset_syncobj = radv_amdgpu_reset_syncobj;
1659 ws->base.signal_syncobj = radv_amdgpu_signal_syncobj;
1660 ws->base.wait_syncobj = radv_amdgpu_wait_syncobj;
1661 ws->base.export_syncobj = radv_amdgpu_export_syncobj;
1662 ws->base.import_syncobj = radv_amdgpu_import_syncobj;
1663 ws->base.export_syncobj_to_sync_file = radv_amdgpu_export_syncobj_to_sync_file;
1664 ws->base.import_syncobj_from_sync_file = radv_amdgpu_import_syncobj_from_sync_file;
1665 ws->base.fence_wait = radv_amdgpu_fence_wait;
1666 ws->base.fences_wait = radv_amdgpu_fences_wait;
1667 }