radv: reduce CPU overhead merging bo lists.
mesa.git: src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <amdgpu.h>
#include <amdgpu_drm.h>
#include <assert.h>

#include "ac_debug.h"
#include "amdgpu_id.h"
#include "radv_radeon_winsys.h"
#include "radv_amdgpu_cs.h"
#include "radv_amdgpu_bo.h"
#include "sid.h"

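/* A command stream: wraps the CPU-side command buffer, the hardware
 * indirect buffer (IB) backing it, the list of BOs the commands reference,
 * and a small direct-mapped hash table that keeps repeated cs_add_buffer()
 * calls cheap. */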
struct radv_amdgpu_cs {
    struct radeon_winsys_cs base;
    struct radv_amdgpu_winsys *ws;

    struct amdgpu_cs_ib_info ib;

    struct radeon_winsys_bo *ib_buffer;
    uint8_t *ib_mapped;
    unsigned max_num_buffers;
    unsigned num_buffers;
    amdgpu_bo_handle *handles;
    uint8_t *priorities;

    struct radeon_winsys_bo **old_ib_buffers;
    unsigned num_old_ib_buffers;
    unsigned max_num_old_ib_buffers;
    unsigned *ib_size_ptr;
    bool failed;
    bool is_chained;

    int buffer_hash_table[1024];
    unsigned hw_ip;
};

static inline struct radv_amdgpu_cs *
radv_amdgpu_cs(struct radeon_winsys_cs *base)
{
    return (struct radv_amdgpu_cs*)base;
}

static int ring_to_hw_ip(enum ring_type ring)
{
    switch (ring) {
    case RING_GFX:
        return AMDGPU_HW_IP_GFX;
    case RING_DMA:
        return AMDGPU_HW_IP_DMA;
    case RING_COMPUTE:
        return AMDGPU_HW_IP_COMPUTE;
    default:
        unreachable("unsupported ring");
    }
}

static void radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx,
                                         struct amdgpu_cs_fence *fence,
                                         struct amdgpu_cs_request *req)
{
    fence->context = ctx->ctx;
    fence->ip_type = req->ip_type;
    fence->ip_instance = req->ip_instance;
    fence->ring = req->ring;
    fence->fence = req->seq_no;
}

static struct radeon_winsys_fence *radv_amdgpu_create_fence(void)
{
    struct amdgpu_cs_fence *fence = calloc(1, sizeof(struct amdgpu_cs_fence));
    return (struct radeon_winsys_fence*)fence;
}

static void radv_amdgpu_destroy_fence(struct radeon_winsys_fence *_fence)
{
    struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    free(fence);
}

static bool radv_amdgpu_fence_wait(struct radeon_winsys *_ws,
                                   struct radeon_winsys_fence *_fence,
                                   bool absolute,
                                   uint64_t timeout)
{
    struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    unsigned flags = absolute ? AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE : 0;
    int r;
    uint32_t expired = 0;

    /* Use the libdrm query. */
    r = amdgpu_cs_query_fence_status(fence,
                                     timeout,
                                     flags,
                                     &expired);

    if (r) {
        fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
        return false;
    }

    return expired != 0;
}

static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);

    if (cs->ib_buffer)
        cs->ws->base.buffer_destroy(cs->ib_buffer);
    else
        free(cs->base.buf);

    for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
        cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);

    free(cs->old_ib_buffers);
    free(cs->handles);
    free(cs->priorities);
    free(cs);
}

static bool radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs,
                                enum ring_type ring_type)
{
    for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
        cs->buffer_hash_table[i] = -1;

    cs->hw_ip = ring_to_hw_ip(ring_type);
    return true;
}

static struct radeon_winsys_cs *
radv_amdgpu_cs_create(struct radeon_winsys *ws,
                      enum ring_type ring_type)
{
    struct radv_amdgpu_cs *cs;
    uint32_t ib_size = 20 * 1024 * 4;
    cs = calloc(1, sizeof(struct radv_amdgpu_cs));
    if (!cs)
        return NULL;

    cs->ws = radv_amdgpu_winsys(ws);
    radv_amdgpu_init_cs(cs, ring_type);

    if (cs->ws->use_ib_bos) {
        cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
                                          RADEON_DOMAIN_GTT,
                                          RADEON_FLAG_CPU_ACCESS);
        if (!cs->ib_buffer) {
            free(cs);
            return NULL;
        }

        cs->ib_mapped = ws->buffer_map(cs->ib_buffer);
        if (!cs->ib_mapped) {
            ws->buffer_destroy(cs->ib_buffer);
            free(cs);
            return NULL;
        }

        cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
        cs->base.buf = (uint32_t *)cs->ib_mapped;
        cs->base.max_dw = ib_size / 4 - 4;
        cs->ib_size_ptr = &cs->ib.size;
        cs->ib.size = 0;

        ws->cs_add_buffer(&cs->base, cs->ib_buffer, 8);
    } else {
        cs->base.buf = malloc(16384);
        cs->base.max_dw = 4096;
        if (!cs->base.buf) {
            free(cs);
            return NULL;
        }
    }

    return &cs->base;
}

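/* Grow the command stream. Without IB BOs the CS lives in malloc'ed memory
 * and is realloc'ed, up to the kernel limit of 0xffff8 dwords. With IB BOs,
 * the current IB is padded with NOPs so that the trailing 4-dword chain
 * packet ends on an 8-dword boundary, a new IB buffer is allocated, and an
 * INDIRECT_BUFFER packet chains the old IB to the new one; the size field
 * of that packet is filled in later through ib_size_ptr. */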
static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);

    if (cs->failed) {
        cs->base.cdw = 0;
        return;
    }

    if (!cs->ws->use_ib_bos) {
        const uint64_t limit_dws = 0xffff8;
        uint64_t ib_dws = MAX2(cs->base.cdw + min_size,
                               MIN2(cs->base.max_dw * 2, limit_dws));

        /* The total ib size cannot exceed limit_dws dwords. */
        if (ib_dws > limit_dws) {
            cs->failed = true;
            cs->base.cdw = 0;
            return;
        }

        uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
        if (new_buf) {
            cs->base.buf = new_buf;
            cs->base.max_dw = ib_dws;
        } else {
            cs->failed = true;
            cs->base.cdw = 0;
        }
        return;
    }

    uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);

    /* max that fits in the chain size field. */
    ib_size = MIN2(ib_size, 0xfffff);

    while (!cs->base.cdw || (cs->base.cdw & 7) != 4)
        cs->base.buf[cs->base.cdw++] = 0xffff1000;

    *cs->ib_size_ptr |= cs->base.cdw + 4;

    if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) {
        cs->max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2);
        cs->old_ib_buffers = realloc(cs->old_ib_buffers,
                                     cs->max_num_old_ib_buffers * sizeof(void*));
    }

    cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer;

    cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
                                               RADEON_DOMAIN_GTT,
                                               RADEON_FLAG_CPU_ACCESS);

    if (!cs->ib_buffer) {
        cs->base.cdw = 0;
        cs->failed = true;
        cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
    }

    cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
    if (!cs->ib_mapped) {
        cs->ws->base.buffer_destroy(cs->ib_buffer);
        cs->base.cdw = 0;
        cs->failed = true;
        cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
    }

    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);

    cs->base.buf[cs->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
    cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
    cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->va >> 32;
    cs->ib_size_ptr = cs->base.buf + cs->base.cdw;
    cs->base.buf[cs->base.cdw++] = S_3F2_CHAIN(1) | S_3F2_VALID(1);

    cs->base.buf = (uint32_t *)cs->ib_mapped;
    cs->base.cdw = 0;
    cs->base.max_dw = ib_size / 4 - 4;
}

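/* Pad the IB to an 8-dword boundary with NOPs and accumulate the final size
 * into whatever ib_size_ptr currently points at (ib.size for the first IB,
 * the chain packet of the previous IB otherwise). */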
static bool radv_amdgpu_cs_finalize(struct radeon_winsys_cs *_cs)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);

    if (cs->ws->use_ib_bos) {
        while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
            cs->base.buf[cs->base.cdw++] = 0xffff1000;

        *cs->ib_size_ptr |= cs->base.cdw;

        cs->is_chained = false;
    }

    return !cs->failed;
}

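/* Reset the CS for reuse. Only the hash table slots of buffers that were
 * actually added are cleared, so the cost is O(num_buffers) instead of a
 * full table wipe; all chained IB buffers are released. */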
static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    cs->base.cdw = 0;
    cs->failed = false;

    for (unsigned i = 0; i < cs->num_buffers; ++i) {
        unsigned hash = ((uintptr_t)cs->handles[i] >> 6) &
                        (ARRAY_SIZE(cs->buffer_hash_table) - 1);
        cs->buffer_hash_table[hash] = -1;
    }

    cs->num_buffers = 0;

    if (cs->ws->use_ib_bos) {
        cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);

        for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
            cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);

        cs->num_old_ib_buffers = 0;
        cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->va;
        cs->ib_size_ptr = &cs->ib.size;
        cs->ib.size = 0;
    }
}

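/* Look up a BO in the CS buffer list. The hash table is a direct-mapped
 * cache indexed by bits of the handle pointer; on a cache miss the list is
 * scanned linearly and the cache entry refreshed. */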
static int radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs,
                                      amdgpu_bo_handle bo)
{
    unsigned hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
    int index = cs->buffer_hash_table[hash];

    if (index == -1)
        return -1;

    if (cs->handles[index] == bo)
        return index;

    for (unsigned i = 0; i < cs->num_buffers; ++i) {
        if (cs->handles[i] == bo) {
            cs->buffer_hash_table[hash] = i;
            return i;
        }
    }

    return -1;
}

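/* Add a BO to the CS buffer list, or raise its priority if it is already
 * listed. The handle and priority arrays grow geometrically. */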
static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
                                               amdgpu_bo_handle bo,
                                               uint8_t priority)
{
    unsigned hash;
    int index = radv_amdgpu_cs_find_buffer(cs, bo);

    if (index != -1) {
        cs->priorities[index] = MAX2(cs->priorities[index], priority);
        return;
    }

    if (cs->num_buffers == cs->max_num_buffers) {
        unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
        cs->handles = realloc(cs->handles, new_count * sizeof(amdgpu_bo_handle));
        cs->priorities = realloc(cs->priorities, new_count * sizeof(uint8_t));
        cs->max_num_buffers = new_count;
    }

    cs->handles[cs->num_buffers] = bo;
    cs->priorities[cs->num_buffers] = priority;

    hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
    cs->buffer_hash_table[hash] = cs->num_buffers;

    ++cs->num_buffers;
}

static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs,
                                      struct radeon_winsys_bo *_bo,
                                      uint8_t priority)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);

    radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority);
}

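/* Execute a secondary command buffer: merge the child's BOs into the
 * parent's list, then either call the child through an INDIRECT_BUFFER
 * packet (IB BO path) or copy its contents inline (sysmem path). */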
static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent,
                                             struct radeon_winsys_cs *_child)
{
    struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
    struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);

    for (unsigned i = 0; i < child->num_buffers; ++i) {
        radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i],
                                           child->priorities[i]);
    }

    if (parent->ws->use_ib_bos) {
        if (parent->base.cdw + 4 > parent->base.max_dw)
            radv_amdgpu_cs_grow(&parent->base, 4);

        parent->base.buf[parent->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
        parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address;
        parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address >> 32;
        parent->base.buf[parent->base.cdw++] = child->ib.size;
    } else {
        if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
            radv_amdgpu_cs_grow(&parent->base, child->base.cdw);

        memcpy(parent->base.buf + parent->base.cdw, child->base.buf, 4 * child->base.cdw);
        parent->base.cdw += child->base.cdw;
    }
}

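/* Build the kernel BO list for a submission. Three strategies, cheapest
 * first: with debug_all_bos, submit every BO the winsys knows about; for a
 * single CS with no extras, hand its arrays to the kernel directly; in the
 * general case, merge all lists, seeding the output from the first
 * non-empty CS with a memcpy and de-duplicating only the remainder against
 * it. That seeding skips the per-BO search in the common case and is the
 * CPU-overhead reduction this change is about. */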
static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                                      struct radeon_winsys_cs **cs_array,
                                      unsigned count,
                                      struct radv_amdgpu_winsys_bo *extra_bo,
                                      struct radeon_winsys_cs *extra_cs,
                                      amdgpu_bo_list_handle *bo_list)
{
    int r;
    if (ws->debug_all_bos) {
        struct radv_amdgpu_winsys_bo *bo;
        amdgpu_bo_handle *handles;
        unsigned num = 0;

        pthread_mutex_lock(&ws->global_bo_list_lock);

        handles = malloc(sizeof(handles[0]) * ws->num_buffers);
        if (!handles) {
            pthread_mutex_unlock(&ws->global_bo_list_lock);
            return -ENOMEM;
        }

        LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
            assert(num < ws->num_buffers);
            handles[num++] = bo->bo;
        }

        r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
                                  handles, NULL,
                                  bo_list);
        free(handles);
        pthread_mutex_unlock(&ws->global_bo_list_lock);
    } else if (count == 1 && !extra_bo && !extra_cs) {
        struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
        r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
                                  cs->priorities, bo_list);
    } else {
        unsigned total_buffer_count = !!extra_bo;
        unsigned unique_bo_count = !!extra_bo;
        for (unsigned i = 0; i < count; ++i) {
            struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
            total_buffer_count += cs->num_buffers;
        }

        if (extra_cs) {
            total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
        }

        amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count);
        uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count);
        if (!handles || !priorities) {
            free(handles);
            free(priorities);
            return -ENOMEM;
        }

        if (extra_bo) {
            handles[0] = extra_bo->bo;
            priorities[0] = 8;
        }

        for (unsigned i = 0; i < count + !!extra_cs; ++i) {
            struct radv_amdgpu_cs *cs;

            if (i == count)
                cs = (struct radv_amdgpu_cs*)extra_cs;
            else
                cs = (struct radv_amdgpu_cs*)cs_array[i];

            if (!cs->num_buffers)
                continue;

            if (unique_bo_count == 0) {
                memcpy(handles, cs->handles, cs->num_buffers * sizeof(amdgpu_bo_handle));
                memcpy(priorities, cs->priorities, cs->num_buffers * sizeof(uint8_t));
                unique_bo_count = cs->num_buffers;
                continue;
            }
            int unique_bo_so_far = unique_bo_count;
            for (unsigned j = 0; j < cs->num_buffers; ++j) {
                bool found = false;
                for (unsigned k = 0; k < unique_bo_so_far; ++k) {
                    if (handles[k] == cs->handles[j]) {
                        found = true;
                        priorities[k] = MAX2(priorities[k],
                                             cs->priorities[j]);
                        break;
                    }
                }
                if (!found) {
                    handles[unique_bo_count] = cs->handles[j];
                    priorities[unique_bo_count] = cs->priorities[j];
                    ++unique_bo_count;
                }
            }
        }
        r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
                                  priorities, bo_list);

        free(handles);
        free(priorities);
    }

    return r;
}

static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
                                    struct amdgpu_cs_request *request)
{
    radv_amdgpu_request_to_fence(ctx,
                                 &ctx->last_submission[request->ip_type][request->ring],
                                 request);
}

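/* Submit by chaining: patch each CS with an INDIRECT_BUFFER packet that
 * jumps to the next one, so the whole array reaches the kernel as a single
 * IB (plus an optional preamble). */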
static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
                                                int queue_idx,
                                                struct radeon_winsys_cs **cs_array,
                                                unsigned cs_count,
                                                struct radeon_winsys_cs *preamble_cs,
                                                struct radeon_winsys_fence *_fence)
{
    int r;
    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
    struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
    amdgpu_bo_list_handle bo_list;
    struct amdgpu_cs_request request = {0};
    struct amdgpu_cs_ib_info ibs[2];

    for (unsigned i = cs_count; i--;) {
        struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);

        if (cs->is_chained) {
            *cs->ib_size_ptr -= 4;
            cs->is_chained = false;
        }

        if (i + 1 < cs_count) {
            struct radv_amdgpu_cs *next = radv_amdgpu_cs(cs_array[i + 1]);
            assert(cs->base.cdw + 4 <= cs->base.max_dw);

            cs->is_chained = true;
            *cs->ib_size_ptr += 4;

            cs->base.buf[cs->base.cdw + 0] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
            cs->base.buf[cs->base.cdw + 1] = next->ib.ib_mc_address;
            cs->base.buf[cs->base.cdw + 2] = next->ib.ib_mc_address >> 32;
            cs->base.buf[cs->base.cdw + 3] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | next->ib.size;
        }
    }

    r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, preamble_cs, &bo_list);
    if (r) {
        fprintf(stderr, "amdgpu: Failed to create the BO list for submission\n");
        return r;
    }

    request.ip_type = cs0->hw_ip;
    request.ring = queue_idx;
    request.number_of_ibs = 1;
    request.ibs = &cs0->ib;
    request.resources = bo_list;

    if (preamble_cs) {
        request.ibs = ibs;
        request.number_of_ibs = 2;
        ibs[1] = cs0->ib;
        ibs[0] = ((struct radv_amdgpu_cs*)preamble_cs)->ib;
    }

    r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
        else
            fprintf(stderr, "amdgpu: The CS has been rejected, "
                            "see dmesg for more information.\n");
    }

    amdgpu_bo_list_destroy(bo_list);

    if (fence)
        radv_amdgpu_request_to_fence(ctx, fence, &request);

    radv_assign_last_submit(ctx, &request);

    return r;
}

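/* Submit without chaining: split the CS array into groups of at most
 * AMDGPU_CS_MAX_IBS_PER_SUBMIT IBs (one slot reserved for the preamble if
 * present) and turn each group into its own kernel submission. */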
static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
                                                 int queue_idx,
                                                 struct radeon_winsys_cs **cs_array,
                                                 unsigned cs_count,
                                                 struct radeon_winsys_cs *preamble_cs,
                                                 struct radeon_winsys_fence *_fence)
{
    int r;
    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
    struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    amdgpu_bo_list_handle bo_list;
    struct amdgpu_cs_request request;

    assert(cs_count);

    for (unsigned i = 0; i < cs_count;) {
        struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[i]);
        struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
        unsigned cnt = MIN2(AMDGPU_CS_MAX_IBS_PER_SUBMIT - !!preamble_cs,
                            cs_count - i);

        memset(&request, 0, sizeof(request));

        r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
                                       preamble_cs, &bo_list);
        if (r) {
            fprintf(stderr, "amdgpu: Failed to create the BO list for submission\n");
            return r;
        }

        request.ip_type = cs0->hw_ip;
        request.ring = queue_idx;
        request.resources = bo_list;
        request.number_of_ibs = cnt + !!preamble_cs;
        request.ibs = ibs;

        if (preamble_cs) {
            ibs[0] = radv_amdgpu_cs(preamble_cs)->ib;
        }

        for (unsigned j = 0; j < cnt; ++j) {
            struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
            ibs[j + !!preamble_cs] = cs->ib;

            if (cs->is_chained) {
                *cs->ib_size_ptr -= 4;
                cs->is_chained = false;
            }
        }

        r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
        if (r) {
            if (r == -ENOMEM)
                fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
            else
                fprintf(stderr, "amdgpu: The CS has been rejected, "
                                "see dmesg for more information.\n");
        }

        amdgpu_bo_list_destroy(bo_list);

        if (r)
            return r;

        i += cnt;
    }
    if (fence)
        radv_amdgpu_request_to_fence(ctx, fence, &request);

    radv_assign_last_submit(ctx, &request);

    return 0;
}

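/* Submit CSes that live in system memory: copy as many as fit under the
 * kernel IB size limit into a freshly allocated GTT BO, pad to an 8-dword
 * boundary with the family's NOP dword, and submit that BO as one IB. */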
static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
                                               int queue_idx,
                                               struct radeon_winsys_cs **cs_array,
                                               unsigned cs_count,
                                               struct radeon_winsys_cs *preamble_cs,
                                               struct radeon_winsys_fence *_fence)
{
    int r;
    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
    struct amdgpu_cs_fence *fence = (struct amdgpu_cs_fence *)_fence;
    struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[0]);
    struct radeon_winsys *ws = (struct radeon_winsys*)cs0->ws;
    amdgpu_bo_list_handle bo_list;
    struct amdgpu_cs_request request;
    uint32_t pad_word = 0xffff1000U;

    if (radv_amdgpu_winsys(ws)->family == FAMILY_SI)
        pad_word = 0x80000000;

    assert(cs_count);

    for (unsigned i = 0; i < cs_count;) {
        struct amdgpu_cs_ib_info ib = {0};
        struct radeon_winsys_bo *bo = NULL;
        uint32_t *ptr;
        unsigned cnt = 0;
        unsigned size = 0;

        if (preamble_cs)
            size += preamble_cs->cdw;

        while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
            size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
            ++cnt;
        }

        assert(cnt);

        bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS);
        ptr = ws->buffer_map(bo);

        if (preamble_cs) {
            memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
            ptr += preamble_cs->cdw;
        }

        for (unsigned j = 0; j < cnt; ++j) {
            struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
            memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
            ptr += cs->base.cdw;
        }

        while (!size || (size & 7)) {
            *ptr++ = pad_word;
            ++size;
        }

        memset(&request, 0, sizeof(request));

        r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
                                       (struct radv_amdgpu_winsys_bo*)bo,
                                       preamble_cs, &bo_list);
        if (r) {
            fprintf(stderr, "amdgpu: Failed to create the BO list for submission\n");
            return r;
        }

        ib.size = size;
        ib.ib_mc_address = ws->buffer_get_va(bo);

        request.ip_type = cs0->hw_ip;
        request.ring = queue_idx;
        request.resources = bo_list;
        request.number_of_ibs = 1;
        request.ibs = &ib;

        r = amdgpu_cs_submit(ctx->ctx, 0, &request, 1);
        if (r) {
            if (r == -ENOMEM)
                fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
            else
                fprintf(stderr, "amdgpu: The CS has been rejected, "
                                "see dmesg for more information.\n");
        }

        amdgpu_bo_list_destroy(bo_list);

        ws->buffer_destroy(bo);
        if (r)
            return r;

        i += cnt;
    }
    if (fence)
        radv_amdgpu_request_to_fence(ctx, fence, &request);

    radv_assign_last_submit(ctx, &request);

    return 0;
}

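/* Top-level submit: queue the wait semaphores, pick a submission path, then
 * queue the signal semaphores. Note the trailing "&& false": the chained
 * path is disabled for now, so chain-capable submissions take the fallback
 * path. */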
static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
                                        int queue_idx,
                                        struct radeon_winsys_cs **cs_array,
                                        unsigned cs_count,
                                        struct radeon_winsys_cs *preamble_cs,
                                        struct radeon_winsys_sem **wait_sem,
                                        unsigned wait_sem_count,
                                        struct radeon_winsys_sem **signal_sem,
                                        unsigned signal_sem_count,
                                        bool can_patch,
                                        struct radeon_winsys_fence *_fence)
{
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[0]);
    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
    int ret;
    int i;

    for (i = 0; i < wait_sem_count; i++) {
        amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)wait_sem[i];
        amdgpu_cs_wait_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
                                 sem);
    }
    if (!cs->ws->use_ib_bos) {
        ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, cs_array,
                                                  cs_count, preamble_cs, _fence);
    } else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && false) {
        ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, cs_array,
                                                   cs_count, preamble_cs, _fence);
    } else {
        ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, cs_array,
                                                    cs_count, preamble_cs, _fence);
    }

    for (i = 0; i < signal_sem_count; i++) {
        amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)signal_sem[i];
        amdgpu_cs_signal_semaphore(ctx->ctx, cs->hw_ip, 0, queue_idx,
                                   sem);
    }
    return ret;
}

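/* Map a GPU VA in any of this CS's IBs (current or chained) back to a CPU
 * pointer; used as the address callback when dumping IBs below. */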
static void *radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
{
    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
    void *ret = NULL;
    for (unsigned i = 0; i <= cs->num_old_ib_buffers; ++i) {
        struct radv_amdgpu_winsys_bo *bo;

        bo = (struct radv_amdgpu_winsys_bo*)
             (i == cs->num_old_ib_buffers ? cs->ib_buffer : cs->old_ib_buffers[i]);
        if (addr >= bo->va && addr - bo->va < bo->size) {
            if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
                return (char *)ret + (addr - bo->va);
        }
    }
    return ret;
}

static void radv_amdgpu_winsys_cs_dump(struct radeon_winsys_cs *_cs,
                                       FILE* file,
                                       uint32_t trace_id)
{
    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;

    ac_parse_ib(file,
                radv_amdgpu_winsys_get_cpu_addr(cs, cs->ib.ib_mc_address),
                cs->ib.size, trace_id, "main IB", cs->ws->info.chip_class,
                radv_amdgpu_winsys_get_cpu_addr, cs);
}

static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_ws)
{
    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
    struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
    int r;

    if (!ctx)
        return NULL;
    r = amdgpu_cs_ctx_create(ws->dev, &ctx->ctx);
    if (r) {
        fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
        goto error_create;
    }
    ctx->ws = ws;
    return (struct radeon_winsys_ctx *)ctx;
error_create:
    FREE(ctx);
    return NULL;
}

static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
    amdgpu_cs_ctx_free(ctx->ctx);
    FREE(ctx);
}

static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx,
                                      enum ring_type ring_type, int ring_index)
{
    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
    int ip_type = ring_to_hw_ip(ring_type);

    if (ctx->last_submission[ip_type][ring_index].fence) {
        uint32_t expired;
        int ret = amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index],
                                               1000000000ull, 0, &expired);

        if (ret || !expired)
            return false;
    }

    return true;
}

static struct radeon_winsys_sem *radv_amdgpu_create_sem(struct radeon_winsys *_ws)
{
    int ret;
    amdgpu_semaphore_handle sem;

    ret = amdgpu_cs_create_semaphore(&sem);
    if (ret)
        return NULL;
    return (struct radeon_winsys_sem *)sem;
}

static void radv_amdgpu_destroy_sem(struct radeon_winsys_sem *_sem)
{
    amdgpu_semaphore_handle sem = (amdgpu_semaphore_handle)_sem;
    amdgpu_cs_destroy_semaphore(sem);
}

void radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
{
    ws->base.ctx_create = radv_amdgpu_ctx_create;
    ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
    ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
    ws->base.cs_create = radv_amdgpu_cs_create;
    ws->base.cs_destroy = radv_amdgpu_cs_destroy;
    ws->base.cs_grow = radv_amdgpu_cs_grow;
    ws->base.cs_finalize = radv_amdgpu_cs_finalize;
    ws->base.cs_reset = radv_amdgpu_cs_reset;
    ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
    ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
    ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
    ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
    ws->base.create_fence = radv_amdgpu_create_fence;
    ws->base.destroy_fence = radv_amdgpu_destroy_fence;
    ws->base.create_sem = radv_amdgpu_create_sem;
    ws->base.destroy_sem = radv_amdgpu_destroy_sem;
    ws->base.fence_wait = radv_amdgpu_fence_wait;
}