radv: set DISABLE_CONSTANT_ENCODE_REG to 1 for Raven2
[mesa.git] / src / amd / vulkan / radv_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_private.h"
29 #include "radv_radeon_winsys.h"
30 #include "radv_shader.h"
31 #include "radv_cs.h"
32 #include "sid.h"
33 #include "vk_format.h"
34 #include "vk_util.h"
35 #include "radv_debug.h"
36 #include "radv_meta.h"
37
38 #include "ac_debug.h"
39
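/* Flags tracking which shader binaries (and the vertex buffer descriptor
 * table) still have to be prefetched into L2 with CP DMA;
 * radv_emit_prefetch_L2() consumes and clears them.
 */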
40 enum {
41 RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
42 RADV_PREFETCH_VS = (1 << 1),
43 RADV_PREFETCH_TCS = (1 << 2),
44 RADV_PREFETCH_TES = (1 << 3),
45 RADV_PREFETCH_GS = (1 << 4),
46 RADV_PREFETCH_PS = (1 << 5),
47 RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS |
48 RADV_PREFETCH_TCS |
49 RADV_PREFETCH_TES |
50 RADV_PREFETCH_GS |
51 RADV_PREFETCH_PS)
52 };
53
54 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
55 struct radv_image *image,
56 VkImageLayout src_layout,
57 VkImageLayout dst_layout,
58 uint32_t src_family,
59 uint32_t dst_family,
60 const VkImageSubresourceRange *range,
61 struct radv_sample_locations_state *sample_locs);
62
63 const struct radv_dynamic_state default_dynamic_state = {
64 .viewport = {
65 .count = 0,
66 },
67 .scissor = {
68 .count = 0,
69 },
70 .line_width = 1.0f,
71 .depth_bias = {
72 .bias = 0.0f,
73 .clamp = 0.0f,
74 .slope = 0.0f,
75 },
76 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
77 .depth_bounds = {
78 .min = 0.0f,
79 .max = 1.0f,
80 },
81 .stencil_compare_mask = {
82 .front = ~0u,
83 .back = ~0u,
84 },
85 .stencil_write_mask = {
86 .front = ~0u,
87 .back = ~0u,
88 },
89 .stencil_reference = {
90 .front = 0u,
91 .back = 0u,
92 },
93 };
94
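/**
 * Copy the pipeline's statically-specified dynamic state into the command
 * buffer, marking only the values that actually changed as dirty so that
 * redundant register writes are avoided on the next draw.
 */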
95 static void
96 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
97 const struct radv_dynamic_state *src)
98 {
99 struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
100 uint32_t copy_mask = src->mask;
101 uint32_t dest_mask = 0;
102
103 /* Make sure to copy the number of viewports/scissors because they can
104 * only be specified at pipeline creation time.
105 */
106 dest->viewport.count = src->viewport.count;
107 dest->scissor.count = src->scissor.count;
108 dest->discard_rectangle.count = src->discard_rectangle.count;
109 dest->sample_location.count = src->sample_location.count;
110
111 if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
112 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
113 src->viewport.count * sizeof(VkViewport))) {
114 typed_memcpy(dest->viewport.viewports,
115 src->viewport.viewports,
116 src->viewport.count);
117 dest_mask |= RADV_DYNAMIC_VIEWPORT;
118 }
119 }
120
121 if (copy_mask & RADV_DYNAMIC_SCISSOR) {
122 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
123 src->scissor.count * sizeof(VkRect2D))) {
124 typed_memcpy(dest->scissor.scissors,
125 src->scissor.scissors, src->scissor.count);
126 dest_mask |= RADV_DYNAMIC_SCISSOR;
127 }
128 }
129
130 if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
131 if (dest->line_width != src->line_width) {
132 dest->line_width = src->line_width;
133 dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
134 }
135 }
136
137 if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
138 if (memcmp(&dest->depth_bias, &src->depth_bias,
139 sizeof(src->depth_bias))) {
140 dest->depth_bias = src->depth_bias;
141 dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
142 }
143 }
144
145 if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
146 if (memcmp(&dest->blend_constants, &src->blend_constants,
147 sizeof(src->blend_constants))) {
148 typed_memcpy(dest->blend_constants,
149 src->blend_constants, 4);
150 dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
151 }
152 }
153
154 if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
155 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
156 sizeof(src->depth_bounds))) {
157 dest->depth_bounds = src->depth_bounds;
158 dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
159 }
160 }
161
162 if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
163 if (memcmp(&dest->stencil_compare_mask,
164 &src->stencil_compare_mask,
165 sizeof(src->stencil_compare_mask))) {
166 dest->stencil_compare_mask = src->stencil_compare_mask;
167 dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
168 }
169 }
170
171 if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
172 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
173 sizeof(src->stencil_write_mask))) {
174 dest->stencil_write_mask = src->stencil_write_mask;
175 dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
176 }
177 }
178
179 if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
180 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
181 sizeof(src->stencil_reference))) {
182 dest->stencil_reference = src->stencil_reference;
183 dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
184 }
185 }
186
187 if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
188 if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
189 src->discard_rectangle.count * sizeof(VkRect2D))) {
190 typed_memcpy(dest->discard_rectangle.rectangles,
191 src->discard_rectangle.rectangles,
192 src->discard_rectangle.count);
193 dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
194 }
195 }
196
197 if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
198 if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
199 dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
200 dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
201 memcmp(&dest->sample_location.locations,
202 &src->sample_location.locations,
203 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
204 dest->sample_location.per_pixel = src->sample_location.per_pixel;
205 dest->sample_location.grid_size = src->sample_location.grid_size;
206 typed_memcpy(dest->sample_location.locations,
207 src->sample_location.locations,
208 src->sample_location.count);
209 dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
210 }
211 }
212
213 cmd_buffer->state.dirty |= dest_mask;
214 }
215
216 static void
217 radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
218 struct radv_pipeline *pipeline)
219 {
220 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
221 struct radv_shader_info *info;
222
223 if (!pipeline->streamout_shader)
224 return;
225
226 info = &pipeline->streamout_shader->info.info;
227 for (int i = 0; i < MAX_SO_BUFFERS; i++)
228 so->stride_in_dw[i] = info->so.strides[i];
229
230 so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
231 }
232
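/**
 * The MEC (micro-engine compute) executes the compute rings on GFX7 and
 * later and accepts a slightly different packet set than the graphics ME,
 * which is why several emit paths below check this helper.
 */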
233 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
234 {
235 return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
236 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
237 }
238
239 enum ring_type radv_queue_family_to_ring(int f) {
240 switch (f) {
241 case RADV_QUEUE_GENERAL:
242 return RING_GFX;
243 case RADV_QUEUE_COMPUTE:
244 return RING_COMPUTE;
245 case RADV_QUEUE_TRANSFER:
246 return RING_DMA;
247 default:
248 unreachable("Unknown queue family");
249 }
250 }
251
252 static VkResult radv_create_cmd_buffer(
253 struct radv_device * device,
254 struct radv_cmd_pool * pool,
255 VkCommandBufferLevel level,
256 VkCommandBuffer* pCommandBuffer)
257 {
258 struct radv_cmd_buffer *cmd_buffer;
259 unsigned ring;
260 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
261 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
262 if (cmd_buffer == NULL)
263 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
264
265 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
266 cmd_buffer->device = device;
267 cmd_buffer->pool = pool;
268 cmd_buffer->level = level;
269
270 if (pool) {
271 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
272 cmd_buffer->queue_family_index = pool->queue_family_index;
273
274 } else {
275 /* Init the pool_link so we can safely call list_del when we destroy
276 * the command buffer
277 */
278 list_inithead(&cmd_buffer->pool_link);
279 cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
280 }
281
282 ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
283
284 cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
285 if (!cmd_buffer->cs) {
286 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
287 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
288 }
289
290 *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
291
292 list_inithead(&cmd_buffer->upload.list);
293
294 return VK_SUCCESS;
295 }
296
297 static void
298 radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
299 {
300 list_del(&cmd_buffer->pool_link);
301
302 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
303 &cmd_buffer->upload.list, list) {
304 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
305 list_del(&up->list);
306 free(up);
307 }
308
309 if (cmd_buffer->upload.upload_bo)
310 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
311 cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
312
313 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
314 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
315
316 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
317 }
318
319 static VkResult
320 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
321 {
322 cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
323
324 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
325 &cmd_buffer->upload.list, list) {
326 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
327 list_del(&up->list);
328 free(up);
329 }
330
331 cmd_buffer->push_constant_stages = 0;
332 cmd_buffer->scratch_size_needed = 0;
333 cmd_buffer->compute_scratch_size_needed = 0;
334 cmd_buffer->esgs_ring_size_needed = 0;
335 cmd_buffer->gsvs_ring_size_needed = 0;
336 cmd_buffer->tess_rings_needed = false;
337 cmd_buffer->sample_positions_needed = false;
338
339 if (cmd_buffer->upload.upload_bo)
340 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
341 cmd_buffer->upload.upload_bo);
342 cmd_buffer->upload.offset = 0;
343
344 cmd_buffer->record_result = VK_SUCCESS;
345
346 memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
347
348 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
349 cmd_buffer->descriptors[i].dirty = 0;
350 cmd_buffer->descriptors[i].valid = 0;
351 cmd_buffer->descriptors[i].push_dirty = false;
352 }
353
354 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
355 cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
356 unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
357 unsigned fence_offset, eop_bug_offset;
358 void *fence_ptr;
359
360 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
361 &fence_ptr);
362
363 cmd_buffer->gfx9_fence_va =
364 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
365 cmd_buffer->gfx9_fence_va += fence_offset;
366
367 /* Allocate a buffer for the EOP bug on GFX9. */
368 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
369 &eop_bug_offset, &fence_ptr);
370 cmd_buffer->gfx9_eop_bug_va =
371 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
372 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
373 }
374
375 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
376
377 return cmd_buffer->record_result;
378 }
379
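/**
 * Grow the upload buffer geometrically: at least 16 KiB, and at least twice
 * the previous size. The old BO is kept on upload.list because the GPU may
 * still reference it; it is only released on reset/destroy.
 */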
380 static bool
381 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
382 uint64_t min_needed)
383 {
384 uint64_t new_size;
385 struct radeon_winsys_bo *bo;
386 struct radv_cmd_buffer_upload *upload;
387 struct radv_device *device = cmd_buffer->device;
388
389 new_size = MAX2(min_needed, 16 * 1024);
390 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
391
392 bo = device->ws->buffer_create(device->ws,
393 new_size, 4096,
394 RADEON_DOMAIN_GTT,
395 RADEON_FLAG_CPU_ACCESS|
396 RADEON_FLAG_NO_INTERPROCESS_SHARING |
397 RADEON_FLAG_32BIT,
398 RADV_BO_PRIORITY_UPLOAD_BUFFER);
399
400 if (!bo) {
401 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
402 return false;
403 }
404
405 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
406 if (cmd_buffer->upload.upload_bo) {
407 upload = malloc(sizeof(*upload));
408
409 if (!upload) {
410 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
411 device->ws->buffer_destroy(bo);
412 return false;
413 }
414
415 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
416 list_add(&upload->list, &cmd_buffer->upload.list);
417 }
418
419 cmd_buffer->upload.upload_bo = bo;
420 cmd_buffer->upload.size = new_size;
421 cmd_buffer->upload.offset = 0;
422 cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
423
424 if (!cmd_buffer->upload.map) {
425 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
426 return false;
427 }
428
429 return true;
430 }
431
432 bool
433 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
434 unsigned size,
435 unsigned alignment,
436 unsigned *out_offset,
437 void **ptr)
438 {
439 assert(util_is_power_of_two_nonzero(alignment));
440
441 uint64_t offset = align(cmd_buffer->upload.offset, alignment);
442 if (offset + size > cmd_buffer->upload.size) {
443 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
444 return false;
445 offset = 0;
446 }
447
448 *out_offset = offset;
449 *ptr = cmd_buffer->upload.map + offset;
450
451 cmd_buffer->upload.offset = offset + size;
452 return true;
453 }
454
455 bool
456 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
457 unsigned size, unsigned alignment,
458 const void *data, unsigned *out_offset)
459 {
460 uint8_t *ptr;
461
462 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
463 out_offset, (void **)&ptr))
464 return false;
465
466 if (ptr)
467 memcpy(ptr, data, size);
468
469 return true;
470 }
471
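/**
 * A WRITE_DATA packet consists of the header, one control dword
 * (DST_SEL/WR_CONFIRM/ENGINE_SEL), the 64-bit destination VA (low, high)
 * and then the payload, hence the "4 + count" dwords reserved here.
 */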
472 static void
473 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
474 unsigned count, const uint32_t *data)
475 {
476 struct radeon_cmdbuf *cs = cmd_buffer->cs;
477
478 radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
479
480 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
481 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
482 S_370_WR_CONFIRM(1) |
483 S_370_ENGINE_SEL(V_370_ME));
484 radeon_emit(cs, va);
485 radeon_emit(cs, va >> 32);
486 radeon_emit_array(cs, data, count);
487 }
488
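/* Trace support (active when the device allocated a trace BO, e.g. via
 * RADV_TRACE_FILE): write an incrementing trace id to the trace BO and also
 * encode it into a NOP, so after a hang the last id that reached memory
 * identifies the last command that completed.
 */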
489 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
490 {
491 struct radv_device *device = cmd_buffer->device;
492 struct radeon_cmdbuf *cs = cmd_buffer->cs;
493 uint64_t va;
494
495 va = radv_buffer_get_va(device->trace_bo);
496 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
497 va += 4;
498
499 ++cmd_buffer->state.trace_id;
500 radv_emit_write_data_packet(cmd_buffer, va, 1,
501 &cmd_buffer->state.trace_id);
502
503 radeon_check_space(cmd_buffer->device->ws, cs, 2);
504
505 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
506 radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
507 }
508
509 static void
510 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
511 enum radv_cmd_flush_bits flags)
512 {
513 if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
514 assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
515 RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
516
517 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
518
519 /* Force wait for graphics or compute engines to be idle. */
520 si_cs_emit_cache_flush(cmd_buffer->cs,
521 cmd_buffer->device->physical_device->rad_info.chip_class,
522 &cmd_buffer->gfx9_fence_idx,
523 cmd_buffer->gfx9_fence_va,
524 radv_cmd_buffer_uses_mec(cmd_buffer),
525 flags, cmd_buffer->gfx9_eop_bug_va);
526 }
527
528 if (unlikely(cmd_buffer->device->trace_bo))
529 radv_cmd_buffer_trace_emit(cmd_buffer);
530 }
531
532 static void
533 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
534 struct radv_pipeline *pipeline, enum ring_type ring)
535 {
536 struct radv_device *device = cmd_buffer->device;
537 uint32_t data[2];
538 uint64_t va;
539
540 va = radv_buffer_get_va(device->trace_bo);
541
542 switch (ring) {
543 case RING_GFX:
544 va += 8;
545 break;
546 case RING_COMPUTE:
547 va += 16;
548 break;
549 default:
550 assert(!"invalid ring type");
551 }
552
553 data[0] = (uintptr_t)pipeline;
554 data[1] = (uintptr_t)pipeline >> 32;
555
556 radv_emit_write_data_packet(cmd_buffer, va, 2, data);
557 }
558
559 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
560 VkPipelineBindPoint bind_point,
561 struct radv_descriptor_set *set,
562 unsigned idx)
563 {
564 struct radv_descriptor_state *descriptors_state =
565 radv_get_descriptors_state(cmd_buffer, bind_point);
566
567 descriptors_state->sets[idx] = set;
568
569 descriptors_state->valid |= (1u << idx); /* active descriptors */
570 descriptors_state->dirty |= (1u << idx);
571 }
572
573 static void
574 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
575 VkPipelineBindPoint bind_point)
576 {
577 struct radv_descriptor_state *descriptors_state =
578 radv_get_descriptors_state(cmd_buffer, bind_point);
579 struct radv_device *device = cmd_buffer->device;
580 uint32_t data[MAX_SETS * 2] = {};
581 uint64_t va;
582 unsigned i;
583 va = radv_buffer_get_va(device->trace_bo) + 24;
584
585 for_each_bit(i, descriptors_state->valid) {
586 struct radv_descriptor_set *set = descriptors_state->sets[i];
587 data[i * 2] = (uint64_t)(uintptr_t)set;
588 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
589 }
590
591 radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
592 }
593
594 struct radv_userdata_info *
595 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
596 gl_shader_stage stage,
597 int idx)
598 {
599 struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
600 return &shader->info.user_sgprs_locs.shader_data[idx];
601 }
602
603 static void
604 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
605 struct radv_pipeline *pipeline,
606 gl_shader_stage stage,
607 int idx, uint64_t va)
608 {
609 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
610 uint32_t base_reg = pipeline->user_data_0[stage];
611 if (loc->sgpr_idx == -1)
612 return;
613
614 assert(loc->num_sgprs == 1);
615
616 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
617 base_reg + loc->sgpr_idx * 4, va, false);
618 }
619
620 static void
621 radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
622 struct radv_pipeline *pipeline,
623 struct radv_descriptor_state *descriptors_state,
624 gl_shader_stage stage)
625 {
626 struct radv_device *device = cmd_buffer->device;
627 struct radeon_cmdbuf *cs = cmd_buffer->cs;
628 uint32_t sh_base = pipeline->user_data_0[stage];
629 struct radv_userdata_locations *locs =
630 &pipeline->shaders[stage]->info.user_sgprs_locs;
631 unsigned mask = locs->descriptor_sets_enabled;
632
633 mask &= descriptors_state->dirty & descriptors_state->valid;
634
635 while (mask) {
636 int start, count;
637
638 u_bit_scan_consecutive_range(&mask, &start, &count);
639
640 struct radv_userdata_info *loc = &locs->descriptor_sets[start];
641 unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
642
643 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
644 for (int i = 0; i < count; i++) {
645 struct radv_descriptor_set *set =
646 descriptors_state->sets[start + i];
647
648 radv_emit_shader_pointer_body(device, cs, set->va, true);
649 }
650 }
651 }
652
653 /**
654 * Convert the user sample locations to hardware sample locations (the values
655 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
656 */
657 static void
658 radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
659 uint32_t x, uint32_t y, VkOffset2D *sample_locs)
660 {
661 uint32_t x_offset = x % state->grid_size.width;
662 uint32_t y_offset = y % state->grid_size.height;
663 uint32_t num_samples = (uint32_t)state->per_pixel;
664 VkSampleLocationEXT *user_locs;
665 uint32_t pixel_offset;
666
667 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
668
669 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
670 user_locs = &state->locations[pixel_offset];
671
672 for (uint32_t i = 0; i < num_samples; i++) {
673 float shifted_pos_x = user_locs[i].x - 0.5;
674 float shifted_pos_y = user_locs[i].y - 0.5;
675
676 int32_t scaled_pos_x = floor(shifted_pos_x * 16);
677 int32_t scaled_pos_y = floor(shifted_pos_y * 16);
678
679 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
680 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
681 }
682 }
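/* Example: a user location of (0.5, 0.5) is the pixel center and maps to
 * the hardware position (0, 0); (0.0, 0.0) maps to (-8, -8). Positions are
 * expressed in 1/16th-pixel units and clamped to [-8, 7].
 */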
683
684 /**
685 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
686 * locations.
687 */
688 static void
689 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
690 uint32_t *sample_locs_pixel)
691 {
692 for (uint32_t i = 0; i < num_samples; i++) {
693 uint32_t sample_reg_idx = i / 4;
694 uint32_t sample_loc_idx = i % 4;
695 int32_t pos_x = sample_locs[i].x;
696 int32_t pos_y = sample_locs[i].y;
697
698 uint32_t shift_x = 8 * sample_loc_idx;
699 uint32_t shift_y = shift_x + 4;
700
701 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
702 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
703 }
704 }
705
706 /**
707 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
708 * sample locations.
709 */
710 static uint64_t
711 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer,
712 VkOffset2D *sample_locs,
713 uint32_t num_samples)
714 {
715 uint32_t centroid_priorities[num_samples];
716 uint32_t sample_mask = num_samples - 1;
717 uint32_t distances[num_samples];
718 uint64_t centroid_priority = 0;
719
720 /* Compute the distances from center for each sample. */
721 for (int i = 0; i < num_samples; i++) {
722 distances[i] = (sample_locs[i].x * sample_locs[i].x) +
723 (sample_locs[i].y * sample_locs[i].y);
724 }
725
726 /* Compute the centroid priorities by looking at the distances array. */
727 for (int i = 0; i < num_samples; i++) {
728 uint32_t min_idx = 0;
729
730 for (int j = 1; j < num_samples; j++) {
731 if (distances[j] < distances[min_idx])
732 min_idx = j;
733 }
734
735 centroid_priorities[i] = min_idx;
736 distances[min_idx] = 0xffffffff;
737 }
738
739 /* Compute the final centroid priority. */
740 for (int i = 0; i < 8; i++) {
741 centroid_priority |=
742 centroid_priorities[i & sample_mask] << (i * 4);
743 }
744
745 return centroid_priority << 32 | centroid_priority;
746 }
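/* The resulting value stores, from closest to farthest from the pixel
 * center, the index of each sample in consecutive 4-bit fields; with fewer
 * than 8 samples the pattern repeats, and the low 32 bits are mirrored into
 * the high half for PA_SC_CENTROID_PRIORITY_1.
 */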
747
748 /**
749 * Emit the sample locations that are specified with VK_EXT_sample_locations.
750 */
751 static void
752 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
753 {
754 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
755 struct radv_multisample_state *ms = &pipeline->graphics.ms;
756 struct radv_sample_locations_state *sample_location =
757 &cmd_buffer->state.dynamic.sample_location;
758 uint32_t num_samples = (uint32_t)sample_location->per_pixel;
759 struct radeon_cmdbuf *cs = cmd_buffer->cs;
760 uint32_t sample_locs_pixel[4][2] = {};
761 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
762 uint32_t max_sample_dist = 0;
763 uint64_t centroid_priority;
764
765 if (!cmd_buffer->state.dynamic.sample_location.count)
766 return;
767
768 /* Convert the user sample locations to hardware sample locations. */
769 radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
770 radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
771 radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
772 radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
773
774 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
775 for (uint32_t i = 0; i < 4; i++) {
776 radv_compute_sample_locs_pixel(num_samples, sample_locs[i],
777 sample_locs_pixel[i]);
778 }
779
780 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
781 centroid_priority =
782 radv_compute_centroid_priority(cmd_buffer, sample_locs[0],
783 num_samples);
784
785 /* Compute the maximum sample distance from the specified locations. */
786 for (uint32_t i = 0; i < num_samples; i++) {
787 VkOffset2D offset = sample_locs[0][i];
788 max_sample_dist = MAX2(max_sample_dist,
789 MAX2(abs(offset.x), abs(offset.y)));
790 }
791
792 /* Emit the specified user sample locations. */
793 switch (num_samples) {
794 case 2:
795 case 4:
796 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
797 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
798 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
799 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
800 break;
801 case 8:
802 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
803 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
804 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
805 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
806 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
807 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
808 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
809 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
810 break;
811 default:
812 unreachable("invalid number of samples");
813 }
814
815 /* Emit the maximum sample distance and the centroid priority. */
816 uint32_t pa_sc_aa_config = ms->pa_sc_aa_config;
817
818 pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST;
819 pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist);
820
821 radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1);
822 radeon_emit(cs, pa_sc_aa_config);
823
824 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
825 radeon_emit(cs, centroid_priority);
826 radeon_emit(cs, centroid_priority >> 32);
827
828 /* GFX9: Flush DFSM when the AA mode changes. */
829 if (cmd_buffer->device->dfsm_allowed) {
830 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
831 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
832 }
833
834 cmd_buffer->state.context_roll_without_scissor_emitted = true;
835 }
836
837 static void
838 radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
839 struct radv_pipeline *pipeline,
840 gl_shader_stage stage,
841 int idx, int count, uint32_t *values)
842 {
843 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
844 uint32_t base_reg = pipeline->user_data_0[stage];
845 if (loc->sgpr_idx == -1)
846 return;
847
848 assert(loc->num_sgprs == count);
849
850 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
851 radeon_emit_array(cmd_buffer->cs, values, count);
852 }
853
854 static void
855 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
856 struct radv_pipeline *pipeline)
857 {
858 int num_samples = pipeline->graphics.ms.num_samples;
859 struct radv_multisample_state *ms = &pipeline->graphics.ms;
860 struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
861
862 if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions)
863 cmd_buffer->sample_positions_needed = true;
864
865 if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
866 return;
867
868 radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
869 radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
870 radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);
871
872 radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);
873
874 radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
875
876 /* GFX9: Flush DFSM when the AA mode changes. */
877 if (cmd_buffer->device->dfsm_allowed) {
878 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
879 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
880 }
881
882 cmd_buffer->state.context_roll_without_scissor_emitted = true;
883 }
884
885 static void
886 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
887 struct radv_shader_variant *shader)
888 {
889 uint64_t va;
890
891 if (!shader)
892 return;
893
894 va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
895
896 si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
897 }
898
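/**
 * Prefetch shader binaries and the vertex buffer descriptors into L2. When
 * vertex_stage_only is set, only the VS binary and the VBO descriptors are
 * prefetched so the draw can be started as early as possible; the rest of
 * the mask is left set and handled by a later call.
 */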
899 static void
900 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
901 struct radv_pipeline *pipeline,
902 bool vertex_stage_only)
903 {
904 struct radv_cmd_state *state = &cmd_buffer->state;
905 uint32_t mask = state->prefetch_L2_mask;
906
907 if (vertex_stage_only) {
908 /* Fast prefetch path for starting draws as soon as possible.
909 */
910 mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
911 RADV_PREFETCH_VBO_DESCRIPTORS);
912 }
913
914 if (mask & RADV_PREFETCH_VS)
915 radv_emit_shader_prefetch(cmd_buffer,
916 pipeline->shaders[MESA_SHADER_VERTEX]);
917
918 if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
919 si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
920
921 if (mask & RADV_PREFETCH_TCS)
922 radv_emit_shader_prefetch(cmd_buffer,
923 pipeline->shaders[MESA_SHADER_TESS_CTRL]);
924
925 if (mask & RADV_PREFETCH_TES)
926 radv_emit_shader_prefetch(cmd_buffer,
927 pipeline->shaders[MESA_SHADER_TESS_EVAL]);
928
929 if (mask & RADV_PREFETCH_GS) {
930 radv_emit_shader_prefetch(cmd_buffer,
931 pipeline->shaders[MESA_SHADER_GEOMETRY]);
932 radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
933 }
934
935 if (mask & RADV_PREFETCH_PS)
936 radv_emit_shader_prefetch(cmd_buffer,
937 pipeline->shaders[MESA_SHADER_FRAGMENT]);
938
939 state->prefetch_L2_mask &= ~mask;
940 }
941
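/**
 * RB+ (rbplus): program SX_PS_DOWNCONVERT / SX_BLEND_OPT_EPSILON /
 * SX_BLEND_OPT_CONTROL so that color exports are down-converted to the
 * bound attachment formats and blend optimizations are disabled for
 * channels that don't contribute. Each register holds one 4-bit field
 * per MRT.
 */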
942 static void
943 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
944 {
945 if (!cmd_buffer->device->physical_device->rbplus_allowed)
946 return;
947
948 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
949 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
950 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
951
952 unsigned sx_ps_downconvert = 0;
953 unsigned sx_blend_opt_epsilon = 0;
954 unsigned sx_blend_opt_control = 0;
955
956 for (unsigned i = 0; i < subpass->color_count; ++i) {
957 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
958 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
959 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
960 continue;
961 }
962
963 int idx = subpass->color_attachments[i].attachment;
964 struct radv_color_buffer_info *cb = &framebuffer->attachments[idx].cb;
965
966 unsigned format = G_028C70_FORMAT(cb->cb_color_info);
967 unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
968 uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
969 uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
970
971 bool has_alpha, has_rgb;
972
973 /* Set if RGB and A are present. */
974 has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
975
976 if (format == V_028C70_COLOR_8 ||
977 format == V_028C70_COLOR_16 ||
978 format == V_028C70_COLOR_32)
979 has_rgb = !has_alpha;
980 else
981 has_rgb = true;
982
983 /* Check the colormask and export format. */
984 if (!(colormask & 0x7))
985 has_rgb = false;
986 if (!(colormask & 0x8))
987 has_alpha = false;
988
989 if (spi_format == V_028714_SPI_SHADER_ZERO) {
990 has_rgb = false;
991 has_alpha = false;
992 }
993
994 /* Disable value checking for disabled channels. */
995 if (!has_rgb)
996 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
997 if (!has_alpha)
998 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
999
1000 /* Enable down-conversion for 32bpp and smaller formats. */
1001 switch (format) {
1002 case V_028C70_COLOR_8:
1003 case V_028C70_COLOR_8_8:
1004 case V_028C70_COLOR_8_8_8_8:
1005 /* For 1 and 2-channel formats, use the superset thereof. */
1006 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1007 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1008 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1009 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1010 sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1011 }
1012 break;
1013
1014 case V_028C70_COLOR_5_6_5:
1015 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1016 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1017 sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1018 }
1019 break;
1020
1021 case V_028C70_COLOR_1_5_5_5:
1022 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1023 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1024 sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1025 }
1026 break;
1027
1028 case V_028C70_COLOR_4_4_4_4:
1029 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1030 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1031 sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1032 }
1033 break;
1034
1035 case V_028C70_COLOR_32:
1036 if (swap == V_028C70_SWAP_STD &&
1037 spi_format == V_028714_SPI_SHADER_32_R)
1038 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1039 else if (swap == V_028C70_SWAP_ALT_REV &&
1040 spi_format == V_028714_SPI_SHADER_32_AR)
1041 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1042 break;
1043
1044 case V_028C70_COLOR_16:
1045 case V_028C70_COLOR_16_16:
1046 /* For 1-channel formats, use the superset thereof. */
1047 if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1048 spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1049 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1050 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1051 if (swap == V_028C70_SWAP_STD ||
1052 swap == V_028C70_SWAP_STD_REV)
1053 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1054 else
1055 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1056 }
1057 break;
1058
1059 case V_028C70_COLOR_10_11_11:
1060 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1061 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1062 sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
1063 }
1064 break;
1065
1066 case V_028C70_COLOR_2_10_10_10:
1067 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1068 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1069 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1070 }
1071 break;
1072 }
1073 }
1074
1075 for (unsigned i = subpass->color_count; i < 8; ++i) {
1076 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1077 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1078 }
1079 /* TODO: avoid redundantly setting context registers */
1080 radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1081 radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1082 radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1083 radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1084
1085 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1086 }
1087
1088 static void
1089 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1090 {
1091 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1092
1093 if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1094 return;
1095
1096 radv_update_multisample_state(cmd_buffer, pipeline);
1097
1098 cmd_buffer->scratch_size_needed =
1099 MAX2(cmd_buffer->scratch_size_needed,
1100 pipeline->max_waves * pipeline->scratch_bytes_per_wave);
1101
1102 if (!cmd_buffer->state.emitted_pipeline ||
1103 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1104 pipeline->graphics.can_use_guardband)
1105 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1106
1107 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1108
1109 if (!cmd_buffer->state.emitted_pipeline ||
1110 cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1111 cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1112 memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
1113 pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
1114 radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1115 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1116 }
1117
1118 for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1119 if (!pipeline->shaders[i])
1120 continue;
1121
1122 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1123 pipeline->shaders[i]->bo);
1124 }
1125
1126 if (radv_pipeline_has_gs(pipeline))
1127 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1128 pipeline->gs_copy_shader->bo);
1129
1130 if (unlikely(cmd_buffer->device->trace_bo))
1131 radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);
1132
1133 cmd_buffer->state.emitted_pipeline = pipeline;
1134
1135 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1136 }
1137
1138 static void
1139 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1140 {
1141 si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
1142 cmd_buffer->state.dynamic.viewport.viewports);
1143 }
1144
1145 static void
1146 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1147 {
1148 uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1149
1150 si_write_scissors(cmd_buffer->cs, 0, count,
1151 cmd_buffer->state.dynamic.scissor.scissors,
1152 cmd_buffer->state.dynamic.viewport.viewports,
1153 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1154
1155 cmd_buffer->state.context_roll_without_scissor_emitted = false;
1156 }
1157
1158 static void
1159 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1160 {
1161 if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1162 return;
1163
1164 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1165 cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1166 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1167 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1168 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1169 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1170 S_028214_BR_Y(rect.offset.y + rect.extent.height));
1171 }
1172 }
1173
1174 static void
1175 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1176 {
1177 unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1178
1179 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1180 S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
1181 }
1182
1183 static void
1184 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1185 {
1186 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1187
1188 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1189 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1190 }
1191
1192 static void
1193 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1194 {
1195 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1196
1197 radeon_set_context_reg_seq(cmd_buffer->cs,
1198 R_028430_DB_STENCILREFMASK, 2);
1199 radeon_emit(cmd_buffer->cs,
1200 S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1201 S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1202 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1203 S_028430_STENCILOPVAL(1));
1204 radeon_emit(cmd_buffer->cs,
1205 S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1206 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1207 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1208 S_028434_STENCILOPVAL_BF(1));
1209 }
1210
1211 static void
1212 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1213 {
1214 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1215
1216 radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
1217 fui(d->depth_bounds.min));
1218 radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
1219 fui(d->depth_bounds.max));
1220 }
1221
1222 static void
1223 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1224 {
1225 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1226 unsigned slope = fui(d->depth_bias.slope * 16.0f);
1227 unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
1228
1229
1230 radeon_set_context_reg_seq(cmd_buffer->cs,
1231 R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1232 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1233 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1234 radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
1235 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1236 radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
1237 }
1238
1239 static void
1240 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
1241 int index,
1242 struct radv_attachment_info *att,
1243 struct radv_image_view *iview,
1244 VkImageLayout layout)
1245 {
1246 bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1247 struct radv_color_buffer_info *cb = &att->cb;
1248 uint32_t cb_color_info = cb->cb_color_info;
1249 struct radv_image *image = iview->image;
1250
1251 if (!radv_layout_dcc_compressed(image, layout,
1252 radv_image_queue_family_mask(image,
1253 cmd_buffer->queue_family_index,
1254 cmd_buffer->queue_family_index))) {
1255 cb_color_info &= C_028C70_DCC_ENABLE;
1256 }
1257
1258 if (radv_image_is_tc_compat_cmask(image) &&
1259 (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1260 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1261 /* If this bit is set, the FMASK decompression operation
1262 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1263 */
1264 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1265 }
1266
1267 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1268 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1269 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1270 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1271 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1272 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1273 radeon_emit(cmd_buffer->cs, cb_color_info);
1274 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1275 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1276 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1277 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1278 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1279 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1280
1281 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1282 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1283 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1284
1285 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1286 cb->cb_mrt_epitch);
1287 } else {
1288 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1289 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1290 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1291 radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1292 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1293 radeon_emit(cmd_buffer->cs, cb_color_info);
1294 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1295 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1296 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1297 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1298 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1299 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1300
1301 if (is_vi) { /* DCC BASE */
1302 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1303 }
1304 }
1305
1306 if (radv_dcc_enabled(image, iview->base_mip)) {
1307 /* Drawing with DCC enabled also compresses colorbuffers. */
1308 VkImageSubresourceRange range = {
1309 .aspectMask = iview->aspect_mask,
1310 .baseMipLevel = iview->base_mip,
1311 .levelCount = iview->level_count,
1312 .baseArrayLayer = iview->base_layer,
1313 .layerCount = iview->layer_count,
1314 };
1315
1316 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1317 }
1318 }
1319
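/**
 * TC-compatible HTILE workaround: when the last fast depth clear wrote 0.0,
 * DB_Z_INFO.ZRANGE_PRECISION must be programmed to 0 or the clear value is
 * decoded incorrectly. The register write can be predicated (COND_EXEC) on
 * the tc_compat_zrange metadata word so it only takes effect in that case.
 */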
1320 static void
1321 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
1322 struct radv_ds_buffer_info *ds,
1323 struct radv_image *image, VkImageLayout layout,
1324 bool requires_cond_exec)
1325 {
1326 uint32_t db_z_info = ds->db_z_info;
1327 uint32_t db_z_info_reg;
1328
1329 if (!radv_image_is_tc_compat_htile(image))
1330 return;
1331
1332 if (!radv_layout_has_htile(image, layout,
1333 radv_image_queue_family_mask(image,
1334 cmd_buffer->queue_family_index,
1335 cmd_buffer->queue_family_index))) {
1336 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1337 }
1338
1339 db_z_info &= C_028040_ZRANGE_PRECISION;
1340
1341 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1342 db_z_info_reg = R_028038_DB_Z_INFO;
1343 } else {
1344 db_z_info_reg = R_028040_DB_Z_INFO;
1345 }
1346
1347 	/* When we don't know the last fast clear value, we need to emit a
1348 	 * conditional (COND_EXEC) packet that skips the following
1349 	 * SET_CONTEXT_REG packet when the tc_compat_zrange metadata is zero.
1350 	 */
1351 if (requires_cond_exec) {
1352 uint64_t va = radv_buffer_get_va(image->bo);
1353 va += image->offset + image->tc_compat_zrange_offset;
1354
1355 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1356 radeon_emit(cmd_buffer->cs, va);
1357 radeon_emit(cmd_buffer->cs, va >> 32);
1358 radeon_emit(cmd_buffer->cs, 0);
1359 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1360 }
1361
1362 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1363 }
1364
1365 static void
1366 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
1367 struct radv_ds_buffer_info *ds,
1368 struct radv_image *image,
1369 VkImageLayout layout)
1370 {
1371 uint32_t db_z_info = ds->db_z_info;
1372 uint32_t db_stencil_info = ds->db_stencil_info;
1373
1374 if (!radv_layout_has_htile(image, layout,
1375 radv_image_queue_family_mask(image,
1376 cmd_buffer->queue_family_index,
1377 cmd_buffer->queue_family_index))) {
1378 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1379 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1380 }
1381
1382 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1383 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1384
1385
1386 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1387 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1388 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1389 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1390 radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1391
1392 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1393 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
1394 radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */
1395 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1396 radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1397 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
1398 radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1399 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
1400 radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1401 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
1402 radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1403
1404 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1405 radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1406 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1407 } else {
1408 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1409
1410 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1411 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
1412 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
1413 radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */
1414 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
1415 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
1416 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
1417 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1418 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
1419 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
1420
1421 }
1422
1423 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1424 radv_update_zrange_precision(cmd_buffer, ds, image, layout, true);
1425
1426 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1427 ds->pa_su_poly_offset_db_fmt_cntl);
1428 }
1429
1430 /**
1431 * Update the fast clear depth/stencil values if the image is bound as a
1432 * depth/stencil buffer.
1433 */
1434 static void
1435 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1436 struct radv_image *image,
1437 VkClearDepthStencilValue ds_clear_value,
1438 VkImageAspectFlags aspects)
1439 {
1440 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1441 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1442 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1443 struct radv_attachment_info *att;
1444 uint32_t att_idx;
1445
1446 if (!framebuffer || !subpass)
1447 return;
1448
1449 if (!subpass->depth_stencil_attachment)
1450 return;
1451
1452 att_idx = subpass->depth_stencil_attachment->attachment;
1453 att = &framebuffer->attachments[att_idx];
1454 if (att->attachment->image != image)
1455 return;
1456
1457 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1458 radeon_emit(cs, ds_clear_value.stencil);
1459 radeon_emit(cs, fui(ds_clear_value.depth));
1460
1461 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
1462 * only needed when clearing Z to 0.0.
1463 */
1464 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1465 ds_clear_value.depth == 0.0) {
1466 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1467
1468 radv_update_zrange_precision(cmd_buffer, &att->ds, image,
1469 layout, false);
1470 }
1471
1472 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1473 }
1474
1475 /**
1476 * Set the clear depth/stencil values to the image's metadata.
1477 */
1478 static void
1479 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1480 struct radv_image *image,
1481 VkClearDepthStencilValue ds_clear_value,
1482 VkImageAspectFlags aspects)
1483 {
1484 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1485 uint64_t va = radv_buffer_get_va(image->bo);
1486 unsigned reg_offset = 0, reg_count = 0;
1487
1488 va += image->offset + image->clear_value_offset;
1489
1490 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1491 ++reg_count;
1492 } else {
1493 ++reg_offset;
1494 va += 4;
1495 }
1496 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1497 ++reg_count;
1498
1499 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating));
1500 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1501 S_370_WR_CONFIRM(1) |
1502 S_370_ENGINE_SEL(V_370_PFP));
1503 radeon_emit(cs, va);
1504 radeon_emit(cs, va >> 32);
1505 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
1506 radeon_emit(cs, ds_clear_value.stencil);
1507 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1508 radeon_emit(cs, fui(ds_clear_value.depth));
1509 }
1510
1511 /**
1512 * Update the TC-compat metadata value for this image.
1513 */
1514 static void
1515 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1516 struct radv_image *image,
1517 uint32_t value)
1518 {
1519 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1520 uint64_t va = radv_buffer_get_va(image->bo);
1521 va += image->offset + image->tc_compat_zrange_offset;
1522
1523 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
1524 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1525 S_370_WR_CONFIRM(1) |
1526 S_370_ENGINE_SEL(V_370_PFP));
1527 radeon_emit(cs, va);
1528 radeon_emit(cs, va >> 32);
1529 radeon_emit(cs, value);
1530 }
1531
1532 static void
1533 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1534 struct radv_image *image,
1535 VkClearDepthStencilValue ds_clear_value)
1536 {
1537 uint64_t va = radv_buffer_get_va(image->bo);
1538 va += image->offset + image->tc_compat_zrange_offset;
1539 uint32_t cond_val;
1540
1541 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
1542 * depth clear value is 0.0f.
1543 */
1544 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
1545
1546 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, cond_val);
1547 }
1548
1549 /**
1550 * Update the clear depth/stencil values for this image.
1551 */
1552 void
1553 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1554 struct radv_image *image,
1555 VkClearDepthStencilValue ds_clear_value,
1556 VkImageAspectFlags aspects)
1557 {
1558 assert(radv_image_has_htile(image));
1559
1560 radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects);
1561
1562 if (radv_image_is_tc_compat_htile(image) &&
1563 (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
1564 radv_update_tc_compat_zrange_metadata(cmd_buffer, image,
1565 ds_clear_value);
1566 }
1567
1568 radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value,
1569 aspects);
1570 }
1571
1572 /**
1573 * Load the clear depth/stencil values from the image's metadata.
1574 */
1575 static void
1576 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1577 struct radv_image *image)
1578 {
1579 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1580 VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
1581 uint64_t va = radv_buffer_get_va(image->bo);
1582 unsigned reg_offset = 0, reg_count = 0;
1583
1584 va += image->offset + image->clear_value_offset;
1585
1586 if (!radv_image_has_htile(image))
1587 return;
1588
1589 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1590 ++reg_count;
1591 } else {
1592 ++reg_offset;
1593 va += 4;
1594 }
1595 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1596 ++reg_count;
1597
1598 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
1599
1600 if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
1601 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
1602 radeon_emit(cs, va);
1603 radeon_emit(cs, va >> 32);
1604 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1605 radeon_emit(cs, reg_count);
1606 } else {
1607 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1608 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1609 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1610 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
1611 radeon_emit(cs, va);
1612 radeon_emit(cs, va >> 32);
1613 radeon_emit(cs, reg >> 2);
1614 radeon_emit(cs, 0);
1615
1616 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
1617 radeon_emit(cs, 0);
1618 }
1619 }
1620
1621 /*
1622  * With DCC, some color surfaces don't require CMASK (fast clear)
1623  * elimination before being used as a texture. This writes a predicate
1624  * value that determines whether the eliminate pass is required.
1625  */
1626 void
1627 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
1628 struct radv_image *image,
1629 const VkImageSubresourceRange *range, bool value)
1630 {
1631 uint64_t pred_val = value;
1632 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
1633 uint32_t level_count = radv_get_levelCount(image, range);
1634 uint32_t count = 2 * level_count;
1635
1636 assert(radv_dcc_enabled(image, range->baseMipLevel));
1637
1638 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1639 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1640 S_370_WR_CONFIRM(1) |
1641 S_370_ENGINE_SEL(V_370_PFP));
1642 radeon_emit(cmd_buffer->cs, va);
1643 radeon_emit(cmd_buffer->cs, va >> 32);
1644
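/* Write one 64-bit predicate value per mip level in the range. */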
1645 for (uint32_t l = 0; l < level_count; l++) {
1646 radeon_emit(cmd_buffer->cs, pred_val);
1647 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1648 }
1649 }
1650
1651 /**
1652 * Update the DCC predicate to reflect the compression state.
1653 */
1654 void
1655 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
1656 struct radv_image *image,
1657 const VkImageSubresourceRange *range, bool value)
1658 {
1659 uint64_t pred_val = value;
1660 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
1661 uint32_t level_count = radv_get_levelCount(image, range);
1662 uint32_t count = 2 * level_count;
1663
1664 assert(radv_dcc_enabled(image, range->baseMipLevel));
1665
1666 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1667 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1668 S_370_WR_CONFIRM(1) |
1669 S_370_ENGINE_SEL(V_370_PFP));
1670 radeon_emit(cmd_buffer->cs, va);
1671 radeon_emit(cmd_buffer->cs, va >> 32);
1672
1673 for (uint32_t l = 0; l < level_count; l++) {
1674 radeon_emit(cmd_buffer->cs, pred_val);
1675 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1676 }
1677 }
1678
1679 /**
1680 * Update the fast clear color values if the image is bound as a color buffer.
1681 */
1682 static void
1683 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
1684 struct radv_image *image,
1685 int cb_idx,
1686 uint32_t color_values[2])
1687 {
1688 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1689 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1690 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1691 struct radv_attachment_info *att;
1692 uint32_t att_idx;
1693
1694 if (!framebuffer || !subpass)
1695 return;
1696
1697 att_idx = subpass->color_attachments[cb_idx].attachment;
1698 if (att_idx == VK_ATTACHMENT_UNUSED)
1699 return;
1700
1701 att = &framebuffer->attachments[att_idx];
1702 if (att->attachment->image != image)
1703 return;
1704
1705 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
1706 radeon_emit(cs, color_values[0]);
1707 radeon_emit(cs, color_values[1]);
1708
1709 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1710 }
1711
1712 /**
1713 * Set the clear color values to the image's metadata.
1714 */
1715 static void
1716 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1717 struct radv_image *image,
1718 const VkImageSubresourceRange *range,
1719 uint32_t color_values[2])
1720 {
1721 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1722 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
1723 uint32_t level_count = radv_get_levelCount(image, range);
1724 uint32_t count = 2 * level_count;
1725
1726 assert(radv_image_has_cmask(image) ||
1727 radv_dcc_enabled(image, range->baseMipLevel));
1728
1729 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
1730 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1731 S_370_WR_CONFIRM(1) |
1732 S_370_ENGINE_SEL(V_370_PFP));
1733 radeon_emit(cs, va);
1734 radeon_emit(cs, va >> 32);
1735
1736 for (uint32_t l = 0; l < level_count; l++) {
1737 radeon_emit(cs, color_values[0]);
1738 radeon_emit(cs, color_values[1]);
1739 }
1740 }
1741
1742 /**
1743 * Update the clear color values for this image.
1744 */
1745 void
1746 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1747 const struct radv_image_view *iview,
1748 int cb_idx,
1749 uint32_t color_values[2])
1750 {
1751 struct radv_image *image = iview->image;
1752 VkImageSubresourceRange range = {
1753 .aspectMask = iview->aspect_mask,
1754 .baseMipLevel = iview->base_mip,
1755 .levelCount = iview->level_count,
1756 .baseArrayLayer = iview->base_layer,
1757 .layerCount = iview->layer_count,
1758 };
1759
1760 assert(radv_image_has_cmask(image) ||
1761 radv_dcc_enabled(image, iview->base_mip));
1762
1763 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
1764
1765 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
1766 color_values);
1767 }
1768
1769 /**
1770 * Load the clear color values from the image's metadata.
1771 */
1772 static void
1773 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1774 struct radv_image_view *iview,
1775 int cb_idx)
1776 {
1777 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1778 struct radv_image *image = iview->image;
1779 uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
1780
1781 if (!radv_image_has_cmask(image) &&
1782 !radv_dcc_enabled(image, iview->base_mip))
1783 return;
1784
1785 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
1786
1787 if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
1788 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
1789 radeon_emit(cs, va);
1790 radeon_emit(cs, va >> 32);
1791 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1792 radeon_emit(cs, 2);
1793 } else {
1794 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
1795 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1796 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1797 COPY_DATA_COUNT_SEL);
1798 radeon_emit(cs, va);
1799 radeon_emit(cs, va >> 32);
1800 radeon_emit(cs, reg >> 2);
1801 radeon_emit(cs, 0);
1802
1803 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
1804 radeon_emit(cs, 0);
1805 }
1806 }
1807
1808 static void
1809 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
1810 {
1811 int i;
1812 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1813 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1814 unsigned num_bpp64_colorbufs = 0;
1815
1816         /* This may happen when recording an inherited secondary command buffer. */
1817 if (!framebuffer)
1818 return;
1819
1820 for (i = 0; i < 8; ++i) {
1821 if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1822 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
1823 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
1824 continue;
1825 }
1826
1827 int idx = subpass->color_attachments[i].attachment;
1828 struct radv_attachment_info *att = &framebuffer->attachments[idx];
1829 struct radv_image_view *iview = att->attachment;
1830 struct radv_image *image = iview->image;
1831 VkImageLayout layout = subpass->color_attachments[i].layout;
1832
1833 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
1834
1835 assert(att->attachment->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
1836 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
1837 radv_emit_fb_color_state(cmd_buffer, i, att, iview, layout);
1838
1839 radv_load_color_clear_metadata(cmd_buffer, iview, i);
1840
1841 if (image->planes[0].surface.bpe >= 8)
1842 num_bpp64_colorbufs++;
1843 }
1844
1845 if (subpass->depth_stencil_attachment) {
1846 int idx = subpass->depth_stencil_attachment->attachment;
1847 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1848 struct radv_attachment_info *att = &framebuffer->attachments[idx];
1849 struct radv_image *image = att->attachment->image;
1850 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
1851 MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image,
1852 cmd_buffer->queue_family_index,
1853 cmd_buffer->queue_family_index);
1854 /* We currently don't support writing decompressed HTILE */
1855 assert(radv_layout_has_htile(image, layout, queue_mask) ==
1856 radv_layout_is_htile_compressed(image, layout, queue_mask));
1857
1858 radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout);
1859
1860 if (att->ds.offset_scale != cmd_buffer->state.offset_scale) {
1861 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1862 cmd_buffer->state.offset_scale = att->ds.offset_scale;
1863 }
1864 radv_load_ds_clear_metadata(cmd_buffer, image);
1865 } else {
1866 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
1867 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
1868 else
1869 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
1870
1871 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
1872 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
1873 }
1874 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
1875 S_028208_BR_X(framebuffer->width) |
1876 S_028208_BR_Y(framebuffer->height));
1877
1878 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
1879 bool disable_constant_encode =
1880 cmd_buffer->device->physical_device->has_dcc_constant_encode;
1881 uint8_t watermark = 4; /* Default value for GFX8. */
1882
1883 /* For optimal DCC performance. */
1884 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1885 if (num_bpp64_colorbufs >= 5) {
1886 watermark = 8;
1887 } else {
1888 watermark = 6;
1889 }
1890 }
1891
1892 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
1893 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
1894 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
1895 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
1896 }
1897
1898 if (cmd_buffer->device->dfsm_allowed) {
1899 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1900 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1901 }
1902
1903 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
1904 }
1905
1906 static void
1907 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
1908 {
1909 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1910 struct radv_cmd_state *state = &cmd_buffer->state;
1911
1912 if (state->index_type != state->last_index_type) {
1913 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1914 radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
1915 2, state->index_type);
1916 } else {
1917 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
1918 radeon_emit(cs, state->index_type);
1919 }
1920
1921 state->last_index_type = state->index_type;
1922 }
1923
1924 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
1925 radeon_emit(cs, state->index_va);
1926 radeon_emit(cs, state->index_va >> 32);
1927
1928 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
1929 radeon_emit(cs, state->max_index_count);
1930
1931 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
1932 }
1933
1934 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
1935 {
1936 bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
1937 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1938 uint32_t pa_sc_mode_cntl_1 =
1939 pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
1940 uint32_t db_count_control;
1941
1942         if (!cmd_buffer->state.active_occlusion_queries) {
1943 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1944 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
1945 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
1946 has_perfect_queries) {
1947 /* Re-enable out-of-order rasterization if the
1948                          * bound pipeline supports it and if it has
1949                          * been disabled before starting any perfect
1950                          * occlusion queries.
1951 */
1952 radeon_set_context_reg(cmd_buffer->cs,
1953 R_028A4C_PA_SC_MODE_CNTL_1,
1954 pa_sc_mode_cntl_1);
1955 }
1956 }
1957 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
1958 } else {
1959 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1960 uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
1961
1962 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1963 db_count_control =
1964 S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
1965 S_028004_SAMPLE_RATE(sample_rate) |
1966 S_028004_ZPASS_ENABLE(1) |
1967 S_028004_SLICE_EVEN_ENABLE(1) |
1968 S_028004_SLICE_ODD_ENABLE(1);
1969
1970 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
1971 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
1972 has_perfect_queries) {
1973 /* If the bound pipeline has enabled
1974 * out-of-order rasterization, we should
1975 * disable it before starting any perfect
1976 * occlusion queries.
1977 */
1978 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
1979
1980 radeon_set_context_reg(cmd_buffer->cs,
1981 R_028A4C_PA_SC_MODE_CNTL_1,
1982 pa_sc_mode_cntl_1);
1983 }
1984 } else {
1985 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
1986 S_028004_SAMPLE_RATE(sample_rate);
1987 }
1988 }
1989
1990 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
1991
1992 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1993 }
1994
1995 static void
1996 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
1997 {
1998 uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
1999
2000 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2001 radv_emit_viewport(cmd_buffer);
2002
2003 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
2004 !cmd_buffer->device->physical_device->has_scissor_bug)
2005 radv_emit_scissor(cmd_buffer);
2006
2007 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
2008 radv_emit_line_width(cmd_buffer);
2009
2010 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
2011 radv_emit_blend_constants(cmd_buffer);
2012
2013 if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2014 RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2015 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
2016 radv_emit_stencil(cmd_buffer);
2017
2018 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
2019 radv_emit_depth_bounds(cmd_buffer);
2020
2021 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
2022 radv_emit_depth_bias(cmd_buffer);
2023
2024 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
2025 radv_emit_discard_rectangle(cmd_buffer);
2026
2027 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
2028 radv_emit_sample_locations(cmd_buffer);
2029
2030 cmd_buffer->state.dirty &= ~states;
2031 }
2032
2033 static void
2034 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
2035 VkPipelineBindPoint bind_point)
2036 {
2037 struct radv_descriptor_state *descriptors_state =
2038 radv_get_descriptors_state(cmd_buffer, bind_point);
2039 struct radv_descriptor_set *set = &descriptors_state->push_set.set;
2040 unsigned bo_offset;
2041
2042 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
2043 set->mapped_ptr,
2044 &bo_offset))
2045 return;
2046
2047 set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2048 set->va += bo_offset;
2049 }
2050
2051 static void
2052 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
2053 VkPipelineBindPoint bind_point)
2054 {
2055 struct radv_descriptor_state *descriptors_state =
2056 radv_get_descriptors_state(cmd_buffer, bind_point);
2057 uint32_t size = MAX_SETS * 4;
2058 uint32_t offset;
2059 void *ptr;
2060
2061 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
2062 256, &offset, &ptr))
2063 return;
2064
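/* Build a table with one 32-bit entry per descriptor set slot: the low
 * dword of each bound set's VA (unbound slots are left as zero).
 */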
2065 for (unsigned i = 0; i < MAX_SETS; i++) {
2066 uint32_t *uptr = ((uint32_t *)ptr) + i;
2067 uint64_t set_va = 0;
2068 struct radv_descriptor_set *set = descriptors_state->sets[i];
2069 if (descriptors_state->valid & (1u << i))
2070 set_va = set->va;
2071 uptr[0] = set_va & 0xffffffff;
2072 }
2073
2074 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2075 va += offset;
2076
2077 if (cmd_buffer->state.pipeline) {
2078 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
2079 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2080 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2081
2082 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
2083 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
2084 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2085
2086 if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
2087 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
2088 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2089
2090 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2091 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
2092 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2093
2094 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2095 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
2096 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2097 }
2098
2099 if (cmd_buffer->state.compute_pipeline)
2100 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
2101 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2102 }
2103
2104 static void
2105 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
2106 VkShaderStageFlags stages)
2107 {
2108 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2109 VK_PIPELINE_BIND_POINT_COMPUTE :
2110 VK_PIPELINE_BIND_POINT_GRAPHICS;
2111 struct radv_descriptor_state *descriptors_state =
2112 radv_get_descriptors_state(cmd_buffer, bind_point);
2113 struct radv_cmd_state *state = &cmd_buffer->state;
2114 bool flush_indirect_descriptors;
2115
2116 if (!descriptors_state->dirty)
2117 return;
2118
2119 if (descriptors_state->push_dirty)
2120 radv_flush_push_descriptors(cmd_buffer, bind_point);
2121
2122 flush_indirect_descriptors =
2123 (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
2124 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
2125 (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
2126 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);
2127
2128 if (flush_indirect_descriptors)
2129 radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
2130
2131 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
2132 cmd_buffer->cs,
2133 MAX_SETS * MESA_SHADER_STAGES * 4);
2134
2135 if (cmd_buffer->state.pipeline) {
2136 radv_foreach_stage(stage, stages) {
2137 if (!cmd_buffer->state.pipeline->shaders[stage])
2138 continue;
2139
2140 radv_emit_descriptor_pointers(cmd_buffer,
2141 cmd_buffer->state.pipeline,
2142 descriptors_state, stage);
2143 }
2144 }
2145
2146 if (cmd_buffer->state.compute_pipeline &&
2147 (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
2148 radv_emit_descriptor_pointers(cmd_buffer,
2149 cmd_buffer->state.compute_pipeline,
2150 descriptors_state,
2151 MESA_SHADER_COMPUTE);
2152 }
2153
2154 descriptors_state->dirty = 0;
2155 descriptors_state->push_dirty = false;
2156
2157 assert(cmd_buffer->cs->cdw <= cdw_max);
2158
2159 if (unlikely(cmd_buffer->device->trace_bo))
2160 radv_save_descriptors(cmd_buffer, bind_point);
2161 }
2162
2163 static void
2164 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
2165 VkShaderStageFlags stages)
2166 {
2167 struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
2168 ? cmd_buffer->state.compute_pipeline
2169 : cmd_buffer->state.pipeline;
2170 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2171 VK_PIPELINE_BIND_POINT_COMPUTE :
2172 VK_PIPELINE_BIND_POINT_GRAPHICS;
2173 struct radv_descriptor_state *descriptors_state =
2174 radv_get_descriptors_state(cmd_buffer, bind_point);
2175 struct radv_pipeline_layout *layout = pipeline->layout;
2176 struct radv_shader_variant *shader, *prev_shader;
2177 bool need_push_constants = false;
2178 unsigned offset;
2179 void *ptr;
2180 uint64_t va;
2181
2182 stages &= cmd_buffer->push_constant_stages;
2183 if (!stages ||
2184 (!layout->push_constant_size && !layout->dynamic_offset_count))
2185 return;
2186
2187 radv_foreach_stage(stage, stages) {
2188 if (!pipeline->shaders[stage])
2189 continue;
2190
2191 need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
2192 need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
2193
2194 uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
2195 uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;
2196
2197 radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
2198 AC_UD_INLINE_PUSH_CONSTANTS,
2199 count,
2200 (uint32_t *)&cmd_buffer->push_constants[base * 4]);
2201 }
2202
2203 if (need_push_constants) {
2204 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
2205 16 * layout->dynamic_offset_count,
2206 256, &offset, &ptr))
2207 return;
2208
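/* Upload layout: the raw push constants first, followed by one 16-byte
 * buffer descriptor per dynamic offset.
 */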
2209 memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
2210 memcpy((char*)ptr + layout->push_constant_size,
2211 descriptors_state->dynamic_buffers,
2212 16 * layout->dynamic_offset_count);
2213
2214 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2215 va += offset;
2216
2217 MAYBE_UNUSED unsigned cdw_max =
2218 radeon_check_space(cmd_buffer->device->ws,
2219 cmd_buffer->cs, MESA_SHADER_STAGES * 4);
2220
2221 prev_shader = NULL;
2222 radv_foreach_stage(stage, stages) {
2223 shader = radv_get_shader(pipeline, stage);
2224
2225 /* Avoid redundantly emitting the address for merged stages. */
2226 if (shader && shader != prev_shader) {
2227 radv_emit_userdata_address(cmd_buffer, pipeline, stage,
2228 AC_UD_PUSH_CONSTANTS, va);
2229
2230 prev_shader = shader;
2231 }
2232 }
2233 assert(cmd_buffer->cs->cdw <= cdw_max);
2234 }
2235
2236 cmd_buffer->push_constant_stages &= ~stages;
2237 }
2238
2239 static void
2240 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
2241 bool pipeline_is_dirty)
2242 {
2243 if ((pipeline_is_dirty ||
2244 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
2245 cmd_buffer->state.pipeline->num_vertex_bindings &&
2246 radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
2247 struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
2248 unsigned vb_offset;
2249 void *vb_ptr;
2250 uint32_t i = 0;
2251 uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
2252 uint64_t va;
2253
2254                 /* Allocate some descriptor state for vertex buffers. */
2255 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
2256 &vb_offset, &vb_ptr))
2257 return;
2258
2259 for (i = 0; i < count; i++) {
2260 uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
2261 uint32_t offset;
2262 struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
2263 uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
2264
2265 if (!buffer)
2266 continue;
2267
2268 va = radv_buffer_get_va(buffer->bo);
2269
2270 offset = cmd_buffer->vertex_bindings[i].offset;
2271 va += offset + buffer->offset;
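/* Build the vertex buffer descriptor by hand: 48-bit address plus stride,
 * NUM_RECORDS (an element count on GFX7 and older when a stride is set,
 * the byte size otherwise), and a 32-bit UINT fetch format.
 */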
2272 desc[0] = va;
2273 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
2274 if (cmd_buffer->device->physical_device->rad_info.chip_class <= GFX7 && stride)
2275 desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
2276 else
2277 desc[2] = buffer->size - offset;
2278 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2279 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2280 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2281 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2282 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
2283 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2284 }
2285
2286 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2287 va += vb_offset;
2288
2289 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2290 AC_UD_VS_VERTEX_BUFFERS, va);
2291
2292 cmd_buffer->state.vb_va = va;
2293 cmd_buffer->state.vb_size = count * 16;
2294 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
2295 }
2296 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
2297 }
2298
2299 static void
2300 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2301 {
2302 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2303 struct radv_userdata_info *loc;
2304 uint32_t base_reg;
2305
2306 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2307 if (!radv_get_shader(pipeline, stage))
2308 continue;
2309
2310 loc = radv_lookup_user_sgpr(pipeline, stage,
2311 AC_UD_STREAMOUT_BUFFERS);
2312 if (loc->sgpr_idx == -1)
2313 continue;
2314
2315 base_reg = pipeline->user_data_0[stage];
2316
2317 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2318 base_reg + loc->sgpr_idx * 4, va, false);
2319 }
2320
2321 if (pipeline->gs_copy_shader) {
2322 loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
2323 if (loc->sgpr_idx != -1) {
2324 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
2325
2326 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2327 base_reg + loc->sgpr_idx * 4, va, false);
2328 }
2329 }
2330 }
2331
2332 static void
2333 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
2334 {
2335 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
2336 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
2337 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
2338 unsigned so_offset;
2339 void *so_ptr;
2340 uint64_t va;
2341
2342 /* Allocate some descriptor state for streamout buffers. */
2343 if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
2344 MAX_SO_BUFFERS * 16, 256,
2345 &so_offset, &so_ptr))
2346 return;
2347
2348 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
2349 struct radv_buffer *buffer = sb[i].buffer;
2350 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
2351
2352 if (!(so->enabled_mask & (1 << i)))
2353 continue;
2354
2355 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
2356
2357 va += sb[i].offset;
2358
2359 /* Set the descriptor.
2360 *
2361 * On GFX8, the format must be non-INVALID, otherwise
2362 * the buffer will be considered not bound and store
2363 * instructions will be no-ops.
2364 */
2365 desc[0] = va;
2366 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
2367 desc[2] = 0xffffffff;
2368 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2369 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2370 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2371 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2372 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2373 }
2374
2375 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2376 va += so_offset;
2377
2378 radv_emit_streamout_buffers(cmd_buffer, va);
2379 }
2380
2381 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
2382 }
2383
2384 static void
2385 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2386 {
2387 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
2388 radv_flush_streamout_descriptors(cmd_buffer);
2389 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2390 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2391 }
2392
2393 struct radv_draw_info {
2394 /**
2395 * Number of vertices.
2396 */
2397 uint32_t count;
2398
2399 /**
2400 * Index of the first vertex.
2401 */
2402 int32_t vertex_offset;
2403
2404 /**
2405 * First instance id.
2406 */
2407 uint32_t first_instance;
2408
2409 /**
2410 * Number of instances.
2411 */
2412 uint32_t instance_count;
2413
2414 /**
2415 * First index (indexed draws only).
2416 */
2417 uint32_t first_index;
2418
2419 /**
2420 * Whether it's an indexed draw.
2421 */
2422 bool indexed;
2423
2424 /**
2425 * Indirect draw parameters resource.
2426 */
2427 struct radv_buffer *indirect;
2428 uint64_t indirect_offset;
2429 uint32_t stride;
2430
2431 /**
2432 * Draw count parameters resource.
2433 */
2434 struct radv_buffer *count_buffer;
2435 uint64_t count_buffer_offset;
2436
2437 /**
2438 * Stream output parameters resource.
2439 */
2440 struct radv_buffer *strmout_buffer;
2441 uint64_t strmout_buffer_offset;
2442 };
2443
2444 static void
2445 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
2446 const struct radv_draw_info *draw_info)
2447 {
2448 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
2449 struct radv_cmd_state *state = &cmd_buffer->state;
2450 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2451 uint32_t ia_multi_vgt_param;
2452 int32_t primitive_reset_en;
2453
2454 /* Draw state. */
2455 ia_multi_vgt_param =
2456 si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
2457 draw_info->indirect,
2458 !!draw_info->strmout_buffer,
2459 draw_info->indirect ? 0 : draw_info->count);
2460
2461 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
2462 if (info->chip_class >= GFX9) {
2463 radeon_set_uconfig_reg_idx(cs,
2464 R_030960_IA_MULTI_VGT_PARAM,
2465 4, ia_multi_vgt_param);
2466 } else if (info->chip_class >= GFX7) {
2467 radeon_set_context_reg_idx(cs,
2468 R_028AA8_IA_MULTI_VGT_PARAM,
2469 1, ia_multi_vgt_param);
2470 } else {
2471 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
2472 ia_multi_vgt_param);
2473 }
2474 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
2475 }
2476
2477 /* Primitive restart. */
2478 primitive_reset_en =
2479 draw_info->indexed && state->pipeline->graphics.prim_restart_enable;
2480
2481 if (primitive_reset_en != state->last_primitive_reset_en) {
2482 state->last_primitive_reset_en = primitive_reset_en;
2483 if (info->chip_class >= GFX9) {
2484 radeon_set_uconfig_reg(cs,
2485 R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
2486 primitive_reset_en);
2487 } else {
2488 radeon_set_context_reg(cs,
2489 R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
2490 primitive_reset_en);
2491 }
2492 }
2493
2494 if (primitive_reset_en) {
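/* The restart index is all ones for the bound index size: 0xffff for
 * 16-bit indices and 0xffffffff for 32-bit indices.
 */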
2495 uint32_t primitive_reset_index =
2496 state->index_type ? 0xffffffffu : 0xffffu;
2497
2498 if (primitive_reset_index != state->last_primitive_reset_index) {
2499 radeon_set_context_reg(cs,
2500 R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
2501 primitive_reset_index);
2502 state->last_primitive_reset_index = primitive_reset_index;
2503 }
2504 }
2505
2506 if (draw_info->strmout_buffer) {
2507 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
2508
2509 va += draw_info->strmout_buffer->offset +
2510 draw_info->strmout_buffer_offset;
2511
2512 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
2513 draw_info->stride);
2514
2515 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2516 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
2517 COPY_DATA_DST_SEL(COPY_DATA_REG) |
2518 COPY_DATA_WR_CONFIRM);
2519 radeon_emit(cs, va);
2520 radeon_emit(cs, va >> 32);
2521 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
2522 radeon_emit(cs, 0); /* unused */
2523
2524 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
2525 }
2526 }
2527
2528 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
2529 VkPipelineStageFlags src_stage_mask)
2530 {
2531 if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
2532 VK_PIPELINE_STAGE_TRANSFER_BIT |
2533 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2534 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2535 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2536 }
2537
2538 if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
2539 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
2540 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
2541 VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
2542 VK_PIPELINE_STAGE_TRANSFER_BIT |
2543 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2544 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
2545 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2546 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
2547 } else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
2548 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
2549 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
2550 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
2551 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
2552 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
2553 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
2554 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
2555 }
2556 }
2557
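/* Translate a source access mask into the flushes needed to make prior
 * writes visible.
 */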
2558 static enum radv_cmd_flush_bits
2559 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
2560 VkAccessFlags src_flags,
2561 struct radv_image *image)
2562 {
2563 bool flush_CB_meta = true, flush_DB_meta = true;
2564 enum radv_cmd_flush_bits flush_bits = 0;
2565 uint32_t b;
2566
2567 if (image) {
2568 if (!radv_image_has_CB_metadata(image))
2569 flush_CB_meta = false;
2570 if (!radv_image_has_htile(image))
2571 flush_DB_meta = false;
2572 }
2573
2574 for_each_bit(b, src_flags) {
2575 switch ((VkAccessFlagBits)(1 << b)) {
2576 case VK_ACCESS_SHADER_WRITE_BIT:
2577 case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
2578 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2579 flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
2580 break;
2581 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
2582 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2583 if (flush_CB_meta)
2584 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2585 break;
2586 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
2587 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2588 if (flush_DB_meta)
2589 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2590 break;
2591 case VK_ACCESS_TRANSFER_WRITE_BIT:
2592 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2593 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
2594 RADV_CMD_FLAG_INV_GLOBAL_L2;
2595
2596 if (flush_CB_meta)
2597 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2598 if (flush_DB_meta)
2599 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2600 break;
2601 default:
2602 break;
2603 }
2604 }
2605 return flush_bits;
2606 }
2607
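/* Translate a destination access mask into the cache invalidations needed
 * before the corresponding reads can proceed.
 */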
2608 static enum radv_cmd_flush_bits
2609 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
2610 VkAccessFlags dst_flags,
2611 struct radv_image *image)
2612 {
2613 bool flush_CB_meta = true, flush_DB_meta = true;
2614 enum radv_cmd_flush_bits flush_bits = 0;
2615 bool flush_CB = true, flush_DB = true;
2616 bool image_is_coherent = false;
2617 uint32_t b;
2618
2619 if (image) {
2620 if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
2621 flush_CB = false;
2622 flush_DB = false;
2623 }
2624
2625 if (!radv_image_has_CB_metadata(image))
2626 flush_CB_meta = false;
2627 if (!radv_image_has_htile(image))
2628 flush_DB_meta = false;
2629
2630 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2631 if (image->info.samples == 1 &&
2632 (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
2633 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
2634 !vk_format_is_stencil(image->vk_format)) {
2635 /* Single-sample color and single-sample depth
2636 * (not stencil) are coherent with shaders on
2637 * GFX9.
2638 */
2639 image_is_coherent = true;
2640 }
2641 }
2642 }
2643
2644 for_each_bit(b, dst_flags) {
2645 switch ((VkAccessFlagBits)(1 << b)) {
2646 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
2647 case VK_ACCESS_INDEX_READ_BIT:
2648 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2649 break;
2650 case VK_ACCESS_UNIFORM_READ_BIT:
2651 flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
2652 break;
2653 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
2654 case VK_ACCESS_TRANSFER_READ_BIT:
2655 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
2656 flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 |
2657 RADV_CMD_FLAG_INV_GLOBAL_L2;
2658 break;
2659 case VK_ACCESS_SHADER_READ_BIT:
2660 flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1;
2661
2662 if (!image_is_coherent)
2663 flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2;
2664 break;
2665 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
2666 if (flush_CB)
2667 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2668 if (flush_CB_meta)
2669 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2670 break;
2671 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
2672 if (flush_DB)
2673 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2674 if (flush_DB_meta)
2675 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2676 break;
2677 default:
2678 break;
2679 }
2680 }
2681 return flush_bits;
2682 }
2683
2684 void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
2685 const struct radv_subpass_barrier *barrier)
2686 {
2687 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
2688 NULL);
2689 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
2690 cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
2691 NULL);
2692 }
2693
2694 uint32_t
2695 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
2696 {
2697 struct radv_cmd_state *state = &cmd_buffer->state;
2698 uint32_t subpass_id = state->subpass - state->pass->subpasses;
2699
2700 /* The id of this subpass shouldn't exceed the number of subpasses in
2701 * this render pass minus 1.
2702 */
2703 assert(subpass_id < state->pass->subpass_count);
2704 return subpass_id;
2705 }
2706
2707 static struct radv_sample_locations_state *
2708 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer,
2709 uint32_t att_idx,
2710 bool begin_subpass)
2711 {
2712 struct radv_cmd_state *state = &cmd_buffer->state;
2713 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
2714 struct radv_image_view *view = state->framebuffer->attachments[att_idx].attachment;
2715
2716 if (view->image->info.samples == 1)
2717 return NULL;
2718
2719 if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
2720 /* Return the initial sample locations if this is the initial
2721                  * layout transition of the given subpass attachment.
2722 */
2723 if (state->attachments[att_idx].sample_location.count > 0)
2724 return &state->attachments[att_idx].sample_location;
2725 } else {
2726 /* Otherwise return the subpass sample locations if defined. */
2727 if (state->subpass_sample_locs) {
2728 /* Because the driver sets the current subpass before
2729 * initial layout transitions, we should use the sample
2730 * locations from the previous subpass to avoid an
2731 * off-by-one problem. Otherwise, use the sample
2732 * locations for the current subpass for final layout
2733 * transitions.
2734 */
2735 if (begin_subpass)
2736 subpass_id--;
2737
2738 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
2739 if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
2740 return &state->subpass_sample_locs[i].sample_location;
2741 }
2742 }
2743 }
2744
2745 return NULL;
2746 }
2747
2748 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
2749 struct radv_subpass_attachment att,
2750 bool begin_subpass)
2751 {
2752 unsigned idx = att.attachment;
2753 struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment;
2754 struct radv_sample_locations_state *sample_locs;
2755 VkImageSubresourceRange range;
2756 range.aspectMask = 0;
2757 range.baseMipLevel = view->base_mip;
2758 range.levelCount = 1;
2759 range.baseArrayLayer = view->base_layer;
2760 range.layerCount = cmd_buffer->state.framebuffer->layers;
2761
2762 if (cmd_buffer->state.subpass->view_mask) {
2763 /* If the current subpass uses multiview, the driver might have
2764 * performed a fast color/depth clear to the whole image
2765 * (including all layers). To make sure the driver will
2766 * decompress the image correctly (if needed), we have to
2767 * account for the "real" number of layers. If the view mask is
2768 * sparse, this will decompress more layers than needed.
2769 */
2770 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
2771 }
2772
2773         /* Get the subpass sample locations for the given attachment; if NULL
2774          * is returned, the driver will use the default HW locations.
2775 */
2776 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx,
2777 begin_subpass);
2778
2779 radv_handle_image_transition(cmd_buffer,
2780 view->image,
2781 cmd_buffer->state.attachments[idx].current_layout,
2782 att.layout, 0, 0, &range, sample_locs);
2783
2784 cmd_buffer->state.attachments[idx].current_layout = att.layout;
2785
2787 }
2788
2789 void
2790 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
2791 const struct radv_subpass *subpass)
2792 {
2793 cmd_buffer->state.subpass = subpass;
2794
2795 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
2796 }
2797
2798 static VkResult
2799 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
2800 struct radv_render_pass *pass,
2801 const VkRenderPassBeginInfo *info)
2802 {
2803 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
2804 vk_find_struct_const(info->pNext,
2805 RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
2806 struct radv_cmd_state *state = &cmd_buffer->state;
2807 struct radv_framebuffer *framebuffer = state->framebuffer;
2808
2809 if (!sample_locs) {
2810 state->subpass_sample_locs = NULL;
2811 return VK_SUCCESS;
2812 }
2813
2814 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
2815 const VkAttachmentSampleLocationsEXT *att_sample_locs =
2816 &sample_locs->pAttachmentInitialSampleLocations[i];
2817 uint32_t att_idx = att_sample_locs->attachmentIndex;
2818 struct radv_attachment_info *att = &framebuffer->attachments[att_idx];
2819 struct radv_image *image = att->attachment->image;
2820
2821 assert(vk_format_is_depth_or_stencil(image->vk_format));
2822
2823 /* From the Vulkan spec 1.1.108:
2824 *
2825 * "If the image referenced by the framebuffer attachment at
2826 * index attachmentIndex was not created with
2827 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
2828 * then the values specified in sampleLocationsInfo are
2829 * ignored."
2830 */
2831 if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
2832 continue;
2833
2834 const VkSampleLocationsInfoEXT *sample_locs_info =
2835 &att_sample_locs->sampleLocationsInfo;
2836
2837 state->attachments[att_idx].sample_location.per_pixel =
2838 sample_locs_info->sampleLocationsPerPixel;
2839 state->attachments[att_idx].sample_location.grid_size =
2840 sample_locs_info->sampleLocationGridSize;
2841 state->attachments[att_idx].sample_location.count =
2842 sample_locs_info->sampleLocationsCount;
2843 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
2844 sample_locs_info->pSampleLocations,
2845 sample_locs_info->sampleLocationsCount);
2846 }
2847
2848 state->subpass_sample_locs = vk_alloc(&cmd_buffer->pool->alloc,
2849 sample_locs->postSubpassSampleLocationsCount *
2850 sizeof(state->subpass_sample_locs[0]),
2851 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2852 if (state->subpass_sample_locs == NULL) {
2853 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
2854 return cmd_buffer->record_result;
2855 }
2856
2857 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
2858
2859 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
2860 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
2861 &sample_locs->pPostSubpassSampleLocations[i];
2862 const VkSampleLocationsInfoEXT *sample_locs_info =
2863 &subpass_sample_locs_info->sampleLocationsInfo;
2864
2865 state->subpass_sample_locs[i].subpass_idx =
2866 subpass_sample_locs_info->subpassIndex;
2867 state->subpass_sample_locs[i].sample_location.per_pixel =
2868 sample_locs_info->sampleLocationsPerPixel;
2869 state->subpass_sample_locs[i].sample_location.grid_size =
2870 sample_locs_info->sampleLocationGridSize;
2871 state->subpass_sample_locs[i].sample_location.count =
2872 sample_locs_info->sampleLocationsCount;
2873 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
2874 sample_locs_info->pSampleLocations,
2875 sample_locs_info->sampleLocationsCount);
2876 }
2877
2878 return VK_SUCCESS;
2879 }
2880
2881 static VkResult
2882 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
2883 struct radv_render_pass *pass,
2884 const VkRenderPassBeginInfo *info)
2885 {
2886 struct radv_cmd_state *state = &cmd_buffer->state;
2887
2888 if (pass->attachment_count == 0) {
2889 state->attachments = NULL;
2890 return VK_SUCCESS;
2891 }
2892
2893 state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
2894 pass->attachment_count *
2895 sizeof(state->attachments[0]),
2896 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2897 if (state->attachments == NULL) {
2898 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
2899 return cmd_buffer->record_result;
2900 }
2901
2902 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
2903 struct radv_render_pass_attachment *att = &pass->attachments[i];
2904 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
2905 VkImageAspectFlags clear_aspects = 0;
2906
2907 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
2908 /* color attachment */
2909 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2910 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
2911 }
2912 } else {
2913 /* depthstencil attachment */
2914 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
2915 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2916 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
2917 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
2918 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
2919 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
2920 }
2921 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
2922 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2923 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
2924 }
2925 }
2926
2927 state->attachments[i].pending_clear_aspects = clear_aspects;
2928 state->attachments[i].cleared_views = 0;
2929 if (clear_aspects && info) {
2930 assert(info->clearValueCount > i);
2931 state->attachments[i].clear_value = info->pClearValues[i];
2932 }
2933
2934 state->attachments[i].current_layout = att->initial_layout;
2935 state->attachments[i].sample_location.count = 0;
2936 }
2937
2938 return VK_SUCCESS;
2939 }
2940
2941 VkResult radv_AllocateCommandBuffers(
2942 VkDevice _device,
2943 const VkCommandBufferAllocateInfo *pAllocateInfo,
2944 VkCommandBuffer *pCommandBuffers)
2945 {
2946 RADV_FROM_HANDLE(radv_device, device, _device);
2947 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
2948
2949 VkResult result = VK_SUCCESS;
2950 uint32_t i;
2951
2952 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
2953
2954 if (!list_empty(&pool->free_cmd_buffers)) {
2955 struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
2956
2957 list_del(&cmd_buffer->pool_link);
2958 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
2959
2960 result = radv_reset_cmd_buffer(cmd_buffer);
2961 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
2962 cmd_buffer->level = pAllocateInfo->level;
2963
2964 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
2965 } else {
2966 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
2967 &pCommandBuffers[i]);
2968 }
2969 if (result != VK_SUCCESS)
2970 break;
2971 }
2972
2973 if (result != VK_SUCCESS) {
2974 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
2975 i, pCommandBuffers);
2976
2977 /* From the Vulkan 1.0.66 spec:
2978 *
2979 * "vkAllocateCommandBuffers can be used to create multiple
2980 * command buffers. If the creation of any of those command
2981 * buffers fails, the implementation must destroy all
2982 * successfully created command buffer objects from this
2983 * command, set all entries of the pCommandBuffers array to
2984 * NULL and return the error."
2985 */
2986 memset(pCommandBuffers, 0,
2987 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
2988 }
2989
2990 return result;
2991 }
2992
2993 void radv_FreeCommandBuffers(
2994 VkDevice device,
2995 VkCommandPool commandPool,
2996 uint32_t commandBufferCount,
2997 const VkCommandBuffer *pCommandBuffers)
2998 {
2999 for (uint32_t i = 0; i < commandBufferCount; i++) {
3000 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
3001
3002 if (cmd_buffer) {
3003 if (cmd_buffer->pool) {
3004 list_del(&cmd_buffer->pool_link);
3005 list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
3006 } else
3007 radv_cmd_buffer_destroy(cmd_buffer);
3008
3009 }
3010 }
3011 }
3012
3013 VkResult radv_ResetCommandBuffer(
3014 VkCommandBuffer commandBuffer,
3015 VkCommandBufferResetFlags flags)
3016 {
3017 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3018 return radv_reset_cmd_buffer(cmd_buffer);
3019 }
3020
3021 VkResult radv_BeginCommandBuffer(
3022 VkCommandBuffer commandBuffer,
3023 const VkCommandBufferBeginInfo *pBeginInfo)
3024 {
3025 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3026 VkResult result = VK_SUCCESS;
3027
3028 if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
3029                 /* If the command buffer has already been reset with
3030                  * vkResetCommandBuffer, there is no need to do it again.
3031 */
3032 result = radv_reset_cmd_buffer(cmd_buffer);
3033 if (result != VK_SUCCESS)
3034 return result;
3035 }
3036
3037 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
3038 cmd_buffer->state.last_primitive_reset_en = -1;
3039 cmd_buffer->state.last_index_type = -1;
3040 cmd_buffer->state.last_num_instances = -1;
3041 cmd_buffer->state.last_vertex_offset = -1;
3042 cmd_buffer->state.last_first_instance = -1;
3043 cmd_buffer->state.predication_type = -1;
3044 cmd_buffer->usage_flags = pBeginInfo->flags;
3045
3046 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
3047 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
3048 assert(pBeginInfo->pInheritanceInfo);
3049 cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
3050 cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
3051
3052 struct radv_subpass *subpass =
3053 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
3054
3055 result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
3056 if (result != VK_SUCCESS)
3057 return result;
3058
3059 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3060 }
3061
3062 if (unlikely(cmd_buffer->device->trace_bo)) {
3063 struct radv_device *device = cmd_buffer->device;
3064
3065 radv_cs_add_buffer(device->ws, cmd_buffer->cs,
3066 device->trace_bo);
3067
3068 radv_cmd_buffer_trace_emit(cmd_buffer);
3069 }
3070
3071 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
3072
3073 return result;
3074 }
3075
3076 void radv_CmdBindVertexBuffers(
3077 VkCommandBuffer commandBuffer,
3078 uint32_t firstBinding,
3079 uint32_t bindingCount,
3080 const VkBuffer* pBuffers,
3081 const VkDeviceSize* pOffsets)
3082 {
3083 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3084 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
3085 bool changed = false;
3086
3087         /* We have to defer setting up the vertex buffer descriptors since we
3088          * need the buffer stride from the pipeline. */
3089
3090 assert(firstBinding + bindingCount <= MAX_VBS);
3091 for (uint32_t i = 0; i < bindingCount; i++) {
3092 uint32_t idx = firstBinding + i;
3093
3094 if (!changed &&
3095 (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
3096 vb[idx].offset != pOffsets[i])) {
3097 changed = true;
3098 }
3099
3100 vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
3101 vb[idx].offset = pOffsets[i];
3102
3103 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3104 vb[idx].buffer->bo);
3105 }
3106
3107 if (!changed) {
3108 /* No state changes. */
3109 return;
3110 }
3111
3112 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
3113 }
3114
3115 void radv_CmdBindIndexBuffer(
3116 VkCommandBuffer commandBuffer,
3117 VkBuffer buffer,
3118 VkDeviceSize offset,
3119 VkIndexType indexType)
3120 {
3121 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3122 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
3123
3124 if (cmd_buffer->state.index_buffer == index_buffer &&
3125 cmd_buffer->state.index_offset == offset &&
3126 cmd_buffer->state.index_type == indexType) {
3127 /* No state changes. */
3128 return;
3129 }
3130
3131 cmd_buffer->state.index_buffer = index_buffer;
3132 cmd_buffer->state.index_offset = offset;
3133         cmd_buffer->state.index_type = indexType; /* The VK index type values match the HW encoding. */
3134 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
3135 cmd_buffer->state.index_va += index_buffer->offset + offset;
3136
3137 int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
3138 cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
3139 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
3140 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
3141 }
3142
3143
3144 static void
3145 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3146 VkPipelineBindPoint bind_point,
3147 struct radv_descriptor_set *set, unsigned idx)
3148 {
3149 struct radeon_winsys *ws = cmd_buffer->device->ws;
3150
3151 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
3152
3153 assert(set);
3154 assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
3155
3156 if (!cmd_buffer->device->use_global_bo_list) {
3157 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3158 if (set->descriptors[j])
3159 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
3160 }
3161
3162         if (set->bo)
3163 radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
3164 }
3165
3166 void radv_CmdBindDescriptorSets(
3167 VkCommandBuffer commandBuffer,
3168 VkPipelineBindPoint pipelineBindPoint,
3169 VkPipelineLayout _layout,
3170 uint32_t firstSet,
3171 uint32_t descriptorSetCount,
3172 const VkDescriptorSet* pDescriptorSets,
3173 uint32_t dynamicOffsetCount,
3174 const uint32_t* pDynamicOffsets)
3175 {
3176 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3177 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3178 unsigned dyn_idx = 0;
3179
3180 const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
3181 struct radv_descriptor_state *descriptors_state =
3182 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3183
3184 for (unsigned i = 0; i < descriptorSetCount; ++i) {
3185 unsigned idx = i + firstSet;
3186 RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
3187 radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, idx);
3188
3189                 for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
3190 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
3191 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
3192 assert(dyn_idx < dynamicOffsetCount);
3193
3194 struct radv_descriptor_range *range = set->dynamic_descriptors + j;
3195 uint64_t va = range->va + pDynamicOffsets[dyn_idx];
3196 dst[0] = va;
3197 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3198 dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
3199 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3200 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3201 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3202 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
3203 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3204 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3205 cmd_buffer->push_constant_stages |=
3206 set->layout->dynamic_shader_stages;
3207 }
3208 }
3209 }
3210
3211 static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3212 struct radv_descriptor_set *set,
3213 struct radv_descriptor_set_layout *layout,
3214 VkPipelineBindPoint bind_point)
3215 {
3216 struct radv_descriptor_state *descriptors_state =
3217 radv_get_descriptors_state(cmd_buffer, bind_point);
3218 set->size = layout->size;
3219 set->layout = layout;
3220
3221 if (descriptors_state->push_set.capacity < set->size) {
3222 size_t new_size = MAX2(set->size, 1024);
3223 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
3224 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
3225
3226 free(set->mapped_ptr);
3227 set->mapped_ptr = malloc(new_size);
3228
3229 if (!set->mapped_ptr) {
3230 descriptors_state->push_set.capacity = 0;
3231 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3232 return false;
3233 }
3234
3235 descriptors_state->push_set.capacity = new_size;
3236 }
3237
3238 return true;
3239 }
3240
3241 void radv_meta_push_descriptor_set(
3242 struct radv_cmd_buffer* cmd_buffer,
3243 VkPipelineBindPoint pipelineBindPoint,
3244 VkPipelineLayout _layout,
3245 uint32_t set,
3246 uint32_t descriptorWriteCount,
3247 const VkWriteDescriptorSet* pDescriptorWrites)
3248 {
3249 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3250 struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
3251 unsigned bo_offset;
3252
3253 assert(set == 0);
3254 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3255
3256 push_set->size = layout->set[set].layout->size;
3257 push_set->layout = layout->set[set].layout;
3258
3259 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
3260 &bo_offset,
3261 (void**) &push_set->mapped_ptr))
3262 return;
3263
3264 push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3265 push_set->va += bo_offset;
3266
3267 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3268 radv_descriptor_set_to_handle(push_set),
3269 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3270
3271 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3272 }
3273
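/* For reference, an application typically drives this entry point roughly
 * as follows (illustrative sketch; cmd, ubo and pipeline_layout stand in
 * for application-owned objects, and set 0 of pipeline_layout must have
 * been created with VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR):
 *
 *    VkDescriptorBufferInfo buf_info = {
 *        .buffer = ubo,
 *        .offset = 0,
 *        .range = VK_WHOLE_SIZE,
 *    };
 *    VkWriteDescriptorSet write = {
 *        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
 *        .dstBinding = 0,
 *        .descriptorCount = 1,
 *        .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
 *        .pBufferInfo = &buf_info,
 *    };
 *    vkCmdPushDescriptorSetKHR(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
 *                              pipeline_layout, 0, 1, &write);
 *
 * No VkDescriptorSet object is involved; the writes land in the internal
 * push set that is allocated and marked dirty below.
 */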
3274 void radv_CmdPushDescriptorSetKHR(
3275 VkCommandBuffer commandBuffer,
3276 VkPipelineBindPoint pipelineBindPoint,
3277 VkPipelineLayout _layout,
3278 uint32_t set,
3279 uint32_t descriptorWriteCount,
3280 const VkWriteDescriptorSet* pDescriptorWrites)
3281 {
3282 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3283 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3284 struct radv_descriptor_state *descriptors_state =
3285 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3286 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3287
3288 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3289
3290 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3291 layout->set[set].layout,
3292 pipelineBindPoint))
3293 return;
3294
3295 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
3296 	 * because that is invalid according to the Vulkan spec.
3297 */
3298 for (int i = 0; i < descriptorWriteCount; i++) {
3299 MAYBE_UNUSED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
3300 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
3301 }
3302
3303 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3304 radv_descriptor_set_to_handle(push_set),
3305 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3306
3307 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3308 descriptors_state->push_dirty = true;
3309 }
3310
3311 void radv_CmdPushDescriptorSetWithTemplateKHR(
3312 VkCommandBuffer commandBuffer,
3313 VkDescriptorUpdateTemplate descriptorUpdateTemplate,
3314 VkPipelineLayout _layout,
3315 uint32_t set,
3316 const void* pData)
3317 {
3318 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3319 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3320 RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
3321 struct radv_descriptor_state *descriptors_state =
3322 radv_get_descriptors_state(cmd_buffer, templ->bind_point);
3323 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3324
3325 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3326
3327 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3328 layout->set[set].layout,
3329 templ->bind_point))
3330 return;
3331
3332 radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
3333 descriptorUpdateTemplate, pData);
3334
3335 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
3336 descriptors_state->push_dirty = true;
3337 }
3338
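/* For reference, a typical application call looks roughly like this
 * (illustrative sketch; cmd, pipeline_layout and the constant data are
 * application-side):
 *
 *    const float color[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
 *    vkCmdPushConstants(cmd, pipeline_layout,
 *                       VK_SHADER_STAGE_FRAGMENT_BIT,
 *                       0, sizeof(color), color);
 *
 * Valid usage requires offset + size to fit inside the layout's push
 * constant ranges, which is why the memcpy below needs no bounds check.
 */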
3339 void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
3340 VkPipelineLayout layout,
3341 VkShaderStageFlags stageFlags,
3342 uint32_t offset,
3343 uint32_t size,
3344 const void* pValues)
3345 {
3346 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3347 memcpy(cmd_buffer->push_constants + offset, pValues, size);
3348 cmd_buffer->push_constant_stages |= stageFlags;
3349 }
3350
3351 VkResult radv_EndCommandBuffer(
3352 VkCommandBuffer commandBuffer)
3353 {
3354 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3355
3356 if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
3357 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
3358 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
3359
3360 /* Make sure to sync all pending active queries at the end of
3361 		 * the command buffer.
3362 */
3363 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
3364
3365 si_emit_cache_flush(cmd_buffer);
3366 }
3367
3368 /* Make sure CP DMA is idle at the end of IBs because the kernel
3369 * doesn't wait for it.
3370 */
3371 si_cp_dma_wait_for_idle(cmd_buffer);
3372
3373 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
3374 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
3375
3376 if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
3377 return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
3378
3379 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
3380
3381 return cmd_buffer->record_result;
3382 }
3383
3384 static void
3385 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
3386 {
3387 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
3388
3389 if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
3390 return;
3391
3392 assert(!pipeline->ctx_cs.cdw);
3393
3394 cmd_buffer->state.emitted_compute_pipeline = pipeline;
3395
3396 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
3397 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
3398
3399 cmd_buffer->compute_scratch_size_needed =
3400 MAX2(cmd_buffer->compute_scratch_size_needed,
3401 pipeline->max_waves * pipeline->scratch_bytes_per_wave);
3402
3403 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3404 pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
3405
3406 if (unlikely(cmd_buffer->device->trace_bo))
3407 radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
3408 }
3409
3410 static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
3411 VkPipelineBindPoint bind_point)
3412 {
3413 struct radv_descriptor_state *descriptors_state =
3414 radv_get_descriptors_state(cmd_buffer, bind_point);
3415
3416 descriptors_state->dirty |= descriptors_state->valid;
3417 }
3418
3419 void radv_CmdBindPipeline(
3420 VkCommandBuffer commandBuffer,
3421 VkPipelineBindPoint pipelineBindPoint,
3422 VkPipeline _pipeline)
3423 {
3424 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3425 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
3426
3427 switch (pipelineBindPoint) {
3428 case VK_PIPELINE_BIND_POINT_COMPUTE:
3429 if (cmd_buffer->state.compute_pipeline == pipeline)
3430 return;
3431 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3432
3433 cmd_buffer->state.compute_pipeline = pipeline;
3434 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3435 break;
3436 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3437 if (cmd_buffer->state.pipeline == pipeline)
3438 return;
3439 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3440
3441 cmd_buffer->state.pipeline = pipeline;
3442 if (!pipeline)
3443 break;
3444
3445 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
3446 cmd_buffer->push_constant_stages |= pipeline->active_stages;
3447
3448 /* the new vertex shader might not have the same user regs */
3449 cmd_buffer->state.last_first_instance = -1;
3450 cmd_buffer->state.last_vertex_offset = -1;
3451
3452 /* Prefetch all pipeline shaders at first draw time. */
3453 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
3454
3455 radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
3456 radv_bind_streamout_state(cmd_buffer, pipeline);
3457
3458 if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
3459 cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
3460 if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
3461 cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
3462
3463 if (radv_pipeline_has_tess(pipeline))
3464 cmd_buffer->tess_rings_needed = true;
3465 break;
3466 default:
3467 assert(!"invalid bind point");
3468 break;
3469 }
3470 }
3471
3472 void radv_CmdSetViewport(
3473 VkCommandBuffer commandBuffer,
3474 uint32_t firstViewport,
3475 uint32_t viewportCount,
3476 const VkViewport* pViewports)
3477 {
3478 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3479 struct radv_cmd_state *state = &cmd_buffer->state;
3480 MAYBE_UNUSED const uint32_t total_count = firstViewport + viewportCount;
3481
3482 assert(firstViewport < MAX_VIEWPORTS);
3483 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
3484
3485 if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
3486 pViewports, viewportCount * sizeof(*pViewports))) {
3487 return;
3488 }
3489
3490 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
3491 viewportCount * sizeof(*pViewports));
3492
3493 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
3494 }
3495
3496 void radv_CmdSetScissor(
3497 VkCommandBuffer commandBuffer,
3498 uint32_t firstScissor,
3499 uint32_t scissorCount,
3500 const VkRect2D* pScissors)
3501 {
3502 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3503 struct radv_cmd_state *state = &cmd_buffer->state;
3504 MAYBE_UNUSED const uint32_t total_count = firstScissor + scissorCount;
3505
3506 assert(firstScissor < MAX_SCISSORS);
3507 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
3508
3509 if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
3510 scissorCount * sizeof(*pScissors))) {
3511 return;
3512 }
3513
3514 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
3515 scissorCount * sizeof(*pScissors));
3516
3517 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
3518 }
3519
3520 void radv_CmdSetLineWidth(
3521 VkCommandBuffer commandBuffer,
3522 float lineWidth)
3523 {
3524 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3525
3526 if (cmd_buffer->state.dynamic.line_width == lineWidth)
3527 return;
3528
3529 cmd_buffer->state.dynamic.line_width = lineWidth;
3530 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
3531 }
3532
3533 void radv_CmdSetDepthBias(
3534 VkCommandBuffer commandBuffer,
3535 float depthBiasConstantFactor,
3536 float depthBiasClamp,
3537 float depthBiasSlopeFactor)
3538 {
3539 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3540 struct radv_cmd_state *state = &cmd_buffer->state;
3541
3542 if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
3543 state->dynamic.depth_bias.clamp == depthBiasClamp &&
3544 state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
3545 return;
3546 }
3547
3548 state->dynamic.depth_bias.bias = depthBiasConstantFactor;
3549 state->dynamic.depth_bias.clamp = depthBiasClamp;
3550 state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
3551
3552 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
3553 }
3554
3555 void radv_CmdSetBlendConstants(
3556 VkCommandBuffer commandBuffer,
3557 const float blendConstants[4])
3558 {
3559 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3560 struct radv_cmd_state *state = &cmd_buffer->state;
3561
3562 if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
3563 return;
3564
3565 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
3566
3567 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
3568 }
3569
3570 void radv_CmdSetDepthBounds(
3571 VkCommandBuffer commandBuffer,
3572 float minDepthBounds,
3573 float maxDepthBounds)
3574 {
3575 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3576 struct radv_cmd_state *state = &cmd_buffer->state;
3577
3578 if (state->dynamic.depth_bounds.min == minDepthBounds &&
3579 state->dynamic.depth_bounds.max == maxDepthBounds) {
3580 return;
3581 }
3582
3583 state->dynamic.depth_bounds.min = minDepthBounds;
3584 state->dynamic.depth_bounds.max = maxDepthBounds;
3585
3586 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
3587 }
3588
3589 void radv_CmdSetStencilCompareMask(
3590 VkCommandBuffer commandBuffer,
3591 VkStencilFaceFlags faceMask,
3592 uint32_t compareMask)
3593 {
3594 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3595 struct radv_cmd_state *state = &cmd_buffer->state;
3596 bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
3597 bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
3598
3599 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3600 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3601 return;
3602 }
3603
3604 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3605 state->dynamic.stencil_compare_mask.front = compareMask;
3606 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3607 state->dynamic.stencil_compare_mask.back = compareMask;
3608
3609 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
3610 }
3611
3612 void radv_CmdSetStencilWriteMask(
3613 VkCommandBuffer commandBuffer,
3614 VkStencilFaceFlags faceMask,
3615 uint32_t writeMask)
3616 {
3617 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3618 struct radv_cmd_state *state = &cmd_buffer->state;
3619 bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
3620 bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
3621
3622 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3623 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3624 return;
3625 }
3626
3627 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3628 state->dynamic.stencil_write_mask.front = writeMask;
3629 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3630 state->dynamic.stencil_write_mask.back = writeMask;
3631
3632 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
3633 }
3634
3635 void radv_CmdSetStencilReference(
3636 VkCommandBuffer commandBuffer,
3637 VkStencilFaceFlags faceMask,
3638 uint32_t reference)
3639 {
3640 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3641 struct radv_cmd_state *state = &cmd_buffer->state;
3642 bool front_same = state->dynamic.stencil_reference.front == reference;
3643 bool back_same = state->dynamic.stencil_reference.back == reference;
3644
3645 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3646 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3647 return;
3648 }
3649
3650 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3651 cmd_buffer->state.dynamic.stencil_reference.front = reference;
3652 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3653 cmd_buffer->state.dynamic.stencil_reference.back = reference;
3654
3655 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
3656 }
3657
3658 void radv_CmdSetDiscardRectangleEXT(
3659 VkCommandBuffer commandBuffer,
3660 uint32_t firstDiscardRectangle,
3661 uint32_t discardRectangleCount,
3662 const VkRect2D* pDiscardRectangles)
3663 {
3664 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3665 struct radv_cmd_state *state = &cmd_buffer->state;
3666 MAYBE_UNUSED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
3667
3668 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
3669 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
3670
3671 if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
3672 pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
3673 return;
3674 }
3675
3676 typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
3677 pDiscardRectangles, discardRectangleCount);
3678
3679 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
3680 }
3681
3682 void radv_CmdSetSampleLocationsEXT(
3683 VkCommandBuffer commandBuffer,
3684 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
3685 {
3686 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3687 struct radv_cmd_state *state = &cmd_buffer->state;
3688
3689 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
3690
3691 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
3692 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
3693 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
3694 typed_memcpy(&state->dynamic.sample_location.locations[0],
3695 pSampleLocationsInfo->pSampleLocations,
3696 pSampleLocationsInfo->sampleLocationsCount);
3697
3698 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
3699 }
3700
3701 void radv_CmdExecuteCommands(
3702 VkCommandBuffer commandBuffer,
3703 uint32_t commandBufferCount,
3704 const VkCommandBuffer* pCmdBuffers)
3705 {
3706 RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
3707
3708 assert(commandBufferCount > 0);
3709
3710 /* Emit pending flushes on primary prior to executing secondary */
3711 si_emit_cache_flush(primary);
3712
3713 for (uint32_t i = 0; i < commandBufferCount; i++) {
3714 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
3715
3716 primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
3717 secondary->scratch_size_needed);
3718 primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
3719 secondary->compute_scratch_size_needed);
3720
3721 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
3722 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
3723 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
3724 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
3725 if (secondary->tess_rings_needed)
3726 primary->tess_rings_needed = true;
3727 if (secondary->sample_positions_needed)
3728 primary->sample_positions_needed = true;
3729
3730 if (!secondary->state.framebuffer &&
3731 (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
3732 /* Emit the framebuffer state from primary if secondary
3733 * has been recorded without a framebuffer, otherwise
3734 * fast color/depth clears can't work.
3735 */
3736 radv_emit_framebuffer_state(primary);
3737 }
3738
3739 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
3740
3741
3742 		/* When the secondary command buffer is compute-only, we don't
3743 * need to re-emit the current graphics pipeline.
3744 */
3745 if (secondary->state.emitted_pipeline) {
3746 primary->state.emitted_pipeline =
3747 secondary->state.emitted_pipeline;
3748 }
3749
3750 		/* When the secondary command buffer is graphics-only, we don't
3751 * need to re-emit the current compute pipeline.
3752 */
3753 if (secondary->state.emitted_compute_pipeline) {
3754 primary->state.emitted_compute_pipeline =
3755 secondary->state.emitted_compute_pipeline;
3756 }
3757
3758 /* Only re-emit the draw packets when needed. */
3759 if (secondary->state.last_primitive_reset_en != -1) {
3760 primary->state.last_primitive_reset_en =
3761 secondary->state.last_primitive_reset_en;
3762 }
3763
3764 if (secondary->state.last_primitive_reset_index) {
3765 primary->state.last_primitive_reset_index =
3766 secondary->state.last_primitive_reset_index;
3767 }
3768
3769 if (secondary->state.last_ia_multi_vgt_param) {
3770 primary->state.last_ia_multi_vgt_param =
3771 secondary->state.last_ia_multi_vgt_param;
3772 }
3773
3774 primary->state.last_first_instance = secondary->state.last_first_instance;
3775 primary->state.last_num_instances = secondary->state.last_num_instances;
3776 primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
3777
3778 if (secondary->state.last_index_type != -1) {
3779 primary->state.last_index_type =
3780 secondary->state.last_index_type;
3781 }
3782 }
3783
3784 /* After executing commands from secondary buffers we have to dirty
3785 * some states.
3786 */
3787 primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
3788 RADV_CMD_DIRTY_INDEX_BUFFER |
3789 RADV_CMD_DIRTY_DYNAMIC_ALL;
3790 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
3791 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
3792 }
3793
3794 VkResult radv_CreateCommandPool(
3795 VkDevice _device,
3796 const VkCommandPoolCreateInfo* pCreateInfo,
3797 const VkAllocationCallbacks* pAllocator,
3798 VkCommandPool* pCmdPool)
3799 {
3800 RADV_FROM_HANDLE(radv_device, device, _device);
3801 struct radv_cmd_pool *pool;
3802
3803 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
3804 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3805 if (pool == NULL)
3806 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
3807
3808 if (pAllocator)
3809 pool->alloc = *pAllocator;
3810 else
3811 pool->alloc = device->alloc;
3812
3813 list_inithead(&pool->cmd_buffers);
3814 list_inithead(&pool->free_cmd_buffers);
3815
3816 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
3817
3818 *pCmdPool = radv_cmd_pool_to_handle(pool);
3819
3820 return VK_SUCCESS;
3821
3822 }
3823
3824 void radv_DestroyCommandPool(
3825 VkDevice _device,
3826 VkCommandPool commandPool,
3827 const VkAllocationCallbacks* pAllocator)
3828 {
3829 RADV_FROM_HANDLE(radv_device, device, _device);
3830 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3831
3832 if (!pool)
3833 return;
3834
3835 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3836 &pool->cmd_buffers, pool_link) {
3837 radv_cmd_buffer_destroy(cmd_buffer);
3838 }
3839
3840 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3841 &pool->free_cmd_buffers, pool_link) {
3842 radv_cmd_buffer_destroy(cmd_buffer);
3843 }
3844
3845 vk_free2(&device->alloc, pAllocator, pool);
3846 }
3847
3848 VkResult radv_ResetCommandPool(
3849 VkDevice device,
3850 VkCommandPool commandPool,
3851 VkCommandPoolResetFlags flags)
3852 {
3853 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3854 VkResult result;
3855
3856 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
3857 &pool->cmd_buffers, pool_link) {
3858 result = radv_reset_cmd_buffer(cmd_buffer);
3859 if (result != VK_SUCCESS)
3860 return result;
3861 }
3862
3863 return VK_SUCCESS;
3864 }
3865
3866 void radv_TrimCommandPool(
3867 VkDevice device,
3868 VkCommandPool commandPool,
3869 VkCommandPoolTrimFlags flags)
3870 {
3871 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
3872
3873 if (!pool)
3874 return;
3875
3876 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
3877 &pool->free_cmd_buffers, pool_link) {
3878 radv_cmd_buffer_destroy(cmd_buffer);
3879 }
3880 }
3881
3882 static void
3883 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
3884 uint32_t subpass_id)
3885 {
3886 struct radv_cmd_state *state = &cmd_buffer->state;
3887 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
3888
3889 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
3890 cmd_buffer->cs, 4096);
3891
3892 radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
3893
3894 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3895
3896 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
3897 const uint32_t a = subpass->attachments[i].attachment;
3898 if (a == VK_ATTACHMENT_UNUSED)
3899 continue;
3900
3901 radv_handle_subpass_image_transition(cmd_buffer,
3902 subpass->attachments[i],
3903 true);
3904 }
3905
3906 radv_cmd_buffer_clear_subpass(cmd_buffer);
3907
3908 assert(cmd_buffer->cs->cdw <= cdw_max);
3909 }
3910
3911 static void
3912 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
3913 {
3914 struct radv_cmd_state *state = &cmd_buffer->state;
3915 const struct radv_subpass *subpass = state->subpass;
3916 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
3917
3918 radv_cmd_buffer_resolve_subpass(cmd_buffer);
3919
3920 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
3921 const uint32_t a = subpass->attachments[i].attachment;
3922 if (a == VK_ATTACHMENT_UNUSED)
3923 continue;
3924
3925 if (state->pass->attachments[a].last_subpass_idx != subpass_id)
3926 continue;
3927
3928 VkImageLayout layout = state->pass->attachments[a].final_layout;
3929 struct radv_subpass_attachment att = { a, layout };
3930 radv_handle_subpass_image_transition(cmd_buffer, att, false);
3931 }
3932 }
3933
3934 void radv_CmdBeginRenderPass(
3935 VkCommandBuffer commandBuffer,
3936 const VkRenderPassBeginInfo* pRenderPassBegin,
3937 VkSubpassContents contents)
3938 {
3939 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3940 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
3941 RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
3942 VkResult result;
3943
3944 cmd_buffer->state.framebuffer = framebuffer;
3945 cmd_buffer->state.pass = pass;
3946 cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
3947
3948 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
3949 if (result != VK_SUCCESS)
3950 return;
3951
3952 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
3953 if (result != VK_SUCCESS)
3954 return;
3955
3956 radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
3957 }
3958
3959 void radv_CmdBeginRenderPass2KHR(
3960 VkCommandBuffer commandBuffer,
3961 const VkRenderPassBeginInfo* pRenderPassBeginInfo,
3962 const VkSubpassBeginInfoKHR* pSubpassBeginInfo)
3963 {
3964 radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
3965 pSubpassBeginInfo->contents);
3966 }
3967
3968 void radv_CmdNextSubpass(
3969 VkCommandBuffer commandBuffer,
3970 VkSubpassContents contents)
3971 {
3972 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3973
3974 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
3975 radv_cmd_buffer_end_subpass(cmd_buffer);
3976 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
3977 }
3978
3979 void radv_CmdNextSubpass2KHR(
3980 VkCommandBuffer commandBuffer,
3981 const VkSubpassBeginInfoKHR* pSubpassBeginInfo,
3982 const VkSubpassEndInfoKHR* pSubpassEndInfo)
3983 {
3984 radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
3985 }
3986
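/* Write the current view index into the AC_UD_VIEW_INDEX user SGPR of every
 * active shader stage (the GS copy shader gets it through the hardware VS
 * user-data registers), so that multiview draws can be replayed once per bit
 * of the subpass view mask.
 */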
3987 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
3988 {
3989 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3990 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
3991 if (!radv_get_shader(pipeline, stage))
3992 continue;
3993
3994 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
3995 if (loc->sgpr_idx == -1)
3996 continue;
3997 uint32_t base_reg = pipeline->user_data_0[stage];
3998 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
3999
4000 }
4001 if (pipeline->gs_copy_shader) {
4002 struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
4003 if (loc->sgpr_idx != -1) {
4004 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
4005 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
4006 }
4007 }
4008 }
4009
4010 static void
4011 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4012 uint32_t vertex_count,
4013 bool use_opaque)
4014 {
4015 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
4016 radeon_emit(cmd_buffer->cs, vertex_count);
4017 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
4018 S_0287F0_USE_OPAQUE(use_opaque));
4019 }
4020
4021 static void
4022 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
4023 uint64_t index_va,
4024 uint32_t index_count)
4025 {
4026 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
4027 radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
4028 radeon_emit(cmd_buffer->cs, index_va);
4029 radeon_emit(cmd_buffer->cs, index_va >> 32);
4030 radeon_emit(cmd_buffer->cs, index_count);
4031 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
4032 }
4033
4034 static void
4035 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4036 bool indexed,
4037 uint32_t draw_count,
4038 uint64_t count_va,
4039 uint32_t stride)
4040 {
4041 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4042 unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
4043 : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
4044 bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id;
4045 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
4046 bool predicating = cmd_buffer->state.predicating;
4047 assert(base_reg);
4048
4049 /* just reset draw state for vertex data */
4050 cmd_buffer->state.last_first_instance = -1;
4051 cmd_buffer->state.last_num_instances = -1;
4052 cmd_buffer->state.last_vertex_offset = -1;
4053
4054 if (draw_count == 1 && !count_va && !draw_id_enable) {
4055 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
4056 PKT3_DRAW_INDIRECT, 3, predicating));
4057 radeon_emit(cs, 0);
4058 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4059 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4060 radeon_emit(cs, di_src_sel);
4061 } else {
4062 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
4063 PKT3_DRAW_INDIRECT_MULTI,
4064 8, predicating));
4065 radeon_emit(cs, 0);
4066 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4067 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4068 radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
4069 S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
4070 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
4071 radeon_emit(cs, draw_count); /* count */
4072 radeon_emit(cs, count_va); /* count_addr */
4073 radeon_emit(cs, count_va >> 32);
4074 radeon_emit(cs, stride); /* stride */
4075 radeon_emit(cs, di_src_sel);
4076 }
4077 }
4078
4079 static void
4080 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
4081 const struct radv_draw_info *info)
4082 {
4083 struct radv_cmd_state *state = &cmd_buffer->state;
4084 struct radeon_winsys *ws = cmd_buffer->device->ws;
4085 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4086
4087 if (info->indirect) {
4088 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4089 uint64_t count_va = 0;
4090
4091 va += info->indirect->offset + info->indirect_offset;
4092
4093 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4094
4095 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
4096 radeon_emit(cs, 1);
4097 radeon_emit(cs, va);
4098 radeon_emit(cs, va >> 32);
4099
4100 if (info->count_buffer) {
4101 count_va = radv_buffer_get_va(info->count_buffer->bo);
4102 count_va += info->count_buffer->offset +
4103 info->count_buffer_offset;
4104
4105 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
4106 }
4107
4108 if (!state->subpass->view_mask) {
4109 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4110 info->indexed,
4111 info->count,
4112 count_va,
4113 info->stride);
4114 } else {
4115 unsigned i;
4116 for_each_bit(i, state->subpass->view_mask) {
4117 radv_emit_view_index(cmd_buffer, i);
4118
4119 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4120 info->indexed,
4121 info->count,
4122 count_va,
4123 info->stride);
4124 }
4125 }
4126 } else {
4127 assert(state->pipeline->graphics.vtx_base_sgpr);
4128
4129 if (info->vertex_offset != state->last_vertex_offset ||
4130 info->first_instance != state->last_first_instance) {
4131 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
4132 state->pipeline->graphics.vtx_emit_num);
4133
4134 radeon_emit(cs, info->vertex_offset);
4135 radeon_emit(cs, info->first_instance);
4136 if (state->pipeline->graphics.vtx_emit_num == 3)
4137 radeon_emit(cs, 0);
4138 state->last_first_instance = info->first_instance;
4139 state->last_vertex_offset = info->vertex_offset;
4140 }
4141
4142 if (state->last_num_instances != info->instance_count) {
4143 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
4144 radeon_emit(cs, info->instance_count);
4145 state->last_num_instances = info->instance_count;
4146 }
4147
4148 if (info->indexed) {
4149 int index_size = state->index_type ? 4 : 2;
4150 uint64_t index_va;
4151
4152 index_va = state->index_va;
4153 index_va += info->first_index * index_size;
4154
4155 if (!state->subpass->view_mask) {
4156 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4157 index_va,
4158 info->count);
4159 } else {
4160 unsigned i;
4161 for_each_bit(i, state->subpass->view_mask) {
4162 radv_emit_view_index(cmd_buffer, i);
4163
4164 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4165 index_va,
4166 info->count);
4167 }
4168 }
4169 } else {
4170 if (!state->subpass->view_mask) {
4171 radv_cs_emit_draw_packet(cmd_buffer,
4172 info->count,
4173 !!info->strmout_buffer);
4174 } else {
4175 unsigned i;
4176 for_each_bit(i, state->subpass->view_mask) {
4177 radv_emit_view_index(cmd_buffer, i);
4178
4179 radv_cs_emit_draw_packet(cmd_buffer,
4180 info->count,
4181 !!info->strmout_buffer);
4182 }
4183 }
4184 }
4185 }
4186 }
4187
4188 /*
4189  * Vega and Raven have a bug which triggers if there are multiple context
4190 * register contexts active at the same time with different scissor values.
4191 *
4192 * There are two possible workarounds:
4193 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
4194 * there is only ever 1 active set of scissor values at the same time.
4195 *
4196 * 2) Whenever the hardware switches contexts we have to set the scissor
4197 * registers again even if it is a noop. That way the new context gets
4198 * the correct scissor values.
4199 *
4200 * This implements option 2. radv_need_late_scissor_emission needs to
4201 * return true on affected HW if radv_emit_all_graphics_states sets
4202 * any context registers.
4203 */
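/* For contrast, option 1 would amount to something like the following in
 * radv_CmdSetScissor() (illustrative sketch only, not what this driver does):
 *
 *    if (cmd_buffer->device->physical_device->has_scissor_bug)
 *            cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
 *
 * i.e. a wait for PS idle on every scissor change instead of the draw-time
 * re-emission implemented below.
 */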
4204 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
4205 const struct radv_draw_info *info)
4206 {
4207 struct radv_cmd_state *state = &cmd_buffer->state;
4208
4209 if (!cmd_buffer->device->physical_device->has_scissor_bug)
4210 return false;
4211
4212 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
4213 return true;
4214
4215 uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
4216
4217 /* Index, vertex and streamout buffers don't change context regs, and
4218 * pipeline is already handled.
4219 */
4220 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
4221 RADV_CMD_DIRTY_VERTEX_BUFFER |
4222 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
4223 RADV_CMD_DIRTY_PIPELINE);
4224
4225 if (cmd_buffer->state.dirty & used_states)
4226 return true;
4227
4228 if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
4229 (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
4230 return true;
4231
4232 return false;
4233 }
4234
4235 static void
4236 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
4237 const struct radv_draw_info *info)
4238 {
4239 bool late_scissor_emission;
4240
4241 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
4242 cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
4243 radv_emit_rbplus_state(cmd_buffer);
4244
4245 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
4246 radv_emit_graphics_pipeline(cmd_buffer);
4247
4248 /* This should be before the cmd_buffer->state.dirty is cleared
4249 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
4250 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
4251 late_scissor_emission =
4252 radv_need_late_scissor_emission(cmd_buffer, info);
4253
4254 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
4255 radv_emit_framebuffer_state(cmd_buffer);
4256
4257 if (info->indexed) {
4258 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
4259 radv_emit_index_buffer(cmd_buffer);
4260 } else {
4261 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
4262 * so the state must be re-emitted before the next indexed
4263 * draw.
4264 */
4265 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
4266 cmd_buffer->state.last_index_type = -1;
4267 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
4268 }
4269 }
4270
4271 radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
4272
4273 radv_emit_draw_registers(cmd_buffer, info);
4274
4275 if (late_scissor_emission)
4276 radv_emit_scissor(cmd_buffer);
4277 }
4278
4279 static void
4280 radv_draw(struct radv_cmd_buffer *cmd_buffer,
4281 const struct radv_draw_info *info)
4282 {
4283 struct radeon_info *rad_info =
4284 &cmd_buffer->device->physical_device->rad_info;
4285 bool has_prefetch =
4286 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
4287 bool pipeline_is_dirty =
4288 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
4289 cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
4290
4291 MAYBE_UNUSED unsigned cdw_max =
4292 radeon_check_space(cmd_buffer->device->ws,
4293 cmd_buffer->cs, 4096);
4294
4295 if (likely(!info->indirect)) {
4296 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
4297 * no workaround for indirect draws, but we can at least skip
4298 * direct draws.
4299 */
4300 if (unlikely(!info->instance_count))
4301 return;
4302
4303 /* Handle count == 0. */
4304 if (unlikely(!info->count && !info->strmout_buffer))
4305 return;
4306 }
4307
4308 /* Use optimal packet order based on whether we need to sync the
4309 * pipeline.
4310 */
4311 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4312 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4313 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
4314 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
4315 /* If we have to wait for idle, set all states first, so that
4316 * all SET packets are processed in parallel with previous draw
4317 * calls. Then upload descriptors, set shader pointers, and
4318 * draw, and prefetch at the end. This ensures that the time
4319 * the CUs are idle is very short. (there are only SET_SH
4320 * packets between the wait and the draw)
4321 */
4322 radv_emit_all_graphics_states(cmd_buffer, info);
4323 si_emit_cache_flush(cmd_buffer);
4324 /* <-- CUs are idle here --> */
4325
4326 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4327
4328 radv_emit_draw_packets(cmd_buffer, info);
4329 /* <-- CUs are busy here --> */
4330
4331 /* Start prefetches after the draw has been started. Both will
4332 * run in parallel, but starting the draw first is more
4333 * important.
4334 */
4335 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4336 radv_emit_prefetch_L2(cmd_buffer,
4337 cmd_buffer->state.pipeline, false);
4338 }
4339 } else {
4340 /* If we don't wait for idle, start prefetches first, then set
4341 * states, and draw at the end.
4342 */
4343 si_emit_cache_flush(cmd_buffer);
4344
4345 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4346 /* Only prefetch the vertex shader and VBO descriptors
4347 * in order to start the draw as soon as possible.
4348 */
4349 radv_emit_prefetch_L2(cmd_buffer,
4350 cmd_buffer->state.pipeline, true);
4351 }
4352
4353 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4354
4355 radv_emit_all_graphics_states(cmd_buffer, info);
4356 radv_emit_draw_packets(cmd_buffer, info);
4357
4358 /* Prefetch the remaining shaders after the draw has been
4359 * started.
4360 */
4361 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4362 radv_emit_prefetch_L2(cmd_buffer,
4363 cmd_buffer->state.pipeline, false);
4364 }
4365 }
4366
4367 /* Workaround for a VGT hang when streamout is enabled.
4368 * It must be done after drawing.
4369 */
4370 if (cmd_buffer->state.streamout.streamout_enabled &&
4371 (rad_info->family == CHIP_HAWAII ||
4372 rad_info->family == CHIP_TONGA ||
4373 rad_info->family == CHIP_FIJI)) {
4374 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
4375 }
4376
4377 assert(cmd_buffer->cs->cdw <= cdw_max);
4378 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
4379 }
4380
4381 void radv_CmdDraw(
4382 VkCommandBuffer commandBuffer,
4383 uint32_t vertexCount,
4384 uint32_t instanceCount,
4385 uint32_t firstVertex,
4386 uint32_t firstInstance)
4387 {
4388 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4389 struct radv_draw_info info = {};
4390
4391 info.count = vertexCount;
4392 info.instance_count = instanceCount;
4393 info.first_instance = firstInstance;
4394 info.vertex_offset = firstVertex;
4395
4396 radv_draw(cmd_buffer, &info);
4397 }
4398
4399 void radv_CmdDrawIndexed(
4400 VkCommandBuffer commandBuffer,
4401 uint32_t indexCount,
4402 uint32_t instanceCount,
4403 uint32_t firstIndex,
4404 int32_t vertexOffset,
4405 uint32_t firstInstance)
4406 {
4407 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4408 struct radv_draw_info info = {};
4409
4410 info.indexed = true;
4411 info.count = indexCount;
4412 info.instance_count = instanceCount;
4413 info.first_index = firstIndex;
4414 info.vertex_offset = vertexOffset;
4415 info.first_instance = firstInstance;
4416
4417 radv_draw(cmd_buffer, &info);
4418 }
4419
4420 void radv_CmdDrawIndirect(
4421 VkCommandBuffer commandBuffer,
4422 VkBuffer _buffer,
4423 VkDeviceSize offset,
4424 uint32_t drawCount,
4425 uint32_t stride)
4426 {
4427 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4428 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4429 struct radv_draw_info info = {};
4430
4431 info.count = drawCount;
4432 info.indirect = buffer;
4433 info.indirect_offset = offset;
4434 info.stride = stride;
4435
4436 radv_draw(cmd_buffer, &info);
4437 }
4438
4439 void radv_CmdDrawIndexedIndirect(
4440 VkCommandBuffer commandBuffer,
4441 VkBuffer _buffer,
4442 VkDeviceSize offset,
4443 uint32_t drawCount,
4444 uint32_t stride)
4445 {
4446 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4447 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4448 struct radv_draw_info info = {};
4449
4450 info.indexed = true;
4451 info.count = drawCount;
4452 info.indirect = buffer;
4453 info.indirect_offset = offset;
4454 info.stride = stride;
4455
4456 radv_draw(cmd_buffer, &info);
4457 }
4458
4459 void radv_CmdDrawIndirectCountKHR(
4460 VkCommandBuffer commandBuffer,
4461 VkBuffer _buffer,
4462 VkDeviceSize offset,
4463 VkBuffer _countBuffer,
4464 VkDeviceSize countBufferOffset,
4465 uint32_t maxDrawCount,
4466 uint32_t stride)
4467 {
4468 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4469 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4470 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4471 struct radv_draw_info info = {};
4472
4473 info.count = maxDrawCount;
4474 info.indirect = buffer;
4475 info.indirect_offset = offset;
4476 info.count_buffer = count_buffer;
4477 info.count_buffer_offset = countBufferOffset;
4478 info.stride = stride;
4479
4480 radv_draw(cmd_buffer, &info);
4481 }
4482
4483 void radv_CmdDrawIndexedIndirectCountKHR(
4484 VkCommandBuffer commandBuffer,
4485 VkBuffer _buffer,
4486 VkDeviceSize offset,
4487 VkBuffer _countBuffer,
4488 VkDeviceSize countBufferOffset,
4489 uint32_t maxDrawCount,
4490 uint32_t stride)
4491 {
4492 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4493 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4494 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4495 struct radv_draw_info info = {};
4496
4497 info.indexed = true;
4498 info.count = maxDrawCount;
4499 info.indirect = buffer;
4500 info.indirect_offset = offset;
4501 info.count_buffer = count_buffer;
4502 info.count_buffer_offset = countBufferOffset;
4503 info.stride = stride;
4504
4505 radv_draw(cmd_buffer, &info);
4506 }
4507
4508 struct radv_dispatch_info {
4509 /**
4510 * Determine the layout of the grid (in block units) to be used.
4511 */
4512 uint32_t blocks[3];
4513
4514 /**
4515 * A starting offset for the grid. If unaligned is set, the offset
4516 * must still be aligned.
4517 */
4518 uint32_t offsets[3];
4519 /**
4520 * Whether it's an unaligned compute dispatch.
4521 */
4522 bool unaligned;
4523
4524 /**
4525 * Indirect compute parameters resource.
4526 */
4527 struct radv_buffer *indirect;
4528 uint64_t indirect_offset;
4529 };
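/* For reference: radv_CmdDispatchBase() below fills blocks[] with workgroup
 * counts and offsets[] with the base workgroup, while radv_unaligned_dispatch()
 * passes sizes in threads with unaligned set and lets the packet emission
 * derive the rounded group count and the partial thread group.
 */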
4530
4531 static void
4532 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
4533 const struct radv_dispatch_info *info)
4534 {
4535 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4536 struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
4537 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
4538 struct radeon_winsys *ws = cmd_buffer->device->ws;
4539 bool predicating = cmd_buffer->state.predicating;
4540 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4541 struct radv_userdata_info *loc;
4542
4543 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
4544 AC_UD_CS_GRID_SIZE);
4545
4546 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);
4547
4548 if (info->indirect) {
4549 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4550
4551 va += info->indirect->offset + info->indirect_offset;
4552
4553 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4554
4555 if (loc->sgpr_idx != -1) {
4556 for (unsigned i = 0; i < 3; ++i) {
4557 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4558 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
4559 COPY_DATA_DST_SEL(COPY_DATA_REG));
4560 radeon_emit(cs, (va + 4 * i));
4561 radeon_emit(cs, (va + 4 * i) >> 32);
4562 radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
4563 + loc->sgpr_idx * 4) >> 2) + i);
4564 radeon_emit(cs, 0);
4565 }
4566 }
4567
4568 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
4569 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
4570 PKT3_SHADER_TYPE_S(1));
4571 radeon_emit(cs, va);
4572 radeon_emit(cs, va >> 32);
4573 radeon_emit(cs, dispatch_initiator);
4574 } else {
4575 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
4576 PKT3_SHADER_TYPE_S(1));
4577 radeon_emit(cs, 1);
4578 radeon_emit(cs, va);
4579 radeon_emit(cs, va >> 32);
4580
4581 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
4582 PKT3_SHADER_TYPE_S(1));
4583 radeon_emit(cs, 0);
4584 radeon_emit(cs, dispatch_initiator);
4585 }
4586 } else {
4587 unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
4588 unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };
4589
4590 if (info->unaligned) {
4591 unsigned *cs_block_size = compute_shader->info.cs.block_size;
4592 unsigned remainder[3];
4593
4594 /* If aligned, these should be an entire block size,
4595 * not 0.
4596 */
4597 remainder[0] = blocks[0] + cs_block_size[0] -
4598 align_u32_npot(blocks[0], cs_block_size[0]);
4599 remainder[1] = blocks[1] + cs_block_size[1] -
4600 align_u32_npot(blocks[1], cs_block_size[1]);
4601 remainder[2] = blocks[2] + cs_block_size[2] -
4602 align_u32_npot(blocks[2], cs_block_size[2]);
4603
4604 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
4605 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
4606 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
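			/* Worked example (illustrative): for 10 threads in X with a
			 * block size of 8, align_u32_npot(10, 8) = 16, so
			 * remainder[0] = 10 + 8 - 16 = 2 and blocks[0] is rounded up
			 * to 2 thread groups, the last of which only runs 2 threads
			 * in X via NUM_THREAD_PARTIAL and PARTIAL_TG_EN.
			 */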
4607
4608 			for (unsigned i = 0; i < 3; ++i) {
4609 assert(offsets[i] % cs_block_size[i] == 0);
4610 offsets[i] /= cs_block_size[i];
4611 }
4612
4613 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
4614 radeon_emit(cs,
4615 S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
4616 S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
4617 radeon_emit(cs,
4618 S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
4619 S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
4620 radeon_emit(cs,
4621 S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
4622 S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
4623
4624 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
4625 }
4626
4627 if (loc->sgpr_idx != -1) {
4628 assert(loc->num_sgprs == 3);
4629
4630 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
4631 loc->sgpr_idx * 4, 3);
4632 radeon_emit(cs, blocks[0]);
4633 radeon_emit(cs, blocks[1]);
4634 radeon_emit(cs, blocks[2]);
4635 }
4636
4637 if (offsets[0] || offsets[1] || offsets[2]) {
4638 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
4639 radeon_emit(cs, offsets[0]);
4640 radeon_emit(cs, offsets[1]);
4641 radeon_emit(cs, offsets[2]);
4642
4643 /* The blocks in the packet are not counts but end values. */
4644 for (unsigned i = 0; i < 3; ++i)
4645 blocks[i] += offsets[i];
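			/* e.g. 6 groups dispatched with a start offset of 4 programs
			 * COMPUTE_START_X = 4 above and an end value of 10 here
			 * (illustrative).
			 */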
4646 } else {
4647 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
4648 }
4649
4650 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
4651 PKT3_SHADER_TYPE_S(1));
4652 radeon_emit(cs, blocks[0]);
4653 radeon_emit(cs, blocks[1]);
4654 radeon_emit(cs, blocks[2]);
4655 radeon_emit(cs, dispatch_initiator);
4656 }
4657
4658 assert(cmd_buffer->cs->cdw <= cdw_max);
4659 }
4660
4661 static void
4662 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
4663 {
4664 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
4665 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
4666 }
4667
4668 static void
4669 radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
4670 const struct radv_dispatch_info *info)
4671 {
4672 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4673 bool has_prefetch =
4674 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
4675 bool pipeline_is_dirty = pipeline &&
4676 pipeline != cmd_buffer->state.emitted_compute_pipeline;
4677
4678 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4679 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4680 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
4681 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
4682 /* If we have to wait for idle, set all states first, so that
4683 * all SET packets are processed in parallel with previous draw
4684 * calls. Then upload descriptors, set shader pointers, and
4685 * dispatch, and prefetch at the end. This ensures that the
4686 * time the CUs are idle is very short. (there are only SET_SH
4687 * packets between the wait and the draw)
4688 */
4689 radv_emit_compute_pipeline(cmd_buffer);
4690 si_emit_cache_flush(cmd_buffer);
4691 /* <-- CUs are idle here --> */
4692
4693 radv_upload_compute_shader_descriptors(cmd_buffer);
4694
4695 radv_emit_dispatch_packets(cmd_buffer, info);
4696 /* <-- CUs are busy here --> */
4697
4698 /* Start prefetches after the dispatch has been started. Both
4699 * will run in parallel, but starting the dispatch first is
4700 * more important.
4701 */
4702 if (has_prefetch && pipeline_is_dirty) {
4703 radv_emit_shader_prefetch(cmd_buffer,
4704 pipeline->shaders[MESA_SHADER_COMPUTE]);
4705 }
4706 } else {
4707 /* If we don't wait for idle, start prefetches first, then set
4708 * states, and dispatch at the end.
4709 */
4710 si_emit_cache_flush(cmd_buffer);
4711
4712 if (has_prefetch && pipeline_is_dirty) {
4713 radv_emit_shader_prefetch(cmd_buffer,
4714 pipeline->shaders[MESA_SHADER_COMPUTE]);
4715 }
4716
4717 radv_upload_compute_shader_descriptors(cmd_buffer);
4718
4719 radv_emit_compute_pipeline(cmd_buffer);
4720 radv_emit_dispatch_packets(cmd_buffer, info);
4721 }
4722
4723 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
4724 }
4725
4726 void radv_CmdDispatchBase(
4727 VkCommandBuffer commandBuffer,
4728 uint32_t base_x,
4729 uint32_t base_y,
4730 uint32_t base_z,
4731 uint32_t x,
4732 uint32_t y,
4733 uint32_t z)
4734 {
4735 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4736 struct radv_dispatch_info info = {};
4737
4738 info.blocks[0] = x;
4739 info.blocks[1] = y;
4740 info.blocks[2] = z;
4741
4742 info.offsets[0] = base_x;
4743 info.offsets[1] = base_y;
4744 info.offsets[2] = base_z;
4745 radv_dispatch(cmd_buffer, &info);
4746 }
4747
4748 void radv_CmdDispatch(
4749 VkCommandBuffer commandBuffer,
4750 uint32_t x,
4751 uint32_t y,
4752 uint32_t z)
4753 {
4754 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
4755 }
4756
4757 void radv_CmdDispatchIndirect(
4758 VkCommandBuffer commandBuffer,
4759 VkBuffer _buffer,
4760 VkDeviceSize offset)
4761 {
4762 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4763 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4764 struct radv_dispatch_info info = {};
4765
4766 info.indirect = buffer;
4767 info.indirect_offset = offset;
4768
4769 radv_dispatch(cmd_buffer, &info);
4770 }
4771
4772 void radv_unaligned_dispatch(
4773 struct radv_cmd_buffer *cmd_buffer,
4774 uint32_t x,
4775 uint32_t y,
4776 uint32_t z)
4777 {
4778 struct radv_dispatch_info info = {};
4779
4780 info.blocks[0] = x;
4781 info.blocks[1] = y;
4782 info.blocks[2] = z;
4783 info.unaligned = 1;
4784
4785 radv_dispatch(cmd_buffer, &info);
4786 }
4787
4788 void radv_CmdEndRenderPass(
4789 VkCommandBuffer commandBuffer)
4790 {
4791 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4792
4793 radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
4794
4795 radv_cmd_buffer_end_subpass(cmd_buffer);
4796
4797 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
4798 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
4799
4800 cmd_buffer->state.pass = NULL;
4801 cmd_buffer->state.subpass = NULL;
4802 cmd_buffer->state.attachments = NULL;
4803 cmd_buffer->state.framebuffer = NULL;
4804 cmd_buffer->state.subpass_sample_locs = NULL;
4805 }
4806
4807 void radv_CmdEndRenderPass2KHR(
4808 VkCommandBuffer commandBuffer,
4809 const VkSubpassEndInfoKHR* pSubpassEndInfo)
4810 {
4811 radv_CmdEndRenderPass(commandBuffer);
4812 }
4813
4814 /*
4815 * For HTILE we have the following interesting clear words:
4816 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
4817 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
4818 * 0xfffffff0: Clear depth to 1.0
4819 * 0x00000000: Clear depth to 0.0
4820 */
4821 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
4822 struct radv_image *image,
4823 const VkImageSubresourceRange *range,
4824 uint32_t clear_word)
4825 {
4826 assert(range->baseMipLevel == 0);
4827 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_MIP_LEVELS);
4828 VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
4829 struct radv_cmd_state *state = &cmd_buffer->state;
4830 VkClearDepthStencilValue value = {};
4831
4832 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4833 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4834
4835 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, clear_word);
4836
4837 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4838
4839 if (vk_format_is_stencil(image->vk_format))
4840 aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4841
4842 radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects);
4843
4844 if (radv_image_is_tc_compat_htile(image)) {
4845 		/* Initialize the TC-compat metadata value to 0 because by
4846 		 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
4847 		 * have to conditionally update its value when performing
4848 * a fast depth clear.
4849 */
4850 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, 0);
4851 }
4852 }
4853
4854 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
4855 struct radv_image *image,
4856 VkImageLayout src_layout,
4857 VkImageLayout dst_layout,
4858 unsigned src_queue_mask,
4859 unsigned dst_queue_mask,
4860 const VkImageSubresourceRange *range,
4861 struct radv_sample_locations_state *sample_locs)
4862 {
4863 if (!radv_image_has_htile(image))
4864 return;
4865
4866 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
4867 uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
4868
4869 if (radv_layout_is_htile_compressed(image, dst_layout,
4870 dst_queue_mask)) {
4871 clear_value = 0;
4872 }
4873
4874 radv_initialize_htile(cmd_buffer, image, range, clear_value);
4875 } else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
4876 radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
4877 uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
4878 radv_initialize_htile(cmd_buffer, image, range, clear_value);
4879 } else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
4880 !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
4881 VkImageSubresourceRange local_range = *range;
4882 local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4883 local_range.baseMipLevel = 0;
4884 local_range.levelCount = 1;
4885
4886 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4887 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4888
4889 radv_decompress_depth_image_inplace(cmd_buffer, image,
4890 &local_range, sample_locs);
4891
4892 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4893 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4894 }
4895 }
4896
4897 static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
4898 struct radv_image *image,
4899 const VkImageSubresourceRange *range,
4900 uint32_t value)
4901 {
4902 struct radv_cmd_state *state = &cmd_buffer->state;
4903
4904 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4905 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4906
4907 state->flush_bits |= radv_clear_cmask(cmd_buffer, image, range, value);
4908
4909 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4910 }
4911
4912 void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
4913 struct radv_image *image,
4914 const VkImageSubresourceRange *range)
4915 {
4916 struct radv_cmd_state *state = &cmd_buffer->state;
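/* Default (expanded) FMASK patterns, indexed by log2(samples).
* E.g. 0x76543210 for 8 samples maps each sample to its own
* fragment; the narrower patterns below appear to encode the same
* identity mapping with smaller per-sample fields.
*/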
4917 static const uint32_t fmask_clear_values[4] = {
4918 0x00000000,
4919 0x02020202,
4920 0xE4E4E4E4,
4921 0x76543210
4922 };
4923 uint32_t log2_samples = util_logbase2(image->info.samples);
4924 uint32_t value = fmask_clear_values[log2_samples];
4925
4926 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4927 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4928
4929 state->flush_bits |= radv_clear_fmask(cmd_buffer, image, range, value);
4930
4931 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4932 }
4933
4934 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
4935 struct radv_image *image,
4936 const VkImageSubresourceRange *range, uint32_t value)
4937 {
4938 struct radv_cmd_state *state = &cmd_buffer->state;
4939 uint32_t level_count = radv_get_levelCount(image, range);
4940 unsigned size = 0;
4941
4942 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4943 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4944
4945 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
4946 /* Mipmap levels aren't implemented. */
4947 assert(level_count == 1);
4948 state->flush_bits |= radv_clear_dcc(cmd_buffer, image,
4949 range, value);
4950 } else {
4951 /* Initialize the mipmap levels with DCC first. */
4952 for (unsigned l = 0; l < level_count; l++) {
4953 uint32_t level = range->baseMipLevel + l;
4954 struct legacy_surf_level *surf_level =
4955 &image->planes[0].surface.u.legacy.level[level];
4956
4957 if (!surf_level->dcc_fast_clear_size)
4958 break;
4959
4960 state->flush_bits |=
4961 radv_dcc_clear_level(cmd_buffer, image,
4962 level, value);
4963 }
4964
4965 /* When DCC is enabled with mipmaps, some levels might not
4966 * support fast clears and we have to initialize them as "fully
4967 * expanded".
4968 */
4969 /* Compute the size of all fast clearable DCC levels. */
4970 for (unsigned i = 0; i < image->planes[0].surface.num_dcc_levels; i++) {
4971 struct legacy_surf_level *surf_level =
4972 &image->planes[0].surface.u.legacy.level[i];
4973
4974 if (!surf_level->dcc_fast_clear_size)
4975 break;
4976
4977 size = surf_level->dcc_offset + surf_level->dcc_fast_clear_size;
4978 }
4979
4980 /* Initialize the mipmap levels without DCC. */
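/* 0xffffffff is the "fully expanded" (uncompressed) DCC encoding,
* the same value radv_init_color_image_metadata() uses when the
* destination layout cannot keep DCC compressed.
*/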
4981 if (size != image->planes[0].surface.dcc_size) {
4982 state->flush_bits |=
4983 radv_fill_buffer(cmd_buffer, image->bo,
4984 image->offset + image->dcc_offset + size,
4985 image->planes[0].surface.dcc_size - size,
4986 0xffffffff);
4987 }
4988 }
4989
4990 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4991 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4992 }
4993
4994 /**
4995 * Initialize DCC/FMASK/CMASK metadata for a color image.
4996 */
4997 static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
4998 struct radv_image *image,
4999 VkImageLayout src_layout,
5000 VkImageLayout dst_layout,
5001 unsigned src_queue_mask,
5002 unsigned dst_queue_mask,
5003 const VkImageSubresourceRange *range)
5004 {
5005 if (radv_image_has_cmask(image)) {
5006 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5007
5008 /* TODO: clarify this. */
5009 if (radv_image_has_fmask(image)) {
5010 value = 0xccccccccu;
5011 }
5012
5013 radv_initialise_cmask(cmd_buffer, image, range, value);
5014 }
5015
5016 if (radv_image_has_fmask(image)) {
5017 radv_initialize_fmask(cmd_buffer, image, range);
5018 }
5019
5020 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5021 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5022 bool need_decompress_pass = false;
5023
5024 if (radv_layout_dcc_compressed(image, dst_layout,
5025 dst_queue_mask)) {
5026 value = 0x20202020u;
5027 need_decompress_pass = true;
5028 }
5029
5030 radv_initialize_dcc(cmd_buffer, image, range, value);
5031
5032 radv_update_fce_metadata(cmd_buffer, image, range,
5033 need_decompress_pass);
5034 }
5035
5036 if (radv_image_has_cmask(image) ||
5037 radv_dcc_enabled(image, range->baseMipLevel)) {
5038 uint32_t color_values[2] = {};
5039 radv_set_color_clear_metadata(cmd_buffer, image, range,
5040 color_values);
5041 }
5042 }
5043
5044 /**
5045 * Handle color image transitions for DCC/FMASK/CMASK.
5046 */
5047 static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
5048 struct radv_image *image,
5049 VkImageLayout src_layout,
5050 VkImageLayout dst_layout,
5051 unsigned src_queue_mask,
5052 unsigned dst_queue_mask,
5053 const VkImageSubresourceRange *range)
5054 {
5055 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
5056 radv_init_color_image_metadata(cmd_buffer, image,
5057 src_layout, dst_layout,
5058 src_queue_mask, dst_queue_mask,
5059 range);
5060 return;
5061 }
5062
5063 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5064 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
5065 radv_initialize_dcc(cmd_buffer, image, range, 0xffffffffu);
5066 } else if (radv_layout_dcc_compressed(image, src_layout, src_queue_mask) &&
5067 !radv_layout_dcc_compressed(image, dst_layout, dst_queue_mask)) {
5068 radv_decompress_dcc(cmd_buffer, image, range);
5069 } else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
5070 !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
5071 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5072 }
5073 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
5074 bool fce_eliminate = false, fmask_expand = false;
5075
5076 if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
5077 !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
5078 fce_eliminate = true;
5079 }
5080
5081 if (radv_image_has_fmask(image)) {
5082 if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
5083 dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
5084 /* An FMASK decompress is required before doing
5085 * an MSAA decompress using FMASK.
5086 */
5087 fmask_expand = true;
5088 }
5089 }
5090
5091 if (fce_eliminate || fmask_expand)
5092 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5093
5094 if (fmask_expand)
5095 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
5096 }
5097 }
5098
5099 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
5100 struct radv_image *image,
5101 VkImageLayout src_layout,
5102 VkImageLayout dst_layout,
5103 uint32_t src_family,
5104 uint32_t dst_family,
5105 const VkImageSubresourceRange *range,
5106 struct radv_sample_locations_state *sample_locs)
5107 {
5108 if (image->exclusive && src_family != dst_family) {
5109 /* This is an acquire or a release operation and there will be
5110 * a corresponding release/acquire. Do the transition in the
5111 * most flexible queue. */
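/* Concretely, the checks below skip the transition when the source
* is the external queue, when recording on the transfer queue, and
* on the compute queue whenever the GENERAL queue is involved (that
* side performs the transition instead).
*/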
5112
5113 assert(src_family == cmd_buffer->queue_family_index ||
5114 dst_family == cmd_buffer->queue_family_index);
5115
5116 if (src_family == VK_QUEUE_FAMILY_EXTERNAL)
5117 return;
5118
5119 if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
5120 return;
5121
5122 if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
5123 (src_family == RADV_QUEUE_GENERAL ||
5124 dst_family == RADV_QUEUE_GENERAL))
5125 return;
5126 }
5127
5128 if (src_layout == dst_layout)
5129 return;
5130
5131 unsigned src_queue_mask =
5132 radv_image_queue_family_mask(image, src_family,
5133 cmd_buffer->queue_family_index);
5134 unsigned dst_queue_mask =
5135 radv_image_queue_family_mask(image, dst_family,
5136 cmd_buffer->queue_family_index);
5137
5138 if (vk_format_is_depth(image->vk_format)) {
5139 radv_handle_depth_image_transition(cmd_buffer, image,
5140 src_layout, dst_layout,
5141 src_queue_mask, dst_queue_mask,
5142 range, sample_locs);
5143 } else {
5144 radv_handle_color_image_transition(cmd_buffer, image,
5145 src_layout, dst_layout,
5146 src_queue_mask, dst_queue_mask,
5147 range);
5148 }
5149 }
5150
5151 struct radv_barrier_info {
5152 uint32_t eventCount;
5153 const VkEvent *pEvents;
5154 VkPipelineStageFlags srcStageMask;
5155 VkPipelineStageFlags dstStageMask;
5156 };
5157
5158 static void
5159 radv_barrier(struct radv_cmd_buffer *cmd_buffer,
5160 uint32_t memoryBarrierCount,
5161 const VkMemoryBarrier *pMemoryBarriers,
5162 uint32_t bufferMemoryBarrierCount,
5163 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
5164 uint32_t imageMemoryBarrierCount,
5165 const VkImageMemoryBarrier *pImageMemoryBarriers,
5166 const struct radv_barrier_info *info)
5167 {
5168 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5169 enum radv_cmd_flush_bits src_flush_bits = 0;
5170 enum radv_cmd_flush_bits dst_flush_bits = 0;
5171
5172 for (unsigned i = 0; i < info->eventCount; ++i) {
5173 RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
5174 uint64_t va = radv_buffer_get_va(event->bo);
5175
5176 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5177
5178 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
5179
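/* Each event BO holds 0 (reset) or 1 (set), written by
* write_event(); block the CP here until the event is set.
*/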
5180 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
5181 assert(cmd_buffer->cs->cdw <= cdw_max);
5182 }
5183
5184 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
5185 src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
5186 NULL);
5187 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
5188 NULL);
5189 }
5190
5191 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
5192 src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
5193 NULL);
5194 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
5195 NULL);
5196 }
5197
5198 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5199 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5200
5201 src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
5202 image);
5203 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
5204 image);
5205 }
5206
5207 /* The Vulkan spec 1.1.98 says:
5208 *
5209 * "An execution dependency with only
5210 * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
5211 * will only prevent that stage from executing in subsequently
5212 * submitted commands. As this stage does not perform any actual
5213 * execution, this is not observable - in effect, it does not delay
5214 * processing of subsequent commands. Similarly an execution dependency
5215 * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
5216 * will effectively not wait for any prior commands to complete."
5217 */
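/* So a dstStageMask that is only BOTTOM_OF_PIPE needs no stage
* flush; everything else waits here for the source stages to finish.
*/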
5218 if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
5219 radv_stage_flush(cmd_buffer, info->srcStageMask);
5220 cmd_buffer->state.flush_bits |= src_flush_bits;
5221
5222 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5223 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5224
5225 const struct VkSampleLocationsInfoEXT *sample_locs_info =
5226 vk_find_struct_const(pImageMemoryBarriers[i].pNext,
5227 SAMPLE_LOCATIONS_INFO_EXT);
5228 struct radv_sample_locations_state sample_locations = {};
5229
5230 if (sample_locs_info) {
5231 assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
5232 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
5233 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
5234 sample_locations.count = sample_locs_info->sampleLocationsCount;
5235 typed_memcpy(&sample_locations.locations[0],
5236 sample_locs_info->pSampleLocations,
5237 sample_locs_info->sampleLocationsCount);
5238 }
5239
5240 radv_handle_image_transition(cmd_buffer, image,
5241 pImageMemoryBarriers[i].oldLayout,
5242 pImageMemoryBarriers[i].newLayout,
5243 pImageMemoryBarriers[i].srcQueueFamilyIndex,
5244 pImageMemoryBarriers[i].dstQueueFamilyIndex,
5245 &pImageMemoryBarriers[i].subresourceRange,
5246 sample_locs_info ? &sample_locations : NULL);
5247 }
5248
5249 /* Make sure CP DMA is idle because the driver might have performed a
5250 * DMA operation for copying or filling buffers/images.
5251 */
5252 if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5253 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5254 si_cp_dma_wait_for_idle(cmd_buffer);
5255
5256 cmd_buffer->state.flush_bits |= dst_flush_bits;
5257 }
5258
5259 void radv_CmdPipelineBarrier(
5260 VkCommandBuffer commandBuffer,
5261 VkPipelineStageFlags srcStageMask,
5262 VkPipelineStageFlags destStageMask,
5263 VkBool32 byRegion,
5264 uint32_t memoryBarrierCount,
5265 const VkMemoryBarrier* pMemoryBarriers,
5266 uint32_t bufferMemoryBarrierCount,
5267 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5268 uint32_t imageMemoryBarrierCount,
5269 const VkImageMemoryBarrier* pImageMemoryBarriers)
5270 {
5271 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5272 struct radv_barrier_info info;
5273
5274 info.eventCount = 0;
5275 info.pEvents = NULL;
5276 info.srcStageMask = srcStageMask;
5277 info.dstStageMask = destStageMask;
5278
5279 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5280 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5281 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5282 }
5283
5284
5285 static void write_event(struct radv_cmd_buffer *cmd_buffer,
5286 struct radv_event *event,
5287 VkPipelineStageFlags stageMask,
5288 unsigned value)
5289 {
5290 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5291 uint64_t va = radv_buffer_get_va(event->bo);
5292
5293 si_emit_cache_flush(cmd_buffer);
5294
5295 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5296
5297 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21);
5298
5299 /* Flags that only require a top-of-pipe event. */
5300 VkPipelineStageFlags top_of_pipe_flags =
5301 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
5302
5303 /* Flags that only require a post-index-fetch event. */
5304 VkPipelineStageFlags post_index_fetch_flags =
5305 top_of_pipe_flags |
5306 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
5307 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
5308
5309 /* Make sure CP DMA is idle because the driver might have performed a
5310 * DMA operation for copying or filling buffers/images.
5311 */
5312 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5313 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5314 si_cp_dma_wait_for_idle(cmd_buffer);
5315
5316 /* TODO: Emit EOS events for syncing PS/CS stages. */
5317
5318 if (!(stageMask & ~top_of_pipe_flags)) {
5319 /* Just need to sync the PFP engine. */
5320 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5321 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5322 S_370_WR_CONFIRM(1) |
5323 S_370_ENGINE_SEL(V_370_PFP));
5324 radeon_emit(cs, va);
5325 radeon_emit(cs, va >> 32);
5326 radeon_emit(cs, value);
5327 } else if (!(stageMask & ~post_index_fetch_flags)) {
5328 /* Sync ME because PFP reads index and indirect buffers. */
5329 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5330 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5331 S_370_WR_CONFIRM(1) |
5332 S_370_ENGINE_SEL(V_370_ME));
5333 radeon_emit(cs, va);
5334 radeon_emit(cs, va >> 32);
5335 radeon_emit(cs, value);
5336 } else {
5337 /* Otherwise, sync all prior GPU work using an EOP event. */
5338 si_cs_emit_write_event_eop(cs,
5339 cmd_buffer->device->physical_device->rad_info.chip_class,
5340 radv_cmd_buffer_uses_mec(cmd_buffer),
5341 V_028A90_BOTTOM_OF_PIPE_TS, 0,
5342 EOP_DATA_SEL_VALUE_32BIT, va, value,
5343 cmd_buffer->gfx9_eop_bug_va);
5344 }
5345
5346 assert(cmd_buffer->cs->cdw <= cdw_max);
5347 }
5348
5349 void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
5350 VkEvent _event,
5351 VkPipelineStageFlags stageMask)
5352 {
5353 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5354 RADV_FROM_HANDLE(radv_event, event, _event);
5355
5356 write_event(cmd_buffer, event, stageMask, 1);
5357 }
5358
5359 void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
5360 VkEvent _event,
5361 VkPipelineStageFlags stageMask)
5362 {
5363 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5364 RADV_FROM_HANDLE(radv_event, event, _event);
5365
5366 write_event(cmd_buffer, event, stageMask, 0);
5367 }
5368
5369 void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
5370 uint32_t eventCount,
5371 const VkEvent* pEvents,
5372 VkPipelineStageFlags srcStageMask,
5373 VkPipelineStageFlags dstStageMask,
5374 uint32_t memoryBarrierCount,
5375 const VkMemoryBarrier* pMemoryBarriers,
5376 uint32_t bufferMemoryBarrierCount,
5377 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5378 uint32_t imageMemoryBarrierCount,
5379 const VkImageMemoryBarrier* pImageMemoryBarriers)
5380 {
5381 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5382 struct radv_barrier_info info;
5383
5384 info.eventCount = eventCount;
5385 info.pEvents = pEvents;
5386 info.srcStageMask = 0;
info.dstStageMask = dstStageMask;
5387
5388 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5389 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5390 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5391 }
5392
5393
5394 void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer,
5395 uint32_t deviceMask)
5396 {
5397 /* No-op */
5398 }
5399
5400 /* VK_EXT_conditional_rendering */
5401 void radv_CmdBeginConditionalRenderingEXT(
5402 VkCommandBuffer commandBuffer,
5403 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5404 {
5405 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5406 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
5407 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5408 bool draw_visible = true;
5409 uint64_t pred_value = 0;
5410 uint64_t va, new_va;
5411 unsigned pred_offset;
5412
5413 va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
5414
5415 /* By default, if the 32-bit value at offset in buffer memory is zero,
5416 * then the rendering commands are discarded, otherwise they are
5417 * executed as normal. If the inverted flag is set, all commands are
5418 * discarded if the value is non-zero.
5419 */
5420 if (pConditionalRenderingBegin->flags &
5421 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
5422 draw_visible = false;
5423 }
5424
5425 si_emit_cache_flush(cmd_buffer);
5426
5427 /* From the Vulkan spec 1.1.107:
5428 *
5429 * "If the 32-bit value at offset in buffer memory is zero, then the
5430 * rendering commands are discarded, otherwise they are executed as
5431 * normal. If the value of the predicate in buffer memory changes while
5432 * conditional rendering is active, the rendering commands may be
5433 * discarded in an implementation-dependent way. Some implementations
5434 * may latch the value of the predicate upon beginning conditional
5435 * rendering while others may read it before every rendering command."
5436 *
5437 * But, the AMD hardware treats the predicate as a 64-bit value which
5438 * means we need a workaround in the driver. Luckily, we are not required
5439 * to support the case where the value changes while predication is active.
5440 *
5441 * The workaround is as follows:
5442 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
5443 * 2) copy the 32-bit predicate value to the upload BO
5444 * 3) use the newly allocated VA for predication
5445 *
5446 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
5447 * in ME (+ sync PFP) instead of PFP.
5448 */
5449 radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);
5450
5451 new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
5452
5453 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
5454 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
5455 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
5456 COPY_DATA_WR_CONFIRM);
5457 radeon_emit(cs, va);
5458 radeon_emit(cs, va >> 32);
5459 radeon_emit(cs, new_va);
5460 radeon_emit(cs, new_va >> 32);
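/* COPY_DATA above copies only the low 32 bits; the high dword of
* the 64-bit predicate keeps the zero written by the upload, so the
* hardware's 64-bit comparison matches the 32-bit Vulkan semantics.
*/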
5461
5462 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
5463 radeon_emit(cs, 0);
5464
5465 /* Enable predication for this command buffer. */
5466 si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
5467 cmd_buffer->state.predicating = true;
5468
5469 /* Store conditional rendering user info. */
5470 cmd_buffer->state.predication_type = draw_visible;
5471 cmd_buffer->state.predication_va = new_va;
5472 }
5473
5474 void radv_CmdEndConditionalRenderingEXT(
5475 VkCommandBuffer commandBuffer)
5476 {
5477 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5478
5479 /* Disable predication for this command buffer. */
5480 si_emit_set_predication_state(cmd_buffer, false, 0);
5481 cmd_buffer->state.predicating = false;
5482
5483 /* Reset conditional rendering user info. */
5484 cmd_buffer->state.predication_type = -1;
5485 cmd_buffer->state.predication_va = 0;
5486 }
5487
5488 /* VK_EXT_transform_feedback */
5489 void radv_CmdBindTransformFeedbackBuffersEXT(
5490 VkCommandBuffer commandBuffer,
5491 uint32_t firstBinding,
5492 uint32_t bindingCount,
5493 const VkBuffer* pBuffers,
5494 const VkDeviceSize* pOffsets,
5495 const VkDeviceSize* pSizes)
5496 {
5497 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5498 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5499 uint8_t enabled_mask = 0;
5500
5501 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
5502 for (uint32_t i = 0; i < bindingCount; i++) {
5503 uint32_t idx = firstBinding + i;
5504
5505 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
5506 sb[idx].offset = pOffsets[i];
5507 sb[idx].size = pSizes[i];
5508
5509 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
5510 sb[idx].buffer->bo);
5511
5512 enabled_mask |= 1 << idx;
5513 }
5514
5515 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
5516
5517 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5518 }
5519
5520 static void
5521 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
5522 {
5523 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5524 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5525
5526 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
5527 radeon_emit(cs,
5528 S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
5529 S_028B94_RAST_STREAM(0) |
5530 S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
5531 S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
5532 S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
5533 radeon_emit(cs, so->hw_enabled_mask &
5534 so->enabled_stream_buffers_mask);
5535
5536 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5537 }
5538
5539 static void
5540 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
5541 {
5542 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5543 bool old_streamout_enabled = so->streamout_enabled;
5544 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
5545
5546 so->streamout_enabled = enable;
5547
5548 so->hw_enabled_mask = so->enabled_mask |
5549 (so->enabled_mask << 4) |
5550 (so->enabled_mask << 8) |
5551 (so->enabled_mask << 12);
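/* VGT_STRMOUT_BUFFER_CONFIG has a 4-bit buffer-enable field per
* stream, so replicate the per-buffer mask into all four stream
* slots; radv_emit_streamout_enable() ANDs this with the buffers
* the current pipeline actually writes.
*/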
5552
5553 if ((old_streamout_enabled != so->streamout_enabled) ||
5554 (old_hw_enabled_mask != so->hw_enabled_mask))
5555 radv_emit_streamout_enable(cmd_buffer);
5556 }
5557
5558 static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
5559 {
5560 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5561 unsigned reg_strmout_cntl;
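/* Flush pending streamout offset updates: clear CP_STRMOUT_CNTL,
* kick an SO_VGTSTREAMOUT_FLUSH event and wait for the CP to set
* the OFFSET_UPDATE_DONE bit.
*/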
5562
5563 /* The register is at different places on different ASICs. */
5564 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
5565 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
5566 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
5567 } else {
5568 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
5569 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
5570 }
5571
5572 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
5573 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
5574
5575 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
5576 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
5577 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
5578 radeon_emit(cs, 0);
5579 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
5580 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
5581 radeon_emit(cs, 4); /* poll interval */
5582 }
5583
5584 void radv_CmdBeginTransformFeedbackEXT(
5585 VkCommandBuffer commandBuffer,
5586 uint32_t firstCounterBuffer,
5587 uint32_t counterBufferCount,
5588 const VkBuffer* pCounterBuffers,
5589 const VkDeviceSize* pCounterBufferOffsets)
5590 {
5591 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5592 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5593 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5594 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5595 uint32_t i;
5596
5597 radv_flush_vgt_streamout(cmd_buffer);
5598
5599 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5600 for_each_bit(i, so->enabled_mask) {
5601 int32_t counter_buffer_idx = i - firstCounterBuffer;
5602 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5603 counter_buffer_idx = -1;
5604
5605 /* AMD GCN binds streamout buffers as shader resources.
5606 * VGT only counts primitives and tells the shader through
5607 * SGPRs what to do.
5608 */
5609 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
5610 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */
5611 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
5612
5613 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5614
5615 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5616 /* The array of counter buffers is optional. */
5617 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5618 uint64_t va = radv_buffer_get_va(buffer->bo);
5619
5620 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5621
5622 /* Append */
5623 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5624 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5625 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5626 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
5627 radeon_emit(cs, 0); /* unused */
5628 radeon_emit(cs, 0); /* unused */
5629 radeon_emit(cs, va); /* src address lo */
5630 radeon_emit(cs, va >> 32); /* src address hi */
5631
5632 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5633 } else {
5634 /* Start from the beginning. */
5635 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5636 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5637 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5638 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
5639 radeon_emit(cs, 0); /* unused */
5640 radeon_emit(cs, 0); /* unused */
5641 radeon_emit(cs, 0); /* unused */
5642 radeon_emit(cs, 0); /* unused */
5643 }
5644 }
5645
5646 radv_set_streamout_enable(cmd_buffer, true);
5647 }
5648
5649 void radv_CmdEndTransformFeedbackEXT(
5650 VkCommandBuffer commandBuffer,
5651 uint32_t firstCounterBuffer,
5652 uint32_t counterBufferCount,
5653 const VkBuffer* pCounterBuffers,
5654 const VkDeviceSize* pCounterBufferOffsets)
5655 {
5656 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5657 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5658 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5659 uint32_t i;
5660
5661 radv_flush_vgt_streamout(cmd_buffer);
5662
5663 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5664 for_each_bit(i, so->enabled_mask) {
5665 int32_t counter_buffer_idx = i - firstCounterBuffer;
5666 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5667 counter_buffer_idx = -1;
5668
5669 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5670 /* The array of counter buffers is optional. */
5671 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5672 uint64_t va = radv_buffer_get_va(buffer->bo);
5673
5674 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5675
5676 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5677 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5678 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5679 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
5680 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
5681 radeon_emit(cs, va); /* dst address lo */
5682 radeon_emit(cs, va >> 32); /* dst address hi */
5683 radeon_emit(cs, 0); /* unused */
5684 radeon_emit(cs, 0); /* unused */
5685
5686 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5687 }
5688
5689 /* Deactivate transform feedback by zeroing the buffer size.
5690 * The counters (primitives generated, primitives emitted) may
5691 * be enabled even if there is no buffer bound. This ensures
5692 * that the primitives-emitted query won't increment.
5693 */
5694 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
5695
5696 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5697 }
5698
5699 radv_set_streamout_enable(cmd_buffer, false);
5700 }
5701
5702 void radv_CmdDrawIndirectByteCountEXT(
5703 VkCommandBuffer commandBuffer,
5704 uint32_t instanceCount,
5705 uint32_t firstInstance,
5706 VkBuffer _counterBuffer,
5707 VkDeviceSize counterBufferOffset,
5708 uint32_t counterOffset,
5709 uint32_t vertexStride)
5710 {
5711 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5712 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
5713 struct radv_draw_info info = {};
5714
5715 info.instance_count = instanceCount;
5716 info.first_instance = firstInstance;
5717 info.strmout_buffer = counterBuffer;
5718 info.strmout_buffer_offset = counterBufferOffset;
5719 info.stride = vertexStride;
5720
5721 radv_draw(cmd_buffer, &info);
5722 }
5723
5724 /* VK_AMD_buffer_marker */
5725 void radv_CmdWriteBufferMarkerAMD(
5726 VkCommandBuffer commandBuffer,
5727 VkPipelineStageFlagBits pipelineStage,
5728 VkBuffer dstBuffer,
5729 VkDeviceSize dstOffset,
5730 uint32_t marker)
5731 {
5732 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5733 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
5734 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5735 uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;
5736
5737 si_emit_cache_flush(cmd_buffer);
5738
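/* A marker that only waits for TOP_OF_PIPE can be written
* immediately with COPY_DATA; any later stage uses a bottom-of-pipe
* EOP event so the write lands after prior work completes.
*/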
5739 if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
5740 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
5741 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
5742 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
5743 COPY_DATA_WR_CONFIRM);
5744 radeon_emit(cs, marker);
5745 radeon_emit(cs, 0);
5746 radeon_emit(cs, va);
5747 radeon_emit(cs, va >> 32);
5748 } else {
5749 si_cs_emit_write_event_eop(cs,
5750 cmd_buffer->device->physical_device->rad_info.chip_class,
5751 radv_cmd_buffer_uses_mec(cmd_buffer),
5752 V_028A90_BOTTOM_OF_PIPE_TS, 0,
5753 EOP_DATA_SEL_VALUE_32BIT,
5754 va, marker,
5755 cmd_buffer->gfx9_eop_bug_va);
5756 }
5757 }