mesa.git @ 21ef5caa8e565d3b5dcc74387ef7f972f2d92cce: src/amd/vulkan/radv_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_private.h"
29 #include "radv_radeon_winsys.h"
30 #include "radv_shader.h"
31 #include "radv_cs.h"
32 #include "sid.h"
33 #include "vk_format.h"
34 #include "vk_util.h"
35 #include "radv_debug.h"
36 #include "radv_meta.h"
37
38 #include "ac_debug.h"
39
40 enum {
41 RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
42 RADV_PREFETCH_VS = (1 << 1),
43 RADV_PREFETCH_TCS = (1 << 2),
44 RADV_PREFETCH_TES = (1 << 3),
45 RADV_PREFETCH_GS = (1 << 4),
46 RADV_PREFETCH_PS = (1 << 5),
47 RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS |
48 RADV_PREFETCH_TCS |
49 RADV_PREFETCH_TES |
50 RADV_PREFETCH_GS |
51 RADV_PREFETCH_PS)
52 };
53
54 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
55 struct radv_image *image,
56 VkImageLayout src_layout,
57 bool src_render_loop,
58 VkImageLayout dst_layout,
59 bool dst_render_loop,
60 uint32_t src_family,
61 uint32_t dst_family,
62 const VkImageSubresourceRange *range,
63 struct radv_sample_locations_state *sample_locs);
64
65 const struct radv_dynamic_state default_dynamic_state = {
66 .viewport = {
67 .count = 0,
68 },
69 .scissor = {
70 .count = 0,
71 },
72 .line_width = 1.0f,
73 .depth_bias = {
74 .bias = 0.0f,
75 .clamp = 0.0f,
76 .slope = 0.0f,
77 },
78 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
79 .depth_bounds = {
80 .min = 0.0f,
81 .max = 1.0f,
82 },
83 .stencil_compare_mask = {
84 .front = ~0u,
85 .back = ~0u,
86 },
87 .stencil_write_mask = {
88 .front = ~0u,
89 .back = ~0u,
90 },
91 .stencil_reference = {
92 .front = 0u,
93 .back = 0u,
94 },
95 };
96
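/**
 * Copy the dynamic state that was baked into the pipeline into the command
 * buffer. Only state groups whose values actually changed are added to
 * cmd_buffer->state.dirty, so unchanged registers are not re-emitted.
 */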
97 static void
98 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
99 const struct radv_dynamic_state *src)
100 {
101 struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
102 uint32_t copy_mask = src->mask;
103 uint32_t dest_mask = 0;
104
105 /* Make sure to copy the number of viewports/scissors because they can
106 * only be specified at pipeline creation time.
107 */
108 dest->viewport.count = src->viewport.count;
109 dest->scissor.count = src->scissor.count;
110 dest->discard_rectangle.count = src->discard_rectangle.count;
111 dest->sample_location.count = src->sample_location.count;
112
113 if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
114 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
115 src->viewport.count * sizeof(VkViewport))) {
116 typed_memcpy(dest->viewport.viewports,
117 src->viewport.viewports,
118 src->viewport.count);
119 dest_mask |= RADV_DYNAMIC_VIEWPORT;
120 }
121 }
122
123 if (copy_mask & RADV_DYNAMIC_SCISSOR) {
124 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
125 src->scissor.count * sizeof(VkRect2D))) {
126 typed_memcpy(dest->scissor.scissors,
127 src->scissor.scissors, src->scissor.count);
128 dest_mask |= RADV_DYNAMIC_SCISSOR;
129 }
130 }
131
132 if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
133 if (dest->line_width != src->line_width) {
134 dest->line_width = src->line_width;
135 dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
136 }
137 }
138
139 if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
140 if (memcmp(&dest->depth_bias, &src->depth_bias,
141 sizeof(src->depth_bias))) {
142 dest->depth_bias = src->depth_bias;
143 dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
144 }
145 }
146
147 if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
148 if (memcmp(&dest->blend_constants, &src->blend_constants,
149 sizeof(src->blend_constants))) {
150 typed_memcpy(dest->blend_constants,
151 src->blend_constants, 4);
152 dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
153 }
154 }
155
156 if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
157 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
158 sizeof(src->depth_bounds))) {
159 dest->depth_bounds = src->depth_bounds;
160 dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
161 }
162 }
163
164 if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
165 if (memcmp(&dest->stencil_compare_mask,
166 &src->stencil_compare_mask,
167 sizeof(src->stencil_compare_mask))) {
168 dest->stencil_compare_mask = src->stencil_compare_mask;
169 dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
170 }
171 }
172
173 if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
174 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
175 sizeof(src->stencil_write_mask))) {
176 dest->stencil_write_mask = src->stencil_write_mask;
177 dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
178 }
179 }
180
181 if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
182 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
183 sizeof(src->stencil_reference))) {
184 dest->stencil_reference = src->stencil_reference;
185 dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
186 }
187 }
188
189 if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
190 if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
191 src->discard_rectangle.count * sizeof(VkRect2D))) {
192 typed_memcpy(dest->discard_rectangle.rectangles,
193 src->discard_rectangle.rectangles,
194 src->discard_rectangle.count);
195 dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
196 }
197 }
198
199 if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
200 if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
201 dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
202 dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
203 memcmp(&dest->sample_location.locations,
204 &src->sample_location.locations,
205 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
206 dest->sample_location.per_pixel = src->sample_location.per_pixel;
207 dest->sample_location.grid_size = src->sample_location.grid_size;
208 typed_memcpy(dest->sample_location.locations,
209 src->sample_location.locations,
210 src->sample_location.count);
211 dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
212 }
213 }
214
215 cmd_buffer->state.dirty |= dest_mask;
216 }
217
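/**
 * Cache the transform feedback strides and the enabled-buffer mask from the
 * pipeline's streamout shader. Skipped entirely when the device uses NGG
 * streamout.
 */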
218 static void
219 radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
220 struct radv_pipeline *pipeline)
221 {
222 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
223 struct radv_shader_info *info;
224
225 if (!pipeline->streamout_shader ||
226 cmd_buffer->device->physical_device->use_ngg_streamout)
227 return;
228
229 info = &pipeline->streamout_shader->info;
230 for (int i = 0; i < MAX_SO_BUFFERS; i++)
231 so->stride_in_dw[i] = info->so.strides[i];
232
233 so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
234 }
235
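/* On GFX7 and later the compute queue is backed by the MEC (compute micro
 * engine), which accepts a slightly different packet set than the graphics
 * ME, so several emit paths below check this helper.
 */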
236 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
237 {
238 return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
239 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
240 }
241
242 enum ring_type radv_queue_family_to_ring(int f) {
243 switch (f) {
244 case RADV_QUEUE_GENERAL:
245 return RING_GFX;
246 case RADV_QUEUE_COMPUTE:
247 return RING_COMPUTE;
248 case RADV_QUEUE_TRANSFER:
249 return RING_DMA;
250 default:
251 unreachable("Unknown queue family");
252 }
253 }
254
255 static VkResult radv_create_cmd_buffer(
256 struct radv_device * device,
257 struct radv_cmd_pool * pool,
258 VkCommandBufferLevel level,
259 VkCommandBuffer* pCommandBuffer)
260 {
261 struct radv_cmd_buffer *cmd_buffer;
262 unsigned ring;
263 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
264 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
265 if (cmd_buffer == NULL)
266 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
267
268 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
269 cmd_buffer->device = device;
270 cmd_buffer->pool = pool;
271 cmd_buffer->level = level;
272
273 if (pool) {
274 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
275 cmd_buffer->queue_family_index = pool->queue_family_index;
276
277 } else {
278 /* Init the pool_link so we can safely call list_del when we destroy
279 * the command buffer
280 */
281 list_inithead(&cmd_buffer->pool_link);
282 cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
283 }
284
285 ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
286
287 cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
288 if (!cmd_buffer->cs) {
289 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
290 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
291 }
292
293 *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
294
295 list_inithead(&cmd_buffer->upload.list);
296
297 return VK_SUCCESS;
298 }
299
300 static void
301 radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
302 {
303 list_del(&cmd_buffer->pool_link);
304
305 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
306 &cmd_buffer->upload.list, list) {
307 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
308 list_del(&up->list);
309 free(up);
310 }
311
312 if (cmd_buffer->upload.upload_bo)
313 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
314 cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
315
316 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
317 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
318
319 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
320 }
321
322 static VkResult
323 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
324 {
325 cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
326
327 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
328 &cmd_buffer->upload.list, list) {
329 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
330 list_del(&up->list);
331 free(up);
332 }
333
334 cmd_buffer->push_constant_stages = 0;
335 cmd_buffer->scratch_size_per_wave_needed = 0;
336 cmd_buffer->scratch_waves_wanted = 0;
337 cmd_buffer->compute_scratch_size_per_wave_needed = 0;
338 cmd_buffer->compute_scratch_waves_wanted = 0;
339 cmd_buffer->esgs_ring_size_needed = 0;
340 cmd_buffer->gsvs_ring_size_needed = 0;
341 cmd_buffer->tess_rings_needed = false;
342 cmd_buffer->gds_needed = false;
343 cmd_buffer->gds_oa_needed = false;
344 cmd_buffer->sample_positions_needed = false;
345
346 if (cmd_buffer->upload.upload_bo)
347 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
348 cmd_buffer->upload.upload_bo);
349 cmd_buffer->upload.offset = 0;
350
351 cmd_buffer->record_result = VK_SUCCESS;
352
353 memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
354
355 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
356 cmd_buffer->descriptors[i].dirty = 0;
357 cmd_buffer->descriptors[i].valid = 0;
358 cmd_buffer->descriptors[i].push_dirty = false;
359 }
360
361 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
362 cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
363 unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
364 unsigned fence_offset, eop_bug_offset;
365 void *fence_ptr;
366
367 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
368 &fence_ptr);
369
370 cmd_buffer->gfx9_fence_va =
371 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
372 cmd_buffer->gfx9_fence_va += fence_offset;
373
374 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
375 /* Allocate a buffer for the EOP bug on GFX9. */
376 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
377 &eop_bug_offset, &fence_ptr);
378 cmd_buffer->gfx9_eop_bug_va =
379 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
380 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
381 }
382 }
383
384 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
385
386 return cmd_buffer->record_result;
387 }
388
389 static bool
390 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
391 uint64_t min_needed)
392 {
393 uint64_t new_size;
394 struct radeon_winsys_bo *bo;
395 struct radv_cmd_buffer_upload *upload;
396 struct radv_device *device = cmd_buffer->device;
397
398 new_size = MAX2(min_needed, 16 * 1024);
399 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
400
401 bo = device->ws->buffer_create(device->ws,
402 new_size, 4096,
403 RADEON_DOMAIN_GTT,
404 RADEON_FLAG_CPU_ACCESS|
405 RADEON_FLAG_NO_INTERPROCESS_SHARING |
406 RADEON_FLAG_32BIT,
407 RADV_BO_PRIORITY_UPLOAD_BUFFER);
408
409 if (!bo) {
410 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
411 return false;
412 }
413
414 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
415 if (cmd_buffer->upload.upload_bo) {
416 upload = malloc(sizeof(*upload));
417
418 if (!upload) {
419 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
420 device->ws->buffer_destroy(bo);
421 return false;
422 }
423
424 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
425 list_add(&upload->list, &cmd_buffer->upload.list);
426 }
427
428 cmd_buffer->upload.upload_bo = bo;
429 cmd_buffer->upload.size = new_size;
430 cmd_buffer->upload.offset = 0;
431 cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
432
433 if (!cmd_buffer->upload.map) {
434 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
435 return false;
436 }
437
438 return true;
439 }
440
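/**
 * Sub-allocate "size" bytes with the requested alignment from the command
 * buffer's upload BO. If the current BO is too small it is replaced by a
 * larger one (the old BO is kept alive on upload.list until the command
 * buffer is reset or destroyed) and the allocation restarts at offset 0.
 * Returns the offset within the BO and a CPU pointer to the mapped memory.
 */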
441 bool
442 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
443 unsigned size,
444 unsigned alignment,
445 unsigned *out_offset,
446 void **ptr)
447 {
448 assert(util_is_power_of_two_nonzero(alignment));
449
450 uint64_t offset = align(cmd_buffer->upload.offset, alignment);
451 if (offset + size > cmd_buffer->upload.size) {
452 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
453 return false;
454 offset = 0;
455 }
456
457 *out_offset = offset;
458 *ptr = cmd_buffer->upload.map + offset;
459
460 cmd_buffer->upload.offset = offset + size;
461 return true;
462 }
463
464 bool
465 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
466 unsigned size, unsigned alignment,
467 const void *data, unsigned *out_offset)
468 {
469 uint8_t *ptr;
470
471 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
472 out_offset, (void **)&ptr))
473 return false;
474
475 if (ptr)
476 memcpy(ptr, data, size);
477
478 return true;
479 }
480
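/* Emit a WRITE_DATA packet that makes the ME write "count" dwords to the
 * given GPU virtual address, with write confirmation.
 */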
481 static void
482 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
483 unsigned count, const uint32_t *data)
484 {
485 struct radeon_cmdbuf *cs = cmd_buffer->cs;
486
487 radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
488
489 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
490 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
491 S_370_WR_CONFIRM(1) |
492 S_370_ENGINE_SEL(V_370_ME));
493 radeon_emit(cs, va);
494 radeon_emit(cs, va >> 32);
495 radeon_emit_array(cs, data, count);
496 }
497
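/* Write an incremented trace id into the device trace BO (secondary command
 * buffers use a separate slot at offset 4) and follow it with a NOP-encoded
 * trace point, so a hang can be narrowed down to the last marker reached.
 */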
498 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
499 {
500 struct radv_device *device = cmd_buffer->device;
501 struct radeon_cmdbuf *cs = cmd_buffer->cs;
502 uint64_t va;
503
504 va = radv_buffer_get_va(device->trace_bo);
505 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
506 va += 4;
507
508 ++cmd_buffer->state.trace_id;
509 radv_emit_write_data_packet(cmd_buffer, va, 1,
510 &cmd_buffer->state.trace_id);
511
512 radeon_check_space(cmd_buffer->device->ws, cs, 2);
513
514 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
515 radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
516 }
517
518 static void
519 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
520 enum radv_cmd_flush_bits flags)
521 {
522 if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
523 assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
524 RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
525
526 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
527
528 /* Force wait for graphics or compute engines to be idle. */
529 si_cs_emit_cache_flush(cmd_buffer->cs,
530 cmd_buffer->device->physical_device->rad_info.chip_class,
531 &cmd_buffer->gfx9_fence_idx,
532 cmd_buffer->gfx9_fence_va,
533 radv_cmd_buffer_uses_mec(cmd_buffer),
534 flags, cmd_buffer->gfx9_eop_bug_va);
535 }
536
537 if (unlikely(cmd_buffer->device->trace_bo))
538 radv_cmd_buffer_trace_emit(cmd_buffer);
539 }
540
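/* Save the CPU pointer of the currently bound pipeline into the trace BO
 * (offset 8 for the GFX ring, 16 for compute) for post-mortem debugging.
 */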
541 static void
542 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
543 struct radv_pipeline *pipeline, enum ring_type ring)
544 {
545 struct radv_device *device = cmd_buffer->device;
546 uint32_t data[2];
547 uint64_t va;
548
549 va = radv_buffer_get_va(device->trace_bo);
550
551 switch (ring) {
552 case RING_GFX:
553 va += 8;
554 break;
555 case RING_COMPUTE:
556 va += 16;
557 break;
558 default:
559 assert(!"invalid ring type");
560 }
561
562 uint64_t pipeline_address = (uintptr_t)pipeline;
563 data[0] = pipeline_address;
564 data[1] = pipeline_address >> 32;
565
566 radv_emit_write_data_packet(cmd_buffer, va, 2, data);
567 }
568
569 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
570 VkPipelineBindPoint bind_point,
571 struct radv_descriptor_set *set,
572 unsigned idx)
573 {
574 struct radv_descriptor_state *descriptors_state =
575 radv_get_descriptors_state(cmd_buffer, bind_point);
576
577 descriptors_state->sets[idx] = set;
578
579 descriptors_state->valid |= (1u << idx); /* active descriptors */
580 descriptors_state->dirty |= (1u << idx);
581 }
582
583 static void
584 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
585 VkPipelineBindPoint bind_point)
586 {
587 struct radv_descriptor_state *descriptors_state =
588 radv_get_descriptors_state(cmd_buffer, bind_point);
589 struct radv_device *device = cmd_buffer->device;
590 uint32_t data[MAX_SETS * 2] = {};
591 uint64_t va;
592 unsigned i;
593 va = radv_buffer_get_va(device->trace_bo) + 24;
594
595 for_each_bit(i, descriptors_state->valid) {
596 struct radv_descriptor_set *set = descriptors_state->sets[i];
597 data[i * 2] = (uint64_t)(uintptr_t)set;
598 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
599 }
600
601 radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
602 }
603
604 struct radv_userdata_info *
605 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
606 gl_shader_stage stage,
607 int idx)
608 {
609 struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
610 return &shader->info.user_sgprs_locs.shader_data[idx];
611 }
612
613 static void
614 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
615 struct radv_pipeline *pipeline,
616 gl_shader_stage stage,
617 int idx, uint64_t va)
618 {
619 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
620 uint32_t base_reg = pipeline->user_data_0[stage];
621 if (loc->sgpr_idx == -1)
622 return;
623
624 assert(loc->num_sgprs == 1);
625
626 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
627 base_reg + loc->sgpr_idx * 4, va, false);
628 }
629
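/* Emit the GPU addresses of all dirty, valid descriptor sets that this
 * shader stage actually uses. Consecutive sets are grouped so that one
 * SET_SH_REG header covers a whole run of user SGPR pointers.
 */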
630 static void
631 radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
632 struct radv_pipeline *pipeline,
633 struct radv_descriptor_state *descriptors_state,
634 gl_shader_stage stage)
635 {
636 struct radv_device *device = cmd_buffer->device;
637 struct radeon_cmdbuf *cs = cmd_buffer->cs;
638 uint32_t sh_base = pipeline->user_data_0[stage];
639 struct radv_userdata_locations *locs =
640 &pipeline->shaders[stage]->info.user_sgprs_locs;
641 unsigned mask = locs->descriptor_sets_enabled;
642
643 mask &= descriptors_state->dirty & descriptors_state->valid;
644
645 while (mask) {
646 int start, count;
647
648 u_bit_scan_consecutive_range(&mask, &start, &count);
649
650 struct radv_userdata_info *loc = &locs->descriptor_sets[start];
651 unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
652
653 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
654 for (int i = 0; i < count; i++) {
655 struct radv_descriptor_set *set =
656 descriptors_state->sets[start + i];
657
658 radv_emit_shader_pointer_body(device, cs, set->va, true);
659 }
660 }
661 }
662
663 /**
664 * Convert the user sample locations to hardware sample locations (the values
665 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
666 */
667 static void
668 radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
669 uint32_t x, uint32_t y, VkOffset2D *sample_locs)
670 {
671 uint32_t x_offset = x % state->grid_size.width;
672 uint32_t y_offset = y % state->grid_size.height;
673 uint32_t num_samples = (uint32_t)state->per_pixel;
674 VkSampleLocationEXT *user_locs;
675 uint32_t pixel_offset;
676
677 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
678
679 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
680 user_locs = &state->locations[pixel_offset];
681
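/* Each user location in [0, 1) is re-centered on the pixel center and
 * converted to a signed offset in 1/16th-pixel units, clamped to the
 * hardware range [-8, 7]. For example, (0.75, 0.25) becomes (+4, -4).
 */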
682 for (uint32_t i = 0; i < num_samples; i++) {
683 float shifted_pos_x = user_locs[i].x - 0.5;
684 float shifted_pos_y = user_locs[i].y - 0.5;
685
686 int32_t scaled_pos_x = floor(shifted_pos_x * 16);
687 int32_t scaled_pos_y = floor(shifted_pos_y * 16);
688
689 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
690 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
691 }
692 }
693
694 /**
695 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
696 * locations.
697 */
698 static void
699 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
700 uint32_t *sample_locs_pixel)
701 {
702 for (uint32_t i = 0; i < num_samples; i++) {
703 uint32_t sample_reg_idx = i / 4;
704 uint32_t sample_loc_idx = i % 4;
705 int32_t pos_x = sample_locs[i].x;
706 int32_t pos_y = sample_locs[i].y;
707
708 uint32_t shift_x = 8 * sample_loc_idx;
709 uint32_t shift_y = shift_x + 4;
710
711 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
712 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
713 }
714 }
715
716 /**
717 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
718 * sample locations.
719 */
720 static uint64_t
721 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer,
722 VkOffset2D *sample_locs,
723 uint32_t num_samples)
724 {
725 uint32_t centroid_priorities[num_samples];
726 uint32_t sample_mask = num_samples - 1;
727 uint32_t distances[num_samples];
728 uint64_t centroid_priority = 0;
729
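/* Each 4-bit field of PA_SC_CENTROID_PRIORITY_* holds the index of the
 * i-th closest sample to the pixel center. The samples are sorted by
 * distance below, packed into 8 nibbles (wrapping with sample_mask for
 * lower sample counts), and the result is replicated into the upper half.
 */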
730 /* Compute the distances from center for each sample. */
731 for (int i = 0; i < num_samples; i++) {
732 distances[i] = (sample_locs[i].x * sample_locs[i].x) +
733 (sample_locs[i].y * sample_locs[i].y);
734 }
735
736 /* Compute the centroid priorities by looking at the distances array. */
737 for (int i = 0; i < num_samples; i++) {
738 uint32_t min_idx = 0;
739
740 for (int j = 1; j < num_samples; j++) {
741 if (distances[j] < distances[min_idx])
742 min_idx = j;
743 }
744
745 centroid_priorities[i] = min_idx;
746 distances[min_idx] = 0xffffffff;
747 }
748
749 /* Compute the final centroid priority. */
750 for (int i = 0; i < 8; i++) {
751 centroid_priority |=
752 centroid_priorities[i & sample_mask] << (i * 4);
753 }
754
755 return centroid_priority << 32 | centroid_priority;
756 }
757
758 /**
759 * Emit the sample locations that are specified with VK_EXT_sample_locations.
760 */
761 static void
762 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
763 {
764 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
765 struct radv_multisample_state *ms = &pipeline->graphics.ms;
766 struct radv_sample_locations_state *sample_location =
767 &cmd_buffer->state.dynamic.sample_location;
768 uint32_t num_samples = (uint32_t)sample_location->per_pixel;
769 struct radeon_cmdbuf *cs = cmd_buffer->cs;
770 uint32_t sample_locs_pixel[4][2] = {};
771 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
772 uint32_t max_sample_dist = 0;
773 uint64_t centroid_priority;
774
775 if (!cmd_buffer->state.dynamic.sample_location.count)
776 return;
777
778 /* Convert the user sample locations to hardware sample locations. */
779 radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
780 radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
781 radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
782 radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
783
784 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
785 for (uint32_t i = 0; i < 4; i++) {
786 radv_compute_sample_locs_pixel(num_samples, sample_locs[i],
787 sample_locs_pixel[i]);
788 }
789
790 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
791 centroid_priority =
792 radv_compute_centroid_priority(cmd_buffer, sample_locs[0],
793 num_samples);
794
795 /* Compute the maximum sample distance from the specified locations. */
796 for (uint32_t i = 0; i < num_samples; i++) {
797 VkOffset2D offset = sample_locs[0][i];
798 max_sample_dist = MAX2(max_sample_dist,
799 MAX2(abs(offset.x), abs(offset.y)));
800 }
801
802 /* Emit the specified user sample locations. */
803 switch (num_samples) {
804 case 2:
805 case 4:
806 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
807 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
808 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
809 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
810 break;
811 case 8:
812 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
813 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
814 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
815 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
816 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
817 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
818 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
819 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
820 break;
821 default:
822 unreachable("invalid number of samples");
823 }
824
825 /* Emit the maximum sample distance and the centroid priority. */
826 uint32_t pa_sc_aa_config = ms->pa_sc_aa_config;
827
828 pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST;
829 pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist);
830
831 radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1);
832 radeon_emit(cs, pa_sc_aa_config);
833
834 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
835 radeon_emit(cs, centroid_priority);
836 radeon_emit(cs, centroid_priority >> 32);
837
838 /* GFX9: Flush DFSM when the AA mode changes. */
839 if (cmd_buffer->device->dfsm_allowed) {
840 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
841 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
842 }
843
844 cmd_buffer->state.context_roll_without_scissor_emitted = true;
845 }
846
847 static void
848 radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
849 struct radv_pipeline *pipeline,
850 gl_shader_stage stage,
851 int idx, int count, uint32_t *values)
852 {
853 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
854 uint32_t base_reg = pipeline->user_data_0[stage];
855 if (loc->sgpr_idx == -1)
856 return;
857
858 assert(loc->num_sgprs == count);
859
860 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
861 radeon_emit_array(cmd_buffer->cs, values, count);
862 }
863
864 static void
865 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
866 struct radv_pipeline *pipeline)
867 {
868 int num_samples = pipeline->graphics.ms.num_samples;
869 struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
870
871 if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
872 cmd_buffer->sample_positions_needed = true;
873
874 if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
875 return;
876
877 radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
878
879 cmd_buffer->state.context_roll_without_scissor_emitted = true;
880 }
881
882 static void
883 radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
884 struct radv_pipeline *pipeline)
885 {
886 const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
887
888
889 if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
890 return;
891
892 if (old_pipeline &&
893 old_pipeline->graphics.binning.pa_sc_binner_cntl_0 == pipeline->graphics.binning.pa_sc_binner_cntl_0 &&
894 old_pipeline->graphics.binning.db_dfsm_control == pipeline->graphics.binning.db_dfsm_control)
895 return;
896
897 bool binning_flush = false;
898 if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
899 cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
900 cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
901 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
902 binning_flush = !old_pipeline ||
903 G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
904 G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
905 }
906
907 radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
908 pipeline->graphics.binning.pa_sc_binner_cntl_0 |
909 S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
910
911 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
912 radeon_set_context_reg(cmd_buffer->cs, R_028038_DB_DFSM_CONTROL,
913 pipeline->graphics.binning.db_dfsm_control);
914 } else {
915 radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
916 pipeline->graphics.binning.db_dfsm_control);
917 }
918
919 cmd_buffer->state.context_roll_without_scissor_emitted = true;
920 }
921
922
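/* Prefetch a shader binary into L2 with CP DMA so the waves don't stall on
 * instruction fetch when the shader first runs.
 */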
923 static void
924 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
925 struct radv_shader_variant *shader)
926 {
927 uint64_t va;
928
929 if (!shader)
930 return;
931
932 va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
933
934 si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
935 }
936
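/* Prefetch shader binaries and vertex buffer descriptors into L2. With
 * vertex_stage_only set, only the VS binary and the VBO descriptors are
 * prefetched so the first draw can start as early as possible; the
 * remaining stages are prefetched on a later call. Prefetched bits are
 * cleared from the mask.
 */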
937 static void
938 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
939 struct radv_pipeline *pipeline,
940 bool vertex_stage_only)
941 {
942 struct radv_cmd_state *state = &cmd_buffer->state;
943 uint32_t mask = state->prefetch_L2_mask;
944
945 if (vertex_stage_only) {
946 /* Fast prefetch path for starting draws as soon as possible.
947 */
948 mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
949 RADV_PREFETCH_VBO_DESCRIPTORS);
950 }
951
952 if (mask & RADV_PREFETCH_VS)
953 radv_emit_shader_prefetch(cmd_buffer,
954 pipeline->shaders[MESA_SHADER_VERTEX]);
955
956 if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
957 si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
958
959 if (mask & RADV_PREFETCH_TCS)
960 radv_emit_shader_prefetch(cmd_buffer,
961 pipeline->shaders[MESA_SHADER_TESS_CTRL]);
962
963 if (mask & RADV_PREFETCH_TES)
964 radv_emit_shader_prefetch(cmd_buffer,
965 pipeline->shaders[MESA_SHADER_TESS_EVAL]);
966
967 if (mask & RADV_PREFETCH_GS) {
968 radv_emit_shader_prefetch(cmd_buffer,
969 pipeline->shaders[MESA_SHADER_GEOMETRY]);
970 if (radv_pipeline_has_gs_copy_shader(pipeline))
971 radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
972 }
973
974 if (mask & RADV_PREFETCH_PS)
975 radv_emit_shader_prefetch(cmd_buffer,
976 pipeline->shaders[MESA_SHADER_FRAGMENT]);
977
978 state->prefetch_L2_mask &= ~mask;
979 }
980
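/* Program the RB+ registers (SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON and
 * SX_BLEND_OPT_CONTROL). For every color attachment the down-convert
 * format, epsilon and blend-opt disables are derived from the hardware
 * color format, the component swap, the shader export format and the
 * pipeline's color write mask.
 */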
981 static void
982 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
983 {
984 if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
985 return;
986
987 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
988 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
989
990 unsigned sx_ps_downconvert = 0;
991 unsigned sx_blend_opt_epsilon = 0;
992 unsigned sx_blend_opt_control = 0;
993
994 if (!cmd_buffer->state.attachments || !subpass)
995 return;
996
997 for (unsigned i = 0; i < subpass->color_count; ++i) {
998 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
999 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1000 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1001 continue;
1002 }
1003
1004 int idx = subpass->color_attachments[i].attachment;
1005 struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1006
1007 unsigned format = G_028C70_FORMAT(cb->cb_color_info);
1008 unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1009 uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
1010 uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
1011
1012 bool has_alpha, has_rgb;
1013
1014 /* Determine whether the RGB and alpha channels are present. */
1015 has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
1016
1017 if (format == V_028C70_COLOR_8 ||
1018 format == V_028C70_COLOR_16 ||
1019 format == V_028C70_COLOR_32)
1020 has_rgb = !has_alpha;
1021 else
1022 has_rgb = true;
1023
1024 /* Check the colormask and export format. */
1025 if (!(colormask & 0x7))
1026 has_rgb = false;
1027 if (!(colormask & 0x8))
1028 has_alpha = false;
1029
1030 if (spi_format == V_028714_SPI_SHADER_ZERO) {
1031 has_rgb = false;
1032 has_alpha = false;
1033 }
1034
1035 /* Disable value checking for disabled channels. */
1036 if (!has_rgb)
1037 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1038 if (!has_alpha)
1039 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1040
1041 /* Enable down-conversion for 32bpp and smaller formats. */
1042 switch (format) {
1043 case V_028C70_COLOR_8:
1044 case V_028C70_COLOR_8_8:
1045 case V_028C70_COLOR_8_8_8_8:
1046 /* For 1 and 2-channel formats, use the superset thereof. */
1047 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1048 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1049 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1050 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1051 sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1052 }
1053 break;
1054
1055 case V_028C70_COLOR_5_6_5:
1056 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1057 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1058 sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1059 }
1060 break;
1061
1062 case V_028C70_COLOR_1_5_5_5:
1063 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1064 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1065 sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1066 }
1067 break;
1068
1069 case V_028C70_COLOR_4_4_4_4:
1070 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1071 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1072 sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1073 }
1074 break;
1075
1076 case V_028C70_COLOR_32:
1077 if (swap == V_028C70_SWAP_STD &&
1078 spi_format == V_028714_SPI_SHADER_32_R)
1079 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1080 else if (swap == V_028C70_SWAP_ALT_REV &&
1081 spi_format == V_028714_SPI_SHADER_32_AR)
1082 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1083 break;
1084
1085 case V_028C70_COLOR_16:
1086 case V_028C70_COLOR_16_16:
1087 /* For 1-channel formats, use the superset thereof. */
1088 if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1089 spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1090 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1091 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1092 if (swap == V_028C70_SWAP_STD ||
1093 swap == V_028C70_SWAP_STD_REV)
1094 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1095 else
1096 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1097 }
1098 break;
1099
1100 case V_028C70_COLOR_10_11_11:
1101 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1102 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1103 sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
1104 }
1105 break;
1106
1107 case V_028C70_COLOR_2_10_10_10:
1108 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1109 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1110 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1111 }
1112 break;
1113 }
1114 }
1115
1116 for (unsigned i = subpass->color_count; i < 8; ++i) {
1117 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1118 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1119 }
1120 /* TODO: avoid redundantly setting context registers */
1121 radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1122 radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1123 radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1124 radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1125
1126 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1127 }
1128
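/* When primitive binning keeps more than one context or persistent state
 * per bin, emit a BREAK_BATCH event whenever the fragment shader or the
 * CB target mask changes, so that already-binned work is flushed first.
 */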
1129 static void
1130 radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1131 {
1132 if (!cmd_buffer->device->pbb_allowed)
1133 return;
1134
1135 struct radv_binning_settings settings =
1136 radv_get_binning_settings(cmd_buffer->device->physical_device);
1137 bool break_for_new_ps =
1138 (!cmd_buffer->state.emitted_pipeline ||
1139 cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
1140 cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
1141 (settings.context_states_per_bin > 1 ||
1142 settings.persistent_states_per_bin > 1);
1143 bool break_for_new_cb_target_mask =
1144 (!cmd_buffer->state.emitted_pipeline ||
1145 cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
1146 cmd_buffer->state.pipeline->graphics.cb_target_mask) &&
1147 settings.context_states_per_bin > 1;
1148
1149 if (!break_for_new_ps && !break_for_new_cb_target_mask)
1150 return;
1151
1152 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1153 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1154 }
1155
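/* Emit the currently bound graphics pipeline if it changed. The pipeline's
 * pre-built SH register stream is always replayed, while the context
 * register stream is skipped when its size, hash and contents match what
 * was emitted last. Scratch requirements are accumulated and the shader
 * BOs are added to the CS buffer list.
 */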
1156 static void
1157 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1158 {
1159 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1160
1161 if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1162 return;
1163
1164 radv_update_multisample_state(cmd_buffer, pipeline);
1165 radv_update_binning_state(cmd_buffer, pipeline);
1166
1167 cmd_buffer->scratch_size_per_wave_needed = MAX2(cmd_buffer->scratch_size_per_wave_needed,
1168 pipeline->scratch_bytes_per_wave);
1169 cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted,
1170 pipeline->max_waves);
1171
1172 if (!cmd_buffer->state.emitted_pipeline ||
1173 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1174 pipeline->graphics.can_use_guardband)
1175 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1176
1177 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1178
1179 if (!cmd_buffer->state.emitted_pipeline ||
1180 cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1181 cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1182 memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
1183 pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
1184 radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1185 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1186 }
1187
1188 radv_emit_batch_break_on_new_ps(cmd_buffer);
1189
1190 for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1191 if (!pipeline->shaders[i])
1192 continue;
1193
1194 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1195 pipeline->shaders[i]->bo);
1196 }
1197
1198 if (radv_pipeline_has_gs_copy_shader(pipeline))
1199 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1200 pipeline->gs_copy_shader->bo);
1201
1202 if (unlikely(cmd_buffer->device->trace_bo))
1203 radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);
1204
1205 cmd_buffer->state.emitted_pipeline = pipeline;
1206
1207 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1208 }
1209
1210 static void
1211 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1212 {
1213 si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
1214 cmd_buffer->state.dynamic.viewport.viewports);
1215 }
1216
1217 static void
1218 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1219 {
1220 uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1221
1222 si_write_scissors(cmd_buffer->cs, 0, count,
1223 cmd_buffer->state.dynamic.scissor.scissors,
1224 cmd_buffer->state.dynamic.viewport.viewports,
1225 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1226
1227 cmd_buffer->state.context_roll_without_scissor_emitted = false;
1228 }
1229
1230 static void
1231 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1232 {
1233 if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1234 return;
1235
1236 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1237 cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1238 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1239 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1240 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1241 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1242 S_028214_BR_Y(rect.offset.y + rect.extent.height));
1243 }
1244 }
1245
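/* PA_SU_LINE_CNTL.WIDTH holds the half line width in 12.4 fixed point
 * (1/16th-pixel units), hence line_width * 8 below.
 */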
1246 static void
1247 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1248 {
1249 unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1250
1251 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1252 S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
1253 }
1254
1255 static void
1256 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1257 {
1258 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1259
1260 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1261 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1262 }
1263
1264 static void
1265 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1266 {
1267 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1268
1269 radeon_set_context_reg_seq(cmd_buffer->cs,
1270 R_028430_DB_STENCILREFMASK, 2);
1271 radeon_emit(cmd_buffer->cs,
1272 S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1273 S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1274 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1275 S_028430_STENCILOPVAL(1));
1276 radeon_emit(cmd_buffer->cs,
1277 S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1278 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1279 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1280 S_028434_STENCILOPVAL_BF(1));
1281 }
1282
1283 static void
1284 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1285 {
1286 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1287
1288 radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
1289 fui(d->depth_bounds.min));
1290 radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
1291 fui(d->depth_bounds.max));
1292 }
1293
1294 static void
1295 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1296 {
1297 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1298 unsigned slope = fui(d->depth_bias.slope * 16.0f);
1299 unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
1300
1301
1302 radeon_set_context_reg_seq(cmd_buffer->cs,
1303 R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1304 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1305 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1306 radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
1307 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1308 radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
1309 }
1310
1311 static void
1312 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
1313 int index,
1314 struct radv_color_buffer_info *cb,
1315 struct radv_image_view *iview,
1316 VkImageLayout layout,
1317 bool in_render_loop)
1318 {
1319 bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1320 uint32_t cb_color_info = cb->cb_color_info;
1321 struct radv_image *image = iview->image;
1322
1323 if (!radv_layout_dcc_compressed(cmd_buffer->device, image, layout, in_render_loop,
1324 radv_image_queue_family_mask(image,
1325 cmd_buffer->queue_family_index,
1326 cmd_buffer->queue_family_index))) {
1327 cb_color_info &= C_028C70_DCC_ENABLE;
1328 }
1329
1330 if (radv_image_is_tc_compat_cmask(image) &&
1331 (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1332 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1333 /* If FMASK_COMPRESS_1FRAG_ONLY is set, the FMASK decompression
1334 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1335 */
1336 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1337 }
1338
1339 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1340 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1341 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1342 radeon_emit(cmd_buffer->cs, 0);
1343 radeon_emit(cmd_buffer->cs, 0);
1344 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1345 radeon_emit(cmd_buffer->cs, cb_color_info);
1346 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1347 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1348 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1349 radeon_emit(cmd_buffer->cs, 0);
1350 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1351 radeon_emit(cmd_buffer->cs, 0);
1352
1353 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 1);
1354 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1355
1356 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1357 cb->cb_color_base >> 32);
1358 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1359 cb->cb_color_cmask >> 32);
1360 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1361 cb->cb_color_fmask >> 32);
1362 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1363 cb->cb_dcc_base >> 32);
1364 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1365 cb->cb_color_attrib2);
1366 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1367 cb->cb_color_attrib3);
1368 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1369 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1370 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1371 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1372 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1373 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1374 radeon_emit(cmd_buffer->cs, cb_color_info);
1375 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1376 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1377 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1378 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1379 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1380 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1381
1382 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1383 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1384 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1385
1386 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1387 cb->cb_mrt_epitch);
1388 } else {
1389 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1390 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1391 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1392 radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1393 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1394 radeon_emit(cmd_buffer->cs, cb_color_info);
1395 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1396 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1397 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1398 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1399 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1400 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1401
1402 if (is_vi) { /* DCC BASE */
1403 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1404 }
1405 }
1406
1407 if (radv_dcc_enabled(image, iview->base_mip)) {
1408 /* Drawing with DCC enabled also compresses colorbuffers. */
1409 VkImageSubresourceRange range = {
1410 .aspectMask = iview->aspect_mask,
1411 .baseMipLevel = iview->base_mip,
1412 .levelCount = iview->level_count,
1413 .baseArrayLayer = iview->base_layer,
1414 .layerCount = iview->layer_count,
1415 };
1416
1417 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1418 }
1419 }
1420
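/* Workaround for the TC-compat zrange bug: on affected chips the
 * ZRANGE_PRECISION field has to be 0 when the last fast depth clear value
 * was 0.0 on a TC-compatible HTILE surface. This re-emits DB_Z_INFO with
 * the precision bit cleared, optionally guarded by a COND_EXEC that reads
 * the per-mip zrange metadata written at clear time.
 */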
1421 static void
1422 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
1423 struct radv_ds_buffer_info *ds,
1424 const struct radv_image_view *iview,
1425 VkImageLayout layout,
1426 bool in_render_loop, bool requires_cond_exec)
1427 {
1428 const struct radv_image *image = iview->image;
1429 uint32_t db_z_info = ds->db_z_info;
1430 uint32_t db_z_info_reg;
1431
1432 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1433 !radv_image_is_tc_compat_htile(image))
1434 return;
1435
1436 if (!radv_layout_has_htile(image, layout, in_render_loop,
1437 radv_image_queue_family_mask(image,
1438 cmd_buffer->queue_family_index,
1439 cmd_buffer->queue_family_index))) {
1440 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1441 }
1442
1443 db_z_info &= C_028040_ZRANGE_PRECISION;
1444
1445 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1446 db_z_info_reg = R_028038_DB_Z_INFO;
1447 } else {
1448 db_z_info_reg = R_028040_DB_Z_INFO;
1449 }
1450
1451 /* When we don't know the last fast clear value, emit a COND_EXEC packet
1452 * that may skip the following SET_CONTEXT_REG packet, based on the
1453 * TC-compat zrange metadata written at clear time.
1454 */
1455 if (requires_cond_exec) {
1456 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1457
1458 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1459 radeon_emit(cmd_buffer->cs, va);
1460 radeon_emit(cmd_buffer->cs, va >> 32);
1461 radeon_emit(cmd_buffer->cs, 0);
1462 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1463 }
1464
1465 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1466 }
1467
1468 static void
1469 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
1470 struct radv_ds_buffer_info *ds,
1471 struct radv_image_view *iview,
1472 VkImageLayout layout,
1473 bool in_render_loop)
1474 {
1475 const struct radv_image *image = iview->image;
1476 uint32_t db_z_info = ds->db_z_info;
1477 uint32_t db_stencil_info = ds->db_stencil_info;
1478
1479 if (!radv_layout_has_htile(image, layout, in_render_loop,
1480 radv_image_queue_family_mask(image,
1481 cmd_buffer->queue_family_index,
1482 cmd_buffer->queue_family_index))) {
1483 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1484 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1485 }
1486
1487 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1488 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1489
1490 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1491 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1492 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1493
1494 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1495 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1496 radeon_emit(cmd_buffer->cs, db_z_info);
1497 radeon_emit(cmd_buffer->cs, db_stencil_info);
1498 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1499 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1500 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1501 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1502
1503 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1504 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1505 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1506 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1507 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1508 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1509 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1510 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1511 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1512 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1513 radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1514
1515 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1516 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
1517 radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */
1518 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1519 radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1520 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
1521 radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1522 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
1523 radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1524 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
1525 radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1526
1527 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1528 radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1529 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1530 } else {
1531 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1532
1533 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1534 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
1535 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
1536 radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */
1537 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
1538 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
1539 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
1540 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1541 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
1542 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
1543
1544 }
1545
1546 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1547 radv_update_zrange_precision(cmd_buffer, ds, iview, layout,
1548 in_render_loop, true);
1549
1550 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1551 ds->pa_su_poly_offset_db_fmt_cntl);
1552 }
1553
1554 /**
1555 * Update the fast clear depth/stencil values if the image is bound as a
1556 * depth/stencil buffer.
1557 */
1558 static void
1559 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1560 const struct radv_image_view *iview,
1561 VkClearDepthStencilValue ds_clear_value,
1562 VkImageAspectFlags aspects)
1563 {
1564 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1565 const struct radv_image *image = iview->image;
1566 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1567 uint32_t att_idx;
1568
1569 if (!cmd_buffer->state.attachments || !subpass)
1570 return;
1571
1572 if (!subpass->depth_stencil_attachment)
1573 return;
1574
1575 att_idx = subpass->depth_stencil_attachment->attachment;
1576 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1577 return;
1578
1579 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
1580 VK_IMAGE_ASPECT_STENCIL_BIT)) {
1581 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1582 radeon_emit(cs, ds_clear_value.stencil);
1583 radeon_emit(cs, fui(ds_clear_value.depth));
1584 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1585 radeon_set_context_reg_seq(cs, R_02802C_DB_DEPTH_CLEAR, 1);
1586 radeon_emit(cs, fui(ds_clear_value.depth));
1587 } else {
1588 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1589 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 1);
1590 radeon_emit(cs, ds_clear_value.stencil);
1591 }
1592
1593 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
1594 * only needed when clearing Z to 0.0.
1595 */
1596 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1597 ds_clear_value.depth == 0.0) {
1598 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1599 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
1600
1601 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds,
1602 iview, layout, in_render_loop, false);
1603 }
1604
1605 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1606 }
1607
1608 /**
1609 * Set the clear depth/stencil values to the image's metadata.
1610 */
1611 static void
1612 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1613 struct radv_image *image,
1614 const VkImageSubresourceRange *range,
1615 VkClearDepthStencilValue ds_clear_value,
1616 VkImageAspectFlags aspects)
1617 {
1618 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1619 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
1620 uint32_t level_count = radv_get_levelCount(image, range);
1621
1622 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
1623 VK_IMAGE_ASPECT_STENCIL_BIT)) {
1624 /* Use the fastest way when both aspects are used. */
1625 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
1626 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1627 S_370_WR_CONFIRM(1) |
1628 S_370_ENGINE_SEL(V_370_PFP));
1629 radeon_emit(cs, va);
1630 radeon_emit(cs, va >> 32);
1631
1632 for (uint32_t l = 0; l < level_count; l++) {
1633 radeon_emit(cs, ds_clear_value.stencil);
1634 radeon_emit(cs, fui(ds_clear_value.depth));
1635 }
1636 } else {
1637 /* Otherwise we need one WRITE_DATA packet per level. */
1638 for (uint32_t l = 0; l < level_count; l++) {
1639 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
1640 unsigned value;
1641
1642 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1643 value = fui(ds_clear_value.depth);
1644 va += 4;
1645 } else {
1646 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1647 value = ds_clear_value.stencil;
1648 }
1649
1650 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
1651 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1652 S_370_WR_CONFIRM(1) |
1653 S_370_ENGINE_SEL(V_370_PFP));
1654 radeon_emit(cs, va);
1655 radeon_emit(cs, va >> 32);
1656 radeon_emit(cs, value);
1657 }
1658 }
1659 }
1660
1661 /**
1662 * Update the TC-compat metadata value for this image.
1663 */
1664 static void
1665 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1666 struct radv_image *image,
1667 const VkImageSubresourceRange *range,
1668 uint32_t value)
1669 {
1670 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1671
1672 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
1673 return;
1674
1675 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
1676 uint32_t level_count = radv_get_levelCount(image, range);
1677
1678 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
1679 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1680 S_370_WR_CONFIRM(1) |
1681 S_370_ENGINE_SEL(V_370_PFP));
1682 radeon_emit(cs, va);
1683 radeon_emit(cs, va >> 32);
1684
1685 for (uint32_t l = 0; l < level_count; l++)
1686 radeon_emit(cs, value);
1687 }
1688
1689 static void
1690 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1691 const struct radv_image_view *iview,
1692 VkClearDepthStencilValue ds_clear_value)
1693 {
1694 VkImageSubresourceRange range = {
1695 .aspectMask = iview->aspect_mask,
1696 .baseMipLevel = iview->base_mip,
1697 .levelCount = iview->level_count,
1698 .baseArrayLayer = iview->base_layer,
1699 .layerCount = iview->layer_count,
1700 };
1701 uint32_t cond_val;
1702
1703 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
1704 * depth clear value is 0.0f.
1705 */
1706 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
1707
1708 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range,
1709 cond_val);
1710 }
1711
1712 /**
1713 * Update the clear depth/stencil values for this image.
1714 */
1715 void
1716 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1717 const struct radv_image_view *iview,
1718 VkClearDepthStencilValue ds_clear_value,
1719 VkImageAspectFlags aspects)
1720 {
1721 VkImageSubresourceRange range = {
1722 .aspectMask = iview->aspect_mask,
1723 .baseMipLevel = iview->base_mip,
1724 .levelCount = iview->level_count,
1725 .baseArrayLayer = iview->base_layer,
1726 .layerCount = iview->layer_count,
1727 };
1728 struct radv_image *image = iview->image;
1729
1730 assert(radv_image_has_htile(image));
1731
1732 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range,
1733 ds_clear_value, aspects);
1734
1735 if (radv_image_is_tc_compat_htile(image) &&
1736 (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
1737 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview,
1738 ds_clear_value);
1739 }
1740
1741 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value,
1742 aspects);
1743 }
1744
1745 /**
1746 * Load the clear depth/stencil values from the image's metadata.
1747 */
1748 static void
1749 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1750 const struct radv_image_view *iview)
1751 {
1752 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1753 const struct radv_image *image = iview->image;
1754 VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
1755 uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
1756 unsigned reg_offset = 0, reg_count = 0;
1757
1758 if (!radv_image_has_htile(image))
1759 return;
1760
1761 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1762 ++reg_count;
1763 } else {
1764 ++reg_offset;
1765 va += 4;
1766 }
1767 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1768 ++reg_count;
1769
1770 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
1771
1772 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
1773 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
1774 radeon_emit(cs, va);
1775 radeon_emit(cs, va >> 32);
1776 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1777 radeon_emit(cs, reg_count);
1778 } else {
1779 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1780 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1781 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1782 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
1783 radeon_emit(cs, va);
1784 radeon_emit(cs, va >> 32);
1785 radeon_emit(cs, reg >> 2);
1786 radeon_emit(cs, 0);
1787
1788 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
1789 radeon_emit(cs, 0);
1790 }
1791 }
1792
1793 /*
1794 * With DCC, some clear colors don't require a CMASK (fast clear) eliminate
1795 * before the image is used as a texture. This sets a predicate value that
1796 * determines whether the eliminate pass is required.
1797 */
1798 void
1799 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
1800 struct radv_image *image,
1801 const VkImageSubresourceRange *range, bool value)
1802 {
1803 uint64_t pred_val = value;
1804 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
1805 uint32_t level_count = radv_get_levelCount(image, range);
1806 uint32_t count = 2 * level_count;
1807
1808 assert(radv_dcc_enabled(image, range->baseMipLevel));
1809
1810 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1811 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1812 S_370_WR_CONFIRM(1) |
1813 S_370_ENGINE_SEL(V_370_PFP));
1814 radeon_emit(cmd_buffer->cs, va);
1815 radeon_emit(cmd_buffer->cs, va >> 32);
1816
1817 for (uint32_t l = 0; l < level_count; l++) {
1818 radeon_emit(cmd_buffer->cs, pred_val);
1819 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1820 }
1821 }
1822
1823 /**
1824 * Update the DCC predicate to reflect the compression state.
1825 */
1826 void
1827 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
1828 struct radv_image *image,
1829 const VkImageSubresourceRange *range, bool value)
1830 {
1831 uint64_t pred_val = value;
1832 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
1833 uint32_t level_count = radv_get_levelCount(image, range);
1834 uint32_t count = 2 * level_count;
1835
1836 assert(radv_dcc_enabled(image, range->baseMipLevel));
1837
1838 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1839 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1840 S_370_WR_CONFIRM(1) |
1841 S_370_ENGINE_SEL(V_370_PFP));
1842 radeon_emit(cmd_buffer->cs, va);
1843 radeon_emit(cmd_buffer->cs, va >> 32);
1844
1845 for (uint32_t l = 0; l < level_count; l++) {
1846 radeon_emit(cmd_buffer->cs, pred_val);
1847 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1848 }
1849 }
1850
1851 /**
1852 * Update the fast clear color values if the image is bound as a color buffer.
1853 */
1854 static void
1855 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
1856 struct radv_image *image,
1857 int cb_idx,
1858 uint32_t color_values[2])
1859 {
1860 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1861 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1862 uint32_t att_idx;
1863
1864 if (!cmd_buffer->state.attachments || !subpass)
1865 return;
1866
1867 att_idx = subpass->color_attachments[cb_idx].attachment;
1868 if (att_idx == VK_ATTACHMENT_UNUSED)
1869 return;
1870
1871 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1872 return;
1873
1874 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
1875 radeon_emit(cs, color_values[0]);
1876 radeon_emit(cs, color_values[1]);
1877
1878 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1879 }
1880
1881 /**
1882 * Set the clear color values to the image's metadata.
1883 */
1884 static void
1885 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1886 struct radv_image *image,
1887 const VkImageSubresourceRange *range,
1888 uint32_t color_values[2])
1889 {
1890 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1891 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
1892 uint32_t level_count = radv_get_levelCount(image, range);
1893 uint32_t count = 2 * level_count;
1894
1895 assert(radv_image_has_cmask(image) ||
1896 radv_dcc_enabled(image, range->baseMipLevel));
1897
1898 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
1899 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1900 S_370_WR_CONFIRM(1) |
1901 S_370_ENGINE_SEL(V_370_PFP));
1902 radeon_emit(cs, va);
1903 radeon_emit(cs, va >> 32);
1904
1905 for (uint32_t l = 0; l < level_count; l++) {
1906 radeon_emit(cs, color_values[0]);
1907 radeon_emit(cs, color_values[1]);
1908 }
1909 }
1910
1911 /**
1912 * Update the clear color values for this image.
1913 */
1914 void
1915 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1916 const struct radv_image_view *iview,
1917 int cb_idx,
1918 uint32_t color_values[2])
1919 {
1920 struct radv_image *image = iview->image;
1921 VkImageSubresourceRange range = {
1922 .aspectMask = iview->aspect_mask,
1923 .baseMipLevel = iview->base_mip,
1924 .levelCount = iview->level_count,
1925 .baseArrayLayer = iview->base_layer,
1926 .layerCount = iview->layer_count,
1927 };
1928
1929 assert(radv_image_has_cmask(image) ||
1930 radv_dcc_enabled(image, iview->base_mip));
1931
1932 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
1933
1934 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
1935 color_values);
1936 }
1937
1938 /**
1939 * Load the clear color values from the image's metadata.
1940 */
1941 static void
1942 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1943 struct radv_image_view *iview,
1944 int cb_idx)
1945 {
1946 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1947 struct radv_image *image = iview->image;
1948 uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
1949
1950 if (!radv_image_has_cmask(image) &&
1951 !radv_dcc_enabled(image, iview->base_mip))
1952 return;
1953
1954 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
1955
1956 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
1957 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
1958 radeon_emit(cs, va);
1959 radeon_emit(cs, va >> 32);
1960 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1961 radeon_emit(cs, 2);
1962 } else {
1963 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
1964 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1965 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1966 COPY_DATA_COUNT_SEL);
1967 radeon_emit(cs, va);
1968 radeon_emit(cs, va >> 32);
1969 radeon_emit(cs, reg >> 2);
1970 radeon_emit(cs, 0);
1971
1972 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
1973 radeon_emit(cs, 0);
1974 }
1975 }
1976
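/**
 * Emit the framebuffer state: color and depth/stencil surface registers for
 * every subpass attachment, the window scissor and (on GFX8+) DCC control,
 * and load the current fast clear values from each attachment's metadata.
 */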
1977 static void
1978 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
1979 {
1980 int i;
1981 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1982 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1983
1984 /* This may happen when recording a secondary command buffer that inherits the render pass but not the framebuffer. */
1985 if (!framebuffer)
1986 return;
1987
1988 for (i = 0; i < 8; ++i) {
1989 if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1990 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
1991 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
1992 continue;
1993 }
1994
1995 int idx = subpass->color_attachments[i].attachment;
1996 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
1997 VkImageLayout layout = subpass->color_attachments[i].layout;
1998 bool in_render_loop = subpass->color_attachments[i].in_render_loop;
1999
2000 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->bo);
2001
2002 assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2003 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2004 radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, in_render_loop);
2005
2006 radv_load_color_clear_metadata(cmd_buffer, iview, i);
2007 }
2008
2009 if (subpass->depth_stencil_attachment) {
2010 int idx = subpass->depth_stencil_attachment->attachment;
2011 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2012 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2013 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2014 struct radv_image *image = iview->image;
2015 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.attachments[idx].iview->bo);
2016 ASSERTED uint32_t queue_mask = radv_image_queue_family_mask(image,
2017 cmd_buffer->queue_family_index,
2018 cmd_buffer->queue_family_index);
2019 /* We currently don't support writing decompressed HTILE */
2020 assert(radv_layout_has_htile(image, layout, in_render_loop, queue_mask) ==
2021 radv_layout_is_htile_compressed(image, layout, in_render_loop, queue_mask));
2022
2023 radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, in_render_loop);
2024
2025 if (cmd_buffer->state.attachments[idx].ds.offset_scale != cmd_buffer->state.offset_scale) {
2026 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
2027 cmd_buffer->state.offset_scale = cmd_buffer->state.attachments[idx].ds.offset_scale;
2028 }
2029 radv_load_ds_clear_metadata(cmd_buffer, iview);
2030 } else {
2031 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
2032 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2033 else
2034 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2035
2036 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
2037 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2038 }
2039 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2040 S_028208_BR_X(framebuffer->width) |
2041 S_028208_BR_Y(framebuffer->height));
2042
2043 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2044 bool disable_constant_encode =
2045 cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2046 enum chip_class chip_class =
2047 cmd_buffer->device->physical_device->rad_info.chip_class;
2048 uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2049
2050 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2051 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2052 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2053 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2054 }
2055
2056 if (cmd_buffer->device->dfsm_allowed) {
2057 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2058 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2059 }
2060
2061 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2062 }
2063
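/**
 * Emit the index buffer state (type, base address and maximum index count)
 * for indexed draws. The index type is only re-emitted when it changes.
 */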
2064 static void
2065 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
2066 {
2067 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2068 struct radv_cmd_state *state = &cmd_buffer->state;
2069
2070 if (state->index_type != state->last_index_type) {
2071 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2072 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
2073 cs, R_03090C_VGT_INDEX_TYPE,
2074 2, state->index_type);
2075 } else {
2076 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2077 radeon_emit(cs, state->index_type);
2078 }
2079
2080 state->last_index_type = state->index_type;
2081 }
2082
2083 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2084 radeon_emit(cs, state->index_va);
2085 radeon_emit(cs, state->index_va >> 32);
2086
2087 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2088 radeon_emit(cs, state->max_index_count);
2089
2090 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2091 }
2092
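/**
 * Program DB_COUNT_CONTROL according to whether occlusion queries are active,
 * and toggle out-of-order rasterization when perfect queries require it to be
 * disabled.
 */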
2093 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2094 {
2095 bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2096 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2097 uint32_t pa_sc_mode_cntl_1 =
2098 pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2099 uint32_t db_count_control;
2100
2101 if(!cmd_buffer->state.active_occlusion_queries) {
2102 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2103 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2104 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2105 has_perfect_queries) {
2106 /* Re-enable out-of-order rasterization if the
2107 * bound pipeline supports it and it has been
2108 * disabled before starting any perfect
2109 * occlusion queries.
2110 */
2111 radeon_set_context_reg(cmd_buffer->cs,
2112 R_028A4C_PA_SC_MODE_CNTL_1,
2113 pa_sc_mode_cntl_1);
2114 }
2115 }
2116 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2117 } else {
2118 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2119 uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2120 bool gfx10_perfect = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2121
2122 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2123 db_count_control =
2124 S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
2125 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2126 S_028004_SAMPLE_RATE(sample_rate) |
2127 S_028004_ZPASS_ENABLE(1) |
2128 S_028004_SLICE_EVEN_ENABLE(1) |
2129 S_028004_SLICE_ODD_ENABLE(1);
2130
2131 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2132 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2133 has_perfect_queries) {
2134 /* If the bound pipeline has enabled
2135 * out-of-order rasterization, we should
2136 * disable it before starting any perfect
2137 * occlusion queries.
2138 */
2139 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2140
2141 radeon_set_context_reg(cmd_buffer->cs,
2142 R_028A4C_PA_SC_MODE_CNTL_1,
2143 pa_sc_mode_cntl_1);
2144 }
2145 } else {
2146 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2147 S_028004_SAMPLE_RATE(sample_rate);
2148 }
2149 }
2150
2151 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2152
2153 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2154 }
2155
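/**
 * Emit the dynamic states that are both dirty and actually consumed by the
 * currently bound pipeline.
 */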
2156 static void
2157 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
2158 {
2159 uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2160
2161 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2162 radv_emit_viewport(cmd_buffer);
2163
2164 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
2165 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
2166 radv_emit_scissor(cmd_buffer);
2167
2168 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
2169 radv_emit_line_width(cmd_buffer);
2170
2171 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
2172 radv_emit_blend_constants(cmd_buffer);
2173
2174 if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2175 RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2176 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
2177 radv_emit_stencil(cmd_buffer);
2178
2179 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
2180 radv_emit_depth_bounds(cmd_buffer);
2181
2182 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
2183 radv_emit_depth_bias(cmd_buffer);
2184
2185 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
2186 radv_emit_discard_rectangle(cmd_buffer);
2187
2188 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
2189 radv_emit_sample_locations(cmd_buffer);
2190
2191 cmd_buffer->state.dirty &= ~states;
2192 }
2193
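/**
 * Upload the CPU copy of the push descriptor set to the upload BO and record
 * its new GPU virtual address.
 */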
2194 static void
2195 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
2196 VkPipelineBindPoint bind_point)
2197 {
2198 struct radv_descriptor_state *descriptors_state =
2199 radv_get_descriptors_state(cmd_buffer, bind_point);
2200 struct radv_descriptor_set *set = &descriptors_state->push_set.set;
2201 unsigned bo_offset;
2202
2203 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
2204 set->mapped_ptr,
2205 &bo_offset))
2206 return;
2207
2208 set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2209 set->va += bo_offset;
2210 }
2211
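/**
 * Upload a table with the (lower 32 bits of the) address of every bound
 * descriptor set and point each active shader stage at it through the
 * AC_UD_INDIRECT_DESCRIPTOR_SETS user SGPR.
 */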
2212 static void
2213 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
2214 VkPipelineBindPoint bind_point)
2215 {
2216 struct radv_descriptor_state *descriptors_state =
2217 radv_get_descriptors_state(cmd_buffer, bind_point);
2218 uint32_t size = MAX_SETS * 4;
2219 uint32_t offset;
2220 void *ptr;
2221
2222 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
2223 256, &offset, &ptr))
2224 return;
2225
2226 for (unsigned i = 0; i < MAX_SETS; i++) {
2227 uint32_t *uptr = ((uint32_t *)ptr) + i;
2228 uint64_t set_va = 0;
2229 struct radv_descriptor_set *set = descriptors_state->sets[i];
2230 if (descriptors_state->valid & (1u << i))
2231 set_va = set->va;
2232 uptr[0] = set_va & 0xffffffff;
2233 }
2234
2235 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2236 va += offset;
2237
2238 if (cmd_buffer->state.pipeline) {
2239 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
2240 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2241 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2242
2243 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
2244 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
2245 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2246
2247 if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
2248 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
2249 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2250
2251 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2252 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
2253 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2254
2255 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2256 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
2257 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2258 }
2259
2260 if (cmd_buffer->state.compute_pipeline)
2261 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
2262 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2263 }
2264
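/**
 * Emit the descriptor set pointers for all dirty sets of the given stages,
 * flushing push and indirect descriptor sets first when needed.
 */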
2265 static void
2266 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
2267 VkShaderStageFlags stages)
2268 {
2269 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2270 VK_PIPELINE_BIND_POINT_COMPUTE :
2271 VK_PIPELINE_BIND_POINT_GRAPHICS;
2272 struct radv_descriptor_state *descriptors_state =
2273 radv_get_descriptors_state(cmd_buffer, bind_point);
2274 struct radv_cmd_state *state = &cmd_buffer->state;
2275 bool flush_indirect_descriptors;
2276
2277 if (!descriptors_state->dirty)
2278 return;
2279
2280 if (descriptors_state->push_dirty)
2281 radv_flush_push_descriptors(cmd_buffer, bind_point);
2282
2283 flush_indirect_descriptors =
2284 (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
2285 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
2286 (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
2287 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);
2288
2289 if (flush_indirect_descriptors)
2290 radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
2291
2292 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
2293 cmd_buffer->cs,
2294 MAX_SETS * MESA_SHADER_STAGES * 4);
2295
2296 if (cmd_buffer->state.pipeline) {
2297 radv_foreach_stage(stage, stages) {
2298 if (!cmd_buffer->state.pipeline->shaders[stage])
2299 continue;
2300
2301 radv_emit_descriptor_pointers(cmd_buffer,
2302 cmd_buffer->state.pipeline,
2303 descriptors_state, stage);
2304 }
2305 }
2306
2307 if (cmd_buffer->state.compute_pipeline &&
2308 (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
2309 radv_emit_descriptor_pointers(cmd_buffer,
2310 cmd_buffer->state.compute_pipeline,
2311 descriptors_state,
2312 MESA_SHADER_COMPUTE);
2313 }
2314
2315 descriptors_state->dirty = 0;
2316 descriptors_state->push_dirty = false;
2317
2318 assert(cmd_buffer->cs->cdw <= cdw_max);
2319
2320 if (unlikely(cmd_buffer->device->trace_bo))
2321 radv_save_descriptors(cmd_buffer, bind_point);
2322 }
2323
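/**
 * Upload push constants and dynamic buffer descriptors to the upload BO and
 * emit the corresponding user SGPRs; constants promoted to inline user SGPRs
 * are emitted directly.
 */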
2324 static void
2325 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
2326 VkShaderStageFlags stages)
2327 {
2328 struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
2329 ? cmd_buffer->state.compute_pipeline
2330 : cmd_buffer->state.pipeline;
2331 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2332 VK_PIPELINE_BIND_POINT_COMPUTE :
2333 VK_PIPELINE_BIND_POINT_GRAPHICS;
2334 struct radv_descriptor_state *descriptors_state =
2335 radv_get_descriptors_state(cmd_buffer, bind_point);
2336 struct radv_pipeline_layout *layout = pipeline->layout;
2337 struct radv_shader_variant *shader, *prev_shader;
2338 bool need_push_constants = false;
2339 unsigned offset;
2340 void *ptr;
2341 uint64_t va;
2342
2343 stages &= cmd_buffer->push_constant_stages;
2344 if (!stages ||
2345 (!layout->push_constant_size && !layout->dynamic_offset_count))
2346 return;
2347
2348 radv_foreach_stage(stage, stages) {
2349 shader = radv_get_shader(pipeline, stage);
2350 if (!shader)
2351 continue;
2352
2353 need_push_constants |= shader->info.loads_push_constants;
2354 need_push_constants |= shader->info.loads_dynamic_offsets;
2355
2356 uint8_t base = shader->info.base_inline_push_consts;
2357 uint8_t count = shader->info.num_inline_push_consts;
2358
2359 radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
2360 AC_UD_INLINE_PUSH_CONSTANTS,
2361 count,
2362 (uint32_t *)&cmd_buffer->push_constants[base * 4]);
2363 }
2364
2365 if (need_push_constants) {
2366 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
2367 16 * layout->dynamic_offset_count,
2368 256, &offset, &ptr))
2369 return;
2370
2371 memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
2372 memcpy((char*)ptr + layout->push_constant_size,
2373 descriptors_state->dynamic_buffers,
2374 16 * layout->dynamic_offset_count);
2375
2376 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2377 va += offset;
2378
2379 ASSERTED unsigned cdw_max =
2380 radeon_check_space(cmd_buffer->device->ws,
2381 cmd_buffer->cs, MESA_SHADER_STAGES * 4);
2382
2383 prev_shader = NULL;
2384 radv_foreach_stage(stage, stages) {
2385 shader = radv_get_shader(pipeline, stage);
2386
2387 /* Avoid redundantly emitting the address for merged stages. */
2388 if (shader && shader != prev_shader) {
2389 radv_emit_userdata_address(cmd_buffer, pipeline, stage,
2390 AC_UD_PUSH_CONSTANTS, va);
2391
2392 prev_shader = shader;
2393 }
2394 }
2395 assert(cmd_buffer->cs->cdw <= cdw_max);
2396 }
2397
2398 cmd_buffer->push_constant_stages &= ~stages;
2399 }
2400
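/**
 * Build one buffer descriptor per vertex binding in the upload BO and emit
 * the AC_UD_VS_VERTEX_BUFFERS pointer when the pipeline or the vertex
 * buffers changed.
 */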
2401 static void
2402 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
2403 bool pipeline_is_dirty)
2404 {
2405 if ((pipeline_is_dirty ||
2406 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
2407 cmd_buffer->state.pipeline->num_vertex_bindings &&
2408 radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.has_vertex_buffers) {
2409 unsigned vb_offset;
2410 void *vb_ptr;
2411 uint32_t i = 0;
2412 uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
2413 uint64_t va;
2414
2415 /* allocate some descriptor state for vertex buffers */
2416 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
2417 &vb_offset, &vb_ptr))
2418 return;
2419
2420 for (i = 0; i < count; i++) {
2421 uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
2422 uint32_t offset;
2423 struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
2424 uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
2425 unsigned num_records;
2426
2427 if (!buffer)
2428 continue;
2429
2430 va = radv_buffer_get_va(buffer->bo);
2431
2432 offset = cmd_buffer->vertex_bindings[i].offset;
2433 va += offset + buffer->offset;
2434
2435 num_records = buffer->size - offset;
2436 if (cmd_buffer->device->physical_device->rad_info.chip_class != GFX8 && stride)
2437 num_records /= stride;
2438
2439 desc[0] = va;
2440 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
2441 desc[2] = num_records;
2442 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2443 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2444 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2445 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2446
2447 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2448 /* OOB_SELECT chooses the out-of-bounds check:
2449 * - 1: index >= NUM_RECORDS (Structured)
2450 * - 3: offset >= NUM_RECORDS (Raw)
2451 */
2452 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
2453
2454 desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) |
2455 S_008F0C_OOB_SELECT(oob_select) |
2456 S_008F0C_RESOURCE_LEVEL(1);
2457 } else {
2458 desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
2459 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2460 }
2461 }
2462
2463 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2464 va += vb_offset;
2465
2466 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2467 AC_UD_VS_VERTEX_BUFFERS, va);
2468
2469 cmd_buffer->state.vb_va = va;
2470 cmd_buffer->state.vb_size = count * 16;
2471 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
2472 }
2473 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
2474 }
2475
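/**
 * Emit the streamout buffer descriptor table address to every stage (and the
 * GS copy shader) that declares the AC_UD_STREAMOUT_BUFFERS user SGPR.
 */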
2476 static void
2477 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2478 {
2479 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2480 struct radv_userdata_info *loc;
2481 uint32_t base_reg;
2482
2483 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2484 if (!radv_get_shader(pipeline, stage))
2485 continue;
2486
2487 loc = radv_lookup_user_sgpr(pipeline, stage,
2488 AC_UD_STREAMOUT_BUFFERS);
2489 if (loc->sgpr_idx == -1)
2490 continue;
2491
2492 base_reg = pipeline->user_data_0[stage];
2493
2494 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2495 base_reg + loc->sgpr_idx * 4, va, false);
2496 }
2497
2498 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
2499 loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
2500 if (loc->sgpr_idx != -1) {
2501 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
2502
2503 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2504 base_reg + loc->sgpr_idx * 4, va, false);
2505 }
2506 }
2507 }
2508
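/**
 * Build the streamout buffer descriptors in the upload BO when the streamout
 * bindings are dirty, then emit their table address.
 */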
2509 static void
2510 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
2511 {
2512 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
2513 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
2514 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
2515 unsigned so_offset;
2516 void *so_ptr;
2517 uint64_t va;
2518
2519 /* Allocate some descriptor state for streamout buffers. */
2520 if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
2521 MAX_SO_BUFFERS * 16, 256,
2522 &so_offset, &so_ptr))
2523 return;
2524
2525 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
2526 struct radv_buffer *buffer = sb[i].buffer;
2527 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
2528
2529 if (!(so->enabled_mask & (1 << i)))
2530 continue;
2531
2532 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
2533
2534 va += sb[i].offset;
2535
2536 /* Set the descriptor.
2537 *
2538 * On GFX8, the format must be non-INVALID; otherwise
2539 * the buffer is considered unbound and store
2540 * instructions become no-ops.
2541 */
2542 uint32_t size = 0xffffffff;
2543
2544 /* Compute the correct buffer size for NGG streamout
2545 * because it's used to determine the max emit per
2546 * buffer.
2547 */
2548 if (cmd_buffer->device->physical_device->use_ngg_streamout)
2549 size = buffer->size - sb[i].offset;
2550
2551 desc[0] = va;
2552 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
2553 desc[2] = size;
2554 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2555 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2556 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2557 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2558
2559 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2560 desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
2561 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
2562 S_008F0C_RESOURCE_LEVEL(1);
2563 } else {
2564 desc[3] |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2565 }
2566 }
2567
2568 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2569 va += so_offset;
2570
2571 radv_emit_streamout_buffers(cmd_buffer, va);
2572 }
2573
2574 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
2575 }
2576
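/**
 * Emit the NGG GS state user SGPR, which tells the shader whether it should
 * accumulate GDS-based pipeline query results.
 */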
2577 static void
2578 radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
2579 {
2580 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2581 struct radv_userdata_info *loc;
2582 uint32_t ngg_gs_state = 0;
2583 uint32_t base_reg;
2584
2585 if (!radv_pipeline_has_gs(pipeline) ||
2586 !radv_pipeline_has_ngg(pipeline))
2587 return;
2588
2589 /* By default, NGG GS queries are disabled, but they are enabled if the
2590 * command buffer has active GDS queries or if it's a secondary command
2591 * buffer that inherits the number of generated primitives.
2592 */
2593 if (cmd_buffer->state.active_pipeline_gds_queries ||
2594 (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
2595 ngg_gs_state = 1;
2596
2597 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
2598 AC_UD_NGG_GS_STATE);
2599 base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
2600 assert(loc->sgpr_idx != -1);
2601
2602 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2603 ngg_gs_state);
2604 }
2605
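/**
 * Flush all descriptors, push constants and related state needed by the
 * graphics pipeline before a draw.
 */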
2606 static void
2607 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2608 {
2609 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
2610 radv_flush_streamout_descriptors(cmd_buffer);
2611 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2612 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2613 radv_flush_ngg_gs_state(cmd_buffer);
2614 }
2615
2616 struct radv_draw_info {
2617 /**
2618 * Number of vertices.
2619 */
2620 uint32_t count;
2621
2622 /**
2623 * Index of the first vertex.
2624 */
2625 int32_t vertex_offset;
2626
2627 /**
2628 * First instance id.
2629 */
2630 uint32_t first_instance;
2631
2632 /**
2633 * Number of instances.
2634 */
2635 uint32_t instance_count;
2636
2637 /**
2638 * First index (indexed draws only).
2639 */
2640 uint32_t first_index;
2641
2642 /**
2643 * Whether it's an indexed draw.
2644 */
2645 bool indexed;
2646
2647 /**
2648 * Indirect draw parameters resource.
2649 */
2650 struct radv_buffer *indirect;
2651 uint64_t indirect_offset;
2652 uint32_t stride;
2653
2654 /**
2655 * Draw count parameters resource.
2656 */
2657 struct radv_buffer *count_buffer;
2658 uint64_t count_buffer_offset;
2659
2660 /**
2661 * Stream output parameters resource.
2662 */
2663 struct radv_buffer *strmout_buffer;
2664 uint64_t strmout_buffer_offset;
2665 };
2666
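/**
 * Return the primitive restart index for the current index type, e.g. 0xffff
 * for 16-bit indices.
 */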
2667 static uint32_t
2668 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
2669 {
2670 switch (cmd_buffer->state.index_type) {
2671 case V_028A7C_VGT_INDEX_8:
2672 return 0xffu;
2673 case V_028A7C_VGT_INDEX_16:
2674 return 0xffffu;
2675 case V_028A7C_VGT_INDEX_32:
2676 return 0xffffffffu;
2677 default:
2678 unreachable("invalid index type");
2679 }
2680 }
2681
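/**
 * Emit IA_MULTI_VGT_PARAM when its value changes; the register is programmed
 * differently depending on the GPU generation.
 */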
2682 static void
2683 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
2684 bool instanced_draw, bool indirect_draw,
2685 bool count_from_stream_output,
2686 uint32_t draw_vertex_count)
2687 {
2688 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
2689 struct radv_cmd_state *state = &cmd_buffer->state;
2690 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2691 unsigned ia_multi_vgt_param;
2692
2693 ia_multi_vgt_param =
2694 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
2695 indirect_draw,
2696 count_from_stream_output,
2697 draw_vertex_count);
2698
2699 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
2700 if (info->chip_class == GFX9) {
2701 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
2702 cs,
2703 R_030960_IA_MULTI_VGT_PARAM,
2704 4, ia_multi_vgt_param);
2705 } else if (info->chip_class >= GFX7) {
2706 radeon_set_context_reg_idx(cs,
2707 R_028AA8_IA_MULTI_VGT_PARAM,
2708 1, ia_multi_vgt_param);
2709 } else {
2710 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
2711 ia_multi_vgt_param);
2712 }
2713 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
2714 }
2715 }
2716
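/**
 * Emit per-draw registers: IA_MULTI_VGT_PARAM (pre-GFX10), primitive restart
 * state and, for draws sourced from transform feedback, the opaque buffer
 * filled size.
 */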
2717 static void
2718 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
2719 const struct radv_draw_info *draw_info)
2720 {
2721 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
2722 struct radv_cmd_state *state = &cmd_buffer->state;
2723 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2724 int32_t primitive_reset_en;
2725
2726 /* Draw state. */
2727 if (info->chip_class < GFX10) {
2728 si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
2729 draw_info->indirect,
2730 !!draw_info->strmout_buffer,
2731 draw_info->indirect ? 0 : draw_info->count);
2732 }
2733
2734 /* Primitive restart. */
2735 primitive_reset_en =
2736 draw_info->indexed && state->pipeline->graphics.prim_restart_enable;
2737
2738 if (primitive_reset_en != state->last_primitive_reset_en) {
2739 state->last_primitive_reset_en = primitive_reset_en;
2740 if (info->chip_class >= GFX9) {
2741 radeon_set_uconfig_reg(cs,
2742 R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
2743 primitive_reset_en);
2744 } else {
2745 radeon_set_context_reg(cs,
2746 R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
2747 primitive_reset_en);
2748 }
2749 }
2750
2751 if (primitive_reset_en) {
2752 uint32_t primitive_reset_index =
2753 radv_get_primitive_reset_index(cmd_buffer);
2754
2755 if (primitive_reset_index != state->last_primitive_reset_index) {
2756 radeon_set_context_reg(cs,
2757 R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
2758 primitive_reset_index);
2759 state->last_primitive_reset_index = primitive_reset_index;
2760 }
2761 }
2762
2763 if (draw_info->strmout_buffer) {
2764 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
2765
2766 va += draw_info->strmout_buffer->offset +
2767 draw_info->strmout_buffer_offset;
2768
2769 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
2770 draw_info->stride);
2771
2772 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2773 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
2774 COPY_DATA_DST_SEL(COPY_DATA_REG) |
2775 COPY_DATA_WR_CONFIRM);
2776 radeon_emit(cs, va);
2777 radeon_emit(cs, va >> 32);
2778 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
2779 radeon_emit(cs, 0); /* unused */
2780
2781 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
2782 }
2783 }
2784
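/**
 * Translate source pipeline stages into the partial flushes that must happen
 * before dependent work can start.
 */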
2785 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
2786 VkPipelineStageFlags src_stage_mask)
2787 {
2788 if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
2789 VK_PIPELINE_STAGE_TRANSFER_BIT |
2790 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2791 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2792 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2793 }
2794
2795 if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
2796 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
2797 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
2798 VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
2799 VK_PIPELINE_STAGE_TRANSFER_BIT |
2800 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2801 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
2802 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2803 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
2804 } else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
2805 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
2806 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
2807 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
2808 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
2809 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
2810 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
2811 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
2812 }
2813 }
2814
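/**
 * Compute the flushes needed to make the given write accesses visible,
 * skipping CB/DB metadata flushes the image does not need.
 */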
2815 static enum radv_cmd_flush_bits
2816 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
2817 VkAccessFlags src_flags,
2818 struct radv_image *image)
2819 {
2820 bool flush_CB_meta = true, flush_DB_meta = true;
2821 enum radv_cmd_flush_bits flush_bits = 0;
2822 uint32_t b;
2823
2824 if (image) {
2825 if (!radv_image_has_CB_metadata(image))
2826 flush_CB_meta = false;
2827 if (!radv_image_has_htile(image))
2828 flush_DB_meta = false;
2829 }
2830
2831 for_each_bit(b, src_flags) {
2832 switch ((VkAccessFlagBits)(1 << b)) {
2833 case VK_ACCESS_SHADER_WRITE_BIT:
2834 case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
2835 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2836 flush_bits |= RADV_CMD_FLAG_WB_L2;
2837 break;
2838 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
2839 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2840 if (flush_CB_meta)
2841 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2842 break;
2843 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
2844 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2845 if (flush_DB_meta)
2846 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2847 break;
2848 case VK_ACCESS_TRANSFER_WRITE_BIT:
2849 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2850 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
2851 RADV_CMD_FLAG_INV_L2;
2852
2853 if (flush_CB_meta)
2854 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2855 if (flush_DB_meta)
2856 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2857 break;
2858 default:
2859 break;
2860 }
2861 }
2862 return flush_bits;
2863 }
2864
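/**
 * Compute the cache invalidations needed before the given read accesses,
 * taking into account surfaces that are already coherent with shaders.
 */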
2865 static enum radv_cmd_flush_bits
2866 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
2867 VkAccessFlags dst_flags,
2868 struct radv_image *image)
2869 {
2870 bool flush_CB_meta = true, flush_DB_meta = true;
2871 enum radv_cmd_flush_bits flush_bits = 0;
2872 bool flush_CB = true, flush_DB = true;
2873 bool image_is_coherent = false;
2874 uint32_t b;
2875
2876 if (image) {
2877 if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
2878 flush_CB = false;
2879 flush_DB = false;
2880 }
2881
2882 if (!radv_image_has_CB_metadata(image))
2883 flush_CB_meta = false;
2884 if (!radv_image_has_htile(image))
2885 flush_DB_meta = false;
2886
2887 /* TODO: implement shader coherent for GFX10 */
2888
2889 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
2890 if (image->info.samples == 1 &&
2891 (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
2892 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
2893 !vk_format_is_stencil(image->vk_format)) {
2894 /* Single-sample color and single-sample depth
2895 * (not stencil) are coherent with shaders on
2896 * GFX9.
2897 */
2898 image_is_coherent = true;
2899 }
2900 }
2901 }
2902
2903 for_each_bit(b, dst_flags) {
2904 switch ((VkAccessFlagBits)(1 << b)) {
2905 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
2906 case VK_ACCESS_INDEX_READ_BIT:
2907 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2908 break;
2909 case VK_ACCESS_UNIFORM_READ_BIT:
2910 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
2911 break;
2912 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
2913 case VK_ACCESS_TRANSFER_READ_BIT:
2914 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
2915 flush_bits |= RADV_CMD_FLAG_INV_VCACHE |
2916 RADV_CMD_FLAG_INV_L2;
2917 break;
2918 case VK_ACCESS_SHADER_READ_BIT:
2919 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
2920 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
2921 * invalidate the scalar cache. */
2922 if (cmd_buffer->device->physical_device->use_aco &&
2923 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8)
2924 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
2925
2926 if (!image_is_coherent)
2927 flush_bits |= RADV_CMD_FLAG_INV_L2;
2928 break;
2929 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
2930 if (flush_CB)
2931 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2932 if (flush_CB_meta)
2933 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2934 break;
2935 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
2936 if (flush_DB)
2937 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2938 if (flush_DB_meta)
2939 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2940 break;
2941 default:
2942 break;
2943 }
2944 }
2945 return flush_bits;
2946 }
2947
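/**
 * Apply a subpass dependency as a combination of stage and access flushes.
 */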
2948 void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
2949 const struct radv_subpass_barrier *barrier)
2950 {
2951 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
2952 NULL);
2953 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
2954 cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
2955 NULL);
2956 }
2957
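/**
 * Return the index of the current subpass within its render pass.
 */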
2958 uint32_t
2959 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
2960 {
2961 struct radv_cmd_state *state = &cmd_buffer->state;
2962 uint32_t subpass_id = state->subpass - state->pass->subpasses;
2963
2964 /* The id of this subpass shouldn't exceed the number of subpasses in
2965 * this render pass minus 1.
2966 */
2967 assert(subpass_id < state->pass->subpass_count);
2968 return subpass_id;
2969 }
2970
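/**
 * Return the user sample locations to use for the layout transition of the
 * given attachment, or NULL to use the default hardware locations.
 */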
2971 static struct radv_sample_locations_state *
2972 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer,
2973 uint32_t att_idx,
2974 bool begin_subpass)
2975 {
2976 struct radv_cmd_state *state = &cmd_buffer->state;
2977 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
2978 struct radv_image_view *view = state->attachments[att_idx].iview;
2979
2980 if (view->image->info.samples == 1)
2981 return NULL;
2982
2983 if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
2984 /* Return the initial sample locations if this is the initial
2985 * layout transition of the given subpass attachment.
2986 */
2987 if (state->attachments[att_idx].sample_location.count > 0)
2988 return &state->attachments[att_idx].sample_location;
2989 } else {
2990 /* Otherwise return the subpass sample locations if defined. */
2991 if (state->subpass_sample_locs) {
2992 /* Because the driver sets the current subpass before
2993 * initial layout transitions, we should use the sample
2994 * locations from the previous subpass to avoid an
2995 * off-by-one problem. Otherwise, use the sample
2996 * locations for the current subpass for final layout
2997 * transitions.
2998 */
2999 if (begin_subpass)
3000 subpass_id--;
3001
3002 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
3003 if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
3004 return &state->subpass_sample_locs[i].sample_location;
3005 }
3006 }
3007 }
3008
3009 return NULL;
3010 }
3011
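/**
 * Perform the layout transition for one subpass attachment, splitting it into
 * depth-only and stencil-only transitions when separate depth/stencil layouts
 * are used.
 */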
3012 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
3013 struct radv_subpass_attachment att,
3014 bool begin_subpass)
3015 {
3016 unsigned idx = att.attachment;
3017 struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
3018 struct radv_sample_locations_state *sample_locs;
3019 VkImageSubresourceRange range;
3020 range.aspectMask = view->aspect_mask;
3021 range.baseMipLevel = view->base_mip;
3022 range.levelCount = 1;
3023 range.baseArrayLayer = view->base_layer;
3024 range.layerCount = cmd_buffer->state.framebuffer->layers;
3025
3026 if (cmd_buffer->state.subpass->view_mask) {
3027 /* If the current subpass uses multiview, the driver might have
3028 * performed a fast color/depth clear to the whole image
3029 * (including all layers). To make sure the driver will
3030 * decompress the image correctly (if needed), we have to
3031 * account for the "real" number of layers. If the view mask is
3032 * sparse, this will decompress more layers than needed.
3033 */
3034 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
3035 }
3036
3037 /* Get the subpass sample locations for the given attachment; if NULL
3038 * is returned, the driver will use the default HW locations.
3039 */
3040 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx,
3041 begin_subpass);
3042
3043 /* Determine if the subpass uses separate depth/stencil layouts. */
3044 bool uses_separate_depth_stencil_layouts = false;
3045 if ((cmd_buffer->state.attachments[idx].current_layout !=
3046 cmd_buffer->state.attachments[idx].current_stencil_layout) ||
3047 (att.layout != att.stencil_layout)) {
3048 uses_separate_depth_stencil_layouts = true;
3049 }
3050
3051 /* For separate layouts, perform depth and stencil transitions
3052 * separately.
3053 */
3054 if (uses_separate_depth_stencil_layouts &&
3055 (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT |
3056 VK_IMAGE_ASPECT_STENCIL_BIT))) {
3057 /* Depth-only transitions. */
3058 range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
3059 radv_handle_image_transition(cmd_buffer,
3060 view->image,
3061 cmd_buffer->state.attachments[idx].current_layout,
3062 cmd_buffer->state.attachments[idx].current_in_render_loop,
3063 att.layout, att.in_render_loop,
3064 0, 0, &range, sample_locs);
3065
3066 /* Stencil-only transitions. */
3067 range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
3068 radv_handle_image_transition(cmd_buffer,
3069 view->image,
3070 cmd_buffer->state.attachments[idx].current_stencil_layout,
3071 cmd_buffer->state.attachments[idx].current_in_render_loop,
3072 att.stencil_layout, att.in_render_loop,
3073 0, 0, &range, sample_locs);
3074 } else {
3075 radv_handle_image_transition(cmd_buffer,
3076 view->image,
3077 cmd_buffer->state.attachments[idx].current_layout,
3078 cmd_buffer->state.attachments[idx].current_in_render_loop,
3079 att.layout, att.in_render_loop,
3080 0, 0, &range, sample_locs);
3081 }
3082
3083 cmd_buffer->state.attachments[idx].current_layout = att.layout;
3084 cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
3085 cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
3088 }
3089
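/**
 * Make the given subpass current and mark the framebuffer state as dirty.
 */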
3090 void
3091 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
3092 const struct radv_subpass *subpass)
3093 {
3094 cmd_buffer->state.subpass = subpass;
3095
3096 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
3097 }
3098
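/**
 * Record the per-attachment and per-subpass sample locations provided through
 * VK_EXT_sample_locations when the render pass begins.
 */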
3099 static VkResult
3100 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
3101 struct radv_render_pass *pass,
3102 const VkRenderPassBeginInfo *info)
3103 {
3104 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
3105 vk_find_struct_const(info->pNext,
3106 RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
3107 struct radv_cmd_state *state = &cmd_buffer->state;
3108
3109 if (!sample_locs) {
3110 state->subpass_sample_locs = NULL;
3111 return VK_SUCCESS;
3112 }
3113
3114 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
3115 const VkAttachmentSampleLocationsEXT *att_sample_locs =
3116 &sample_locs->pAttachmentInitialSampleLocations[i];
3117 uint32_t att_idx = att_sample_locs->attachmentIndex;
3118 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
3119
3120 assert(vk_format_is_depth_or_stencil(image->vk_format));
3121
3122 /* From the Vulkan spec 1.1.108:
3123 *
3124 * "If the image referenced by the framebuffer attachment at
3125 * index attachmentIndex was not created with
3126 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
3127 * then the values specified in sampleLocationsInfo are
3128 * ignored."
3129 */
3130 if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
3131 continue;
3132
3133 const VkSampleLocationsInfoEXT *sample_locs_info =
3134 &att_sample_locs->sampleLocationsInfo;
3135
3136 state->attachments[att_idx].sample_location.per_pixel =
3137 sample_locs_info->sampleLocationsPerPixel;
3138 state->attachments[att_idx].sample_location.grid_size =
3139 sample_locs_info->sampleLocationGridSize;
3140 state->attachments[att_idx].sample_location.count =
3141 sample_locs_info->sampleLocationsCount;
3142 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
3143 sample_locs_info->pSampleLocations,
3144 sample_locs_info->sampleLocationsCount);
3145 }
3146
3147 state->subpass_sample_locs = vk_alloc(&cmd_buffer->pool->alloc,
3148 sample_locs->postSubpassSampleLocationsCount *
3149 sizeof(state->subpass_sample_locs[0]),
3150 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3151 if (state->subpass_sample_locs == NULL) {
3152 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3153 return cmd_buffer->record_result;
3154 }
3155
3156 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
3157
3158 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
3159 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
3160 &sample_locs->pPostSubpassSampleLocations[i];
3161 const VkSampleLocationsInfoEXT *sample_locs_info =
3162 &subpass_sample_locs_info->sampleLocationsInfo;
3163
3164 state->subpass_sample_locs[i].subpass_idx =
3165 subpass_sample_locs_info->subpassIndex;
3166 state->subpass_sample_locs[i].sample_location.per_pixel =
3167 sample_locs_info->sampleLocationsPerPixel;
3168 state->subpass_sample_locs[i].sample_location.grid_size =
3169 sample_locs_info->sampleLocationGridSize;
3170 state->subpass_sample_locs[i].sample_location.count =
3171 sample_locs_info->sampleLocationsCount;
3172 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
3173 sample_locs_info->pSampleLocations,
3174 sample_locs_info->sampleLocationsCount);
3175 }
3176
3177 return VK_SUCCESS;
3178 }
3179
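/* Allocate and initialize the per-attachment state for this render pass
 * instance: which aspects still need a LOAD_OP_CLEAR, the associated clear
 * value, the initial layouts, and the color or depth/stencil surface
 * description of each image view. The views come from
 * VkRenderPassAttachmentBeginInfo (imageless framebuffer) when provided,
 * otherwise from the framebuffer itself.
 */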
3180 static VkResult
3181 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
3182 struct radv_render_pass *pass,
3183 const VkRenderPassBeginInfo *info)
3184 {
3185 struct radv_cmd_state *state = &cmd_buffer->state;
3186 const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
3187
3188 if (info) {
3189 attachment_info = vk_find_struct_const(info->pNext,
3190 RENDER_PASS_ATTACHMENT_BEGIN_INFO);
3191 }
3192
3193
3194 if (pass->attachment_count == 0) {
3195 state->attachments = NULL;
3196 return VK_SUCCESS;
3197 }
3198
3199 state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
3200 pass->attachment_count *
3201 sizeof(state->attachments[0]),
3202 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3203 if (state->attachments == NULL) {
3204 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3205 return cmd_buffer->record_result;
3206 }
3207
3208 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
3209 struct radv_render_pass_attachment *att = &pass->attachments[i];
3210 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
3211 VkImageAspectFlags clear_aspects = 0;
3212
3213 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3214 /* color attachment */
3215 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3216 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
3217 }
3218 } else {
3219 /* depthstencil attachment */
3220 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
3221 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3222 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
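				/* When depth is cleared and the stencil load op is
				 * DONT_CARE, the stencil contents are allowed to end
				 * up undefined, so clearing it together with depth is
				 * presumably free and keeps both aspects in a known
				 * state.
				 */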
3223 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3224 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
3225 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3226 }
3227 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3228 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3229 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3230 }
3231 }
3232
3233 state->attachments[i].pending_clear_aspects = clear_aspects;
3234 state->attachments[i].cleared_views = 0;
3235 if (clear_aspects && info) {
3236 assert(info->clearValueCount > i);
3237 state->attachments[i].clear_value = info->pClearValues[i];
3238 }
3239
3240 state->attachments[i].current_layout = att->initial_layout;
3241 state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
3242 state->attachments[i].sample_location.count = 0;
3243
3244 struct radv_image_view *iview;
3245 if (attachment_info && attachment_info->attachmentCount > i) {
3246 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
3247 } else {
3248 iview = state->framebuffer->attachments[i];
3249 }
3250
3251 state->attachments[i].iview = iview;
3252 if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3253 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
3254 } else {
3255 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
3256 }
3257 }
3258
3259 return VK_SUCCESS;
3260 }
3261
3262 VkResult radv_AllocateCommandBuffers(
3263 VkDevice _device,
3264 const VkCommandBufferAllocateInfo *pAllocateInfo,
3265 VkCommandBuffer *pCommandBuffers)
3266 {
3267 RADV_FROM_HANDLE(radv_device, device, _device);
3268 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
3269
3270 VkResult result = VK_SUCCESS;
3271 uint32_t i;
3272
3273 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
3274
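		/* Reuse a command buffer from the pool's free list when
		 * possible instead of creating a new one;
		 * vkFreeCommandBuffers() below only moves buffers onto that
		 * list rather than destroying them.
		 */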
3275 if (!list_is_empty(&pool->free_cmd_buffers)) {
3276 struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
3277
3278 list_del(&cmd_buffer->pool_link);
3279 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
3280
3281 result = radv_reset_cmd_buffer(cmd_buffer);
3282 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
3283 cmd_buffer->level = pAllocateInfo->level;
3284
3285 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
3286 } else {
3287 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
3288 &pCommandBuffers[i]);
3289 }
3290 if (result != VK_SUCCESS)
3291 break;
3292 }
3293
3294 if (result != VK_SUCCESS) {
3295 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
3296 i, pCommandBuffers);
3297
3298 /* From the Vulkan 1.0.66 spec:
3299 *
3300 * "vkAllocateCommandBuffers can be used to create multiple
3301 * command buffers. If the creation of any of those command
3302 * buffers fails, the implementation must destroy all
3303 * successfully created command buffer objects from this
3304 * command, set all entries of the pCommandBuffers array to
3305 * NULL and return the error."
3306 */
3307 memset(pCommandBuffers, 0,
3308 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
3309 }
3310
3311 return result;
3312 }
3313
3314 void radv_FreeCommandBuffers(
3315 VkDevice device,
3316 VkCommandPool commandPool,
3317 uint32_t commandBufferCount,
3318 const VkCommandBuffer *pCommandBuffers)
3319 {
3320 for (uint32_t i = 0; i < commandBufferCount; i++) {
3321 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
3322
3323 if (cmd_buffer) {
3324 if (cmd_buffer->pool) {
3325 list_del(&cmd_buffer->pool_link);
3326 list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
3327 } else
3328 radv_cmd_buffer_destroy(cmd_buffer);
3329
3330 }
3331 }
3332 }
3333
3334 VkResult radv_ResetCommandBuffer(
3335 VkCommandBuffer commandBuffer,
3336 VkCommandBufferResetFlags flags)
3337 {
3338 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3339 return radv_reset_cmd_buffer(cmd_buffer);
3340 }
3341
3342 VkResult radv_BeginCommandBuffer(
3343 VkCommandBuffer commandBuffer,
3344 const VkCommandBufferBeginInfo *pBeginInfo)
3345 {
3346 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3347 VkResult result = VK_SUCCESS;
3348
3349 if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
3350 		/* If the command buffer has already been reset with
3351 		 * vkResetCommandBuffer, there is no need to do it again.
3352 		 */
3353 result = radv_reset_cmd_buffer(cmd_buffer);
3354 if (result != VK_SUCCESS)
3355 return result;
3356 }
3357
3358 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
3359 cmd_buffer->state.last_primitive_reset_en = -1;
3360 cmd_buffer->state.last_index_type = -1;
3361 cmd_buffer->state.last_num_instances = -1;
3362 cmd_buffer->state.last_vertex_offset = -1;
3363 cmd_buffer->state.last_first_instance = -1;
3364 cmd_buffer->state.predication_type = -1;
3365 cmd_buffer->usage_flags = pBeginInfo->flags;
3366
3367 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
3368 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
3369 assert(pBeginInfo->pInheritanceInfo);
3370 cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
3371 cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
3372
3373 struct radv_subpass *subpass =
3374 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
3375
3376 if (cmd_buffer->state.framebuffer) {
3377 result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
3378 if (result != VK_SUCCESS)
3379 return result;
3380 }
3381
3382 cmd_buffer->state.inherited_pipeline_statistics =
3383 pBeginInfo->pInheritanceInfo->pipelineStatistics;
3384
3385 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3386 }
3387
3388 if (unlikely(cmd_buffer->device->trace_bo)) {
3389 struct radv_device *device = cmd_buffer->device;
3390
3391 radv_cs_add_buffer(device->ws, cmd_buffer->cs,
3392 device->trace_bo);
3393
3394 radv_cmd_buffer_trace_emit(cmd_buffer);
3395 }
3396
3397 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
3398
3399 return result;
3400 }
3401
3402 void radv_CmdBindVertexBuffers(
3403 VkCommandBuffer commandBuffer,
3404 uint32_t firstBinding,
3405 uint32_t bindingCount,
3406 const VkBuffer* pBuffers,
3407 const VkDeviceSize* pOffsets)
3408 {
3409 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3410 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
3411 bool changed = false;
3412
3413 	/* We have to defer setting up the vertex buffers since we need the
3414 	 * buffer stride from the pipeline. */
3415
3416 assert(firstBinding + bindingCount <= MAX_VBS);
3417 for (uint32_t i = 0; i < bindingCount; i++) {
3418 uint32_t idx = firstBinding + i;
3419
3420 if (!changed &&
3421 (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
3422 vb[idx].offset != pOffsets[i])) {
3423 changed = true;
3424 }
3425
3426 vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
3427 vb[idx].offset = pOffsets[i];
3428
3429 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3430 vb[idx].buffer->bo);
3431 }
3432
3433 if (!changed) {
3434 /* No state changes. */
3435 return;
3436 }
3437
3438 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
3439 }
3440
3441 static uint32_t
3442 vk_to_index_type(VkIndexType type)
3443 {
3444 switch (type) {
3445 case VK_INDEX_TYPE_UINT8_EXT:
3446 return V_028A7C_VGT_INDEX_8;
3447 case VK_INDEX_TYPE_UINT16:
3448 return V_028A7C_VGT_INDEX_16;
3449 case VK_INDEX_TYPE_UINT32:
3450 return V_028A7C_VGT_INDEX_32;
3451 default:
3452 unreachable("invalid index type");
3453 }
3454 }
3455
3456 static uint32_t
3457 radv_get_vgt_index_size(uint32_t type)
3458 {
3459 switch (type) {
3460 case V_028A7C_VGT_INDEX_8:
3461 return 1;
3462 case V_028A7C_VGT_INDEX_16:
3463 return 2;
3464 case V_028A7C_VGT_INDEX_32:
3465 return 4;
3466 default:
3467 unreachable("invalid index type");
3468 }
3469 }
3470
3471 void radv_CmdBindIndexBuffer(
3472 VkCommandBuffer commandBuffer,
3473 VkBuffer buffer,
3474 VkDeviceSize offset,
3475 VkIndexType indexType)
3476 {
3477 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3478 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
3479
3480 if (cmd_buffer->state.index_buffer == index_buffer &&
3481 cmd_buffer->state.index_offset == offset &&
3482 cmd_buffer->state.index_type == indexType) {
3483 /* No state changes. */
3484 return;
3485 }
3486
3487 cmd_buffer->state.index_buffer = index_buffer;
3488 cmd_buffer->state.index_offset = offset;
3489 cmd_buffer->state.index_type = vk_to_index_type(indexType);
3490 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
3491 cmd_buffer->state.index_va += index_buffer->offset + offset;
3492
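	/* max_index_count is used both as the MAX_SIZE operand of
	 * DRAW_INDEX_2, so the CP clamps index fetches to the bound range,
	 * and to skip draws with an empty index range, which can hang some
	 * chips.
	 */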
3493 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
3494 cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
3495 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
3496 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
3497 }
3498
3499
3500 static void
3501 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3502 VkPipelineBindPoint bind_point,
3503 struct radv_descriptor_set *set, unsigned idx)
3504 {
3505 struct radeon_winsys *ws = cmd_buffer->device->ws;
3506
3507 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
3508
3509 assert(set);
3510 assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
3511
3512 if (!cmd_buffer->device->use_global_bo_list) {
3513 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3514 if (set->descriptors[j])
3515 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
3516 }
3517
3518 	if (set->bo)
3519 radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
3520 }
3521
3522 void radv_CmdBindDescriptorSets(
3523 VkCommandBuffer commandBuffer,
3524 VkPipelineBindPoint pipelineBindPoint,
3525 VkPipelineLayout _layout,
3526 uint32_t firstSet,
3527 uint32_t descriptorSetCount,
3528 const VkDescriptorSet* pDescriptorSets,
3529 uint32_t dynamicOffsetCount,
3530 const uint32_t* pDynamicOffsets)
3531 {
3532 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3533 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3534 unsigned dyn_idx = 0;
3535
3536 const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
3537 struct radv_descriptor_state *descriptors_state =
3538 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3539
3540 for (unsigned i = 0; i < descriptorSetCount; ++i) {
3541 unsigned idx = i + firstSet;
3542 RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
3543
3544 		/* If the set is already bound, we only need to update the
3545 		 * (potentially changed) dynamic offsets. */
3546 if (descriptors_state->sets[idx] != set ||
3547 !(descriptors_state->valid & (1u << idx))) {
3548 radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, idx);
3549 }
3550
3551 		for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
3552 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
3553 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
3554 assert(dyn_idx < dynamicOffsetCount);
3555
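			/* Build the 4-dword buffer descriptor (V#) for this
			 * dynamic buffer: base address low/high, the range size
			 * (or ~0 when bounds checking is disabled for
			 * debugging), and a final dword of dst_sel/format bits
			 * whose layout differs between GFX10+ and older chips.
			 */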
3556 struct radv_descriptor_range *range = set->dynamic_descriptors + j;
3557 uint64_t va = range->va + pDynamicOffsets[dyn_idx];
3558 dst[0] = va;
3559 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3560 dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
3561 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3562 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3563 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3564 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3565
3566 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
3567 dst[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3568 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
3569 S_008F0C_RESOURCE_LEVEL(1);
3570 } else {
3571 dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3572 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3573 }
3574
3575 cmd_buffer->push_constant_stages |=
3576 set->layout->dynamic_shader_stages;
3577 }
3578 }
3579 }
3580
3581 static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3582 struct radv_descriptor_set *set,
3583 struct radv_descriptor_set_layout *layout,
3584 VkPipelineBindPoint bind_point)
3585 {
3586 struct radv_descriptor_state *descriptors_state =
3587 radv_get_descriptors_state(cmd_buffer, bind_point);
3588 set->size = layout->size;
3589 set->layout = layout;
3590
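	/* Grow the CPU-side push descriptor storage geometrically (at least
	 * 1 KiB, at least doubling) and cap it at the worst case of
	 * MAX_PUSH_DESCRIPTORS descriptors, assuming 96 bytes is the largest
	 * single descriptor size.
	 */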
3591 if (descriptors_state->push_set.capacity < set->size) {
3592 size_t new_size = MAX2(set->size, 1024);
3593 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
3594 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
3595
3596 free(set->mapped_ptr);
3597 set->mapped_ptr = malloc(new_size);
3598
3599 if (!set->mapped_ptr) {
3600 descriptors_state->push_set.capacity = 0;
3601 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3602 return false;
3603 }
3604
3605 descriptors_state->push_set.capacity = new_size;
3606 }
3607
3608 return true;
3609 }
3610
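/* Variant of vkCmdPushDescriptorSetKHR() used internally by the meta
 * operations (blits, clears, resolves, ...): the descriptor memory is
 * sub-allocated from the command buffer's upload buffer instead of the
 * per-bind-point push_set storage used for application pushes.
 */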
3611 void radv_meta_push_descriptor_set(
3612 struct radv_cmd_buffer* cmd_buffer,
3613 VkPipelineBindPoint pipelineBindPoint,
3614 VkPipelineLayout _layout,
3615 uint32_t set,
3616 uint32_t descriptorWriteCount,
3617 const VkWriteDescriptorSet* pDescriptorWrites)
3618 {
3619 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3620 struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
3621 unsigned bo_offset;
3622
3623 assert(set == 0);
3624 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3625
3626 push_set->size = layout->set[set].layout->size;
3627 push_set->layout = layout->set[set].layout;
3628
3629 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
3630 &bo_offset,
3631 (void**) &push_set->mapped_ptr))
3632 return;
3633
3634 push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3635 push_set->va += bo_offset;
3636
3637 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3638 radv_descriptor_set_to_handle(push_set),
3639 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3640
3641 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3642 }
3643
3644 void radv_CmdPushDescriptorSetKHR(
3645 VkCommandBuffer commandBuffer,
3646 VkPipelineBindPoint pipelineBindPoint,
3647 VkPipelineLayout _layout,
3648 uint32_t set,
3649 uint32_t descriptorWriteCount,
3650 const VkWriteDescriptorSet* pDescriptorWrites)
3651 {
3652 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3653 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3654 struct radv_descriptor_state *descriptors_state =
3655 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3656 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3657
3658 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3659
3660 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3661 layout->set[set].layout,
3662 pipelineBindPoint))
3663 return;
3664
3665 	/* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR(),
3666 	 * because they are invalid according to the Vulkan spec.
3667 	 */
3668 for (int i = 0; i < descriptorWriteCount; i++) {
3669 ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
3670 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
3671 }
3672
3673 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3674 radv_descriptor_set_to_handle(push_set),
3675 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3676
3677 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3678 descriptors_state->push_dirty = true;
3679 }
3680
3681 void radv_CmdPushDescriptorSetWithTemplateKHR(
3682 VkCommandBuffer commandBuffer,
3683 VkDescriptorUpdateTemplate descriptorUpdateTemplate,
3684 VkPipelineLayout _layout,
3685 uint32_t set,
3686 const void* pData)
3687 {
3688 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3689 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3690 RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
3691 struct radv_descriptor_state *descriptors_state =
3692 radv_get_descriptors_state(cmd_buffer, templ->bind_point);
3693 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3694
3695 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3696
3697 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3698 layout->set[set].layout,
3699 templ->bind_point))
3700 return;
3701
3702 radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
3703 descriptorUpdateTemplate, pData);
3704
3705 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
3706 descriptors_state->push_dirty = true;
3707 }
3708
3709 void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
3710 VkPipelineLayout layout,
3711 VkShaderStageFlags stageFlags,
3712 uint32_t offset,
3713 uint32_t size,
3714 const void* pValues)
3715 {
3716 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
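	/* The values are only copied into the CPU-side shadow here; the
	 * actual upload and user SGPR emission happen lazily before the next
	 * draw or dispatch, driven by push_constant_stages.
	 */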
3717 memcpy(cmd_buffer->push_constants + offset, pValues, size);
3718 cmd_buffer->push_constant_stages |= stageFlags;
3719 }
3720
3721 VkResult radv_EndCommandBuffer(
3722 VkCommandBuffer commandBuffer)
3723 {
3724 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3725
3726 if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
3727 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
3728 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
3729
3730 		/* Make sure to sync all pending active queries at the end of
3731 		 * the command buffer.
3732 		 */
3733 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
3734
3735 /* Since NGG streamout uses GDS, we need to make GDS idle when
3736 * we leave the IB, otherwise another process might overwrite
3737 * it while our shaders are busy.
3738 */
3739 if (cmd_buffer->gds_needed)
3740 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
3741
3742 si_emit_cache_flush(cmd_buffer);
3743 }
3744
3745 /* Make sure CP DMA is idle at the end of IBs because the kernel
3746 * doesn't wait for it.
3747 */
3748 si_cp_dma_wait_for_idle(cmd_buffer);
3749
3750 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
3751 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
3752
3753 if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
3754 return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
3755
3756 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
3757
3758 return cmd_buffer->record_result;
3759 }
3760
3761 static void
3762 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
3763 {
3764 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
3765
3766 if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
3767 return;
3768
3769 assert(!pipeline->ctx_cs.cdw);
3770
3771 cmd_buffer->state.emitted_compute_pipeline = pipeline;
3772
3773 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
3774 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
3775
3776 cmd_buffer->compute_scratch_size_per_wave_needed = MAX2(cmd_buffer->compute_scratch_size_per_wave_needed,
3777 pipeline->scratch_bytes_per_wave);
3778 cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted,
3779 pipeline->max_waves);
3780
3781 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3782 pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
3783
3784 if (unlikely(cmd_buffer->device->trace_bo))
3785 radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
3786 }
3787
3788 static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
3789 VkPipelineBindPoint bind_point)
3790 {
3791 struct radv_descriptor_state *descriptors_state =
3792 radv_get_descriptors_state(cmd_buffer, bind_point);
3793
3794 descriptors_state->dirty |= descriptors_state->valid;
3795 }
3796
3797 void radv_CmdBindPipeline(
3798 VkCommandBuffer commandBuffer,
3799 VkPipelineBindPoint pipelineBindPoint,
3800 VkPipeline _pipeline)
3801 {
3802 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3803 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
3804
3805 switch (pipelineBindPoint) {
3806 case VK_PIPELINE_BIND_POINT_COMPUTE:
3807 if (cmd_buffer->state.compute_pipeline == pipeline)
3808 return;
3809 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3810
3811 cmd_buffer->state.compute_pipeline = pipeline;
3812 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3813 break;
3814 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3815 if (cmd_buffer->state.pipeline == pipeline)
3816 return;
3817 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3818
3819 cmd_buffer->state.pipeline = pipeline;
3820 if (!pipeline)
3821 break;
3822
3823 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
3824 cmd_buffer->push_constant_stages |= pipeline->active_stages;
3825
3826 /* the new vertex shader might not have the same user regs */
3827 cmd_buffer->state.last_first_instance = -1;
3828 cmd_buffer->state.last_vertex_offset = -1;
3829
3830 /* Prefetch all pipeline shaders at first draw time. */
3831 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
3832
3833 if ((cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI10 ||
3834 cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI12 ||
3835 cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI14) &&
3836 cmd_buffer->state.emitted_pipeline &&
3837 radv_pipeline_has_ngg(cmd_buffer->state.emitted_pipeline) &&
3838 !radv_pipeline_has_ngg(cmd_buffer->state.pipeline)) {
3839 /* Transitioning from NGG to legacy GS requires
3840 * VGT_FLUSH on Navi10-14. VGT_FLUSH is also emitted
3841 * at the beginning of IBs when legacy GS ring pointers
3842 * are set.
3843 */
3844 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
3845 }
3846
3847 radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
3848 radv_bind_streamout_state(cmd_buffer, pipeline);
3849
3850 if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
3851 cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
3852 if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
3853 cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
3854
3855 if (radv_pipeline_has_tess(pipeline))
3856 cmd_buffer->tess_rings_needed = true;
3857 break;
3858 default:
3859 assert(!"invalid bind point");
3860 break;
3861 }
3862 }
3863
3864 void radv_CmdSetViewport(
3865 VkCommandBuffer commandBuffer,
3866 uint32_t firstViewport,
3867 uint32_t viewportCount,
3868 const VkViewport* pViewports)
3869 {
3870 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3871 struct radv_cmd_state *state = &cmd_buffer->state;
3872 ASSERTED const uint32_t total_count = firstViewport + viewportCount;
3873
3874 assert(firstViewport < MAX_VIEWPORTS);
3875 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
3876
3877 if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
3878 pViewports, viewportCount * sizeof(*pViewports))) {
3879 return;
3880 }
3881
3882 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
3883 viewportCount * sizeof(*pViewports));
3884
3885 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
3886 }
3887
3888 void radv_CmdSetScissor(
3889 VkCommandBuffer commandBuffer,
3890 uint32_t firstScissor,
3891 uint32_t scissorCount,
3892 const VkRect2D* pScissors)
3893 {
3894 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3895 struct radv_cmd_state *state = &cmd_buffer->state;
3896 ASSERTED const uint32_t total_count = firstScissor + scissorCount;
3897
3898 assert(firstScissor < MAX_SCISSORS);
3899 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
3900
3901 if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
3902 scissorCount * sizeof(*pScissors))) {
3903 return;
3904 }
3905
3906 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
3907 scissorCount * sizeof(*pScissors));
3908
3909 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
3910 }
3911
3912 void radv_CmdSetLineWidth(
3913 VkCommandBuffer commandBuffer,
3914 float lineWidth)
3915 {
3916 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3917
3918 if (cmd_buffer->state.dynamic.line_width == lineWidth)
3919 return;
3920
3921 cmd_buffer->state.dynamic.line_width = lineWidth;
3922 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
3923 }
3924
3925 void radv_CmdSetDepthBias(
3926 VkCommandBuffer commandBuffer,
3927 float depthBiasConstantFactor,
3928 float depthBiasClamp,
3929 float depthBiasSlopeFactor)
3930 {
3931 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3932 struct radv_cmd_state *state = &cmd_buffer->state;
3933
3934 if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
3935 state->dynamic.depth_bias.clamp == depthBiasClamp &&
3936 state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
3937 return;
3938 }
3939
3940 state->dynamic.depth_bias.bias = depthBiasConstantFactor;
3941 state->dynamic.depth_bias.clamp = depthBiasClamp;
3942 state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
3943
3944 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
3945 }
3946
3947 void radv_CmdSetBlendConstants(
3948 VkCommandBuffer commandBuffer,
3949 const float blendConstants[4])
3950 {
3951 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3952 struct radv_cmd_state *state = &cmd_buffer->state;
3953
3954 if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
3955 return;
3956
3957 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
3958
3959 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
3960 }
3961
3962 void radv_CmdSetDepthBounds(
3963 VkCommandBuffer commandBuffer,
3964 float minDepthBounds,
3965 float maxDepthBounds)
3966 {
3967 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3968 struct radv_cmd_state *state = &cmd_buffer->state;
3969
3970 if (state->dynamic.depth_bounds.min == minDepthBounds &&
3971 state->dynamic.depth_bounds.max == maxDepthBounds) {
3972 return;
3973 }
3974
3975 state->dynamic.depth_bounds.min = minDepthBounds;
3976 state->dynamic.depth_bounds.max = maxDepthBounds;
3977
3978 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
3979 }
3980
3981 void radv_CmdSetStencilCompareMask(
3982 VkCommandBuffer commandBuffer,
3983 VkStencilFaceFlags faceMask,
3984 uint32_t compareMask)
3985 {
3986 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3987 struct radv_cmd_state *state = &cmd_buffer->state;
3988 bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
3989 bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
3990
3991 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3992 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3993 return;
3994 }
3995
3996 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3997 state->dynamic.stencil_compare_mask.front = compareMask;
3998 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3999 state->dynamic.stencil_compare_mask.back = compareMask;
4000
4001 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
4002 }
4003
4004 void radv_CmdSetStencilWriteMask(
4005 VkCommandBuffer commandBuffer,
4006 VkStencilFaceFlags faceMask,
4007 uint32_t writeMask)
4008 {
4009 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4010 struct radv_cmd_state *state = &cmd_buffer->state;
4011 bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
4012 bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
4013
4014 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4015 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
4016 return;
4017 }
4018
4019 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4020 state->dynamic.stencil_write_mask.front = writeMask;
4021 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4022 state->dynamic.stencil_write_mask.back = writeMask;
4023
4024 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
4025 }
4026
4027 void radv_CmdSetStencilReference(
4028 VkCommandBuffer commandBuffer,
4029 VkStencilFaceFlags faceMask,
4030 uint32_t reference)
4031 {
4032 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4033 struct radv_cmd_state *state = &cmd_buffer->state;
4034 bool front_same = state->dynamic.stencil_reference.front == reference;
4035 bool back_same = state->dynamic.stencil_reference.back == reference;
4036
4037 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4038 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
4039 return;
4040 }
4041
4042 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4043 cmd_buffer->state.dynamic.stencil_reference.front = reference;
4044 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4045 cmd_buffer->state.dynamic.stencil_reference.back = reference;
4046
4047 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
4048 }
4049
4050 void radv_CmdSetDiscardRectangleEXT(
4051 VkCommandBuffer commandBuffer,
4052 uint32_t firstDiscardRectangle,
4053 uint32_t discardRectangleCount,
4054 const VkRect2D* pDiscardRectangles)
4055 {
4056 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4057 struct radv_cmd_state *state = &cmd_buffer->state;
4058 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
4059
4060 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
4061 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
4062
4063 if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
4064 pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
4065 return;
4066 }
4067
4068 typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
4069 pDiscardRectangles, discardRectangleCount);
4070
4071 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
4072 }
4073
4074 void radv_CmdSetSampleLocationsEXT(
4075 VkCommandBuffer commandBuffer,
4076 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
4077 {
4078 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4079 struct radv_cmd_state *state = &cmd_buffer->state;
4080
4081 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
4082
4083 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
4084 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
4085 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
4086 typed_memcpy(&state->dynamic.sample_location.locations[0],
4087 pSampleLocationsInfo->pSampleLocations,
4088 pSampleLocationsInfo->sampleLocationsCount);
4089
4090 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
4091 }
4092
4093 void radv_CmdExecuteCommands(
4094 VkCommandBuffer commandBuffer,
4095 uint32_t commandBufferCount,
4096 const VkCommandBuffer* pCmdBuffers)
4097 {
4098 RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
4099
4100 assert(commandBufferCount > 0);
4101
4102 /* Emit pending flushes on primary prior to executing secondary */
4103 si_emit_cache_flush(primary);
4104
4105 for (uint32_t i = 0; i < commandBufferCount; i++) {
4106 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
4107
4108 primary->scratch_size_per_wave_needed = MAX2(primary->scratch_size_per_wave_needed,
4109 secondary->scratch_size_per_wave_needed);
4110 primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted,
4111 secondary->scratch_waves_wanted);
4112 primary->compute_scratch_size_per_wave_needed = MAX2(primary->compute_scratch_size_per_wave_needed,
4113 secondary->compute_scratch_size_per_wave_needed);
4114 primary->compute_scratch_waves_wanted = MAX2(primary->compute_scratch_waves_wanted,
4115 secondary->compute_scratch_waves_wanted);
4116
4117 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
4118 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
4119 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
4120 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
4121 if (secondary->tess_rings_needed)
4122 primary->tess_rings_needed = true;
4123 if (secondary->sample_positions_needed)
4124 primary->sample_positions_needed = true;
4125 if (secondary->gds_needed)
4126 primary->gds_needed = true;
4127
4128 if (!secondary->state.framebuffer &&
4129 (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
4130 			/* Emit the framebuffer state from the primary if the
4131 			 * secondary has been recorded without a framebuffer;
4132 			 * otherwise fast color/depth clears can't work.
4133 			 */
4134 radv_emit_framebuffer_state(primary);
4135 }
4136
4137 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
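		/* The winsys takes care of appending the secondary IB to the
		 * primary one, typically by chaining it or falling back to
		 * copying its packets, so nothing else is required here.
		 */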
4138
4139
4140 		/* When the secondary command buffer is compute-only, we don't
4141 		 * need to re-emit the current graphics pipeline.
4142 		 */
4143 if (secondary->state.emitted_pipeline) {
4144 primary->state.emitted_pipeline =
4145 secondary->state.emitted_pipeline;
4146 }
4147
4148 		/* When the secondary command buffer is graphics-only, we don't
4149 		 * need to re-emit the current compute pipeline.
4150 		 */
4151 if (secondary->state.emitted_compute_pipeline) {
4152 primary->state.emitted_compute_pipeline =
4153 secondary->state.emitted_compute_pipeline;
4154 }
4155
4156 /* Only re-emit the draw packets when needed. */
4157 if (secondary->state.last_primitive_reset_en != -1) {
4158 primary->state.last_primitive_reset_en =
4159 secondary->state.last_primitive_reset_en;
4160 }
4161
4162 if (secondary->state.last_primitive_reset_index) {
4163 primary->state.last_primitive_reset_index =
4164 secondary->state.last_primitive_reset_index;
4165 }
4166
4167 if (secondary->state.last_ia_multi_vgt_param) {
4168 primary->state.last_ia_multi_vgt_param =
4169 secondary->state.last_ia_multi_vgt_param;
4170 }
4171
4172 primary->state.last_first_instance = secondary->state.last_first_instance;
4173 primary->state.last_num_instances = secondary->state.last_num_instances;
4174 primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
4175
4176 if (secondary->state.last_index_type != -1) {
4177 primary->state.last_index_type =
4178 secondary->state.last_index_type;
4179 }
4180 }
4181
4182 	/* After executing commands from secondary buffers, we have to mark
4183 	 * some states dirty again.
4184 	 */
4185 primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
4186 RADV_CMD_DIRTY_INDEX_BUFFER |
4187 RADV_CMD_DIRTY_DYNAMIC_ALL;
4188 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
4189 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
4190 }
4191
4192 VkResult radv_CreateCommandPool(
4193 VkDevice _device,
4194 const VkCommandPoolCreateInfo* pCreateInfo,
4195 const VkAllocationCallbacks* pAllocator,
4196 VkCommandPool* pCmdPool)
4197 {
4198 RADV_FROM_HANDLE(radv_device, device, _device);
4199 struct radv_cmd_pool *pool;
4200
4201 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
4202 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4203 if (pool == NULL)
4204 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
4205
4206 if (pAllocator)
4207 pool->alloc = *pAllocator;
4208 else
4209 pool->alloc = device->alloc;
4210
4211 list_inithead(&pool->cmd_buffers);
4212 list_inithead(&pool->free_cmd_buffers);
4213
4214 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
4215
4216 *pCmdPool = radv_cmd_pool_to_handle(pool);
4217
4218 return VK_SUCCESS;
4219
4220 }
4221
4222 void radv_DestroyCommandPool(
4223 VkDevice _device,
4224 VkCommandPool commandPool,
4225 const VkAllocationCallbacks* pAllocator)
4226 {
4227 RADV_FROM_HANDLE(radv_device, device, _device);
4228 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4229
4230 if (!pool)
4231 return;
4232
4233 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4234 &pool->cmd_buffers, pool_link) {
4235 radv_cmd_buffer_destroy(cmd_buffer);
4236 }
4237
4238 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4239 &pool->free_cmd_buffers, pool_link) {
4240 radv_cmd_buffer_destroy(cmd_buffer);
4241 }
4242
4243 vk_free2(&device->alloc, pAllocator, pool);
4244 }
4245
4246 VkResult radv_ResetCommandPool(
4247 VkDevice device,
4248 VkCommandPool commandPool,
4249 VkCommandPoolResetFlags flags)
4250 {
4251 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4252 VkResult result;
4253
4254 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
4255 &pool->cmd_buffers, pool_link) {
4256 result = radv_reset_cmd_buffer(cmd_buffer);
4257 if (result != VK_SUCCESS)
4258 return result;
4259 }
4260
4261 return VK_SUCCESS;
4262 }
4263
4264 void radv_TrimCommandPool(
4265 VkDevice device,
4266 VkCommandPool commandPool,
4267 VkCommandPoolTrimFlags flags)
4268 {
4269 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4270
4271 if (!pool)
4272 return;
4273
4274 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4275 &pool->free_cmd_buffers, pool_link) {
4276 radv_cmd_buffer_destroy(cmd_buffer);
4277 }
4278 }
4279
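/* Start the given subpass: apply the subpass start barrier, perform the
 * layout transitions for every attachment used by the subpass and finally
 * execute any pending LOAD_OP_CLEARs.
 */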
4280 static void
4281 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
4282 uint32_t subpass_id)
4283 {
4284 struct radv_cmd_state *state = &cmd_buffer->state;
4285 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
4286
4287 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
4288 cmd_buffer->cs, 4096);
4289
4290 radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
4291
4292 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
4293
4294 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4295 const uint32_t a = subpass->attachments[i].attachment;
4296 if (a == VK_ATTACHMENT_UNUSED)
4297 continue;
4298
4299 radv_handle_subpass_image_transition(cmd_buffer,
4300 subpass->attachments[i],
4301 true);
4302 }
4303
4304 radv_cmd_buffer_clear_subpass(cmd_buffer);
4305
4306 assert(cmd_buffer->cs->cdw <= cdw_max);
4307 }
4308
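/* Finish the current subpass: perform the subpass resolves and transition
 * every attachment whose last use is this subpass to its final layout.
 */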
4309 static void
4310 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
4311 {
4312 struct radv_cmd_state *state = &cmd_buffer->state;
4313 const struct radv_subpass *subpass = state->subpass;
4314 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4315
4316 radv_cmd_buffer_resolve_subpass(cmd_buffer);
4317
4318 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4319 const uint32_t a = subpass->attachments[i].attachment;
4320 if (a == VK_ATTACHMENT_UNUSED)
4321 continue;
4322
4323 if (state->pass->attachments[a].last_subpass_idx != subpass_id)
4324 continue;
4325
4326 VkImageLayout layout = state->pass->attachments[a].final_layout;
4327 VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
4328 struct radv_subpass_attachment att = { a, layout, stencil_layout };
4329 radv_handle_subpass_image_transition(cmd_buffer, att, false);
4330 }
4331 }
4332
4333 void radv_CmdBeginRenderPass(
4334 VkCommandBuffer commandBuffer,
4335 const VkRenderPassBeginInfo* pRenderPassBegin,
4336 VkSubpassContents contents)
4337 {
4338 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4339 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
4340 RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
4341 VkResult result;
4342
4343 cmd_buffer->state.framebuffer = framebuffer;
4344 cmd_buffer->state.pass = pass;
4345 cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
4346
4347 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
4348 if (result != VK_SUCCESS)
4349 return;
4350
4351 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
4352 if (result != VK_SUCCESS)
4353 return;
4354
4355 radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
4356 }
4357
4358 void radv_CmdBeginRenderPass2(
4359 VkCommandBuffer commandBuffer,
4360 const VkRenderPassBeginInfo* pRenderPassBeginInfo,
4361 const VkSubpassBeginInfo* pSubpassBeginInfo)
4362 {
4363 radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
4364 pSubpassBeginInfo->contents);
4365 }
4366
4367 void radv_CmdNextSubpass(
4368 VkCommandBuffer commandBuffer,
4369 VkSubpassContents contents)
4370 {
4371 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4372
4373 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
4374 radv_cmd_buffer_end_subpass(cmd_buffer);
4375 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
4376 }
4377
4378 void radv_CmdNextSubpass2(
4379 VkCommandBuffer commandBuffer,
4380 const VkSubpassBeginInfo* pSubpassBeginInfo,
4381 const VkSubpassEndInfo* pSubpassEndInfo)
4382 {
4383 radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
4384 }
4385
4386 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
4387 {
4388 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
4389 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
4390 if (!radv_get_shader(pipeline, stage))
4391 continue;
4392
4393 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
4394 if (loc->sgpr_idx == -1)
4395 continue;
4396 uint32_t base_reg = pipeline->user_data_0[stage];
4397 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
4398
4399 }
4400 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
4401 struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
4402 if (loc->sgpr_idx != -1) {
4403 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
4404 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
4405 }
4406 }
4407 }
4408
4409 static void
4410 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4411 uint32_t vertex_count,
4412 bool use_opaque)
4413 {
4414 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
4415 radeon_emit(cmd_buffer->cs, vertex_count);
4416 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
4417 S_0287F0_USE_OPAQUE(use_opaque));
4418 }
4419
4420 static void
4421 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
4422 uint64_t index_va,
4423 uint32_t index_count)
4424 {
4425 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
4426 radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
4427 radeon_emit(cmd_buffer->cs, index_va);
4428 radeon_emit(cmd_buffer->cs, index_va >> 32);
4429 radeon_emit(cmd_buffer->cs, index_count);
4430 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
4431 }
4432
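/* Emit either a single DRAW_(INDEX_)INDIRECT packet or the _MULTI variant.
 * The _MULTI form is needed whenever there is more than one draw, a GPU-side
 * draw count (count_va), or a vertex shader that consumes gl_DrawID, since
 * only that packet can update the draw index user SGPR.
 */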
4433 static void
4434 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4435 bool indexed,
4436 uint32_t draw_count,
4437 uint64_t count_va,
4438 uint32_t stride)
4439 {
4440 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4441 unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
4442 : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
4443 bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
4444 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
4445 bool predicating = cmd_buffer->state.predicating;
4446 assert(base_reg);
4447
4448 /* just reset draw state for vertex data */
4449 cmd_buffer->state.last_first_instance = -1;
4450 cmd_buffer->state.last_num_instances = -1;
4451 cmd_buffer->state.last_vertex_offset = -1;
4452
4453 if (draw_count == 1 && !count_va && !draw_id_enable) {
4454 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
4455 PKT3_DRAW_INDIRECT, 3, predicating));
4456 radeon_emit(cs, 0);
4457 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4458 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4459 radeon_emit(cs, di_src_sel);
4460 } else {
4461 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
4462 PKT3_DRAW_INDIRECT_MULTI,
4463 8, predicating));
4464 radeon_emit(cs, 0);
4465 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4466 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4467 radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
4468 S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
4469 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
4470 radeon_emit(cs, draw_count); /* count */
4471 radeon_emit(cs, count_va); /* count_addr */
4472 radeon_emit(cs, count_va >> 32);
4473 radeon_emit(cs, stride); /* stride */
4474 radeon_emit(cs, di_src_sel);
4475 }
4476 }
4477
4478 static void
4479 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
4480 const struct radv_draw_info *info)
4481 {
4482 struct radv_cmd_state *state = &cmd_buffer->state;
4483 struct radeon_winsys *ws = cmd_buffer->device->ws;
4484 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4485
4486 if (info->indirect) {
4487 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4488 uint64_t count_va = 0;
4489
4490 va += info->indirect->offset + info->indirect_offset;
4491
4492 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4493
4494 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
4495 radeon_emit(cs, 1);
4496 radeon_emit(cs, va);
4497 radeon_emit(cs, va >> 32);
4498
4499 if (info->count_buffer) {
4500 count_va = radv_buffer_get_va(info->count_buffer->bo);
4501 count_va += info->count_buffer->offset +
4502 info->count_buffer_offset;
4503
4504 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
4505 }
4506
4507 if (!state->subpass->view_mask) {
4508 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4509 info->indexed,
4510 info->count,
4511 count_va,
4512 info->stride);
4513 } else {
4514 unsigned i;
4515 for_each_bit(i, state->subpass->view_mask) {
4516 radv_emit_view_index(cmd_buffer, i);
4517
4518 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4519 info->indexed,
4520 info->count,
4521 count_va,
4522 info->stride);
4523 }
4524 }
4525 } else {
4526 assert(state->pipeline->graphics.vtx_base_sgpr);
4527
4528 if (info->vertex_offset != state->last_vertex_offset ||
4529 info->first_instance != state->last_first_instance) {
4530 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
4531 state->pipeline->graphics.vtx_emit_num);
4532
4533 radeon_emit(cs, info->vertex_offset);
4534 radeon_emit(cs, info->first_instance);
4535 if (state->pipeline->graphics.vtx_emit_num == 3)
4536 radeon_emit(cs, 0);
4537 state->last_first_instance = info->first_instance;
4538 state->last_vertex_offset = info->vertex_offset;
4539 }
4540
4541 if (state->last_num_instances != info->instance_count) {
4542 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
4543 radeon_emit(cs, info->instance_count);
4544 state->last_num_instances = info->instance_count;
4545 }
4546
4547 if (info->indexed) {
4548 int index_size = radv_get_vgt_index_size(state->index_type);
4549 uint64_t index_va;
4550
4551 /* Skip draw calls with 0-sized index buffers. They
4552 * cause a hang on some chips, like Navi10-14.
4553 */
4554 if (!cmd_buffer->state.max_index_count)
4555 return;
4556
4557 index_va = state->index_va;
4558 index_va += info->first_index * index_size;
4559
4560 if (!state->subpass->view_mask) {
4561 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4562 index_va,
4563 info->count);
4564 } else {
4565 unsigned i;
4566 for_each_bit(i, state->subpass->view_mask) {
4567 radv_emit_view_index(cmd_buffer, i);
4568
4569 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4570 index_va,
4571 info->count);
4572 }
4573 }
4574 } else {
4575 if (!state->subpass->view_mask) {
4576 radv_cs_emit_draw_packet(cmd_buffer,
4577 info->count,
4578 !!info->strmout_buffer);
4579 } else {
4580 unsigned i;
4581 for_each_bit(i, state->subpass->view_mask) {
4582 radv_emit_view_index(cmd_buffer, i);
4583
4584 radv_cs_emit_draw_packet(cmd_buffer,
4585 info->count,
4586 !!info->strmout_buffer);
4587 }
4588 }
4589 }
4590 }
4591 }
4592
4593 /*
4594  * Vega and Raven have a bug that triggers if there are multiple context
4595  * register contexts active at the same time with different scissor values.
4596  *
4597  * There are two possible workarounds:
4598  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
4599  *    there is only ever one active set of scissor values at the same time.
4600  *
4601  * 2) Whenever the hardware switches contexts we have to set the scissor
4602  *    registers again, even if it is a no-op. That way the new context gets
4603  *    the correct scissor values.
4604 *
4605 * This implements option 2. radv_need_late_scissor_emission needs to
4606 * return true on affected HW if radv_emit_all_graphics_states sets
4607 * any context registers.
4608 */
4609 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
4610 const struct radv_draw_info *info)
4611 {
4612 struct radv_cmd_state *state = &cmd_buffer->state;
4613
4614 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
4615 return false;
4616
4617 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
4618 return true;
4619
4620 uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
4621
4622 	/* Index, vertex and streamout buffers don't change context regs, and
4623 	 * the pipeline is already handled.
4624 	 */
4625 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
4626 RADV_CMD_DIRTY_VERTEX_BUFFER |
4627 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
4628 RADV_CMD_DIRTY_PIPELINE);
4629
4630 if (cmd_buffer->state.dirty & used_states)
4631 return true;
4632
4633 uint32_t primitive_reset_index =
4634 radv_get_primitive_reset_index(cmd_buffer);
4635
4636 if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
4637 primitive_reset_index != state->last_primitive_reset_index)
4638 return true;
4639
4640 return false;
4641 }
4642
4643 static void
4644 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
4645 const struct radv_draw_info *info)
4646 {
4647 bool late_scissor_emission;
4648
4649 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
4650 cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
4651 radv_emit_rbplus_state(cmd_buffer);
4652
4653 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
4654 radv_emit_graphics_pipeline(cmd_buffer);
4655
4656 	/* This must happen before cmd_buffer->state.dirty is cleared
4657 	 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
4658 	 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
4659 late_scissor_emission =
4660 radv_need_late_scissor_emission(cmd_buffer, info);
4661
4662 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
4663 radv_emit_framebuffer_state(cmd_buffer);
4664
4665 if (info->indexed) {
4666 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
4667 radv_emit_index_buffer(cmd_buffer);
4668 } else {
4669 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
4670 * so the state must be re-emitted before the next indexed
4671 * draw.
4672 */
4673 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
4674 cmd_buffer->state.last_index_type = -1;
4675 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
4676 }
4677 }
4678
4679 radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
4680
4681 radv_emit_draw_registers(cmd_buffer, info);
4682
4683 if (late_scissor_emission)
4684 radv_emit_scissor(cmd_buffer);
4685 }
4686
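/* Common draw path for all vkCmdDraw*() entry points. The packet order
 * depends on whether a pipeline sync is pending: if the CUs have to go idle
 * anyway, all state is emitted first so the SET packets overlap previous
 * work; otherwise prefetches are kicked off first so the draw can start as
 * early as possible.
 */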
4687 static void
4688 radv_draw(struct radv_cmd_buffer *cmd_buffer,
4689 const struct radv_draw_info *info)
4690 {
4691 struct radeon_info *rad_info =
4692 &cmd_buffer->device->physical_device->rad_info;
4693 bool has_prefetch =
4694 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
4695 bool pipeline_is_dirty =
4696 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
4697 cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
4698
4699 ASSERTED unsigned cdw_max =
4700 radeon_check_space(cmd_buffer->device->ws,
4701 cmd_buffer->cs, 4096);
4702
4703 if (likely(!info->indirect)) {
4704 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
4705 * no workaround for indirect draws, but we can at least skip
4706 * direct draws.
4707 */
4708 if (unlikely(!info->instance_count))
4709 return;
4710
4711 /* Handle count == 0. */
4712 if (unlikely(!info->count && !info->strmout_buffer))
4713 return;
4714 }
4715
4716 /* Use optimal packet order based on whether we need to sync the
4717 * pipeline.
4718 */
4719 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4720 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4721 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
4722 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
4723 		/* If we have to wait for idle, set all states first, so that
4724 		 * all SET packets are processed in parallel with previous draw
4725 		 * calls. Then upload descriptors, set shader pointers, draw,
4726 		 * and prefetch at the end. This ensures that the time the CUs
4727 		 * are idle is very short (there are only SET_SH packets
4728 		 * between the wait and the draw).
4729 		 */
4730 radv_emit_all_graphics_states(cmd_buffer, info);
4731 si_emit_cache_flush(cmd_buffer);
4732 /* <-- CUs are idle here --> */
4733
4734 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4735
4736 radv_emit_draw_packets(cmd_buffer, info);
4737 /* <-- CUs are busy here --> */
4738
4739 /* Start prefetches after the draw has been started. Both will
4740 * run in parallel, but starting the draw first is more
4741 * important.
4742 */
4743 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4744 radv_emit_prefetch_L2(cmd_buffer,
4745 cmd_buffer->state.pipeline, false);
4746 }
4747 } else {
4748 /* If we don't wait for idle, start prefetches first, then set
4749 * states, and draw at the end.
4750 */
4751 si_emit_cache_flush(cmd_buffer);
4752
4753 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4754 /* Only prefetch the vertex shader and VBO descriptors
4755 * in order to start the draw as soon as possible.
4756 */
4757 radv_emit_prefetch_L2(cmd_buffer,
4758 cmd_buffer->state.pipeline, true);
4759 }
4760
4761 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4762
4763 radv_emit_all_graphics_states(cmd_buffer, info);
4764 radv_emit_draw_packets(cmd_buffer, info);
4765
4766 /* Prefetch the remaining shaders after the draw has been
4767 * started.
4768 */
4769 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4770 radv_emit_prefetch_L2(cmd_buffer,
4771 cmd_buffer->state.pipeline, false);
4772 }
4773 }
4774
4775 /* Workaround for a VGT hang when streamout is enabled.
4776 * It must be done after drawing.
4777 */
4778 if (cmd_buffer->state.streamout.streamout_enabled &&
4779 (rad_info->family == CHIP_HAWAII ||
4780 rad_info->family == CHIP_TONGA ||
4781 rad_info->family == CHIP_FIJI)) {
4782 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
4783 }
4784
4785 assert(cmd_buffer->cs->cdw <= cdw_max);
4786 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
4787 }
4788
4789 void radv_CmdDraw(
4790 VkCommandBuffer commandBuffer,
4791 uint32_t vertexCount,
4792 uint32_t instanceCount,
4793 uint32_t firstVertex,
4794 uint32_t firstInstance)
4795 {
4796 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4797 struct radv_draw_info info = {};
4798
4799 info.count = vertexCount;
4800 info.instance_count = instanceCount;
4801 info.first_instance = firstInstance;
4802 info.vertex_offset = firstVertex;
4803
4804 radv_draw(cmd_buffer, &info);
4805 }
4806
4807 void radv_CmdDrawIndexed(
4808 VkCommandBuffer commandBuffer,
4809 uint32_t indexCount,
4810 uint32_t instanceCount,
4811 uint32_t firstIndex,
4812 int32_t vertexOffset,
4813 uint32_t firstInstance)
4814 {
4815 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4816 struct radv_draw_info info = {};
4817
4818 info.indexed = true;
4819 info.count = indexCount;
4820 info.instance_count = instanceCount;
4821 info.first_index = firstIndex;
4822 info.vertex_offset = vertexOffset;
4823 info.first_instance = firstInstance;
4824
4825 radv_draw(cmd_buffer, &info);
4826 }
4827
4828 void radv_CmdDrawIndirect(
4829 VkCommandBuffer commandBuffer,
4830 VkBuffer _buffer,
4831 VkDeviceSize offset,
4832 uint32_t drawCount,
4833 uint32_t stride)
4834 {
4835 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4836 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4837 struct radv_draw_info info = {};
4838
4839 info.count = drawCount;
4840 info.indirect = buffer;
4841 info.indirect_offset = offset;
4842 info.stride = stride;
4843
4844 radv_draw(cmd_buffer, &info);
4845 }
4846
4847 void radv_CmdDrawIndexedIndirect(
4848 VkCommandBuffer commandBuffer,
4849 VkBuffer _buffer,
4850 VkDeviceSize offset,
4851 uint32_t drawCount,
4852 uint32_t stride)
4853 {
4854 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4855 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4856 struct radv_draw_info info = {};
4857
4858 info.indexed = true;
4859 info.count = drawCount;
4860 info.indirect = buffer;
4861 info.indirect_offset = offset;
4862 info.stride = stride;
4863
4864 radv_draw(cmd_buffer, &info);
4865 }
4866
4867 void radv_CmdDrawIndirectCount(
4868 VkCommandBuffer commandBuffer,
4869 VkBuffer _buffer,
4870 VkDeviceSize offset,
4871 VkBuffer _countBuffer,
4872 VkDeviceSize countBufferOffset,
4873 uint32_t maxDrawCount,
4874 uint32_t stride)
4875 {
4876 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4877 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4878 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4879 struct radv_draw_info info = {};
4880
4881 info.count = maxDrawCount;
4882 info.indirect = buffer;
4883 info.indirect_offset = offset;
4884 info.count_buffer = count_buffer;
4885 info.count_buffer_offset = countBufferOffset;
4886 info.stride = stride;
4887
4888 radv_draw(cmd_buffer, &info);
4889 }
4890
4891 void radv_CmdDrawIndexedIndirectCount(
4892 VkCommandBuffer commandBuffer,
4893 VkBuffer _buffer,
4894 VkDeviceSize offset,
4895 VkBuffer _countBuffer,
4896 VkDeviceSize countBufferOffset,
4897 uint32_t maxDrawCount,
4898 uint32_t stride)
4899 {
4900 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4901 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4902 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4903 struct radv_draw_info info = {};
4904
4905 info.indexed = true;
4906 info.count = maxDrawCount;
4907 info.indirect = buffer;
4908 info.indirect_offset = offset;
4909 info.count_buffer = count_buffer;
4910 info.count_buffer_offset = countBufferOffset;
4911 info.stride = stride;
4912
4913 radv_draw(cmd_buffer, &info);
4914 }
4915
4916 struct radv_dispatch_info {
4917 /**
4918 * Determine the layout of the grid (in block units) to be used.
4919 */
4920 uint32_t blocks[3];
4921
4922 /**
4923 * A starting offset for the grid. If unaligned is set, the offset
4924 * must still be aligned.
4925 */
4926 uint32_t offsets[3];
4927 /**
4928 * Whether it's an unaligned compute dispatch.
4929 */
4930 bool unaligned;
4931
4932 /**
4933 * Indirect compute parameters resource.
4934 */
4935 struct radv_buffer *indirect;
4936 uint64_t indirect_offset;
4937 };
4938
4939 static void
4940 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
4941 const struct radv_dispatch_info *info)
4942 {
4943 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4944 struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
4945 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
4946 struct radeon_winsys *ws = cmd_buffer->device->ws;
4947 bool predicating = cmd_buffer->state.predicating;
4948 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4949 struct radv_userdata_info *loc;
4950
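	/* Location of the grid-size user SGPRs; sgpr_idx is -1 when the
	 * compute shader does not need the number of workgroups.
	 */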
4951 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
4952 AC_UD_CS_GRID_SIZE);
4953
4954 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
4955
4956 if (compute_shader->info.wave_size == 32) {
4957 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
4958 dispatch_initiator |= S_00B800_CS_W32_EN(1);
4959 }
4960
4961 if (info->indirect) {
4962 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4963
4964 va += info->indirect->offset + info->indirect_offset;
4965
4966 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4967
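		/* Copy the grid size from the indirect buffer into the user
		 * SGPRs so the shader still sees the correct workgroup count.
		 */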
4968 if (loc->sgpr_idx != -1) {
4969 for (unsigned i = 0; i < 3; ++i) {
4970 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4971 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
4972 COPY_DATA_DST_SEL(COPY_DATA_REG));
4973 radeon_emit(cs, (va + 4 * i));
4974 radeon_emit(cs, (va + 4 * i) >> 32);
4975 radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
4976 + loc->sgpr_idx * 4) >> 2) + i);
4977 radeon_emit(cs, 0);
4978 }
4979 }
4980
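		/* The compute ring (MEC) takes the indirect address directly
		 * in the dispatch packet, while the gfx ring needs a SET_BASE
		 * first and then dispatches with offset 0.
		 */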
4981 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
4982 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
4983 PKT3_SHADER_TYPE_S(1));
4984 radeon_emit(cs, va);
4985 radeon_emit(cs, va >> 32);
4986 radeon_emit(cs, dispatch_initiator);
4987 } else {
4988 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
4989 PKT3_SHADER_TYPE_S(1));
4990 radeon_emit(cs, 1);
4991 radeon_emit(cs, va);
4992 radeon_emit(cs, va >> 32);
4993
4994 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
4995 PKT3_SHADER_TYPE_S(1));
4996 radeon_emit(cs, 0);
4997 radeon_emit(cs, dispatch_initiator);
4998 }
4999 } else {
5000 unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
5001 unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };
5002
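		/* Unaligned dispatches are expressed in threads: round them up
		 * to full thread groups and program the size of the partial
		 * group at the end of each dimension.
		 */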
5003 if (info->unaligned) {
5004 unsigned *cs_block_size = compute_shader->info.cs.block_size;
5005 unsigned remainder[3];
5006
5007 /* If aligned, these should be an entire block size,
5008 * not 0.
5009 */
5010 remainder[0] = blocks[0] + cs_block_size[0] -
5011 align_u32_npot(blocks[0], cs_block_size[0]);
5012 remainder[1] = blocks[1] + cs_block_size[1] -
5013 align_u32_npot(blocks[1], cs_block_size[1]);
5014 remainder[2] = blocks[2] + cs_block_size[2] -
5015 align_u32_npot(blocks[2], cs_block_size[2]);
5016
5017 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
5018 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
5019 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
5020
5021 for(unsigned i = 0; i < 3; ++i) {
5022 assert(offsets[i] % cs_block_size[i] == 0);
5023 offsets[i] /= cs_block_size[i];
5024 }
5025
5026 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
5027 radeon_emit(cs,
5028 S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
5029 S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
5030 radeon_emit(cs,
5031 S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
5032 S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
5033 radeon_emit(cs,
5034 S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
5035 S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
5036
5037 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
5038 }
5039
5040 if (loc->sgpr_idx != -1) {
5041 assert(loc->num_sgprs == 3);
5042
5043 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
5044 loc->sgpr_idx * 4, 3);
5045 radeon_emit(cs, blocks[0]);
5046 radeon_emit(cs, blocks[1]);
5047 radeon_emit(cs, blocks[2]);
5048 }
5049
5050 if (offsets[0] || offsets[1] || offsets[2]) {
5051 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
5052 radeon_emit(cs, offsets[0]);
5053 radeon_emit(cs, offsets[1]);
5054 radeon_emit(cs, offsets[2]);
5055
5056 /* The blocks in the packet are not counts but end values. */
5057 for (unsigned i = 0; i < 3; ++i)
5058 blocks[i] += offsets[i];
5059 } else {
5060 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
5061 }
5062
5063 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
5064 PKT3_SHADER_TYPE_S(1));
5065 radeon_emit(cs, blocks[0]);
5066 radeon_emit(cs, blocks[1]);
5067 radeon_emit(cs, blocks[2]);
5068 radeon_emit(cs, dispatch_initiator);
5069 }
5070
5071 assert(cmd_buffer->cs->cdw <= cdw_max);
5072 }
5073
5074 static void
5075 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
5076 {
5077 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
5078 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
5079 }
5080
5081 static void
5082 radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
5083 const struct radv_dispatch_info *info)
5084 {
5085 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
5086 bool has_prefetch =
5087 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
5088 bool pipeline_is_dirty = pipeline &&
5089 pipeline != cmd_buffer->state.emitted_compute_pipeline;
5090
5091 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5092 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5093 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
5094 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
5095 /* If we have to wait for idle, set all states first, so that
5096 * all SET packets are processed in parallel with previous draw
5097 * calls. Then upload descriptors, set shader pointers, and
5098 * dispatch, and prefetch at the end. This ensures that the
5099 * time the CUs are idle is very short. (there are only SET_SH
5100 * packets between the wait and the draw)
5101 */
5102 radv_emit_compute_pipeline(cmd_buffer);
5103 si_emit_cache_flush(cmd_buffer);
5104 /* <-- CUs are idle here --> */
5105
5106 radv_upload_compute_shader_descriptors(cmd_buffer);
5107
5108 radv_emit_dispatch_packets(cmd_buffer, info);
5109 /* <-- CUs are busy here --> */
5110
5111 /* Start prefetches after the dispatch has been started. Both
5112 * will run in parallel, but starting the dispatch first is
5113 * more important.
5114 */
5115 if (has_prefetch && pipeline_is_dirty) {
5116 radv_emit_shader_prefetch(cmd_buffer,
5117 pipeline->shaders[MESA_SHADER_COMPUTE]);
5118 }
5119 } else {
5120 /* If we don't wait for idle, start prefetches first, then set
5121 * states, and dispatch at the end.
5122 */
5123 si_emit_cache_flush(cmd_buffer);
5124
5125 if (has_prefetch && pipeline_is_dirty) {
5126 radv_emit_shader_prefetch(cmd_buffer,
5127 pipeline->shaders[MESA_SHADER_COMPUTE]);
5128 }
5129
5130 radv_upload_compute_shader_descriptors(cmd_buffer);
5131
5132 radv_emit_compute_pipeline(cmd_buffer);
5133 radv_emit_dispatch_packets(cmd_buffer, info);
5134 }
5135
5136 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
5137 }
5138
5139 void radv_CmdDispatchBase(
5140 VkCommandBuffer commandBuffer,
5141 uint32_t base_x,
5142 uint32_t base_y,
5143 uint32_t base_z,
5144 uint32_t x,
5145 uint32_t y,
5146 uint32_t z)
5147 {
5148 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5149 struct radv_dispatch_info info = {};
5150
5151 info.blocks[0] = x;
5152 info.blocks[1] = y;
5153 info.blocks[2] = z;
5154
5155 info.offsets[0] = base_x;
5156 info.offsets[1] = base_y;
5157 info.offsets[2] = base_z;
5158 radv_dispatch(cmd_buffer, &info);
5159 }
5160
5161 void radv_CmdDispatch(
5162 VkCommandBuffer commandBuffer,
5163 uint32_t x,
5164 uint32_t y,
5165 uint32_t z)
5166 {
5167 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
5168 }
5169
5170 void radv_CmdDispatchIndirect(
5171 VkCommandBuffer commandBuffer,
5172 VkBuffer _buffer,
5173 VkDeviceSize offset)
5174 {
5175 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5176 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
5177 struct radv_dispatch_info info = {};
5178
5179 info.indirect = buffer;
5180 info.indirect_offset = offset;
5181
5182 radv_dispatch(cmd_buffer, &info);
5183 }
5184
5185 void radv_unaligned_dispatch(
5186 struct radv_cmd_buffer *cmd_buffer,
5187 uint32_t x,
5188 uint32_t y,
5189 uint32_t z)
5190 {
5191 struct radv_dispatch_info info = {};
5192
5193 info.blocks[0] = x;
5194 info.blocks[1] = y;
5195 info.blocks[2] = z;
5196 info.unaligned = 1;
5197
5198 radv_dispatch(cmd_buffer, &info);
5199 }
5200
5201 void radv_CmdEndRenderPass(
5202 VkCommandBuffer commandBuffer)
5203 {
5204 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5205
5206 radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
5207
5208 radv_cmd_buffer_end_subpass(cmd_buffer);
5209
5210 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
5211 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
5212
5213 cmd_buffer->state.pass = NULL;
5214 cmd_buffer->state.subpass = NULL;
5215 cmd_buffer->state.attachments = NULL;
5216 cmd_buffer->state.framebuffer = NULL;
5217 cmd_buffer->state.subpass_sample_locs = NULL;
5218 }
5219
5220 void radv_CmdEndRenderPass2(
5221 VkCommandBuffer commandBuffer,
5222 const VkSubpassEndInfo* pSubpassEndInfo)
5223 {
5224 radv_CmdEndRenderPass(commandBuffer);
5225 }
5226
5227 /*
5228 * For HTILE we have the following interesting clear words:
5229 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
5230 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
5231 * 0xfffffff0: Clear depth to 1.0
5232 * 0x00000000: Clear depth to 0.0
5233 */
5234 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
5235 struct radv_image *image,
5236 const VkImageSubresourceRange *range)
5237 {
5238 assert(range->baseMipLevel == 0);
5239 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_MIP_LEVELS);
5240 VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
5241 struct radv_cmd_state *state = &cmd_buffer->state;
5242 uint32_t htile_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
5243 VkClearDepthStencilValue value = {};
5244
5245 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5246 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5247
5248 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
5249
5250 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5251
5252 if (vk_format_is_stencil(image->vk_format))
5253 aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5254
5255 radv_set_ds_clear_metadata(cmd_buffer, image, range, value, aspects);
5256
5257 if (radv_image_is_tc_compat_htile(image)) {
5258 		/* Initialize the TC-compat metadata value to 0 because by
5259 		 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
5260 		 * have to conditionally update its value when performing
5261 		 * a fast depth clear.
5262 */
5263 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
5264 }
5265 }
5266
5267 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
5268 struct radv_image *image,
5269 VkImageLayout src_layout,
5270 bool src_render_loop,
5271 VkImageLayout dst_layout,
5272 bool dst_render_loop,
5273 unsigned src_queue_mask,
5274 unsigned dst_queue_mask,
5275 const VkImageSubresourceRange *range,
5276 struct radv_sample_locations_state *sample_locs)
5277 {
5278 if (!radv_image_has_htile(image))
5279 return;
5280
5281 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
5282 radv_initialize_htile(cmd_buffer, image, range);
5283 } else if (!radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) &&
5284 radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5285 radv_initialize_htile(cmd_buffer, image, range);
5286 } else if (radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) &&
5287 !radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5288 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5289 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5290
5291 radv_decompress_depth_image_inplace(cmd_buffer, image, range,
5292 sample_locs);
5293
5294 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5295 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5296 }
5297 }
5298
5299 static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
5300 struct radv_image *image,
5301 const VkImageSubresourceRange *range,
5302 uint32_t value)
5303 {
5304 struct radv_cmd_state *state = &cmd_buffer->state;
5305
5306 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5307 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5308
5309 state->flush_bits |= radv_clear_cmask(cmd_buffer, image, range, value);
5310
5311 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5312 }
5313
5314 void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
5315 struct radv_image *image,
5316 const VkImageSubresourceRange *range)
5317 {
5318 struct radv_cmd_state *state = &cmd_buffer->state;
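	/* FMASK clear values indexed by log2(sample count); these encode the
	 * fully expanded (identity) mapping for 1x, 2x, 4x and 8x MSAA.
	 */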
5319 static const uint32_t fmask_clear_values[4] = {
5320 0x00000000,
5321 0x02020202,
5322 0xE4E4E4E4,
5323 0x76543210
5324 };
5325 uint32_t log2_samples = util_logbase2(image->info.samples);
5326 uint32_t value = fmask_clear_values[log2_samples];
5327
5328 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5329 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5330
5331 state->flush_bits |= radv_clear_fmask(cmd_buffer, image, range, value);
5332
5333 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5334 }
5335
5336 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
5337 struct radv_image *image,
5338 const VkImageSubresourceRange *range, uint32_t value)
5339 {
5340 struct radv_cmd_state *state = &cmd_buffer->state;
5341 unsigned size = 0;
5342
5343 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5344 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5345
5346 state->flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
5347
5348 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
5349 /* When DCC is enabled with mipmaps, some levels might not
5350 * support fast clears and we have to initialize them as "fully
5351 * expanded".
5352 */
5353 /* Compute the size of all fast clearable DCC levels. */
5354 for (unsigned i = 0; i < image->planes[0].surface.num_dcc_levels; i++) {
5355 struct legacy_surf_level *surf_level =
5356 &image->planes[0].surface.u.legacy.level[i];
5357 unsigned dcc_fast_clear_size =
5358 surf_level->dcc_slice_fast_clear_size * image->info.array_size;
5359
5360 if (!dcc_fast_clear_size)
5361 break;
5362
5363 size = surf_level->dcc_offset + dcc_fast_clear_size;
5364 }
5365
5366 /* Initialize the mipmap levels without DCC. */
5367 if (size != image->planes[0].surface.dcc_size) {
5368 state->flush_bits |=
5369 radv_fill_buffer(cmd_buffer, image->bo,
5370 image->offset + image->dcc_offset + size,
5371 image->planes[0].surface.dcc_size - size,
5372 0xffffffff);
5373 }
5374 }
5375
5376 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5377 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5378 }
5379
5380 /**
5381 * Initialize DCC/FMASK/CMASK metadata for a color image.
5382 */
5383 static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
5384 struct radv_image *image,
5385 VkImageLayout src_layout,
5386 bool src_render_loop,
5387 VkImageLayout dst_layout,
5388 bool dst_render_loop,
5389 unsigned src_queue_mask,
5390 unsigned dst_queue_mask,
5391 const VkImageSubresourceRange *range)
5392 {
5393 if (radv_image_has_cmask(image)) {
5394 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5395
5396 /* TODO: clarify this. */
5397 if (radv_image_has_fmask(image)) {
5398 value = 0xccccccccu;
5399 }
5400
5401 radv_initialise_cmask(cmd_buffer, image, range, value);
5402 }
5403
5404 if (radv_image_has_fmask(image)) {
5405 radv_initialize_fmask(cmd_buffer, image, range);
5406 }
5407
5408 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5409 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5410 bool need_decompress_pass = false;
5411
5412 if (radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout,
5413 dst_render_loop,
5414 dst_queue_mask)) {
5415 value = 0x20202020u;
5416 need_decompress_pass = true;
5417 }
5418
5419 radv_initialize_dcc(cmd_buffer, image, range, value);
5420
5421 radv_update_fce_metadata(cmd_buffer, image, range,
5422 need_decompress_pass);
5423 }
5424
5425 if (radv_image_has_cmask(image) ||
5426 radv_dcc_enabled(image, range->baseMipLevel)) {
5427 uint32_t color_values[2] = {};
5428 radv_set_color_clear_metadata(cmd_buffer, image, range,
5429 color_values);
5430 }
5431 }
5432
5433 /**
5434 * Handle color image transitions for DCC/FMASK/CMASK.
5435 */
5436 static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
5437 struct radv_image *image,
5438 VkImageLayout src_layout,
5439 bool src_render_loop,
5440 VkImageLayout dst_layout,
5441 bool dst_render_loop,
5442 unsigned src_queue_mask,
5443 unsigned dst_queue_mask,
5444 const VkImageSubresourceRange *range)
5445 {
5446 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
5447 radv_init_color_image_metadata(cmd_buffer, image,
5448 src_layout, src_render_loop,
5449 dst_layout, dst_render_loop,
5450 src_queue_mask, dst_queue_mask,
5451 range);
5452 return;
5453 }
5454
5455 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5456 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
5457 radv_initialize_dcc(cmd_buffer, image, range, 0xffffffffu);
5458 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, src_layout, src_render_loop, src_queue_mask) &&
5459 !radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout, dst_render_loop, dst_queue_mask)) {
5460 radv_decompress_dcc(cmd_buffer, image, range);
5461 } else if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
5462 !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5463 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5464 }
5465 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
5466 bool fce_eliminate = false, fmask_expand = false;
5467
5468 if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
5469 !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5470 fce_eliminate = true;
5471 }
5472
5473 if (radv_image_has_fmask(image)) {
5474 if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
5475 dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
5476 				/* An FMASK decompress is required before doing
5477 				 * an MSAA decompress using FMASK.
5478 */
5479 fmask_expand = true;
5480 }
5481 }
5482
5483 if (fce_eliminate || fmask_expand)
5484 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5485
5486 if (fmask_expand)
5487 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
5488 }
5489 }
5490
5491 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
5492 struct radv_image *image,
5493 VkImageLayout src_layout,
5494 bool src_render_loop,
5495 VkImageLayout dst_layout,
5496 bool dst_render_loop,
5497 uint32_t src_family,
5498 uint32_t dst_family,
5499 const VkImageSubresourceRange *range,
5500 struct radv_sample_locations_state *sample_locs)
5501 {
5502 if (image->exclusive && src_family != dst_family) {
5503 /* This is an acquire or a release operation and there will be
5504 * a corresponding release/acquire. Do the transition in the
5505 * most flexible queue. */
5506
5507 assert(src_family == cmd_buffer->queue_family_index ||
5508 dst_family == cmd_buffer->queue_family_index);
5509
5510 if (src_family == VK_QUEUE_FAMILY_EXTERNAL ||
5511 src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
5512 return;
5513
5514 if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
5515 return;
5516
5517 if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
5518 (src_family == RADV_QUEUE_GENERAL ||
5519 dst_family == RADV_QUEUE_GENERAL))
5520 return;
5521 }
5522
5523 if (src_layout == dst_layout)
5524 return;
5525
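	/* Compute which queue families may access the image in the old and
	 * new layouts; the layout helpers use this to decide whether
	 * compressed metadata can be kept.
	 */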
5526 unsigned src_queue_mask =
5527 radv_image_queue_family_mask(image, src_family,
5528 cmd_buffer->queue_family_index);
5529 unsigned dst_queue_mask =
5530 radv_image_queue_family_mask(image, dst_family,
5531 cmd_buffer->queue_family_index);
5532
5533 if (vk_format_is_depth(image->vk_format)) {
5534 radv_handle_depth_image_transition(cmd_buffer, image,
5535 src_layout, src_render_loop,
5536 dst_layout, dst_render_loop,
5537 src_queue_mask, dst_queue_mask,
5538 range, sample_locs);
5539 } else {
5540 radv_handle_color_image_transition(cmd_buffer, image,
5541 src_layout, src_render_loop,
5542 dst_layout, dst_render_loop,
5543 src_queue_mask, dst_queue_mask,
5544 range);
5545 }
5546 }
5547
5548 struct radv_barrier_info {
5549 uint32_t eventCount;
5550 const VkEvent *pEvents;
5551 VkPipelineStageFlags srcStageMask;
5552 VkPipelineStageFlags dstStageMask;
5553 };
5554
5555 static void
5556 radv_barrier(struct radv_cmd_buffer *cmd_buffer,
5557 uint32_t memoryBarrierCount,
5558 const VkMemoryBarrier *pMemoryBarriers,
5559 uint32_t bufferMemoryBarrierCount,
5560 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
5561 uint32_t imageMemoryBarrierCount,
5562 const VkImageMemoryBarrier *pImageMemoryBarriers,
5563 const struct radv_barrier_info *info)
5564 {
5565 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5566 enum radv_cmd_flush_bits src_flush_bits = 0;
5567 enum radv_cmd_flush_bits dst_flush_bits = 0;
5568
5569 for (unsigned i = 0; i < info->eventCount; ++i) {
5570 RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
5571 uint64_t va = radv_buffer_get_va(event->bo);
5572
5573 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5574
5575 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
5576
5577 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
5578 assert(cmd_buffer->cs->cdw <= cdw_max);
5579 }
5580
5581 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
5582 src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
5583 NULL);
5584 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
5585 NULL);
5586 }
5587
5588 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
5589 src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
5590 NULL);
5591 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
5592 NULL);
5593 }
5594
5595 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5596 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5597
5598 src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
5599 image);
5600 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
5601 image);
5602 }
5603
5604 /* The Vulkan spec 1.1.98 says:
5605 *
5606 * "An execution dependency with only
5607 * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
5608 * will only prevent that stage from executing in subsequently
5609 * submitted commands. As this stage does not perform any actual
5610 * execution, this is not observable - in effect, it does not delay
5611 * processing of subsequent commands. Similarly an execution dependency
5612 * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
5613 * will effectively not wait for any prior commands to complete."
5614 */
5615 if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
5616 radv_stage_flush(cmd_buffer, info->srcStageMask);
5617 cmd_buffer->state.flush_bits |= src_flush_bits;
5618
5619 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5620 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5621
5622 const struct VkSampleLocationsInfoEXT *sample_locs_info =
5623 vk_find_struct_const(pImageMemoryBarriers[i].pNext,
5624 SAMPLE_LOCATIONS_INFO_EXT);
5625 struct radv_sample_locations_state sample_locations = {};
5626
5627 if (sample_locs_info) {
5628 assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
5629 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
5630 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
5631 sample_locations.count = sample_locs_info->sampleLocationsCount;
5632 typed_memcpy(&sample_locations.locations[0],
5633 sample_locs_info->pSampleLocations,
5634 sample_locs_info->sampleLocationsCount);
5635 }
5636
5637 radv_handle_image_transition(cmd_buffer, image,
5638 pImageMemoryBarriers[i].oldLayout,
5639 false, /* Outside of a renderpass we are never in a renderloop */
5640 pImageMemoryBarriers[i].newLayout,
5641 false, /* Outside of a renderpass we are never in a renderloop */
5642 pImageMemoryBarriers[i].srcQueueFamilyIndex,
5643 pImageMemoryBarriers[i].dstQueueFamilyIndex,
5644 &pImageMemoryBarriers[i].subresourceRange,
5645 sample_locs_info ? &sample_locations : NULL);
5646 }
5647
5648 /* Make sure CP DMA is idle because the driver might have performed a
5649 * DMA operation for copying or filling buffers/images.
5650 */
5651 if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5652 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5653 si_cp_dma_wait_for_idle(cmd_buffer);
5654
5655 cmd_buffer->state.flush_bits |= dst_flush_bits;
5656 }
5657
5658 void radv_CmdPipelineBarrier(
5659 VkCommandBuffer commandBuffer,
5660 VkPipelineStageFlags srcStageMask,
5661 VkPipelineStageFlags destStageMask,
5662 VkBool32 byRegion,
5663 uint32_t memoryBarrierCount,
5664 const VkMemoryBarrier* pMemoryBarriers,
5665 uint32_t bufferMemoryBarrierCount,
5666 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5667 uint32_t imageMemoryBarrierCount,
5668 const VkImageMemoryBarrier* pImageMemoryBarriers)
5669 {
5670 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5671 struct radv_barrier_info info;
5672
5673 info.eventCount = 0;
5674 info.pEvents = NULL;
5675 info.srcStageMask = srcStageMask;
5676 info.dstStageMask = destStageMask;
5677
5678 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5679 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5680 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5681 }
5682
5683
5684 static void write_event(struct radv_cmd_buffer *cmd_buffer,
5685 struct radv_event *event,
5686 VkPipelineStageFlags stageMask,
5687 unsigned value)
5688 {
5689 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5690 uint64_t va = radv_buffer_get_va(event->bo);
5691
5692 si_emit_cache_flush(cmd_buffer);
5693
5694 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5695
5696 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21);
5697
5698 /* Flags that only require a top-of-pipe event. */
5699 VkPipelineStageFlags top_of_pipe_flags =
5700 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
5701
5702 /* Flags that only require a post-index-fetch event. */
5703 VkPipelineStageFlags post_index_fetch_flags =
5704 top_of_pipe_flags |
5705 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
5706 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
5707
5708 /* Make sure CP DMA is idle because the driver might have performed a
5709 * DMA operation for copying or filling buffers/images.
5710 */
5711 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5712 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5713 si_cp_dma_wait_for_idle(cmd_buffer);
5714
5715 /* TODO: Emit EOS events for syncing PS/CS stages. */
5716
5717 if (!(stageMask & ~top_of_pipe_flags)) {
5718 /* Just need to sync the PFP engine. */
5719 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5720 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5721 S_370_WR_CONFIRM(1) |
5722 S_370_ENGINE_SEL(V_370_PFP));
5723 radeon_emit(cs, va);
5724 radeon_emit(cs, va >> 32);
5725 radeon_emit(cs, value);
5726 } else if (!(stageMask & ~post_index_fetch_flags)) {
5727 /* Sync ME because PFP reads index and indirect buffers. */
5728 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5729 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5730 S_370_WR_CONFIRM(1) |
5731 S_370_ENGINE_SEL(V_370_ME));
5732 radeon_emit(cs, va);
5733 radeon_emit(cs, va >> 32);
5734 radeon_emit(cs, value);
5735 } else {
5736 /* Otherwise, sync all prior GPU work using an EOP event. */
5737 si_cs_emit_write_event_eop(cs,
5738 cmd_buffer->device->physical_device->rad_info.chip_class,
5739 radv_cmd_buffer_uses_mec(cmd_buffer),
5740 V_028A90_BOTTOM_OF_PIPE_TS, 0,
5741 EOP_DST_SEL_MEM,
5742 EOP_DATA_SEL_VALUE_32BIT, va, value,
5743 cmd_buffer->gfx9_eop_bug_va);
5744 }
5745
5746 assert(cmd_buffer->cs->cdw <= cdw_max);
5747 }
5748
5749 void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
5750 VkEvent _event,
5751 VkPipelineStageFlags stageMask)
5752 {
5753 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5754 RADV_FROM_HANDLE(radv_event, event, _event);
5755
5756 write_event(cmd_buffer, event, stageMask, 1);
5757 }
5758
5759 void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
5760 VkEvent _event,
5761 VkPipelineStageFlags stageMask)
5762 {
5763 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5764 RADV_FROM_HANDLE(radv_event, event, _event);
5765
5766 write_event(cmd_buffer, event, stageMask, 0);
5767 }
5768
5769 void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
5770 uint32_t eventCount,
5771 const VkEvent* pEvents,
5772 VkPipelineStageFlags srcStageMask,
5773 VkPipelineStageFlags dstStageMask,
5774 uint32_t memoryBarrierCount,
5775 const VkMemoryBarrier* pMemoryBarriers,
5776 uint32_t bufferMemoryBarrierCount,
5777 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5778 uint32_t imageMemoryBarrierCount,
5779 const VkImageMemoryBarrier* pImageMemoryBarriers)
5780 {
5781 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5782 struct radv_barrier_info info;
5783
5784 info.eventCount = eventCount;
5785 info.pEvents = pEvents;
5786 	info.srcStageMask = 0;
	info.dstStageMask = dstStageMask;
5787
5788 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5789 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5790 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5791 }
5792
5793
5794 void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer,
5795 uint32_t deviceMask)
5796 {
5797 /* No-op */
5798 }
5799
5800 /* VK_EXT_conditional_rendering */
5801 void radv_CmdBeginConditionalRenderingEXT(
5802 VkCommandBuffer commandBuffer,
5803 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5804 {
5805 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5806 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
5807 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5808 bool draw_visible = true;
5809 uint64_t pred_value = 0;
5810 uint64_t va, new_va;
5811 unsigned pred_offset;
5812
5813 va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
5814
5815 /* By default, if the 32-bit value at offset in buffer memory is zero,
5816 * then the rendering commands are discarded, otherwise they are
5817 * executed as normal. If the inverted flag is set, all commands are
5818 	 * discarded if the value is non-zero.
5819 */
5820 if (pConditionalRenderingBegin->flags &
5821 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
5822 draw_visible = false;
5823 }
5824
5825 si_emit_cache_flush(cmd_buffer);
5826
5827 /* From the Vulkan spec 1.1.107:
5828 *
5829 * "If the 32-bit value at offset in buffer memory is zero, then the
5830 * rendering commands are discarded, otherwise they are executed as
5831 * normal. If the value of the predicate in buffer memory changes while
5832 * conditional rendering is active, the rendering commands may be
5833 * discarded in an implementation-dependent way. Some implementations
5834 * may latch the value of the predicate upon beginning conditional
5835 * rendering while others may read it before every rendering command."
5836 *
5837 * But, the AMD hardware treats the predicate as a 64-bit value which
5838 * means we need a workaround in the driver. Luckily, it's not required
5839 * to support if the value changes when predication is active.
5840 *
5841 * The workaround is as follows:
5842 	 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
5843 	 * 2) copy the 32-bit predicate value to the upload BO
5844 	 * 3) use the newly allocated VA address for predication
5845 *
5846 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
5847 * in ME (+ sync PFP) instead of PFP.
5848 */
5849 radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);
5850
5851 new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
5852
5853 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
5854 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
5855 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
5856 COPY_DATA_WR_CONFIRM);
5857 radeon_emit(cs, va);
5858 radeon_emit(cs, va >> 32);
5859 radeon_emit(cs, new_va);
5860 radeon_emit(cs, new_va >> 32);
5861
5862 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
5863 radeon_emit(cs, 0);
5864
5865 /* Enable predication for this command buffer. */
5866 si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
5867 cmd_buffer->state.predicating = true;
5868
5869 /* Store conditional rendering user info. */
5870 cmd_buffer->state.predication_type = draw_visible;
5871 cmd_buffer->state.predication_va = new_va;
5872 }
5873
5874 void radv_CmdEndConditionalRenderingEXT(
5875 VkCommandBuffer commandBuffer)
5876 {
5877 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5878
5879 /* Disable predication for this command buffer. */
5880 si_emit_set_predication_state(cmd_buffer, false, 0);
5881 cmd_buffer->state.predicating = false;
5882
5883 /* Reset conditional rendering user info. */
5884 cmd_buffer->state.predication_type = -1;
5885 cmd_buffer->state.predication_va = 0;
5886 }
5887
5888 /* VK_EXT_transform_feedback */
5889 void radv_CmdBindTransformFeedbackBuffersEXT(
5890 VkCommandBuffer commandBuffer,
5891 uint32_t firstBinding,
5892 uint32_t bindingCount,
5893 const VkBuffer* pBuffers,
5894 const VkDeviceSize* pOffsets,
5895 const VkDeviceSize* pSizes)
5896 {
5897 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5898 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5899 uint8_t enabled_mask = 0;
5900
5901 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
5902 for (uint32_t i = 0; i < bindingCount; i++) {
5903 uint32_t idx = firstBinding + i;
5904
5905 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
5906 sb[idx].offset = pOffsets[i];
5907 sb[idx].size = pSizes[i];
5908
5909 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
5910 sb[idx].buffer->bo);
5911
5912 enabled_mask |= 1 << idx;
5913 }
5914
5915 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
5916
5917 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5918 }
5919
5920 static void
5921 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
5922 {
5923 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5924 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5925
5926 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
5927 radeon_emit(cs,
5928 S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
5929 S_028B94_RAST_STREAM(0) |
5930 S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
5931 S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
5932 S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
5933 radeon_emit(cs, so->hw_enabled_mask &
5934 so->enabled_stream_buffers_mask);
5935
5936 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5937 }
5938
5939 static void
5940 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
5941 {
5942 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5943 bool old_streamout_enabled = so->streamout_enabled;
5944 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
5945
5946 so->streamout_enabled = enable;
5947
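	/* Replicate the per-buffer enable bits for each of the four streams;
	 * this is ANDed with the buffers actually written by the shader when
	 * programming VGT_STRMOUT_BUFFER_CONFIG.
	 */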
5948 so->hw_enabled_mask = so->enabled_mask |
5949 (so->enabled_mask << 4) |
5950 (so->enabled_mask << 8) |
5951 (so->enabled_mask << 12);
5952
5953 if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
5954 ((old_streamout_enabled != so->streamout_enabled) ||
5955 (old_hw_enabled_mask != so->hw_enabled_mask)))
5956 radv_emit_streamout_enable(cmd_buffer);
5957
5958 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
5959 cmd_buffer->gds_needed = true;
5960 cmd_buffer->gds_oa_needed = true;
5961 }
5962 }
5963
5964 static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
5965 {
5966 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5967 unsigned reg_strmout_cntl;
5968
5969 /* The register is at different places on different ASICs. */
5970 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
5971 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
5972 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
5973 } else {
5974 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
5975 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
5976 }
5977
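	/* Make the CP flush the streamout offsets and wait for
	 * OFFSET_UPDATE_DONE before they are modified below.
	 */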
5978 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
5979 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
5980
5981 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
5982 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
5983 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
5984 radeon_emit(cs, 0);
5985 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
5986 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
5987 radeon_emit(cs, 4); /* poll interval */
5988 }
5989
5990 static void
5991 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer,
5992 uint32_t firstCounterBuffer,
5993 uint32_t counterBufferCount,
5994 const VkBuffer *pCounterBuffers,
5995 const VkDeviceSize *pCounterBufferOffsets)
5997 {
5998 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5999 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6000 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6001 uint32_t i;
6002
6003 radv_flush_vgt_streamout(cmd_buffer);
6004
6005 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6006 for_each_bit(i, so->enabled_mask) {
6007 int32_t counter_buffer_idx = i - firstCounterBuffer;
6008 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6009 counter_buffer_idx = -1;
6010
6011 /* AMD GCN binds streamout buffers as shader resources.
6012 * VGT only counts primitives and tells the shader through
6013 * SGPRs what to do.
6014 */
6015 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
6016 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */
6017 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
6018
6019 cmd_buffer->state.context_roll_without_scissor_emitted = true;
6020
6021 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6022 /* The array of counter buffers is optional. */
6023 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6024 uint64_t va = radv_buffer_get_va(buffer->bo);
6025
6026 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
6027
6028 /* Append */
6029 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6030 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6031 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6032 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
6033 radeon_emit(cs, 0); /* unused */
6034 radeon_emit(cs, 0); /* unused */
6035 radeon_emit(cs, va); /* src address lo */
6036 radeon_emit(cs, va >> 32); /* src address hi */
6037
6038 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6039 } else {
6040 /* Start from the beginning. */
6041 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6042 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6043 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6044 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
6045 radeon_emit(cs, 0); /* unused */
6046 radeon_emit(cs, 0); /* unused */
6047 radeon_emit(cs, 0); /* unused */
6048 radeon_emit(cs, 0); /* unused */
6049 }
6050 }
6051
6052 radv_set_streamout_enable(cmd_buffer, true);
6053 }
6054
6055 static void
6056 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer,
6057 uint32_t firstCounterBuffer,
6058 uint32_t counterBufferCount,
6059 const VkBuffer *pCounterBuffers,
6060 const VkDeviceSize *pCounterBufferOffsets)
6061 {
6062 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6063 unsigned last_target = util_last_bit(so->enabled_mask) - 1;
6064 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6065 uint32_t i;
6066
6067 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6068 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6069
6070 /* Sync because the next streamout operation will overwrite GDS and we
6071 * have to make sure it's idle.
6072 * TODO: Improve by tracking if there is a streamout operation in
6073 * flight.
6074 */
6075 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
6076 si_emit_cache_flush(cmd_buffer);
6077
6078 for_each_bit(i, so->enabled_mask) {
6079 int32_t counter_buffer_idx = i - firstCounterBuffer;
6080 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6081 counter_buffer_idx = -1;
6082
6083 bool append = counter_buffer_idx >= 0 &&
6084 pCounterBuffers && pCounterBuffers[counter_buffer_idx];
6085 uint64_t va = 0;
6086
6087 if (append) {
6088 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6089
6090 va += radv_buffer_get_va(buffer->bo);
6091 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
6092
6093 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6094 }
6095
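		/* Load the saved buffer-filled size (or zero when no counter
		 * buffer is bound) into GDS, which NGG streamout uses to track
		 * the per-buffer offsets.
		 */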
6096 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
6097 radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
6098 S_411_DST_SEL(V_411_GDS) |
6099 S_411_CP_SYNC(i == last_target));
6100 radeon_emit(cs, va);
6101 radeon_emit(cs, va >> 32);
6102 radeon_emit(cs, 4 * i); /* destination in GDS */
6103 radeon_emit(cs, 0);
6104 radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
6105 S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
6106 }
6107
6108 radv_set_streamout_enable(cmd_buffer, true);
6109 }
6110
6111 void radv_CmdBeginTransformFeedbackEXT(
6112 VkCommandBuffer commandBuffer,
6113 uint32_t firstCounterBuffer,
6114 uint32_t counterBufferCount,
6115 const VkBuffer* pCounterBuffers,
6116 const VkDeviceSize* pCounterBufferOffsets)
6117 {
6118 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6119
6120 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
6121 gfx10_emit_streamout_begin(cmd_buffer,
6122 firstCounterBuffer, counterBufferCount,
6123 pCounterBuffers, pCounterBufferOffsets);
6124 } else {
6125 radv_emit_streamout_begin(cmd_buffer,
6126 firstCounterBuffer, counterBufferCount,
6127 pCounterBuffers, pCounterBufferOffsets);
6128 }
6129 }
6130
6131 static void
6132 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer,
6133 uint32_t firstCounterBuffer,
6134 uint32_t counterBufferCount,
6135 const VkBuffer *pCounterBuffers,
6136 const VkDeviceSize *pCounterBufferOffsets)
6137 {
6138 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6139 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6140 uint32_t i;
6141
6142 radv_flush_vgt_streamout(cmd_buffer);
6143
6144 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6145 for_each_bit(i, so->enabled_mask) {
6146 int32_t counter_buffer_idx = i - firstCounterBuffer;
6147 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6148 counter_buffer_idx = -1;
6149
6150 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6151 			/* The array of counter buffers is optional. */
6152 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6153 uint64_t va = radv_buffer_get_va(buffer->bo);
6154
6155 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
6156
6157 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6158 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6159 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6160 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
6161 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
6162 radeon_emit(cs, va); /* dst address lo */
6163 radeon_emit(cs, va >> 32); /* dst address hi */
6164 radeon_emit(cs, 0); /* unused */
6165 radeon_emit(cs, 0); /* unused */
6166
6167 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6168 }
6169
6170 /* Deactivate transform feedback by zeroing the buffer size.
6171 * The counters (primitives generated, primitives emitted) may
6172 		 * be enabled even if there is no buffer bound. This ensures
6173 * that the primitives-emitted query won't increment.
6174 */
6175 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
6176
6177 cmd_buffer->state.context_roll_without_scissor_emitted = true;
6178 }
6179
6180 radv_set_streamout_enable(cmd_buffer, false);
6181 }
6182
6183 static void
6184 gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer,
6185 uint32_t firstCounterBuffer,
6186 uint32_t counterBufferCount,
6187 const VkBuffer *pCounterBuffers,
6188 const VkDeviceSize *pCounterBufferOffsets)
6189 {
6190 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6191 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6192 uint32_t i;
6193
6194 assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6195 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6196
6197 for_each_bit(i, so->enabled_mask) {
6198 int32_t counter_buffer_idx = i - firstCounterBuffer;
6199 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6200 counter_buffer_idx = -1;
6201
6202 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6203 			/* The array of counter buffers is optional. */
6204 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6205 uint64_t va = radv_buffer_get_va(buffer->bo);
6206
6207 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
6208
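			/* Copy the buffer-filled size from GDS back to the
			 * counter buffer with an end-of-pipe PS_DONE event.
			 */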
6209 si_cs_emit_write_event_eop(cs,
6210 cmd_buffer->device->physical_device->rad_info.chip_class,
6211 radv_cmd_buffer_uses_mec(cmd_buffer),
6212 V_028A90_PS_DONE, 0,
6213 EOP_DST_SEL_TC_L2,
6214 EOP_DATA_SEL_GDS,
6215 va, EOP_DATA_GDS(i, 1), 0);
6216
6217 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6218 }
6219 }
6220
6221 radv_set_streamout_enable(cmd_buffer, false);
6222 }
6223
6224 void radv_CmdEndTransformFeedbackEXT(
6225 VkCommandBuffer commandBuffer,
6226 uint32_t firstCounterBuffer,
6227 uint32_t counterBufferCount,
6228 const VkBuffer* pCounterBuffers,
6229 const VkDeviceSize* pCounterBufferOffsets)
6230 {
6231 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6232
6233 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
6234 gfx10_emit_streamout_end(cmd_buffer,
6235 firstCounterBuffer, counterBufferCount,
6236 pCounterBuffers, pCounterBufferOffsets);
6237 } else {
6238 radv_emit_streamout_end(cmd_buffer,
6239 firstCounterBuffer, counterBufferCount,
6240 pCounterBuffers, pCounterBufferOffsets);
6241 }
6242 }
6243
6244 void radv_CmdDrawIndirectByteCountEXT(
6245 VkCommandBuffer commandBuffer,
6246 uint32_t instanceCount,
6247 uint32_t firstInstance,
6248 VkBuffer _counterBuffer,
6249 VkDeviceSize counterBufferOffset,
6250 uint32_t counterOffset,
6251 uint32_t vertexStride)
6252 {
6253 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6254 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
6255 struct radv_draw_info info = {};
6256
6257 info.instance_count = instanceCount;
6258 info.first_instance = firstInstance;
6259 info.strmout_buffer = counterBuffer;
6260 info.strmout_buffer_offset = counterBufferOffset;
6261 info.stride = vertexStride;
6262
6263 radv_draw(cmd_buffer, &info);
6264 }
6265
6266 /* VK_AMD_buffer_marker */
6267 void radv_CmdWriteBufferMarkerAMD(
6268 VkCommandBuffer commandBuffer,
6269 VkPipelineStageFlagBits pipelineStage,
6270 VkBuffer dstBuffer,
6271 VkDeviceSize dstOffset,
6272 uint32_t marker)
6273 {
6274 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6275 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
6276 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6277 uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;
6278
6279 si_emit_cache_flush(cmd_buffer);
6280
6281 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
6282
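	/* A top-of-pipe marker can be written immediately with COPY_DATA;
	 * any other stage must wait for all prior work with a bottom-of-pipe
	 * EOP event.
	 */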
6283 if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
6284 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6285 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
6286 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6287 COPY_DATA_WR_CONFIRM);
6288 radeon_emit(cs, marker);
6289 radeon_emit(cs, 0);
6290 radeon_emit(cs, va);
6291 radeon_emit(cs, va >> 32);
6292 } else {
6293 si_cs_emit_write_event_eop(cs,
6294 cmd_buffer->device->physical_device->rad_info.chip_class,
6295 radv_cmd_buffer_uses_mec(cmd_buffer),
6296 V_028A90_BOTTOM_OF_PIPE_TS, 0,
6297 EOP_DST_SEL_MEM,
6298 EOP_DATA_SEL_VALUE_32BIT,
6299 va, marker,
6300 cmd_buffer->gfx9_eop_bug_va);
6301 }
6302
6303 assert(cmd_buffer->cs->cdw <= cdw_max);
6304 }