ac: add has_tc_compat_zrange_bug to ac_gpu_info
mesa.git: src/amd/vulkan/radv_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_private.h"
29 #include "radv_radeon_winsys.h"
30 #include "radv_shader.h"
31 #include "radv_cs.h"
32 #include "sid.h"
33 #include "vk_format.h"
34 #include "vk_util.h"
35 #include "radv_debug.h"
36 #include "radv_meta.h"
37
38 #include "ac_debug.h"
39
40 enum {
41 RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
42 RADV_PREFETCH_VS = (1 << 1),
43 RADV_PREFETCH_TCS = (1 << 2),
44 RADV_PREFETCH_TES = (1 << 3),
45 RADV_PREFETCH_GS = (1 << 4),
46 RADV_PREFETCH_PS = (1 << 5),
47 RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS |
48 RADV_PREFETCH_TCS |
49 RADV_PREFETCH_TES |
50 RADV_PREFETCH_GS |
51 RADV_PREFETCH_PS)
52 };
53
54 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
55 struct radv_image *image,
56 VkImageLayout src_layout,
57 bool src_render_loop,
58 VkImageLayout dst_layout,
59 bool dst_render_loop,
60 uint32_t src_family,
61 uint32_t dst_family,
62 const VkImageSubresourceRange *range,
63 struct radv_sample_locations_state *sample_locs);
64
65 const struct radv_dynamic_state default_dynamic_state = {
66 .viewport = {
67 .count = 0,
68 },
69 .scissor = {
70 .count = 0,
71 },
72 .line_width = 1.0f,
73 .depth_bias = {
74 .bias = 0.0f,
75 .clamp = 0.0f,
76 .slope = 0.0f,
77 },
78 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
79 .depth_bounds = {
80 .min = 0.0f,
81 .max = 1.0f,
82 },
83 .stencil_compare_mask = {
84 .front = ~0u,
85 .back = ~0u,
86 },
87 .stencil_write_mask = {
88 .front = ~0u,
89 .back = ~0u,
90 },
91 .stencil_reference = {
92 .front = 0u,
93 .back = 0u,
94 },
95 };
96
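/**
 * Copy the dynamic state selected by src->mask into the command buffer,
 * setting the corresponding RADV_DYNAMIC_* dirty bits only for values that
 * actually changed.
 */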
97 static void
98 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
99 const struct radv_dynamic_state *src)
100 {
101 struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
102 uint32_t copy_mask = src->mask;
103 uint32_t dest_mask = 0;
104
105 /* Make sure to copy the number of viewports/scissors because they can
106 * only be specified at pipeline creation time.
107 */
108 dest->viewport.count = src->viewport.count;
109 dest->scissor.count = src->scissor.count;
110 dest->discard_rectangle.count = src->discard_rectangle.count;
111 dest->sample_location.count = src->sample_location.count;
112
113 if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
114 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
115 src->viewport.count * sizeof(VkViewport))) {
116 typed_memcpy(dest->viewport.viewports,
117 src->viewport.viewports,
118 src->viewport.count);
119 dest_mask |= RADV_DYNAMIC_VIEWPORT;
120 }
121 }
122
123 if (copy_mask & RADV_DYNAMIC_SCISSOR) {
124 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
125 src->scissor.count * sizeof(VkRect2D))) {
126 typed_memcpy(dest->scissor.scissors,
127 src->scissor.scissors, src->scissor.count);
128 dest_mask |= RADV_DYNAMIC_SCISSOR;
129 }
130 }
131
132 if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
133 if (dest->line_width != src->line_width) {
134 dest->line_width = src->line_width;
135 dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
136 }
137 }
138
139 if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
140 if (memcmp(&dest->depth_bias, &src->depth_bias,
141 sizeof(src->depth_bias))) {
142 dest->depth_bias = src->depth_bias;
143 dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
144 }
145 }
146
147 if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
148 if (memcmp(&dest->blend_constants, &src->blend_constants,
149 sizeof(src->blend_constants))) {
150 typed_memcpy(dest->blend_constants,
151 src->blend_constants, 4);
152 dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
153 }
154 }
155
156 if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
157 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
158 sizeof(src->depth_bounds))) {
159 dest->depth_bounds = src->depth_bounds;
160 dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
161 }
162 }
163
164 if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
165 if (memcmp(&dest->stencil_compare_mask,
166 &src->stencil_compare_mask,
167 sizeof(src->stencil_compare_mask))) {
168 dest->stencil_compare_mask = src->stencil_compare_mask;
169 dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
170 }
171 }
172
173 if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
174 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
175 sizeof(src->stencil_write_mask))) {
176 dest->stencil_write_mask = src->stencil_write_mask;
177 dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
178 }
179 }
180
181 if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
182 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
183 sizeof(src->stencil_reference))) {
184 dest->stencil_reference = src->stencil_reference;
185 dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
186 }
187 }
188
189 if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
190 if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
191 src->discard_rectangle.count * sizeof(VkRect2D))) {
192 typed_memcpy(dest->discard_rectangle.rectangles,
193 src->discard_rectangle.rectangles,
194 src->discard_rectangle.count);
195 dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
196 }
197 }
198
199 if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
200 if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
201 dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
202 dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
203 memcmp(&dest->sample_location.locations,
204 &src->sample_location.locations,
205 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
206 dest->sample_location.per_pixel = src->sample_location.per_pixel;
207 dest->sample_location.grid_size = src->sample_location.grid_size;
208 typed_memcpy(dest->sample_location.locations,
209 src->sample_location.locations,
210 src->sample_location.count);
211 dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
212 }
213 }
214
215 cmd_buffer->state.dirty |= dest_mask;
216 }
217
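/**
 * Cache the transform feedback strides and the enabled buffer mask from the
 * pipeline's streamout shader.
 */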
218 static void
219 radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
220 struct radv_pipeline *pipeline)
221 {
222 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
223 struct radv_shader_info *info;
224
225 if (!pipeline->streamout_shader)
226 return;
227
228 info = &pipeline->streamout_shader->info.info;
229 for (int i = 0; i < MAX_SO_BUFFERS; i++)
230 so->stride_in_dw[i] = info->so.strides[i];
231
232 so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
233 }
234
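/**
 * Compute queues are executed by the MEC (compute micro-engine) on GFX7+,
 * which changes how some packets have to be emitted.
 */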
235 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
236 {
237 return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
238 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
239 }
240
241 enum ring_type radv_queue_family_to_ring(int f) {
242 switch (f) {
243 case RADV_QUEUE_GENERAL:
244 return RING_GFX;
245 case RADV_QUEUE_COMPUTE:
246 return RING_COMPUTE;
247 case RADV_QUEUE_TRANSFER:
248 return RING_DMA;
249 default:
250 unreachable("Unknown queue family");
251 }
252 }
253
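/**
 * Allocate a command buffer, link it into its pool (if any) and create the
 * winsys command stream for the matching hardware ring.
 */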
254 static VkResult radv_create_cmd_buffer(
255 struct radv_device * device,
256 struct radv_cmd_pool * pool,
257 VkCommandBufferLevel level,
258 VkCommandBuffer* pCommandBuffer)
259 {
260 struct radv_cmd_buffer *cmd_buffer;
261 unsigned ring;
262 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
263 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
264 if (cmd_buffer == NULL)
265 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
266
267 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
268 cmd_buffer->device = device;
269 cmd_buffer->pool = pool;
270 cmd_buffer->level = level;
271
272 if (pool) {
273 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
274 cmd_buffer->queue_family_index = pool->queue_family_index;
275
276 } else {
277 /* Init the pool_link so we can safely call list_del when we destroy
278 * the command buffer
279 */
280 list_inithead(&cmd_buffer->pool_link);
281 cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
282 }
283
284 ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
285
286 cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
287 if (!cmd_buffer->cs) {
288 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
289 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
290 }
291
292 *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
293
294 list_inithead(&cmd_buffer->upload.list);
295
296 return VK_SUCCESS;
297 }
298
299 static void
300 radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
301 {
302 list_del(&cmd_buffer->pool_link);
303
304 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
305 &cmd_buffer->upload.list, list) {
306 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
307 list_del(&up->list);
308 free(up);
309 }
310
311 if (cmd_buffer->upload.upload_bo)
312 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
313 cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
314
315 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
316 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
317
318 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
319 }
320
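/**
 * Reset the winsys command stream, free the previous upload BOs and clear
 * all cached state. On GFX9+ graphics queues this also reserves space in the
 * upload buffer for the cache-flush fence (and, on GFX9, for the EOP bug
 * workaround).
 */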
321 static VkResult
322 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
323 {
324 cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
325
326 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
327 &cmd_buffer->upload.list, list) {
328 cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
329 list_del(&up->list);
330 free(up);
331 }
332
333 cmd_buffer->push_constant_stages = 0;
334 cmd_buffer->scratch_size_needed = 0;
335 cmd_buffer->compute_scratch_size_needed = 0;
336 cmd_buffer->esgs_ring_size_needed = 0;
337 cmd_buffer->gsvs_ring_size_needed = 0;
338 cmd_buffer->tess_rings_needed = false;
339 cmd_buffer->sample_positions_needed = false;
340
341 if (cmd_buffer->upload.upload_bo)
342 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
343 cmd_buffer->upload.upload_bo);
344 cmd_buffer->upload.offset = 0;
345
346 cmd_buffer->record_result = VK_SUCCESS;
347
348 memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
349
350 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
351 cmd_buffer->descriptors[i].dirty = 0;
352 cmd_buffer->descriptors[i].valid = 0;
353 cmd_buffer->descriptors[i].push_dirty = false;
354 }
355
356 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
357 cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
358 unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
359 unsigned fence_offset, eop_bug_offset;
360 void *fence_ptr;
361
362 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
363 &fence_ptr);
364
365 cmd_buffer->gfx9_fence_va =
366 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
367 cmd_buffer->gfx9_fence_va += fence_offset;
368
369 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
370 /* Allocate a buffer for the EOP bug on GFX9. */
371 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
372 &eop_bug_offset, &fence_ptr);
373 cmd_buffer->gfx9_eop_bug_va =
374 radv_buffer_get_va(cmd_buffer->upload.upload_bo);
375 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
376 }
377 }
378
379 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
380
381 return cmd_buffer->record_result;
382 }
383
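/**
 * Replace the upload buffer with a bigger GTT BO (at least min_needed bytes
 * and at least twice the previous size). The old BO is kept on the upload
 * list so its contents remain valid until the command buffer is reset.
 */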
384 static bool
385 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
386 uint64_t min_needed)
387 {
388 uint64_t new_size;
389 struct radeon_winsys_bo *bo;
390 struct radv_cmd_buffer_upload *upload;
391 struct radv_device *device = cmd_buffer->device;
392
393 new_size = MAX2(min_needed, 16 * 1024);
394 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
395
396 bo = device->ws->buffer_create(device->ws,
397 new_size, 4096,
398 RADEON_DOMAIN_GTT,
399 RADEON_FLAG_CPU_ACCESS|
400 RADEON_FLAG_NO_INTERPROCESS_SHARING |
401 RADEON_FLAG_32BIT,
402 RADV_BO_PRIORITY_UPLOAD_BUFFER);
403
404 if (!bo) {
405 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
406 return false;
407 }
408
409 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
410 if (cmd_buffer->upload.upload_bo) {
411 upload = malloc(sizeof(*upload));
412
413 if (!upload) {
414 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
415 device->ws->buffer_destroy(bo);
416 return false;
417 }
418
419 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
420 list_add(&upload->list, &cmd_buffer->upload.list);
421 }
422
423 cmd_buffer->upload.upload_bo = bo;
424 cmd_buffer->upload.size = new_size;
425 cmd_buffer->upload.offset = 0;
426 cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
427
428 if (!cmd_buffer->upload.map) {
429 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
430 return false;
431 }
432
433 return true;
434 }
435
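/**
 * Sub-allocate 'size' bytes with the requested alignment from the upload
 * buffer, growing it if necessary, and return the offset and a CPU pointer
 * to the allocation.
 */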
436 bool
437 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
438 unsigned size,
439 unsigned alignment,
440 unsigned *out_offset,
441 void **ptr)
442 {
443 assert(util_is_power_of_two_nonzero(alignment));
444
445 uint64_t offset = align(cmd_buffer->upload.offset, alignment);
446 if (offset + size > cmd_buffer->upload.size) {
447 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
448 return false;
449 offset = 0;
450 }
451
452 *out_offset = offset;
453 *ptr = cmd_buffer->upload.map + offset;
454
455 cmd_buffer->upload.offset = offset + size;
456 return true;
457 }
458
459 bool
460 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
461 unsigned size, unsigned alignment,
462 const void *data, unsigned *out_offset)
463 {
464 uint8_t *ptr;
465
466 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
467 out_offset, (void **)&ptr))
468 return false;
469
470 if (ptr)
471 memcpy(ptr, data, size);
472
473 return true;
474 }
475
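/**
 * Emit a WRITE_DATA packet that stores 'count' dwords at the given virtual
 * address through the ME engine.
 */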
476 static void
477 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
478 unsigned count, const uint32_t *data)
479 {
480 struct radeon_cmdbuf *cs = cmd_buffer->cs;
481
482 radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
483
484 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
485 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
486 S_370_WR_CONFIRM(1) |
487 S_370_ENGINE_SEL(V_370_ME));
488 radeon_emit(cs, va);
489 radeon_emit(cs, va >> 32);
490 radeon_emit_array(cs, data, count);
491 }
492
493 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
494 {
495 struct radv_device *device = cmd_buffer->device;
496 struct radeon_cmdbuf *cs = cmd_buffer->cs;
497 uint64_t va;
498
499 va = radv_buffer_get_va(device->trace_bo);
500 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
501 va += 4;
502
503 ++cmd_buffer->state.trace_id;
504 radv_emit_write_data_packet(cmd_buffer, va, 1,
505 &cmd_buffer->state.trace_id);
506
507 radeon_check_space(cmd_buffer->device->ws, cs, 2);
508
509 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
510 radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
511 }
512
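/**
 * With RADV_DEBUG_SYNC_SHADERS, wait for the graphics/compute engine to go
 * idle after every draw or dispatch; also emit a trace point when a trace
 * BO is present.
 */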
513 static void
514 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
515 enum radv_cmd_flush_bits flags)
516 {
517 if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
518 assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
519 RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
520
521 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
522
523 /* Force wait for graphics or compute engines to be idle. */
524 si_cs_emit_cache_flush(cmd_buffer->cs,
525 cmd_buffer->device->physical_device->rad_info.chip_class,
526 &cmd_buffer->gfx9_fence_idx,
527 cmd_buffer->gfx9_fence_va,
528 radv_cmd_buffer_uses_mec(cmd_buffer),
529 flags, cmd_buffer->gfx9_eop_bug_va);
530 }
531
532 if (unlikely(cmd_buffer->device->trace_bo))
533 radv_cmd_buffer_trace_emit(cmd_buffer);
534 }
535
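/**
 * Write the bound pipeline pointer into the trace BO so it can be recovered
 * when analyzing a GPU hang.
 */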
536 static void
537 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
538 struct radv_pipeline *pipeline, enum ring_type ring)
539 {
540 struct radv_device *device = cmd_buffer->device;
541 uint32_t data[2];
542 uint64_t va;
543
544 va = radv_buffer_get_va(device->trace_bo);
545
546 switch (ring) {
547 case RING_GFX:
548 va += 8;
549 break;
550 case RING_COMPUTE:
551 va += 16;
552 break;
553 default:
554 assert(!"invalid ring type");
555 }
556
557 data[0] = (uintptr_t)pipeline;
558 data[1] = (uintptr_t)pipeline >> 32;
559
560 radv_emit_write_data_packet(cmd_buffer, va, 2, data);
561 }
562
563 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
564 VkPipelineBindPoint bind_point,
565 struct radv_descriptor_set *set,
566 unsigned idx)
567 {
568 struct radv_descriptor_state *descriptors_state =
569 radv_get_descriptors_state(cmd_buffer, bind_point);
570
571 descriptors_state->sets[idx] = set;
572
573 descriptors_state->valid |= (1u << idx); /* active descriptors */
574 descriptors_state->dirty |= (1u << idx);
575 }
576
577 static void
578 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
579 VkPipelineBindPoint bind_point)
580 {
581 struct radv_descriptor_state *descriptors_state =
582 radv_get_descriptors_state(cmd_buffer, bind_point);
583 struct radv_device *device = cmd_buffer->device;
584 uint32_t data[MAX_SETS * 2] = {};
585 uint64_t va;
586 unsigned i;
587 va = radv_buffer_get_va(device->trace_bo) + 24;
588
589 for_each_bit(i, descriptors_state->valid) {
590 struct radv_descriptor_set *set = descriptors_state->sets[i];
591 data[i * 2] = (uint64_t)(uintptr_t)set;
592 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
593 }
594
595 radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
596 }
597
598 struct radv_userdata_info *
599 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
600 gl_shader_stage stage,
601 int idx)
602 {
603 struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
604 return &shader->info.user_sgprs_locs.shader_data[idx];
605 }
606
607 static void
608 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
609 struct radv_pipeline *pipeline,
610 gl_shader_stage stage,
611 int idx, uint64_t va)
612 {
613 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
614 uint32_t base_reg = pipeline->user_data_0[stage];
615 if (loc->sgpr_idx == -1)
616 return;
617
618 assert(loc->num_sgprs == 1);
619
620 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
621 base_reg + loc->sgpr_idx * 4, va, false);
622 }
623
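/**
 * Emit the addresses of all dirty descriptor sets that the given stage
 * actually uses, batching sets with consecutive user SGPRs into a single
 * packet.
 */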
624 static void
625 radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
626 struct radv_pipeline *pipeline,
627 struct radv_descriptor_state *descriptors_state,
628 gl_shader_stage stage)
629 {
630 struct radv_device *device = cmd_buffer->device;
631 struct radeon_cmdbuf *cs = cmd_buffer->cs;
632 uint32_t sh_base = pipeline->user_data_0[stage];
633 struct radv_userdata_locations *locs =
634 &pipeline->shaders[stage]->info.user_sgprs_locs;
635 unsigned mask = locs->descriptor_sets_enabled;
636
637 mask &= descriptors_state->dirty & descriptors_state->valid;
638
639 while (mask) {
640 int start, count;
641
642 u_bit_scan_consecutive_range(&mask, &start, &count);
643
644 struct radv_userdata_info *loc = &locs->descriptor_sets[start];
645 unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
646
647 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
648 for (int i = 0; i < count; i++) {
649 struct radv_descriptor_set *set =
650 descriptors_state->sets[start + i];
651
652 radv_emit_shader_pointer_body(device, cs, set->va, true);
653 }
654 }
655 }
656
657 /**
658 * Convert the user sample locations to hardware sample locations (the values
659 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
660 */
661 static void
662 radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
663 uint32_t x, uint32_t y, VkOffset2D *sample_locs)
664 {
665 uint32_t x_offset = x % state->grid_size.width;
666 uint32_t y_offset = y % state->grid_size.height;
667 uint32_t num_samples = (uint32_t)state->per_pixel;
668 VkSampleLocationEXT *user_locs;
669 uint32_t pixel_offset;
670
671 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
672
673 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
674 user_locs = &state->locations[pixel_offset];
675
676 for (uint32_t i = 0; i < num_samples; i++) {
677 float shifted_pos_x = user_locs[i].x - 0.5;
678 float shifted_pos_y = user_locs[i].y - 0.5;
679
680 int32_t scaled_pos_x = floor(shifted_pos_x * 16);
681 int32_t scaled_pos_y = floor(shifted_pos_y * 16);
682
683 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
684 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
685 }
686 }
687
688 /**
689 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
690 * locations.
691 */
692 static void
693 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
694 uint32_t *sample_locs_pixel)
695 {
696 for (uint32_t i = 0; i < num_samples; i++) {
697 uint32_t sample_reg_idx = i / 4;
698 uint32_t sample_loc_idx = i % 4;
699 int32_t pos_x = sample_locs[i].x;
700 int32_t pos_y = sample_locs[i].y;
701
702 uint32_t shift_x = 8 * sample_loc_idx;
703 uint32_t shift_y = shift_x + 4;
704
705 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
706 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
707 }
708 }
709
710 /**
711 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
712 * sample locations.
713 */
714 static uint64_t
715 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer,
716 VkOffset2D *sample_locs,
717 uint32_t num_samples)
718 {
719 uint32_t centroid_priorities[num_samples];
720 uint32_t sample_mask = num_samples - 1;
721 uint32_t distances[num_samples];
722 uint64_t centroid_priority = 0;
723
724 /* Compute the distances from center for each sample. */
725 for (int i = 0; i < num_samples; i++) {
726 distances[i] = (sample_locs[i].x * sample_locs[i].x) +
727 (sample_locs[i].y * sample_locs[i].y);
728 }
729
730 /* Compute the centroid priorities by looking at the distances array. */
731 for (int i = 0; i < num_samples; i++) {
732 uint32_t min_idx = 0;
733
734 for (int j = 1; j < num_samples; j++) {
735 if (distances[j] < distances[min_idx])
736 min_idx = j;
737 }
738
739 centroid_priorities[i] = min_idx;
740 distances[min_idx] = 0xffffffff;
741 }
742
743 /* Compute the final centroid priority. */
744 for (int i = 0; i < 8; i++) {
745 centroid_priority |=
746 centroid_priorities[i & sample_mask] << (i * 4);
747 }
748
749 return centroid_priority << 32 | centroid_priority;
750 }
751
752 /**
753 * Emit the sample locations that are specified with VK_EXT_sample_locations.
754 */
755 static void
756 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
757 {
758 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
759 struct radv_multisample_state *ms = &pipeline->graphics.ms;
760 struct radv_sample_locations_state *sample_location =
761 &cmd_buffer->state.dynamic.sample_location;
762 uint32_t num_samples = (uint32_t)sample_location->per_pixel;
763 struct radeon_cmdbuf *cs = cmd_buffer->cs;
764 uint32_t sample_locs_pixel[4][2] = {};
765 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
766 uint32_t max_sample_dist = 0;
767 uint64_t centroid_priority;
768
769 if (!cmd_buffer->state.dynamic.sample_location.count)
770 return;
771
772 /* Convert the user sample locations to hardware sample locations. */
773 radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
774 radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
775 radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
776 radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
777
778 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
779 for (uint32_t i = 0; i < 4; i++) {
780 radv_compute_sample_locs_pixel(num_samples, sample_locs[i],
781 sample_locs_pixel[i]);
782 }
783
784 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
785 centroid_priority =
786 radv_compute_centroid_priority(cmd_buffer, sample_locs[0],
787 num_samples);
788
789 /* Compute the maximum sample distance from the specified locations. */
790 for (uint32_t i = 0; i < num_samples; i++) {
791 VkOffset2D offset = sample_locs[0][i];
792 max_sample_dist = MAX2(max_sample_dist,
793 MAX2(abs(offset.x), abs(offset.y)));
794 }
795
796 /* Emit the specified user sample locations. */
797 switch (num_samples) {
798 case 2:
799 case 4:
800 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
801 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
802 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
803 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
804 break;
805 case 8:
806 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
807 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
808 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
809 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
810 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
811 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
812 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
813 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
814 break;
815 default:
816 unreachable("invalid number of samples");
817 }
818
819 /* Emit the maximum sample distance and the centroid priority. */
820 uint32_t pa_sc_aa_config = ms->pa_sc_aa_config;
821
822 pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST;
823 pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist);
824
825 radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1);
826 radeon_emit(cs, pa_sc_aa_config);
827
828 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
829 radeon_emit(cs, centroid_priority);
830 radeon_emit(cs, centroid_priority >> 32);
831
832 /* GFX9: Flush DFSM when the AA mode changes. */
833 if (cmd_buffer->device->dfsm_allowed) {
834 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
835 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
836 }
837
838 cmd_buffer->state.context_roll_without_scissor_emitted = true;
839 }
840
841 static void
842 radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
843 struct radv_pipeline *pipeline,
844 gl_shader_stage stage,
845 int idx, int count, uint32_t *values)
846 {
847 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
848 uint32_t base_reg = pipeline->user_data_0[stage];
849 if (loc->sgpr_idx == -1)
850 return;
851
852 assert(loc->num_sgprs == count);
853
854 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
855 radeon_emit_array(cmd_buffer->cs, values, count);
856 }
857
858 static void
859 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
860 struct radv_pipeline *pipeline)
861 {
862 int num_samples = pipeline->graphics.ms.num_samples;
863 struct radv_multisample_state *ms = &pipeline->graphics.ms;
864 struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
865
866 if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions)
867 cmd_buffer->sample_positions_needed = true;
868
869 if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
870 return;
871
872 radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
873 radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
874 radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);
875
876 radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);
877
878 radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
879
880 /* GFX9: Flush DFSM when the AA mode changes. */
881 if (cmd_buffer->device->dfsm_allowed) {
882 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
883 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
884 }
885
886 cmd_buffer->state.context_roll_without_scissor_emitted = true;
887 }
888
889 static void
890 radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
891 struct radv_pipeline *pipeline)
892 {
893 const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
894
895
896 if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
897 return;
898
899 if (old_pipeline &&
900 old_pipeline->graphics.binning.pa_sc_binner_cntl_0 == pipeline->graphics.binning.pa_sc_binner_cntl_0 &&
901 old_pipeline->graphics.binning.db_dfsm_control == pipeline->graphics.binning.db_dfsm_control)
902 return;
903
904 bool binning_flush = false;
905 if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
906 cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
907 cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
908 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
909 binning_flush = !old_pipeline ||
910 G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
911 G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
912 }
913
914 radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
915 pipeline->graphics.binning.pa_sc_binner_cntl_0 |
916 S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
917
918 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
919 radeon_set_context_reg(cmd_buffer->cs, R_028038_DB_DFSM_CONTROL,
920 pipeline->graphics.binning.db_dfsm_control);
921 } else {
922 radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
923 pipeline->graphics.binning.db_dfsm_control);
924 }
925
926 cmd_buffer->state.context_roll_without_scissor_emitted = true;
927 }
928
929
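/**
 * Prefetch the shader binary into L2 using CP DMA.
 */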
930 static void
931 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
932 struct radv_shader_variant *shader)
933 {
934 uint64_t va;
935
936 if (!shader)
937 return;
938
939 va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
940
941 si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
942 }
943
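/**
 * Prefetch shader binaries and the vertex buffer descriptors into L2. With
 * vertex_stage_only, only the VS and the VBO descriptors are prefetched so
 * the first draw can start as early as possible.
 */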
944 static void
945 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
946 struct radv_pipeline *pipeline,
947 bool vertex_stage_only)
948 {
949 struct radv_cmd_state *state = &cmd_buffer->state;
950 uint32_t mask = state->prefetch_L2_mask;
951
952 if (vertex_stage_only) {
953 /* Fast prefetch path for starting draws as soon as possible.
954 */
955 mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
956 RADV_PREFETCH_VBO_DESCRIPTORS);
957 }
958
959 if (mask & RADV_PREFETCH_VS)
960 radv_emit_shader_prefetch(cmd_buffer,
961 pipeline->shaders[MESA_SHADER_VERTEX]);
962
963 if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
964 si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
965
966 if (mask & RADV_PREFETCH_TCS)
967 radv_emit_shader_prefetch(cmd_buffer,
968 pipeline->shaders[MESA_SHADER_TESS_CTRL]);
969
970 if (mask & RADV_PREFETCH_TES)
971 radv_emit_shader_prefetch(cmd_buffer,
972 pipeline->shaders[MESA_SHADER_TESS_EVAL]);
973
974 if (mask & RADV_PREFETCH_GS) {
975 radv_emit_shader_prefetch(cmd_buffer,
976 pipeline->shaders[MESA_SHADER_GEOMETRY]);
977 if (radv_pipeline_has_gs_copy_shader(pipeline))
978 radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
979 }
980
981 if (mask & RADV_PREFETCH_PS)
982 radv_emit_shader_prefetch(cmd_buffer,
983 pipeline->shaders[MESA_SHADER_FRAGMENT]);
984
985 state->prefetch_L2_mask &= ~mask;
986 }
987
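/**
 * Program the RB+ registers (SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON and
 * SX_BLEND_OPT_CONTROL) based on the color attachment formats and the bound
 * pipeline's color export formats.
 */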
988 static void
989 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
990 {
991 if (!cmd_buffer->device->physical_device->rbplus_allowed)
992 return;
993
994 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
995 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
996
997 unsigned sx_ps_downconvert = 0;
998 unsigned sx_blend_opt_epsilon = 0;
999 unsigned sx_blend_opt_control = 0;
1000
1001 for (unsigned i = 0; i < subpass->color_count; ++i) {
1002 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1003 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1004 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1005 continue;
1006 }
1007
1008 int idx = subpass->color_attachments[i].attachment;
1009 struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1010
1011 unsigned format = G_028C70_FORMAT(cb->cb_color_info);
1012 unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1013 uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
1014 uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
1015
1016 bool has_alpha, has_rgb;
1017
1018 /* Set if RGB and A are present. */
1019 has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
1020
1021 if (format == V_028C70_COLOR_8 ||
1022 format == V_028C70_COLOR_16 ||
1023 format == V_028C70_COLOR_32)
1024 has_rgb = !has_alpha;
1025 else
1026 has_rgb = true;
1027
1028 /* Check the colormask and export format. */
1029 if (!(colormask & 0x7))
1030 has_rgb = false;
1031 if (!(colormask & 0x8))
1032 has_alpha = false;
1033
1034 if (spi_format == V_028714_SPI_SHADER_ZERO) {
1035 has_rgb = false;
1036 has_alpha = false;
1037 }
1038
1039 /* Disable value checking for disabled channels. */
1040 if (!has_rgb)
1041 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1042 if (!has_alpha)
1043 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1044
1045 /* Enable down-conversion for 32bpp and smaller formats. */
1046 switch (format) {
1047 case V_028C70_COLOR_8:
1048 case V_028C70_COLOR_8_8:
1049 case V_028C70_COLOR_8_8_8_8:
1050 /* For 1 and 2-channel formats, use the superset thereof. */
1051 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1052 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1053 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1054 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1055 sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1056 }
1057 break;
1058
1059 case V_028C70_COLOR_5_6_5:
1060 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1061 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1062 sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1063 }
1064 break;
1065
1066 case V_028C70_COLOR_1_5_5_5:
1067 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1068 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1069 sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1070 }
1071 break;
1072
1073 case V_028C70_COLOR_4_4_4_4:
1074 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1075 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1076 sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1077 }
1078 break;
1079
1080 case V_028C70_COLOR_32:
1081 if (swap == V_028C70_SWAP_STD &&
1082 spi_format == V_028714_SPI_SHADER_32_R)
1083 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1084 else if (swap == V_028C70_SWAP_ALT_REV &&
1085 spi_format == V_028714_SPI_SHADER_32_AR)
1086 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1087 break;
1088
1089 case V_028C70_COLOR_16:
1090 case V_028C70_COLOR_16_16:
1091 /* For 1-channel formats, use the superset thereof. */
1092 if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1093 spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1094 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1095 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1096 if (swap == V_028C70_SWAP_STD ||
1097 swap == V_028C70_SWAP_STD_REV)
1098 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1099 else
1100 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1101 }
1102 break;
1103
1104 case V_028C70_COLOR_10_11_11:
1105 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1106 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1107 sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
1108 }
1109 break;
1110
1111 case V_028C70_COLOR_2_10_10_10:
1112 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1113 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1114 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1115 }
1116 break;
1117 }
1118 }
1119
1120 for (unsigned i = subpass->color_count; i < 8; ++i) {
1121 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1122 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1123 }
1124 /* TODO: avoid redundantly setting context registers */
1125 radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1126 radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1127 radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1128 radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1129
1130 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1131 }
1132
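/**
 * Emit the pre-generated command streams of the bound graphics pipeline and
 * track the resources (shader BOs, scratch size) it needs.
 */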
1133 static void
1134 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1135 {
1136 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1137
1138 if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1139 return;
1140
1141 radv_update_multisample_state(cmd_buffer, pipeline);
1142 radv_update_binning_state(cmd_buffer, pipeline);
1143
1144 cmd_buffer->scratch_size_needed =
1145 MAX2(cmd_buffer->scratch_size_needed,
1146 pipeline->max_waves * pipeline->scratch_bytes_per_wave);
1147
1148 if (!cmd_buffer->state.emitted_pipeline ||
1149 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1150 pipeline->graphics.can_use_guardband)
1151 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1152
1153 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1154
1155 if (!cmd_buffer->state.emitted_pipeline ||
1156 cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1157 cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1158 memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
1159 pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
1160 radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1161 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1162 }
1163
1164 for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1165 if (!pipeline->shaders[i])
1166 continue;
1167
1168 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1169 pipeline->shaders[i]->bo);
1170 }
1171
1172 if (radv_pipeline_has_gs_copy_shader(pipeline))
1173 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1174 pipeline->gs_copy_shader->bo);
1175
1176 if (unlikely(cmd_buffer->device->trace_bo))
1177 radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);
1178
1179 cmd_buffer->state.emitted_pipeline = pipeline;
1180
1181 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1182 }
1183
1184 static void
1185 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1186 {
1187 si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
1188 cmd_buffer->state.dynamic.viewport.viewports);
1189 }
1190
1191 static void
1192 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1193 {
1194 uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1195
1196 si_write_scissors(cmd_buffer->cs, 0, count,
1197 cmd_buffer->state.dynamic.scissor.scissors,
1198 cmd_buffer->state.dynamic.viewport.viewports,
1199 cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1200
1201 cmd_buffer->state.context_roll_without_scissor_emitted = false;
1202 }
1203
1204 static void
1205 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1206 {
1207 if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1208 return;
1209
1210 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1211 cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1212 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1213 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1214 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1215 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1216 S_028214_BR_Y(rect.offset.y + rect.extent.height));
1217 }
1218 }
1219
1220 static void
1221 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1222 {
1223 unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1224
1225 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1226 S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
1227 }
1228
1229 static void
1230 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1231 {
1232 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1233
1234 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1235 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1236 }
1237
1238 static void
1239 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1240 {
1241 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1242
1243 radeon_set_context_reg_seq(cmd_buffer->cs,
1244 R_028430_DB_STENCILREFMASK, 2);
1245 radeon_emit(cmd_buffer->cs,
1246 S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1247 S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1248 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1249 S_028430_STENCILOPVAL(1));
1250 radeon_emit(cmd_buffer->cs,
1251 S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1252 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1253 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1254 S_028434_STENCILOPVAL_BF(1));
1255 }
1256
1257 static void
1258 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1259 {
1260 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1261
1262 radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
1263 fui(d->depth_bounds.min));
1264 radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
1265 fui(d->depth_bounds.max));
1266 }
1267
1268 static void
1269 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1270 {
1271 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1272 unsigned slope = fui(d->depth_bias.slope * 16.0f);
1273 unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
1274
1275
1276 radeon_set_context_reg_seq(cmd_buffer->cs,
1277 R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1278 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1279 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1280 radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
1281 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1282 radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
1283 }
1284
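/**
 * Emit the CB_COLOR* registers for one color attachment, with DCC disabled
 * when the current layout does not support compressed rendering.
 */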
1285 static void
1286 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
1287 int index,
1288 struct radv_color_buffer_info *cb,
1289 struct radv_image_view *iview,
1290 VkImageLayout layout,
1291 bool in_render_loop)
1292 {
1293 bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1294 uint32_t cb_color_info = cb->cb_color_info;
1295 struct radv_image *image = iview->image;
1296
1297 if (!radv_layout_dcc_compressed(cmd_buffer->device, image, layout, in_render_loop,
1298 radv_image_queue_family_mask(image,
1299 cmd_buffer->queue_family_index,
1300 cmd_buffer->queue_family_index))) {
1301 cb_color_info &= C_028C70_DCC_ENABLE;
1302 }
1303
1304 if (radv_image_is_tc_compat_cmask(image) &&
1305 (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1306 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1307 /* If this bit is set, the FMASK decompression operation
1308 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1309 */
1310 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1311 }
1312
1313 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1314 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1315 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1316 radeon_emit(cmd_buffer->cs, 0);
1317 radeon_emit(cmd_buffer->cs, 0);
1318 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1319 radeon_emit(cmd_buffer->cs, cb_color_info);
1320 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1321 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1322 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1323 radeon_emit(cmd_buffer->cs, 0);
1324 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1325 radeon_emit(cmd_buffer->cs, 0);
1326
1327 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 1);
1328 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1329
1330 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1331 cb->cb_color_base >> 32);
1332 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1333 cb->cb_color_cmask >> 32);
1334 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1335 cb->cb_color_fmask >> 32);
1336 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1337 cb->cb_dcc_base >> 32);
1338 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1339 cb->cb_color_attrib2);
1340 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1341 cb->cb_color_attrib3);
1342 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1343 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1344 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1345 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1346 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1347 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1348 radeon_emit(cmd_buffer->cs, cb_color_info);
1349 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1350 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1351 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1352 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1353 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1354 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1355
1356 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1357 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1358 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1359
1360 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1361 cb->cb_mrt_epitch);
1362 } else {
1363 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1364 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1365 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1366 radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1367 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1368 radeon_emit(cmd_buffer->cs, cb_color_info);
1369 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1370 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1371 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1372 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1373 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1374 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1375
1376 if (is_vi) { /* DCC BASE */
1377 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1378 }
1379 }
1380
1381 if (radv_dcc_enabled(image, iview->base_mip)) {
1382 /* Drawing with DCC enabled also compresses colorbuffers. */
1383 VkImageSubresourceRange range = {
1384 .aspectMask = iview->aspect_mask,
1385 .baseMipLevel = iview->base_mip,
1386 .levelCount = iview->level_count,
1387 .baseArrayLayer = iview->base_layer,
1388 .layerCount = iview->layer_count,
1389 };
1390
1391 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1392 }
1393 }
1394
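/**
 * Work around the TC-compat zrange bug by re-emitting DB_Z_INFO with
 * ZRANGE_PRECISION cleared, optionally guarded by a COND_EXEC that reads the
 * per-level metadata written by radv_set_tc_compat_zrange_metadata().
 */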
1395 static void
1396 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
1397 struct radv_ds_buffer_info *ds,
1398 const struct radv_image_view *iview,
1399 VkImageLayout layout,
1400 bool in_render_loop, bool requires_cond_exec)
1401 {
1402 const struct radv_image *image = iview->image;
1403 uint32_t db_z_info = ds->db_z_info;
1404 uint32_t db_z_info_reg;
1405
1406 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1407 !radv_image_is_tc_compat_htile(image))
1408 return;
1409
1410 if (!radv_layout_has_htile(image, layout, in_render_loop,
1411 radv_image_queue_family_mask(image,
1412 cmd_buffer->queue_family_index,
1413 cmd_buffer->queue_family_index))) {
1414 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1415 }
1416
1417 db_z_info &= C_028040_ZRANGE_PRECISION;
1418
1419 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1420 db_z_info_reg = R_028038_DB_Z_INFO;
1421 } else {
1422 db_z_info_reg = R_028040_DB_Z_INFO;
1423 }
1424
1425 /* When we don't know the last fast clear value we need to emit a
1426 * conditional packet that will eventually skip the following
1427 * SET_CONTEXT_REG packet.
1428 */
1429 if (requires_cond_exec) {
1430 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1431
1432 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1433 radeon_emit(cmd_buffer->cs, va);
1434 radeon_emit(cmd_buffer->cs, va >> 32);
1435 radeon_emit(cmd_buffer->cs, 0);
1436 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1437 }
1438
1439 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1440 }
1441
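/**
 * Emit the depth/stencil surface registers, disabling HTILE when the current
 * layout does not allow it.
 */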
1442 static void
1443 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
1444 struct radv_ds_buffer_info *ds,
1445 struct radv_image_view *iview,
1446 VkImageLayout layout,
1447 bool in_render_loop)
1448 {
1449 const struct radv_image *image = iview->image;
1450 uint32_t db_z_info = ds->db_z_info;
1451 uint32_t db_stencil_info = ds->db_stencil_info;
1452
1453 if (!radv_layout_has_htile(image, layout, in_render_loop,
1454 radv_image_queue_family_mask(image,
1455 cmd_buffer->queue_family_index,
1456 cmd_buffer->queue_family_index))) {
1457 db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1458 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1459 }
1460
1461 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1462 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1463
1464 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1465 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1466 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1467
1468 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1469 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1470 radeon_emit(cmd_buffer->cs, db_z_info);
1471 radeon_emit(cmd_buffer->cs, db_stencil_info);
1472 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1473 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1474 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1475 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1476
1477 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1478 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1479 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1480 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1481 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1482 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1483 } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1484 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1485 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1486 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1487 radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1488
1489 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1490 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
1491 radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */
1492 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1493 radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1494 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
1495 radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1496 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
1497 radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1498 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
1499 radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1500
1501 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1502 radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1503 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1504 } else {
1505 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1506
1507 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1508 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
1509 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
1510 radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */
1511 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
1512 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
1513 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
1514 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1515 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
1516 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
1517
1518 }
1519
1520 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1521 radv_update_zrange_precision(cmd_buffer, ds, iview, layout,
1522 in_render_loop, true);
1523
1524 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1525 ds->pa_su_poly_offset_db_fmt_cntl);
1526 }
1527
1528 /**
1529 * Update the fast clear depth/stencil values if the image is bound as a
1530 * depth/stencil buffer.
1531 */
1532 static void
1533 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1534 const struct radv_image_view *iview,
1535 VkClearDepthStencilValue ds_clear_value,
1536 VkImageAspectFlags aspects)
1537 {
1538 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1539 const struct radv_image *image = iview->image;
1540 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1541 uint32_t att_idx;
1542
1543 if (!cmd_buffer->state.attachments || !subpass)
1544 return;
1545
1546 if (!subpass->depth_stencil_attachment)
1547 return;
1548
1549 att_idx = subpass->depth_stencil_attachment->attachment;
1550 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1551 return;
1552
1553 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1554 radeon_emit(cs, ds_clear_value.stencil);
1555 radeon_emit(cs, fui(ds_clear_value.depth));
1556
1557 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
1558 * only needed when clearing Z to 0.0.
1559 */
1560 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1561 ds_clear_value.depth == 0.0) {
1562 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1563 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
1564
1565 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds,
1566 iview, layout, in_render_loop, false);
1567 }
1568
1569 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1570 }
1571
1572 /**
1573 * Set the clear depth/stencil values to the image's metadata.
1574 */
1575 static void
1576 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1577 struct radv_image *image,
1578 const VkImageSubresourceRange *range,
1579 VkClearDepthStencilValue ds_clear_value,
1580 VkImageAspectFlags aspects)
1581 {
1582 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1583 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
1584 uint32_t level_count = radv_get_levelCount(image, range);
1585
1586 	if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
1587 VK_IMAGE_ASPECT_STENCIL_BIT)) {
1588 /* Use the fastest way when both aspects are used. */
1589 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
1590 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1591 S_370_WR_CONFIRM(1) |
1592 S_370_ENGINE_SEL(V_370_PFP));
1593 radeon_emit(cs, va);
1594 radeon_emit(cs, va >> 32);
1595
1596 for (uint32_t l = 0; l < level_count; l++) {
1597 radeon_emit(cs, ds_clear_value.stencil);
1598 radeon_emit(cs, fui(ds_clear_value.depth));
1599 }
1600 } else {
1601 /* Otherwise we need one WRITE_DATA packet per level. */
1602 for (uint32_t l = 0; l < level_count; l++) {
1603 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
1604 unsigned value;
1605
1606 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
1607 value = fui(ds_clear_value.depth);
1608 va += 4;
1609 } else {
1610 value = ds_clear_value.stencil;
1611 }
1612
1613 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
1614 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1615 S_370_WR_CONFIRM(1) |
1616 S_370_ENGINE_SEL(V_370_PFP));
1617 radeon_emit(cs, va);
1618 radeon_emit(cs, va >> 32);
1619 radeon_emit(cs, value);
1620 }
1621 }
1622 }
1623
1624 /**
1625 * Update the TC-compat metadata value for this image.
1626 */
1627 static void
1628 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1629 struct radv_image *image,
1630 const VkImageSubresourceRange *range,
1631 uint32_t value)
1632 {
1633 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1634
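	/* Only GPUs affected by the TC-compat ZRANGE_PRECISION bug carry this metadata. */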
1635 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
1636 return;
1637
1638 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
1639 uint32_t level_count = radv_get_levelCount(image, range);
1640
1641 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
1642 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1643 S_370_WR_CONFIRM(1) |
1644 S_370_ENGINE_SEL(V_370_PFP));
1645 radeon_emit(cs, va);
1646 radeon_emit(cs, va >> 32);
1647
1648 for (uint32_t l = 0; l < level_count; l++)
1649 radeon_emit(cs, value);
1650 }
1651
1652 static void
1653 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1654 const struct radv_image_view *iview,
1655 VkClearDepthStencilValue ds_clear_value)
1656 {
1657 VkImageSubresourceRange range = {
1658 .aspectMask = iview->aspect_mask,
1659 .baseMipLevel = iview->base_mip,
1660 .levelCount = iview->level_count,
1661 .baseArrayLayer = iview->base_layer,
1662 .layerCount = iview->layer_count,
1663 };
1664 uint32_t cond_val;
1665
1666 	/* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
1667 	 * depth clear value is 0.0f; the value written here is later tested
1668 	 * by radv_update_zrange_precision(). */
1669 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
1670
1671 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range,
1672 cond_val);
1673 }
1674
1675 /**
1676 * Update the clear depth/stencil values for this image.
1677 */
1678 void
1679 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1680 const struct radv_image_view *iview,
1681 VkClearDepthStencilValue ds_clear_value,
1682 VkImageAspectFlags aspects)
1683 {
1684 VkImageSubresourceRange range = {
1685 .aspectMask = iview->aspect_mask,
1686 .baseMipLevel = iview->base_mip,
1687 .levelCount = iview->level_count,
1688 .baseArrayLayer = iview->base_layer,
1689 .layerCount = iview->layer_count,
1690 };
1691 struct radv_image *image = iview->image;
1692
1693 assert(radv_image_has_htile(image));
1694
1695 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range,
1696 ds_clear_value, aspects);
1697
1698 if (radv_image_is_tc_compat_htile(image) &&
1699 (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
1700 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview,
1701 ds_clear_value);
1702 }
1703
1704 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value,
1705 aspects);
1706 }
1707
1708 /**
1709 * Load the clear depth/stencil values from the image's metadata.
1710 */
1711 static void
1712 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1713 const struct radv_image_view *iview)
1714 {
1715 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1716 const struct radv_image *image = iview->image;
1717 VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
1718 uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
1719 unsigned reg_offset = 0, reg_count = 0;
1720
1721 if (!radv_image_has_htile(image))
1722 return;
1723
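	/* The clear metadata is a {stencil, depth} dword pair that matches the
	 * DB_STENCIL_CLEAR/DB_DEPTH_CLEAR register pair; skip the stencil dword
	 * when the format has no stencil aspect.
	 */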
1724 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1725 ++reg_count;
1726 } else {
1727 ++reg_offset;
1728 va += 4;
1729 }
1730 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
1731 ++reg_count;
1732
1733 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
1734
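	/* Prefer LOAD_CONTEXT_REG when the packet is supported; otherwise copy
	 * the values into the registers with COPY_DATA and stall the PFP with
	 * PFP_SYNC_ME until the copy has landed.
	 */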
1735 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
1736 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
1737 radeon_emit(cs, va);
1738 radeon_emit(cs, va >> 32);
1739 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1740 radeon_emit(cs, reg_count);
1741 } else {
1742 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1743 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1744 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1745 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
1746 radeon_emit(cs, va);
1747 radeon_emit(cs, va >> 32);
1748 radeon_emit(cs, reg >> 2);
1749 radeon_emit(cs, 0);
1750
1751 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
1752 radeon_emit(cs, 0);
1753 }
1754 }
1755
1756 /*
1757  * With DCC, some color surfaces don't require CMASK elimination before
1758  * being used as a texture. This writes a predicate value that determines
1759  * whether the CMASK eliminate pass is required.
1760 */
1761 void
1762 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
1763 struct radv_image *image,
1764 const VkImageSubresourceRange *range, bool value)
1765 {
1766 uint64_t pred_val = value;
1767 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
1768 uint32_t level_count = radv_get_levelCount(image, range);
1769 uint32_t count = 2 * level_count;
1770
1771 assert(radv_dcc_enabled(image, range->baseMipLevel));
1772
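	/* Write one 64-bit predicate value (two dwords) per mip level. */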
1773 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1774 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1775 S_370_WR_CONFIRM(1) |
1776 S_370_ENGINE_SEL(V_370_PFP));
1777 radeon_emit(cmd_buffer->cs, va);
1778 radeon_emit(cmd_buffer->cs, va >> 32);
1779
1780 for (uint32_t l = 0; l < level_count; l++) {
1781 radeon_emit(cmd_buffer->cs, pred_val);
1782 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1783 }
1784 }
1785
1786 /**
1787 * Update the DCC predicate to reflect the compression state.
1788 */
1789 void
1790 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
1791 struct radv_image *image,
1792 const VkImageSubresourceRange *range, bool value)
1793 {
1794 uint64_t pred_val = value;
1795 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
1796 uint32_t level_count = radv_get_levelCount(image, range);
1797 uint32_t count = 2 * level_count;
1798
1799 assert(radv_dcc_enabled(image, range->baseMipLevel));
1800
1801 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
1802 radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
1803 S_370_WR_CONFIRM(1) |
1804 S_370_ENGINE_SEL(V_370_PFP));
1805 radeon_emit(cmd_buffer->cs, va);
1806 radeon_emit(cmd_buffer->cs, va >> 32);
1807
1808 for (uint32_t l = 0; l < level_count; l++) {
1809 radeon_emit(cmd_buffer->cs, pred_val);
1810 radeon_emit(cmd_buffer->cs, pred_val >> 32);
1811 }
1812 }
1813
1814 /**
1815 * Update the fast clear color values if the image is bound as a color buffer.
1816 */
1817 static void
1818 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
1819 struct radv_image *image,
1820 int cb_idx,
1821 uint32_t color_values[2])
1822 {
1823 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1824 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1825 uint32_t att_idx;
1826
1827 if (!cmd_buffer->state.attachments || !subpass)
1828 return;
1829
1830 att_idx = subpass->color_attachments[cb_idx].attachment;
1831 if (att_idx == VK_ATTACHMENT_UNUSED)
1832 return;
1833
1834 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1835 return;
1836
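	/* CB_COLORn register blocks are 0x3c bytes apart; update CLEAR_WORD0/1 for this color buffer. */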
1837 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
1838 radeon_emit(cs, color_values[0]);
1839 radeon_emit(cs, color_values[1]);
1840
1841 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1842 }
1843
1844 /**
1845 * Set the clear color values to the image's metadata.
1846 */
1847 static void
1848 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1849 struct radv_image *image,
1850 const VkImageSubresourceRange *range,
1851 uint32_t color_values[2])
1852 {
1853 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1854 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
1855 uint32_t level_count = radv_get_levelCount(image, range);
1856 uint32_t count = 2 * level_count;
1857
1858 assert(radv_image_has_cmask(image) ||
1859 radv_dcc_enabled(image, range->baseMipLevel));
1860
1861 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
1862 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1863 S_370_WR_CONFIRM(1) |
1864 S_370_ENGINE_SEL(V_370_PFP));
1865 radeon_emit(cs, va);
1866 radeon_emit(cs, va >> 32);
1867
1868 for (uint32_t l = 0; l < level_count; l++) {
1869 radeon_emit(cs, color_values[0]);
1870 radeon_emit(cs, color_values[1]);
1871 }
1872 }
1873
1874 /**
1875 * Update the clear color values for this image.
1876 */
1877 void
1878 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1879 const struct radv_image_view *iview,
1880 int cb_idx,
1881 uint32_t color_values[2])
1882 {
1883 struct radv_image *image = iview->image;
1884 VkImageSubresourceRange range = {
1885 .aspectMask = iview->aspect_mask,
1886 .baseMipLevel = iview->base_mip,
1887 .levelCount = iview->level_count,
1888 .baseArrayLayer = iview->base_layer,
1889 .layerCount = iview->layer_count,
1890 };
1891
1892 assert(radv_image_has_cmask(image) ||
1893 radv_dcc_enabled(image, iview->base_mip));
1894
1895 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
1896
1897 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
1898 color_values);
1899 }
1900
1901 /**
1902 * Load the clear color values from the image's metadata.
1903 */
1904 static void
1905 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1906 struct radv_image_view *iview,
1907 int cb_idx)
1908 {
1909 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1910 struct radv_image *image = iview->image;
1911 uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
1912
1913 if (!radv_image_has_cmask(image) &&
1914 !radv_dcc_enabled(image, iview->base_mip))
1915 return;
1916
1917 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
1918
1919 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
1920 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
1921 radeon_emit(cs, va);
1922 radeon_emit(cs, va >> 32);
1923 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
1924 radeon_emit(cs, 2);
1925 } else {
1926 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
1927 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
1928 COPY_DATA_DST_SEL(COPY_DATA_REG) |
1929 COPY_DATA_COUNT_SEL);
1930 radeon_emit(cs, va);
1931 radeon_emit(cs, va >> 32);
1932 radeon_emit(cs, reg >> 2);
1933 radeon_emit(cs, 0);
1934
1935 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
1936 radeon_emit(cs, 0);
1937 }
1938 }
1939
1940 static void
1941 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
1942 {
1943 int i;
1944 struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1945 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1946
1947 	/* This may happen when recording an inherited secondary command buffer. */
1948 if (!framebuffer)
1949 return;
1950
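	/* Program all 8 color buffer slots; slots not used by the subpass get an
	 * invalid format so the CB ignores them.
	 */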
1951 for (i = 0; i < 8; ++i) {
1952 if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1953 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
1954 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
1955 continue;
1956 }
1957
1958 int idx = subpass->color_attachments[i].attachment;
1959 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
1960 VkImageLayout layout = subpass->color_attachments[i].layout;
1961 bool in_render_loop = subpass->color_attachments[i].in_render_loop;
1962
1963 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->bo);
1964
1965 assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
1966 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
1967 radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, in_render_loop);
1968
1969 radv_load_color_clear_metadata(cmd_buffer, iview, i);
1970 }
1971
1972 if (subpass->depth_stencil_attachment) {
1973 int idx = subpass->depth_stencil_attachment->attachment;
1974 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1975 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
1976 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
1977 struct radv_image *image = iview->image;
1978 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.attachments[idx].iview->bo);
1979 ASSERTED uint32_t queue_mask = radv_image_queue_family_mask(image,
1980 cmd_buffer->queue_family_index,
1981 cmd_buffer->queue_family_index);
1982 /* We currently don't support writing decompressed HTILE */
1983 assert(radv_layout_has_htile(image, layout, in_render_loop, queue_mask) ==
1984 radv_layout_is_htile_compressed(image, layout, in_render_loop, queue_mask));
1985
1986 radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, in_render_loop);
1987
1988 if (cmd_buffer->state.attachments[idx].ds.offset_scale != cmd_buffer->state.offset_scale) {
1989 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1990 cmd_buffer->state.offset_scale = cmd_buffer->state.attachments[idx].ds.offset_scale;
1991 }
1992 radv_load_ds_clear_metadata(cmd_buffer, iview);
1993 } else {
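		/* No depth/stencil attachment: mark the Z/stencil buffers invalid.
		 * Note that DB_Z_INFO lives at a different offset on GFX9.
		 */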
1994 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
1995 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
1996 else
1997 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
1998
1999 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
2000 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2001 }
2002 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2003 S_028208_BR_X(framebuffer->width) |
2004 S_028208_BR_Y(framebuffer->height));
2005
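	/* Program the DCC overwrite combiner; the watermark and MRT sharing
	 * settings depend on the chip generation.
	 */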
2006 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2007 bool disable_constant_encode =
2008 cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2009 enum chip_class chip_class =
2010 cmd_buffer->device->physical_device->rad_info.chip_class;
2011 uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2012
2013 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2014 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2015 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2016 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2017 }
2018
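	/* With primitive binning enabled, emit a BREAK_BATCH event, presumably so
	 * binning state is not carried across framebuffer changes.
	 */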
2019 if (cmd_buffer->device->pbb_allowed) {
2020 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2021 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2022 }
2023
2024 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2025 }
2026
2027 static void
2028 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
2029 {
2030 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2031 struct radv_cmd_state *state = &cmd_buffer->state;
2032
2033 if (state->index_type != state->last_index_type) {
2034 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2035 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
2036 cs, R_03090C_VGT_INDEX_TYPE,
2037 2, state->index_type);
2038 } else {
2039 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2040 radeon_emit(cs, state->index_type);
2041 }
2042
2043 state->last_index_type = state->index_type;
2044 }
2045
2046 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2047 radeon_emit(cs, state->index_va);
2048 radeon_emit(cs, state->index_va >> 32);
2049
2050 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2051 radeon_emit(cs, state->max_index_count);
2052
2053 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2054 }
2055
2056 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2057 {
2058 bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2059 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2060 uint32_t pa_sc_mode_cntl_1 =
2061 pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2062 uint32_t db_count_control;
2063
2064 	if (!cmd_buffer->state.active_occlusion_queries) {
2065 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2066 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2067 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2068 has_perfect_queries) {
2069 /* Re-enable out-of-order rasterization if the
2070 				 * bound pipeline supports it and if it has
2071 * been disabled before starting any perfect
2072 * occlusion queries.
2073 */
2074 radeon_set_context_reg(cmd_buffer->cs,
2075 R_028A4C_PA_SC_MODE_CNTL_1,
2076 pa_sc_mode_cntl_1);
2077 }
2078 }
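		/* No occlusion queries are active: keep the DB from accumulating ZPASS counts. */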
2079 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2080 } else {
2081 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2082 uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2083 bool gfx10_perfect = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2084
2085 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2086 db_count_control =
2087 S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
2088 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2089 S_028004_SAMPLE_RATE(sample_rate) |
2090 S_028004_ZPASS_ENABLE(1) |
2091 S_028004_SLICE_EVEN_ENABLE(1) |
2092 S_028004_SLICE_ODD_ENABLE(1);
2093
2094 if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2095 pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2096 has_perfect_queries) {
2097 /* If the bound pipeline has enabled
2098 * out-of-order rasterization, we should
2099 * disable it before starting any perfect
2100 * occlusion queries.
2101 */
2102 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2103
2104 radeon_set_context_reg(cmd_buffer->cs,
2105 R_028A4C_PA_SC_MODE_CNTL_1,
2106 pa_sc_mode_cntl_1);
2107 }
2108 } else {
2109 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2110 S_028004_SAMPLE_RATE(sample_rate);
2111 }
2112 }
2113
2114 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2115
2116 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2117 }
2118
2119 static void
2120 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
2121 {
2122 uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2123
2124 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2125 radv_emit_viewport(cmd_buffer);
2126
2127 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
2128 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
2129 radv_emit_scissor(cmd_buffer);
2130
2131 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
2132 radv_emit_line_width(cmd_buffer);
2133
2134 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
2135 radv_emit_blend_constants(cmd_buffer);
2136
2137 if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2138 RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2139 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
2140 radv_emit_stencil(cmd_buffer);
2141
2142 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
2143 radv_emit_depth_bounds(cmd_buffer);
2144
2145 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
2146 radv_emit_depth_bias(cmd_buffer);
2147
2148 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
2149 radv_emit_discard_rectangle(cmd_buffer);
2150
2151 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
2152 radv_emit_sample_locations(cmd_buffer);
2153
2154 cmd_buffer->state.dirty &= ~states;
2155 }
2156
2157 static void
2158 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
2159 VkPipelineBindPoint bind_point)
2160 {
2161 struct radv_descriptor_state *descriptors_state =
2162 radv_get_descriptors_state(cmd_buffer, bind_point);
2163 struct radv_descriptor_set *set = &descriptors_state->push_set.set;
2164 unsigned bo_offset;
2165
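	/* Upload the CPU copy of the push descriptor set and point the set at its new GPU address. */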
2166 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
2167 set->mapped_ptr,
2168 &bo_offset))
2169 return;
2170
2171 set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2172 set->va += bo_offset;
2173 }
2174
2175 static void
2176 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
2177 VkPipelineBindPoint bind_point)
2178 {
2179 struct radv_descriptor_state *descriptors_state =
2180 radv_get_descriptors_state(cmd_buffer, bind_point);
2181 uint32_t size = MAX_SETS * 4;
2182 uint32_t offset;
2183 void *ptr;
2184
2185 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
2186 256, &offset, &ptr))
2187 return;
2188
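	/* Store the lower 32 bits of each bound set's address; shaders reach this
	 * table through the AC_UD_INDIRECT_DESCRIPTOR_SETS user SGPR emitted below.
	 */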
2189 for (unsigned i = 0; i < MAX_SETS; i++) {
2190 uint32_t *uptr = ((uint32_t *)ptr) + i;
2191 uint64_t set_va = 0;
2192 struct radv_descriptor_set *set = descriptors_state->sets[i];
2193 if (descriptors_state->valid & (1u << i))
2194 set_va = set->va;
2195 uptr[0] = set_va & 0xffffffff;
2196 }
2197
2198 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2199 va += offset;
2200
2201 if (cmd_buffer->state.pipeline) {
2202 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
2203 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2204 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2205
2206 if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
2207 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
2208 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2209
2210 if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
2211 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
2212 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2213
2214 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2215 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
2216 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2217
2218 if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2219 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
2220 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2221 }
2222
2223 if (cmd_buffer->state.compute_pipeline)
2224 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
2225 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2226 }
2227
2228 static void
2229 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
2230 VkShaderStageFlags stages)
2231 {
2232 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2233 VK_PIPELINE_BIND_POINT_COMPUTE :
2234 VK_PIPELINE_BIND_POINT_GRAPHICS;
2235 struct radv_descriptor_state *descriptors_state =
2236 radv_get_descriptors_state(cmd_buffer, bind_point);
2237 struct radv_cmd_state *state = &cmd_buffer->state;
2238 bool flush_indirect_descriptors;
2239
2240 if (!descriptors_state->dirty)
2241 return;
2242
2243 if (descriptors_state->push_dirty)
2244 radv_flush_push_descriptors(cmd_buffer, bind_point);
2245
2246 flush_indirect_descriptors =
2247 (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
2248 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
2249 (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
2250 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);
2251
2252 if (flush_indirect_descriptors)
2253 radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
2254
2255 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
2256 cmd_buffer->cs,
2257 MAX_SETS * MESA_SHADER_STAGES * 4);
2258
2259 if (cmd_buffer->state.pipeline) {
2260 radv_foreach_stage(stage, stages) {
2261 if (!cmd_buffer->state.pipeline->shaders[stage])
2262 continue;
2263
2264 radv_emit_descriptor_pointers(cmd_buffer,
2265 cmd_buffer->state.pipeline,
2266 descriptors_state, stage);
2267 }
2268 }
2269
2270 if (cmd_buffer->state.compute_pipeline &&
2271 (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
2272 radv_emit_descriptor_pointers(cmd_buffer,
2273 cmd_buffer->state.compute_pipeline,
2274 descriptors_state,
2275 MESA_SHADER_COMPUTE);
2276 }
2277
2278 descriptors_state->dirty = 0;
2279 descriptors_state->push_dirty = false;
2280
2281 assert(cmd_buffer->cs->cdw <= cdw_max);
2282
2283 if (unlikely(cmd_buffer->device->trace_bo))
2284 radv_save_descriptors(cmd_buffer, bind_point);
2285 }
2286
2287 static void
2288 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
2289 VkShaderStageFlags stages)
2290 {
2291 struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
2292 ? cmd_buffer->state.compute_pipeline
2293 : cmd_buffer->state.pipeline;
2294 VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2295 VK_PIPELINE_BIND_POINT_COMPUTE :
2296 VK_PIPELINE_BIND_POINT_GRAPHICS;
2297 struct radv_descriptor_state *descriptors_state =
2298 radv_get_descriptors_state(cmd_buffer, bind_point);
2299 struct radv_pipeline_layout *layout = pipeline->layout;
2300 struct radv_shader_variant *shader, *prev_shader;
2301 bool need_push_constants = false;
2302 unsigned offset;
2303 void *ptr;
2304 uint64_t va;
2305
2306 stages &= cmd_buffer->push_constant_stages;
2307 if (!stages ||
2308 (!layout->push_constant_size && !layout->dynamic_offset_count))
2309 return;
2310
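	/* Emit inline push constants as user SGPRs and determine whether any stage
	 * still needs the constants uploaded to memory.
	 */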
2311 radv_foreach_stage(stage, stages) {
2312 if (!pipeline->shaders[stage])
2313 continue;
2314
2315 need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
2316 need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
2317
2318 uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
2319 uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;
2320
2321 radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
2322 AC_UD_INLINE_PUSH_CONSTANTS,
2323 count,
2324 (uint32_t *)&cmd_buffer->push_constants[base * 4]);
2325 }
2326
2327 if (need_push_constants) {
2328 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
2329 16 * layout->dynamic_offset_count,
2330 256, &offset, &ptr))
2331 return;
2332
2333 memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
2334 memcpy((char*)ptr + layout->push_constant_size,
2335 descriptors_state->dynamic_buffers,
2336 16 * layout->dynamic_offset_count);
2337
2338 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2339 va += offset;
2340
2341 ASSERTED unsigned cdw_max =
2342 radeon_check_space(cmd_buffer->device->ws,
2343 cmd_buffer->cs, MESA_SHADER_STAGES * 4);
2344
2345 prev_shader = NULL;
2346 radv_foreach_stage(stage, stages) {
2347 shader = radv_get_shader(pipeline, stage);
2348
2349 /* Avoid redundantly emitting the address for merged stages. */
2350 if (shader && shader != prev_shader) {
2351 radv_emit_userdata_address(cmd_buffer, pipeline, stage,
2352 AC_UD_PUSH_CONSTANTS, va);
2353
2354 prev_shader = shader;
2355 }
2356 }
2357 assert(cmd_buffer->cs->cdw <= cdw_max);
2358 }
2359
2360 cmd_buffer->push_constant_stages &= ~stages;
2361 }
2362
2363 static void
2364 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
2365 bool pipeline_is_dirty)
2366 {
2367 if ((pipeline_is_dirty ||
2368 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
2369 cmd_buffer->state.pipeline->num_vertex_bindings &&
2370 radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
2371 struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
2372 unsigned vb_offset;
2373 void *vb_ptr;
2374 uint32_t i = 0;
2375 uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
2376 uint64_t va;
2377
2378 		/* Allocate some descriptor state for vertex buffers. */
2379 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
2380 &vb_offset, &vb_ptr))
2381 return;
2382
2383 for (i = 0; i < count; i++) {
2384 uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
2385 uint32_t offset;
2386 struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
2387 uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
2388
2389 if (!buffer)
2390 continue;
2391
2392 va = radv_buffer_get_va(buffer->bo);
2393
2394 offset = cmd_buffer->vertex_bindings[i].offset;
2395 va += offset + buffer->offset;
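			/* Build the 4-dword buffer descriptor: address, stride,
			 * size/record count, and dst swizzle plus the
			 * per-generation format bits.
			 */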
2396 desc[0] = va;
2397 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
2398 if (cmd_buffer->device->physical_device->rad_info.chip_class <= GFX7 && stride)
2399 desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
2400 else
2401 desc[2] = buffer->size - offset;
2402 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2403 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2404 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2405 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2406
2407 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2408 desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) |
2409 S_008F0C_OOB_SELECT(1) |
2410 S_008F0C_RESOURCE_LEVEL(1);
2411 } else {
2412 desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
2413 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2414 }
2415 }
2416
2417 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2418 va += vb_offset;
2419
2420 radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2421 AC_UD_VS_VERTEX_BUFFERS, va);
2422
2423 cmd_buffer->state.vb_va = va;
2424 cmd_buffer->state.vb_size = count * 16;
2425 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
2426 }
2427 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
2428 }
2429
2430 static void
2431 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2432 {
2433 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2434 struct radv_userdata_info *loc;
2435 uint32_t base_reg;
2436
2437 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2438 if (!radv_get_shader(pipeline, stage))
2439 continue;
2440
2441 loc = radv_lookup_user_sgpr(pipeline, stage,
2442 AC_UD_STREAMOUT_BUFFERS);
2443 if (loc->sgpr_idx == -1)
2444 continue;
2445
2446 base_reg = pipeline->user_data_0[stage];
2447
2448 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2449 base_reg + loc->sgpr_idx * 4, va, false);
2450 }
2451
2452 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
2453 loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
2454 if (loc->sgpr_idx != -1) {
2455 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
2456
2457 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2458 base_reg + loc->sgpr_idx * 4, va, false);
2459 }
2460 }
2461 }
2462
2463 static void
2464 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
2465 {
2466 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
2467 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
2468 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
2469 unsigned so_offset;
2470 void *so_ptr;
2471 uint64_t va;
2472
2473 /* Allocate some descriptor state for streamout buffers. */
2474 if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
2475 MAX_SO_BUFFERS * 16, 256,
2476 &so_offset, &so_ptr))
2477 return;
2478
2479 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
2480 struct radv_buffer *buffer = sb[i].buffer;
2481 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
2482
2483 if (!(so->enabled_mask & (1 << i)))
2484 continue;
2485
2486 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
2487
2488 va += sb[i].offset;
2489
2490 /* Set the descriptor.
2491 *
2492 * On GFX8, the format must be non-INVALID, otherwise
2493 * the buffer will be considered not bound and store
2494 * instructions will be no-ops.
2495 */
2496 desc[0] = va;
2497 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
2498 desc[2] = 0xffffffff;
2499 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2500 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2501 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2502 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2503
2504 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2505 desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
2506 S_008F0C_OOB_SELECT(3) |
2507 S_008F0C_RESOURCE_LEVEL(1);
2508 } else {
2509 desc[3] |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2510 }
2511 }
2512
2513 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2514 va += so_offset;
2515
2516 radv_emit_streamout_buffers(cmd_buffer, va);
2517 }
2518
2519 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
2520 }
2521
2522 static void
2523 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2524 {
2525 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
2526 radv_flush_streamout_descriptors(cmd_buffer);
2527 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2528 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2529 }
2530
2531 struct radv_draw_info {
2532 /**
2533 * Number of vertices.
2534 */
2535 uint32_t count;
2536
2537 /**
2538 * Index of the first vertex.
2539 */
2540 int32_t vertex_offset;
2541
2542 /**
2543 * First instance id.
2544 */
2545 uint32_t first_instance;
2546
2547 /**
2548 * Number of instances.
2549 */
2550 uint32_t instance_count;
2551
2552 /**
2553 * First index (indexed draws only).
2554 */
2555 uint32_t first_index;
2556
2557 /**
2558 * Whether it's an indexed draw.
2559 */
2560 bool indexed;
2561
2562 /**
2563 * Indirect draw parameters resource.
2564 */
2565 struct radv_buffer *indirect;
2566 uint64_t indirect_offset;
2567 uint32_t stride;
2568
2569 /**
2570 * Draw count parameters resource.
2571 */
2572 struct radv_buffer *count_buffer;
2573 uint64_t count_buffer_offset;
2574
2575 /**
2576 * Stream output parameters resource.
2577 */
2578 struct radv_buffer *strmout_buffer;
2579 uint64_t strmout_buffer_offset;
2580 };
2581
2582 static uint32_t
2583 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
2584 {
2585 switch (cmd_buffer->state.index_type) {
2586 case V_028A7C_VGT_INDEX_8:
2587 return 0xffu;
2588 case V_028A7C_VGT_INDEX_16:
2589 return 0xffffu;
2590 case V_028A7C_VGT_INDEX_32:
2591 return 0xffffffffu;
2592 default:
2593 unreachable("invalid index type");
2594 }
2595 }
2596
2597 static void
2598 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
2599 bool instanced_draw, bool indirect_draw,
2600 bool count_from_stream_output,
2601 uint32_t draw_vertex_count)
2602 {
2603 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
2604 struct radv_cmd_state *state = &cmd_buffer->state;
2605 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2606 unsigned ia_multi_vgt_param;
2607
2608 ia_multi_vgt_param =
2609 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
2610 indirect_draw,
2611 count_from_stream_output,
2612 draw_vertex_count);
2613
2614 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
2615 if (info->chip_class == GFX9) {
2616 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
2617 cs,
2618 R_030960_IA_MULTI_VGT_PARAM,
2619 4, ia_multi_vgt_param);
2620 } else if (info->chip_class >= GFX7) {
2621 radeon_set_context_reg_idx(cs,
2622 R_028AA8_IA_MULTI_VGT_PARAM,
2623 1, ia_multi_vgt_param);
2624 } else {
2625 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
2626 ia_multi_vgt_param);
2627 }
2628 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
2629 }
2630 }
2631
2632 static void
2633 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
2634 const struct radv_draw_info *draw_info)
2635 {
2636 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
2637 struct radv_cmd_state *state = &cmd_buffer->state;
2638 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2639 int32_t primitive_reset_en;
2640
2641 /* Draw state. */
2642 if (info->chip_class < GFX10) {
2643 si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
2644 draw_info->indirect,
2645 !!draw_info->strmout_buffer,
2646 draw_info->indirect ? 0 : draw_info->count);
2647 }
2648
2649 /* Primitive restart. */
2650 primitive_reset_en =
2651 draw_info->indexed && state->pipeline->graphics.prim_restart_enable;
2652
2653 if (primitive_reset_en != state->last_primitive_reset_en) {
2654 state->last_primitive_reset_en = primitive_reset_en;
2655 if (info->chip_class >= GFX9) {
2656 radeon_set_uconfig_reg(cs,
2657 R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
2658 primitive_reset_en);
2659 } else {
2660 radeon_set_context_reg(cs,
2661 R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
2662 primitive_reset_en);
2663 }
2664 }
2665
2666 if (primitive_reset_en) {
2667 uint32_t primitive_reset_index =
2668 radv_get_primitive_reset_index(cmd_buffer);
2669
2670 if (primitive_reset_index != state->last_primitive_reset_index) {
2671 radeon_set_context_reg(cs,
2672 R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
2673 primitive_reset_index);
2674 state->last_primitive_reset_index = primitive_reset_index;
2675 }
2676 }
2677
2678 if (draw_info->strmout_buffer) {
2679 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
2680
2681 va += draw_info->strmout_buffer->offset +
2682 draw_info->strmout_buffer_offset;
2683
2684 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
2685 draw_info->stride);
2686
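		/* Copy the transform feedback buffer-filled size from memory into
		 * VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE so the vertex count
		 * can be derived from it.
		 */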
2687 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2688 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
2689 COPY_DATA_DST_SEL(COPY_DATA_REG) |
2690 COPY_DATA_WR_CONFIRM);
2691 radeon_emit(cs, va);
2692 radeon_emit(cs, va >> 32);
2693 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
2694 radeon_emit(cs, 0); /* unused */
2695
2696 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
2697 }
2698 }
2699
2700 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
2701 VkPipelineStageFlags src_stage_mask)
2702 {
2703 if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
2704 VK_PIPELINE_STAGE_TRANSFER_BIT |
2705 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2706 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2707 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2708 }
2709
2710 if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
2711 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
2712 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
2713 VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
2714 VK_PIPELINE_STAGE_TRANSFER_BIT |
2715 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
2716 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
2717 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
2718 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
2719 } else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
2720 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
2721 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
2722 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
2723 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
2724 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
2725 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
2726 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
2727 }
2728 }
2729
2730 static enum radv_cmd_flush_bits
2731 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
2732 VkAccessFlags src_flags,
2733 struct radv_image *image)
2734 {
2735 bool flush_CB_meta = true, flush_DB_meta = true;
2736 enum radv_cmd_flush_bits flush_bits = 0;
2737 uint32_t b;
2738
2739 if (image) {
2740 if (!radv_image_has_CB_metadata(image))
2741 flush_CB_meta = false;
2742 if (!radv_image_has_htile(image))
2743 flush_DB_meta = false;
2744 }
2745
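	/* Translate each source access flag into the cache flushes needed to make those writes visible. */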
2746 for_each_bit(b, src_flags) {
2747 switch ((VkAccessFlagBits)(1 << b)) {
2748 case VK_ACCESS_SHADER_WRITE_BIT:
2749 case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
2750 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2751 flush_bits |= RADV_CMD_FLAG_WB_L2;
2752 break;
2753 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
2754 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2755 if (flush_CB_meta)
2756 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2757 break;
2758 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
2759 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2760 if (flush_DB_meta)
2761 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2762 break;
2763 case VK_ACCESS_TRANSFER_WRITE_BIT:
2764 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2765 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
2766 RADV_CMD_FLAG_INV_L2;
2767
2768 if (flush_CB_meta)
2769 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2770 if (flush_DB_meta)
2771 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2772 break;
2773 default:
2774 break;
2775 }
2776 }
2777 return flush_bits;
2778 }
2779
2780 static enum radv_cmd_flush_bits
2781 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
2782 VkAccessFlags dst_flags,
2783 struct radv_image *image)
2784 {
2785 bool flush_CB_meta = true, flush_DB_meta = true;
2786 enum radv_cmd_flush_bits flush_bits = 0;
2787 bool flush_CB = true, flush_DB = true;
2788 bool image_is_coherent = false;
2789 uint32_t b;
2790
2791 if (image) {
2792 if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
2793 flush_CB = false;
2794 flush_DB = false;
2795 }
2796
2797 if (!radv_image_has_CB_metadata(image))
2798 flush_CB_meta = false;
2799 if (!radv_image_has_htile(image))
2800 flush_DB_meta = false;
2801
2802 		/* TODO: implement shader coherency for GFX10. */
2803
2804 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
2805 if (image->info.samples == 1 &&
2806 (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
2807 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
2808 !vk_format_is_stencil(image->vk_format)) {
2809 /* Single-sample color and single-sample depth
2810 * (not stencil) are coherent with shaders on
2811 * GFX9.
2812 */
2813 image_is_coherent = true;
2814 }
2815 }
2816 }
2817
2818 for_each_bit(b, dst_flags) {
2819 switch ((VkAccessFlagBits)(1 << b)) {
2820 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
2821 case VK_ACCESS_INDEX_READ_BIT:
2822 case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
2823 break;
2824 case VK_ACCESS_UNIFORM_READ_BIT:
2825 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
2826 break;
2827 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
2828 case VK_ACCESS_TRANSFER_READ_BIT:
2829 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
2830 flush_bits |= RADV_CMD_FLAG_INV_VCACHE |
2831 RADV_CMD_FLAG_INV_L2;
2832 break;
2833 case VK_ACCESS_SHADER_READ_BIT:
2834 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
2835
2836 if (!image_is_coherent)
2837 flush_bits |= RADV_CMD_FLAG_INV_L2;
2838 break;
2839 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
2840 if (flush_CB)
2841 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
2842 if (flush_CB_meta)
2843 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2844 break;
2845 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
2846 if (flush_DB)
2847 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2848 if (flush_DB_meta)
2849 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
2850 break;
2851 default:
2852 break;
2853 }
2854 }
2855 return flush_bits;
2856 }
2857
2858 void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
2859 const struct radv_subpass_barrier *barrier)
2860 {
2861 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
2862 NULL);
2863 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
2864 cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
2865 NULL);
2866 }
2867
2868 uint32_t
2869 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
2870 {
2871 struct radv_cmd_state *state = &cmd_buffer->state;
2872 uint32_t subpass_id = state->subpass - state->pass->subpasses;
2873
2874 /* The id of this subpass shouldn't exceed the number of subpasses in
2875 * this render pass minus 1.
2876 */
2877 assert(subpass_id < state->pass->subpass_count);
2878 return subpass_id;
2879 }
2880
2881 static struct radv_sample_locations_state *
2882 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer,
2883 uint32_t att_idx,
2884 bool begin_subpass)
2885 {
2886 struct radv_cmd_state *state = &cmd_buffer->state;
2887 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
2888 struct radv_image_view *view = state->attachments[att_idx].iview;
2889
2890 if (view->image->info.samples == 1)
2891 return NULL;
2892
2893 if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
2894 /* Return the initial sample locations if this is the initial
2895 		 * layout transition of the given subpass attachment.
2896 */
2897 if (state->attachments[att_idx].sample_location.count > 0)
2898 return &state->attachments[att_idx].sample_location;
2899 } else {
2900 /* Otherwise return the subpass sample locations if defined. */
2901 if (state->subpass_sample_locs) {
2902 /* Because the driver sets the current subpass before
2903 * initial layout transitions, we should use the sample
2904 * locations from the previous subpass to avoid an
2905 * off-by-one problem. Otherwise, use the sample
2906 * locations for the current subpass for final layout
2907 * transitions.
2908 */
2909 if (begin_subpass)
2910 subpass_id--;
2911
2912 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
2913 if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
2914 return &state->subpass_sample_locs[i].sample_location;
2915 }
2916 }
2917 }
2918
2919 return NULL;
2920 }
2921
2922 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
2923 struct radv_subpass_attachment att,
2924 bool begin_subpass)
2925 {
2926 unsigned idx = att.attachment;
2927 struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
2928 struct radv_sample_locations_state *sample_locs;
2929 VkImageSubresourceRange range;
2930 range.aspectMask = 0;
2931 range.baseMipLevel = view->base_mip;
2932 range.levelCount = 1;
2933 range.baseArrayLayer = view->base_layer;
2934 range.layerCount = cmd_buffer->state.framebuffer->layers;
2935
2936 if (cmd_buffer->state.subpass->view_mask) {
2937 /* If the current subpass uses multiview, the driver might have
2938 * performed a fast color/depth clear to the whole image
2939 * (including all layers). To make sure the driver will
2940 * decompress the image correctly (if needed), we have to
2941 * account for the "real" number of layers. If the view mask is
2942 * sparse, this will decompress more layers than needed.
2943 */
2944 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
2945 }
2946
2947 /* Get the subpass sample locations for the given attachment, if NULL
2948 * is returned the driver will use the default HW locations.
2949 */
2950 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx,
2951 begin_subpass);
2952
2953 radv_handle_image_transition(cmd_buffer,
2954 view->image,
2955 cmd_buffer->state.attachments[idx].current_layout,
2956 cmd_buffer->state.attachments[idx].current_in_render_loop,
2957 att.layout, att.in_render_loop,
2958 0, 0, &range, sample_locs);
2959
2960 cmd_buffer->state.attachments[idx].current_layout = att.layout;
2961 cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
2962
2963
2964 }
2965
2966 void
2967 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
2968 const struct radv_subpass *subpass)
2969 {
2970 cmd_buffer->state.subpass = subpass;
2971
2972 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
2973 }
2974
2975 static VkResult
2976 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
2977 struct radv_render_pass *pass,
2978 const VkRenderPassBeginInfo *info)
2979 {
2980 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
2981 vk_find_struct_const(info->pNext,
2982 RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
2983 struct radv_cmd_state *state = &cmd_buffer->state;
2984
2985 if (!sample_locs) {
2986 state->subpass_sample_locs = NULL;
2987 return VK_SUCCESS;
2988 }
2989
2990 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
2991 const VkAttachmentSampleLocationsEXT *att_sample_locs =
2992 &sample_locs->pAttachmentInitialSampleLocations[i];
2993 uint32_t att_idx = att_sample_locs->attachmentIndex;
2994 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
2995
2996 assert(vk_format_is_depth_or_stencil(image->vk_format));
2997
2998 /* From the Vulkan spec 1.1.108:
2999 *
3000 * "If the image referenced by the framebuffer attachment at
3001 * index attachmentIndex was not created with
3002 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
3003 * then the values specified in sampleLocationsInfo are
3004 * ignored."
3005 */
3006 if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
3007 continue;
3008
3009 const VkSampleLocationsInfoEXT *sample_locs_info =
3010 &att_sample_locs->sampleLocationsInfo;
3011
3012 state->attachments[att_idx].sample_location.per_pixel =
3013 sample_locs_info->sampleLocationsPerPixel;
3014 state->attachments[att_idx].sample_location.grid_size =
3015 sample_locs_info->sampleLocationGridSize;
3016 state->attachments[att_idx].sample_location.count =
3017 sample_locs_info->sampleLocationsCount;
3018 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
3019 sample_locs_info->pSampleLocations,
3020 sample_locs_info->sampleLocationsCount);
3021 }
3022
3023 state->subpass_sample_locs = vk_alloc(&cmd_buffer->pool->alloc,
3024 sample_locs->postSubpassSampleLocationsCount *
3025 sizeof(state->subpass_sample_locs[0]),
3026 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3027 if (state->subpass_sample_locs == NULL) {
3028 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3029 return cmd_buffer->record_result;
3030 }
3031
3032 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
3033
3034 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
3035 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
3036 &sample_locs->pPostSubpassSampleLocations[i];
3037 const VkSampleLocationsInfoEXT *sample_locs_info =
3038 &subpass_sample_locs_info->sampleLocationsInfo;
3039
3040 state->subpass_sample_locs[i].subpass_idx =
3041 subpass_sample_locs_info->subpassIndex;
3042 state->subpass_sample_locs[i].sample_location.per_pixel =
3043 sample_locs_info->sampleLocationsPerPixel;
3044 state->subpass_sample_locs[i].sample_location.grid_size =
3045 sample_locs_info->sampleLocationGridSize;
3046 state->subpass_sample_locs[i].sample_location.count =
3047 sample_locs_info->sampleLocationsCount;
3048 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
3049 sample_locs_info->pSampleLocations,
3050 sample_locs_info->sampleLocationsCount);
3051 }
3052
3053 return VK_SUCCESS;
3054 }
3055
3056 static VkResult
3057 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
3058 struct radv_render_pass *pass,
3059 const VkRenderPassBeginInfo *info)
3060 {
3061 struct radv_cmd_state *state = &cmd_buffer->state;
3062 const struct VkRenderPassAttachmentBeginInfoKHR *attachment_info = NULL;
3063
3064 if (info) {
3065 attachment_info = vk_find_struct_const(info->pNext,
3066 RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
3067 }
3068
3069
3070 if (pass->attachment_count == 0) {
3071 state->attachments = NULL;
3072 return VK_SUCCESS;
3073 }
3074
3075 state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
3076 pass->attachment_count *
3077 sizeof(state->attachments[0]),
3078 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3079 if (state->attachments == NULL) {
3080 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3081 return cmd_buffer->record_result;
3082 }
3083
3084 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
3085 struct radv_render_pass_attachment *att = &pass->attachments[i];
3086 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
3087 VkImageAspectFlags clear_aspects = 0;
3088
3089 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3090 /* color attachment */
3091 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3092 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
3093 }
3094 } else {
3095 /* depthstencil attachment */
3096 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
3097 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3098 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
3099 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3100 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
3101 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3102 }
3103 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3104 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3105 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3106 }
3107 }
3108
3109 state->attachments[i].pending_clear_aspects = clear_aspects;
3110 state->attachments[i].cleared_views = 0;
3111 if (clear_aspects && info) {
3112 assert(info->clearValueCount > i);
3113 state->attachments[i].clear_value = info->pClearValues[i];
3114 }
3115
3116 state->attachments[i].current_layout = att->initial_layout;
3117 state->attachments[i].sample_location.count = 0;
3118
3119 struct radv_image_view *iview;
3120 if (attachment_info && attachment_info->attachmentCount > i) {
3121 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
3122 } else {
3123 iview = state->framebuffer->attachments[i];
3124 }
3125
3126 state->attachments[i].iview = iview;
3127 if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3128 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
3129 } else {
3130 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
3131 }
3132 }
3133
3134 return VK_SUCCESS;
3135 }
3136
3137 VkResult radv_AllocateCommandBuffers(
3138 VkDevice _device,
3139 const VkCommandBufferAllocateInfo *pAllocateInfo,
3140 VkCommandBuffer *pCommandBuffers)
3141 {
3142 RADV_FROM_HANDLE(radv_device, device, _device);
3143 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
3144
3145 VkResult result = VK_SUCCESS;
3146 uint32_t i;
3147
3148 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
3149
3150 if (!list_empty(&pool->free_cmd_buffers)) {
3151 struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
3152
3153 list_del(&cmd_buffer->pool_link);
3154 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
3155
3156 result = radv_reset_cmd_buffer(cmd_buffer);
3157 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
3158 cmd_buffer->level = pAllocateInfo->level;
3159
3160 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
3161 } else {
3162 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
3163 &pCommandBuffers[i]);
3164 }
3165 if (result != VK_SUCCESS)
3166 break;
3167 }
3168
3169 if (result != VK_SUCCESS) {
3170 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
3171 i, pCommandBuffers);
3172
3173 /* From the Vulkan 1.0.66 spec:
3174 *
3175 * "vkAllocateCommandBuffers can be used to create multiple
3176 * command buffers. If the creation of any of those command
3177 * buffers fails, the implementation must destroy all
3178 * successfully created command buffer objects from this
3179 * command, set all entries of the pCommandBuffers array to
3180 * NULL and return the error."
3181 */
3182 memset(pCommandBuffers, 0,
3183 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
3184 }
3185
3186 return result;
3187 }
3188
3189 void radv_FreeCommandBuffers(
3190 VkDevice device,
3191 VkCommandPool commandPool,
3192 uint32_t commandBufferCount,
3193 const VkCommandBuffer *pCommandBuffers)
3194 {
3195 for (uint32_t i = 0; i < commandBufferCount; i++) {
3196 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
3197
3198 if (cmd_buffer) {
3199 if (cmd_buffer->pool) {
3200 list_del(&cmd_buffer->pool_link);
3201 list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
3202 } else
3203 radv_cmd_buffer_destroy(cmd_buffer);
3204
3205 }
3206 }
3207 }
3208
3209 VkResult radv_ResetCommandBuffer(
3210 VkCommandBuffer commandBuffer,
3211 VkCommandBufferResetFlags flags)
3212 {
3213 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3214 return radv_reset_cmd_buffer(cmd_buffer);
3215 }
3216
3217 VkResult radv_BeginCommandBuffer(
3218 VkCommandBuffer commandBuffer,
3219 const VkCommandBufferBeginInfo *pBeginInfo)
3220 {
3221 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3222 VkResult result = VK_SUCCESS;
3223
3224 if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
3225 /* If the command buffer has already been reset with
3226 * vkResetCommandBuffer, there is no need to do it again.
3227 */
3228 result = radv_reset_cmd_buffer(cmd_buffer);
3229 if (result != VK_SUCCESS)
3230 return result;
3231 }
3232
3233 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
3234 cmd_buffer->state.last_primitive_reset_en = -1;
3235 cmd_buffer->state.last_index_type = -1;
3236 cmd_buffer->state.last_num_instances = -1;
3237 cmd_buffer->state.last_vertex_offset = -1;
3238 cmd_buffer->state.last_first_instance = -1;
3239 cmd_buffer->state.predication_type = -1;
3240 cmd_buffer->usage_flags = pBeginInfo->flags;
3241
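/* Secondary command buffers that continue a render pass inherit the
 * render pass, framebuffer and subpass from pInheritanceInfo, so the
 * attachment state has to be set up here before recording starts.
 */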
3242 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
3243 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
3244 assert(pBeginInfo->pInheritanceInfo);
3245 cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
3246 cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
3247
3248 struct radv_subpass *subpass =
3249 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
3250
3251 if (cmd_buffer->state.framebuffer) {
3252 result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
3253 if (result != VK_SUCCESS)
3254 return result;
3255 }
3256
3257 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3258 }
3259
3260 if (unlikely(cmd_buffer->device->trace_bo)) {
3261 struct radv_device *device = cmd_buffer->device;
3262
3263 radv_cs_add_buffer(device->ws, cmd_buffer->cs,
3264 device->trace_bo);
3265
3266 radv_cmd_buffer_trace_emit(cmd_buffer);
3267 }
3268
3269 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
3270
3271 return result;
3272 }
3273
3274 void radv_CmdBindVertexBuffers(
3275 VkCommandBuffer commandBuffer,
3276 uint32_t firstBinding,
3277 uint32_t bindingCount,
3278 const VkBuffer* pBuffers,
3279 const VkDeviceSize* pOffsets)
3280 {
3281 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3282 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
3283 bool changed = false;
3284
3285 /* We have to defer setting up the vertex buffer descriptors because we
3286 * need the buffer stride from the pipeline. */
3287
3288 assert(firstBinding + bindingCount <= MAX_VBS);
3289 for (uint32_t i = 0; i < bindingCount; i++) {
3290 uint32_t idx = firstBinding + i;
3291
3292 if (!changed &&
3293 (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
3294 vb[idx].offset != pOffsets[i])) {
3295 changed = true;
3296 }
3297
3298 vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
3299 vb[idx].offset = pOffsets[i];
3300
3301 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3302 vb[idx].buffer->bo);
3303 }
3304
3305 if (!changed) {
3306 /* No state changes. */
3307 return;
3308 }
3309
3310 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
3311 }
3312
3313 static uint32_t
3314 vk_to_index_type(VkIndexType type)
3315 {
3316 switch (type) {
3317 case VK_INDEX_TYPE_UINT8_EXT:
3318 return V_028A7C_VGT_INDEX_8;
3319 case VK_INDEX_TYPE_UINT16:
3320 return V_028A7C_VGT_INDEX_16;
3321 case VK_INDEX_TYPE_UINT32:
3322 return V_028A7C_VGT_INDEX_32;
3323 default:
3324 unreachable("invalid index type");
3325 }
3326 }
3327
3328 static uint32_t
3329 radv_get_vgt_index_size(uint32_t type)
3330 {
3331 switch (type) {
3332 case V_028A7C_VGT_INDEX_8:
3333 return 1;
3334 case V_028A7C_VGT_INDEX_16:
3335 return 2;
3336 case V_028A7C_VGT_INDEX_32:
3337 return 4;
3338 default:
3339 unreachable("invalid index type");
3340 }
3341 }
3342
3343 void radv_CmdBindIndexBuffer(
3344 VkCommandBuffer commandBuffer,
3345 VkBuffer buffer,
3346 VkDeviceSize offset,
3347 VkIndexType indexType)
3348 {
3349 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3350 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
3351
3352 if (cmd_buffer->state.index_buffer == index_buffer &&
3353 cmd_buffer->state.index_offset == offset &&
3354 cmd_buffer->state.index_type == indexType) {
3355 /* No state changes. */
3356 return;
3357 }
3358
3359 cmd_buffer->state.index_buffer = index_buffer;
3360 cmd_buffer->state.index_offset = offset;
3361 cmd_buffer->state.index_type = vk_to_index_type(indexType);
3362 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
3363 cmd_buffer->state.index_va += index_buffer->offset + offset;
3364
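/* Precompute how many indices fit in the rest of the buffer. This is
 * later emitted in the DRAW_INDEX_2 packet as the maximum fetchable
 * index count and is also used to skip draws with an empty index range.
 */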
3365 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
3366 cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
3367 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
3368 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
3369 }
3370
3371
3372 static void
3373 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3374 VkPipelineBindPoint bind_point,
3375 struct radv_descriptor_set *set, unsigned idx)
3376 {
3377 struct radeon_winsys *ws = cmd_buffer->device->ws;
3378
3379 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
3380
3381 assert(set);
3382 assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
3383
3384 if (!cmd_buffer->device->use_global_bo_list) {
3385 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3386 if (set->descriptors[j])
3387 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
3388 }
3389
3390 if(set->bo)
3391 radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
3392 }
3393
3394 void radv_CmdBindDescriptorSets(
3395 VkCommandBuffer commandBuffer,
3396 VkPipelineBindPoint pipelineBindPoint,
3397 VkPipelineLayout _layout,
3398 uint32_t firstSet,
3399 uint32_t descriptorSetCount,
3400 const VkDescriptorSet* pDescriptorSets,
3401 uint32_t dynamicOffsetCount,
3402 const uint32_t* pDynamicOffsets)
3403 {
3404 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3405 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3406 unsigned dyn_idx = 0;
3407
3408 const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
3409 struct radv_descriptor_state *descriptors_state =
3410 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3411
3412 for (unsigned i = 0; i < descriptorSetCount; ++i) {
3413 unsigned idx = i + firstSet;
3414 RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
3415
3416 /* If the set is already bound we only need to update the
3417 * (potentially changed) dynamic offsets. */
3418 if (descriptors_state->sets[idx] != set ||
3419 !(descriptors_state->valid & (1u << idx))) {
3420 radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, idx);
3421 }
3422
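/* Each dynamic buffer is written as a 4-dword buffer descriptor:
 * dwords 0-1 hold the GPU VA (with the dynamic offset applied),
 * dword 2 the range size (or ~0 when RADV_DEBUG_NO_DYNAMIC_BOUNDS
 * disables bounds checking) and dword 3 the swizzle/format bits,
 * which differ between GFX10+ and older generations.
 */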
3423 for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
3424 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
3425 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
3426 assert(dyn_idx < dynamicOffsetCount);
3427
3428 struct radv_descriptor_range *range = set->dynamic_descriptors + j;
3429 uint64_t va = range->va + pDynamicOffsets[dyn_idx];
3430 dst[0] = va;
3431 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3432 dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
3433 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3434 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3435 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3436 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3437
3438 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
3439 dst[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3440 S_008F0C_OOB_SELECT(3) |
3441 S_008F0C_RESOURCE_LEVEL(1);
3442 } else {
3443 dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3444 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3445 }
3446
3447 cmd_buffer->push_constant_stages |=
3448 set->layout->dynamic_shader_stages;
3449 }
3450 }
3451 }
3452
3453 static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3454 struct radv_descriptor_set *set,
3455 struct radv_descriptor_set_layout *layout,
3456 VkPipelineBindPoint bind_point)
3457 {
3458 struct radv_descriptor_state *descriptors_state =
3459 radv_get_descriptors_state(cmd_buffer, bind_point);
3460 set->size = layout->size;
3461 set->layout = layout;
3462
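/* Grow the host storage for push descriptors geometrically: at least
 * 1024 bytes and at least double the previous capacity, capped at
 * 96 * MAX_PUSH_DESCRIPTORS (presumably because 96 bytes is the largest
 * single descriptor size).
 */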
3463 if (descriptors_state->push_set.capacity < set->size) {
3464 size_t new_size = MAX2(set->size, 1024);
3465 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
3466 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
3467
3468 free(set->mapped_ptr);
3469 set->mapped_ptr = malloc(new_size);
3470
3471 if (!set->mapped_ptr) {
3472 descriptors_state->push_set.capacity = 0;
3473 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3474 return false;
3475 }
3476
3477 descriptors_state->push_set.capacity = new_size;
3478 }
3479
3480 return true;
3481 }
3482
3483 void radv_meta_push_descriptor_set(
3484 struct radv_cmd_buffer* cmd_buffer,
3485 VkPipelineBindPoint pipelineBindPoint,
3486 VkPipelineLayout _layout,
3487 uint32_t set,
3488 uint32_t descriptorWriteCount,
3489 const VkWriteDescriptorSet* pDescriptorWrites)
3490 {
3491 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3492 struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
3493 unsigned bo_offset;
3494
3495 assert(set == 0);
3496 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3497
3498 push_set->size = layout->set[set].layout->size;
3499 push_set->layout = layout->set[set].layout;
3500
3501 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
3502 &bo_offset,
3503 (void**) &push_set->mapped_ptr))
3504 return;
3505
3506 push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3507 push_set->va += bo_offset;
3508
3509 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3510 radv_descriptor_set_to_handle(push_set),
3511 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3512
3513 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3514 }
3515
3516 void radv_CmdPushDescriptorSetKHR(
3517 VkCommandBuffer commandBuffer,
3518 VkPipelineBindPoint pipelineBindPoint,
3519 VkPipelineLayout _layout,
3520 uint32_t set,
3521 uint32_t descriptorWriteCount,
3522 const VkWriteDescriptorSet* pDescriptorWrites)
3523 {
3524 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3525 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3526 struct radv_descriptor_state *descriptors_state =
3527 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
3528 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3529
3530 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3531
3532 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3533 layout->set[set].layout,
3534 pipelineBindPoint))
3535 return;
3536
3537 /* Check that there are no inline uniform block updates, because pushing them
3538 * through vkCmdPushDescriptorSetKHR() is invalid according to the Vulkan spec.
3539 */
3540 for (int i = 0; i < descriptorWriteCount; i++) {
3541 ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
3542 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
3543 }
3544
3545 radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
3546 radv_descriptor_set_to_handle(push_set),
3547 descriptorWriteCount, pDescriptorWrites, 0, NULL);
3548
3549 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
3550 descriptors_state->push_dirty = true;
3551 }
3552
3553 void radv_CmdPushDescriptorSetWithTemplateKHR(
3554 VkCommandBuffer commandBuffer,
3555 VkDescriptorUpdateTemplate descriptorUpdateTemplate,
3556 VkPipelineLayout _layout,
3557 uint32_t set,
3558 const void* pData)
3559 {
3560 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3561 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
3562 RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
3563 struct radv_descriptor_state *descriptors_state =
3564 radv_get_descriptors_state(cmd_buffer, templ->bind_point);
3565 struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;
3566
3567 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
3568
3569 if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
3570 layout->set[set].layout,
3571 templ->bind_point))
3572 return;
3573
3574 radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
3575 descriptorUpdateTemplate, pData);
3576
3577 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
3578 descriptors_state->push_dirty = true;
3579 }
3580
3581 void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
3582 VkPipelineLayout layout,
3583 VkShaderStageFlags stageFlags,
3584 uint32_t offset,
3585 uint32_t size,
3586 const void* pValues)
3587 {
3588 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3589 memcpy(cmd_buffer->push_constants + offset, pValues, size);
3590 cmd_buffer->push_constant_stages |= stageFlags;
3591 }
3592
3593 VkResult radv_EndCommandBuffer(
3594 VkCommandBuffer commandBuffer)
3595 {
3596 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3597
3598 if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
3599 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
3600 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
3601
3602 /* Make sure to sync all pending active queries at the end of the
3603 * command buffer.
3604 */
3605 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
3606
3607 si_emit_cache_flush(cmd_buffer);
3608 }
3609
3610 /* Make sure CP DMA is idle at the end of IBs because the kernel
3611 * doesn't wait for it.
3612 */
3613 si_cp_dma_wait_for_idle(cmd_buffer);
3614
3615 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
3616 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
3617
3618 if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
3619 return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
3620
3621 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
3622
3623 return cmd_buffer->record_result;
3624 }
3625
3626 static void
3627 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
3628 {
3629 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
3630
3631 if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
3632 return;
3633
3634 assert(!pipeline->ctx_cs.cdw);
3635
3636 cmd_buffer->state.emitted_compute_pipeline = pipeline;
3637
3638 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
3639 radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
3640
3641 cmd_buffer->compute_scratch_size_needed =
3642 MAX2(cmd_buffer->compute_scratch_size_needed,
3643 pipeline->max_waves * pipeline->scratch_bytes_per_wave);
3644
3645 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
3646 pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
3647
3648 if (unlikely(cmd_buffer->device->trace_bo))
3649 radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
3650 }
3651
3652 static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
3653 VkPipelineBindPoint bind_point)
3654 {
3655 struct radv_descriptor_state *descriptors_state =
3656 radv_get_descriptors_state(cmd_buffer, bind_point);
3657
3658 descriptors_state->dirty |= descriptors_state->valid;
3659 }
3660
3661 void radv_CmdBindPipeline(
3662 VkCommandBuffer commandBuffer,
3663 VkPipelineBindPoint pipelineBindPoint,
3664 VkPipeline _pipeline)
3665 {
3666 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3667 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
3668
3669 switch (pipelineBindPoint) {
3670 case VK_PIPELINE_BIND_POINT_COMPUTE:
3671 if (cmd_buffer->state.compute_pipeline == pipeline)
3672 return;
3673 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3674
3675 cmd_buffer->state.compute_pipeline = pipeline;
3676 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3677 break;
3678 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3679 if (cmd_buffer->state.pipeline == pipeline)
3680 return;
3681 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
3682
3683 cmd_buffer->state.pipeline = pipeline;
3684 if (!pipeline)
3685 break;
3686
3687 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
3688 cmd_buffer->push_constant_stages |= pipeline->active_stages;
3689
3690 /* The new vertex shader might not use the same user data registers. */
3691 cmd_buffer->state.last_first_instance = -1;
3692 cmd_buffer->state.last_vertex_offset = -1;
3693
3694 /* Prefetch all pipeline shaders at first draw time. */
3695 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
3696
3697 if ((cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI10 ||
3698 cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI12 ||
3699 cmd_buffer->device->physical_device->rad_info.family == CHIP_NAVI14) &&
3700 cmd_buffer->state.emitted_pipeline &&
3701 radv_pipeline_has_ngg(cmd_buffer->state.emitted_pipeline) &&
3702 !radv_pipeline_has_ngg(cmd_buffer->state.pipeline)) {
3703 /* Transitioning from NGG to legacy GS requires
3704 * VGT_FLUSH on Navi10-14. VGT_FLUSH is also emitted
3705 * at the beginning of IBs when legacy GS ring pointers
3706 * are set.
3707 */
3708 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
3709 }
3710
3711 radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
3712 radv_bind_streamout_state(cmd_buffer, pipeline);
3713
3714 if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
3715 cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
3716 if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
3717 cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
3718
3719 if (radv_pipeline_has_tess(pipeline))
3720 cmd_buffer->tess_rings_needed = true;
3721 break;
3722 default:
3723 assert(!"invalid bind point");
3724 break;
3725 }
3726 }
3727
3728 void radv_CmdSetViewport(
3729 VkCommandBuffer commandBuffer,
3730 uint32_t firstViewport,
3731 uint32_t viewportCount,
3732 const VkViewport* pViewports)
3733 {
3734 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3735 struct radv_cmd_state *state = &cmd_buffer->state;
3736 ASSERTED const uint32_t total_count = firstViewport + viewportCount;
3737
3738 assert(firstViewport < MAX_VIEWPORTS);
3739 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
3740
3741 if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
3742 pViewports, viewportCount * sizeof(*pViewports))) {
3743 return;
3744 }
3745
3746 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
3747 viewportCount * sizeof(*pViewports));
3748
3749 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
3750 }
3751
3752 void radv_CmdSetScissor(
3753 VkCommandBuffer commandBuffer,
3754 uint32_t firstScissor,
3755 uint32_t scissorCount,
3756 const VkRect2D* pScissors)
3757 {
3758 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3759 struct radv_cmd_state *state = &cmd_buffer->state;
3760 ASSERTED const uint32_t total_count = firstScissor + scissorCount;
3761
3762 assert(firstScissor < MAX_SCISSORS);
3763 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
3764
3765 if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
3766 scissorCount * sizeof(*pScissors))) {
3767 return;
3768 }
3769
3770 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
3771 scissorCount * sizeof(*pScissors));
3772
3773 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
3774 }
3775
3776 void radv_CmdSetLineWidth(
3777 VkCommandBuffer commandBuffer,
3778 float lineWidth)
3779 {
3780 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3781
3782 if (cmd_buffer->state.dynamic.line_width == lineWidth)
3783 return;
3784
3785 cmd_buffer->state.dynamic.line_width = lineWidth;
3786 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
3787 }
3788
3789 void radv_CmdSetDepthBias(
3790 VkCommandBuffer commandBuffer,
3791 float depthBiasConstantFactor,
3792 float depthBiasClamp,
3793 float depthBiasSlopeFactor)
3794 {
3795 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3796 struct radv_cmd_state *state = &cmd_buffer->state;
3797
3798 if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
3799 state->dynamic.depth_bias.clamp == depthBiasClamp &&
3800 state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
3801 return;
3802 }
3803
3804 state->dynamic.depth_bias.bias = depthBiasConstantFactor;
3805 state->dynamic.depth_bias.clamp = depthBiasClamp;
3806 state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
3807
3808 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
3809 }
3810
3811 void radv_CmdSetBlendConstants(
3812 VkCommandBuffer commandBuffer,
3813 const float blendConstants[4])
3814 {
3815 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3816 struct radv_cmd_state *state = &cmd_buffer->state;
3817
3818 if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
3819 return;
3820
3821 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
3822
3823 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
3824 }
3825
3826 void radv_CmdSetDepthBounds(
3827 VkCommandBuffer commandBuffer,
3828 float minDepthBounds,
3829 float maxDepthBounds)
3830 {
3831 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3832 struct radv_cmd_state *state = &cmd_buffer->state;
3833
3834 if (state->dynamic.depth_bounds.min == minDepthBounds &&
3835 state->dynamic.depth_bounds.max == maxDepthBounds) {
3836 return;
3837 }
3838
3839 state->dynamic.depth_bounds.min = minDepthBounds;
3840 state->dynamic.depth_bounds.max = maxDepthBounds;
3841
3842 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
3843 }
3844
3845 void radv_CmdSetStencilCompareMask(
3846 VkCommandBuffer commandBuffer,
3847 VkStencilFaceFlags faceMask,
3848 uint32_t compareMask)
3849 {
3850 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3851 struct radv_cmd_state *state = &cmd_buffer->state;
3852 bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
3853 bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
3854
3855 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3856 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3857 return;
3858 }
3859
3860 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3861 state->dynamic.stencil_compare_mask.front = compareMask;
3862 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3863 state->dynamic.stencil_compare_mask.back = compareMask;
3864
3865 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
3866 }
3867
3868 void radv_CmdSetStencilWriteMask(
3869 VkCommandBuffer commandBuffer,
3870 VkStencilFaceFlags faceMask,
3871 uint32_t writeMask)
3872 {
3873 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3874 struct radv_cmd_state *state = &cmd_buffer->state;
3875 bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
3876 bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
3877
3878 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3879 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3880 return;
3881 }
3882
3883 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3884 state->dynamic.stencil_write_mask.front = writeMask;
3885 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3886 state->dynamic.stencil_write_mask.back = writeMask;
3887
3888 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
3889 }
3890
3891 void radv_CmdSetStencilReference(
3892 VkCommandBuffer commandBuffer,
3893 VkStencilFaceFlags faceMask,
3894 uint32_t reference)
3895 {
3896 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3897 struct radv_cmd_state *state = &cmd_buffer->state;
3898 bool front_same = state->dynamic.stencil_reference.front == reference;
3899 bool back_same = state->dynamic.stencil_reference.back == reference;
3900
3901 if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
3902 (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
3903 return;
3904 }
3905
3906 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3907 cmd_buffer->state.dynamic.stencil_reference.front = reference;
3908 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3909 cmd_buffer->state.dynamic.stencil_reference.back = reference;
3910
3911 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
3912 }
3913
3914 void radv_CmdSetDiscardRectangleEXT(
3915 VkCommandBuffer commandBuffer,
3916 uint32_t firstDiscardRectangle,
3917 uint32_t discardRectangleCount,
3918 const VkRect2D* pDiscardRectangles)
3919 {
3920 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3921 struct radv_cmd_state *state = &cmd_buffer->state;
3922 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
3923
3924 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
3925 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
3926
3927 if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
3928 pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
3929 return;
3930 }
3931
3932 typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
3933 pDiscardRectangles, discardRectangleCount);
3934
3935 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
3936 }
3937
3938 void radv_CmdSetSampleLocationsEXT(
3939 VkCommandBuffer commandBuffer,
3940 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
3941 {
3942 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3943 struct radv_cmd_state *state = &cmd_buffer->state;
3944
3945 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
3946
3947 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
3948 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
3949 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
3950 typed_memcpy(&state->dynamic.sample_location.locations[0],
3951 pSampleLocationsInfo->pSampleLocations,
3952 pSampleLocationsInfo->sampleLocationsCount);
3953
3954 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
3955 }
3956
3957 void radv_CmdExecuteCommands(
3958 VkCommandBuffer commandBuffer,
3959 uint32_t commandBufferCount,
3960 const VkCommandBuffer* pCmdBuffers)
3961 {
3962 RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
3963
3964 assert(commandBufferCount > 0);
3965
3966 /* Emit pending flushes on the primary prior to executing the secondaries. */
3967 si_emit_cache_flush(primary);
3968
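/* Scratch and ring requirements are accumulated as maxima so that the
 * primary command buffer ends up requesting resources large enough for
 * every secondary it executes.
 */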
3969 for (uint32_t i = 0; i < commandBufferCount; i++) {
3970 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
3971
3972 primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
3973 secondary->scratch_size_needed);
3974 primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
3975 secondary->compute_scratch_size_needed);
3976
3977 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
3978 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
3979 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
3980 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
3981 if (secondary->tess_rings_needed)
3982 primary->tess_rings_needed = true;
3983 if (secondary->sample_positions_needed)
3984 primary->sample_positions_needed = true;
3985
3986 if (!secondary->state.framebuffer &&
3987 (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
3988 /* Emit the framebuffer state from the primary if the secondary
3989 * has been recorded without a framebuffer; otherwise fast
3990 * color/depth clears can't work.
3991 */
3992 radv_emit_framebuffer_state(primary);
3993 }
3994
3995 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
3996
3997
3998 /* When the secondary command buffer is compute only we don't
3999 * need to re-emit the current graphics pipeline.
4000 */
4001 if (secondary->state.emitted_pipeline) {
4002 primary->state.emitted_pipeline =
4003 secondary->state.emitted_pipeline;
4004 }
4005
4006 /* When the secondary command buffer is graphics only we don't
4007 * need to re-emit the current compute pipeline.
4008 */
4009 if (secondary->state.emitted_compute_pipeline) {
4010 primary->state.emitted_compute_pipeline =
4011 secondary->state.emitted_compute_pipeline;
4012 }
4013
4014 /* Only re-emit the draw packets when needed. */
4015 if (secondary->state.last_primitive_reset_en != -1) {
4016 primary->state.last_primitive_reset_en =
4017 secondary->state.last_primitive_reset_en;
4018 }
4019
4020 if (secondary->state.last_primitive_reset_index) {
4021 primary->state.last_primitive_reset_index =
4022 secondary->state.last_primitive_reset_index;
4023 }
4024
4025 if (secondary->state.last_ia_multi_vgt_param) {
4026 primary->state.last_ia_multi_vgt_param =
4027 secondary->state.last_ia_multi_vgt_param;
4028 }
4029
4030 primary->state.last_first_instance = secondary->state.last_first_instance;
4031 primary->state.last_num_instances = secondary->state.last_num_instances;
4032 primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
4033
4034 if (secondary->state.last_index_type != -1) {
4035 primary->state.last_index_type =
4036 secondary->state.last_index_type;
4037 }
4038 }
4039
4040 /* After executing commands from secondary buffers we have to mark
4041 * some states dirty again so they are re-emitted on the primary.
4042 */
4043 primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
4044 RADV_CMD_DIRTY_INDEX_BUFFER |
4045 RADV_CMD_DIRTY_DYNAMIC_ALL;
4046 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
4047 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
4048 }
4049
4050 VkResult radv_CreateCommandPool(
4051 VkDevice _device,
4052 const VkCommandPoolCreateInfo* pCreateInfo,
4053 const VkAllocationCallbacks* pAllocator,
4054 VkCommandPool* pCmdPool)
4055 {
4056 RADV_FROM_HANDLE(radv_device, device, _device);
4057 struct radv_cmd_pool *pool;
4058
4059 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
4060 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4061 if (pool == NULL)
4062 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
4063
4064 if (pAllocator)
4065 pool->alloc = *pAllocator;
4066 else
4067 pool->alloc = device->alloc;
4068
4069 list_inithead(&pool->cmd_buffers);
4070 list_inithead(&pool->free_cmd_buffers);
4071
4072 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
4073
4074 *pCmdPool = radv_cmd_pool_to_handle(pool);
4075
4076 return VK_SUCCESS;
4077
4078 }
4079
4080 void radv_DestroyCommandPool(
4081 VkDevice _device,
4082 VkCommandPool commandPool,
4083 const VkAllocationCallbacks* pAllocator)
4084 {
4085 RADV_FROM_HANDLE(radv_device, device, _device);
4086 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4087
4088 if (!pool)
4089 return;
4090
4091 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4092 &pool->cmd_buffers, pool_link) {
4093 radv_cmd_buffer_destroy(cmd_buffer);
4094 }
4095
4096 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4097 &pool->free_cmd_buffers, pool_link) {
4098 radv_cmd_buffer_destroy(cmd_buffer);
4099 }
4100
4101 vk_free2(&device->alloc, pAllocator, pool);
4102 }
4103
4104 VkResult radv_ResetCommandPool(
4105 VkDevice device,
4106 VkCommandPool commandPool,
4107 VkCommandPoolResetFlags flags)
4108 {
4109 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4110 VkResult result;
4111
4112 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
4113 &pool->cmd_buffers, pool_link) {
4114 result = radv_reset_cmd_buffer(cmd_buffer);
4115 if (result != VK_SUCCESS)
4116 return result;
4117 }
4118
4119 return VK_SUCCESS;
4120 }
4121
4122 void radv_TrimCommandPool(
4123 VkDevice device,
4124 VkCommandPool commandPool,
4125 VkCommandPoolTrimFlags flags)
4126 {
4127 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4128
4129 if (!pool)
4130 return;
4131
4132 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
4133 &pool->free_cmd_buffers, pool_link) {
4134 radv_cmd_buffer_destroy(cmd_buffer);
4135 }
4136 }
4137
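/* Begins the given subpass: emits its start barrier, switches the
 * command buffer state to the subpass, performs the required image
 * layout transitions for its attachments and then emits any pending
 * clears.
 */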
4138 static void
4139 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
4140 uint32_t subpass_id)
4141 {
4142 struct radv_cmd_state *state = &cmd_buffer->state;
4143 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
4144
4145 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
4146 cmd_buffer->cs, 4096);
4147
4148 radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
4149
4150 radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
4151
4152 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4153 const uint32_t a = subpass->attachments[i].attachment;
4154 if (a == VK_ATTACHMENT_UNUSED)
4155 continue;
4156
4157 radv_handle_subpass_image_transition(cmd_buffer,
4158 subpass->attachments[i],
4159 true);
4160 }
4161
4162 radv_cmd_buffer_clear_subpass(cmd_buffer);
4163
4164 assert(cmd_buffer->cs->cdw <= cdw_max);
4165 }
4166
4167 static void
4168 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
4169 {
4170 struct radv_cmd_state *state = &cmd_buffer->state;
4171 const struct radv_subpass *subpass = state->subpass;
4172 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4173
4174 radv_cmd_buffer_resolve_subpass(cmd_buffer);
4175
4176 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4177 const uint32_t a = subpass->attachments[i].attachment;
4178 if (a == VK_ATTACHMENT_UNUSED)
4179 continue;
4180
4181 if (state->pass->attachments[a].last_subpass_idx != subpass_id)
4182 continue;
4183
4184 VkImageLayout layout = state->pass->attachments[a].final_layout;
4185 struct radv_subpass_attachment att = { a, layout };
4186 radv_handle_subpass_image_transition(cmd_buffer, att, false);
4187 }
4188 }
4189
4190 void radv_CmdBeginRenderPass(
4191 VkCommandBuffer commandBuffer,
4192 const VkRenderPassBeginInfo* pRenderPassBegin,
4193 VkSubpassContents contents)
4194 {
4195 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4196 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
4197 RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
4198 VkResult result;
4199
4200 cmd_buffer->state.framebuffer = framebuffer;
4201 cmd_buffer->state.pass = pass;
4202 cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
4203
4204 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
4205 if (result != VK_SUCCESS)
4206 return;
4207
4208 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
4209 if (result != VK_SUCCESS)
4210 return;
4211
4212 radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
4213 }
4214
4215 void radv_CmdBeginRenderPass2KHR(
4216 VkCommandBuffer commandBuffer,
4217 const VkRenderPassBeginInfo* pRenderPassBeginInfo,
4218 const VkSubpassBeginInfoKHR* pSubpassBeginInfo)
4219 {
4220 radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
4221 pSubpassBeginInfo->contents);
4222 }
4223
4224 void radv_CmdNextSubpass(
4225 VkCommandBuffer commandBuffer,
4226 VkSubpassContents contents)
4227 {
4228 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4229
4230 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
4231 radv_cmd_buffer_end_subpass(cmd_buffer);
4232 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
4233 }
4234
4235 void radv_CmdNextSubpass2KHR(
4236 VkCommandBuffer commandBuffer,
4237 const VkSubpassBeginInfoKHR* pSubpassBeginInfo,
4238 const VkSubpassEndInfoKHR* pSubpassEndInfo)
4239 {
4240 radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
4241 }
4242
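/* For multiview, write the current view index into the VIEW_INDEX user
 * SGPR of every active shader stage that uses it, including the GS copy
 * shader which runs on the hardware VS stage.
 */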
4243 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
4244 {
4245 struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
4246 for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
4247 if (!radv_get_shader(pipeline, stage))
4248 continue;
4249
4250 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
4251 if (loc->sgpr_idx == -1)
4252 continue;
4253 uint32_t base_reg = pipeline->user_data_0[stage];
4254 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
4255
4256 }
4257 if (radv_pipeline_has_gs_copy_shader(pipeline)) {
4258 struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
4259 if (loc->sgpr_idx != -1) {
4260 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
4261 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
4262 }
4263 }
4264 }
4265
4266 static void
4267 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4268 uint32_t vertex_count,
4269 bool use_opaque)
4270 {
4271 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
4272 radeon_emit(cmd_buffer->cs, vertex_count);
4273 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
4274 S_0287F0_USE_OPAQUE(use_opaque));
4275 }
4276
4277 static void
4278 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
4279 uint64_t index_va,
4280 uint32_t index_count)
4281 {
4282 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
4283 radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
4284 radeon_emit(cmd_buffer->cs, index_va);
4285 radeon_emit(cmd_buffer->cs, index_va >> 32);
4286 radeon_emit(cmd_buffer->cs, index_count);
4287 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
4288 }
4289
4290 static void
4291 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
4292 bool indexed,
4293 uint32_t draw_count,
4294 uint64_t count_va,
4295 uint32_t stride)
4296 {
4297 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4298 unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
4299 : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
4300 bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id;
4301 uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
4302 bool predicating = cmd_buffer->state.predicating;
4303 assert(base_reg);
4304
4305 /* Just reset the draw state for the vertex data. */
4306 cmd_buffer->state.last_first_instance = -1;
4307 cmd_buffer->state.last_num_instances = -1;
4308 cmd_buffer->state.last_vertex_offset = -1;
4309
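/* A single draw without a count buffer and without a draw id can use
 * the plain DRAW_(INDEX_)INDIRECT packet; everything else goes through
 * DRAW_(INDEX_)INDIRECT_MULTI with the draw-id and indirect-count
 * enable bits set as needed.
 */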
4310 if (draw_count == 1 && !count_va && !draw_id_enable) {
4311 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
4312 PKT3_DRAW_INDIRECT, 3, predicating));
4313 radeon_emit(cs, 0);
4314 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4315 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4316 radeon_emit(cs, di_src_sel);
4317 } else {
4318 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
4319 PKT3_DRAW_INDIRECT_MULTI,
4320 8, predicating));
4321 radeon_emit(cs, 0);
4322 radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
4323 radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
4324 radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
4325 S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
4326 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
4327 radeon_emit(cs, draw_count); /* count */
4328 radeon_emit(cs, count_va); /* count_addr */
4329 radeon_emit(cs, count_va >> 32);
4330 radeon_emit(cs, stride); /* stride */
4331 radeon_emit(cs, di_src_sel);
4332 }
4333 }
4334
4335 static void
4336 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
4337 const struct radv_draw_info *info)
4338 {
4339 struct radv_cmd_state *state = &cmd_buffer->state;
4340 struct radeon_winsys *ws = cmd_buffer->device->ws;
4341 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4342
4343 if (info->indirect) {
4344 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4345 uint64_t count_va = 0;
4346
4347 va += info->indirect->offset + info->indirect_offset;
4348
4349 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4350
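/* Program the base address of the indirect draw arguments (SET_BASE,
 * base index 1); the indirect draw packets emitted below then reference
 * it through a data offset (0 here).
 */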
4351 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
4352 radeon_emit(cs, 1);
4353 radeon_emit(cs, va);
4354 radeon_emit(cs, va >> 32);
4355
4356 if (info->count_buffer) {
4357 count_va = radv_buffer_get_va(info->count_buffer->bo);
4358 count_va += info->count_buffer->offset +
4359 info->count_buffer_offset;
4360
4361 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
4362 }
4363
4364 if (!state->subpass->view_mask) {
4365 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4366 info->indexed,
4367 info->count,
4368 count_va,
4369 info->stride);
4370 } else {
4371 unsigned i;
4372 for_each_bit(i, state->subpass->view_mask) {
4373 radv_emit_view_index(cmd_buffer, i);
4374
4375 radv_cs_emit_indirect_draw_packet(cmd_buffer,
4376 info->indexed,
4377 info->count,
4378 count_va,
4379 info->stride);
4380 }
4381 }
4382 } else {
4383 assert(state->pipeline->graphics.vtx_base_sgpr);
4384
4385 if (info->vertex_offset != state->last_vertex_offset ||
4386 info->first_instance != state->last_first_instance) {
4387 radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
4388 state->pipeline->graphics.vtx_emit_num);
4389
4390 radeon_emit(cs, info->vertex_offset);
4391 radeon_emit(cs, info->first_instance);
4392 if (state->pipeline->graphics.vtx_emit_num == 3)
4393 radeon_emit(cs, 0);
4394 state->last_first_instance = info->first_instance;
4395 state->last_vertex_offset = info->vertex_offset;
4396 }
4397
4398 if (state->last_num_instances != info->instance_count) {
4399 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
4400 radeon_emit(cs, info->instance_count);
4401 state->last_num_instances = info->instance_count;
4402 }
4403
4404 if (info->indexed) {
4405 int index_size = radv_get_vgt_index_size(state->index_type);
4406 uint64_t index_va;
4407
4408 /* Skip draw calls with 0-sized index buffers. They
4409 * cause a hang on some chips, like Navi10-14.
4410 */
4411 if (!cmd_buffer->state.max_index_count)
4412 return;
4413
4414 index_va = state->index_va;
4415 index_va += info->first_index * index_size;
4416
4417 if (!state->subpass->view_mask) {
4418 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4419 index_va,
4420 info->count);
4421 } else {
4422 unsigned i;
4423 for_each_bit(i, state->subpass->view_mask) {
4424 radv_emit_view_index(cmd_buffer, i);
4425
4426 radv_cs_emit_draw_indexed_packet(cmd_buffer,
4427 index_va,
4428 info->count);
4429 }
4430 }
4431 } else {
4432 if (!state->subpass->view_mask) {
4433 radv_cs_emit_draw_packet(cmd_buffer,
4434 info->count,
4435 !!info->strmout_buffer);
4436 } else {
4437 unsigned i;
4438 for_each_bit(i, state->subpass->view_mask) {
4439 radv_emit_view_index(cmd_buffer, i);
4440
4441 radv_cs_emit_draw_packet(cmd_buffer,
4442 info->count,
4443 !!info->strmout_buffer);
4444 }
4445 }
4446 }
4447 }
4448 }
4449
4450 /*
4451 * Vega and Raven have a bug which triggers if there are multiple context
4452 * register contexts active at the same time with different scissor values.
4453 *
4454 * There are two possible workarounds:
4455 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
4456 * there is only ever 1 active set of scissor values at the same time.
4457 *
4458 * 2) Whenever the hardware switches contexts we have to set the scissor
4459 * registers again even if it is a no-op. That way the new context gets
4460 * the correct scissor values.
4461 *
4462 * This implements option 2. radv_need_late_scissor_emission needs to
4463 * return true on affected HW if radv_emit_all_graphics_states sets
4464 * any context registers.
4465 */
4466 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
4467 const struct radv_draw_info *info)
4468 {
4469 struct radv_cmd_state *state = &cmd_buffer->state;
4470
4471 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
4472 return false;
4473
4474 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
4475 return true;
4476
4477 uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
4478
4479 /* Index, vertex and streamout buffers don't change context regs, and
4480 * pipeline is already handled.
4481 */
4482 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
4483 RADV_CMD_DIRTY_VERTEX_BUFFER |
4484 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
4485 RADV_CMD_DIRTY_PIPELINE);
4486
4487 if (cmd_buffer->state.dirty & used_states)
4488 return true;
4489
4490 uint32_t primitive_reset_index =
4491 radv_get_primitive_reset_index(cmd_buffer);
4492
4493 if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
4494 primitive_reset_index != state->last_primitive_reset_index)
4495 return true;
4496
4497 return false;
4498 }
4499
4500 static void
4501 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
4502 const struct radv_draw_info *info)
4503 {
4504 bool late_scissor_emission;
4505
4506 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
4507 cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
4508 radv_emit_rbplus_state(cmd_buffer);
4509
4510 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
4511 radv_emit_graphics_pipeline(cmd_buffer);
4512
4513 /* This must be done before cmd_buffer->state.dirty is cleared
4514 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
4515 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
4516 late_scissor_emission =
4517 radv_need_late_scissor_emission(cmd_buffer, info);
4518
4519 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
4520 radv_emit_framebuffer_state(cmd_buffer);
4521
4522 if (info->indexed) {
4523 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
4524 radv_emit_index_buffer(cmd_buffer);
4525 } else {
4526 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
4527 * so the state must be re-emitted before the next indexed
4528 * draw.
4529 */
4530 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
4531 cmd_buffer->state.last_index_type = -1;
4532 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
4533 }
4534 }
4535
4536 radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
4537
4538 radv_emit_draw_registers(cmd_buffer, info);
4539
4540 if (late_scissor_emission)
4541 radv_emit_scissor(cmd_buffer);
4542 }
4543
4544 static void
4545 radv_draw(struct radv_cmd_buffer *cmd_buffer,
4546 const struct radv_draw_info *info)
4547 {
4548 struct radeon_info *rad_info =
4549 &cmd_buffer->device->physical_device->rad_info;
4550 bool has_prefetch =
4551 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
4552 bool pipeline_is_dirty =
4553 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
4554 cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
4555
4556 ASSERTED unsigned cdw_max =
4557 radeon_check_space(cmd_buffer->device->ws,
4558 cmd_buffer->cs, 4096);
4559
4560 if (likely(!info->indirect)) {
4561 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
4562 * no workaround for indirect draws, but we can at least skip
4563 * direct draws.
4564 */
4565 if (unlikely(!info->instance_count))
4566 return;
4567
4568 /* Handle count == 0. */
4569 if (unlikely(!info->count && !info->strmout_buffer))
4570 return;
4571 }
4572
4573 /* Use optimal packet order based on whether we need to sync the
4574 * pipeline.
4575 */
4576 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4577 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4578 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
4579 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
4580 /* If we have to wait for idle, set all states first, so that
4581 * all SET packets are processed in parallel with previous draw
4582 * calls. Then upload descriptors, set shader pointers, and
4583 * draw, and prefetch at the end. This ensures that the time
4584 * the CUs are idle is very short. (there are only SET_SH
4585 * packets between the wait and the draw)
4586 */
4587 radv_emit_all_graphics_states(cmd_buffer, info);
4588 si_emit_cache_flush(cmd_buffer);
4589 /* <-- CUs are idle here --> */
4590
4591 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4592
4593 radv_emit_draw_packets(cmd_buffer, info);
4594 /* <-- CUs are busy here --> */
4595
4596 /* Start prefetches after the draw has been started. Both will
4597 * run in parallel, but starting the draw first is more
4598 * important.
4599 */
4600 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4601 radv_emit_prefetch_L2(cmd_buffer,
4602 cmd_buffer->state.pipeline, false);
4603 }
4604 } else {
4605 /* If we don't wait for idle, start prefetches first, then set
4606 * states, and draw at the end.
4607 */
4608 si_emit_cache_flush(cmd_buffer);
4609
4610 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4611 /* Only prefetch the vertex shader and VBO descriptors
4612 * in order to start the draw as soon as possible.
4613 */
4614 radv_emit_prefetch_L2(cmd_buffer,
4615 cmd_buffer->state.pipeline, true);
4616 }
4617
4618 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
4619
4620 radv_emit_all_graphics_states(cmd_buffer, info);
4621 radv_emit_draw_packets(cmd_buffer, info);
4622
4623 /* Prefetch the remaining shaders after the draw has been
4624 * started.
4625 */
4626 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
4627 radv_emit_prefetch_L2(cmd_buffer,
4628 cmd_buffer->state.pipeline, false);
4629 }
4630 }
4631
4632 /* Workaround for a VGT hang when streamout is enabled.
4633 * It must be done after drawing.
4634 */
4635 if (cmd_buffer->state.streamout.streamout_enabled &&
4636 (rad_info->family == CHIP_HAWAII ||
4637 rad_info->family == CHIP_TONGA ||
4638 rad_info->family == CHIP_FIJI)) {
4639 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
4640 }
4641
4642 assert(cmd_buffer->cs->cdw <= cdw_max);
4643 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
4644 }
4645
4646 void radv_CmdDraw(
4647 VkCommandBuffer commandBuffer,
4648 uint32_t vertexCount,
4649 uint32_t instanceCount,
4650 uint32_t firstVertex,
4651 uint32_t firstInstance)
4652 {
4653 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4654 struct radv_draw_info info = {};
4655
4656 info.count = vertexCount;
4657 info.instance_count = instanceCount;
4658 info.first_instance = firstInstance;
4659 info.vertex_offset = firstVertex;
4660
4661 radv_draw(cmd_buffer, &info);
4662 }
4663
4664 void radv_CmdDrawIndexed(
4665 VkCommandBuffer commandBuffer,
4666 uint32_t indexCount,
4667 uint32_t instanceCount,
4668 uint32_t firstIndex,
4669 int32_t vertexOffset,
4670 uint32_t firstInstance)
4671 {
4672 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4673 struct radv_draw_info info = {};
4674
4675 info.indexed = true;
4676 info.count = indexCount;
4677 info.instance_count = instanceCount;
4678 info.first_index = firstIndex;
4679 info.vertex_offset = vertexOffset;
4680 info.first_instance = firstInstance;
4681
4682 radv_draw(cmd_buffer, &info);
4683 }
4684
4685 void radv_CmdDrawIndirect(
4686 VkCommandBuffer commandBuffer,
4687 VkBuffer _buffer,
4688 VkDeviceSize offset,
4689 uint32_t drawCount,
4690 uint32_t stride)
4691 {
4692 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4693 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4694 struct radv_draw_info info = {};
4695
4696 info.count = drawCount;
4697 info.indirect = buffer;
4698 info.indirect_offset = offset;
4699 info.stride = stride;
4700
4701 radv_draw(cmd_buffer, &info);
4702 }
4703
4704 void radv_CmdDrawIndexedIndirect(
4705 VkCommandBuffer commandBuffer,
4706 VkBuffer _buffer,
4707 VkDeviceSize offset,
4708 uint32_t drawCount,
4709 uint32_t stride)
4710 {
4711 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4712 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4713 struct radv_draw_info info = {};
4714
4715 info.indexed = true;
4716 info.count = drawCount;
4717 info.indirect = buffer;
4718 info.indirect_offset = offset;
4719 info.stride = stride;
4720
4721 radv_draw(cmd_buffer, &info);
4722 }
4723
4724 void radv_CmdDrawIndirectCountKHR(
4725 VkCommandBuffer commandBuffer,
4726 VkBuffer _buffer,
4727 VkDeviceSize offset,
4728 VkBuffer _countBuffer,
4729 VkDeviceSize countBufferOffset,
4730 uint32_t maxDrawCount,
4731 uint32_t stride)
4732 {
4733 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4734 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4735 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4736 struct radv_draw_info info = {};
4737
4738 info.count = maxDrawCount;
4739 info.indirect = buffer;
4740 info.indirect_offset = offset;
4741 info.count_buffer = count_buffer;
4742 info.count_buffer_offset = countBufferOffset;
4743 info.stride = stride;
4744
4745 radv_draw(cmd_buffer, &info);
4746 }
4747
4748 void radv_CmdDrawIndexedIndirectCountKHR(
4749 VkCommandBuffer commandBuffer,
4750 VkBuffer _buffer,
4751 VkDeviceSize offset,
4752 VkBuffer _countBuffer,
4753 VkDeviceSize countBufferOffset,
4754 uint32_t maxDrawCount,
4755 uint32_t stride)
4756 {
4757 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4758 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
4759 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
4760 struct radv_draw_info info = {};
4761
4762 info.indexed = true;
4763 info.count = maxDrawCount;
4764 info.indirect = buffer;
4765 info.indirect_offset = offset;
4766 info.count_buffer = count_buffer;
4767 info.count_buffer_offset = countBufferOffset;
4768 info.stride = stride;
4769
4770 radv_draw(cmd_buffer, &info);
4771 }
4772
4773 struct radv_dispatch_info {
4774 /**
4775 * Determine the layout of the grid (in block units) to be used.
4776 */
4777 uint32_t blocks[3];
4778
4779 /**
4780 * A starting offset for the grid. If unaligned is set, the offset
4781 * must still be aligned.
4782 */
4783 uint32_t offsets[3];
4784 /**
4785 * Whether it's an unaligned compute dispatch.
4786 */
4787 bool unaligned;
4788
4789 /**
4790 * Indirect compute parameters resource.
4791 */
4792 struct radv_buffer *indirect;
4793 uint64_t indirect_offset;
4794 };
4795
4796 static void
4797 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
4798 const struct radv_dispatch_info *info)
4799 {
4800 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4801 struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
4802 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
4803 struct radeon_winsys *ws = cmd_buffer->device->ws;
4804 bool predicating = cmd_buffer->state.predicating;
4805 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4806 struct radv_userdata_info *loc;
4807
4808 loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
4809 AC_UD_CS_GRID_SIZE);
4810
4811 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
4812
4813 if (info->indirect) {
4814 uint64_t va = radv_buffer_get_va(info->indirect->bo);
4815
4816 va += info->indirect->offset + info->indirect_offset;
4817
4818 radv_cs_add_buffer(ws, cs, info->indirect->bo);
4819
4820 if (loc->sgpr_idx != -1) {
4821 for (unsigned i = 0; i < 3; ++i) {
4822 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4823 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
4824 COPY_DATA_DST_SEL(COPY_DATA_REG));
4825 radeon_emit(cs, (va + 4 * i));
4826 radeon_emit(cs, (va + 4 * i) >> 32);
4827 radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
4828 + loc->sgpr_idx * 4) >> 2) + i);
4829 radeon_emit(cs, 0);
4830 }
4831 }
4832
4833 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
4834 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
4835 PKT3_SHADER_TYPE_S(1));
4836 radeon_emit(cs, va);
4837 radeon_emit(cs, va >> 32);
4838 radeon_emit(cs, dispatch_initiator);
4839 } else {
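			/* On the GFX ring, DISPATCH_INDIRECT takes an offset
			 * relative to a base address programmed with SET_BASE
			 * (base index 1), so point the base at the indirect
			 * buffer and dispatch at offset 0.
			 */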
4840 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
4841 PKT3_SHADER_TYPE_S(1));
4842 radeon_emit(cs, 1);
4843 radeon_emit(cs, va);
4844 radeon_emit(cs, va >> 32);
4845
4846 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
4847 PKT3_SHADER_TYPE_S(1));
4848 radeon_emit(cs, 0);
4849 radeon_emit(cs, dispatch_initiator);
4850 }
4851 } else {
4852 unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
4853 unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };
4854
4855 if (info->unaligned) {
4856 unsigned *cs_block_size = compute_shader->info.cs.block_size;
4857 unsigned remainder[3];
4858
4859 /* If aligned, these should be an entire block size,
4860 * not 0.
4861 */
4862 remainder[0] = blocks[0] + cs_block_size[0] -
4863 align_u32_npot(blocks[0], cs_block_size[0]);
4864 remainder[1] = blocks[1] + cs_block_size[1] -
4865 align_u32_npot(blocks[1], cs_block_size[1]);
4866 remainder[2] = blocks[2] + cs_block_size[2] -
4867 align_u32_npot(blocks[2], cs_block_size[2]);
4868
4869 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
4870 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
4871 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
4872
4873 for (unsigned i = 0; i < 3; ++i) {
4874 assert(offsets[i] % cs_block_size[i] == 0);
4875 offsets[i] /= cs_block_size[i];
4876 }
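			/* Worked example (made-up numbers): with blocks[0] = 100
			 * threads and a shader block size of 64,
			 * align_u32_npot(100, 64) = 128, so remainder[0] =
			 * 100 + 64 - 128 = 36 threads in the last, partial
			 * threadgroup, and blocks[0] = round_up_u32(100, 64) = 2
			 * threadgroups. If blocks[0] were already a multiple of
			 * 64, remainder[0] would equal the full block size (64),
			 * matching the comment above.
			 */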
4877
4878 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
4879 radeon_emit(cs,
4880 S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
4881 S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
4882 radeon_emit(cs,
4883 S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
4884 S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
4885 radeon_emit(cs,
4886 S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
4887 S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
4888
4889 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
4890 }
4891
4892 if (loc->sgpr_idx != -1) {
4893 assert(loc->num_sgprs == 3);
4894
4895 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
4896 loc->sgpr_idx * 4, 3);
4897 radeon_emit(cs, blocks[0]);
4898 radeon_emit(cs, blocks[1]);
4899 radeon_emit(cs, blocks[2]);
4900 }
4901
4902 if (offsets[0] || offsets[1] || offsets[2]) {
4903 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
4904 radeon_emit(cs, offsets[0]);
4905 radeon_emit(cs, offsets[1]);
4906 radeon_emit(cs, offsets[2]);
4907
4908 /* The blocks in the packet are not counts but end values. */
4909 for (unsigned i = 0; i < 3; ++i)
4910 blocks[i] += offsets[i];
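			/* Worked example (made-up numbers): for vkCmdDispatchBase
			 * with base (2, 0, 0) and a 4x1x1 grid, COMPUTE_START_X
			 * is set to 2 and the DISPATCH_DIRECT dimensions become
			 * (6, 1, 1), i.e. end values rather than counts.
			 */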
4911 } else {
4912 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
4913 }
4914
4915 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
4916 PKT3_SHADER_TYPE_S(1));
4917 radeon_emit(cs, blocks[0]);
4918 radeon_emit(cs, blocks[1]);
4919 radeon_emit(cs, blocks[2]);
4920 radeon_emit(cs, dispatch_initiator);
4921 }
4922
4923 assert(cmd_buffer->cs->cdw <= cdw_max);
4924 }
4925
4926 static void
4927 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
4928 {
4929 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
4930 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
4931 }
4932
4933 static void
4934 radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
4935 const struct radv_dispatch_info *info)
4936 {
4937 struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4938 bool has_prefetch =
4939 cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
4940 bool pipeline_is_dirty = pipeline &&
4941 pipeline != cmd_buffer->state.emitted_compute_pipeline;
4942
4943 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
4944 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
4945 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
4946 RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
4947 /* If we have to wait for idle, set all states first, so that
4948 * all SET packets are processed in parallel with previous draw
4949 * calls. Then upload descriptors, set shader pointers, dispatch,
4950 * and prefetch at the end. This ensures that the time the CUs
4951 * are idle is very short. (There are only SET_SH packets
4952 * between the wait and the dispatch.)
4953 */
4954 radv_emit_compute_pipeline(cmd_buffer);
4955 si_emit_cache_flush(cmd_buffer);
4956 /* <-- CUs are idle here --> */
4957
4958 radv_upload_compute_shader_descriptors(cmd_buffer);
4959
4960 radv_emit_dispatch_packets(cmd_buffer, info);
4961 /* <-- CUs are busy here --> */
4962
4963 /* Start prefetches after the dispatch has been started. Both
4964 * will run in parallel, but starting the dispatch first is
4965 * more important.
4966 */
4967 if (has_prefetch && pipeline_is_dirty) {
4968 radv_emit_shader_prefetch(cmd_buffer,
4969 pipeline->shaders[MESA_SHADER_COMPUTE]);
4970 }
4971 } else {
4972 /* If we don't wait for idle, start prefetches first, then set
4973 * states, and dispatch at the end.
4974 */
4975 si_emit_cache_flush(cmd_buffer);
4976
4977 if (has_prefetch && pipeline_is_dirty) {
4978 radv_emit_shader_prefetch(cmd_buffer,
4979 pipeline->shaders[MESA_SHADER_COMPUTE]);
4980 }
4981
4982 radv_upload_compute_shader_descriptors(cmd_buffer);
4983
4984 radv_emit_compute_pipeline(cmd_buffer);
4985 radv_emit_dispatch_packets(cmd_buffer, info);
4986 }
4987
4988 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
4989 }
4990
4991 void radv_CmdDispatchBase(
4992 VkCommandBuffer commandBuffer,
4993 uint32_t base_x,
4994 uint32_t base_y,
4995 uint32_t base_z,
4996 uint32_t x,
4997 uint32_t y,
4998 uint32_t z)
4999 {
5000 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5001 struct radv_dispatch_info info = {};
5002
5003 info.blocks[0] = x;
5004 info.blocks[1] = y;
5005 info.blocks[2] = z;
5006
5007 info.offsets[0] = base_x;
5008 info.offsets[1] = base_y;
5009 info.offsets[2] = base_z;
5010 radv_dispatch(cmd_buffer, &info);
5011 }
5012
5013 void radv_CmdDispatch(
5014 VkCommandBuffer commandBuffer,
5015 uint32_t x,
5016 uint32_t y,
5017 uint32_t z)
5018 {
5019 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
5020 }
5021
5022 void radv_CmdDispatchIndirect(
5023 VkCommandBuffer commandBuffer,
5024 VkBuffer _buffer,
5025 VkDeviceSize offset)
5026 {
5027 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5028 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
5029 struct radv_dispatch_info info = {};
5030
5031 info.indirect = buffer;
5032 info.indirect_offset = offset;
5033
5034 radv_dispatch(cmd_buffer, &info);
5035 }
5036
5037 void radv_unaligned_dispatch(
5038 struct radv_cmd_buffer *cmd_buffer,
5039 uint32_t x,
5040 uint32_t y,
5041 uint32_t z)
5042 {
5043 struct radv_dispatch_info info = {};
5044
5045 info.blocks[0] = x;
5046 info.blocks[1] = y;
5047 info.blocks[2] = z;
5048 info.unaligned = 1;
5049
5050 radv_dispatch(cmd_buffer, &info);
5051 }
5052
5053 void radv_CmdEndRenderPass(
5054 VkCommandBuffer commandBuffer)
5055 {
5056 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5057
5058 radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
5059
5060 radv_cmd_buffer_end_subpass(cmd_buffer);
5061
5062 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
5063 vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
5064
5065 cmd_buffer->state.pass = NULL;
5066 cmd_buffer->state.subpass = NULL;
5067 cmd_buffer->state.attachments = NULL;
5068 cmd_buffer->state.framebuffer = NULL;
5069 cmd_buffer->state.subpass_sample_locs = NULL;
5070 }
5071
5072 void radv_CmdEndRenderPass2KHR(
5073 VkCommandBuffer commandBuffer,
5074 const VkSubpassEndInfoKHR* pSubpassEndInfo)
5075 {
5076 radv_CmdEndRenderPass(commandBuffer);
5077 }
5078
5079 /*
5080 * For HTILE we have the following interesting clear words:
5081 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
5082 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
5083 * 0xfffffff0: Clear depth to 1.0
5084 * 0x00000000: Clear depth to 0.0
5085 */
5086 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
5087 struct radv_image *image,
5088 const VkImageSubresourceRange *range,
5089 uint32_t clear_word)
5090 {
5091 assert(range->baseMipLevel == 0);
5092 assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_MIP_LEVELS);
5093 VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
5094 struct radv_cmd_state *state = &cmd_buffer->state;
5095 VkClearDepthStencilValue value = {};
5096
5097 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5098 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5099
5100 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, clear_word);
5101
5102 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5103
5104 if (vk_format_is_stencil(image->vk_format))
5105 aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5106
5107 radv_set_ds_clear_metadata(cmd_buffer, image, range, value, aspects);
5108
5109 if (radv_image_is_tc_compat_htile(image)) {
5110 /* Initialize the TC-compat metadata value to 0 because by
5111 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
5112 * have to conditionally update its value when performing
5113 * a fast depth clear.
5114 */
5115 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
5116 }
5117 }
5118
5119 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
5120 struct radv_image *image,
5121 VkImageLayout src_layout,
5122 bool src_render_loop,
5123 VkImageLayout dst_layout,
5124 bool dst_render_loop,
5125 unsigned src_queue_mask,
5126 unsigned dst_queue_mask,
5127 const VkImageSubresourceRange *range,
5128 struct radv_sample_locations_state *sample_locs)
5129 {
5130 if (!radv_image_has_htile(image))
5131 return;
5132
5133 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
5134 uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
5135
5136 if (radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop,
5137 dst_queue_mask)) {
5138 clear_value = 0;
5139 }
5140
5141 radv_initialize_htile(cmd_buffer, image, range, clear_value);
5142 } else if (!radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) &&
5143 radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5144 uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
5145 radv_initialize_htile(cmd_buffer, image, range, clear_value);
5146 } else if (radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) &&
5147 !radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5148 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5149 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5150
5151 radv_decompress_depth_image_inplace(cmd_buffer, image, range,
5152 sample_locs);
5153
5154 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5155 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5156 }
5157 }
5158
5159 static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
5160 struct radv_image *image,
5161 const VkImageSubresourceRange *range,
5162 uint32_t value)
5163 {
5164 struct radv_cmd_state *state = &cmd_buffer->state;
5165
5166 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5167 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5168
5169 state->flush_bits |= radv_clear_cmask(cmd_buffer, image, range, value);
5170
5171 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5172 }
5173
5174 void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
5175 struct radv_image *image,
5176 const VkImageSubresourceRange *range)
5177 {
5178 struct radv_cmd_state *state = &cmd_buffer->state;
5179 static const uint32_t fmask_clear_values[4] = {
5180 0x00000000,
5181 0x02020202,
5182 0xE4E4E4E4,
5183 0x76543210
5184 };
5185 uint32_t log2_samples = util_logbase2(image->info.samples);
5186 uint32_t value = fmask_clear_values[log2_samples];
5187
5188 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5189 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5190
5191 state->flush_bits |= radv_clear_fmask(cmd_buffer, image, range, value);
5192
5193 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5194 }
5195
5196 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
5197 struct radv_image *image,
5198 const VkImageSubresourceRange *range, uint32_t value)
5199 {
5200 struct radv_cmd_state *state = &cmd_buffer->state;
5201 unsigned size = 0;
5202
5203 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5204 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5205
5206 state->flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
5207
5208 if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
5209 /* When DCC is enabled with mipmaps, some levels might not
5210 * support fast clears and we have to initialize them as "fully
5211 * expanded".
5212 */
5213 /* Compute the size of all fast clearable DCC levels. */
5214 for (unsigned i = 0; i < image->planes[0].surface.num_dcc_levels; i++) {
5215 struct legacy_surf_level *surf_level =
5216 &image->planes[0].surface.u.legacy.level[i];
5217 unsigned dcc_fast_clear_size =
5218 surf_level->dcc_slice_fast_clear_size * image->info.array_size;
5219
5220 if (!dcc_fast_clear_size)
5221 break;
5222
5223 size = surf_level->dcc_offset + dcc_fast_clear_size;
5224 }
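		/* Hypothetical example: if only levels 0 and 1 are fast-clearable,
		 * with dcc_offset 0 and 0x1000 and per-slice fast-clear sizes
		 * 0x1000 and 0x400 (array_size = 1), the loop ends with
		 * size = 0x1400, and the DCC bytes from 0x1400 up to dcc_size
		 * are then initialized to 0xffffffff ("fully expanded") below.
		 */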
5225
5226 /* Initialize the mipmap levels without DCC. */
5227 if (size != image->planes[0].surface.dcc_size) {
5228 state->flush_bits |=
5229 radv_fill_buffer(cmd_buffer, image->bo,
5230 image->offset + image->dcc_offset + size,
5231 image->planes[0].surface.dcc_size - size,
5232 0xffffffff);
5233 }
5234 }
5235
5236 state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5237 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5238 }
5239
5240 /**
5241 * Initialize DCC/FMASK/CMASK metadata for a color image.
5242 */
5243 static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
5244 struct radv_image *image,
5245 VkImageLayout src_layout,
5246 bool src_render_loop,
5247 VkImageLayout dst_layout,
5248 bool dst_render_loop,
5249 unsigned src_queue_mask,
5250 unsigned dst_queue_mask,
5251 const VkImageSubresourceRange *range)
5252 {
5253 if (radv_image_has_cmask(image)) {
5254 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5255
5256 /* TODO: clarify this. */
5257 if (radv_image_has_fmask(image)) {
5258 value = 0xccccccccu;
5259 }
5260
5261 radv_initialise_cmask(cmd_buffer, image, range, value);
5262 }
5263
5264 if (radv_image_has_fmask(image)) {
5265 radv_initialize_fmask(cmd_buffer, image, range);
5266 }
5267
5268 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5269 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
5270 bool need_decompress_pass = false;
5271
5272 if (radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout,
5273 dst_render_loop,
5274 dst_queue_mask)) {
5275 value = 0x20202020u;
5276 need_decompress_pass = true;
5277 }
5278
5279 radv_initialize_dcc(cmd_buffer, image, range, value);
5280
5281 radv_update_fce_metadata(cmd_buffer, image, range,
5282 need_decompress_pass);
5283 }
5284
5285 if (radv_image_has_cmask(image) ||
5286 radv_dcc_enabled(image, range->baseMipLevel)) {
5287 uint32_t color_values[2] = {};
5288 radv_set_color_clear_metadata(cmd_buffer, image, range,
5289 color_values);
5290 }
5291 }
5292
5293 /**
5294 * Handle color image transitions for DCC/FMASK/CMASK.
5295 */
5296 static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
5297 struct radv_image *image,
5298 VkImageLayout src_layout,
5299 bool src_render_loop,
5300 VkImageLayout dst_layout,
5301 bool dst_render_loop,
5302 unsigned src_queue_mask,
5303 unsigned dst_queue_mask,
5304 const VkImageSubresourceRange *range)
5305 {
5306 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
5307 radv_init_color_image_metadata(cmd_buffer, image,
5308 src_layout, src_render_loop,
5309 dst_layout, dst_render_loop,
5310 src_queue_mask, dst_queue_mask,
5311 range);
5312 return;
5313 }
5314
5315 if (radv_dcc_enabled(image, range->baseMipLevel)) {
5316 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
5317 radv_initialize_dcc(cmd_buffer, image, range, 0xffffffffu);
5318 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, src_layout, src_render_loop, src_queue_mask) &&
5319 !radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout, dst_render_loop, dst_queue_mask)) {
5320 radv_decompress_dcc(cmd_buffer, image, range);
5321 } else if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
5322 !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5323 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5324 }
5325 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
5326 bool fce_eliminate = false, fmask_expand = false;
5327
5328 if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
5329 !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
5330 fce_eliminate = true;
5331 }
5332
5333 if (radv_image_has_fmask(image)) {
5334 if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
5335 dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
5336 /* An FMASK decompress is required before doing
5337 * an MSAA decompress using FMASK.
5338 */
5339 fmask_expand = true;
5340 }
5341 }
5342
5343 if (fce_eliminate || fmask_expand)
5344 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
5345
5346 if (fmask_expand)
5347 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
5348 }
5349 }
5350
5351 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
5352 struct radv_image *image,
5353 VkImageLayout src_layout,
5354 bool src_render_loop,
5355 VkImageLayout dst_layout,
5356 bool dst_render_loop,
5357 uint32_t src_family,
5358 uint32_t dst_family,
5359 const VkImageSubresourceRange *range,
5360 struct radv_sample_locations_state *sample_locs)
5361 {
5362 if (image->exclusive && src_family != dst_family) {
5363 /* This is an acquire or a release operation and there will be
5364 * a corresponding release/acquire. Do the transition in the
5365 * most flexible queue. */
5366
5367 assert(src_family == cmd_buffer->queue_family_index ||
5368 dst_family == cmd_buffer->queue_family_index);
5369
5370 if (src_family == VK_QUEUE_FAMILY_EXTERNAL ||
5371 src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
5372 return;
5373
5374 if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
5375 return;
5376
5377 if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
5378 (src_family == RADV_QUEUE_GENERAL ||
5379 dst_family == RADV_QUEUE_GENERAL))
5380 return;
5381 }
5382
5383 if (src_layout == dst_layout)
5384 return;
5385
5386 unsigned src_queue_mask =
5387 radv_image_queue_family_mask(image, src_family,
5388 cmd_buffer->queue_family_index);
5389 unsigned dst_queue_mask =
5390 radv_image_queue_family_mask(image, dst_family,
5391 cmd_buffer->queue_family_index);
5392
5393 if (vk_format_is_depth(image->vk_format)) {
5394 radv_handle_depth_image_transition(cmd_buffer, image,
5395 src_layout, src_render_loop,
5396 dst_layout, dst_render_loop,
5397 src_queue_mask, dst_queue_mask,
5398 range, sample_locs);
5399 } else {
5400 radv_handle_color_image_transition(cmd_buffer, image,
5401 src_layout, src_render_loop,
5402 dst_layout, dst_render_loop,
5403 src_queue_mask, dst_queue_mask,
5404 range);
5405 }
5406 }
5407
5408 struct radv_barrier_info {
5409 uint32_t eventCount;
5410 const VkEvent *pEvents;
5411 VkPipelineStageFlags srcStageMask;
5412 VkPipelineStageFlags dstStageMask;
5413 };
5414
5415 static void
5416 radv_barrier(struct radv_cmd_buffer *cmd_buffer,
5417 uint32_t memoryBarrierCount,
5418 const VkMemoryBarrier *pMemoryBarriers,
5419 uint32_t bufferMemoryBarrierCount,
5420 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
5421 uint32_t imageMemoryBarrierCount,
5422 const VkImageMemoryBarrier *pImageMemoryBarriers,
5423 const struct radv_barrier_info *info)
5424 {
5425 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5426 enum radv_cmd_flush_bits src_flush_bits = 0;
5427 enum radv_cmd_flush_bits dst_flush_bits = 0;
5428
5429 for (unsigned i = 0; i < info->eventCount; ++i) {
5430 RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
5431 uint64_t va = radv_buffer_get_va(event->bo);
5432
5433 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5434
5435 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
5436
5437 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
5438 assert(cmd_buffer->cs->cdw <= cdw_max);
5439 }
5440
5441 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
5442 src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
5443 NULL);
5444 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
5445 NULL);
5446 }
5447
5448 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
5449 src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
5450 NULL);
5451 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
5452 NULL);
5453 }
5454
5455 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5456 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5457
5458 src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
5459 image);
5460 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
5461 image);
5462 }
5463
5464 /* The Vulkan spec 1.1.98 says:
5465 *
5466 * "An execution dependency with only
5467 * VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
5468 * will only prevent that stage from executing in subsequently
5469 * submitted commands. As this stage does not perform any actual
5470 * execution, this is not observable - in effect, it does not delay
5471 * processing of subsequent commands. Similarly an execution dependency
5472 * with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
5473 * will effectively not wait for any prior commands to complete."
5474 */
5475 if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
5476 radv_stage_flush(cmd_buffer, info->srcStageMask);
5477 cmd_buffer->state.flush_bits |= src_flush_bits;
5478
5479 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
5480 RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
5481
5482 const struct VkSampleLocationsInfoEXT *sample_locs_info =
5483 vk_find_struct_const(pImageMemoryBarriers[i].pNext,
5484 SAMPLE_LOCATIONS_INFO_EXT);
5485 struct radv_sample_locations_state sample_locations = {};
5486
5487 if (sample_locs_info) {
5488 assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
5489 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
5490 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
5491 sample_locations.count = sample_locs_info->sampleLocationsCount;
5492 typed_memcpy(&sample_locations.locations[0],
5493 sample_locs_info->pSampleLocations,
5494 sample_locs_info->sampleLocationsCount);
5495 }
5496
5497 radv_handle_image_transition(cmd_buffer, image,
5498 pImageMemoryBarriers[i].oldLayout,
5499 false, /* Outside of a renderpass we are never in a renderloop */
5500 pImageMemoryBarriers[i].newLayout,
5501 false, /* Outside of a renderpass we are never in a renderloop */
5502 pImageMemoryBarriers[i].srcQueueFamilyIndex,
5503 pImageMemoryBarriers[i].dstQueueFamilyIndex,
5504 &pImageMemoryBarriers[i].subresourceRange,
5505 sample_locs_info ? &sample_locations : NULL);
5506 }
5507
5508 /* Make sure CP DMA is idle because the driver might have performed a
5509 * DMA operation for copying or filling buffers/images.
5510 */
5511 if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5512 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5513 si_cp_dma_wait_for_idle(cmd_buffer);
5514
5515 cmd_buffer->state.flush_bits |= dst_flush_bits;
5516 }
5517
5518 void radv_CmdPipelineBarrier(
5519 VkCommandBuffer commandBuffer,
5520 VkPipelineStageFlags srcStageMask,
5521 VkPipelineStageFlags destStageMask,
5522 VkBool32 byRegion,
5523 uint32_t memoryBarrierCount,
5524 const VkMemoryBarrier* pMemoryBarriers,
5525 uint32_t bufferMemoryBarrierCount,
5526 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5527 uint32_t imageMemoryBarrierCount,
5528 const VkImageMemoryBarrier* pImageMemoryBarriers)
5529 {
5530 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5531 struct radv_barrier_info info;
5532
5533 info.eventCount = 0;
5534 info.pEvents = NULL;
5535 info.srcStageMask = srcStageMask;
5536 info.dstStageMask = destStageMask;
5537
5538 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5539 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5540 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5541 }
5542
5543
5544 static void write_event(struct radv_cmd_buffer *cmd_buffer,
5545 struct radv_event *event,
5546 VkPipelineStageFlags stageMask,
5547 unsigned value)
5548 {
5549 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5550 uint64_t va = radv_buffer_get_va(event->bo);
5551
5552 si_emit_cache_flush(cmd_buffer);
5553
5554 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
5555
5556 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21);
5557
5558 /* Flags that only require a top-of-pipe event. */
5559 VkPipelineStageFlags top_of_pipe_flags =
5560 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
5561
5562 /* Flags that only require a post-index-fetch event. */
5563 VkPipelineStageFlags post_index_fetch_flags =
5564 top_of_pipe_flags |
5565 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
5566 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
5567
5568 /* Make sure CP DMA is idle because the driver might have performed a
5569 * DMA operation for copying or filling buffers/images.
5570 */
5571 if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
5572 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
5573 si_cp_dma_wait_for_idle(cmd_buffer);
5574
5575 /* TODO: Emit EOS events for syncing PS/CS stages. */
5576
5577 if (!(stageMask & ~top_of_pipe_flags)) {
5578 /* Just need to sync the PFP engine. */
5579 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5580 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5581 S_370_WR_CONFIRM(1) |
5582 S_370_ENGINE_SEL(V_370_PFP));
5583 radeon_emit(cs, va);
5584 radeon_emit(cs, va >> 32);
5585 radeon_emit(cs, value);
5586 } else if (!(stageMask & ~post_index_fetch_flags)) {
5587 /* Sync ME because PFP reads index and indirect buffers. */
5588 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
5589 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
5590 S_370_WR_CONFIRM(1) |
5591 S_370_ENGINE_SEL(V_370_ME));
5592 radeon_emit(cs, va);
5593 radeon_emit(cs, va >> 32);
5594 radeon_emit(cs, value);
5595 } else {
5596 /* Otherwise, sync all prior GPU work using an EOP event. */
5597 si_cs_emit_write_event_eop(cs,
5598 cmd_buffer->device->physical_device->rad_info.chip_class,
5599 radv_cmd_buffer_uses_mec(cmd_buffer),
5600 V_028A90_BOTTOM_OF_PIPE_TS, 0,
5601 EOP_DST_SEL_MEM,
5602 EOP_DATA_SEL_VALUE_32BIT, va, value,
5603 cmd_buffer->gfx9_eop_bug_va);
5604 }
5605
5606 assert(cmd_buffer->cs->cdw <= cdw_max);
5607 }
5608
5609 void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
5610 VkEvent _event,
5611 VkPipelineStageFlags stageMask)
5612 {
5613 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5614 RADV_FROM_HANDLE(radv_event, event, _event);
5615
5616 write_event(cmd_buffer, event, stageMask, 1);
5617 }
5618
5619 void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
5620 VkEvent _event,
5621 VkPipelineStageFlags stageMask)
5622 {
5623 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5624 RADV_FROM_HANDLE(radv_event, event, _event);
5625
5626 write_event(cmd_buffer, event, stageMask, 0);
5627 }
5628
5629 void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
5630 uint32_t eventCount,
5631 const VkEvent* pEvents,
5632 VkPipelineStageFlags srcStageMask,
5633 VkPipelineStageFlags dstStageMask,
5634 uint32_t memoryBarrierCount,
5635 const VkMemoryBarrier* pMemoryBarriers,
5636 uint32_t bufferMemoryBarrierCount,
5637 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
5638 uint32_t imageMemoryBarrierCount,
5639 const VkImageMemoryBarrier* pImageMemoryBarriers)
5640 {
5641 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5642 struct radv_barrier_info info;
5643
5644 info.eventCount = eventCount;
5645 info.pEvents = pEvents;
5646 info.srcStageMask = 0;
info.dstStageMask = dstStageMask;
5647
5648 radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
5649 bufferMemoryBarrierCount, pBufferMemoryBarriers,
5650 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
5651 }
5652
5653
5654 void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer,
5655 uint32_t deviceMask)
5656 {
5657 /* No-op */
5658 }
5659
5660 /* VK_EXT_conditional_rendering */
5661 void radv_CmdBeginConditionalRenderingEXT(
5662 VkCommandBuffer commandBuffer,
5663 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5664 {
5665 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5666 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
5667 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5668 bool draw_visible = true;
5669 uint64_t pred_value = 0;
5670 uint64_t va, new_va;
5671 unsigned pred_offset;
5672
5673 va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
5674
5675 /* By default, if the 32-bit value at offset in buffer memory is zero,
5676 * then the rendering commands are discarded, otherwise they are
5677 * executed as normal. If the inverted flag is set, all commands are
5678 * discarded if the value is non-zero.
5679 */
5680 if (pConditionalRenderingBegin->flags &
5681 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
5682 draw_visible = false;
5683 }
5684
5685 si_emit_cache_flush(cmd_buffer);
5686
5687 /* From the Vulkan spec 1.1.107:
5688 *
5689 * "If the 32-bit value at offset in buffer memory is zero, then the
5690 * rendering commands are discarded, otherwise they are executed as
5691 * normal. If the value of the predicate in buffer memory changes while
5692 * conditional rendering is active, the rendering commands may be
5693 * discarded in an implementation-dependent way. Some implementations
5694 * may latch the value of the predicate upon beginning conditional
5695 * rendering while others may read it before every rendering command."
5696 *
5697 * However, the AMD hardware treats the predicate as a 64-bit value,
5698 * which means we need a workaround in the driver. Luckily, we need not
5699 * support the case where the value changes while predication is active.
5700 *
5701 * The workaround is as follows:
5702 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
5703 * 2) copy the 32-bit predicate value to the upload BO
5704 * 3) use the newly allocated VA for predication
5705 *
5706 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
5707 * in ME (+ sync PFP) instead of PFP.
5708 */
5709 radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);
5710
5711 new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
5712
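	/* Copy the application's 32-bit predicate into the lower half of the
	 * zero-initialized 64-bit slot allocated above; the upper 32 bits
	 * stay 0, so the hardware's 64-bit predicate test behaves like a
	 * 32-bit one.
	 */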
5713 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
5714 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
5715 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
5716 COPY_DATA_WR_CONFIRM);
5717 radeon_emit(cs, va);
5718 radeon_emit(cs, va >> 32);
5719 radeon_emit(cs, new_va);
5720 radeon_emit(cs, new_va >> 32);
5721
5722 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
5723 radeon_emit(cs, 0);
5724
5725 /* Enable predication for this command buffer. */
5726 si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
5727 cmd_buffer->state.predicating = true;
5728
5729 /* Store conditional rendering user info. */
5730 cmd_buffer->state.predication_type = draw_visible;
5731 cmd_buffer->state.predication_va = new_va;
5732 }
5733
5734 void radv_CmdEndConditionalRenderingEXT(
5735 VkCommandBuffer commandBuffer)
5736 {
5737 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5738
5739 /* Disable predication for this command buffer. */
5740 si_emit_set_predication_state(cmd_buffer, false, 0);
5741 cmd_buffer->state.predicating = false;
5742
5743 /* Reset conditional rendering user info. */
5744 cmd_buffer->state.predication_type = -1;
5745 cmd_buffer->state.predication_va = 0;
5746 }
5747
5748 /* VK_EXT_transform_feedback */
5749 void radv_CmdBindTransformFeedbackBuffersEXT(
5750 VkCommandBuffer commandBuffer,
5751 uint32_t firstBinding,
5752 uint32_t bindingCount,
5753 const VkBuffer* pBuffers,
5754 const VkDeviceSize* pOffsets,
5755 const VkDeviceSize* pSizes)
5756 {
5757 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5758 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5759 uint8_t enabled_mask = 0;
5760
5761 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
5762 for (uint32_t i = 0; i < bindingCount; i++) {
5763 uint32_t idx = firstBinding + i;
5764
5765 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
5766 sb[idx].offset = pOffsets[i];
5767 sb[idx].size = pSizes[i];
5768
5769 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
5770 sb[idx].buffer->bo);
5771
5772 enabled_mask |= 1 << idx;
5773 }
5774
5775 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
5776
5777 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5778 }
5779
5780 static void
5781 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
5782 {
5783 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5784 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5785
5786 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
5787 radeon_emit(cs,
5788 S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
5789 S_028B94_RAST_STREAM(0) |
5790 S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
5791 S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
5792 S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
5793 radeon_emit(cs, so->hw_enabled_mask &
5794 so->enabled_stream_buffers_mask);
5795
5796 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5797 }
5798
5799 static void
5800 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
5801 {
5802 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5803 bool old_streamout_enabled = so->streamout_enabled;
5804 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
5805
5806 so->streamout_enabled = enable;
5807
5808 so->hw_enabled_mask = so->enabled_mask |
5809 (so->enabled_mask << 4) |
5810 (so->enabled_mask << 8) |
5811 (so->enabled_mask << 12);
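	/* The 4-bit buffer-enable mask is replicated once per stream, since
	 * the second register written in radv_emit_streamout_enable()
	 * (VGT_STRMOUT_BUFFER_CONFIG) has 4 enable bits for each of the 4
	 * streams. Illustrative example: with buffers 0 and 1 bound,
	 * enabled_mask = 0x3 and hw_enabled_mask = 0x3333.
	 */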
5812
5813 if ((old_streamout_enabled != so->streamout_enabled) ||
5814 (old_hw_enabled_mask != so->hw_enabled_mask))
5815 radv_emit_streamout_enable(cmd_buffer);
5816 }
5817
5818 static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
5819 {
5820 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5821 unsigned reg_strmout_cntl;
5822
5823 /* The register is at different places on different ASICs. */
5824 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
5825 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
5826 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
5827 } else {
5828 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
5829 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
5830 }
5831
5832 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
5833 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
5834
5835 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
5836 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
5837 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
5838 radeon_emit(cs, 0);
5839 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
5840 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
5841 radeon_emit(cs, 4); /* poll interval */
5842 }
5843
5844 static void
5845 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer,
5846 uint32_t firstCounterBuffer,
5847 uint32_t counterBufferCount,
5848 const VkBuffer *pCounterBuffers,
5849 const VkDeviceSize *pCounterBufferOffsets)
5850
5851 {
5852 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5853 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5854 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5855 uint32_t i;
5856
5857 radv_flush_vgt_streamout(cmd_buffer);
5858
5859 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5860 for_each_bit(i, so->enabled_mask) {
5861 int32_t counter_buffer_idx = i - firstCounterBuffer;
5862 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5863 counter_buffer_idx = -1;
5864
5865 /* AMD GCN binds streamout buffers as shader resources.
5866 * VGT only counts primitives and tells the shader through
5867 * SGPRs what to do.
5868 */
5869 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
5870 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */
5871 radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
5872
5873 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5874
5875 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5876 /* The array of counter buffers is optional. */
5877 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5878 uint64_t va = radv_buffer_get_va(buffer->bo);
5879
5880 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5881
5882 /* Append */
5883 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5884 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5885 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5886 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
5887 radeon_emit(cs, 0); /* unused */
5888 radeon_emit(cs, 0); /* unused */
5889 radeon_emit(cs, va); /* src address lo */
5890 radeon_emit(cs, va >> 32); /* src address hi */
5891
5892 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5893 } else {
5894 /* Start from the beginning. */
5895 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5896 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5897 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5898 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
5899 radeon_emit(cs, 0); /* unused */
5900 radeon_emit(cs, 0); /* unused */
5901 radeon_emit(cs, 0); /* unused */
5902 radeon_emit(cs, 0); /* unused */
5903 }
5904 }
5905
5906 radv_set_streamout_enable(cmd_buffer, true);
5907 }
5908
5909 void radv_CmdBeginTransformFeedbackEXT(
5910 VkCommandBuffer commandBuffer,
5911 uint32_t firstCounterBuffer,
5912 uint32_t counterBufferCount,
5913 const VkBuffer* pCounterBuffers,
5914 const VkDeviceSize* pCounterBufferOffsets)
5915 {
5916 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5917
5918 radv_emit_streamout_begin(cmd_buffer,
5919 firstCounterBuffer, counterBufferCount,
5920 pCounterBuffers, pCounterBufferOffsets);
5921 }
5922
5923 static void
5924 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer,
5925 uint32_t firstCounterBuffer,
5926 uint32_t counterBufferCount,
5927 const VkBuffer *pCounterBuffers,
5928 const VkDeviceSize *pCounterBufferOffsets)
5929 {
5930 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5931 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5932 uint32_t i;
5933
5934 radv_flush_vgt_streamout(cmd_buffer);
5935
5936 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
5937 for_each_bit(i, so->enabled_mask) {
5938 int32_t counter_buffer_idx = i - firstCounterBuffer;
5939 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
5940 counter_buffer_idx = -1;
5941
5942 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
5943 /* The array of counter buffers is optional. */
5944 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
5945 uint64_t va = radv_buffer_get_va(buffer->bo);
5946
5947 va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx];
5948
5949 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
5950 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
5951 STRMOUT_DATA_TYPE(1) | /* offset in bytes */
5952 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
5953 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
5954 radeon_emit(cs, va); /* dst address lo */
5955 radeon_emit(cs, va >> 32); /* dst address hi */
5956 radeon_emit(cs, 0); /* unused */
5957 radeon_emit(cs, 0); /* unused */
5958
5959 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
5960 }
5961
5962 /* Deactivate transform feedback by zeroing the buffer size.
5963 * The counters (primitives generated, primitives emitted) may
5964 * be enabled even if there is no buffer bound. This ensures
5965 * that the primitives-emitted query won't increment.
5966 */
5967 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
5968
5969 cmd_buffer->state.context_roll_without_scissor_emitted = true;
5970 }
5971
5972 radv_set_streamout_enable(cmd_buffer, false);
5973 }
5974
5975 void radv_CmdEndTransformFeedbackEXT(
5976 VkCommandBuffer commandBuffer,
5977 uint32_t firstCounterBuffer,
5978 uint32_t counterBufferCount,
5979 const VkBuffer* pCounterBuffers,
5980 const VkDeviceSize* pCounterBufferOffsets)
5981 {
5982 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5983
5984 radv_emit_streamout_end(cmd_buffer,
5985 firstCounterBuffer, counterBufferCount,
5986 pCounterBuffers, pCounterBufferOffsets);
5987 }
5988
5989 void radv_CmdDrawIndirectByteCountEXT(
5990 VkCommandBuffer commandBuffer,
5991 uint32_t instanceCount,
5992 uint32_t firstInstance,
5993 VkBuffer _counterBuffer,
5994 VkDeviceSize counterBufferOffset,
5995 uint32_t counterOffset,
5996 uint32_t vertexStride)
5997 {
5998 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5999 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
6000 struct radv_draw_info info = {};
6001
6002 info.instance_count = instanceCount;
6003 info.first_instance = firstInstance;
6004 info.strmout_buffer = counterBuffer;
6005 info.strmout_buffer_offset = counterBufferOffset;
6006 info.stride = vertexStride;
6007
6008 radv_draw(cmd_buffer, &info);
6009 }
6010
6011 /* VK_AMD_buffer_marker */
6012 void radv_CmdWriteBufferMarkerAMD(
6013 VkCommandBuffer commandBuffer,
6014 VkPipelineStageFlagBits pipelineStage,
6015 VkBuffer dstBuffer,
6016 VkDeviceSize dstOffset,
6017 uint32_t marker)
6018 {
6019 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6020 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
6021 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6022 uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;
6023
6024 si_emit_cache_flush(cmd_buffer);
6025
6026 if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
6027 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6028 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
6029 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6030 COPY_DATA_WR_CONFIRM);
6031 radeon_emit(cs, marker);
6032 radeon_emit(cs, 0);
6033 radeon_emit(cs, va);
6034 radeon_emit(cs, va >> 32);
6035 } else {
6036 si_cs_emit_write_event_eop(cs,
6037 cmd_buffer->device->physical_device->rad_info.chip_class,
6038 radv_cmd_buffer_uses_mec(cmd_buffer),
6039 V_028A90_BOTTOM_OF_PIPE_TS, 0,
6040 EOP_DST_SEL_MEM,
6041 EOP_DATA_SEL_VALUE_32BIT,
6042 va, marker,
6043 cmd_buffer->gfx9_eop_bug_va);
6044 }
6045 }