freedreno/a6xx: Rename and document HLSQ_UPDATE_CNTL
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36
37 void
38 tu_bo_list_init(struct tu_bo_list *list)
39 {
40 list->count = list->capacity = 0;
41 list->bo_infos = NULL;
42 }
43
44 void
45 tu_bo_list_destroy(struct tu_bo_list *list)
46 {
47 free(list->bo_infos);
48 }
49
50 void
51 tu_bo_list_reset(struct tu_bo_list *list)
52 {
53 list->count = 0;
54 }
55
56 /**
57 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
58 */
59 static uint32_t
60 tu_bo_list_add_info(struct tu_bo_list *list,
61 const struct drm_msm_gem_submit_bo *bo_info)
62 {
63 assert(bo_info->handle != 0);
64
65 for (uint32_t i = 0; i < list->count; ++i) {
66 if (list->bo_infos[i].handle == bo_info->handle) {
67 assert(list->bo_infos[i].presumed == bo_info->presumed);
68 list->bo_infos[i].flags |= bo_info->flags;
69 return i;
70 }
71 }
72
73 /* grow list->bo_infos if needed */
74 if (list->count == list->capacity) {
75 uint32_t new_capacity = MAX2(2 * list->count, 16);
76 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
77 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
78 if (!new_bo_infos)
79 return TU_BO_LIST_FAILED;
80 list->bo_infos = new_bo_infos;
81 list->capacity = new_capacity;
82 }
83
84 list->bo_infos[list->count] = *bo_info;
85 return list->count++;
86 }
87
88 uint32_t
89 tu_bo_list_add(struct tu_bo_list *list,
90 const struct tu_bo *bo,
91 uint32_t flags)
92 {
93 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
94 .flags = flags,
95 .handle = bo->gem_handle,
96 .presumed = bo->iova,
97 });
98 }
99
100 VkResult
101 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
102 {
103 for (uint32_t i = 0; i < other->count; i++) {
104 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
105 return VK_ERROR_OUT_OF_HOST_MEMORY;
106 }
107
108 return VK_SUCCESS;
109 }
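/* A minimal usage sketch (hypothetical helper, not called anywhere): the
 * list deduplicates on GEM handle and ORs the submit flags, so re-adding a
 * BO only upgrades its access flags; new handles grow the array by
 * doubling, with a minimum capacity of 16.
 */
static inline bool
tu_bo_list_add_sketch(struct tu_cmd_buffer *cmd, struct tu_bo *bo)
{
   uint32_t first = tu_bo_list_add(&cmd->bo_list, bo, MSM_SUBMIT_BO_READ);
   uint32_t second = tu_bo_list_add(&cmd->bo_list, bo, MSM_SUBMIT_BO_WRITE);
   /* same slot both times; the entry's flags are now READ | WRITE */
   return first == second;
}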
110
111 void
112 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
113 struct tu_cs *cs,
114 enum vgt_event_type event)
115 {
116 bool need_seqno = false;
117 switch (event) {
118 case CACHE_FLUSH_TS:
119 case WT_DONE_TS:
120 case RB_DONE_TS:
121 case PC_CCU_FLUSH_DEPTH_TS:
122 case PC_CCU_FLUSH_COLOR_TS:
123 case PC_CCU_RESOLVE_TS:
124 need_seqno = true;
125 break;
126 default:
127 break;
128 }
129
130 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
131 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
132 if (need_seqno) {
133 tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
134 tu_cs_emit(cs, 0);
135 }
136 }
137
138 static void
139 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
140 struct tu_cs *cs,
141 enum tu_cmd_flush_bits flushes)
142 {
143 /* Experiments show that invalidating CCU while it still has data in it
144 * doesn't work, so make sure to always flush before invalidating in case
145 * any data remains that hasn't yet been made available through a barrier.
146 * However, it does seem to work for UCHE.
147 */
148 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
149 TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
150 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
151 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
152 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
153 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
154 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
155 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
156 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
157 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
158 if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
159 tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
160 if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
161 tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
162 if (flushes & TU_CMD_FLAG_WFI)
163 tu_cs_emit_wfi(cs);
164 }
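/* Illustrative only: because of the flush-before-invalidate rule above,
 * asking for just a color-CCU invalidate still produces a flush first.
 * A hypothetical caller would see PC_CCU_FLUSH_COLOR_TS followed by
 * PC_CCU_INVALIDATE_COLOR in the command stream:
 */
static inline void
tu6_demo_invalidate_ccu_color(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu6_emit_flushes(cmd, cs, TU_CMD_FLAG_CCU_INVALIDATE_COLOR);
}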
165
166 /* "Normal" cache flushes, that don't require any special handling */
167
168 static void
169 tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
170 struct tu_cs *cs)
171 {
172 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
173 cmd_buffer->state.cache.flush_bits = 0;
174 }
175
176 /* Renderpass cache flushes */
177
178 void
179 tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
180 struct tu_cs *cs)
181 {
182 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
183 cmd_buffer->state.renderpass_cache.flush_bits = 0;
184 }
185
186 /* Cache flushes for things that use the color/depth read/write path (i.e.
187 * blits and draws). This deals with changing CCU state as well as the usual
188 * cache flushing.
189 */
190
191 void
192 tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
193 struct tu_cs *cs,
194 enum tu_cmd_ccu_state ccu_state)
195 {
196 enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;
197
198 assert(ccu_state != TU_CMD_CCU_UNKNOWN);
199
200 /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
201 * the CCU may also contain data that we haven't flushed out yet, so we
202 * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
203 * emit a WFI as it isn't pipelined.
204 */
205 if (ccu_state != cmd_buffer->state.ccu_state) {
206 if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
207 flushes |=
208 TU_CMD_FLAG_CCU_FLUSH_COLOR |
209 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
210 cmd_buffer->state.cache.pending_flush_bits &= ~(
211 TU_CMD_FLAG_CCU_FLUSH_COLOR |
212 TU_CMD_FLAG_CCU_FLUSH_DEPTH);
213 }
214 flushes |=
215 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
216 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
217 TU_CMD_FLAG_WFI;
218 cmd_buffer->state.cache.pending_flush_bits &= ~(
219 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
220 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
221 }
222
223 tu6_emit_flushes(cmd_buffer, cs, flushes);
224 cmd_buffer->state.cache.flush_bits = 0;
225
226 if (ccu_state != cmd_buffer->state.ccu_state) {
227 struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
228 tu_cs_emit_regs(cs,
229 A6XX_RB_CCU_CNTL(.offset =
230 ccu_state == TU_CMD_CCU_GMEM ?
231 phys_dev->ccu_offset_gmem :
232 phys_dev->ccu_offset_bypass,
233 .gmem = ccu_state == TU_CMD_CCU_GMEM));
234 cmd_buffer->state.ccu_state = ccu_state;
235 }
236 }
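/* Rough sketch of what a SYSMEM -> GMEM transition ends up emitting,
 * assuming no other flush bits were already pending:
 *
 *   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
 *     -> PC_CCU_FLUSH_COLOR_TS, PC_CCU_FLUSH_DEPTH_TS
 *     -> PC_CCU_INVALIDATE_COLOR, PC_CCU_INVALIDATE_DEPTH
 *     -> CP_WAIT_FOR_IDLE
 *     -> RB_CCU_CNTL with .offset = ccu_offset_gmem, .gmem = true
 */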
237
238 static void
239 tu6_emit_zs(struct tu_cmd_buffer *cmd,
240 const struct tu_subpass *subpass,
241 struct tu_cs *cs)
242 {
243 const struct tu_framebuffer *fb = cmd->state.framebuffer;
244
245 const uint32_t a = subpass->depth_stencil_attachment.attachment;
246 if (a == VK_ATTACHMENT_UNUSED) {
247 tu_cs_emit_regs(cs,
248 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
249 A6XX_RB_DEPTH_BUFFER_PITCH(0),
250 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
251 A6XX_RB_DEPTH_BUFFER_BASE(0),
252 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
253
254 tu_cs_emit_regs(cs,
255 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
256
257 tu_cs_emit_regs(cs,
258 A6XX_GRAS_LRZ_BUFFER_BASE(0),
259 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
260 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
261
262 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
263
264 return;
265 }
266
267 const struct tu_image_view *iview = fb->attachments[a].attachment;
268 const struct tu_render_pass_attachment *attachment =
269 &cmd->state.pass->attachments[a];
270 enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
271
272 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
273 tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
274 tu_cs_image_ref(cs, iview, 0);
275 tu_cs_emit(cs, attachment->gmem_offset);
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
279
280 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3);
281 tu_cs_image_flag_ref(cs, iview, 0);
282
283 tu_cs_emit_regs(cs,
284 A6XX_GRAS_LRZ_BUFFER_BASE(0),
285 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
286 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
287
288 if (attachment->format == VK_FORMAT_S8_UINT) {
289 tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
290 tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
291 tu_cs_image_ref(cs, iview, 0);
292 tu_cs_emit(cs, attachment->gmem_offset);
293 } else {
294 tu_cs_emit_regs(cs,
295 A6XX_RB_STENCIL_INFO(0));
296 }
297 }
298
299 static void
300 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
301 const struct tu_subpass *subpass,
302 struct tu_cs *cs)
303 {
304 const struct tu_framebuffer *fb = cmd->state.framebuffer;
305
306 for (uint32_t i = 0; i < subpass->color_count; ++i) {
307 uint32_t a = subpass->color_attachments[i].attachment;
308 if (a == VK_ATTACHMENT_UNUSED)
309 continue;
310
311 const struct tu_image_view *iview = fb->attachments[a].attachment;
312
313 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
314 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
315 tu_cs_image_ref(cs, iview, 0);
316 tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
317
318 tu_cs_emit_regs(cs,
319 A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));
320
321 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR_LO(i), 3);
322 tu_cs_image_flag_ref(cs, iview, 0);
323 }
324
325 tu_cs_emit_regs(cs,
326 A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
327 tu_cs_emit_regs(cs,
328 A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
329
330 tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(fb->layers - 1));
331 }
332
333 void
334 tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
335 {
336 const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
337 bool msaa_disable = samples == MSAA_ONE;
338
339 tu_cs_emit_regs(cs,
340 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
341 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
342 .msaa_disable = msaa_disable));
343
344 tu_cs_emit_regs(cs,
345 A6XX_GRAS_RAS_MSAA_CNTL(samples),
346 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
347 .msaa_disable = msaa_disable));
348
349 tu_cs_emit_regs(cs,
350 A6XX_RB_RAS_MSAA_CNTL(samples),
351 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
352 .msaa_disable = msaa_disable));
353
354 tu_cs_emit_regs(cs,
355 A6XX_RB_MSAA_CNTL(samples));
356 }
357
358 static void
359 tu6_emit_bin_size(struct tu_cs *cs,
360 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
361 {
362 tu_cs_emit_regs(cs,
363 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
364 .binh = bin_h,
365 .dword = flags));
366
367 tu_cs_emit_regs(cs,
368 A6XX_RB_BIN_CONTROL(.binw = bin_w,
369 .binh = bin_h,
370 .dword = flags));
371
372 /* no flag for RB_BIN_CONTROL2... */
373 tu_cs_emit_regs(cs,
374 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
375 .binh = bin_h));
376 }
377
378 static void
379 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
380 const struct tu_subpass *subpass,
381 struct tu_cs *cs,
382 bool binning)
383 {
384 const struct tu_framebuffer *fb = cmd->state.framebuffer;
385 uint32_t cntl = 0;
386 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
387 if (binning) {
388 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
389 } else {
390 uint32_t mrts_ubwc_enable = 0;
391 for (uint32_t i = 0; i < subpass->color_count; ++i) {
392 uint32_t a = subpass->color_attachments[i].attachment;
393 if (a == VK_ATTACHMENT_UNUSED)
394 continue;
395
396 const struct tu_image_view *iview = fb->attachments[a].attachment;
397 if (iview->ubwc_enabled)
398 mrts_ubwc_enable |= 1 << i;
399 }
400
401 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
402
403 const uint32_t a = subpass->depth_stencil_attachment.attachment;
404 if (a != VK_ATTACHMENT_UNUSED) {
405 const struct tu_image_view *iview = fb->attachments[a].attachment;
406 if (iview->ubwc_enabled)
407 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
408 }
409
410 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
411 * in order to set it correctly for the different subpasses. However,
412 * that means the packets we're emitting also happen during binning. So
413 * we need to guard the write on !BINNING at CP execution time.
414 */
415 tu_cs_reserve(cs, 3 + 4);
416 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
417 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
418 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
419 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
420 }
421
422 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
423 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
424 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
425 tu_cs_emit(cs, cntl);
426 }
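/* Packet accounting note (informational): the CP_REG_WRITE above is one
 * header dword plus three payload dwords, which is why the conditional is
 * sized CP_COND_REG_EXEC_1_DWORDS(4) -- in the BINNING state the CP skips
 * exactly those four dwords. The tu_cs_reserve(cs, 3 + 4) keeps the
 * CP_COND_REG_EXEC and the packet it guards in the same IB chunk so the
 * skip count stays valid.
 */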
427
428 static void
429 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
430 {
431 const VkRect2D *render_area = &cmd->state.render_area;
432 uint32_t x1 = render_area->offset.x;
433 uint32_t y1 = render_area->offset.y;
434 uint32_t x2 = x1 + render_area->extent.width - 1;
435 uint32_t y2 = y1 + render_area->extent.height - 1;
436
437 if (align) {
438 x1 = x1 & ~(GMEM_ALIGN_W - 1);
439 y1 = y1 & ~(GMEM_ALIGN_H - 1);
440 x2 = ALIGN_POT(x2 + 1, GMEM_ALIGN_W) - 1;
441 y2 = ALIGN_POT(y2 + 1, GMEM_ALIGN_H) - 1;
442 }
443
444 tu_cs_emit_regs(cs,
445 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
446 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
447 }
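/* Worked example with hypothetical values, assuming GMEM_ALIGN_W/H of 16/4:
 * a render area at offset (10, 3) with extent 100x50 gives x1=10 y1=3
 * x2=109 y2=52; with align=true this widens to x1=0 y1=0 x2=111 y2=55, i.e.
 * the blit scissor is grown outward so GMEM loads/stores cover whole
 * aligned blocks.
 */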
448
449 void
450 tu6_emit_window_scissor(struct tu_cs *cs,
451 uint32_t x1,
452 uint32_t y1,
453 uint32_t x2,
454 uint32_t y2)
455 {
456 tu_cs_emit_regs(cs,
457 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
458 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
459
460 tu_cs_emit_regs(cs,
461 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
462 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
463 }
464
465 void
466 tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
467 {
468 tu_cs_emit_regs(cs,
469 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
470
471 tu_cs_emit_regs(cs,
472 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
473
474 tu_cs_emit_regs(cs,
475 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
476
477 tu_cs_emit_regs(cs,
478 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
479 }
480
481 static void
482 tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
483 {
484 uint32_t enable_mask;
485 switch (id) {
486 case TU_DRAW_STATE_PROGRAM:
487 case TU_DRAW_STATE_VI:
488 case TU_DRAW_STATE_FS_CONST:
489 /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
490 * when resources would actually be used in the binning shader.
491 * Presumably the overhead of prefetching the resources isn't
492 * worth it.
493 */
494 case TU_DRAW_STATE_DESC_SETS_LOAD:
495 enable_mask = CP_SET_DRAW_STATE__0_GMEM |
496 CP_SET_DRAW_STATE__0_SYSMEM;
497 break;
498 case TU_DRAW_STATE_PROGRAM_BINNING:
499 case TU_DRAW_STATE_VI_BINNING:
500 enable_mask = CP_SET_DRAW_STATE__0_BINNING;
501 break;
502 case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
503 enable_mask = CP_SET_DRAW_STATE__0_GMEM;
504 break;
505 case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
506 enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
507 break;
508 default:
509 enable_mask = CP_SET_DRAW_STATE__0_GMEM |
510 CP_SET_DRAW_STATE__0_SYSMEM |
511 CP_SET_DRAW_STATE__0_BINNING;
512 break;
513 }
514
515 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
516 enable_mask |
517 CP_SET_DRAW_STATE__0_GROUP_ID(id) |
518 COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
519 tu_cs_emit_qw(cs, state.iova);
520 }
521
522 /* note: get rid of this eventually */
523 static void
524 tu_cs_emit_sds_ib(struct tu_cs *cs, uint32_t id, struct tu_cs_entry entry)
525 {
526 tu_cs_emit_draw_state(cs, id, (struct tu_draw_state) {
527 .iova = entry.size ? entry.bo->iova + entry.offset : 0,
528 .size = entry.size / 4,
529 });
530 }
531
532 static bool
533 use_hw_binning(struct tu_cmd_buffer *cmd)
534 {
535 const struct tu_framebuffer *fb = cmd->state.framebuffer;
536
537 /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
538 * with non-hw-binning GMEM rendering. This is required because some of the
539 * XFB commands need to be executed only once.
540 */
541 if (cmd->state.xfb_used)
542 return true;
543
544 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
545 return false;
546
547 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
548 return true;
549
550 return (fb->tile_count.width * fb->tile_count.height) > 2;
551 }
552
553 static bool
554 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
555 {
556 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
557 return true;
558
559 /* can't fit attachments into gmem */
560 if (!cmd->state.pass->gmem_pixels)
561 return true;
562
563 if (cmd->state.framebuffer->layers > 1)
564 return true;
565
566 if (cmd->has_tess)
567 return true;
568
569 return false;
570 }
571
572 static void
573 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
574 struct tu_cs *cs,
575 uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
576 {
577 const struct tu_framebuffer *fb = cmd->state.framebuffer;
578
579 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
580 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
581
582 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
583 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
584
585 const uint32_t x1 = fb->tile0.width * tx;
586 const uint32_t y1 = fb->tile0.height * ty;
587 const uint32_t x2 = x1 + fb->tile0.width - 1;
588 const uint32_t y2 = y1 + fb->tile0.height - 1;
589 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
590 tu6_emit_window_offset(cs, x1, y1);
591
592 tu_cs_emit_regs(cs,
593 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
594
595 if (use_hw_binning(cmd)) {
596 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
597
598 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
599 tu_cs_emit(cs, 0x0);
600
601 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
602 tu_cs_emit(cs, fb->pipe_sizes[pipe] |
603 CP_SET_BIN_DATA5_0_VSC_N(slot));
604 tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
605 tu_cs_emit(cs, pipe * 4);
606 tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
607
608 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
609 tu_cs_emit(cs, 0x0);
610
611 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
612 tu_cs_emit(cs, 0x0);
613 } else {
614 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
615 tu_cs_emit(cs, 0x1);
616
617 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
618 tu_cs_emit(cs, 0x0);
619 }
620 }
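/* Illustrative numbers: with a tile size (fb->tile0) of 256x128, tile
 * coordinates (tx, ty) = (2, 1) select the window (512, 128)..(767, 255);
 * each tile's scissor and window offset are just the tile size scaled by
 * its integer grid position.
 */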
621
622 static void
623 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
624 struct tu_cs *cs,
625 uint32_t a,
626 uint32_t gmem_a)
627 {
628 const struct tu_framebuffer *fb = cmd->state.framebuffer;
629 struct tu_image_view *dst = fb->attachments[a].attachment;
630 struct tu_image_view *src = fb->attachments[gmem_a].attachment;
631
632 tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.render_area);
633 }
634
635 static void
636 tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
637 struct tu_cs *cs,
638 const struct tu_subpass *subpass)
639 {
640 if (subpass->resolve_attachments) {
641 /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
642 * Commands":
643 *
644 * End-of-subpass multisample resolves are treated as color
645 * attachment writes for the purposes of synchronization. That is,
646 * they are considered to execute in the
647 * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
648 * their writes are synchronized with
649 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
650 * rendering within a subpass and any resolve operations at the end
651 * of the subpass occurs automatically, without need for explicit
652 * dependencies or pipeline barriers. However, if the resolve
653 * attachment is also used in a different subpass, an explicit
654 * dependency is needed.
655 *
656 * We use the CP_BLIT path for sysmem resolves, which is really a
657 * transfer command, so we have to flush manually, similar to the gmem
658 * resolve case. However, a flush afterwards isn't needed because of the
659 * last sentence and the fact that we're in sysmem mode.
660 */
661 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
662 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
663
664 /* Wait for the flushes to land before using the 2D engine */
665 tu_cs_emit_wfi(cs);
666
667 for (unsigned i = 0; i < subpass->color_count; i++) {
668 uint32_t a = subpass->resolve_attachments[i].attachment;
669 if (a == VK_ATTACHMENT_UNUSED)
670 continue;
671
672 tu6_emit_sysmem_resolve(cmd, cs, a,
673 subpass->color_attachments[i].attachment);
674 }
675 }
676 }
677
678 static void
679 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
680 {
681 const struct tu_render_pass *pass = cmd->state.pass;
682 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
683
684 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
685 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
686 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
687 CP_SET_DRAW_STATE__0_GROUP_ID(0));
688 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
689 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
690
691 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
692 tu_cs_emit(cs, 0x0);
693
694 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
695 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
696
697 tu6_emit_blit_scissor(cmd, cs, true);
698
699 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
700 if (pass->attachments[a].gmem_offset >= 0)
701 tu_store_gmem_attachment(cmd, cs, a, a);
702 }
703
704 if (subpass->resolve_attachments) {
705 for (unsigned i = 0; i < subpass->color_count; i++) {
706 uint32_t a = subpass->resolve_attachments[i].attachment;
707 if (a != VK_ATTACHMENT_UNUSED)
708 tu_store_gmem_attachment(cmd, cs, a,
709 subpass->color_attachments[i].attachment);
710 }
711 }
712 }
713
714 static void
715 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
716 {
717 struct tu_device *dev = cmd->device;
718 const struct tu_physical_device *phys_dev = dev->physical_device;
719
720 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
721
722 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
723 .vs_state = true,
724 .hs_state = true,
725 .ds_state = true,
726 .gs_state = true,
727 .fs_state = true,
728 .cs_state = true,
729 .gfx_ibo = true,
730 .cs_ibo = true,
731 .gfx_shared_const = true,
732 .cs_shared_const = true,
733 .gfx_bindless = 0x1f,
734 .cs_bindless = 0x1f));
735
736 tu_cs_emit_regs(cs,
737 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
738 cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
739 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
740 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
741 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
742 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
743 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
744 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
745 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
746 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
747
748 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
749 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
750 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
751 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
752 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
753 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
754 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
755 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
756 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
757 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
758 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
759 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
760 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
761
762 /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
763 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
764 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
765 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
766
767 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
768
769 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
770
771 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
772 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
773 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
774 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
775 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
776 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
777 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
778 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
779 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
780 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
781 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
782
783 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
784
785 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236,
786 A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
787 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
788
789 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
790 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
791
792 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
793 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
794 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
795
796 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
797 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
798
799 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
800
801 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
802
803 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
804 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
805 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
806 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
807 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
808 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
809 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
810 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
811 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
812
813 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
814
815 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
816
817 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
818
819 /* we don't use this yet.. probably best to disable.. */
820 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
821 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
822 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
823 CP_SET_DRAW_STATE__0_GROUP_ID(0));
824 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
825 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
826
827 tu_cs_emit_regs(cs,
828 A6XX_SP_HS_CTRL_REG0(0));
829
830 tu_cs_emit_regs(cs,
831 A6XX_SP_GS_CTRL_REG0(0));
832
833 tu_cs_emit_regs(cs,
834 A6XX_GRAS_LRZ_CNTL(0));
835
836 tu_cs_emit_regs(cs,
837 A6XX_RB_LRZ_CNTL(0));
838
839 tu_cs_emit_regs(cs,
840 A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
841 .bo_offset = gb_offset(border_color)));
842 tu_cs_emit_regs(cs,
843 A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
844 .bo_offset = gb_offset(border_color)));
845
846 /* VSC buffers:
847 * Use the VSC pitches from the largest values used so far with this device.
848 * If there hasn't been an overflow, there will already be a scratch bo
849 * allocated for these sizes.
850 *
851 * If an overflow is detected, the stream pitch is doubled (sketch below).
852 */
853 mtx_lock(&dev->vsc_pitch_mtx);
854
855 struct tu6_global *global = dev->global_bo.map;
856
857 uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
858 uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
859
860 if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
861 dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
862
863 if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
864 dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
865
866 cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
867 cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
868
869 mtx_unlock(&dev->vsc_pitch_mtx);
870
871 struct tu_bo *vsc_bo;
872 uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
873 cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
874
875 tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
876
877 tu_cs_emit_regs(cs,
878 A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
879 tu_cs_emit_regs(cs,
880 A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
881 tu_cs_emit_regs(cs,
882 A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
883 .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
884
885 tu_bo_list_add(&cmd->bo_list, vsc_bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
886
887 tu_cs_sanity_check(cs);
888 }
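/* Sketch of the VSC pitch-growth rule above, with made-up numbers: assuming
 * VSC_PAD is 0x40 and the current draw stream pitch is 0x440, an overflow
 * report of >= 0x440 bumps the pitch to (0x440 - 0x40) * 2 + 0x40 = 0x840,
 * i.e. the usable (pre-pad) portion of each per-pipe stream doubles.
 */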
889
890 static void
891 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
892 {
893 const struct tu_framebuffer *fb = cmd->state.framebuffer;
894
895 tu_cs_emit_regs(cs,
896 A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
897 .height = fb->tile0.height));
898
899 tu_cs_emit_regs(cs,
900 A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
901 .ny = fb->tile_count.height));
902
903 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
904 tu_cs_emit_array(cs, fb->pipe_config, 32);
905
906 tu_cs_emit_regs(cs,
907 A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
908 A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));
909
910 tu_cs_emit_regs(cs,
911 A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
912 A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
913 }
914
915 static void
916 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
917 {
918 const struct tu_framebuffer *fb = cmd->state.framebuffer;
919 const uint32_t used_pipe_count =
920 fb->pipe_count.width * fb->pipe_count.height;
921
922 for (int i = 0; i < used_pipe_count; i++) {
923 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
924 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
925 CP_COND_WRITE5_0_WRITE_MEMORY);
926 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
927 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
928 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
929 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
930 tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
931 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));
932
933 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
934 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
935 CP_COND_WRITE5_0_WRITE_MEMORY);
936 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
937 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
938 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
939 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
940 tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
941 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
942 }
943
944 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
945 }
946
947 static void
948 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
949 {
950 struct tu_physical_device *phys_dev = cmd->device->physical_device;
951 const struct tu_framebuffer *fb = cmd->state.framebuffer;
952
953 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
954
955 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
956 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
957
958 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
959 tu_cs_emit(cs, 0x1);
960
961 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
962 tu_cs_emit(cs, 0x1);
963
964 tu_cs_emit_wfi(cs);
965
966 tu_cs_emit_regs(cs,
967 A6XX_VFD_MODE_CNTL(.binning_pass = true));
968
969 update_vsc_pipe(cmd, cs);
970
971 tu_cs_emit_regs(cs,
972 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
973
974 tu_cs_emit_regs(cs,
975 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
976
977 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
978 tu_cs_emit(cs, UNK_2C);
979
980 tu_cs_emit_regs(cs,
981 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
982
983 tu_cs_emit_regs(cs,
984 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
985
986 /* emit IB to binning drawcmds: */
987 tu_cs_emit_call(cs, &cmd->draw_cs);
988
989 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
990 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
991 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
992 CP_SET_DRAW_STATE__0_GROUP_ID(0));
993 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
994 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
995
996 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
997 tu_cs_emit(cs, UNK_2D);
998
999 /* This flush is probably required because the VSC, which produces the
1000 * visibility stream, is a client of UCHE, whereas the CP needs to read the
1001 * visibility stream (without caching) to do draw skipping. The
1002 * WFI+WAIT_FOR_ME combination guarantees that the binning commands
1003 * submitted are finished before reading the VSC regs (in
1004 * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
1005 * part of draws).
1006 */
1007 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
1008
1009 tu_cs_emit_wfi(cs);
1010
1011 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1012
1013 emit_vsc_overflow_test(cmd, cs);
1014
1015 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1016 tu_cs_emit(cs, 0x0);
1017
1018 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1019 tu_cs_emit(cs, 0x0);
1020 }
1021
1022 static void
1023 tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
1024 const struct tu_subpass *subpass,
1025 struct tu_cs_entry *ib,
1026 bool gmem)
1027 {
1028 /* note: we could probably emit input attachments just once for the whole
1029 * renderpass, which would avoid emitting both sysmem/gmem versions.
1030 *
1031 * We emit two texture descriptors for each input as a workaround for
1032 * d24s8, which can be sampled as both float (depth) and integer (stencil).
1033 * tu_shader lowers uint input attachment loads to use the 2nd descriptor
1034 * in the pair (see the sketch after this function).
1035 * TODO: a smarter workaround
1036 */
1037
1038 if (!subpass->input_count)
1039 return;
1040
1041 struct tu_cs_memory texture;
1042 VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
1043 A6XX_TEX_CONST_DWORDS, &texture);
1044 assert(result == VK_SUCCESS);
1045
1046 for (unsigned i = 0; i < subpass->input_count * 2; i++) {
1047 uint32_t a = subpass->input_attachments[i / 2].attachment;
1048 if (a == VK_ATTACHMENT_UNUSED)
1049 continue;
1050
1051 struct tu_image_view *iview =
1052 cmd->state.framebuffer->attachments[a].attachment;
1053 const struct tu_render_pass_attachment *att =
1054 &cmd->state.pass->attachments[a];
1055 uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
1056
1057 memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);
1058
1059 if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
1060 /* note this works because spec says fb and input attachments
1061 * must use identity swizzle
1062 */
1063 dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1064 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1065 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1066 dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_S8Z24_UINT) |
1067 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
1068 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
1069 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
1070 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
1071 }
1072
1073 if (!gmem)
1074 continue;
1075
1076 /* patched for gmem */
1077 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1078 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1079 dst[2] =
1080 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1081 A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * att->cpp);
1082 dst[3] = 0;
1083 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
1084 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
1085 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1086 dst[i] = 0;
1087 }
1088
1089 struct tu_cs cs;
1090 tu_cs_begin_sub_stream(&cmd->sub_cs, 9, &cs);
1091
1092 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
1093 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1094 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1095 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1096 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1097 CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
1098 tu_cs_emit_qw(&cs, texture.iova);
1099
1100 tu_cs_emit_pkt4(&cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
1101 tu_cs_emit_qw(&cs, texture.iova);
1102
1103 tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
1104
1105 *ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
1106 }
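/* Illustrative only: for a hypothetical RGBA8 input attachment (cpp = 4)
 * with a 256-pixel-wide tile, the gmem-patched descriptor gets a pitch of
 * 256 * 4 = 1024 bytes and a base of gmem_base + att->gmem_offset, while
 * the sysmem variant keeps the unmodified iview->descriptor pointing at
 * the image in memory.
 */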
1107
1108 static void
1109 tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
1110 {
1111 struct tu_cs *cs = &cmd->draw_cs;
1112
1113 tu_emit_input_attachments(cmd, subpass, &cmd->state.ia_gmem_ib, true);
1114 tu_emit_input_attachments(cmd, subpass, &cmd->state.ia_sysmem_ib, false);
1115
1116 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
1117 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, cmd->state.ia_gmem_ib);
1118 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, cmd->state.ia_sysmem_ib);
1119 }
1120
1121 static void
1122 tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
1123 const VkRenderPassBeginInfo *info)
1124 {
1125 struct tu_cs *cs = &cmd->draw_cs;
1126
1127 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1128
1129 tu6_emit_blit_scissor(cmd, cs, true);
1130
1131 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1132 tu_load_gmem_attachment(cmd, cs, i, false);
1133
1134 tu6_emit_blit_scissor(cmd, cs, false);
1135
1136 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1137 tu_clear_gmem_attachment(cmd, cs, i, info);
1138
1139 tu_cond_exec_end(cs);
1140
1141 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1142
1143 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1144 tu_clear_sysmem_attachment(cmd, cs, i, info);
1145
1146 tu_cond_exec_end(cs);
1147 }
1148
1149 static void
1150 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1151 {
1152 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1153
1154 assert(fb->width > 0 && fb->height > 0);
1155 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
1156 tu6_emit_window_offset(cs, 0, 0);
1157
1158 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1159
1160 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1161
1162 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1163 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1164
1165 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1166 tu_cs_emit(cs, 0x0);
1167
1168 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1169
1170 /* enable stream-out; with sysmem there is only one pass: */
1171 tu_cs_emit_regs(cs,
1172 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1173
1174 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1175 tu_cs_emit(cs, 0x1);
1176
1177 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1178 tu_cs_emit(cs, 0x0);
1179
1180 tu_cs_sanity_check(cs);
1181 }
1182
1183 static void
1184 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1185 {
1186 /* Do any resolves of the last subpass. These are handled in the
1187 * tile_store_ib in the gmem path.
1188 */
1189 tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
1190
1191 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1192
1193 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1194 tu_cs_emit(cs, 0x0);
1195
1196 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1197
1198 tu_cs_sanity_check(cs);
1199 }
1200
1201 static void
1202 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1203 {
1204 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1205
1206 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1207
1208 /* lrz clear? */
1209
1210 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1211 tu_cs_emit(cs, 0x0);
1212
1213 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
1214
1215 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1216 if (use_hw_binning(cmd)) {
1217 /* enable stream-out during binning pass: */
1218 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1219
1220 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
1221 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1222
1223 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1224
1225 tu6_emit_binning_pass(cmd, cs);
1226
1227 /* and disable stream-out for draw pass: */
1228 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
1229
1230 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
1231 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1232
1233 tu_cs_emit_regs(cs,
1234 A6XX_VFD_MODE_CNTL(0));
1235
1236 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1237
1238 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1239
1240 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1241 tu_cs_emit(cs, 0x1);
1242 } else {
1243 /* no binning pass, so enable stream-out for the draw pass: */
1244 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1245
1246 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 0x6000000);
1247 }
1248
1249 tu_cs_sanity_check(cs);
1250 }
1251
1252 static void
1253 tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1254 {
1255 tu_cs_emit_call(cs, &cmd->draw_cs);
1256
1257 if (use_hw_binning(cmd)) {
1258 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1259 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1260 }
1261
1262 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1263
1264 tu_cs_sanity_check(cs);
1265 }
1266
1267 static void
1268 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1269 {
1270 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1271
1272 tu_cs_emit_regs(cs,
1273 A6XX_GRAS_LRZ_CNTL(0));
1274
1275 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1276
1277 tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
1278
1279 tu_cs_sanity_check(cs);
1280 }
1281
1282 static void
1283 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1284 {
1285 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1286
1287 tu6_tile_render_begin(cmd, &cmd->cs);
1288
1289 uint32_t pipe = 0;
1290 for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
1291 for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) {
1292 uint32_t tx1 = px * fb->pipe0.width;
1293 uint32_t ty1 = py * fb->pipe0.height;
1294 uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
1295 uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
1296 uint32_t slot = 0;
1297 for (uint32_t ty = ty1; ty < ty2; ty++) {
1298 for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
1299 tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
1300 tu6_render_tile(cmd, &cmd->cs);
1301 }
1302 }
1303 }
1304 }
1305
1306 tu6_tile_render_end(cmd, &cmd->cs);
1307 }
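/* Worked example with made-up sizes: for a 5x3 tile grid split into
 * 2x2-tile pipes (pipe0), pipe_count is 3x2; pipe 0 walks tiles
 * (0,0) (1,0) (0,1) (1,1) as slots 0..3, while the right-edge pipes only
 * cover a single tile column because of the MIN2 clamp against
 * tile_count.
 */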
1308
1309 static void
1310 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1311 {
1312 tu6_sysmem_render_begin(cmd, &cmd->cs);
1313
1314 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1315
1316 tu6_sysmem_render_end(cmd, &cmd->cs);
1317 }
1318
1319 static void
1320 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1321 {
1322 const uint32_t tile_store_space = 11 + (35 * 2) * cmd->state.pass->attachment_count;
1323 struct tu_cs sub_cs;
1324
1325 VkResult result =
1326 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1327 if (result != VK_SUCCESS) {
1328 cmd->record_result = result;
1329 return;
1330 }
1331
1332 /* emit to tile-store sub_cs */
1333 tu6_emit_tile_store(cmd, &sub_cs);
1334
1335 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1336 }
1337
1338 static VkResult
1339 tu_create_cmd_buffer(struct tu_device *device,
1340 struct tu_cmd_pool *pool,
1341 VkCommandBufferLevel level,
1342 VkCommandBuffer *pCommandBuffer)
1343 {
1344 struct tu_cmd_buffer *cmd_buffer;
1345
1346 cmd_buffer = vk_object_zalloc(&device->vk, NULL, sizeof(*cmd_buffer),
1347 VK_OBJECT_TYPE_COMMAND_BUFFER);
1348 if (cmd_buffer == NULL)
1349 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1350
1351 cmd_buffer->device = device;
1352 cmd_buffer->pool = pool;
1353 cmd_buffer->level = level;
1354
1355 if (pool) {
1356 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1357 cmd_buffer->queue_family_index = pool->queue_family_index;
1358
1359 } else {
1360 /* Init the pool_link so we can safely call list_del when we destroy
1361 * the command buffer
1362 */
1363 list_inithead(&cmd_buffer->pool_link);
1364 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1365 }
1366
1367 tu_bo_list_init(&cmd_buffer->bo_list);
1368 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1369 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1370 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1371 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1372
1373 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1374
1375 list_inithead(&cmd_buffer->upload.list);
1376
1377 return VK_SUCCESS;
1378 }
1379
1380 static void
1381 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1382 {
1383 list_del(&cmd_buffer->pool_link);
1384
1385 tu_cs_finish(&cmd_buffer->cs);
1386 tu_cs_finish(&cmd_buffer->draw_cs);
1387 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1388 tu_cs_finish(&cmd_buffer->sub_cs);
1389
1390 tu_bo_list_destroy(&cmd_buffer->bo_list);
1391 vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
1392 }
1393
1394 static VkResult
1395 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1396 {
1397 cmd_buffer->record_result = VK_SUCCESS;
1398
1399 tu_bo_list_reset(&cmd_buffer->bo_list);
1400 tu_cs_reset(&cmd_buffer->cs);
1401 tu_cs_reset(&cmd_buffer->draw_cs);
1402 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1403 tu_cs_reset(&cmd_buffer->sub_cs);
1404
1405 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
1406 memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
1407
1408 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1409
1410 return cmd_buffer->record_result;
1411 }
1412
1413 VkResult
1414 tu_AllocateCommandBuffers(VkDevice _device,
1415 const VkCommandBufferAllocateInfo *pAllocateInfo,
1416 VkCommandBuffer *pCommandBuffers)
1417 {
1418 TU_FROM_HANDLE(tu_device, device, _device);
1419 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1420
1421 VkResult result = VK_SUCCESS;
1422 uint32_t i;
1423
1424 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1425
1426 if (!list_is_empty(&pool->free_cmd_buffers)) {
1427 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1428 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1429
1430 list_del(&cmd_buffer->pool_link);
1431 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1432
1433 result = tu_reset_cmd_buffer(cmd_buffer);
1434 cmd_buffer->level = pAllocateInfo->level;
1435
1436 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1437 } else {
1438 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1439 &pCommandBuffers[i]);
1440 }
1441 if (result != VK_SUCCESS)
1442 break;
1443 }
1444
1445 if (result != VK_SUCCESS) {
1446 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
1447 pCommandBuffers);
1448
1449 /* From the Vulkan 1.0.66 spec:
1450 *
1451 * "vkAllocateCommandBuffers can be used to create multiple
1452 * command buffers. If the creation of any of those command
1453 * buffers fails, the implementation must destroy all
1454 * successfully created command buffer objects from this
1455 * command, set all entries of the pCommandBuffers array to
1456 * NULL and return the error."
1457 */
1458 memset(pCommandBuffers, 0,
1459 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
1460 }
1461
1462 return result;
1463 }
1464
1465 void
1466 tu_FreeCommandBuffers(VkDevice device,
1467 VkCommandPool commandPool,
1468 uint32_t commandBufferCount,
1469 const VkCommandBuffer *pCommandBuffers)
1470 {
1471 for (uint32_t i = 0; i < commandBufferCount; i++) {
1472 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1473
1474 if (cmd_buffer) {
1475 if (cmd_buffer->pool) {
1476 list_del(&cmd_buffer->pool_link);
1477 list_addtail(&cmd_buffer->pool_link,
1478 &cmd_buffer->pool->free_cmd_buffers);
1479 } else
1480 tu_cmd_buffer_destroy(cmd_buffer);
1481 }
1482 }
1483 }
1484
1485 VkResult
1486 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1487 VkCommandBufferResetFlags flags)
1488 {
1489 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1490 return tu_reset_cmd_buffer(cmd_buffer);
1491 }
1492
1493 /* Initialize the cache, assuming all necessary flushes have happened but *not*
1494 * invalidations.
1495 */
1496 static void
1497 tu_cache_init(struct tu_cache_state *cache)
1498 {
1499 cache->flush_bits = 0;
1500 cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
1501 }
1502
1503 VkResult
1504 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1505 const VkCommandBufferBeginInfo *pBeginInfo)
1506 {
1507 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1508 VkResult result = VK_SUCCESS;
1509
1510 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1511 /* If the command buffer has already been reset with
1512 * vkResetCommandBuffer, no need to do it again.
1513 */
1514 result = tu_reset_cmd_buffer(cmd_buffer);
1515 if (result != VK_SUCCESS)
1516 return result;
1517 }
1518
1519 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
1520 cmd_buffer->state.index_size = 0xff; /* dirty restart index */
1521
1522 tu_cache_init(&cmd_buffer->state.cache);
1523 tu_cache_init(&cmd_buffer->state.renderpass_cache);
1524 cmd_buffer->usage_flags = pBeginInfo->flags;
1525
1526 tu_cs_begin(&cmd_buffer->cs);
1527 tu_cs_begin(&cmd_buffer->draw_cs);
1528 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1529
1530 /* setup initial configuration into command buffer */
1531 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1532 switch (cmd_buffer->queue_family_index) {
1533 case TU_QUEUE_GENERAL:
1534 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1535 break;
1536 default:
1537 break;
1538 }
1539 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1540 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1541 assert(pBeginInfo->pInheritanceInfo);
1542 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
1543 cmd_buffer->state.subpass =
1544 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1545 } else {
1546 /* When executing in the middle of another command buffer, the CCU
1547 * state is unknown.
1548 */
1549 cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
1550 }
1551 }
1552
1553 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1554
1555 return VK_SUCCESS;
1556 }
1557
1558 /* Sets vertex buffers to HW binding points. We emit VBs in SDS (so that bin
1559 * rendering can skip over unused state), so we need to collect all the
1560 * bindings together into a single state emit at draw time.
1561 */
1562 void
1563 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
1564 uint32_t firstBinding,
1565 uint32_t bindingCount,
1566 const VkBuffer *pBuffers,
1567 const VkDeviceSize *pOffsets)
1568 {
1569 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1570
1571 assert(firstBinding + bindingCount <= MAX_VBS);
1572
1573 for (uint32_t i = 0; i < bindingCount; i++) {
1574 struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
1575
1576 cmd->state.vb.buffers[firstBinding + i] = buf;
1577 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
1578
1579 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1580 }
1581
1582 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
1583 }
1584
1585 void
1586 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1587 VkBuffer buffer,
1588 VkDeviceSize offset,
1589 VkIndexType indexType)
1590 {
1591 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1592 TU_FROM_HANDLE(tu_buffer, buf, buffer);
1593
1594
1595
1596 uint32_t index_size, index_shift, restart_index;
1597
1598 switch (indexType) {
1599 case VK_INDEX_TYPE_UINT16:
1600 index_size = INDEX4_SIZE_16_BIT;
1601 index_shift = 1;
1602 restart_index = 0xffff;
1603 break;
1604 case VK_INDEX_TYPE_UINT32:
1605 index_size = INDEX4_SIZE_32_BIT;
1606 index_shift = 2;
1607 restart_index = 0xffffffff;
1608 break;
1609 case VK_INDEX_TYPE_UINT8_EXT:
1610 index_size = INDEX4_SIZE_8_BIT;
1611 index_shift = 0;
1612 restart_index = 0xff;
1613 break;
1614 default:
1615 unreachable("invalid VkIndexType");
1616 }
1617
1618 /* initialize/update the restart index */
1619 if (cmd->state.index_size != index_size)
1620 tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
1621
1622 assert(buf->size >= offset);
1623
1624 cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
1625 cmd->state.max_index_count = (buf->size - offset) >> index_shift;
1626 cmd->state.index_size = index_size;
1627
1628 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1629 }
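/* Small worked example: binding a VK_INDEX_TYPE_UINT16 buffer of 1000 bytes
 * at offset 4 gives index_shift = 1, so max_index_count = (1000 - 4) >> 1 =
 * 498, and PC_RESTART_INDEX is rewritten with 0xffff only when the bound
 * index size actually changes.
 */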
1630
1631 void
1632 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
1633 VkPipelineBindPoint pipelineBindPoint,
1634 VkPipelineLayout _layout,
1635 uint32_t firstSet,
1636 uint32_t descriptorSetCount,
1637 const VkDescriptorSet *pDescriptorSets,
1638 uint32_t dynamicOffsetCount,
1639 const uint32_t *pDynamicOffsets)
1640 {
1641 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1642 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
1643 unsigned dyn_idx = 0;
1644
1645 struct tu_descriptor_state *descriptors_state =
1646 tu_get_descriptors_state(cmd, pipelineBindPoint);
1647
1648 for (unsigned i = 0; i < descriptorSetCount; ++i) {
1649 unsigned idx = i + firstSet;
1650 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
1651
1652 descriptors_state->sets[idx] = set;
1653
1654 for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
1655 /* update the contents of the dynamic descriptor set */
1656 unsigned src_idx = j;
1657 unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
1658 assert(dyn_idx < dynamicOffsetCount);
1659
1660 uint32_t *dst =
1661 &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
1662 uint32_t *src =
1663 &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
1664 uint32_t offset = pDynamicOffsets[dyn_idx];
1665
1666 /* Patch the storage/uniform descriptors right away. */
1667 if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
1668 /* Note: we can assume here that the addition won't roll over and
1669 * change the SIZE field.
1670 */
1671 uint64_t va = src[0] | ((uint64_t)src[1] << 32);
1672 va += offset;
1673 dst[0] = va;
1674 dst[1] = va >> 32;
1675 } else {
1676 memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
1677 /* Note: A6XX_IBO_5_DEPTH is always 0 */
1678 uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
1679 va += offset;
1680 dst[4] = va;
1681 dst[5] = va >> 32;
1682 }
1683 }
1684
1685 for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
1686 if (set->buffers[j]) {
1687 tu_bo_list_add(&cmd->bo_list, set->buffers[j],
1688 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1689 }
1690 }
1691
1692 if (set->size > 0) {
1693 tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
1694 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1695 }
1696 }
1697 assert(dyn_idx == dynamicOffsetCount);
1698
1699 uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
1700 uint64_t addr[MAX_SETS + 1] = {};
1701 struct tu_cs cs;
1702
1703 for (uint32_t i = 0; i < MAX_SETS; i++) {
1704 struct tu_descriptor_set *set = descriptors_state->sets[i];
1705 if (set)
1706 addr[i] = set->va | 3;
1707 }
1708
1709 if (layout->dynamic_offset_count) {
1710 /* allocate and fill out dynamic descriptor set */
1711 struct tu_cs_memory dynamic_desc_set;
1712 VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
1713 A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
1714 assert(result == VK_SUCCESS);
1715
1716 memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
1717 layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
1718 addr[MAX_SETS] = dynamic_desc_set.iova | 3;
1719 }
1720
1721 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1722 sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
1723 hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
1724 hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);
1725
1726 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
1727 } else {
1728 assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
1729
1730 sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
1731 hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
1732 hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);
1733
1734 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
1735 }
1736
1737 tu_cs_begin_sub_stream(&cmd->sub_cs, 24, &cs);
1738
1739 tu_cs_emit_pkt4(&cs, sp_bindless_base_reg, 10);
1740 tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
1741 tu_cs_emit_pkt4(&cs, hlsq_bindless_base_reg, 10);
1742 tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
1743 tu_cs_emit_regs(&cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));
1744
1745 struct tu_cs_entry ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
1746 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1747 tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
1748 tu_cs_emit_sds_ib(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, ib);
1749 cmd->state.desc_sets_ib = ib;
1750 } else {
1751 /* note: for compute we could emit this directly instead of going
1752 * through a CP_INDIRECT; the blob, however, uses draw states for compute
1753 */
1754 tu_cs_emit_ib(&cmd->cs, &ib);
1755 }
1756 }
1757
1758 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
1759 uint32_t firstBinding,
1760 uint32_t bindingCount,
1761 const VkBuffer *pBuffers,
1762 const VkDeviceSize *pOffsets,
1763 const VkDeviceSize *pSizes)
1764 {
1765 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1766 struct tu_cs *cs = &cmd->draw_cs;
1767
1768 /* using COND_REG_EXEC for xfb commands matches the blob behavior;
1769 * presumably there isn't any benefit to using a draw state when the
1770 * condition is (SYSMEM | BINNING)
1771 */
1772 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1773 CP_COND_REG_EXEC_0_SYSMEM |
1774 CP_COND_REG_EXEC_0_BINNING);
1775
1776 for (uint32_t i = 0; i < bindingCount; i++) {
1777 TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
1778 uint64_t iova = buf->bo->iova + pOffsets[i];
1779 uint32_t size = buf->bo->size - pOffsets[i];
1780 uint32_t idx = i + firstBinding;
1781
1782 if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
1783 size = pSizes[i];
1784
1785 /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
1786 uint32_t offset = iova & 0x1f;
1787 iova &= ~(uint64_t) 0x1f;
1788
1789 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
1790 tu_cs_emit_qw(cs, iova);
1791 tu_cs_emit(cs, size + offset);
1792
1793 cmd->state.streamout_offset[idx] = offset;
1794
1795 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
1796 }
1797
1798 tu_cond_exec_end(cs);
1799 }
1800
1801 void
1802 tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1803 uint32_t firstCounterBuffer,
1804 uint32_t counterBufferCount,
1805 const VkBuffer *pCounterBuffers,
1806 const VkDeviceSize *pCounterBufferOffsets)
1807 {
1808 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1809 struct tu_cs *cs = &cmd->draw_cs;
1810
1811 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1812 CP_COND_REG_EXEC_0_SYSMEM |
1813 CP_COND_REG_EXEC_0_BINNING);
1814
1815 /* TODO: only update offset for active buffers */
1816 for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
1817 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
1818
1819 for (uint32_t i = 0; i < counterBufferCount; i++) {
1820 uint32_t idx = firstCounterBuffer + i;
1821 uint32_t offset = cmd->state.streamout_offset[idx];
1822
1823 if (!pCounterBuffers[i])
1824 continue;
1825
1826 TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1827
1828 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1829
1830 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1831 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1832 CP_MEM_TO_REG_0_UNK31 |
1833 CP_MEM_TO_REG_0_CNT(1));
1834 tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
1835
1836 if (offset) {
1837 tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1838 tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1839 CP_REG_RMW_0_SRC1_ADD);
1840 tu_cs_emit_qw(cs, 0xffffffff);
1841 tu_cs_emit_qw(cs, offset);
1842 }
1843 }
1844
1845 tu_cond_exec_end(cs);
1846 }
1847
1848 void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1849 uint32_t firstCounterBuffer,
1850 uint32_t counterBufferCount,
1851 const VkBuffer *pCounterBuffers,
1852 const VkDeviceSize *pCounterBufferOffsets)
1853 {
1854 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1855 struct tu_cs *cs = &cmd->draw_cs;
1856
1857 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1858 CP_COND_REG_EXEC_0_SYSMEM |
1859 CP_COND_REG_EXEC_0_BINNING);
1860
1861 /* TODO: only flush buffers that need to be flushed */
1862 for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1863 /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
1864 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
1865 tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
1866 tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
1867 }
1868
1869 for (uint32_t i = 0; i < counterBufferCount; i++) {
1870 uint32_t idx = firstCounterBuffer + i;
1871 uint32_t offset = cmd->state.streamout_offset[idx];
1872
1873 if (!pCounterBuffers[i])
1874 continue;
1875
1876 TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1877
1878 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
1879
1880 /* VPC_SO_FLUSH_BASE holds a dword counter, but the counter buffer expects bytes (hence SHIFT_BY_2) */
1881 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1882 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1883 CP_MEM_TO_REG_0_SHIFT_BY_2 |
1884 0x40000 | /* ??? */
1885 CP_MEM_TO_REG_0_UNK31 |
1886 CP_MEM_TO_REG_0_CNT(1));
1887 tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));
1888
1889 if (offset) {
1890 tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1891 tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1892 CP_REG_RMW_0_SRC1_ADD);
1893 tu_cs_emit_qw(cs, 0xffffffff);
1894 tu_cs_emit_qw(cs, -offset);
1895 }
1896
1897 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1898 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1899 CP_REG_TO_MEM_0_CNT(1));
1900 tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
1901 }
1902
1903 tu_cond_exec_end(cs);
1904
1905 cmd->state.xfb_used = true;
1906 }
1907
1908 void
1909 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
1910 VkPipelineLayout layout,
1911 VkShaderStageFlags stageFlags,
1912 uint32_t offset,
1913 uint32_t size,
1914 const void *pValues)
1915 {
1916 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1917 memcpy((void*) cmd->push_constants + offset, pValues, size);
1918 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
1919 }
1920
1921 /* Flush everything which has been made available but we haven't actually
1922 * flushed yet.
1923 */
1924 static void
1925 tu_flush_all_pending(struct tu_cache_state *cache)
1926 {
1927 cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
1928 cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
1929 }
1930
1931 VkResult
1932 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
1933 {
1934 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1935
1936 /* We currently flush the CCU at the end of the command buffer, like the
1937 * blob does. There's implicit synchronization around every
1938 * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
1939 * know yet if this command buffer will be the last in the submit, so we
1940 * have to defensively flush everything else.
1941 *
1942 * TODO: We could definitely do better than this, since these flushes
1943 * aren't required by Vulkan, but we'd need kernel support to do that.
1944 * Ideally, we'd like the kernel to flush everything afterwards, so that we
1945 * wouldn't have to do any flushes here, and when submitting multiple
1946 * command buffers there wouldn't be any unnecessary flushes in between.
1947 */
1948 if (cmd_buffer->state.pass) {
1949 tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
1950 tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
1951 } else {
1952 tu_flush_all_pending(&cmd_buffer->state.cache);
1953 cmd_buffer->state.cache.flush_bits |=
1954 TU_CMD_FLAG_CCU_FLUSH_COLOR |
1955 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
1956 tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
1957 }
1958
1959 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->global_bo,
1960 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1961
1962 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
1963 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
1964 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1965 }
1966
1967 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
1968 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
1969 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1970 }
1971
1972 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
1973 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
1974 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1975 }
1976
1977 tu_cs_end(&cmd_buffer->cs);
1978 tu_cs_end(&cmd_buffer->draw_cs);
1979 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
1980
1981 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
1982
1983 return cmd_buffer->record_result;
1984 }
1985
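/* Allocate a sub-stream of 'size' dwords for a dynamic state group, record
 * its iova/size under 'id' and point a CP_SET_DRAW_STATE group at it; the
 * caller then emits the actual register writes into the returned CS.
 */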
1986 static struct tu_cs
1987 tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
1988 {
1989 struct tu_cs_memory memory;
1990 struct tu_cs cs;
1991
1992 /* TODO: share this logic with tu_pipeline_static_state */
1993 tu_cs_alloc(&cmd->sub_cs, size, 1, &memory);
1994 tu_cs_init_external(&cs, memory.map, memory.map + size);
1995 tu_cs_begin(&cs);
1996 tu_cs_reserve_space(&cs, size);
1997
1998 assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
1999 cmd->state.dynamic_state[id].iova = memory.iova;
2000 cmd->state.dynamic_state[id].size = size;
2001
2002 tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2003 tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2004
2005 return cs;
2006 }
2007
2008 void
2009 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2010 VkPipelineBindPoint pipelineBindPoint,
2011 VkPipeline _pipeline)
2012 {
2013 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2014 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2015
2016 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2017 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2018 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2019 }
2020
2021 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
2022 cmd->state.compute_pipeline = pipeline;
2023 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2024 return;
2025 }
2026
2027 assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
2028
2029 cmd->state.pipeline = pipeline;
2030 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2031
2032 struct tu_cs *cs = &cmd->draw_cs;
2033 uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
2034 uint32_t i;
2035
2036 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
2037 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib);
2038 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib);
2039 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib);
2040 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib);
2041 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib);
2042 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS, pipeline->ds.state_ib);
2043 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_BLEND, pipeline->blend.state_ib);
2044
2045 for_each_bit(i, mask)
2046 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
2047
2048 /* If the new pipeline requires more VBs than we had previously set up, we
2049 * need to re-emit them in SDS. If it requires the same set or fewer, we
2050 * can just re-use the old SDS.
2051 */
2052 if (pipeline->vi.bindings_used & ~cmd->vertex_bindings_set)
2053 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2054
2055 /* If the pipeline needs a dynamic descriptor, re-emit descriptor sets */
2056 if (pipeline->layout->dynamic_offset_count)
2057 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2058
2059 /* the dynamic line width state depends on the pipeline's gras_su_cntl,
2060 * so the dynamic state IB must be updated whenever the pipeline changes
2061 */
2062 if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_LINE_WIDTH)) {
2063 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_LINE_WIDTH, 2);
2064
2065 cmd->state.dynamic_gras_su_cntl &= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2066 cmd->state.dynamic_gras_su_cntl |= pipeline->gras_su_cntl;
2067
2068 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.dynamic_gras_su_cntl));
2069 }
2070 }
2071
2072 void
2073 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2074 uint32_t firstViewport,
2075 uint32_t viewportCount,
2076 const VkViewport *pViewports)
2077 {
2078 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2079 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 18);
2080
2081 assert(firstViewport == 0 && viewportCount == 1);
2082
2083 tu6_emit_viewport(&cs, pViewports);
2084 }
2085
2086 void
2087 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2088 uint32_t firstScissor,
2089 uint32_t scissorCount,
2090 const VkRect2D *pScissors)
2091 {
2092 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2093 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 3);
2094
2095 assert(firstScissor == 0 && scissorCount == 1);
2096
2097 tu6_emit_scissor(&cs, pScissors);
2098 }
2099
2100 void
2101 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2102 {
2103 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2104 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_LINE_WIDTH, 2);
2105
2106 cmd->state.dynamic_gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2107 cmd->state.dynamic_gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
2108
2109 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.dynamic_gras_su_cntl));
2110 }
2111
2112 void
2113 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2114 float depthBiasConstantFactor,
2115 float depthBiasClamp,
2116 float depthBiasSlopeFactor)
2117 {
2118 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2119 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
2120
2121 tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
2122 }
2123
2124 void
2125 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2126 const float blendConstants[4])
2127 {
2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
2130
2131 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2132 tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
2133 }
2134
2135 void
2136 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2137 float minDepthBounds,
2138 float maxDepthBounds)
2139 {
2140 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2141 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
2142
2143 tu_cs_emit_regs(&cs,
2144 A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
2145 A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
2146 }
2147
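/* The stencil compare/write/reference registers pack the front-face value
 * in bits [7:0] and the back-face value in bits [15:8]; only update the
 * faces selected by the face mask.
 */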
2148 static void
2149 update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
2150 {
2151 if (face & VK_STENCIL_FACE_FRONT_BIT)
2152 *value = (*value & 0xff00) | (mask & 0xff);
2153 if (face & VK_STENCIL_FACE_BACK_BIT)
2154 *value = (*value & 0xff) | (mask & 0xff) << 8;
2155 }
2156
2157 void
2158 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2159 VkStencilFaceFlags faceMask,
2160 uint32_t compareMask)
2161 {
2162 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2163 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
2164
2165 update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
2166
2167 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
2168 }
2169
2170 void
2171 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2172 VkStencilFaceFlags faceMask,
2173 uint32_t writeMask)
2174 {
2175 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2176 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
2177
2178 update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
2179
2180 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
2181 }
2182
2183 void
2184 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2185 VkStencilFaceFlags faceMask,
2186 uint32_t reference)
2187 {
2188 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2189 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
2190
2191 update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
2192
2193 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
2194 }
2195
2196 void
2197 tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2198 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2199 {
2200 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2201 struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
2202
2203 assert(pSampleLocationsInfo);
2204
2205 tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
2206 }
2207
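/* Given the accesses performed by the source (writes that may still sit in
 * a cache) and by the destination, accumulate in flush_bits the
 * flush/invalidate bits that must be emitted now, and defer the rest
 * through pending_flush_bits for later barriers to pick up.
 */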
2208 static void
2209 tu_flush_for_access(struct tu_cache_state *cache,
2210 enum tu_cmd_access_mask src_mask,
2211 enum tu_cmd_access_mask dst_mask)
2212 {
2213 enum tu_cmd_flush_bits flush_bits = 0;
2214
2215 if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
2216 cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
2217 }
2218
2219 #define SRC_FLUSH(domain, flush, invalidate) \
2220 if (src_mask & TU_ACCESS_##domain##_WRITE) { \
2221 cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
2222 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2223 }
2224
2225 SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2226 SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2227 SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2228
2229 #undef SRC_FLUSH
2230
2231 #define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
2232 if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
2233 flush_bits |= TU_CMD_FLAG_##flush; \
2234 cache->pending_flush_bits |= \
2235 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2236 }
2237
2238 SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2239 SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2240
2241 #undef SRC_INCOHERENT_FLUSH
2242
2243 if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
2244 flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2245 }
2246
2247 #define DST_FLUSH(domain, flush, invalidate) \
2248 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2249 TU_ACCESS_##domain##_WRITE)) { \
2250 flush_bits |= cache->pending_flush_bits & \
2251 (TU_CMD_FLAG_##invalidate | \
2252 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2253 }
2254
2255 DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2256 DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2257 DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2258
2259 #undef DST_FLUSH
2260
2261 #define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
2262 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2263 TU_ACCESS_##domain##_WRITE)) { \
2264 flush_bits |= TU_CMD_FLAG_##invalidate | \
2265 (cache->pending_flush_bits & \
2266 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2267 }
2268
2269 DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2270 DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2271
2272 #undef DST_INCOHERENT_FLUSH
2273
2274 if (dst_mask & TU_ACCESS_WFI_READ) {
2275 flush_bits |= TU_CMD_FLAG_WFI;
2276 }
2277
2278 cache->flush_bits |= flush_bits;
2279 cache->pending_flush_bits &= ~flush_bits;
2280 }
2281
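/* Translate Vulkan access flags into the driver's internal access mask.
 * 'gmem' indicates the access happens inside a GMEM render pass, where
 * attachment accesses bypass the CCU (see the GMEM comment below).
 */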
2282 static enum tu_cmd_access_mask
2283 vk2tu_access(VkAccessFlags flags, bool gmem)
2284 {
2285 enum tu_cmd_access_mask mask = 0;
2286
2287 /* If the GPU writes a buffer that is then read by an indirect draw
2288 * command, we theoretically need a WFI + WAIT_FOR_ME combination to
2289 * wait for the writes to complete. The WAIT_FOR_ME is performed as part
2290 * of the draw by the firmware, so we just need to execute a WFI.
2291 */
2292 if (flags &
2293 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
2294 VK_ACCESS_MEMORY_READ_BIT)) {
2295 mask |= TU_ACCESS_WFI_READ;
2296 }
2297
2298 if (flags &
2299 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
2300 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
2301 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
2302 VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
2303 VK_ACCESS_MEMORY_READ_BIT)) {
2304 mask |= TU_ACCESS_SYSMEM_READ;
2305 }
2306
2307 if (flags &
2308 (VK_ACCESS_HOST_WRITE_BIT |
2309 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
2310 VK_ACCESS_MEMORY_WRITE_BIT)) {
2311 mask |= TU_ACCESS_SYSMEM_WRITE;
2312 }
2313
2314 if (flags &
2315 (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
2316 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
2317 VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
2318 /* TODO: Is there a no-cache bit for textures so that we can ignore
2319 * these?
2320 */
2321 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
2322 VK_ACCESS_SHADER_READ_BIT | /* Read performed by SP/TP */
2323 VK_ACCESS_MEMORY_READ_BIT)) {
2324 mask |= TU_ACCESS_UCHE_READ;
2325 }
2326
2327 if (flags &
2328 (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
2329 VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
2330 VK_ACCESS_MEMORY_WRITE_BIT)) {
2331 mask |= TU_ACCESS_UCHE_WRITE;
2332 }
2333
2334 /* When using GMEM, the CCU is always flushed automatically to GMEM, and
2335 * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
2336 * previous writes in sysmem mode when transitioning to GMEM. Therefore we
2337 * can ignore CCU and pretend that color attachments and transfers use
2338 * sysmem directly.
2339 */
2340
2341 if (flags &
2342 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
2343 VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
2344 VK_ACCESS_MEMORY_READ_BIT)) {
2345 if (gmem)
2346 mask |= TU_ACCESS_SYSMEM_READ;
2347 else
2348 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
2349 }
2350
2351 if (flags &
2352 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
2353 VK_ACCESS_MEMORY_READ_BIT)) {
2354 if (gmem)
2355 mask |= TU_ACCESS_SYSMEM_READ;
2356 else
2357 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
2358 }
2359
2360 if (flags &
2361 (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
2362 VK_ACCESS_MEMORY_WRITE_BIT)) {
2363 if (gmem) {
2364 mask |= TU_ACCESS_SYSMEM_WRITE;
2365 } else {
2366 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2367 }
2368 }
2369
2370 if (flags &
2371 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
2372 VK_ACCESS_MEMORY_WRITE_BIT)) {
2373 if (gmem) {
2374 mask |= TU_ACCESS_SYSMEM_WRITE;
2375 } else {
2376 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2377 }
2378 }
2379
2380 /* When the dst access is a transfer read/write, it seems we sometimes need
2381 * to insert a WFI after any flushes, to guarantee that the flushes finish
2382 * before the 2D engine starts. However the opposite (i.e. a WFI after
2383 * CP_BLIT and before any subsequent flush) does not seem to be needed, and
2384 * the blob doesn't emit such a WFI.
2385 */
2386
2387 if (flags &
2388 (VK_ACCESS_TRANSFER_WRITE_BIT |
2389 VK_ACCESS_MEMORY_WRITE_BIT)) {
2390 if (gmem) {
2391 mask |= TU_ACCESS_SYSMEM_WRITE;
2392 } else {
2393 mask |= TU_ACCESS_CCU_COLOR_WRITE;
2394 }
2395 mask |= TU_ACCESS_WFI_READ;
2396 }
2397
2398 if (flags &
2399 (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
2400 VK_ACCESS_MEMORY_READ_BIT)) {
2401 mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
2402 }
2403
2404 return mask;
2405 }
2406
2407
2408 void
2409 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2410 uint32_t commandBufferCount,
2411 const VkCommandBuffer *pCmdBuffers)
2412 {
2413 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2414 VkResult result;
2415
2416 assert(commandBufferCount > 0);
2417
2418 /* Emit any pending flushes. */
2419 if (cmd->state.pass) {
2420 tu_flush_all_pending(&cmd->state.renderpass_cache);
2421 tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
2422 } else {
2423 tu_flush_all_pending(&cmd->state.cache);
2424 tu_emit_cache_flush(cmd, &cmd->cs);
2425 }
2426
2427 for (uint32_t i = 0; i < commandBufferCount; i++) {
2428 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2429
2430 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2431 if (result != VK_SUCCESS) {
2432 cmd->record_result = result;
2433 break;
2434 }
2435
2436 if (secondary->usage_flags &
2437 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2438 assert(tu_cs_is_empty(&secondary->cs));
2439
2440 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2441 if (result != VK_SUCCESS) {
2442 cmd->record_result = result;
2443 break;
2444 }
2445
2446 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2447 &secondary->draw_epilogue_cs);
2448 if (result != VK_SUCCESS) {
2449 cmd->record_result = result;
2450 break;
2451 }
2452
2453 if (secondary->has_tess)
2454 cmd->has_tess = true;
2455 } else {
2456 assert(tu_cs_is_empty(&secondary->draw_cs));
2457 assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2458
2459 for (uint32_t j = 0; j < secondary->cs.bo_count; j++) {
2460 tu_bo_list_add(&cmd->bo_list, secondary->cs.bos[j],
2461 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2462 }
2463
2464 tu_cs_add_entries(&cmd->cs, &secondary->cs);
2465 }
2466
2467 cmd->state.index_size = secondary->state.index_size; /* for restart index update */
2468 }
2469 cmd->state.dirty = ~0u; /* TODO: only set the dirty bits that actually need it */
2470
2471 /* After executing secondary command buffers, there may have been arbitrary
2472 * flushes executed, so when we encounter a pipeline barrier with a
2473 * srcMask, we have to assume that we need to invalidate. Therefore we need
2474 * to re-initialize the cache with all pending invalidate bits set.
2475 */
2476 if (cmd->state.pass) {
2477 tu_cache_init(&cmd->state.renderpass_cache);
2478 } else {
2479 tu_cache_init(&cmd->state.cache);
2480 }
2481 }
2482
2483 VkResult
2484 tu_CreateCommandPool(VkDevice _device,
2485 const VkCommandPoolCreateInfo *pCreateInfo,
2486 const VkAllocationCallbacks *pAllocator,
2487 VkCommandPool *pCmdPool)
2488 {
2489 TU_FROM_HANDLE(tu_device, device, _device);
2490 struct tu_cmd_pool *pool;
2491
2492 pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
2493 VK_OBJECT_TYPE_COMMAND_POOL);
2494 if (pool == NULL)
2495 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2496
2497 if (pAllocator)
2498 pool->alloc = *pAllocator;
2499 else
2500 pool->alloc = device->vk.alloc;
2501
2502 list_inithead(&pool->cmd_buffers);
2503 list_inithead(&pool->free_cmd_buffers);
2504
2505 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2506
2507 *pCmdPool = tu_cmd_pool_to_handle(pool);
2508
2509 return VK_SUCCESS;
2510 }
2511
2512 void
2513 tu_DestroyCommandPool(VkDevice _device,
2514 VkCommandPool commandPool,
2515 const VkAllocationCallbacks *pAllocator)
2516 {
2517 TU_FROM_HANDLE(tu_device, device, _device);
2518 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2519
2520 if (!pool)
2521 return;
2522
2523 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2524 &pool->cmd_buffers, pool_link)
2525 {
2526 tu_cmd_buffer_destroy(cmd_buffer);
2527 }
2528
2529 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2530 &pool->free_cmd_buffers, pool_link)
2531 {
2532 tu_cmd_buffer_destroy(cmd_buffer);
2533 }
2534
2535 vk_object_free(&device->vk, pAllocator, pool);
2536 }
2537
2538 VkResult
2539 tu_ResetCommandPool(VkDevice device,
2540 VkCommandPool commandPool,
2541 VkCommandPoolResetFlags flags)
2542 {
2543 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2544 VkResult result;
2545
2546 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2547 pool_link)
2548 {
2549 result = tu_reset_cmd_buffer(cmd_buffer);
2550 if (result != VK_SUCCESS)
2551 return result;
2552 }
2553
2554 return VK_SUCCESS;
2555 }
2556
2557 void
2558 tu_TrimCommandPool(VkDevice device,
2559 VkCommandPool commandPool,
2560 VkCommandPoolTrimFlags flags)
2561 {
2562 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2563
2564 if (!pool)
2565 return;
2566
2567 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2568 &pool->free_cmd_buffers, pool_link)
2569 {
2570 tu_cmd_buffer_destroy(cmd_buffer);
2571 }
2572 }
2573
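/* Turn a subpass dependency's access masks into cache-flush state, applied
 * either to the command-buffer-level cache ('external') or to the
 * renderpass cache.
 */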
2574 static void
2575 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
2576 const struct tu_subpass_barrier *barrier,
2577 bool external)
2578 {
2579 /* Note: we don't know until the end of the subpass whether we'll use
2580 * sysmem, so assume sysmem here to be safe.
2581 */
2582 struct tu_cache_state *cache =
2583 external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
2584 enum tu_cmd_access_mask src_flags =
2585 vk2tu_access(barrier->src_access_mask, false);
2586 enum tu_cmd_access_mask dst_flags =
2587 vk2tu_access(barrier->dst_access_mask, false);
2588
2589 if (barrier->incoherent_ccu_color)
2590 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2591 if (barrier->incoherent_ccu_depth)
2592 src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2593
2594 tu_flush_for_access(cache, src_flags, dst_flags);
2595 }
2596
2597 void
2598 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2599 const VkRenderPassBeginInfo *pRenderPassBegin,
2600 VkSubpassContents contents)
2601 {
2602 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2603 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2604 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2605
2606 cmd->state.pass = pass;
2607 cmd->state.subpass = pass->subpasses;
2608 cmd->state.framebuffer = fb;
2609 cmd->state.render_area = pRenderPassBegin->renderArea;
2610
2611 tu_cmd_prepare_tile_store_ib(cmd);
2612
2613 /* Note: because this is external, any flushes will happen before draw_cs
2614 * gets called. However, deferred flushes may still have to happen later as
2615 * part of the subpass.
2616 */
2617 tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
2618 cmd->state.renderpass_cache.pending_flush_bits =
2619 cmd->state.cache.pending_flush_bits;
2620 cmd->state.renderpass_cache.flush_bits = 0;
2621
2622 tu_emit_renderpass_begin(cmd, pRenderPassBegin);
2623
2624 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2625 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2626 tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);
2627 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2628
2629 tu_set_input_attachments(cmd, cmd->state.subpass);
2630
2631 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2632 const struct tu_image_view *iview = fb->attachments[i].attachment;
2633 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2634 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2635 }
2636
2637 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2638 }
2639
2640 void
2641 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2642 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2643 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2644 {
2645 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2646 pSubpassBeginInfo->contents);
2647 }
2648
2649 void
2650 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2651 {
2652 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2653 const struct tu_render_pass *pass = cmd->state.pass;
2654 struct tu_cs *cs = &cmd->draw_cs;
2655
2656 const struct tu_subpass *subpass = cmd->state.subpass++;
2657
2658 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2659
2660 if (subpass->resolve_attachments) {
2661 tu6_emit_blit_scissor(cmd, cs, true);
2662
2663 for (unsigned i = 0; i < subpass->color_count; i++) {
2664 uint32_t a = subpass->resolve_attachments[i].attachment;
2665 if (a == VK_ATTACHMENT_UNUSED)
2666 continue;
2667
2668 tu_store_gmem_attachment(cmd, cs, a,
2669 subpass->color_attachments[i].attachment);
2670
2671 if (pass->attachments[a].gmem_offset < 0)
2672 continue;
2673
2674 /* TODO:
2675 * check if the resolved attachment is needed by later subpasses; if it is,
2676 * we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
2677 */
2678 tu_finishme("missing GMEM->GMEM resolve path\n");
2679 tu_load_gmem_attachment(cmd, cs, a, true);
2680 }
2681 }
2682
2683 tu_cond_exec_end(cs);
2684
2685 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2686
2687 tu6_emit_sysmem_resolves(cmd, cs, subpass);
2688
2689 tu_cond_exec_end(cs);
2690
2691 /* Handle dependencies for the next subpass */
2692 tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
2693
2694 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2695 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2696 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2697 tu6_emit_msaa(cs, cmd->state.subpass->samples);
2698 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2699
2700 tu_set_input_attachments(cmd, cmd->state.subpass);
2701 }
2702
2703 void
2704 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2705 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2706 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2707 {
2708 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2709 }
2710
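/* Emit the user constants for one stage: push constants are loaded
 * directly (SS6_DIRECT), while UBO ranges promoted to constants by ir3's
 * UBO analysis are loaded indirectly from the bindless descriptor's VA.
 */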
2711 static void
2712 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2713 struct tu_descriptor_state *descriptors_state,
2714 gl_shader_stage type,
2715 uint32_t *push_constants)
2716 {
2717 const struct tu_program_descriptor_linkage *link =
2718 &pipeline->program.link[type];
2719 const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
2720
2721 if (link->push_consts.count > 0) {
2722 unsigned num_units = link->push_consts.count;
2723 unsigned offset = link->push_consts.lo;
2724 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
2725 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
2726 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2727 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2728 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2729 CP_LOAD_STATE6_0_NUM_UNIT(num_units));
2730 tu_cs_emit(cs, 0);
2731 tu_cs_emit(cs, 0);
2732 for (unsigned i = 0; i < num_units * 4; i++)
2733 tu_cs_emit(cs, push_constants[i + offset * 4]);
2734 }
2735
2736 for (uint32_t i = 0; i < state->num_enabled; i++) {
2737 uint32_t size = state->range[i].end - state->range[i].start;
2738 uint32_t offset = state->range[i].start;
2739
2740 /* even if the start of the range fits within the shader's constlen,
2741 * the end may not, so clamp the size to what the shader can consume:
2742 */
2743 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2744
2745 if (size == 0)
2746 continue;
2747
2748 /* things should be aligned to vec4: */
2749 debug_assert((state->range[i].offset % 16) == 0);
2750 debug_assert((size % 16) == 0);
2751 debug_assert((offset % 16) == 0);
2752
2753 /* Dig out the descriptor from the descriptor state and read the VA from
2754 * it.
2755 */
2756 assert(state->range[i].ubo.bindless);
2757 uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
2758 descriptors_state->dynamic_descriptors :
2759 descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
2760 unsigned block = state->range[i].ubo.block;
2761 uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
2762 uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
2763 assert(va);
2764
2765 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2766 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2767 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2768 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2769 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2770 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2771 tu_cs_emit_qw(cs, va + offset);
2772 }
2773 }
2774
2775 static struct tu_cs_entry
2776 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2777 const struct tu_pipeline *pipeline,
2778 struct tu_descriptor_state *descriptors_state,
2779 gl_shader_stage type)
2780 {
2781 struct tu_cs cs;
2782 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2783
2784 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2785
2786 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2787 }
2788
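/* Build a sub-stream with VFD_FETCH base/size for every vertex buffer
 * binding used by the pipeline, and remember which bindings were set so
 * tu_CmdBindPipeline can tell when a re-emit is needed.
 */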
2789 static struct tu_cs_entry
2790 tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
2791 const struct tu_pipeline *pipeline)
2792 {
2793 struct tu_cs cs;
2794 tu_cs_begin_sub_stream(&cmd->sub_cs, 4 * MAX_VBS, &cs);
2795
2796 int binding;
2797 for_each_bit(binding, pipeline->vi.bindings_used) {
2798 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
2799 const VkDeviceSize offset = buf->bo_offset +
2800 cmd->state.vb.offsets[binding];
2801
2802 tu_cs_emit_regs(&cs,
2803 A6XX_VFD_FETCH_BASE(binding, .bo = buf->bo, .bo_offset = offset),
2804 A6XX_VFD_FETCH_SIZE(binding, buf->size - offset));
2805
2806 }
2807
2808 cmd->vertex_bindings_set = pipeline->vi.bindings_used;
2809
2810 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2811 }
2812
2813 static uint64_t
2814 get_tess_param_bo_size(const struct tu_pipeline *pipeline,
2815 uint32_t draw_count)
2816 {
2817 /* TODO: For indirect draws, we can't compute the BO size ahead of time.
2818 * Still not sure what to do here, so just allocate a reasonably large
2819 * BO and hope for the best for now. */
2820 if (!draw_count)
2821 draw_count = 2048;
2822
2823 /* the tess param BO is pipeline->tess.param_stride bytes per patch,
2824 * which includes both the per-vertex outputs and per-patch outputs;
2825 * build_primitive_map in ir3 calculates this stride
2826 */
2827 uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
2828 uint32_t num_patches = draw_count / verts_per_patch;
2829 return num_patches * pipeline->tess.param_stride;
2830 }
2831
2832 static uint64_t
2833 get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
2834 uint32_t draw_count)
2835 {
2836 /* TODO: For indirect draws, we can't compute the BO size ahead of time.
2837 * Still not sure what to do here, so just allocate a reasonably large
2838 * BO and hope for the best for now. */
2839 if (!draw_count)
2840 draw_count = 2048;
2841
2842 /* Each distinct patch gets its own tess factor output. */
2843 uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
2844 uint32_t num_patches = draw_count / verts_per_patch;
2845 uint32_t factor_stride;
2846 switch (pipeline->tess.patch_type) {
2847 case IR3_TESS_ISOLINES:
2848 factor_stride = 12;
2849 break;
2850 case IR3_TESS_TRIANGLES:
2851 factor_stride = 20;
2852 break;
2853 case IR3_TESS_QUADS:
2854 factor_stride = 28;
2855 break;
2856 default:
2857 unreachable("bad tessmode");
2858 }
2859 return factor_stride * num_patches;
2860 }
2861
2862 static VkResult
2863 tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
2864 uint32_t draw_count,
2865 const struct tu_pipeline *pipeline,
2866 struct tu_cs_entry *entry)
2867 {
2868 struct tu_cs cs;
2869 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 20, &cs);
2870 if (result != VK_SUCCESS)
2871 return result;
2872
2873 uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
2874 uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
2875 uint64_t tess_bo_size = tess_factor_size + tess_param_size;
2876 if (tess_bo_size > 0) {
2877 struct tu_bo *tess_bo;
2878 result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
2879 if (result != VK_SUCCESS)
2880 return result;
2881
2882 tu_bo_list_add(&cmd->bo_list, tess_bo,
2883 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2884 uint64_t tess_factor_iova = tess_bo->iova;
2885 uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
2886
2887 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2888 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
2889 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2890 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2891 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
2892 CP_LOAD_STATE6_0_NUM_UNIT(1));
2893 tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2894 tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2895 tu_cs_emit_qw(&cs, tess_param_iova);
2896 tu_cs_emit_qw(&cs, tess_factor_iova);
2897
2898 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2899 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
2900 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2901 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2902 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
2903 CP_LOAD_STATE6_0_NUM_UNIT(1));
2904 tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2905 tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2906 tu_cs_emit_qw(&cs, tess_param_iova);
2907 tu_cs_emit_qw(&cs, tess_factor_iova);
2908
2909 tu_cs_emit_pkt4(&cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
2910 tu_cs_emit_qw(&cs, tess_factor_iova);
2911
2912 /* TODO: Without this WFI here, the hardware seems unable to read these
2913 * addresses we just emitted. Freedreno emits these consts as part of
2914 * IB1 instead of in a draw state, which might make this WFI unnecessary,
2915 * but it requires a bit more indirection (SS6_INDIRECT for consts). */
2916 tu_cs_emit_wfi(&cs);
2917 }
2918 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2919 return VK_SUCCESS;
2920 }
2921
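/* State emission shared by all draw entry points: flush renderpass caches,
 * set up primitive restart and the tessellation domain origin, rebuild
 * whatever const/descriptor/VB sub-streams are dirty, and (re-)emit the
 * draw state groups that changed (or all of them for the first draw of a
 * renderpass).
 */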
2922 static VkResult
2923 tu6_draw_common(struct tu_cmd_buffer *cmd,
2924 struct tu_cs *cs,
2925 bool indexed,
2926 /* note: draw_count is 0 for indirect */
2927 uint32_t draw_count)
2928 {
2929 const struct tu_pipeline *pipeline = cmd->state.pipeline;
2930 VkResult result;
2931
2932 struct tu_descriptor_state *descriptors_state =
2933 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
2934
2935 tu_emit_cache_flush_renderpass(cmd, cs);
2936
2937 /* TODO lrz */
2938
2939 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
2940 .primitive_restart =
2941 pipeline->ia.primitive_restart && indexed,
2942 .tess_upper_left_domain_origin =
2943 pipeline->tess.upper_left_domain_origin));
2944
2945 if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
2946 cmd->state.shader_const_ib[MESA_SHADER_VERTEX] =
2947 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX);
2948 cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL] =
2949 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_TESS_CTRL);
2950 cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL] =
2951 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_TESS_EVAL);
2952 cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY] =
2953 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_GEOMETRY);
2954 cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT] =
2955 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
2956 }
2957
2958 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
2959 /* We need to reload the descriptors every time the descriptor sets
2960 * change. However, the commands we send only depend on the pipeline
2961 * because the whole point is to cache descriptors which are used by the
2962 * pipeline. There's a problem here, in that the firmware has an
2963 * "optimization" which skips executing groups that are set to the same
2964 * value as the last draw. This means that if the descriptor sets change
2965 * but not the pipeline, we'd try to re-execute the same buffer which
2966 * the firmware would ignore and we wouldn't pre-load the new
2967 * descriptors. The blob seems to re-emit the LOAD_STATE group whenever
2968 * the descriptor sets change, which we emulate here by copying the
2969 * pre-prepared buffer.
2970 */
2971 const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib;
2972 if (load_entry->size > 0) {
2973 struct tu_cs load_cs;
2974 result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs);
2975 if (result != VK_SUCCESS)
2976 return result;
2977 tu_cs_emit_array(&load_cs,
2978 (uint32_t *)((char *)load_entry->bo->map + load_entry->offset),
2979 load_entry->size / 4);
2980 cmd->state.desc_sets_load_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs);
2981 } else {
2982 cmd->state.desc_sets_load_ib.size = 0;
2983 }
2984 }
2985
2986 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
2987 cmd->state.vertex_buffers_ib = tu6_emit_vertex_buffers(cmd, pipeline);
2988
2989 bool has_tess =
2990 pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
2991 struct tu_cs_entry tess_consts = {};
2992 if (has_tess) {
2993 cmd->has_tess = true;
2994 result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts);
2995 if (result != VK_SUCCESS)
2996 return result;
2997 }
2998
2999 /* for the first draw in a renderpass, re-emit all the draw states
3000 *
3001 * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
3002 * used, then draw states must be re-emitted. note however this only happens
3003 * in the sysmem path, so it can be skipped for the gmem path (TODO)
3004 *
3005 * the two input attachment states are excluded because a secondary command
3006 * buffer doesn't have a state ib to restore them, and not re-emitting them
3007 * is OK since CmdClearAttachments won't disable/overwrite them
3008 */
3009 if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
3010 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3011
3012 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib);
3013 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib);
3014 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
3015 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib);
3016 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib);
3017 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib);
3018 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS, pipeline->ds.state_ib);
3019 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_BLEND, pipeline->blend.state_ib);
3020 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]);
3021 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_HS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL]);
3022 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL]);
3023 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
3024 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
3025 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
3026 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
3027 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
3028 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3029
3030 for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
3031 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
3032 ((pipeline->dynamic_state_mask & BIT(i)) ?
3033 cmd->state.dynamic_state[i] :
3034 pipeline->dynamic_state[i]));
3035 }
3036 } else {
3037
3038 /* emit only the draw states that were just updated;
3039 * note that eventually we don't want to have to emit anything here
3040 */
3041 uint32_t draw_state_count =
3042 has_tess +
3043 ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 5 : 0) +
3044 ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) +
3045 ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
3046 1; /* vs_params */
3047
3048 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
3049
3050 /* We may need to re-emit tess consts if the current draw call is
3051 * sufficiently larger than the last draw call. */
3052 if (has_tess)
3053 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
3054 if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3055 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]);
3056 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_HS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL]);
3057 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL]);
3058 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
3059 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
3060 }
3061 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS)
3062 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
3063 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
3064 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
3065 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3066 }
3067
3068 tu_cs_sanity_check(cs);
3069
3070 /* There are too many graphics dirty bits to list here, so just list the
3071 * bits to preserve instead. The only things not emitted here are
3072 * compute-related state.
3073 */
3074 cmd->state.dirty &= (TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3075 return VK_SUCCESS;
3076 }
3077
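/* Pack the CP_DRAW_INDX_OFFSET initiator dword: primitive type, index
 * source/size, visibility culling, plus GS/tessellation enables from the
 * pipeline.
 */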
3078 static uint32_t
3079 tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
3080 {
3081 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3082 uint32_t initiator =
3083 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(pipeline->ia.primtype) |
3084 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
3085 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
3086 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
3087
3088 if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
3089 initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
3090
3091 switch (pipeline->tess.patch_type) {
3092 case IR3_TESS_TRIANGLES:
3093 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
3094 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3095 break;
3096 case IR3_TESS_ISOLINES:
3097 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
3098 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3099 break;
3100 case IR3_TESS_NONE:
3101 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
3102 break;
3103 case IR3_TESS_QUADS:
3104 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
3105 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3106 break;
3107 }
3108 return initiator;
3109 }
3110
3111
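/* Offset (in vec4 units) of the VS driver params within the const file,
 * or 0 if the shader doesn't use them.
 */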
3112 static uint32_t
3113 vs_params_offset(struct tu_cmd_buffer *cmd)
3114 {
3115 const struct tu_program_descriptor_linkage *link =
3116 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3117 const struct ir3_const_state *const_state = &link->const_state;
3118
3119 if (const_state->offsets.driver_param >= link->constlen)
3120 return 0;
3121
3122 /* this layout is required by CP_DRAW_INDIRECT_MULTI */
3123 STATIC_ASSERT(IR3_DP_DRAWID == 0);
3124 STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
3125 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3126
3127 /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
3128 assert(const_state->offsets.driver_param != 0);
3129
3130 return const_state->offsets.driver_param;
3131 }
3132
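/* Emit the per-draw VS params draw state: VFD_INDEX_OFFSET and
 * VFD_INSTANCE_START_OFFSET, plus the driver params (draw id, base vertex,
 * base instance) as constants when the shader reads them.
 */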
3133 static struct tu_draw_state
3134 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3135 uint32_t vertex_offset,
3136 uint32_t first_instance)
3137 {
3138 uint32_t offset = vs_params_offset(cmd);
3139
3140 struct tu_cs cs;
3141 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
3142 if (result != VK_SUCCESS) {
3143 cmd->record_result = result;
3144 return (struct tu_draw_state) {};
3145 }
3146
3147 /* TODO: don't make a new draw state when it doesn't change */
3148
3149 tu_cs_emit_regs(&cs,
3150 A6XX_VFD_INDEX_OFFSET(vertex_offset),
3151 A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
3152
3153 if (offset) {
3154 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3155 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3156 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3157 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3158 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3159 CP_LOAD_STATE6_0_NUM_UNIT(1));
3160 tu_cs_emit(&cs, 0);
3161 tu_cs_emit(&cs, 0);
3162
3163 tu_cs_emit(&cs, 0);
3164 tu_cs_emit(&cs, vertex_offset);
3165 tu_cs_emit(&cs, first_instance);
3166 tu_cs_emit(&cs, 0);
3167 }
3168
3169 struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3170 return (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
3171 }
3172
3173 void
3174 tu_CmdDraw(VkCommandBuffer commandBuffer,
3175 uint32_t vertexCount,
3176 uint32_t instanceCount,
3177 uint32_t firstVertex,
3178 uint32_t firstInstance)
3179 {
3180 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3181 struct tu_cs *cs = &cmd->draw_cs;
3182
3183 cmd->state.vs_params = tu6_emit_vs_params(cmd, firstVertex, firstInstance);
3184
3185 tu6_draw_common(cmd, cs, false, vertexCount);
3186
3187 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3188 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3189 tu_cs_emit(cs, instanceCount);
3190 tu_cs_emit(cs, vertexCount);
3191 }
3192
3193 void
3194 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3195 uint32_t indexCount,
3196 uint32_t instanceCount,
3197 uint32_t firstIndex,
3198 int32_t vertexOffset,
3199 uint32_t firstInstance)
3200 {
3201 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3202 struct tu_cs *cs = &cmd->draw_cs;
3203
3204 cmd->state.vs_params = tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
3205
3206 tu6_draw_common(cmd, cs, true, indexCount);
3207
3208 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3209 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
3210 tu_cs_emit(cs, instanceCount);
3211 tu_cs_emit(cs, indexCount);
3212 tu_cs_emit(cs, firstIndex);
3213 tu_cs_emit_qw(cs, cmd->state.index_va);
3214 tu_cs_emit(cs, cmd->state.max_index_count);
3215 }
3216
3217 void
3218 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3219 VkBuffer _buffer,
3220 VkDeviceSize offset,
3221 uint32_t drawCount,
3222 uint32_t stride)
3223 {
3224 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3225 TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3226 struct tu_cs *cs = &cmd->draw_cs;
3227
3228 cmd->state.vs_params = (struct tu_draw_state) {};
3229
3230 tu6_draw_common(cmd, cs, false, 0);
3231
3232 /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
3233 * doesn't wait for WFIs to complete and leads to a GPU fault/hang.
3234 * TODO: this could be worked around in a more performant way,
3235 * or there may exist newer firmware where this has been fixed
3236 */
3237 if (cmd->device->physical_device->gpu_id != 650)
3238 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
3239
3240 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
3241 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3242 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
3243 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
3244 tu_cs_emit(cs, drawCount);
3245 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
3246 tu_cs_emit(cs, stride);
3247
3248 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3249 }
3250
3251 void
3252 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3253 VkBuffer _buffer,
3254 VkDeviceSize offset,
3255 uint32_t drawCount,
3256 uint32_t stride)
3257 {
3258 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3259 TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3260 struct tu_cs *cs = &cmd->draw_cs;
3261
3262 cmd->state.vs_params = (struct tu_draw_state) {};
3263
3264 tu6_draw_common(cmd, cs, true, 0);
3265
3266 /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
3267 * doesn't wait for WFIs to complete and leads to a GPU fault/hang.
3268 * TODO: this could be worked around in a more performant way,
3269 * or there may exist newer firmware where this has been fixed
3270 */
3271 if (cmd->device->physical_device->gpu_id != 650)
3272 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
3273
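/* Same as the non-indexed case, but INDIRECT_OP_INDEXED additionally needs
 * the index buffer address and the maximum index count.
 */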
3274 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
3275 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
3276 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
3277 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
3278 tu_cs_emit(cs, drawCount);
3279 tu_cs_emit_qw(cs, cmd->state.index_va);
3280 tu_cs_emit(cs, cmd->state.max_index_count);
3281 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
3282 tu_cs_emit(cs, stride);
3283
3284 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3285 }
3286
3287 void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3288 uint32_t instanceCount,
3289 uint32_t firstInstance,
3290 VkBuffer _counterBuffer,
3291 VkDeviceSize counterBufferOffset,
3292 uint32_t counterOffset,
3293 uint32_t vertexStride)
3294 {
3295 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3296 TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
3297 struct tu_cs *cs = &cmd->draw_cs;
3298
3299 cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
3300
3301 tu6_draw_common(cmd, cs, false, 0);
3302
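/* Transform-feedback draw: per VK_EXT_transform_feedback the vertex count is
 * derived from the byte count stored in the counter buffer, minus
 * counterOffset, divided by vertexStride.
 */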
3303 tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
3304 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
3305 tu_cs_emit(cs, instanceCount);
3306 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);
3307 tu_cs_emit(cs, counterOffset);
3308 tu_cs_emit(cs, vertexStride);
3309
3310 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3311 }
3312
3313 struct tu_dispatch_info
3314 {
3315 /**
3316 * The dimensions of the dispatch grid, in workgroup (block) units.
3317 */
3318 uint32_t blocks[3];
3319
3320 /**
3321 * A starting offset for the grid. If unaligned is set, the offset
3322 * must still be aligned.
3323 */
3324 uint32_t offsets[3];
3325 /**
3326 * Whether it's an unaligned compute dispatch.
3327 */
3328 bool unaligned;
3329
3330 /**
3331 * Indirect compute parameters resource.
3332 */
3333 struct tu_buffer *indirect;
3334 uint64_t indirect_offset;
3335 };
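/* For example, vkCmdDispatch(cmd, 4, 4, 1) is recorded as
 * { .blocks = {4, 4, 1} }, while vkCmdDispatchIndirect() only fills in
 * indirect and indirect_offset.
 */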
3336
3337 static void
3338 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3339 const struct tu_dispatch_info *info)
3340 {
3341 gl_shader_stage type = MESA_SHADER_COMPUTE;
3342 const struct tu_program_descriptor_linkage *link =
3343 &pipeline->program.link[type];
3344 const struct ir3_const_state *const_state = &link->const_state;
3345 uint32_t offset = const_state->offsets.driver_param;
3346
3347 if (link->constlen <= offset)
3348 return;
3349
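/* For a direct dispatch the workgroup counts and local size are known on the
 * CPU, so upload them as immediate constants; the indirect path would have to
 * source them from the indirect buffer and isn't implemented yet.
 */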
3350 if (!info->indirect) {
3351 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3352 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3353 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3354 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3355 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3356 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3357 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3358 };
3359
3360 uint32_t num_consts = MIN2(const_state->num_driver_params,
3361 (link->constlen - offset) * 4);
3362 /* push constants */
3363 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3364 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3365 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3366 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3367 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3368 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3369 tu_cs_emit(cs, 0); /* CP_LOAD_STATE6_1: ext src addr lo, zero for SS6_DIRECT */
3370 tu_cs_emit(cs, 0); /* CP_LOAD_STATE6_2: ext src addr hi */
3371 uint32_t i;
3372 for (i = 0; i < num_consts; i++)
3373 tu_cs_emit(cs, driver_params[i]);
3374 } else {
3375 tu_finishme("Indirect driver params");
3376 }
3377 }
3378
3379 static void
3380 tu_dispatch(struct tu_cmd_buffer *cmd,
3381 const struct tu_dispatch_info *info)
3382 {
3383 struct tu_cs *cs = &cmd->cs;
3384 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3385 struct tu_descriptor_state *descriptors_state =
3386 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3387
3388 /* TODO: We could probably flush less if we add a compute_flush_bits
3389 * bitfield.
3390 */
3391 tu_emit_cache_flush(cmd, cs);
3392
3393 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3394 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3395
3396 struct tu_cs_entry ib;
3397
3398 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3399 if (ib.size)
3400 tu_cs_emit_ib(cs, &ib);
3401
3402 tu_emit_compute_driver_params(cs, pipeline, info);
3403
3404 if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
3405 pipeline->load_state.state_ib.size > 0) {
3406 tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
3407 }
3408
3409 cmd->state.dirty &=
3410 ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3411
3412 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3413 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3414
3415 const uint32_t *local_size = pipeline->compute.local_size;
3416 const uint32_t *num_groups = info->blocks;
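/* Program the compute grid: the local workgroup size and the global size in
 * threads (local size times the number of groups per dimension).
 */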
3417 tu_cs_emit_regs(cs,
3418 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3419 .localsizex = local_size[0] - 1,
3420 .localsizey = local_size[1] - 1,
3421 .localsizez = local_size[2] - 1),
3422 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3423 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3424 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3425 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3426 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3427 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3428
3429 tu_cs_emit_regs(cs,
3430 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3431 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3432 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3433
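/* Direct dispatches pass the group counts in the packet itself; indirect
 * dispatches point the CP at the VkDispatchIndirectCommand in the buffer.
 */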
3434 if (info->indirect) {
3435 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3436
3437 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3438 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3439
3440 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3441 tu_cs_emit(cs, 0x00000000);
3442 tu_cs_emit_qw(cs, iova);
3443 tu_cs_emit(cs,
3444 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3445 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3446 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3447 } else {
3448 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3449 tu_cs_emit(cs, 0x00000000);
3450 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3451 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3452 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3453 }
3454
3455 tu_cs_emit_wfi(cs);
3456 }
3457
3458 void
3459 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3460 uint32_t base_x,
3461 uint32_t base_y,
3462 uint32_t base_z,
3463 uint32_t x,
3464 uint32_t y,
3465 uint32_t z)
3466 {
3467 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3468 struct tu_dispatch_info info = {};
3469
3470 info.blocks[0] = x;
3471 info.blocks[1] = y;
3472 info.blocks[2] = z;
3473
3474 info.offsets[0] = base_x;
3475 info.offsets[1] = base_y;
3476 info.offsets[2] = base_z;
3477 tu_dispatch(cmd_buffer, &info);
3478 }
3479
3480 void
3481 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3482 uint32_t x,
3483 uint32_t y,
3484 uint32_t z)
3485 {
3486 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3487 }
3488
3489 void
3490 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3491 VkBuffer _buffer,
3492 VkDeviceSize offset)
3493 {
3494 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3495 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3496 struct tu_dispatch_info info = {};
3497
3498 info.indirect = buffer;
3499 info.indirect_offset = offset;
3500
3501 tu_dispatch(cmd_buffer, &info);
3502 }
3503
3504 void
3505 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3506 {
3507 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3508
3509 tu_cs_end(&cmd_buffer->draw_cs);
3510 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3511
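/* Execute the recorded draw_cs either directly against system memory, or
 * once per tile against GMEM, depending on use_sysmem_rendering().
 */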
3512 if (use_sysmem_rendering(cmd_buffer))
3513 tu_cmd_render_sysmem(cmd_buffer);
3514 else
3515 tu_cmd_render_tiles(cmd_buffer);
3516
3517 /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
3518 rendered */
3519 tu_cs_discard_entries(&cmd_buffer->draw_cs);
3520 tu_cs_begin(&cmd_buffer->draw_cs);
3521 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
3522 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
3523
3524 cmd_buffer->state.cache.pending_flush_bits |=
3525 cmd_buffer->state.renderpass_cache.pending_flush_bits;
3526 tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
3527
3528 cmd_buffer->state.pass = NULL;
3529 cmd_buffer->state.subpass = NULL;
3530 cmd_buffer->state.framebuffer = NULL;
3531 }
3532
3533 void
3534 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
3535 const VkSubpassEndInfoKHR *pSubpassEndInfo)
3536 {
3537 tu_CmdEndRenderPass(commandBuffer);
3538 }
3539
3540 struct tu_barrier_info
3541 {
3542 uint32_t eventCount;
3543 const VkEvent *pEvents;
3544 VkPipelineStageFlags srcStageMask;
3545 };
3546
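/* Shared implementation of vkCmdPipelineBarrier and vkCmdWaitEvents: gather
 * the source/destination access masks from all barriers, translate them into
 * cache flush/invalidate bits, and (for WaitEvents) make the CP wait on each
 * event BO.
 */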
3547 static void
3548 tu_barrier(struct tu_cmd_buffer *cmd,
3549 uint32_t memoryBarrierCount,
3550 const VkMemoryBarrier *pMemoryBarriers,
3551 uint32_t bufferMemoryBarrierCount,
3552 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3553 uint32_t imageMemoryBarrierCount,
3554 const VkImageMemoryBarrier *pImageMemoryBarriers,
3555 const struct tu_barrier_info *info)
3556 {
3557 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
3558 VkAccessFlags srcAccessMask = 0;
3559 VkAccessFlags dstAccessMask = 0;
3560
3561 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3562 srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
3563 dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
3564 }
3565
3566 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
3567 srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
3568 dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
3569 }
3570
3571 enum tu_cmd_access_mask src_flags = 0;
3572 enum tu_cmd_access_mask dst_flags = 0;
3573
3574 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
3575 TU_FROM_HANDLE(tu_image, image, pImageMemoryBarriers[i].image);
3576 VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
3577 /* For non-linear images, PREINITIALIZED is the same as UNDEFINED */
3578 if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
3579 (image->tiling != VK_IMAGE_TILING_LINEAR &&
3580 old_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
3581 /* The underlying memory for this image may have been used earlier
3582 * within the same queue submission for a different image, which
3583 * means that there may be old, stale cache entries which are in the
3584 * "wrong" location, which could cause problems later after writing
3585 * to the image. We don't want these entries being flushed later and
3586 * overwriting the actual image, so we need to flush the CCU.
3587 */
3588 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
3589 }
3590 srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
3591 dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
3592 }
3593
3594 /* Inside a renderpass we don't yet know whether we'll end up using sysmem
3595 * or GMEM rendering, so conservatively use the sysmem flushes.
3596 */
3597 bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
3598 !cmd->state.pass;
3599 src_flags |= vk2tu_access(srcAccessMask, gmem);
3600 dst_flags |= vk2tu_access(dstAccessMask, gmem);
3601
3602 struct tu_cache_state *cache =
3603 cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
3604 tu_flush_for_access(cache, src_flags, dst_flags);
3605
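/* For vkCmdWaitEvents, stall the CP until each event BO contains the value
 * written by vkCmdSetEvent (compare against a reference of 1).
 */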
3606 for (uint32_t i = 0; i < info->eventCount; i++) {
3607 TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
3608
3609 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
3610
3611 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
3612 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
3613 CP_WAIT_REG_MEM_0_POLL_MEMORY);
3614 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
3615 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
3616 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
3617 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
3618 }
3619 }
3620
3621 void
3622 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
3623 VkPipelineStageFlags srcStageMask,
3624 VkPipelineStageFlags dstStageMask,
3625 VkDependencyFlags dependencyFlags,
3626 uint32_t memoryBarrierCount,
3627 const VkMemoryBarrier *pMemoryBarriers,
3628 uint32_t bufferMemoryBarrierCount,
3629 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3630 uint32_t imageMemoryBarrierCount,
3631 const VkImageMemoryBarrier *pImageMemoryBarriers)
3632 {
3633 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3634 struct tu_barrier_info info;
3635
3636 info.eventCount = 0;
3637 info.pEvents = NULL;
3638 info.srcStageMask = srcStageMask;
3639
3640 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
3641 bufferMemoryBarrierCount, pBufferMemoryBarriers,
3642 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
3643 }
3644
3645 static void
3646 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
3647 VkPipelineStageFlags stageMask, unsigned value)
3648 {
3649 struct tu_cs *cs = &cmd->cs;
3650
3651 /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
3652 assert(!cmd->state.pass);
3653
3654 tu_emit_cache_flush(cmd, cs);
3655
3656 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
3657
3658 /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
3659 * read by the CP, so the draw indirect stage counts as top-of-pipe too.
3660 */
3661 VkPipelineStageFlags top_of_pipe_flags =
3662 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
3663 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
3664
3665 if (!(stageMask & ~top_of_pipe_flags)) {
3666 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
3667 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
3668 tu_cs_emit(cs, value);
3669 } else {
3670 /* Use a RB_DONE_TS event to wait for everything to complete. */
3671 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
3672 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
3673 tu_cs_emit_qw(cs, event->bo.iova);
3674 tu_cs_emit(cs, value);
3675 }
3676 }
3677
3678 void
3679 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
3680 VkEvent _event,
3681 VkPipelineStageFlags stageMask)
3682 {
3683 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3684 TU_FROM_HANDLE(tu_event, event, _event);
3685
3686 write_event(cmd, event, stageMask, 1);
3687 }
3688
3689 void
3690 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
3691 VkEvent _event,
3692 VkPipelineStageFlags stageMask)
3693 {
3694 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3695 TU_FROM_HANDLE(tu_event, event, _event);
3696
3697 write_event(cmd, event, stageMask, 0);
3698 }
3699
3700 void
3701 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
3702 uint32_t eventCount,
3703 const VkEvent *pEvents,
3704 VkPipelineStageFlags srcStageMask,
3705 VkPipelineStageFlags dstStageMask,
3706 uint32_t memoryBarrierCount,
3707 const VkMemoryBarrier *pMemoryBarriers,
3708 uint32_t bufferMemoryBarrierCount,
3709 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3710 uint32_t imageMemoryBarrierCount,
3711 const VkImageMemoryBarrier *pImageMemoryBarriers)
3712 {
3713 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3714 struct tu_barrier_info info;
3715
3716 info.eventCount = eventCount;
3717 info.pEvents = pEvents;
3718 info.srcStageMask = 0;
3719
3720 tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
3721 bufferMemoryBarrierCount, pBufferMemoryBarriers,
3722 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
3723 }
3724
3725 void
3726 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
3727 {
3728 /* No-op */
3729 }