fb24e17be0d5bce93b9db8c28069a98a73ed329f
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36
37 void
38 tu_bo_list_init(struct tu_bo_list *list)
39 {
40 list->count = list->capacity = 0;
41 list->bo_infos = NULL;
42 }
43
44 void
45 tu_bo_list_destroy(struct tu_bo_list *list)
46 {
47 free(list->bo_infos);
48 }
49
50 void
51 tu_bo_list_reset(struct tu_bo_list *list)
52 {
53 list->count = 0;
54 }
55
56 /**
57 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
58 */
59 static uint32_t
60 tu_bo_list_add_info(struct tu_bo_list *list,
61 const struct drm_msm_gem_submit_bo *bo_info)
62 {
63 assert(bo_info->handle != 0);
64
65 for (uint32_t i = 0; i < list->count; ++i) {
66 if (list->bo_infos[i].handle == bo_info->handle) {
67 assert(list->bo_infos[i].presumed == bo_info->presumed);
68 list->bo_infos[i].flags |= bo_info->flags;
69 return i;
70 }
71 }
72
73 /* grow list->bo_infos if needed */
74 if (list->count == list->capacity) {
75 uint32_t new_capacity = MAX2(2 * list->count, 16);
76 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
77 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
78 if (!new_bo_infos)
79 return TU_BO_LIST_FAILED;
80 list->bo_infos = new_bo_infos;
81 list->capacity = new_capacity;
82 }
83
84 list->bo_infos[list->count] = *bo_info;
85 return list->count++;
86 }
87
88 uint32_t
89 tu_bo_list_add(struct tu_bo_list *list,
90 const struct tu_bo *bo,
91 uint32_t flags)
92 {
93 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
94 .flags = flags,
95 .handle = bo->gem_handle,
96 .presumed = bo->iova,
97 });
98 }
99
100 VkResult
101 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
102 {
103 for (uint32_t i = 0; i < other->count; i++) {
104 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
105 return VK_ERROR_OUT_OF_HOST_MEMORY;
106 }
107
108 return VK_SUCCESS;
109 }
110
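/* Map a tile coordinate (tx, ty) to its VSC pipe and the slot within that
 * pipe. Tiles are grouped into pipes of pipe0.width x pipe0.height tiles;
 * pipes at the right/bottom edge may cover fewer tiles. As an illustrative
 * example (numbers chosen here, not taken from the hardware): with
 * pipe0 = 2x2 tiles and a 5x3 tile grid (so a 3x2 pipe grid), tile (4, 1)
 * has px = 2, py = 0, sx = 0, sy = 1, and the last pipe in that row is only
 * one tile wide, so pipe = 2 and slot = 1.
 */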
111 static void
112 tu_tiling_config_get_tile(const struct tu_framebuffer *fb,
113 uint32_t tx,
114 uint32_t ty,
115 uint32_t *pipe,
116 uint32_t *slot)
117 {
118 /* find the pipe and the slot for tile (tx, ty) */
119 const uint32_t px = tx / fb->pipe0.width;
120 const uint32_t py = ty / fb->pipe0.height;
121 const uint32_t sx = tx - fb->pipe0.width * px;
122 const uint32_t sy = ty - fb->pipe0.height * py;
123 /* last pipe has different width */
124 const uint32_t pipe_width =
125 MIN2(fb->pipe0.width,
126 fb->tile_count.width - px * fb->pipe0.width);
127
128 assert(tx < fb->tile_count.width && ty < fb->tile_count.height);
129 assert(px < fb->pipe_count.width && py < fb->pipe_count.height);
130 assert(sx < fb->pipe0.width && sy < fb->pipe0.height);
131
132 /* convert to 1D indices */
133 *pipe = fb->pipe_count.width * py + px;
134 *slot = pipe_width * sy + sx;
135 }
136
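/* Emit a CP_EVENT_WRITE for the given event. Timestamp-style events (the
 * *_TS cases below) also write a seqno to the command buffer's scratch BO,
 * so they take a 4-dword packet instead of a single dword.
 */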
137 void
138 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
139 struct tu_cs *cs,
140 enum vgt_event_type event)
141 {
142 bool need_seqno = false;
143 switch (event) {
144 case CACHE_FLUSH_TS:
145 case WT_DONE_TS:
146 case RB_DONE_TS:
147 case PC_CCU_FLUSH_DEPTH_TS:
148 case PC_CCU_FLUSH_COLOR_TS:
149 case PC_CCU_RESOLVE_TS:
150 need_seqno = true;
151 break;
152 default:
153 break;
154 }
155
156 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
157 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
158 if (need_seqno) {
159 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
160 tu_cs_emit(cs, 0);
161 }
162 }
163
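/* Translate a set of TU_CMD_FLAG_* bits into the corresponding cache events
 * and, if requested, a WFI. Note the ordering below: CCU flushes are emitted
 * before CCU invalidates.
 */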
164 static void
165 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
166 struct tu_cs *cs,
167 enum tu_cmd_flush_bits flushes)
168 {
169 /* Experiments show that invalidating CCU while it still has data in it
170 * doesn't work, so make sure to always flush before invalidating in case
171 * any data remains that hasn't yet been made available through a barrier.
172 * However it does seem to work for UCHE.
173 */
174 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
175 TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
176 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
177 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
178 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
179 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
180 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
181 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
182 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
183 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
184 if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
185 tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
186 if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
187 tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
188 if (flushes & TU_CMD_FLAG_WFI)
189 tu_cs_emit_wfi(cs);
190 }
191
192 /* "Normal" cache flushes, that don't require any special handling */
193
194 static void
195 tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
196 struct tu_cs *cs)
197 {
198 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
199 cmd_buffer->state.cache.flush_bits = 0;
200 }
201
202 /* Renderpass cache flushes */
203
204 void
205 tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
206 struct tu_cs *cs)
207 {
208 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
209 cmd_buffer->state.renderpass_cache.flush_bits = 0;
210 }
211
212 /* Cache flushes for things that use the color/depth read/write path (i.e.
213 * blits and draws). This deals with changing CCU state as well as the usual
214 * cache flushing.
215 */
216
217 void
218 tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
219 struct tu_cs *cs,
220 enum tu_cmd_ccu_state ccu_state)
221 {
222 enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;
223
224 assert(ccu_state != TU_CMD_CCU_UNKNOWN);
225
226 /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
227 * the CCU may also contain data that we haven't flushed out yet, so we
228 * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
229 * emit a WFI as it isn't pipelined.
230 */
231 if (ccu_state != cmd_buffer->state.ccu_state) {
232 if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
233 flushes |=
234 TU_CMD_FLAG_CCU_FLUSH_COLOR |
235 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
236 cmd_buffer->state.cache.pending_flush_bits &= ~(
237 TU_CMD_FLAG_CCU_FLUSH_COLOR |
238 TU_CMD_FLAG_CCU_FLUSH_DEPTH);
239 }
240 flushes |=
241 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
242 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
243 TU_CMD_FLAG_WFI;
244 cmd_buffer->state.cache.pending_flush_bits &= ~(
245 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
246 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
247 }
248
249 tu6_emit_flushes(cmd_buffer, cs, flushes);
250 cmd_buffer->state.cache.flush_bits = 0;
251
252 if (ccu_state != cmd_buffer->state.ccu_state) {
253 struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
254 tu_cs_emit_regs(cs,
255 A6XX_RB_CCU_CNTL(.offset =
256 ccu_state == TU_CMD_CCU_GMEM ?
257 phys_dev->ccu_offset_gmem :
258 phys_dev->ccu_offset_bypass,
259 .gmem = ccu_state == TU_CMD_CCU_GMEM));
260 cmd_buffer->state.ccu_state = ccu_state;
261 }
262 }
263
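/* Emit the depth/stencil buffer state for a subpass: RB/GRAS depth buffer
 * registers, the UBWC flag buffer reference, and LRZ (disabled here by
 * writing zero bases). VK_FORMAT_S8_UINT goes through the separate-stencil
 * path via RB_STENCIL_INFO.
 */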
264 static void
265 tu6_emit_zs(struct tu_cmd_buffer *cmd,
266 const struct tu_subpass *subpass,
267 struct tu_cs *cs)
268 {
269 const struct tu_framebuffer *fb = cmd->state.framebuffer;
270
271 const uint32_t a = subpass->depth_stencil_attachment.attachment;
272 if (a == VK_ATTACHMENT_UNUSED) {
273 tu_cs_emit_regs(cs,
274 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
275 A6XX_RB_DEPTH_BUFFER_PITCH(0),
276 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
277 A6XX_RB_DEPTH_BUFFER_BASE(0),
278 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
279
280 tu_cs_emit_regs(cs,
281 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
282
283 tu_cs_emit_regs(cs,
284 A6XX_GRAS_LRZ_BUFFER_BASE(0),
285 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
286 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
287
288 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
289
290 return;
291 }
292
293 const struct tu_image_view *iview = fb->attachments[a].attachment;
294 const struct tu_render_pass_attachment *attachment =
295 &cmd->state.pass->attachments[a];
296 enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
297
298 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
299 tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
300 tu_cs_image_ref(cs, iview, 0);
301 tu_cs_emit(cs, attachment->gmem_offset);
302
303 tu_cs_emit_regs(cs,
304 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
305
306 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3);
307 tu_cs_image_flag_ref(cs, iview, 0);
308
309 tu_cs_emit_regs(cs,
310 A6XX_GRAS_LRZ_BUFFER_BASE(0),
311 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
312 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
313
314 if (attachment->format == VK_FORMAT_S8_UINT) {
315 tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
316 tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
317 tu_cs_image_ref(cs, iview, 0);
318 tu_cs_emit(cs, attachment->gmem_offset);
319 } else {
320 tu_cs_emit_regs(cs,
321 A6XX_RB_STENCIL_INFO(0));
322 }
323 }
324
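/* Emit the color attachment (MRT) state for a subpass: per-MRT buffer info,
 * GMEM offsets, UBWC flag buffers, sRGB control and the maximum layer index.
 */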
325 static void
326 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
327 const struct tu_subpass *subpass,
328 struct tu_cs *cs)
329 {
330 const struct tu_framebuffer *fb = cmd->state.framebuffer;
331
332 for (uint32_t i = 0; i < subpass->color_count; ++i) {
333 uint32_t a = subpass->color_attachments[i].attachment;
334 if (a == VK_ATTACHMENT_UNUSED)
335 continue;
336
337 const struct tu_image_view *iview = fb->attachments[a].attachment;
338
339 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
340 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
341 tu_cs_image_ref(cs, iview, 0);
342 tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
343
344 tu_cs_emit_regs(cs,
345 A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));
346
347 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR_LO(i), 3);
348 tu_cs_image_flag_ref(cs, iview, 0);
349 }
350
351 tu_cs_emit_regs(cs,
352 A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
353 tu_cs_emit_regs(cs,
354 A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
355
356 tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(fb->layers - 1));
357 }
358
359 void
360 tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
361 {
362 const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
363 bool msaa_disable = samples == MSAA_ONE;
364
365 tu_cs_emit_regs(cs,
366 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
367 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
368 .msaa_disable = msaa_disable));
369
370 tu_cs_emit_regs(cs,
371 A6XX_GRAS_RAS_MSAA_CNTL(samples),
372 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
373 .msaa_disable = msaa_disable));
374
375 tu_cs_emit_regs(cs,
376 A6XX_RB_RAS_MSAA_CNTL(samples),
377 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
378 .msaa_disable = msaa_disable));
379
380 tu_cs_emit_regs(cs,
381 A6XX_RB_MSAA_CNTL(samples));
382 }
383
384 static void
385 tu6_emit_bin_size(struct tu_cs *cs,
386 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
387 {
388 tu_cs_emit_regs(cs,
389 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
390 .binh = bin_h,
391 .dword = flags));
392
393 tu_cs_emit_regs(cs,
394 A6XX_RB_BIN_CONTROL(.binw = bin_w,
395 .binh = bin_h,
396 .dword = flags));
397
398 /* no flag for RB_BIN_CONTROL2... */
399 tu_cs_emit_regs(cs,
400 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
401 .binh = bin_h));
402 }
403
404 static void
405 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
406 const struct tu_subpass *subpass,
407 struct tu_cs *cs,
408 bool binning)
409 {
410 const struct tu_framebuffer *fb = cmd->state.framebuffer;
411 uint32_t cntl = 0;
412 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
413 if (binning) {
414 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
415 } else {
416 uint32_t mrts_ubwc_enable = 0;
417 for (uint32_t i = 0; i < subpass->color_count; ++i) {
418 uint32_t a = subpass->color_attachments[i].attachment;
419 if (a == VK_ATTACHMENT_UNUSED)
420 continue;
421
422 const struct tu_image_view *iview = fb->attachments[a].attachment;
423 if (iview->ubwc_enabled)
424 mrts_ubwc_enable |= 1 << i;
425 }
426
427 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
428
429 const uint32_t a = subpass->depth_stencil_attachment.attachment;
430 if (a != VK_ATTACHMENT_UNUSED) {
431 const struct tu_image_view *iview = fb->attachments[a].attachment;
432 if (iview->ubwc_enabled)
433 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
434 }
435
436 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
437 * in order to set it correctly for the different subpasses. However,
438 * that means the packets we're emitting also happen during binning. So
439 * we need to guard the write on !BINNING at CP execution time.
440 */
441 tu_cs_reserve(cs, 3 + 4);
442 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
443 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
444 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
445 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
446 }
447
448 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
449 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
450 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
451 tu_cs_emit(cs, cntl);
452 }
453
454 static void
455 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
456 {
457 const VkRect2D *render_area = &cmd->state.render_area;
458 uint32_t x1 = render_area->offset.x;
459 uint32_t y1 = render_area->offset.y;
460 uint32_t x2 = x1 + render_area->extent.width - 1;
461 uint32_t y2 = y1 + render_area->extent.height - 1;
462
463 if (align) {
464 x1 = x1 & ~(GMEM_ALIGN_W - 1);
465 y1 = y1 & ~(GMEM_ALIGN_H - 1);
466 x2 = ALIGN_POT(x2 + 1, GMEM_ALIGN_W) - 1;
467 y2 = ALIGN_POT(y2 + 1, GMEM_ALIGN_H) - 1;
468 }
469
470 tu_cs_emit_regs(cs,
471 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
472 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
473 }
474
475 void
476 tu6_emit_window_scissor(struct tu_cs *cs,
477 uint32_t x1,
478 uint32_t y1,
479 uint32_t x2,
480 uint32_t y2)
481 {
482 tu_cs_emit_regs(cs,
483 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
484 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
485
486 tu_cs_emit_regs(cs,
487 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
488 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
489 }
490
491 void
492 tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
493 {
494 tu_cs_emit_regs(cs,
495 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
496
497 tu_cs_emit_regs(cs,
498 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
499
500 tu_cs_emit_regs(cs,
501 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
502
503 tu_cs_emit_regs(cs,
504 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
505 }
506
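/* Emit one CP_SET_DRAW_STATE group header + address. The enable mask below
 * controls in which passes the CP replays the group: binning-only state,
 * GMEM-only or sysmem-only input attachments, or (by default) all three.
 * A zero-sized state is emitted with the DISABLE bit instead.
 */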
507 static void
508 tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
509 {
510 uint32_t enable_mask;
511 switch (id) {
512 case TU_DRAW_STATE_PROGRAM:
513 case TU_DRAW_STATE_VI:
514 case TU_DRAW_STATE_FS_CONST:
515 /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
516 * when resources would actually be used in the binning shader.
517 * Presumably the overhead of prefetching the resources isn't
518 * worth it.
519 */
520 case TU_DRAW_STATE_DESC_SETS_LOAD:
521 enable_mask = CP_SET_DRAW_STATE__0_GMEM |
522 CP_SET_DRAW_STATE__0_SYSMEM;
523 break;
524 case TU_DRAW_STATE_PROGRAM_BINNING:
525 case TU_DRAW_STATE_VI_BINNING:
526 enable_mask = CP_SET_DRAW_STATE__0_BINNING;
527 break;
528 case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
529 enable_mask = CP_SET_DRAW_STATE__0_GMEM;
530 break;
531 case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
532 enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
533 break;
534 default:
535 enable_mask = CP_SET_DRAW_STATE__0_GMEM |
536 CP_SET_DRAW_STATE__0_SYSMEM |
537 CP_SET_DRAW_STATE__0_BINNING;
538 break;
539 }
540
541 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
542 enable_mask |
543 CP_SET_DRAW_STATE__0_GROUP_ID(id) |
544 COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
545 tu_cs_emit_qw(cs, state.iova);
546 }
547
548 /* note: get rid of this eventually */
549 static void
550 tu_cs_emit_sds_ib(struct tu_cs *cs, uint32_t id, struct tu_cs_entry entry)
551 {
552 tu_cs_emit_draw_state(cs, id, (struct tu_draw_state) {
553 .iova = entry.size ? entry.bo->iova + entry.offset : 0,
554 .size = entry.size / 4,
555 });
556 }
557
558 static bool
559 use_hw_binning(struct tu_cmd_buffer *cmd)
560 {
561 const struct tu_framebuffer *fb = cmd->state.framebuffer;
562
563 /* XFB commands are emitted for BINNING || SYSMEM, which makes XFB
564 * incompatible with non-hw binning GMEM rendering. This is required
565 * because some of the XFB commands need to be executed only once.
566 */
567 if (cmd->state.xfb_used)
568 return true;
569
570 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
571 return false;
572
573 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
574 return true;
575
576 return (fb->tile_count.width * fb->tile_count.height) > 2;
577 }
578
579 static bool
580 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
581 {
582 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
583 return true;
584
585 /* can't fit attachments into gmem */
586 if (!cmd->state.pass->gmem_pixels)
587 return true;
588
589 if (cmd->state.framebuffer->layers > 1)
590 return true;
591
592 if (cmd->has_tess)
593 return true;
594
595 return false;
596 }
597
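/* Set up rendering of a single tile: window scissor/offset for (tx, ty) and,
 * when HW binning is used, a CP_SET_BIN_DATA5 pointing at this pipe's portion
 * of the visibility stream so the CP can skip draws that aren't visible in
 * the bin. Without binning, the visibility override is left enabled.
 */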
598 static void
599 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
600 struct tu_cs *cs,
601 uint32_t tx, uint32_t ty)
602 {
603 const struct tu_framebuffer *fb = cmd->state.framebuffer;
604 uint32_t pipe, slot;
605
606 tu_tiling_config_get_tile(fb, tx, ty, &pipe, &slot);
607
608 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
609 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
610
611 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
612 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
613
614 const uint32_t x1 = fb->tile0.width * tx;
615 const uint32_t y1 = fb->tile0.height * ty;
616 const uint32_t x2 = x1 + fb->tile0.width - 1;
617 const uint32_t y2 = y1 + fb->tile0.height - 1;
618 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
619 tu6_emit_window_offset(cs, x1, y1);
620
621 tu_cs_emit_regs(cs,
622 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
623
624 if (use_hw_binning(cmd)) {
625 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
626
627 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
628 tu_cs_emit(cs, 0x0);
629
630 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
631 tu_cs_emit(cs, fb->pipe_sizes[pipe] |
632 CP_SET_BIN_DATA5_0_VSC_N(slot));
633 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * cmd->vsc_draw_strm_pitch);
634 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * 4 + 32 * cmd->vsc_draw_strm_pitch);
635 tu_cs_emit_qw(cs, cmd->vsc_prim_strm.iova + pipe * cmd->vsc_prim_strm_pitch);
636
637 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
638 tu_cs_emit(cs, 0x0);
639
640 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
641 tu_cs_emit(cs, 0x0);
642 } else {
643 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
644 tu_cs_emit(cs, 0x1);
645
646 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
647 tu_cs_emit(cs, 0x0);
648 }
649 }
650
651 static void
652 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
653 struct tu_cs *cs,
654 uint32_t a,
655 uint32_t gmem_a)
656 {
657 const struct tu_framebuffer *fb = cmd->state.framebuffer;
658 struct tu_image_view *dst = fb->attachments[a].attachment;
659 struct tu_image_view *src = fb->attachments[gmem_a].attachment;
660
661 tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.render_area);
662 }
663
664 static void
665 tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
666 struct tu_cs *cs,
667 const struct tu_subpass *subpass)
668 {
669 if (subpass->resolve_attachments) {
670 /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
671 * Commands":
672 *
673 * End-of-subpass multisample resolves are treated as color
674 * attachment writes for the purposes of synchronization. That is,
675 * they are considered to execute in the
676 * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
677 * their writes are synchronized with
678 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
679 * rendering within a subpass and any resolve operations at the end
680 * of the subpass occurs automatically, without need for explicit
681 * dependencies or pipeline barriers. However, if the resolve
682 * attachment is also used in a different subpass, an explicit
683 * dependency is needed.
684 *
685 * We use the CP_BLIT path for sysmem resolves, which is really a
686 * transfer command, so we have to manually flush similar to the gmem
687 * resolve case. However, a flush afterwards isn't needed because of the
688 * last sentence and the fact that we're in sysmem mode.
689 */
690 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
691 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
692
693 /* Wait for the flushes to land before using the 2D engine */
694 tu_cs_emit_wfi(cs);
695
696 for (unsigned i = 0; i < subpass->color_count; i++) {
697 uint32_t a = subpass->resolve_attachments[i].attachment;
698 if (a == VK_ATTACHMENT_UNUSED)
699 continue;
700
701 tu6_emit_sysmem_resolve(cmd, cs, a,
702 subpass->color_attachments[i].attachment);
703 }
704 }
705 }
706
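/* Emit the per-tile store (resolve) commands: switch to RM6_RESOLVE mode and
 * store each GMEM attachment, plus any end-of-pass resolve attachments, back
 * to system memory. This is recorded once into tile_store_ib (see
 * tu_cmd_prepare_tile_store_ib) and replayed for every tile.
 */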
707 static void
708 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
709 {
710 const struct tu_render_pass *pass = cmd->state.pass;
711 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
712
713 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
714 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
715 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
716 CP_SET_DRAW_STATE__0_GROUP_ID(0));
717 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
718 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
719
720 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
721 tu_cs_emit(cs, 0x0);
722
723 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
724 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
725
726 tu6_emit_blit_scissor(cmd, cs, true);
727
728 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
729 if (pass->attachments[a].gmem_offset >= 0)
730 tu_store_gmem_attachment(cmd, cs, a, a);
731 }
732
733 if (subpass->resolve_attachments) {
734 for (unsigned i = 0; i < subpass->color_count; i++) {
735 uint32_t a = subpass->resolve_attachments[i].attachment;
736 if (a != VK_ATTACHMENT_UNUSED)
737 tu_store_gmem_attachment(cmd, cs, a,
738 subpass->color_attachments[i].attachment);
739 }
740 }
741 }
742
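/* One-time static register setup emitted at the start of a primary command
 * buffer (see tu_BeginCommandBuffer). Most of the UNKNOWN registers below are
 * written to values that presumably match the blob driver.
 */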
743 static void
744 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
745 {
746 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
747
748 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
749
750 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
751
752 tu_cs_emit_regs(cs,
753 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
754 cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
755 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
756 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
757 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
758 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
759 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
760 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
761 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
762 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
763
764 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
765 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
766 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
767 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
768 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
769 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
770 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
771 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
772 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
773 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
774 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
775 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
776 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
777 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
778
779 /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
780 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
781 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
782 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
783
784 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
785
786 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
787
788 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
789 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
790 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
791 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
792 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
793 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
794 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
795 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
796 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
797 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
798 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
799
800 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
801 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
802
803 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236,
804 A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
805 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
806
807 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
808 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
809
810 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
811 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
812 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
813
814 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
815 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
816
817 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
818
819 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
820
821 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
822 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
823 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
824 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
825 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
826 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
827 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
828 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
829 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
830 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
831
832 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
833
834 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
835
836 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
837
838 /* we don't use this yet.. probably best to disable.. */
839 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
840 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
841 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
842 CP_SET_DRAW_STATE__0_GROUP_ID(0));
843 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
844 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
845
846 tu_cs_emit_regs(cs,
847 A6XX_SP_HS_CTRL_REG0(0));
848
849 tu_cs_emit_regs(cs,
850 A6XX_SP_GS_CTRL_REG0(0));
851
852 tu_cs_emit_regs(cs,
853 A6XX_GRAS_LRZ_CNTL(0));
854
855 tu_cs_emit_regs(cs,
856 A6XX_RB_LRZ_CNTL(0));
857
858 tu_cs_emit_regs(cs,
859 A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
860 tu_cs_emit_regs(cs,
861 A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
862
863 tu_cs_sanity_check(cs);
864 }
865
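/* Program the visibility stream compressor (VSC): bin size and count, the
 * per-pipe configuration array, and the draw/primitive stream buffers with
 * their pitches. The LIMIT registers are set to pitch - 64, which is what
 * emit_vsc_overflow_test compares against.
 */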
866 static void
867 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
868 {
869 const struct tu_framebuffer *fb = cmd->state.framebuffer;
870
871 tu_cs_emit_regs(cs,
872 A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
873 .height = fb->tile0.height),
874 A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = &cmd->vsc_draw_strm,
875 .bo_offset = 32 * cmd->vsc_draw_strm_pitch));
876
877 tu_cs_emit_regs(cs,
878 A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
879 .ny = fb->tile_count.height));
880
881 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
882 tu_cs_emit_array(cs, fb->pipe_config, 32);
883
884 tu_cs_emit_regs(cs,
885 A6XX_VSC_PRIM_STRM_ADDRESS(.bo = &cmd->vsc_prim_strm),
886 A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
887 A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - 64));
888
889 tu_cs_emit_regs(cs,
890 A6XX_VSC_DRAW_STRM_ADDRESS(.bo = &cmd->vsc_draw_strm),
891 A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
892 A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - 64));
893 }
894
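/* Check whether any VSC pipe overflowed its draw or primitive stream. For
 * each used pipe, a CP_COND_WRITE5 with WRITE_GE writes a non-zero marker
 * (encoding which stream and its pitch) into the vsc_scratch dword of the
 * scratch BO when the stream size reached pitch - 64, presumably so the
 * driver can detect the overflow after the fact (see the "resize on
 * overflow" TODO in tu_create_cmd_buffer).
 */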
895 static void
896 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
897 {
898 const struct tu_framebuffer *fb = cmd->state.framebuffer;
899 const uint32_t used_pipe_count =
900 fb->pipe_count.width * fb->pipe_count.height;
901
902 /* Clear vsc_scratch: */
903 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
904 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
905 tu_cs_emit(cs, 0x0);
906
907 /* Check for overflow, write vsc_scratch if detected: */
908 for (int i = 0; i < used_pipe_count; i++) {
909 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
910 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
911 CP_COND_WRITE5_0_WRITE_MEMORY);
912 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
913 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
914 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - 64));
915 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
916 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
917 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_draw_strm_pitch));
918
919 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
920 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
921 CP_COND_WRITE5_0_WRITE_MEMORY);
922 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
923 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
924 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - 64));
925 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
926 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
927 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_prim_strm_pitch));
928 }
929
930 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
931 }
932
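/* Run the binning pass: set a full-framebuffer window, switch to RM6_BINNING
 * mode and replay draw_cs once so the VSC fills the visibility streams that
 * later let the CP skip non-visible draws in each tile.
 */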
933 static void
934 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
935 {
936 struct tu_physical_device *phys_dev = cmd->device->physical_device;
937 const struct tu_framebuffer *fb = cmd->state.framebuffer;
938
939 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
940
941 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
942 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
943
944 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
945 tu_cs_emit(cs, 0x1);
946
947 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
948 tu_cs_emit(cs, 0x1);
949
950 tu_cs_emit_wfi(cs);
951
952 tu_cs_emit_regs(cs,
953 A6XX_VFD_MODE_CNTL(.binning_pass = true));
954
955 update_vsc_pipe(cmd, cs);
956
957 tu_cs_emit_regs(cs,
958 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
959
960 tu_cs_emit_regs(cs,
961 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
962
963 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
964 tu_cs_emit(cs, UNK_2C);
965
966 tu_cs_emit_regs(cs,
967 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
968
969 tu_cs_emit_regs(cs,
970 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
971
972 /* emit IB to binning drawcmds: */
973 tu_cs_emit_call(cs, &cmd->draw_cs);
974
975 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
976 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
977 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
978 CP_SET_DRAW_STATE__0_GROUP_ID(0));
979 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
980 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
981
982 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
983 tu_cs_emit(cs, UNK_2D);
984
985 /* This flush is probably required because the VSC, which produces the
986 * visibility stream, is a client of UCHE, whereas the CP needs to read the
987 * visibility stream (without caching) to do draw skipping. The
988 * WFI+WAIT_FOR_ME combination guarantees that the binning commands
989 * submitted are finished before reading the VSC regs (in
990 * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
991 * part of draws).
992 */
993 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
994
995 tu_cs_emit_wfi(cs);
996
997 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
998
999 emit_vsc_overflow_test(cmd, cs);
1000
1001 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1002 tu_cs_emit(cs, 0x0);
1003
1004 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1005 tu_cs_emit(cs, 0x0);
1006 }
1007
1008 static void
1009 tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
1010 const struct tu_subpass *subpass,
1011 struct tu_cs_entry *ib,
1012 bool gmem)
1013 {
1014 /* note: we could probably emit input attachments just once for the whole
1015 * renderpass; this would avoid emitting both sysmem/gmem versions.
1016 *
1017 * Emit two texture descriptors for each input attachment, as a workaround
1018 * for d24s8, which can be sampled as both float (depth) and integer
1019 * (stencil). tu_shader lowers uint input attachment loads to use the 2nd
1020 * descriptor in the pair.
1021 * TODO: a smarter workaround
1022 */
1023
1024 if (!subpass->input_count)
1025 return;
1026
1027 struct tu_cs_memory texture;
1028 VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
1029 A6XX_TEX_CONST_DWORDS, &texture);
1030 assert(result == VK_SUCCESS);
1031
1032 for (unsigned i = 0; i < subpass->input_count * 2; i++) {
1033 uint32_t a = subpass->input_attachments[i / 2].attachment;
1034 if (a == VK_ATTACHMENT_UNUSED)
1035 continue;
1036
1037 struct tu_image_view *iview =
1038 cmd->state.framebuffer->attachments[a].attachment;
1039 const struct tu_render_pass_attachment *att =
1040 &cmd->state.pass->attachments[a];
1041 uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
1042
1043 memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);
1044
1045 if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
1046 /* note this works because spec says fb and input attachments
1047 * must use identity swizzle
1048 */
1049 dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1050 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1051 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1052 dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_S8Z24_UINT) |
1053 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
1054 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
1055 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
1056 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
1057 }
1058
1059 if (!gmem)
1060 continue;
1061
1062 /* patched for gmem */
1063 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1064 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1065 dst[2] =
1066 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1067 A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * att->cpp);
1068 dst[3] = 0;
1069 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
1070 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
1071 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1072 dst[i] = 0;
1073 }
1074
1075 struct tu_cs cs;
1076 tu_cs_begin_sub_stream(&cmd->sub_cs, 9, &cs);
1077
1078 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
1079 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1080 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1081 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1082 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1083 CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
1084 tu_cs_emit_qw(&cs, texture.iova);
1085
1086 tu_cs_emit_pkt4(&cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
1087 tu_cs_emit_qw(&cs, texture.iova);
1088
1089 tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
1090
1091 *ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
1092 }
1093
1094 static void
1095 tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
1096 {
1097 struct tu_cs *cs = &cmd->draw_cs;
1098
1099 tu_emit_input_attachments(cmd, subpass, &cmd->state.ia_gmem_ib, true);
1100 tu_emit_input_attachments(cmd, subpass, &cmd->state.ia_sysmem_ib, false);
1101
1102 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
1103 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, cmd->state.ia_gmem_ib);
1104 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, cmd->state.ia_sysmem_ib);
1105 }
1106
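/* Emit the render pass begin work into draw_cs: GMEM loads and clears are
 * wrapped in a RENDER_MODE_GMEM conditional and sysmem clears in a
 * RENDER_MODE_SYSMEM conditional, so the same draw_cs can be replayed
 * whichever rendering mode is chosen later (see use_sysmem_rendering).
 */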
1107 static void
1108 tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
1109 const VkRenderPassBeginInfo *info)
1110 {
1111 struct tu_cs *cs = &cmd->draw_cs;
1112
1113 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1114
1115 tu6_emit_blit_scissor(cmd, cs, true);
1116
1117 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1118 tu_load_gmem_attachment(cmd, cs, i, false);
1119
1120 tu6_emit_blit_scissor(cmd, cs, false);
1121
1122 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1123 tu_clear_gmem_attachment(cmd, cs, i, info);
1124
1125 tu_cond_exec_end(cs);
1126
1127 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1128
1129 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1130 tu_clear_sysmem_attachment(cmd, cs, i, info);
1131
1132 tu_cond_exec_end(cs);
1133 }
1134
1135 static void
1136 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1137 {
1138 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1139
1140 assert(fb->width > 0 && fb->height > 0);
1141 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
1142 tu6_emit_window_offset(cs, 0, 0);
1143
1144 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1145
1146 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1147
1148 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1149 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1150
1151 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1152 tu_cs_emit(cs, 0x0);
1153
1154 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1155
1156 /* enable stream-out; with sysmem there is only one pass: */
1157 tu_cs_emit_regs(cs,
1158 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1159
1160 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1161 tu_cs_emit(cs, 0x1);
1162
1163 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1164 tu_cs_emit(cs, 0x0);
1165
1166 tu_cs_sanity_check(cs);
1167 }
1168
1169 static void
1170 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1171 {
1172 /* Do any resolves of the last subpass. These are handled in the
1173 * tile_store_ib in the gmem path.
1174 */
1175 tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
1176
1177 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1178
1179 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1180 tu_cs_emit(cs, 0x0);
1181
1182 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1183
1184 tu_cs_sanity_check(cs);
1185 }
1186
1187 static void
1188 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1189 {
1190 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1191
1192 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1193
1194 /* lrz clear? */
1195
1196 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1197 tu_cs_emit(cs, 0x0);
1198
1199 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
1200
1201 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1202 if (use_hw_binning(cmd)) {
1203 /* enable stream-out during binning pass: */
1204 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1205
1206 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
1207 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1208
1209 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1210
1211 tu6_emit_binning_pass(cmd, cs);
1212
1213 /* and disable stream-out for draw pass: */
1214 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
1215
1216 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
1217 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1218
1219 tu_cs_emit_regs(cs,
1220 A6XX_VFD_MODE_CNTL(0));
1221
1222 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1223
1224 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1225
1226 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1227 tu_cs_emit(cs, 0x1);
1228 } else {
1229 /* no binning pass, so enable stream-out for draw pass: */
1230 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1231
1232 tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 0x6000000);
1233 }
1234
1235 tu_cs_sanity_check(cs);
1236 }
1237
1238 static void
1239 tu6_render_tile(struct tu_cmd_buffer *cmd,
1240 struct tu_cs *cs,
1241 uint32_t tx, uint32_t ty)
1242 {
1243 tu6_emit_tile_select(cmd, cs, tx, ty);
1244
1245 tu_cs_emit_call(cs, &cmd->draw_cs);
1246
1247 if (use_hw_binning(cmd)) {
1248 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1249 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1250 }
1251
1252 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1253
1254 tu_cs_sanity_check(cs);
1255 }
1256
1257 static void
1258 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1259 {
1260 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1261
1262 tu_cs_emit_regs(cs,
1263 A6XX_GRAS_LRZ_CNTL(0));
1264
1265 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1266
1267 tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
1268
1269 tu_cs_sanity_check(cs);
1270 }
1271
1272 static void
1273 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1274 {
1275 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1276
1277 if (use_hw_binning(cmd))
1278 cmd->use_vsc_data = true;
1279
1280 tu6_tile_render_begin(cmd, &cmd->cs);
1281
1282 for (uint32_t y = 0; y < fb->tile_count.height; y++) {
1283 for (uint32_t x = 0; x < fb->tile_count.width; x++)
1284 tu6_render_tile(cmd, &cmd->cs, x, y);
1285 }
1286
1287 tu6_tile_render_end(cmd, &cmd->cs);
1288 }
1289
1290 static void
1291 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1292 {
1293 tu6_sysmem_render_begin(cmd, &cmd->cs);
1294
1295 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1296
1297 tu6_sysmem_render_end(cmd, &cmd->cs);
1298 }
1299
1300 static void
1301 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1302 {
1303 const uint32_t tile_store_space = 11 + (35 * 2) * cmd->state.pass->attachment_count;
1304 struct tu_cs sub_cs;
1305
1306 VkResult result =
1307 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1308 if (result != VK_SUCCESS) {
1309 cmd->record_result = result;
1310 return;
1311 }
1312
1313 /* emit to tile-store sub_cs */
1314 tu6_emit_tile_store(cmd, &sub_cs);
1315
1316 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1317 }
1318
1319 static VkResult
1320 tu_create_cmd_buffer(struct tu_device *device,
1321 struct tu_cmd_pool *pool,
1322 VkCommandBufferLevel level,
1323 VkCommandBuffer *pCommandBuffer)
1324 {
1325 struct tu_cmd_buffer *cmd_buffer;
1326 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1327 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1328 if (cmd_buffer == NULL)
1329 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1330
1331 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1332 cmd_buffer->device = device;
1333 cmd_buffer->pool = pool;
1334 cmd_buffer->level = level;
1335
1336 if (pool) {
1337 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1338 cmd_buffer->queue_family_index = pool->queue_family_index;
1339
1340 } else {
1341 /* Init the pool_link so we can safely call list_del when we destroy
1342 * the command buffer
1343 */
1344 list_inithead(&cmd_buffer->pool_link);
1345 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1346 }
1347
1348 tu_bo_list_init(&cmd_buffer->bo_list);
1349 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1350 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1351 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1352 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1353
1354 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1355
1356 list_inithead(&cmd_buffer->upload.list);
1357
1358 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1359 if (result != VK_SUCCESS)
1360 goto fail_scratch_bo;
1361
1362 /* TODO: resize on overflow */
1363 cmd_buffer->vsc_draw_strm_pitch = device->vsc_draw_strm_pitch;
1364 cmd_buffer->vsc_prim_strm_pitch = device->vsc_prim_strm_pitch;
1365 cmd_buffer->vsc_draw_strm = device->vsc_draw_strm;
1366 cmd_buffer->vsc_prim_strm = device->vsc_prim_strm;
1367
1368 return VK_SUCCESS;
1369
1370 fail_scratch_bo:
1371 list_del(&cmd_buffer->pool_link);
1372 return result;
1373 }
1374
1375 static void
1376 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1377 {
1378 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1379
1380 list_del(&cmd_buffer->pool_link);
1381
1382 tu_cs_finish(&cmd_buffer->cs);
1383 tu_cs_finish(&cmd_buffer->draw_cs);
1384 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1385 tu_cs_finish(&cmd_buffer->sub_cs);
1386
1387 tu_bo_list_destroy(&cmd_buffer->bo_list);
1388 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1389 }
1390
1391 static VkResult
1392 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1393 {
1394 cmd_buffer->record_result = VK_SUCCESS;
1395
1396 tu_bo_list_reset(&cmd_buffer->bo_list);
1397 tu_cs_reset(&cmd_buffer->cs);
1398 tu_cs_reset(&cmd_buffer->draw_cs);
1399 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1400 tu_cs_reset(&cmd_buffer->sub_cs);
1401
1402 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
1403 memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
1404
1405 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1406
1407 return cmd_buffer->record_result;
1408 }
1409
1410 VkResult
1411 tu_AllocateCommandBuffers(VkDevice _device,
1412 const VkCommandBufferAllocateInfo *pAllocateInfo,
1413 VkCommandBuffer *pCommandBuffers)
1414 {
1415 TU_FROM_HANDLE(tu_device, device, _device);
1416 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1417
1418 VkResult result = VK_SUCCESS;
1419 uint32_t i;
1420
1421 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1422
1423 if (!list_is_empty(&pool->free_cmd_buffers)) {
1424 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1425 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1426
1427 list_del(&cmd_buffer->pool_link);
1428 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1429
1430 result = tu_reset_cmd_buffer(cmd_buffer);
1431 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1432 cmd_buffer->level = pAllocateInfo->level;
1433
1434 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1435 } else {
1436 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1437 &pCommandBuffers[i]);
1438 }
1439 if (result != VK_SUCCESS)
1440 break;
1441 }
1442
1443 if (result != VK_SUCCESS) {
1444 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
1445 pCommandBuffers);
1446
1447 /* From the Vulkan 1.0.66 spec:
1448 *
1449 * "vkAllocateCommandBuffers can be used to create multiple
1450 * command buffers. If the creation of any of those command
1451 * buffers fails, the implementation must destroy all
1452 * successfully created command buffer objects from this
1453 * command, set all entries of the pCommandBuffers array to
1454 * NULL and return the error."
1455 */
1456 memset(pCommandBuffers, 0,
1457 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
1458 }
1459
1460 return result;
1461 }
1462
1463 void
1464 tu_FreeCommandBuffers(VkDevice device,
1465 VkCommandPool commandPool,
1466 uint32_t commandBufferCount,
1467 const VkCommandBuffer *pCommandBuffers)
1468 {
1469 for (uint32_t i = 0; i < commandBufferCount; i++) {
1470 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1471
1472 if (cmd_buffer) {
1473 if (cmd_buffer->pool) {
1474 list_del(&cmd_buffer->pool_link);
1475 list_addtail(&cmd_buffer->pool_link,
1476 &cmd_buffer->pool->free_cmd_buffers);
1477 } else
1478 tu_cmd_buffer_destroy(cmd_buffer);
1479 }
1480 }
1481 }
1482
1483 VkResult
1484 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1485 VkCommandBufferResetFlags flags)
1486 {
1487 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1488 return tu_reset_cmd_buffer(cmd_buffer);
1489 }
1490
1491 /* Initialize the cache, assuming all necessary flushes have happened but *not*
1492 * invalidations.
1493 */
1494 static void
1495 tu_cache_init(struct tu_cache_state *cache)
1496 {
1497 cache->flush_bits = 0;
1498 cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
1499 }
1500
1501 VkResult
1502 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1503 const VkCommandBufferBeginInfo *pBeginInfo)
1504 {
1505 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1506 VkResult result = VK_SUCCESS;
1507
1508 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1509 /* If the command buffer has already been reset with
1510 * vkResetCommandBuffer, no need to do it again.
1511 */
1512 result = tu_reset_cmd_buffer(cmd_buffer);
1513 if (result != VK_SUCCESS)
1514 return result;
1515 }
1516
1517 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
1518 cmd_buffer->state.index_size = 0xff; /* dirty restart index */
1519
1520 tu_cache_init(&cmd_buffer->state.cache);
1521 tu_cache_init(&cmd_buffer->state.renderpass_cache);
1522 cmd_buffer->usage_flags = pBeginInfo->flags;
1523
1524 tu_cs_begin(&cmd_buffer->cs);
1525 tu_cs_begin(&cmd_buffer->draw_cs);
1526 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1527
1528 /* set up the initial configuration of the command buffer */
1529 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1530 switch (cmd_buffer->queue_family_index) {
1531 case TU_QUEUE_GENERAL:
1532 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1533 break;
1534 default:
1535 break;
1536 }
1537 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1538 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1539 assert(pBeginInfo->pInheritanceInfo);
1540 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
1541 cmd_buffer->state.subpass =
1542 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1543 } else {
1544 /* When executing in the middle of another command buffer, the CCU
1545 * state is unknown.
1546 */
1547 cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
1548 }
1549 }
1550
1551 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1552
1553 return VK_SUCCESS;
1554 }
1555
1556 /* Sets vertex buffers to HW binding points. We emit VBs in SDS (so that bin
1557 * rendering can skip over unused state), so we need to collect all the
1558 * bindings together into a single state emit at draw time.
1559 */
1560 void
1561 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
1562 uint32_t firstBinding,
1563 uint32_t bindingCount,
1564 const VkBuffer *pBuffers,
1565 const VkDeviceSize *pOffsets)
1566 {
1567 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1568
1569 assert(firstBinding + bindingCount <= MAX_VBS);
1570
1571 for (uint32_t i = 0; i < bindingCount; i++) {
1572 struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
1573
1574 cmd->state.vb.buffers[firstBinding + i] = buf;
1575 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
1576
1577 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1578 }
1579
1580 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
1581 }
1582
1583 void
1584 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1585 VkBuffer buffer,
1586 VkDeviceSize offset,
1587 VkIndexType indexType)
1588 {
1589 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1590 TU_FROM_HANDLE(tu_buffer, buf, buffer);
1591
1592
1594 uint32_t index_size, index_shift, restart_index;
1595
1596 switch (indexType) {
1597 case VK_INDEX_TYPE_UINT16:
1598 index_size = INDEX4_SIZE_16_BIT;
1599 index_shift = 1;
1600 restart_index = 0xffff;
1601 break;
1602 case VK_INDEX_TYPE_UINT32:
1603 index_size = INDEX4_SIZE_32_BIT;
1604 index_shift = 2;
1605 restart_index = 0xffffffff;
1606 break;
1607 case VK_INDEX_TYPE_UINT8_EXT:
1608 index_size = INDEX4_SIZE_8_BIT;
1609 index_shift = 0;
1610 restart_index = 0xff;
1611 break;
1612 default:
1613 unreachable("invalid VkIndexType");
1614 }
1615
1616 /* initialize/update the restart index */
1617 if (cmd->state.index_size != index_size)
1618 tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
1619
1620 assert(buf->size >= offset);
1621
1622 cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
1623 cmd->state.max_index_count = (buf->size - offset) >> index_shift;
1624 cmd->state.index_size = index_size;
1625
1626 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1627 }
1628
1629 void
1630 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
1631 VkPipelineBindPoint pipelineBindPoint,
1632 VkPipelineLayout _layout,
1633 uint32_t firstSet,
1634 uint32_t descriptorSetCount,
1635 const VkDescriptorSet *pDescriptorSets,
1636 uint32_t dynamicOffsetCount,
1637 const uint32_t *pDynamicOffsets)
1638 {
1639 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1640 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
1641 unsigned dyn_idx = 0;
1642
1643 struct tu_descriptor_state *descriptors_state =
1644 tu_get_descriptors_state(cmd, pipelineBindPoint);
1645
1646 for (unsigned i = 0; i < descriptorSetCount; ++i) {
1647 unsigned idx = i + firstSet;
1648 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
1649
1650 descriptors_state->sets[idx] = set;
1651
1652 for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
1653 /* update the contents of the dynamic descriptor set */
1654 unsigned src_idx = j;
1655 unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
1656 assert(dyn_idx < dynamicOffsetCount);
1657
1658 uint32_t *dst =
1659 &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
1660 uint32_t *src =
1661 &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
1662 uint32_t offset = pDynamicOffsets[dyn_idx];
1663
1664 /* Patch the storage/uniform descriptors right away. */
1665 if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
1666 /* Note: we can assume here that the addition won't roll over and
1667 * change the SIZE field.
1668 */
1669 uint64_t va = src[0] | ((uint64_t)src[1] << 32);
1670 va += offset;
1671 dst[0] = va;
1672 dst[1] = va >> 32;
1673 } else {
1674 memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
1675 /* Note: A6XX_IBO_5_DEPTH is always 0 */
1676 uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
1677 va += offset;
1678 dst[4] = va;
1679 dst[5] = va >> 32;
1680 }
1681 }
1682
1683 for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
1684 if (set->buffers[j]) {
1685 tu_bo_list_add(&cmd->bo_list, set->buffers[j],
1686 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1687 }
1688 }
1689
1690 if (set->size > 0) {
1691 tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
1692 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1693 }
1694 }
1695 assert(dyn_idx == dynamicOffsetCount);
1696
1697 uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_update_value;
1698 uint64_t addr[MAX_SETS + 1] = {};
1699 struct tu_cs cs;
1700
1701 for (uint32_t i = 0; i < MAX_SETS; i++) {
1702 struct tu_descriptor_set *set = descriptors_state->sets[i];
1703 if (set)
1704 addr[i] = set->va | 3;
1705 }
1706
1707 if (layout->dynamic_offset_count) {
1708 /* allocate and fill out dynamic descriptor set */
1709 struct tu_cs_memory dynamic_desc_set;
1710 VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
1711 A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
1712 assert(result == VK_SUCCESS);
1713
1714 memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
1715 layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
1716 addr[MAX_SETS] = dynamic_desc_set.iova | 3;
1717 }
1718
1719 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1720 sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
1721 hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
1722 hlsq_update_value = 0x7c000;
1723
1724 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
1725 } else {
1726 assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
1727
1728 sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
1729 hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
1730 hlsq_update_value = 0x3e00;
1731
1732 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
1733 }
1734
1735 tu_cs_begin_sub_stream(&cmd->sub_cs, 24, &cs);
1736
1737 tu_cs_emit_pkt4(&cs, sp_bindless_base_reg, 10);
1738 tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
1739 tu_cs_emit_pkt4(&cs, hlsq_bindless_base_reg, 10);
1740 tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
1741 tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(.dword = hlsq_update_value));
1742
1743 struct tu_cs_entry ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
1744 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1745 tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
1746 tu_cs_emit_sds_ib(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, ib);
1747 cmd->state.desc_sets_ib = ib;
1748 } else {
1749 /* note: for compute we could emit directly instead of going through a
1750 * CP_INDIRECT; however, the blob uses draw states for compute
1751 */
1752 tu_cs_emit_ib(&cmd->cs, &ib);
1753 }
1754 }
1755
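/* Program VPC_SO_BUFFER_BASE/SIZE for the bound transform feedback buffers.
 * BUFFER_BASE is 32-byte aligned, so the low bits of the address are saved
 * in streamout_offset and applied later via VPC_SO_BUFFER_OFFSET in
 * CmdBegin/EndTransformFeedbackEXT.
 */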
1756 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
1757 uint32_t firstBinding,
1758 uint32_t bindingCount,
1759 const VkBuffer *pBuffers,
1760 const VkDeviceSize *pOffsets,
1761 const VkDeviceSize *pSizes)
1762 {
1763 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1764 struct tu_cs *cs = &cmd->draw_cs;
1765
1766 /* using COND_REG_EXEC for xfb commands matches the blob behavior;
1767 * presumably there isn't any benefit to using a draw state when the
1768 * condition is (SYSMEM | BINNING)
1769 */
1770 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1771 CP_COND_REG_EXEC_0_SYSMEM |
1772 CP_COND_REG_EXEC_0_BINNING);
1773
1774 for (uint32_t i = 0; i < bindingCount; i++) {
1775 TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
1776 uint64_t iova = buf->bo->iova + pOffsets[i];
1777 uint32_t size = buf->bo->size - pOffsets[i];
1778 uint32_t idx = i + firstBinding;
1779
1780 if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
1781 size = pSizes[i];
1782
1783 /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
1784 uint32_t offset = iova & 0x1f;
1785 iova &= ~(uint64_t) 0x1f;
1786
1787 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
1788 tu_cs_emit_qw(cs, iova);
1789 tu_cs_emit(cs, size + offset);
1790
1791 cmd->state.streamout_offset[idx] = offset;
1792
1793 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
1794 }
1795
1796 tu_cond_exec_end(cs);
1797 }
1798
1799 void
1800 tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1801 uint32_t firstCounterBuffer,
1802 uint32_t counterBufferCount,
1803 const VkBuffer *pCounterBuffers,
1804 const VkDeviceSize *pCounterBufferOffsets)
1805 {
1806 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1807 struct tu_cs *cs = &cmd->draw_cs;
1808
1809 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1810 CP_COND_REG_EXEC_0_SYSMEM |
1811 CP_COND_REG_EXEC_0_BINNING);
1812
1813 /* TODO: only update offset for active buffers */
1814 for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
1815 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
1816
1817 for (uint32_t i = 0; i < counterBufferCount; i++) {
1818 uint32_t idx = firstCounterBuffer + i;
1819 uint32_t offset = cmd->state.streamout_offset[idx];
1820
1821 if (!pCounterBuffers[i])
1822 continue;
1823
1824 TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1825
1826 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1827
1828 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1829 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1830 CP_MEM_TO_REG_0_UNK31 |
1831 CP_MEM_TO_REG_0_CNT(1));
1832 tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
1833
1834 if (offset) {
1835 tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1836 tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1837 CP_REG_RMW_0_SRC1_ADD);
1838 tu_cs_emit_qw(cs, 0xffffffff);
1839 tu_cs_emit_qw(cs, offset);
1840 }
1841 }
1842
1843 tu_cond_exec_end(cs);
1844 }
1845
1846 void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1847 uint32_t firstCounterBuffer,
1848 uint32_t counterBufferCount,
1849 const VkBuffer *pCounterBuffers,
1850 const VkDeviceSize *pCounterBufferOffsets)
1851 {
1852 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1853 struct tu_cs *cs = &cmd->draw_cs;
1854
1855 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1856 CP_COND_REG_EXEC_0_SYSMEM |
1857 CP_COND_REG_EXEC_0_BINNING);
1858
1859 /* TODO: only flush buffers that need to be flushed */
1860 for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1861 /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
1862 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
1863 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[i]));
1864 tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
1865 }
1866
1867 for (uint32_t i = 0; i < counterBufferCount; i++) {
1868 uint32_t idx = firstCounterBuffer + i;
1869 uint32_t offset = cmd->state.streamout_offset[idx];
1870
1871 if (!pCounterBuffers[i])
1872 continue;
1873
1874 TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1875
1876 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_WRITE);
1877
1878 /* VPC_SO_FLUSH_BASE points at a dword counter, but the counter buffer should be in bytes */
1879 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1880 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1881 CP_MEM_TO_REG_0_SHIFT_BY_2 |
1882 0x40000 | /* ??? */
1883 CP_MEM_TO_REG_0_UNK31 |
1884 CP_MEM_TO_REG_0_CNT(1));
1885 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[idx]));
1886
1887 if (offset) {
1888 tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1889 tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1890 CP_REG_RMW_0_SRC1_ADD);
1891 tu_cs_emit_qw(cs, 0xffffffff);
1892 tu_cs_emit_qw(cs, -offset);
1893 }
1894
1895 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1896 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1897 CP_REG_TO_MEM_0_CNT(1));
1898 tu_cs_emit_qw(cs, buf->bo->iova + pCounterBufferOffsets[i]);
1899 }
1900
1901 tu_cond_exec_end(cs);
1902
1903 cmd->state.xfb_used = true;
1904 }
1905
1906 void
1907 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
1908 VkPipelineLayout layout,
1909 VkShaderStageFlags stageFlags,
1910 uint32_t offset,
1911 uint32_t size,
1912 const void *pValues)
1913 {
1914 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1915 memcpy((void*) cmd->push_constants + offset, pValues, size);
1916 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
1917 }
1918
1919 /* Flush everything which has been made available but we haven't actually
1920 * flushed yet.
1921 */
1922 static void
1923 tu_flush_all_pending(struct tu_cache_state *cache)
1924 {
1925 cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
1926 cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
1927 }
1928
1929 VkResult
1930 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
1931 {
1932 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1933
1934 /* We currently flush CCU at the end of the command buffer, like
1935 * what the blob does. There's implicit synchronization around every
1936 * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
1937 * know yet if this command buffer will be the last in the submit so we
1938 * have to defensively flush everything else.
1939 *
1940 * TODO: We could definitely do better than this, since these flushes
1941 * aren't required by Vulkan, but we'd need kernel support to do that.
1942 * Ideally, we'd like the kernel to flush everything afterwards, so that we
1943 * wouldn't have to do any flushes here, and when submitting multiple
1944 * command buffers there wouldn't be any unnecessary flushes in between.
1945 */
1946 if (cmd_buffer->state.pass) {
1947 tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
1948 tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
1949 } else {
1950 tu_flush_all_pending(&cmd_buffer->state.cache);
1951 cmd_buffer->state.cache.flush_bits |=
1952 TU_CMD_FLAG_CCU_FLUSH_COLOR |
1953 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
1954 tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
1955 }
1956
1957 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
1958 MSM_SUBMIT_BO_WRITE);
1959
1960 if (cmd_buffer->use_vsc_data) {
1961 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_draw_strm,
1962 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1963 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_prim_strm,
1964 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1965 }
1966
1967 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->border_color,
1968 MSM_SUBMIT_BO_READ);
1969
1970 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
1971 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
1972 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1973 }
1974
1975 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
1976 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
1977 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1978 }
1979
1980 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
1981 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
1982 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1983 }
1984
1985 tu_cs_end(&cmd_buffer->cs);
1986 tu_cs_end(&cmd_buffer->draw_cs);
1987 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
1988
1989 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
1990
1991 return cmd_buffer->record_result;
1992 }
1993
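/* Allocate sub_cs space for a dynamic state group of `size` dwords, record
 * its iova/size in cmd->state.dynamic_state[id] and emit a CP_SET_DRAW_STATE
 * packet referencing it. The caller fills the returned CS with the actual
 * register writes.
 */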
1994 static struct tu_cs
1995 tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
1996 {
1997 struct tu_cs_memory memory;
1998 struct tu_cs cs;
1999
2000 /* TODO: share this logic with tu_pipeline_static_state */
2001 tu_cs_alloc(&cmd->sub_cs, size, 1, &memory);
2002 tu_cs_init_external(&cs, memory.map, memory.map + size);
2003 tu_cs_begin(&cs);
2004 tu_cs_reserve_space(&cs, size);
2005
2006 assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
2007 cmd->state.dynamic_state[id].iova = memory.iova;
2008 cmd->state.dynamic_state[id].size = size;
2009
2010 tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2011 tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2012
2013 return cs;
2014 }
2015
2016 void
2017 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2018 VkPipelineBindPoint pipelineBindPoint,
2019 VkPipeline _pipeline)
2020 {
2021 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2022 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2023
2024 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2025 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2026 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2027 }
2028
2029 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
2030 cmd->state.compute_pipeline = pipeline;
2031 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2032 return;
2033 }
2034
2035 assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
2036
2037 cmd->state.pipeline = pipeline;
2038 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2039
2040 struct tu_cs *cs = &cmd->draw_cs;
2041 uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
2042 uint32_t i;
2043
2044 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
2045 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib);
2046 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib);
2047 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib);
2048 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib);
2049 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib);
2050 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS, pipeline->ds.state_ib);
2051 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_BLEND, pipeline->blend.state_ib);
2052
2053 for_each_bit(i, mask)
2054 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
2055
2056 /* If the new pipeline requires more VBs than we had previously set up, we
2057 * need to re-emit them in SDS. If it requires the same set or fewer, we
2058 * can just re-use the old SDS.
2059 */
2060 if (pipeline->vi.bindings_used & ~cmd->vertex_bindings_set)
2061 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2062
2063 /* If the pipeline needs a dynamic descriptor, re-emit descriptor sets */
2064 if (pipeline->layout->dynamic_offset_count)
2065 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2066
2067 /* the dynamic line width state depends on the pipeline state's gras_su_cntl,
2068 * so the dynamic state ib must be updated when the pipeline changes
2069 */
2070 if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_LINE_WIDTH)) {
2071 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_LINE_WIDTH, 2);
2072
2073 cmd->state.dynamic_gras_su_cntl &= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2074 cmd->state.dynamic_gras_su_cntl |= pipeline->gras_su_cntl;
2075
2076 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.dynamic_gras_su_cntl));
2077 }
2078 }
2079
2080 void
2081 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2082 uint32_t firstViewport,
2083 uint32_t viewportCount,
2084 const VkViewport *pViewports)
2085 {
2086 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2087 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 18);
2088
2089 assert(firstViewport == 0 && viewportCount == 1);
2090
2091 tu6_emit_viewport(&cs, pViewports);
2092 }
2093
2094 void
2095 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2096 uint32_t firstScissor,
2097 uint32_t scissorCount,
2098 const VkRect2D *pScissors)
2099 {
2100 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2101 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 3);
2102
2103 assert(firstScissor == 0 && scissorCount == 1);
2104
2105 tu6_emit_scissor(&cs, pScissors);
2106 }
2107
2108 void
2109 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2110 {
2111 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2112 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_LINE_WIDTH, 2);
2113
2114 cmd->state.dynamic_gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2115 cmd->state.dynamic_gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
2116
2117 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.dynamic_gras_su_cntl));
2118 }
2119
2120 void
2121 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2122 float depthBiasConstantFactor,
2123 float depthBiasClamp,
2124 float depthBiasSlopeFactor)
2125 {
2126 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2127 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
2128
2129 tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
2130 }
2131
2132 void
2133 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2134 const float blendConstants[4])
2135 {
2136 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2137 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
2138
2139 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2140 tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
2141 }
2142
2143 void
2144 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2145 float minDepthBounds,
2146 float maxDepthBounds)
2147 {
2148 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
2150
2151 tu_cs_emit_regs(&cs,
2152 A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
2153 A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
2154 }
2155
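/* Update the packed front (low byte) / back (high byte) stencil value for
 * the faces selected by `face`.
 */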
2156 static void
2157 update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
2158 {
2159 if (face & VK_STENCIL_FACE_FRONT_BIT)
2160 *value = (*value & 0xff00) | (mask & 0xff);
2161 if (face & VK_STENCIL_FACE_BACK_BIT)
2162 *value = (*value & 0xff) | (mask & 0xff) << 8;
2163 }
2164
2165 void
2166 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2167 VkStencilFaceFlags faceMask,
2168 uint32_t compareMask)
2169 {
2170 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2171 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
2172
2173 update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
2174
2175 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
2176 }
2177
2178 void
2179 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2180 VkStencilFaceFlags faceMask,
2181 uint32_t writeMask)
2182 {
2183 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2184 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
2185
2186 update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
2187
2188 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
2189 }
2190
2191 void
2192 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2193 VkStencilFaceFlags faceMask,
2194 uint32_t reference)
2195 {
2196 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2197 struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
2198
2199 update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
2200
2201 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
2202 }
2203
2204 void
2205 tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2206 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2207 {
2208 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2209 struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
2210
2211 assert(pSampleLocationsInfo);
2212
2213 tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
2214 }
2215
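/* Compute the flush/invalidate bits needed so that writes in src_mask become
 * visible to accesses in dst_mask. Flushes for writes that don't yet have a
 * matching read are deferred in pending_flush_bits until a read that needs
 * them shows up.
 */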
2216 static void
2217 tu_flush_for_access(struct tu_cache_state *cache,
2218 enum tu_cmd_access_mask src_mask,
2219 enum tu_cmd_access_mask dst_mask)
2220 {
2221 enum tu_cmd_flush_bits flush_bits = 0;
2222
2223 if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
2224 cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
2225 }
2226
2227 #define SRC_FLUSH(domain, flush, invalidate) \
2228 if (src_mask & TU_ACCESS_##domain##_WRITE) { \
2229 cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
2230 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2231 }
2232
2233 SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2234 SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2235 SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2236
2237 #undef SRC_FLUSH
2238
2239 #define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
2240 if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
2241 flush_bits |= TU_CMD_FLAG_##flush; \
2242 cache->pending_flush_bits |= \
2243 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2244 }
2245
2246 SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2247 SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2248
2249 #undef SRC_INCOHERENT_FLUSH
2250
2251 if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
2252 flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2253 }
2254
2255 #define DST_FLUSH(domain, flush, invalidate) \
2256 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2257 TU_ACCESS_##domain##_WRITE)) { \
2258 flush_bits |= cache->pending_flush_bits & \
2259 (TU_CMD_FLAG_##invalidate | \
2260 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2261 }
2262
2263 DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2264 DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2265 DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2266
2267 #undef DST_FLUSH
2268
2269 #define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
2270 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2271 TU_ACCESS_##domain##_WRITE)) { \
2272 flush_bits |= TU_CMD_FLAG_##invalidate | \
2273 (cache->pending_flush_bits & \
2274 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2275 }
2276
2277 DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2278 DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2279
2280 #undef DST_INCOHERENT_FLUSH
2281
2282 if (dst_mask & TU_ACCESS_WFI_READ) {
2283 flush_bits |= TU_CMD_FLAG_WFI;
2284 }
2285
2286 cache->flush_bits |= flush_bits;
2287 cache->pending_flush_bits &= ~flush_bits;
2288 }
2289
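/* Translate a Vulkan access mask into the internal cache access mask. The
 * gmem flag indicates that attachment/transfer accesses go through GMEM, in
 * which case the CCU can be ignored and sysmem access is assumed instead
 * (see the comment about GMEM below).
 */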
2290 static enum tu_cmd_access_mask
2291 vk2tu_access(VkAccessFlags flags, bool gmem)
2292 {
2293 enum tu_cmd_access_mask mask = 0;
2294
2295 /* If the GPU writes a buffer that is then read by an indirect draw
2296 * command, we theoretically need a WFI + WAIT_FOR_ME combination to
2297 * wait for the writes to complete. The WAIT_FOR_ME is performed as part
2298 * of the draw by the firmware, so we just need to execute a WFI.
2299 */
2300 if (flags &
2301 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
2302 VK_ACCESS_MEMORY_READ_BIT)) {
2303 mask |= TU_ACCESS_WFI_READ;
2304 }
2305
2306 if (flags &
2307 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
2308 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
2309 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
2310 VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
2311 VK_ACCESS_MEMORY_READ_BIT)) {
2312 mask |= TU_ACCESS_SYSMEM_READ;
2313 }
2314
2315 if (flags &
2316 (VK_ACCESS_HOST_WRITE_BIT |
2317 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
2318 VK_ACCESS_MEMORY_WRITE_BIT)) {
2319 mask |= TU_ACCESS_SYSMEM_WRITE;
2320 }
2321
2322 if (flags &
2323 (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
2324 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
2325 VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
2326 /* TODO: Is there a no-cache bit for textures so that we can ignore
2327 * these?
2328 */
2329 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
2330 VK_ACCESS_SHADER_READ_BIT | /* Read performed by SP/TP */
2331 VK_ACCESS_MEMORY_READ_BIT)) {
2332 mask |= TU_ACCESS_UCHE_READ;
2333 }
2334
2335 if (flags &
2336 (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
2337 VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
2338 VK_ACCESS_MEMORY_WRITE_BIT)) {
2339 mask |= TU_ACCESS_UCHE_WRITE;
2340 }
2341
2342 /* When using GMEM, the CCU is always flushed automatically to GMEM, and
2343 * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
2344 * previous writes in sysmem mode when transitioning to GMEM. Therefore we
2345 * can ignore CCU and pretend that color attachments and transfers use
2346 * sysmem directly.
2347 */
2348
2349 if (flags &
2350 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
2351 VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
2352 VK_ACCESS_MEMORY_READ_BIT)) {
2353 if (gmem)
2354 mask |= TU_ACCESS_SYSMEM_READ;
2355 else
2356 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
2357 }
2358
2359 if (flags &
2360 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
2361 VK_ACCESS_MEMORY_READ_BIT)) {
2362 if (gmem)
2363 mask |= TU_ACCESS_SYSMEM_READ;
2364 else
2365 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
2366 }
2367
2368 if (flags &
2369 (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
2370 VK_ACCESS_MEMORY_WRITE_BIT)) {
2371 if (gmem) {
2372 mask |= TU_ACCESS_SYSMEM_WRITE;
2373 } else {
2374 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2375 }
2376 }
2377
2378 if (flags &
2379 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
2380 VK_ACCESS_MEMORY_WRITE_BIT)) {
2381 if (gmem) {
2382 mask |= TU_ACCESS_SYSMEM_WRITE;
2383 } else {
2384 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2385 }
2386 }
2387
2388 /* When the dst access is a transfer read/write, it seems we sometimes need
2389 * to insert a WFI after any flushes, to guarantee that the flushes finish
2390 * before the 2D engine starts. However the opposite (i.e. a WFI after
2391 * CP_BLIT and before any subsequent flush) does not seem to be needed, and
2392 * the blob doesn't emit such a WFI.
2393 */
2394
2395 if (flags &
2396 (VK_ACCESS_TRANSFER_WRITE_BIT |
2397 VK_ACCESS_MEMORY_WRITE_BIT)) {
2398 if (gmem) {
2399 mask |= TU_ACCESS_SYSMEM_WRITE;
2400 } else {
2401 mask |= TU_ACCESS_CCU_COLOR_WRITE;
2402 }
2403 mask |= TU_ACCESS_WFI_READ;
2404 }
2405
2406 if (flags &
2407 (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
2408 VK_ACCESS_MEMORY_READ_BIT)) {
2409 mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
2410 }
2411
2412 return mask;
2413 }
2414
2415
2416 void
2417 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2418 uint32_t commandBufferCount,
2419 const VkCommandBuffer *pCmdBuffers)
2420 {
2421 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2422 VkResult result;
2423
2424 assert(commandBufferCount > 0);
2425
2426 /* Emit any pending flushes. */
2427 if (cmd->state.pass) {
2428 tu_flush_all_pending(&cmd->state.renderpass_cache);
2429 tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
2430 } else {
2431 tu_flush_all_pending(&cmd->state.cache);
2432 tu_emit_cache_flush(cmd, &cmd->cs);
2433 }
2434
2435 for (uint32_t i = 0; i < commandBufferCount; i++) {
2436 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2437
2438 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2439 if (result != VK_SUCCESS) {
2440 cmd->record_result = result;
2441 break;
2442 }
2443
2444 if (secondary->usage_flags &
2445 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2446 assert(tu_cs_is_empty(&secondary->cs));
2447
2448 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2449 if (result != VK_SUCCESS) {
2450 cmd->record_result = result;
2451 break;
2452 }
2453
2454 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2455 &secondary->draw_epilogue_cs);
2456 if (result != VK_SUCCESS) {
2457 cmd->record_result = result;
2458 break;
2459 }
2460
2461 if (secondary->has_tess)
2462 cmd->has_tess = true;
2463 } else {
2464 assert(tu_cs_is_empty(&secondary->draw_cs));
2465 assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2466
2467 for (uint32_t j = 0; j < secondary->cs.bo_count; j++) {
2468 tu_bo_list_add(&cmd->bo_list, secondary->cs.bos[j],
2469 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2470 }
2471
2472 tu_cs_add_entries(&cmd->cs, &secondary->cs);
2473 }
2474
2475 cmd->state.index_size = secondary->state.index_size; /* for restart index update */
2476 }
2477 cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
2478
2479 /* After executing secondary command buffers, there may have been arbitrary
2480 * flushes executed, so when we encounter a pipeline barrier with a
2481 * srcMask, we have to assume that we need to invalidate. Therefore we need
2482 * to re-initialize the cache with all pending invalidate bits set.
2483 */
2484 if (cmd->state.pass) {
2485 tu_cache_init(&cmd->state.renderpass_cache);
2486 } else {
2487 tu_cache_init(&cmd->state.cache);
2488 }
2489 }
2490
2491 VkResult
2492 tu_CreateCommandPool(VkDevice _device,
2493 const VkCommandPoolCreateInfo *pCreateInfo,
2494 const VkAllocationCallbacks *pAllocator,
2495 VkCommandPool *pCmdPool)
2496 {
2497 TU_FROM_HANDLE(tu_device, device, _device);
2498 struct tu_cmd_pool *pool;
2499
2500 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2501 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2502 if (pool == NULL)
2503 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2504
2505 if (pAllocator)
2506 pool->alloc = *pAllocator;
2507 else
2508 pool->alloc = device->alloc;
2509
2510 list_inithead(&pool->cmd_buffers);
2511 list_inithead(&pool->free_cmd_buffers);
2512
2513 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2514
2515 *pCmdPool = tu_cmd_pool_to_handle(pool);
2516
2517 return VK_SUCCESS;
2518 }
2519
2520 void
2521 tu_DestroyCommandPool(VkDevice _device,
2522 VkCommandPool commandPool,
2523 const VkAllocationCallbacks *pAllocator)
2524 {
2525 TU_FROM_HANDLE(tu_device, device, _device);
2526 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2527
2528 if (!pool)
2529 return;
2530
2531 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2532 &pool->cmd_buffers, pool_link)
2533 {
2534 tu_cmd_buffer_destroy(cmd_buffer);
2535 }
2536
2537 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2538 &pool->free_cmd_buffers, pool_link)
2539 {
2540 tu_cmd_buffer_destroy(cmd_buffer);
2541 }
2542
2543 vk_free2(&device->alloc, pAllocator, pool);
2544 }
2545
2546 VkResult
2547 tu_ResetCommandPool(VkDevice device,
2548 VkCommandPool commandPool,
2549 VkCommandPoolResetFlags flags)
2550 {
2551 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2552 VkResult result;
2553
2554 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2555 pool_link)
2556 {
2557 result = tu_reset_cmd_buffer(cmd_buffer);
2558 if (result != VK_SUCCESS)
2559 return result;
2560 }
2561
2562 return VK_SUCCESS;
2563 }
2564
2565 void
2566 tu_TrimCommandPool(VkDevice device,
2567 VkCommandPool commandPool,
2568 VkCommandPoolTrimFlags flags)
2569 {
2570 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2571
2572 if (!pool)
2573 return;
2574
2575 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2576 &pool->free_cmd_buffers, pool_link)
2577 {
2578 tu_cmd_buffer_destroy(cmd_buffer);
2579 }
2580 }
2581
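/* Apply a subpass dependency: translate its src/dst access masks and merge
 * any incoherent CCU write flags into either the external cache state or the
 * renderpass cache state, depending on where the barrier takes effect.
 */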
2582 static void
2583 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
2584 const struct tu_subpass_barrier *barrier,
2585 bool external)
2586 {
2587 /* Note: we don't know until the end of the subpass whether we'll use
2588 * sysmem, so assume sysmem here to be safe.
2589 */
2590 struct tu_cache_state *cache =
2591 external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
2592 enum tu_cmd_access_mask src_flags =
2593 vk2tu_access(barrier->src_access_mask, false);
2594 enum tu_cmd_access_mask dst_flags =
2595 vk2tu_access(barrier->dst_access_mask, false);
2596
2597 if (barrier->incoherent_ccu_color)
2598 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2599 if (barrier->incoherent_ccu_depth)
2600 src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2601
2602 tu_flush_for_access(cache, src_flags, dst_flags);
2603 }
2604
2605 void
2606 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2607 const VkRenderPassBeginInfo *pRenderPassBegin,
2608 VkSubpassContents contents)
2609 {
2610 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2611 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2612 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2613
2614 cmd->state.pass = pass;
2615 cmd->state.subpass = pass->subpasses;
2616 cmd->state.framebuffer = fb;
2617 cmd->state.render_area = pRenderPassBegin->renderArea;
2618
2619 tu_cmd_prepare_tile_store_ib(cmd);
2620
2621 /* Note: because this is external, any flushes will happen before draw_cs
2622 * gets called. However, deferred flushes may still have to happen later as
2623 * part of the subpass.
2624 */
2625 tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
2626 cmd->state.renderpass_cache.pending_flush_bits =
2627 cmd->state.cache.pending_flush_bits;
2628 cmd->state.renderpass_cache.flush_bits = 0;
2629
2630 tu_emit_renderpass_begin(cmd, pRenderPassBegin);
2631
2632 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2633 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2634 tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);
2635 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2636
2637 tu_set_input_attachments(cmd, cmd->state.subpass);
2638
2639 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2640 const struct tu_image_view *iview = fb->attachments[i].attachment;
2641 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2642 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2643 }
2644
2645 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2646 }
2647
2648 void
2649 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2650 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2651 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2652 {
2653 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2654 pSubpassBeginInfo->contents);
2655 }
2656
2657 void
2658 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2659 {
2660 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2661 const struct tu_render_pass *pass = cmd->state.pass;
2662 struct tu_cs *cs = &cmd->draw_cs;
2663
2664 const struct tu_subpass *subpass = cmd->state.subpass++;
2665
2666 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2667
2668 if (subpass->resolve_attachments) {
2669 tu6_emit_blit_scissor(cmd, cs, true);
2670
2671 for (unsigned i = 0; i < subpass->color_count; i++) {
2672 uint32_t a = subpass->resolve_attachments[i].attachment;
2673 if (a == VK_ATTACHMENT_UNUSED)
2674 continue;
2675
2676 tu_store_gmem_attachment(cmd, cs, a,
2677 subpass->color_attachments[i].attachment);
2678
2679 if (pass->attachments[a].gmem_offset < 0)
2680 continue;
2681
2682 /* TODO:
2683 * check if the resolved attachment is needed by later subpasses;
2684 * if it is, we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
2685 */
2686 tu_finishme("missing GMEM->GMEM resolve path\n");
2687 tu_load_gmem_attachment(cmd, cs, a, true);
2688 }
2689 }
2690
2691 tu_cond_exec_end(cs);
2692
2693 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2694
2695 tu6_emit_sysmem_resolves(cmd, cs, subpass);
2696
2697 tu_cond_exec_end(cs);
2698
2699 /* Handle dependencies for the next subpass */
2700 tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
2701
2702 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2703 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2704 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2705 tu6_emit_msaa(cs, cmd->state.subpass->samples);
2706 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2707
2708 tu_set_input_attachments(cmd, cmd->state.subpass);
2709 }
2710
2711 void
2712 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2713 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2714 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2715 {
2716 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2717 }
2718
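/* Upload the push constants and the enabled UBO ranges for one shader stage
 * using CP_LOAD_STATE6 packets. Push constants are emitted inline
 * (SS6_DIRECT), while UBO ranges are loaded indirectly from the VA read out
 * of the bindless descriptor.
 */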
2719 static void
2720 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2721 struct tu_descriptor_state *descriptors_state,
2722 gl_shader_stage type,
2723 uint32_t *push_constants)
2724 {
2725 const struct tu_program_descriptor_linkage *link =
2726 &pipeline->program.link[type];
2727 const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
2728
2729 if (link->push_consts.count > 0) {
2730 unsigned num_units = link->push_consts.count;
2731 unsigned offset = link->push_consts.lo;
2732 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
2733 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
2734 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2735 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2736 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2737 CP_LOAD_STATE6_0_NUM_UNIT(num_units));
2738 tu_cs_emit(cs, 0);
2739 tu_cs_emit(cs, 0);
2740 for (unsigned i = 0; i < num_units * 4; i++)
2741 tu_cs_emit(cs, push_constants[i + offset * 4]);
2742 }
2743
2744 for (uint32_t i = 0; i < state->num_enabled; i++) {
2745 uint32_t size = state->range[i].end - state->range[i].start;
2746 uint32_t offset = state->range[i].start;
2747
2748 /* and even if the start of the const buffer is before
2749 * first_immediate, the end may not be:
2750 */
2751 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2752
2753 if (size == 0)
2754 continue;
2755
2756 /* things should be aligned to vec4: */
2757 debug_assert((state->range[i].offset % 16) == 0);
2758 debug_assert((size % 16) == 0);
2759 debug_assert((offset % 16) == 0);
2760
2761 /* Dig out the descriptor from the descriptor state and read the VA from
2762 * it.
2763 */
2764 assert(state->range[i].ubo.bindless);
2765 uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
2766 descriptors_state->dynamic_descriptors :
2767 descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
2768 unsigned block = state->range[i].ubo.block;
2769 uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
2770 uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
2771 assert(va);
2772
2773 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2774 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2775 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2776 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2777 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2778 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2779 tu_cs_emit_qw(cs, va + offset);
2780 }
2781 }
2782
2783 static struct tu_cs_entry
2784 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2785 const struct tu_pipeline *pipeline,
2786 struct tu_descriptor_state *descriptors_state,
2787 gl_shader_stage type)
2788 {
2789 struct tu_cs cs;
2790 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2791
2792 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2793
2794 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2795 }
2796
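/* Emit VFD_FETCH base/size for every vertex binding used by the pipeline
 * into a fresh sub-stream, and remember which bindings were set so that a
 * later pipeline needing extra bindings triggers a re-emit.
 */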
2797 static struct tu_cs_entry
2798 tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
2799 const struct tu_pipeline *pipeline)
2800 {
2801 struct tu_cs cs;
2802 tu_cs_begin_sub_stream(&cmd->sub_cs, 4 * MAX_VBS, &cs);
2803
2804 int binding;
2805 for_each_bit(binding, pipeline->vi.bindings_used) {
2806 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
2807 const VkDeviceSize offset = buf->bo_offset +
2808 cmd->state.vb.offsets[binding];
2809
2810 tu_cs_emit_regs(&cs,
2811 A6XX_VFD_FETCH_BASE(binding, .bo = buf->bo, .bo_offset = offset),
2812 A6XX_VFD_FETCH_SIZE(binding, buf->size - offset));
2813
2814 }
2815
2816 cmd->vertex_bindings_set = pipeline->vi.bindings_used;
2817
2818 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2819 }
2820
2821 static uint64_t
2822 get_tess_param_bo_size(const struct tu_pipeline *pipeline,
2823 uint32_t draw_count)
2824 {
2825 /* TODO: For indirect draws, we can't compute the BO size ahead of time.
2826 * Still not sure what to do here, so just allocate a reasonably large
2827 * BO and hope for the best for now.
2828 * (maxTessellationControlPerVertexOutputComponents * 2048 vertices +
2829 * maxTessellationControlPerPatchOutputComponents * 512 patches) */
2830 if (!draw_count) {
2831 return ((128 * 2048) + (128 * 512)) * 4;
2832 }
2833
2834 /* For each patch, adreno lays out the tess param BO in memory as:
2835 * (v_input[0][0])...(v_input[i][j])(p_input[0])...(p_input[k]).
2836 * where i = # vertices per patch, j = # per-vertex outputs, and
2837 * k = # per-patch outputs.*/
2838 uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
2839 uint32_t num_patches = draw_count / verts_per_patch;
2840 return draw_count * pipeline->tess.per_vertex_output_size +
2841 pipeline->tess.per_patch_output_size * num_patches;
2842 }
2843
2844 static uint64_t
2845 get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
2846 uint32_t draw_count)
2847 {
2848 /* TODO: For indirect draws, we can't compute the BO size ahead of time.
2849 * Still not sure what to do here, so just allocate a reasonably large
2850 * BO and hope for the best for now.
2851 * (quad factor stride * 512 patches) */
2852 if (!draw_count) {
2853 return (28 * 512) * 4;
2854 }
2855
2856 /* Each distinct patch gets its own tess factor output. */
2857 uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
2858 uint32_t num_patches = draw_count / verts_per_patch;
2859 uint32_t factor_stride;
2860 switch (pipeline->tess.patch_type) {
2861 case IR3_TESS_ISOLINES:
2862 factor_stride = 12;
2863 break;
2864 case IR3_TESS_TRIANGLES:
2865 factor_stride = 20;
2866 break;
2867 case IR3_TESS_QUADS:
2868 factor_stride = 28;
2869 break;
2870 default:
2871 unreachable("bad tessmode");
2872 }
2873 return factor_stride * num_patches;
2874 }
2875
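/* Allocate a scratch BO sized for the tess factor and param buffers, point
 * the HS/DS consts and PC_TESSFACTOR_ADDR at it, and return the resulting
 * sub-stream as a draw state entry.
 */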
2876 static VkResult
2877 tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
2878 uint32_t draw_count,
2879 const struct tu_pipeline *pipeline,
2880 struct tu_cs_entry *entry)
2881 {
2882 struct tu_cs cs;
2883 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 20, &cs);
2884 if (result != VK_SUCCESS)
2885 return result;
2886
2887 uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
2888 uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
2889 uint64_t tess_bo_size = tess_factor_size + tess_param_size;
2890 if (tess_bo_size > 0) {
2891 struct tu_bo *tess_bo;
2892 result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
2893 if (result != VK_SUCCESS)
2894 return result;
2895
2896 tu_bo_list_add(&cmd->bo_list, tess_bo,
2897 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2898 uint64_t tess_factor_iova = tess_bo->iova;
2899 uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
2900
2901 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2902 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
2903 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2904 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2905 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
2906 CP_LOAD_STATE6_0_NUM_UNIT(1));
2907 tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2908 tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2909 tu_cs_emit_qw(&cs, tess_param_iova);
2910 tu_cs_emit_qw(&cs, tess_factor_iova);
2911
2912 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2913 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
2914 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2915 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2916 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
2917 CP_LOAD_STATE6_0_NUM_UNIT(1));
2918 tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2919 tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2920 tu_cs_emit_qw(&cs, tess_param_iova);
2921 tu_cs_emit_qw(&cs, tess_factor_iova);
2922
2923 tu_cs_emit_pkt4(&cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
2924 tu_cs_emit_qw(&cs, tess_factor_iova);
2925
2926 /* TODO: Without this WFI here, the hardware seems unable to read these
2927 * addresses we just emitted. Freedreno emits these consts as part of
2928 * IB1 instead of in a draw state which might make this WFI unnecessary,
2929 * but it requires a bit more indirection (SS6_INDIRECT for consts). */
2930 tu_cs_emit_wfi(&cs);
2931 }
2932 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2933 return VK_SUCCESS;
2934 }
2935
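/* Per-draw setup shared by all the draw entry points: flush the renderpass
 * cache, rebuild whichever const/descriptor/vertex-buffer IBs are dirty, and
 * (re)emit the CP_SET_DRAW_STATE groups, either all of them for the first
 * draw of a renderpass or only the ones that changed.
 */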
2936 static VkResult
2937 tu6_draw_common(struct tu_cmd_buffer *cmd,
2938 struct tu_cs *cs,
2939 bool indexed,
2940 /* note: draw_count is 0 for indirect */
2941 uint32_t draw_count)
2942 {
2943 const struct tu_pipeline *pipeline = cmd->state.pipeline;
2944 VkResult result;
2945
2946 struct tu_descriptor_state *descriptors_state =
2947 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
2948
2949 tu_emit_cache_flush_renderpass(cmd, cs);
2950
2951 /* TODO lrz */
2952
2953 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
2954 .primitive_restart =
2955 pipeline->ia.primitive_restart && indexed,
2956 .tess_upper_left_domain_origin =
2957 pipeline->tess.upper_left_domain_origin));
2958
2959 if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
2960 cmd->state.shader_const_ib[MESA_SHADER_VERTEX] =
2961 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX);
2962 cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL] =
2963 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_TESS_CTRL);
2964 cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL] =
2965 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_TESS_EVAL);
2966 cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY] =
2967 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_GEOMETRY);
2968 cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT] =
2969 tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
2970 }
2971
2972 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
2973 /* We need to reload the descriptors every time the descriptor sets
2974 * change. However, the commands we send only depend on the pipeline
2975 * because the whole point is to cache descriptors which are used by the
2976 * pipeline. There's a problem here, in that the firmware has an
2977 * "optimization" which skips executing groups that are set to the same
2978 * value as the last draw. This means that if the descriptor sets change
2979 * but not the pipeline, we'd try to re-execute the same buffer which
2980 * the firmware would ignore and we wouldn't pre-load the new
2981 * descriptors. The blob seems to re-emit the LOAD_STATE group whenever
2982 * the descriptor sets change, which we emulate here by copying the
2983 * pre-prepared buffer.
2984 */
2985 const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib;
2986 if (load_entry->size > 0) {
2987 struct tu_cs load_cs;
2988 result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs);
2989 if (result != VK_SUCCESS)
2990 return result;
2991 tu_cs_emit_array(&load_cs,
2992 (uint32_t *)((char *)load_entry->bo->map + load_entry->offset),
2993 load_entry->size / 4);
2994 cmd->state.desc_sets_load_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs);
2995 } else {
2996 cmd->state.desc_sets_load_ib.size = 0;
2997 }
2998 }
2999
3000 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
3001 cmd->state.vertex_buffers_ib = tu6_emit_vertex_buffers(cmd, pipeline);
3002
3003 bool has_tess =
3004 pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
3005 struct tu_cs_entry tess_consts = {};
3006 if (has_tess) {
3007 cmd->has_tess = true;
3008 result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts);
3009 if (result != VK_SUCCESS)
3010 return result;
3011 }
3012
3013 /* for the first draw in a renderpass, re-emit all the draw states
3014 *
3015 * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
3016 * used, then draw states must be re-emitted. note however this only happens
3017 * in the sysmem path, so this can be skipped for the gmem path (TODO)
3018 *
3019 * the two input attachment states are excluded because secondary command
3020 * buffers don't have a state ib to restore them, and not re-emitting them
3021 * is OK since CmdClearAttachments won't disable/overwrite them
3022 */
3023 if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
3024 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3025
3026 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib);
3027 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib);
3028 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
3029 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib);
3030 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib);
3031 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib);
3032 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS, pipeline->ds.state_ib);
3033 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_BLEND, pipeline->blend.state_ib);
3034 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]);
3035 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_HS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL]);
3036 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL]);
3037 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
3038 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
3039 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
3040 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
3041 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
3042 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3043
3044 for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
3045 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
3046 ((pipeline->dynamic_state_mask & BIT(i)) ?
3047 cmd->state.dynamic_state[i] :
3048 pipeline->dynamic_state[i]));
3049 }
3050 } else {
3051
3052 /* emit only the draw states that were just updated
3053 * note: eventually we don't want to have to emit anything here
3054 */
3055 uint32_t draw_state_count =
3056 has_tess +
3057 ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 5 : 0) +
3058 ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) +
3059 ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
3060 1; /* vs_params */
3061
3062 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
3063
3064 /* We may need to re-emit tess consts if the current draw call is
3065 * sufficiently larger than the last draw call. */
3066 if (has_tess)
3067 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
3068 if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3069 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]);
3070 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_HS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_CTRL]);
3071 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DS_CONST, cmd->state.shader_const_ib[MESA_SHADER_TESS_EVAL]);
3072 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
3073 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
3074 }
3075 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS)
3076 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
3077 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
3078 tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
3079 tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3080 }
3081
3082 tu_cs_sanity_check(cs);
3083
3084 /* There are too many graphics dirty bits to list here, so just list the
3085 * bits to preserve instead. The only things not emitted here are
3086 * compute-related state.
3087 */
3088 cmd->state.dirty &= (TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3089 return VK_SUCCESS;
3090 }
3091
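/* Build the first dword of a CP_DRAW_* packet from the bound pipeline:
 * primitive type, index size, source select, and the GS/tessellation
 * enables.
 */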
3092 static uint32_t
3093 tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
3094 {
3095 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3096 uint32_t initiator =
3097 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(pipeline->ia.primtype) |
3098 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
3099 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
3100 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
3101
3102 if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
3103 initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
3104
3105 switch (pipeline->tess.patch_type) {
3106 case IR3_TESS_TRIANGLES:
3107 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
3108 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3109 break;
3110 case IR3_TESS_ISOLINES:
3111 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
3112 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3113 break;
3114 case IR3_TESS_NONE:
3115 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
3116 break;
3117 case IR3_TESS_QUADS:
3118 initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
3119 CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3120 break;
3121 }
3122 return initiator;
3123 }
3124
3125
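/* Return the const file offset (in vec4 units) of the VS driver params, or 0
 * if the shader doesn't use them. 0 doubles as the "disabled" value for
 * CP_DRAW_INDIRECT_MULTI's DST_OFF field.
 */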
3126 static uint32_t
3127 vs_params_offset(struct tu_cmd_buffer *cmd)
3128 {
3129 const struct tu_program_descriptor_linkage *link =
3130 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3131 const struct ir3_const_state *const_state = &link->const_state;
3132
3133 if (const_state->offsets.driver_param >= link->constlen)
3134 return 0;
3135
3136 /* this layout is required by CP_DRAW_INDIRECT_MULTI */
3137 STATIC_ASSERT(IR3_DP_DRAWID == 0);
3138 STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
3139 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3140
3141 /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
3142 assert(const_state->offsets.driver_param != 0);
3143
3144 return const_state->offsets.driver_param;
3145 }
3146
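/* Emit VFD_INDEX_OFFSET/INSTANCE_START and, if the vertex shader reads the
 * driver params, a direct const upload of the base vertex/instance. Returns
 * a draw state pointing at the sub-stream.
 */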
3147 static struct tu_draw_state
3148 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3149 uint32_t vertex_offset,
3150 uint32_t first_instance)
3151 {
3152 uint32_t offset = vs_params_offset(cmd);
3153
3154 struct tu_cs cs;
3155 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
3156 if (result != VK_SUCCESS) {
3157 cmd->record_result = result;
3158 return (struct tu_draw_state) {};
3159 }
3160
3161 /* TODO: don't make a new draw state when it doesn't change */
3162
3163 tu_cs_emit_regs(&cs,
3164 A6XX_VFD_INDEX_OFFSET(vertex_offset),
3165 A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
3166
3167 if (offset) {
3168 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3169 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3170 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3171 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3172 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3173 CP_LOAD_STATE6_0_NUM_UNIT(1));
3174 tu_cs_emit(&cs, 0);
3175 tu_cs_emit(&cs, 0);
3176
3177 tu_cs_emit(&cs, 0);
3178 tu_cs_emit(&cs, vertex_offset);
3179 tu_cs_emit(&cs, first_instance);
3180 tu_cs_emit(&cs, 0);
3181 }
3182
3183 struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3184 return (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
3185 }
3186
3187 void
3188 tu_CmdDraw(VkCommandBuffer commandBuffer,
3189 uint32_t vertexCount,
3190 uint32_t instanceCount,
3191 uint32_t firstVertex,
3192 uint32_t firstInstance)
3193 {
3194 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3195 struct tu_cs *cs = &cmd->draw_cs;
3196
3197 cmd->state.vs_params = tu6_emit_vs_params(cmd, firstVertex, firstInstance);
3198
3199 tu6_draw_common(cmd, cs, false, vertexCount);
3200
3201 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3202 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3203 tu_cs_emit(cs, instanceCount);
3204 tu_cs_emit(cs, vertexCount);
3205 }
3206
3207 void
3208 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3209 uint32_t indexCount,
3210 uint32_t instanceCount,
3211 uint32_t firstIndex,
3212 int32_t vertexOffset,
3213 uint32_t firstInstance)
3214 {
3215 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3216 struct tu_cs *cs = &cmd->draw_cs;
3217
3218 cmd->state.vs_params = tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
3219
3220 tu6_draw_common(cmd, cs, true, indexCount);
3221
3222 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3223 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
3224 tu_cs_emit(cs, instanceCount);
3225 tu_cs_emit(cs, indexCount);
3226 tu_cs_emit(cs, firstIndex);
3227 tu_cs_emit_qw(cs, cmd->state.index_va);
3228 tu_cs_emit(cs, cmd->state.max_index_count);
3229 }
3230
3231 void
3232 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3233 VkBuffer _buffer,
3234 VkDeviceSize offset,
3235 uint32_t drawCount,
3236 uint32_t stride)
3237 {
3238 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3239 TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3240 struct tu_cs *cs = &cmd->draw_cs;
3241
3242 cmd->state.vs_params = (struct tu_draw_state) {};
3243
3244 tu6_draw_common(cmd, cs, false, 0);
3245
3246 /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
3247 * doesn't wait for WFIs to be completed and leads to GPU faults/hangs.
3248 * TODO: this could be worked around in a more performant way,
3249 * or there may exist newer firmware in which this has been fixed
3250 */
3251 if (cmd->device->physical_device->gpu_id != 650)
3252 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
3253
3254 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
3255 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3256 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
3257 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
3258 tu_cs_emit(cs, drawCount);
3259 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
3260 tu_cs_emit(cs, stride);
3261
3262 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3263 }
3264
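/**
 * vkCmdDrawIndexedIndirect: like tu_CmdDrawIndirect, but indexed; the packet
 * also carries the index buffer address and the maximum index count.
 */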
3265 void
3266 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3267 VkBuffer _buffer,
3268 VkDeviceSize offset,
3269 uint32_t drawCount,
3270 uint32_t stride)
3271 {
3272 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3273 TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3274 struct tu_cs *cs = &cmd->draw_cs;
3275
3276 cmd->state.vs_params = (struct tu_draw_state) {};
3277
3278 tu6_draw_common(cmd, cs, true, 0);
3279
3280    /* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI: it doesn't
3281     * wait for pending WFIs to complete, which can lead to a GPU fault/hang.
3282     * TODO: this could be worked around in a more performant way, or newer
3283     * firmware may have this fixed.
3284     */
3285 if (cmd->device->physical_device->gpu_id != 650)
3286 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
3287
3288 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
3289 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
3290 tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
3291 A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
3292 tu_cs_emit(cs, drawCount);
3293 tu_cs_emit_qw(cs, cmd->state.index_va);
3294 tu_cs_emit(cs, cmd->state.max_index_count);
3295 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
3296 tu_cs_emit(cs, stride);
3297
3298 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3299 }
3300
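/**
 * vkCmdDrawIndirectByteCountEXT: transform-feedback draw. CP_DRAW_AUTO
 * derives the vertex count from the byte counter stored in the counter
 * buffer, using counterOffset and vertexStride.
 */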
3301 void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3302 uint32_t instanceCount,
3303 uint32_t firstInstance,
3304 VkBuffer _counterBuffer,
3305 VkDeviceSize counterBufferOffset,
3306 uint32_t counterOffset,
3307 uint32_t vertexStride)
3308 {
3309 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3310 TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
3311 struct tu_cs *cs = &cmd->draw_cs;
3312
3313 cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
3314
3315 tu6_draw_common(cmd, cs, false, 0);
3316
3317 tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
3318 tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
3319 tu_cs_emit(cs, instanceCount);
3320 tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);
3321 tu_cs_emit(cs, counterOffset);
3322 tu_cs_emit(cs, vertexStride);
3323
3324 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3325 }
3326
3327 struct tu_dispatch_info
3328 {
3329 /**
3330    * The dimensions of the dispatch grid, in workgroup (block) units.
3331 */
3332 uint32_t blocks[3];
3333
3334 /**
3335    * A starting offset for the grid (in workgroup units). Even if unaligned
3336    * is set, the offset must still be aligned.
3337 */
3338 uint32_t offsets[3];
3339 /**
3340 * Whether it's an unaligned compute dispatch.
3341 */
3342 bool unaligned;
3343
3344 /**
3345 * Indirect compute parameters resource.
3346 */
3347 struct tu_buffer *indirect;
3348 uint64_t indirect_offset;
3349 };
3350
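/**
 * Upload the compute driver params (workgroup counts and local size) as
 * constants, if the shader's constlen reaches the driver_param offset.
 */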
3351 static void
3352 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3353 const struct tu_dispatch_info *info)
3354 {
3355 gl_shader_stage type = MESA_SHADER_COMPUTE;
3356 const struct tu_program_descriptor_linkage *link =
3357 &pipeline->program.link[type];
3358 const struct ir3_const_state *const_state = &link->const_state;
3359 uint32_t offset = const_state->offsets.driver_param;
3360
3361 if (link->constlen <= offset)
3362 return;
3363
3364 if (!info->indirect) {
3365 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3366 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3367 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3368 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3369 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3370 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3371 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3372 };
3373
3374 uint32_t num_consts = MIN2(const_state->num_driver_params,
3375 (link->constlen - offset) * 4);
3376 /* push constants */
3377 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3378 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3379 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3380 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3381 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3382 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3383 tu_cs_emit(cs, 0);
3384 tu_cs_emit(cs, 0);
3385 uint32_t i;
3386 for (i = 0; i < num_consts; i++)
3387 tu_cs_emit(cs, driver_params[i]);
3388 } else {
3389 tu_finishme("Indirect driver params");
3390 }
3391 }
3392
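/**
 * Emit a compute dispatch into cmd->cs: cache flushes, pipeline and
 * descriptor state, driver params, HLSQ_CS_NDRANGE setup, then either
 * CP_EXEC_CS or CP_EXEC_CS_INDIRECT.
 */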
3393 static void
3394 tu_dispatch(struct tu_cmd_buffer *cmd,
3395 const struct tu_dispatch_info *info)
3396 {
3397 struct tu_cs *cs = &cmd->cs;
3398 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3399 struct tu_descriptor_state *descriptors_state =
3400 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3401
3402 /* TODO: We could probably flush less if we add a compute_flush_bits
3403 * bitfield.
3404 */
3405 tu_emit_cache_flush(cmd, cs);
3406
3407 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3408 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3409
3410 struct tu_cs_entry ib;
3411
3412 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3413 if (ib.size)
3414 tu_cs_emit_ib(cs, &ib);
3415
3416 tu_emit_compute_driver_params(cs, pipeline, info);
3417
3418 if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
3419 pipeline->load_state.state_ib.size > 0) {
3420 tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
3421 }
3422
3423 cmd->state.dirty &=
3424 ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3425
3426 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3427 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3428
3429 const uint32_t *local_size = pipeline->compute.local_size;
3430 const uint32_t *num_groups = info->blocks;
3431 tu_cs_emit_regs(cs,
3432 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3433 .localsizex = local_size[0] - 1,
3434 .localsizey = local_size[1] - 1,
3435 .localsizez = local_size[2] - 1),
3436 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3437 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3438 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3439 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3440 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3441 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3442
3443 tu_cs_emit_regs(cs,
3444 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3445 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3446 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3447
3448 if (info->indirect) {
3449 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3450
3451 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3452 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3453
3454 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3455 tu_cs_emit(cs, 0x00000000);
3456 tu_cs_emit_qw(cs, iova);
3457 tu_cs_emit(cs,
3458 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3459 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3460 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3461 } else {
3462 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3463 tu_cs_emit(cs, 0x00000000);
3464 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3465 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3466 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3467 }
3468
3469 tu_cs_emit_wfi(cs);
3470 }
3471
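/**
 * vkCmdDispatchBase: record a compute dispatch with the given workgroup
 * counts and base workgroup offsets.
 */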
3472 void
3473 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3474 uint32_t base_x,
3475 uint32_t base_y,
3476 uint32_t base_z,
3477 uint32_t x,
3478 uint32_t y,
3479 uint32_t z)
3480 {
3481 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3482 struct tu_dispatch_info info = {};
3483
3484 info.blocks[0] = x;
3485 info.blocks[1] = y;
3486 info.blocks[2] = z;
3487
3488 info.offsets[0] = base_x;
3489 info.offsets[1] = base_y;
3490 info.offsets[2] = base_z;
3491 tu_dispatch(cmd_buffer, &info);
3492 }
3493
3494 void
3495 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3496 uint32_t x,
3497 uint32_t y,
3498 uint32_t z)
3499 {
3500 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3501 }
3502
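/**
 * vkCmdDispatchIndirect: the workgroup counts are read by the CP from the
 * buffer at execution time via CP_EXEC_CS_INDIRECT.
 */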
3503 void
3504 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3505 VkBuffer _buffer,
3506 VkDeviceSize offset)
3507 {
3508 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3509 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3510 struct tu_dispatch_info info = {};
3511
3512 info.indirect = buffer;
3513 info.indirect_offset = offset;
3514
3515 tu_dispatch(cmd_buffer, &info);
3516 }
3517
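/**
 * vkCmdEndRenderPass: finish draw_cs/draw_epilogue_cs, replay them with
 * either sysmem or tiled (GMEM) rendering, then reset the streams and apply
 * the render pass end barrier.
 */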
3518 void
3519 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3520 {
3521 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3522
3523 tu_cs_end(&cmd_buffer->draw_cs);
3524 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3525
3526 if (use_sysmem_rendering(cmd_buffer))
3527 tu_cmd_render_sysmem(cmd_buffer);
3528 else
3529 tu_cmd_render_tiles(cmd_buffer);
3530
3531    /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
3532     * rendered */
3533 tu_cs_discard_entries(&cmd_buffer->draw_cs);
3534 tu_cs_begin(&cmd_buffer->draw_cs);
3535 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
3536 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
3537
3538 cmd_buffer->state.cache.pending_flush_bits |=
3539 cmd_buffer->state.renderpass_cache.pending_flush_bits;
3540 tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
3541
3542 cmd_buffer->state.pass = NULL;
3543 cmd_buffer->state.subpass = NULL;
3544 cmd_buffer->state.framebuffer = NULL;
3545 }
3546
3547 void
3548 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
3549 const VkSubpassEndInfoKHR *pSubpassEndInfo)
3550 {
3551 tu_CmdEndRenderPass(commandBuffer);
3552 }
3553
3554 struct tu_barrier_info
3555 {
3556 uint32_t eventCount;
3557 const VkEvent *pEvents;
3558 VkPipelineStageFlags srcStageMask;
3559 };
3560
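/**
 * Common implementation for vkCmdPipelineBarrier and vkCmdWaitEvents:
 * accumulate src/dst access masks, convert them to cache flush/invalidate
 * flags, and wait on any events with CP_WAIT_REG_MEM.
 */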
3561 static void
3562 tu_barrier(struct tu_cmd_buffer *cmd,
3563 uint32_t memoryBarrierCount,
3564 const VkMemoryBarrier *pMemoryBarriers,
3565 uint32_t bufferMemoryBarrierCount,
3566 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3567 uint32_t imageMemoryBarrierCount,
3568 const VkImageMemoryBarrier *pImageMemoryBarriers,
3569 const struct tu_barrier_info *info)
3570 {
3571 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
3572 VkAccessFlags srcAccessMask = 0;
3573 VkAccessFlags dstAccessMask = 0;
3574
3575 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3576 srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
3577 dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
3578 }
3579
3580 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
3581 srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
3582 dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
3583 }
3584
3585 enum tu_cmd_access_mask src_flags = 0;
3586 enum tu_cmd_access_mask dst_flags = 0;
3587
3588 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
3589 TU_FROM_HANDLE(tu_image, image, pImageMemoryBarriers[i].image);
3590 VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
3591 /* For non-linear images, PREINITIALIZED is the same as UNDEFINED */
3592 if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
3593 (image->tiling != VK_IMAGE_TILING_LINEAR &&
3594 old_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
3595 /* The underlying memory for this image may have been used earlier
3596 * within the same queue submission for a different image, which
3597 * means that there may be old, stale cache entries which are in the
3598 * "wrong" location, which could cause problems later after writing
3599 * to the image. We don't want these entries being flushed later and
3600 * overwriting the actual image, so we need to flush the CCU.
3601 */
3602 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
3603 }
3604 srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
3605 dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
3606 }
3607
3608    /* Inside a render pass we don't yet know whether we'll be using sysmem,
3609     * so we have to use the sysmem flushes.
3610     */
3611 bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
3612 !cmd->state.pass;
3613 src_flags |= vk2tu_access(srcAccessMask, gmem);
3614 dst_flags |= vk2tu_access(dstAccessMask, gmem);
3615
3616 struct tu_cache_state *cache =
3617 cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
3618 tu_flush_for_access(cache, src_flags, dst_flags);
3619
3620 for (uint32_t i = 0; i < info->eventCount; i++) {
3621 TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
3622
3623 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
3624
3625 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
3626 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
3627 CP_WAIT_REG_MEM_0_POLL_MEMORY);
3628 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
3629 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
3630 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
3631 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
3632 }
3633 }
3634
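/* vkCmdPipelineBarrier: a tu_barrier() with no events to wait on. */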
3635 void
3636 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
3637 VkPipelineStageFlags srcStageMask,
3638 VkPipelineStageFlags dstStageMask,
3639 VkDependencyFlags dependencyFlags,
3640 uint32_t memoryBarrierCount,
3641 const VkMemoryBarrier *pMemoryBarriers,
3642 uint32_t bufferMemoryBarrierCount,
3643 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3644 uint32_t imageMemoryBarrierCount,
3645 const VkImageMemoryBarrier *pImageMemoryBarriers)
3646 {
3647 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3648 struct tu_barrier_info info;
3649
3650 info.eventCount = 0;
3651 info.pEvents = NULL;
3652 info.srcStageMask = srcStageMask;
3653
3654 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
3655 bufferMemoryBarrierCount, pBufferMemoryBarriers,
3656 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
3657 }
3658
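/**
 * Write \a value to the event BO from the GPU: a direct CP_MEM_WRITE when
 * only top-of-pipe stages (including draw indirect) are waited on, otherwise
 * an RB_DONE_TS timestamp write that lands once prior work has finished.
 */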
3659 static void
3660 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
3661 VkPipelineStageFlags stageMask, unsigned value)
3662 {
3663 struct tu_cs *cs = &cmd->cs;
3664
3665 /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
3666 assert(!cmd->state.pass);
3667
3668 tu_emit_cache_flush(cmd, cs);
3669
3670 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
3671
3672 /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
3673 * read by the CP, so the draw indirect stage counts as top-of-pipe too.
3674 */
3675 VkPipelineStageFlags top_of_pipe_flags =
3676 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
3677 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
3678
3679 if (!(stageMask & ~top_of_pipe_flags)) {
3680 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
3681 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
3682 tu_cs_emit(cs, value);
3683 } else {
3684       /* Use an RB_DONE_TS event to wait for everything to complete. */
3685 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
3686 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
3687 tu_cs_emit_qw(cs, event->bo.iova);
3688 tu_cs_emit(cs, value);
3689 }
3690 }
3691
3692 void
3693 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
3694 VkEvent _event,
3695 VkPipelineStageFlags stageMask)
3696 {
3697 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3698 TU_FROM_HANDLE(tu_event, event, _event);
3699
3700 write_event(cmd, event, stageMask, 1);
3701 }
3702
3703 void
3704 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
3705 VkEvent _event,
3706 VkPipelineStageFlags stageMask)
3707 {
3708 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3709 TU_FROM_HANDLE(tu_event, event, _event);
3710
3711 write_event(cmd, event, stageMask, 0);
3712 }
3713
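/* vkCmdWaitEvents: a tu_barrier() that also waits on the given events. */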
3714 void
3715 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
3716 uint32_t eventCount,
3717 const VkEvent *pEvents,
3718 VkPipelineStageFlags srcStageMask,
3719 VkPipelineStageFlags dstStageMask,
3720 uint32_t memoryBarrierCount,
3721 const VkMemoryBarrier *pMemoryBarriers,
3722 uint32_t bufferMemoryBarrierCount,
3723 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3724 uint32_t imageMemoryBarrierCount,
3725 const VkImageMemoryBarrier *pImageMemoryBarriers)
3726 {
3727 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3728 struct tu_barrier_info info;
3729
3730 info.eventCount = eventCount;
3731 info.pEvents = pEvents;
3732 info.srcStageMask = 0;
3733
3734 tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
3735 bufferMemoryBarrierCount, pBufferMemoryBarriers,
3736 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
3737 }
3738
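/* vkCmdSetDeviceMask: device groups contain a single device, so there is
 * nothing to do here. */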
3739 void
3740 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
3741 {
3742 /* No-op */
3743 }