turnip: improve dirty bit handling a bit
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36
37 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
38
39 void
40 tu_bo_list_init(struct tu_bo_list *list)
41 {
42 list->count = list->capacity = 0;
43 list->bo_infos = NULL;
44 }
45
46 void
47 tu_bo_list_destroy(struct tu_bo_list *list)
48 {
49 free(list->bo_infos);
50 }
51
52 void
53 tu_bo_list_reset(struct tu_bo_list *list)
54 {
55 list->count = 0;
56 }
57
58 /**
59 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
60 */
61 static uint32_t
62 tu_bo_list_add_info(struct tu_bo_list *list,
63 const struct drm_msm_gem_submit_bo *bo_info)
64 {
65 assert(bo_info->handle != 0);
66
67 for (uint32_t i = 0; i < list->count; ++i) {
68 if (list->bo_infos[i].handle == bo_info->handle) {
69 assert(list->bo_infos[i].presumed == bo_info->presumed);
70 list->bo_infos[i].flags |= bo_info->flags;
71 return i;
72 }
73 }
74
75 /* grow list->bo_infos if needed */
76 if (list->count == list->capacity) {
77 uint32_t new_capacity = MAX2(2 * list->count, 16);
78 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
79 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
80 if (!new_bo_infos)
81 return TU_BO_LIST_FAILED;
82 list->bo_infos = new_bo_infos;
83 list->capacity = new_capacity;
84 }
85
86 list->bo_infos[list->count] = *bo_info;
87 return list->count++;
88 }
89
90 uint32_t
91 tu_bo_list_add(struct tu_bo_list *list,
92 const struct tu_bo *bo,
93 uint32_t flags)
94 {
95 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
96 .flags = flags,
97 .handle = bo->gem_handle,
98 .presumed = bo->iova,
99 });
100 }
101
102 VkResult
103 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
104 {
105 for (uint32_t i = 0; i < other->count; i++) {
106 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
107 return VK_ERROR_OUT_OF_HOST_MEMORY;
108 }
109
110 return VK_SUCCESS;
111 }
112
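/* Compute the tile grid for the render area: start from a single tile
 * covering the whole (unoffset) render area and keep splitting it until the
 * maximum tile width and the available GMEM pixel budget are respected.
 */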
113 static void
114 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
115 const struct tu_device *dev,
116 const struct tu_render_pass *pass)
117 {
118 const uint32_t tile_align_w = pass->tile_align_w;
119 const uint32_t max_tile_width = 1024;
120
121 /* note: don't offset the tiling config by render_area.offset,
122 * because the binning pass can't deal with it.
123 * This means we might end up with more tiles than necessary,
124 * but load/store/etc. are still scissored to the render_area.
125 */
126 tiling->tile0.offset = (VkOffset2D) {};
127
128 const uint32_t ra_width =
129 tiling->render_area.extent.width +
130 (tiling->render_area.offset.x - tiling->tile0.offset.x);
131 const uint32_t ra_height =
132 tiling->render_area.extent.height +
133 (tiling->render_area.offset.y - tiling->tile0.offset.y);
134
135 /* start from 1 tile */
136 tiling->tile_count = (VkExtent2D) {
137 .width = 1,
138 .height = 1,
139 };
140 tiling->tile0.extent = (VkExtent2D) {
141 .width = util_align_npot(ra_width, tile_align_w),
142 .height = align(ra_height, TILE_ALIGN_H),
143 };
144
145 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
146 /* start with 2x2 tiles */
147 tiling->tile_count.width = 2;
148 tiling->tile_count.height = 2;
149 tiling->tile0.extent.width = util_align_npot(DIV_ROUND_UP(ra_width, 2), tile_align_w);
150 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), TILE_ALIGN_H);
151 }
152
153 /* do not exceed max tile width */
154 while (tiling->tile0.extent.width > max_tile_width) {
155 tiling->tile_count.width++;
156 tiling->tile0.extent.width =
157 util_align_npot(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
158 }
159
160 /* this will force sysmem rendering, so don't bother trying to compute a
161 * valid tile config. TODO: just skip all GMEM stuff when sysmem is forced?
162 */
163 if (!pass->gmem_pixels)
164 return;
165
166 /* do not exceed gmem size */
167 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pass->gmem_pixels) {
168 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
169 tiling->tile_count.width++;
170 tiling->tile0.extent.width =
171 util_align_npot(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
172 } else {
173 /* if this assert fails then layout is impossible.. */
174 assert(tiling->tile0.extent.height > TILE_ALIGN_H);
175 tiling->tile_count.height++;
176 tiling->tile0.extent.height =
177 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), TILE_ALIGN_H);
178 }
179 }
180 }
181
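/* Group tiles into VSC pipes: start with one tile per pipe and grow the
 * per-pipe footprint until no more than the maximum pipe count (32 on A6xx)
 * is needed.
 */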
182 static void
183 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
184 const struct tu_device *dev)
185 {
186 const uint32_t max_pipe_count = 32; /* A6xx */
187
188 /* start from 1 tile per pipe */
189 tiling->pipe0 = (VkExtent2D) {
190 .width = 1,
191 .height = 1,
192 };
193 tiling->pipe_count = tiling->tile_count;
194
195 while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
196 if (tiling->pipe0.width < tiling->pipe0.height) {
197 tiling->pipe0.width += 1;
198 tiling->pipe_count.width =
199 DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
200 } else {
201 tiling->pipe0.height += 1;
202 tiling->pipe_count.height =
203 DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
204 }
205 }
206 }
207
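/* Fill in the per-pipe VSC_PIPE_CONFIG values (position and size of each
 * pipe in tile units) and the matching CP_SET_BIN_DATA5 sizes; unused pipe
 * slots are zeroed.
 */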
208 static void
209 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
210 const struct tu_device *dev)
211 {
212 const uint32_t max_pipe_count = 32; /* A6xx */
213 const uint32_t used_pipe_count =
214 tiling->pipe_count.width * tiling->pipe_count.height;
215 const VkExtent2D last_pipe = {
216 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
217 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
218 };
219
220 assert(used_pipe_count <= max_pipe_count);
221 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
222
223 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
224 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
225 const uint32_t pipe_x = tiling->pipe0.width * x;
226 const uint32_t pipe_y = tiling->pipe0.height * y;
227 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
228 ? last_pipe.width
229 : tiling->pipe0.width;
230 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
231 ? last_pipe.height
232 : tiling->pipe0.height;
233 const uint32_t n = tiling->pipe_count.width * y + x;
234
235 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
236 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
237 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
238 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
239 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
240 }
241 }
242
243 memset(tiling->pipe_config + used_pipe_count, 0,
244 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
245 }
246
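/* Look up the pipe index, slot within the pipe, and screen-space extent of
 * tile (tx, ty); the last tile in each direction is clamped to the render
 * area instead of the aligned tile size.
 */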
247 static void
248 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
249 const struct tu_device *dev,
250 uint32_t tx,
251 uint32_t ty,
252 struct tu_tile *tile)
253 {
254 /* find the pipe and the slot for tile (tx, ty) */
255 const uint32_t px = tx / tiling->pipe0.width;
256 const uint32_t py = ty / tiling->pipe0.height;
257 const uint32_t sx = tx - tiling->pipe0.width * px;
258 const uint32_t sy = ty - tiling->pipe0.height * py;
259 /* last pipe has different width */
260 const uint32_t pipe_width =
261 MIN2(tiling->pipe0.width,
262 tiling->tile_count.width - px * tiling->pipe0.width);
263
264 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
265 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
266 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
267
268 /* convert to 1D indices */
269 tile->pipe = tiling->pipe_count.width * py + px;
270 tile->slot = pipe_width * sy + sx;
271
272 /* get the blit area for the tile */
273 tile->begin = (VkOffset2D) {
274 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
275 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
276 };
277 tile->end.x =
278 (tx == tiling->tile_count.width - 1)
279 ? tiling->render_area.offset.x + tiling->render_area.extent.width
280 : tile->begin.x + tiling->tile0.extent.width;
281 tile->end.y =
282 (ty == tiling->tile_count.height - 1)
283 ? tiling->render_area.offset.y + tiling->render_area.extent.height
284 : tile->begin.y + tiling->tile0.extent.height;
285 }
286
287 enum a3xx_msaa_samples
288 tu_msaa_samples(uint32_t samples)
289 {
290 switch (samples) {
291 case 1:
292 return MSAA_ONE;
293 case 2:
294 return MSAA_TWO;
295 case 4:
296 return MSAA_FOUR;
297 case 8:
298 return MSAA_EIGHT;
299 default:
300 assert(!"invalid sample count");
301 return MSAA_ONE;
302 }
303 }
304
305 static enum a4xx_index_size
306 tu6_index_size(VkIndexType type)
307 {
308 switch (type) {
309 case VK_INDEX_TYPE_UINT16:
310 return INDEX4_SIZE_16_BIT;
311 case VK_INDEX_TYPE_UINT32:
312 return INDEX4_SIZE_32_BIT;
313 default:
314 unreachable("invalid VkIndexType");
315 return INDEX4_SIZE_8_BIT;
316 }
317 }
318
319 void
320 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
321 struct tu_cs *cs,
322 enum vgt_event_type event)
323 {
324 bool need_seqno = false;
325 switch (event) {
326 case CACHE_FLUSH_TS:
327 case WT_DONE_TS:
328 case RB_DONE_TS:
329 case PC_CCU_FLUSH_DEPTH_TS:
330 case PC_CCU_FLUSH_COLOR_TS:
331 case PC_CCU_RESOLVE_TS:
332 need_seqno = true;
333 break;
334 default:
335 break;
336 }
337
338 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
339 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
340 if (need_seqno) {
341 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
342 tu_cs_emit(cs, 0);
343 }
344 }
345
346 static void
347 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
348 struct tu_cs *cs,
349 enum tu_cmd_flush_bits flushes)
350 {
351 /* Experiments show that invalidating CCU while it still has data in it
352 * doesn't work, so make sure to always flush before invalidating in case
353 * any data remains that hasn't yet been made available through a barrier.
354 * However, it does seem to work for UCHE.
355 */
356 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
357 TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
358 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
359 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
360 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
361 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
362 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
363 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
364 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
365 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
366 if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
367 tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
368 if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
369 tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
370 if (flushes & TU_CMD_FLAG_WFI)
371 tu_cs_emit_wfi(cs);
372 }
373
374 /* "Normal" cache flushes, that don't require any special handling */
375
376 static void
377 tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
378 struct tu_cs *cs)
379 {
380 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
381 cmd_buffer->state.cache.flush_bits = 0;
382 }
383
384 /* Renderpass cache flushes */
385
386 void
387 tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
388 struct tu_cs *cs)
389 {
390 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
391 cmd_buffer->state.renderpass_cache.flush_bits = 0;
392 }
393
394 /* Cache flushes for things that use the color/depth read/write path (i.e.
395 * blits and draws). This deals with changing CCU state as well as the usual
396 * cache flushing.
397 */
398
399 void
400 tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
401 struct tu_cs *cs,
402 enum tu_cmd_ccu_state ccu_state)
403 {
404 enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;
405
406 assert(ccu_state != TU_CMD_CCU_UNKNOWN);
407
408 /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
409 * the CCU may also contain data that we haven't flushed out yet, so we
410 * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
411 * emit a WFI as it isn't pipelined.
412 */
413 if (ccu_state != cmd_buffer->state.ccu_state) {
414 if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
415 flushes |=
416 TU_CMD_FLAG_CCU_FLUSH_COLOR |
417 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
418 cmd_buffer->state.cache.pending_flush_bits &= ~(
419 TU_CMD_FLAG_CCU_FLUSH_COLOR |
420 TU_CMD_FLAG_CCU_FLUSH_DEPTH);
421 }
422 flushes |=
423 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
424 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
425 TU_CMD_FLAG_WFI;
426 cmd_buffer->state.cache.pending_flush_bits &= ~(
427 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
428 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
429 }
430
431 tu6_emit_flushes(cmd_buffer, cs, flushes);
432 cmd_buffer->state.cache.flush_bits = 0;
433
434 if (ccu_state != cmd_buffer->state.ccu_state) {
435 struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
436 tu_cs_emit_regs(cs,
437 A6XX_RB_CCU_CNTL(.offset =
438 ccu_state == TU_CMD_CCU_GMEM ?
439 phys_dev->ccu_offset_gmem :
440 phys_dev->ccu_offset_bypass,
441 .gmem = ccu_state == TU_CMD_CCU_GMEM));
442 cmd_buffer->state.ccu_state = ccu_state;
443 }
444 }
445
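/* Emit the depth/stencil buffer state for the subpass, or the "no depth
 * buffer" defaults when the depth/stencil attachment is unused.
 */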
446 static void
447 tu6_emit_zs(struct tu_cmd_buffer *cmd,
448 const struct tu_subpass *subpass,
449 struct tu_cs *cs)
450 {
451 const struct tu_framebuffer *fb = cmd->state.framebuffer;
452
453 const uint32_t a = subpass->depth_stencil_attachment.attachment;
454 if (a == VK_ATTACHMENT_UNUSED) {
455 tu_cs_emit_regs(cs,
456 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
457 A6XX_RB_DEPTH_BUFFER_PITCH(0),
458 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
459 A6XX_RB_DEPTH_BUFFER_BASE(0),
460 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
461
462 tu_cs_emit_regs(cs,
463 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
464
465 tu_cs_emit_regs(cs,
466 A6XX_GRAS_LRZ_BUFFER_BASE(0),
467 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
468 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
469
470 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
471
472 return;
473 }
474
475 const struct tu_image_view *iview = fb->attachments[a].attachment;
476 const struct tu_render_pass_attachment *attachment =
477 &cmd->state.pass->attachments[a];
478 enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
479
480 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
481 tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
482 tu_cs_image_ref(cs, iview, 0);
483 tu_cs_emit(cs, attachment->gmem_offset);
484
485 tu_cs_emit_regs(cs,
486 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
487
488 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3);
489 tu_cs_image_flag_ref(cs, iview, 0);
490
491 tu_cs_emit_regs(cs,
492 A6XX_GRAS_LRZ_BUFFER_BASE(0),
493 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
494 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
495
496 if (attachment->format == VK_FORMAT_S8_UINT) {
497 tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
498 tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
499 tu_cs_image_ref(cs, iview, 0);
500 tu_cs_emit(cs, attachment->gmem_offset);
501 } else {
502 tu_cs_emit_regs(cs,
503 A6XX_RB_STENCIL_INFO(0));
504 }
505 }
506
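/* Emit the color (MRT) attachment state for the subpass, including the
 * UBWC flag buffers and sRGB control.
 */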
507 static void
508 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
509 const struct tu_subpass *subpass,
510 struct tu_cs *cs)
511 {
512 const struct tu_framebuffer *fb = cmd->state.framebuffer;
513
514 for (uint32_t i = 0; i < subpass->color_count; ++i) {
515 uint32_t a = subpass->color_attachments[i].attachment;
516 if (a == VK_ATTACHMENT_UNUSED)
517 continue;
518
519 const struct tu_image_view *iview = fb->attachments[a].attachment;
520
521 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
522 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
523 tu_cs_image_ref(cs, iview, 0);
524 tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
525
526 tu_cs_emit_regs(cs,
527 A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));
528
529 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR_LO(i), 3);
530 tu_cs_image_flag_ref(cs, iview, 0);
531 }
532
533 tu_cs_emit_regs(cs,
534 A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
535 tu_cs_emit_regs(cs,
536 A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
537
538 tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(fb->layers - 1));
539 }
540
541 void
542 tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
543 {
544 const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
545 bool msaa_disable = samples == MSAA_ONE;
546
547 tu_cs_emit_regs(cs,
548 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
549 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
550 .msaa_disable = msaa_disable));
551
552 tu_cs_emit_regs(cs,
553 A6XX_GRAS_RAS_MSAA_CNTL(samples),
554 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
555 .msaa_disable = msaa_disable));
556
557 tu_cs_emit_regs(cs,
558 A6XX_RB_RAS_MSAA_CNTL(samples),
559 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
560 .msaa_disable = msaa_disable));
561
562 tu_cs_emit_regs(cs,
563 A6XX_RB_MSAA_CNTL(samples));
564 }
565
566 static void
567 tu6_emit_bin_size(struct tu_cs *cs,
568 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
569 {
570 tu_cs_emit_regs(cs,
571 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
572 .binh = bin_h,
573 .dword = flags));
574
575 tu_cs_emit_regs(cs,
576 A6XX_RB_BIN_CONTROL(.binw = bin_w,
577 .binh = bin_h,
578 .dword = flags));
579
580 /* no flag for RB_BIN_CONTROL2... */
581 tu_cs_emit_regs(cs,
582 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
583 .binh = bin_h));
584 }
585
586 static void
587 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
588 const struct tu_subpass *subpass,
589 struct tu_cs *cs,
590 bool binning)
591 {
592 const struct tu_framebuffer *fb = cmd->state.framebuffer;
593 uint32_t cntl = 0;
594 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
595 if (binning) {
596 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
597 } else {
598 uint32_t mrts_ubwc_enable = 0;
599 for (uint32_t i = 0; i < subpass->color_count; ++i) {
600 uint32_t a = subpass->color_attachments[i].attachment;
601 if (a == VK_ATTACHMENT_UNUSED)
602 continue;
603
604 const struct tu_image_view *iview = fb->attachments[a].attachment;
605 if (iview->ubwc_enabled)
606 mrts_ubwc_enable |= 1 << i;
607 }
608
609 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
610
611 const uint32_t a = subpass->depth_stencil_attachment.attachment;
612 if (a != VK_ATTACHMENT_UNUSED) {
613 const struct tu_image_view *iview = fb->attachments[a].attachment;
614 if (iview->ubwc_enabled)
615 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
616 }
617
618 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
619 * in order to set it correctly for the different subpasses. However,
620 * that means the packets we're emitting also happen during binning. So
621 * we need to guard the write on !BINNING at CP execution time.
622 */
623 tu_cs_reserve(cs, 3 + 4);
624 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
625 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
626 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
627 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
628 }
629
630 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
631 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
632 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
633 tu_cs_emit(cs, cntl);
634 }
635
636 static void
637 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
638 {
639 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
640 uint32_t x1 = render_area->offset.x;
641 uint32_t y1 = render_area->offset.y;
642 uint32_t x2 = x1 + render_area->extent.width - 1;
643 uint32_t y2 = y1 + render_area->extent.height - 1;
644
645 if (align) {
646 x1 = x1 & ~(GMEM_ALIGN_W - 1);
647 y1 = y1 & ~(GMEM_ALIGN_H - 1);
648 x2 = ALIGN_POT(x2 + 1, GMEM_ALIGN_W) - 1;
649 y2 = ALIGN_POT(y2 + 1, GMEM_ALIGN_H) - 1;
650 }
651
652 tu_cs_emit_regs(cs,
653 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
654 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
655 }
656
657 void
658 tu6_emit_window_scissor(struct tu_cs *cs,
659 uint32_t x1,
660 uint32_t y1,
661 uint32_t x2,
662 uint32_t y2)
663 {
664 tu_cs_emit_regs(cs,
665 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
666 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
667
668 tu_cs_emit_regs(cs,
669 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
670 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
671 }
672
673 void
674 tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
675 {
676 tu_cs_emit_regs(cs,
677 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
678
679 tu_cs_emit_regs(cs,
680 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
681
682 tu_cs_emit_regs(cs,
683 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
684
685 tu_cs_emit_regs(cs,
686 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
687 }
688
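/* Decide whether to use the hardware binning pass: binning is only used
 * when the render area spans more than two tiles, and can be overridden
 * with the TU_DEBUG_NOBIN/FORCEBIN debug flags.
 */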
689 static bool
690 use_hw_binning(struct tu_cmd_buffer *cmd)
691 {
692 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
693
694 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
695 return false;
696
697 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
698 return true;
699
700 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
701 }
702
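/* Decide whether to render directly to sysmem instead of tiling through
 * GMEM (debug override, attachments that don't fit in GMEM, or layered
 * framebuffers).
 */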
703 static bool
704 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
705 {
706 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
707 return true;
708
709 /* can't fit attachments into gmem */
710 if (!cmd->state.pass->gmem_pixels)
711 return true;
712
713 if (cmd->state.framebuffer->layers > 1)
714 return true;
715
716 return cmd->state.tiling_config.force_sysmem;
717 }
718
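/* Select a tile for rendering: program the window scissor/offset for the
 * tile and, when hardware binning is used, bind the tile's visibility
 * stream (skipped if the VSC overflowed).
 */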
719 static void
720 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
721 struct tu_cs *cs,
722 const struct tu_tile *tile)
723 {
724 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
725 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
726
727 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
728 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
729
730 const uint32_t x1 = tile->begin.x;
731 const uint32_t y1 = tile->begin.y;
732 const uint32_t x2 = tile->end.x - 1;
733 const uint32_t y2 = tile->end.y - 1;
734 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
735 tu6_emit_window_offset(cs, x1, y1);
736
737 tu_cs_emit_regs(cs,
738 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
739
740 if (use_hw_binning(cmd)) {
741 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
742
743 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
744 tu_cs_emit(cs, 0x0);
745
746 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
747 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
748 A6XX_CP_REG_TEST_0_BIT(0) |
749 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
750
751 tu_cs_reserve(cs, 3 + 11);
752 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
753 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
754 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
755
756 /* if (no overflow) */ {
757 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
758 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
759 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
760 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + tile->pipe * cmd->vsc_draw_strm_pitch);
761 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + (tile->pipe * 4) + (32 * cmd->vsc_draw_strm_pitch));
762 tu_cs_emit_qw(cs, cmd->vsc_prim_strm.iova + (tile->pipe * cmd->vsc_prim_strm_pitch));
763
764 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
765 tu_cs_emit(cs, 0x0);
766
767 /* use a NOP packet to skip over the 'else' side: */
768 tu_cs_emit_pkt7(cs, CP_NOP, 2);
769 } /* else */ {
770 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
771 tu_cs_emit(cs, 0x1);
772 }
773
774 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
775 tu_cs_emit(cs, 0x0);
776 } else {
777 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
778 tu_cs_emit(cs, 0x1);
779
780 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
781 tu_cs_emit(cs, 0x0);
782 }
783 }
784
785 static void
786 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
787 struct tu_cs *cs,
788 uint32_t a,
789 uint32_t gmem_a)
790 {
791 const struct tu_framebuffer *fb = cmd->state.framebuffer;
792 struct tu_image_view *dst = fb->attachments[a].attachment;
793 struct tu_image_view *src = fb->attachments[gmem_a].attachment;
794
795 tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.tiling_config.render_area);
796 }
797
798 static void
799 tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
800 struct tu_cs *cs,
801 const struct tu_subpass *subpass)
802 {
803 if (subpass->resolve_attachments) {
804 /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
805 * Commands":
806 *
807 * End-of-subpass multisample resolves are treated as color
808 * attachment writes for the purposes of synchronization. That is,
809 * they are considered to execute in the
810 * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
811 * their writes are synchronized with
812 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
813 * rendering within a subpass and any resolve operations at the end
814 * of the subpass occurs automatically, without need for explicit
815 * dependencies or pipeline barriers. However, if the resolve
816 * attachment is also used in a different subpass, an explicit
817 * dependency is needed.
818 *
819 * We use the CP_BLIT path for sysmem resolves, which is really a
820 * transfer command, so we have to manually flush similar to the gmem
821 * resolve case. However, a flush afterwards isn't needed because of the
822 * last sentence and the fact that we're in sysmem mode.
823 */
824 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
825 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
826
827 /* Wait for the flushes to land before using the 2D engine */
828 tu_cs_emit_wfi(cs);
829
830 for (unsigned i = 0; i < subpass->color_count; i++) {
831 uint32_t a = subpass->resolve_attachments[i].attachment;
832 if (a == VK_ATTACHMENT_UNUSED)
833 continue;
834
835 tu6_emit_sysmem_resolve(cmd, cs, a,
836 subpass->color_attachments[i].attachment);
837 }
838 }
839 }
840
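/* Emit the tile store (GMEM -> sysmem resolve) commands for the end of the
 * render pass; this is recorded once into a sub-stream and executed for
 * every tile (see tu_cmd_prepare_tile_store_ib and tu6_render_tile).
 */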
841 static void
842 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
843 {
844 const struct tu_render_pass *pass = cmd->state.pass;
845 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
846
847 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
848 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
849 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
850 CP_SET_DRAW_STATE__0_GROUP_ID(0));
851 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
852 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
853
854 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
855 tu_cs_emit(cs, 0x0);
856
857 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
858 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
859
860 tu6_emit_blit_scissor(cmd, cs, true);
861
862 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
863 if (pass->attachments[a].gmem_offset >= 0)
864 tu_store_gmem_attachment(cmd, cs, a, a);
865 }
866
867 if (subpass->resolve_attachments) {
868 for (unsigned i = 0; i < subpass->color_count; i++) {
869 uint32_t a = subpass->resolve_attachments[i].attachment;
870 if (a != VK_ATTACHMENT_UNUSED)
871 tu_store_gmem_attachment(cmd, cs, a,
872 subpass->color_attachments[i].attachment);
873 }
874 }
875 }
876
877 static void
878 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
879 {
880 tu_cs_emit_regs(cs,
881 A6XX_PC_RESTART_INDEX(restart_index));
882 }
883
884 static void
885 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
886 {
887 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
888
889 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
890
891 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
892
893 tu_cs_emit_regs(cs,
894 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
895 cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
896 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
897 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
898 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
899 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
900 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
901 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
902 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
903 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
904
905 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
906 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
907 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
908 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
909 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
910 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
911 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
912 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
913 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
914 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
915 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
916 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
917 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
918 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
919
920 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
921 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
922 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
923
924 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
925
926 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
927
928 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
929 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
930 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
931 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
932 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
933 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
934 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
935 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
936 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
937 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
938 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
939
940 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
941 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
942
943 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236,
944 A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
945 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
946
947 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
948 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
949
950 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
951 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
952 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
953
954 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
955 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
956
957 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
958
959 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
960
961 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
962 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
963 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
964 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
965 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
966 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
967 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
968 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
969 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
970 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
971 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
972 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
973 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
974 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
975
976 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
977
978 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
979
980 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
981
982 /* we don't use this yet.. probably best to disable.. */
983 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
984 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
985 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
986 CP_SET_DRAW_STATE__0_GROUP_ID(0));
987 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
988 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
989
990 /* Disable streamout by default: */
991 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
992 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
993 tu_cs_emit(cs, 0);
994 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
995 tu_cs_emit(cs, 0);
996
997 tu_cs_emit_regs(cs,
998 A6XX_SP_HS_CTRL_REG0(0));
999
1000 tu_cs_emit_regs(cs,
1001 A6XX_SP_GS_CTRL_REG0(0));
1002
1003 tu_cs_emit_regs(cs,
1004 A6XX_GRAS_LRZ_CNTL(0));
1005
1006 tu_cs_emit_regs(cs,
1007 A6XX_RB_LRZ_CNTL(0));
1008
1009 tu_cs_emit_regs(cs,
1010 A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
1011 tu_cs_emit_regs(cs,
1012 A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
1013
1014 tu_cs_sanity_check(cs);
1015 }
1016
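/* Program the VSC bin size, bin count, pipe configuration and the
 * draw/primitive stream buffers consumed by the binning pass.
 */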
1017 static void
1018 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1019 {
1020 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1021
1022 tu_cs_emit_regs(cs,
1023 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1024 .height = tiling->tile0.extent.height),
1025 A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = &cmd->vsc_draw_strm,
1026 .bo_offset = 32 * cmd->vsc_draw_strm_pitch));
1027
1028 tu_cs_emit_regs(cs,
1029 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1030 .ny = tiling->tile_count.height));
1031
1032 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1033 for (unsigned i = 0; i < 32; i++)
1034 tu_cs_emit(cs, tiling->pipe_config[i]);
1035
1036 tu_cs_emit_regs(cs,
1037 A6XX_VSC_PRIM_STRM_ADDRESS(.bo = &cmd->vsc_prim_strm),
1038 A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
1039 A6XX_VSC_PRIM_STRM_ARRAY_PITCH(cmd->vsc_prim_strm.size));
1040
1041 tu_cs_emit_regs(cs,
1042 A6XX_VSC_DRAW_STRM_ADDRESS(.bo = &cmd->vsc_draw_strm),
1043 A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
1044 A6XX_VSC_DRAW_STRM_ARRAY_PITCH(cmd->vsc_draw_strm.size));
1045 }
1046
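/* Check whether any VSC pipe overflowed its draw or primitive stream:
 * a marker is conditionally written to scratch memory, mirrored to
 * control->vsc_overflow for the CPU, and inverted into OVERFLOW_FLAG_REG
 * so that the per-tile CP_REG_TEST checks only use the binning data when
 * no overflow occurred.
 */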
1047 static void
1048 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1049 {
1050 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1051 const uint32_t used_pipe_count =
1052 tiling->pipe_count.width * tiling->pipe_count.height;
1053
1054 /* Clear vsc_scratch: */
1055 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1056 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1057 tu_cs_emit(cs, 0x0);
1058
1059 /* Check for overflow, write vsc_scratch if detected: */
1060 for (int i = 0; i < used_pipe_count; i++) {
1061 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1062 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1063 CP_COND_WRITE5_0_WRITE_MEMORY);
1064 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
1065 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1066 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch));
1067 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1068 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1069 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_draw_strm_pitch));
1070
1071 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1072 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1073 CP_COND_WRITE5_0_WRITE_MEMORY);
1074 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
1075 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1076 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch));
1077 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1078 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1079 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_prim_strm_pitch));
1080 }
1081
1082 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1083
1084 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1085
1086 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1087 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1088 CP_MEM_TO_REG_0_CNT(1 - 1));
1089 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1090
1091 /*
1092 * This is a bit awkward: we really want a way to invert the
1093 * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1094 * execute cmds to use hwbinning when a bit is *not* set. This
1095 * dance is to invert OVERFLOW_FLAG_REG.
1096 *
1097 * A CP_NOP packet is used to skip executing the 'else' clause
1098 * when b0 is set.
1099 */
1100
1101 /* b0 will be set if VSC_DRAW_STRM or VSC_PRIM_STRM overflows: */
1102 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1103 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1104 A6XX_CP_REG_TEST_0_BIT(0) |
1105 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1106
1107 tu_cs_reserve(cs, 3 + 7);
1108 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1109 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1110 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1111
1112 /* if (b0 set) */ {
1113 /*
1114 * On overflow, mirror the value to control->vsc_overflow,
1115 * which the CPU checks to detect overflow (see
1116 * check_vsc_overflow())
1117 */
1118 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1119 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1120 CP_REG_TO_MEM_0_CNT(0));
1121 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_overflow));
1122
1123 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1124 tu_cs_emit(cs, 0x0);
1125
1126 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1127 } /* else */ {
1128 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1129 tu_cs_emit(cs, 0x1);
1130 }
1131 }
1132
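/* Run the binning pass: replay the draw commands with the VFD in binning
 * mode so the VSC produces per-tile visibility streams, then flush and run
 * the overflow test.
 */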
1133 static void
1134 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1135 {
1136 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1137 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1138
1139 uint32_t x1 = tiling->tile0.offset.x;
1140 uint32_t y1 = tiling->tile0.offset.y;
1141 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1142 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1143
1144 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
1145
1146 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1147 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1148
1149 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1150 tu_cs_emit(cs, 0x1);
1151
1152 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1153 tu_cs_emit(cs, 0x1);
1154
1155 tu_cs_emit_wfi(cs);
1156
1157 tu_cs_emit_regs(cs,
1158 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1159
1160 update_vsc_pipe(cmd, cs);
1161
1162 tu_cs_emit_regs(cs,
1163 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1164
1165 tu_cs_emit_regs(cs,
1166 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1167
1168 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1169 tu_cs_emit(cs, UNK_2C);
1170
1171 tu_cs_emit_regs(cs,
1172 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1173
1174 tu_cs_emit_regs(cs,
1175 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1176
1177 /* emit IB to binning drawcmds: */
1178 tu_cs_emit_call(cs, &cmd->draw_cs);
1179
1180 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1181 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1182 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1183 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1184 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1185 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1186
1187 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1188 tu_cs_emit(cs, UNK_2D);
1189
1190 /* This flush is probably required because the VSC, which produces the
1191 * visibility stream, is a client of UCHE, whereas the CP needs to read the
1192 * visibility stream (without caching) to do draw skipping. The
1193 * WFI+WAIT_FOR_ME combination guarantees that the binning commands
1194 * submitted are finished before reading the VSC regs (in
1195 * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
1196 * part of draws).
1197 */
1198 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
1199
1200 tu_cs_emit_wfi(cs);
1201
1202 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1203
1204 emit_vsc_overflow_test(cmd, cs);
1205
1206 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1207 tu_cs_emit(cs, 0x0);
1208
1209 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1210 tu_cs_emit(cs, 0x0);
1211 }
1212
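/* Emit the attachment load/clear commands into the draw_cs, with separate
 * conditionally-executed paths for GMEM and sysmem rendering.
 */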
1213 static void
1214 tu_emit_load_clear(struct tu_cmd_buffer *cmd,
1215 const VkRenderPassBeginInfo *info)
1216 {
1217 struct tu_cs *cs = &cmd->draw_cs;
1218
1219 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1220
1221 tu6_emit_blit_scissor(cmd, cs, true);
1222
1223 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1224 tu_load_gmem_attachment(cmd, cs, i, false);
1225
1226 tu6_emit_blit_scissor(cmd, cs, false);
1227
1228 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1229 tu_clear_gmem_attachment(cmd, cs, i, info);
1230
1231 tu_cond_exec_end(cs);
1232
1233 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1234
1235 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1236 tu_clear_sysmem_attachment(cmd, cs, i, info);
1237
1238 tu_cond_exec_end(cs);
1239 }
1240
1241 static void
1242 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1243 const struct VkRect2D *renderArea)
1244 {
1245 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1246
1247 assert(fb->width > 0 && fb->height > 0);
1248 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
1249 tu6_emit_window_offset(cs, 0, 0);
1250
1251 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1252
1253 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1254
1255 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1256 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1257
1258 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1259 tu_cs_emit(cs, 0x0);
1260
1261 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1262
1263 /* enable stream-out; with sysmem there is only one pass: */
1264 tu_cs_emit_regs(cs,
1265 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1266
1267 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1268 tu_cs_emit(cs, 0x1);
1269
1270 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1271 tu_cs_emit(cs, 0x0);
1272
1273 tu_cs_sanity_check(cs);
1274 }
1275
1276 static void
1277 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1278 {
1279 /* Do any resolves of the last subpass. These are handled in the
1280 * tile_store_ib in the gmem path.
1281 */
1282 tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
1283
1284 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1285
1286 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1287 tu_cs_emit(cs, 0x0);
1288
1289 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1290
1291 tu_cs_sanity_check(cs);
1292 }
1293
1294
1295 static void
1296 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1297 {
1298 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1299
1300 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1301
1302 /* lrz clear? */
1303
1304 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1305 tu_cs_emit(cs, 0x0);
1306
1307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
1308
1309 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1310 if (use_hw_binning(cmd)) {
1311 /* enable stream-out during binning pass: */
1312 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1313
1314 tu6_emit_bin_size(cs,
1315 tiling->tile0.extent.width,
1316 tiling->tile0.extent.height,
1317 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1318
1319 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1320
1321 tu6_emit_binning_pass(cmd, cs);
1322
1323 /* and disable stream-out for draw pass: */
1324 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
1325
1326 tu6_emit_bin_size(cs,
1327 tiling->tile0.extent.width,
1328 tiling->tile0.extent.height,
1329 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1330
1331 tu_cs_emit_regs(cs,
1332 A6XX_VFD_MODE_CNTL(0));
1333
1334 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1335
1336 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1337
1338 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1339 tu_cs_emit(cs, 0x1);
1340 } else {
1341 /* no binning pass, so enable stream-out for draw pass: */
1342 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1343
1344 tu6_emit_bin_size(cs,
1345 tiling->tile0.extent.width,
1346 tiling->tile0.extent.height,
1347 0x6000000);
1348 }
1349
1350 tu_cs_sanity_check(cs);
1351 }
1352
1353 static void
1354 tu6_render_tile(struct tu_cmd_buffer *cmd,
1355 struct tu_cs *cs,
1356 const struct tu_tile *tile)
1357 {
1358 tu6_emit_tile_select(cmd, cs, tile);
1359
1360 tu_cs_emit_call(cs, &cmd->draw_cs);
1361
1362 if (use_hw_binning(cmd)) {
1363 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1364 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1365 A6XX_CP_REG_TEST_0_BIT(0) |
1366 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1367
1368 tu_cs_reserve(cs, 3 + 2);
1369 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1370 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1371 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1372
1373 /* if (no overflow) */ {
1374 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1375 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1376 }
1377 }
1378
1379 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1380
1381 tu_cs_sanity_check(cs);
1382 }
1383
1384 static void
1385 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1386 {
1387 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1388
1389 tu_cs_emit_regs(cs,
1390 A6XX_GRAS_LRZ_CNTL(0));
1391
1392 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1393
1394 tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
1395
1396 tu_cs_sanity_check(cs);
1397 }
1398
1399 static void
1400 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1401 {
1402 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1403
1404 tu6_tile_render_begin(cmd, &cmd->cs);
1405
1406 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1407 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1408 struct tu_tile tile;
1409 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1410 tu6_render_tile(cmd, &cmd->cs, &tile);
1411 }
1412 }
1413
1414 tu6_tile_render_end(cmd, &cmd->cs);
1415 }
1416
1417 static void
1418 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1419 {
1420 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1421
1422 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1423
1424 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1425
1426 tu6_sysmem_render_end(cmd, &cmd->cs);
1427 }
1428
1429 static void
1430 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1431 {
1432 const uint32_t tile_store_space = 11 + (35 * 2) * cmd->state.pass->attachment_count;
1433 struct tu_cs sub_cs;
1434
1435 VkResult result =
1436 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1437 if (result != VK_SUCCESS) {
1438 cmd->record_result = result;
1439 return;
1440 }
1441
1442 /* emit to tile-store sub_cs */
1443 tu6_emit_tile_store(cmd, &sub_cs);
1444
1445 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1446 }
1447
1448 static void
1449 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1450 const VkRect2D *render_area)
1451 {
1452 const struct tu_device *dev = cmd->device;
1453 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1454
1455 tiling->render_area = *render_area;
1456 tiling->force_sysmem = false;
1457
1458 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass);
1459 tu_tiling_config_update_pipe_layout(tiling, dev);
1460 tu_tiling_config_update_pipes(tiling, dev);
1461 }
1462
1463 const struct tu_dynamic_state default_dynamic_state = {
1464 .viewport =
1465 {
1466 .count = 0,
1467 },
1468 .scissor =
1469 {
1470 .count = 0,
1471 },
1472 .line_width = 1.0f,
1473 .depth_bias =
1474 {
1475 .bias = 0.0f,
1476 .clamp = 0.0f,
1477 .slope = 0.0f,
1478 },
1479 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1480 .depth_bounds =
1481 {
1482 .min = 0.0f,
1483 .max = 1.0f,
1484 },
1485 .stencil_compare_mask =
1486 {
1487 .front = ~0u,
1488 .back = ~0u,
1489 },
1490 .stencil_write_mask =
1491 {
1492 .front = ~0u,
1493 .back = ~0u,
1494 },
1495 .stencil_reference =
1496 {
1497 .front = 0u,
1498 .back = 0u,
1499 },
1500 };
1501
1502 static void UNUSED /* FINISHME */
1503 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1504 const struct tu_dynamic_state *src)
1505 {
1506 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1507 uint32_t copy_mask = src->mask;
1508 uint32_t dest_mask = 0;
1509
1510 tu_use_args(cmd_buffer); /* FINISHME */
1511
1512 /* Make sure to copy the number of viewports/scissors because they can
1513 * only be specified at pipeline creation time.
1514 */
1515 dest->viewport.count = src->viewport.count;
1516 dest->scissor.count = src->scissor.count;
1517 dest->discard_rectangle.count = src->discard_rectangle.count;
1518
1519 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1520 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1521 src->viewport.count * sizeof(VkViewport))) {
1522 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1523 src->viewport.count);
1524 dest_mask |= TU_DYNAMIC_VIEWPORT;
1525 }
1526 }
1527
1528 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1529 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1530 src->scissor.count * sizeof(VkRect2D))) {
1531 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1532 src->scissor.count);
1533 dest_mask |= TU_DYNAMIC_SCISSOR;
1534 }
1535 }
1536
1537 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1538 if (dest->line_width != src->line_width) {
1539 dest->line_width = src->line_width;
1540 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1541 }
1542 }
1543
1544 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1545 if (memcmp(&dest->depth_bias, &src->depth_bias,
1546 sizeof(src->depth_bias))) {
1547 dest->depth_bias = src->depth_bias;
1548 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1549 }
1550 }
1551
1552 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1553 if (memcmp(&dest->blend_constants, &src->blend_constants,
1554 sizeof(src->blend_constants))) {
1555 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1556 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1557 }
1558 }
1559
1560 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1561 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1562 sizeof(src->depth_bounds))) {
1563 dest->depth_bounds = src->depth_bounds;
1564 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1565 }
1566 }
1567
1568 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1569 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1570 sizeof(src->stencil_compare_mask))) {
1571 dest->stencil_compare_mask = src->stencil_compare_mask;
1572 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1573 }
1574 }
1575
1576 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1577 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1578 sizeof(src->stencil_write_mask))) {
1579 dest->stencil_write_mask = src->stencil_write_mask;
1580 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1581 }
1582 }
1583
1584 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1585 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1586 sizeof(src->stencil_reference))) {
1587 dest->stencil_reference = src->stencil_reference;
1588 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1589 }
1590 }
1591
1592 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1593 if (memcmp(&dest->discard_rectangle.rectangles,
1594 &src->discard_rectangle.rectangles,
1595 src->discard_rectangle.count * sizeof(VkRect2D))) {
1596 typed_memcpy(dest->discard_rectangle.rectangles,
1597 src->discard_rectangle.rectangles,
1598 src->discard_rectangle.count);
1599 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1600 }
1601 }
1602 }
1603
1604 static VkResult
1605 tu_create_cmd_buffer(struct tu_device *device,
1606 struct tu_cmd_pool *pool,
1607 VkCommandBufferLevel level,
1608 VkCommandBuffer *pCommandBuffer)
1609 {
1610 struct tu_cmd_buffer *cmd_buffer;
1611 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1612 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1613 if (cmd_buffer == NULL)
1614 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1615
1616 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1617 cmd_buffer->device = device;
1618 cmd_buffer->pool = pool;
1619 cmd_buffer->level = level;
1620
1621 if (pool) {
1622 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1623 cmd_buffer->queue_family_index = pool->queue_family_index;
1624
1625 } else {
1626 /* Init the pool_link so we can safely call list_del when we destroy
1627 * the command buffer
1628 */
1629 list_inithead(&cmd_buffer->pool_link);
1630 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1631 }
1632
1633 tu_bo_list_init(&cmd_buffer->bo_list);
1634 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1635 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1636 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1637 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1638
1639 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1640
1641 list_inithead(&cmd_buffer->upload.list);
1642
1643 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1644 if (result != VK_SUCCESS)
1645 goto fail_scratch_bo;
1646
1647 /* TODO: resize on overflow */
1648 cmd_buffer->vsc_draw_strm_pitch = device->vsc_draw_strm_pitch;
1649 cmd_buffer->vsc_prim_strm_pitch = device->vsc_prim_strm_pitch;
1650 cmd_buffer->vsc_draw_strm = device->vsc_draw_strm;
1651 cmd_buffer->vsc_prim_strm = device->vsc_prim_strm;
1652
1653 return VK_SUCCESS;
1654
1655 fail_scratch_bo:
1656 list_del(&cmd_buffer->pool_link);
1657 return result;
1658 }
1659
1660 static void
1661 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1662 {
1663 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1664
1665 list_del(&cmd_buffer->pool_link);
1666
1667 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
1668 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
1669
1670 tu_cs_finish(&cmd_buffer->cs);
1671 tu_cs_finish(&cmd_buffer->draw_cs);
1672 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1673 tu_cs_finish(&cmd_buffer->sub_cs);
1674
1675 tu_bo_list_destroy(&cmd_buffer->bo_list);
1676 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1677 }
1678
1679 static VkResult
1680 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1681 {
1682 cmd_buffer->record_result = VK_SUCCESS;
1683
1684 tu_bo_list_reset(&cmd_buffer->bo_list);
1685 tu_cs_reset(&cmd_buffer->cs);
1686 tu_cs_reset(&cmd_buffer->draw_cs);
1687 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1688 tu_cs_reset(&cmd_buffer->sub_cs);
1689
1690 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
1691 cmd_buffer->descriptors[i].valid = 0;
1692 cmd_buffer->descriptors[i].push_dirty = false;
1693 }
1694
1695 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1696
1697 return cmd_buffer->record_result;
1698 }
1699
1700 VkResult
1701 tu_AllocateCommandBuffers(VkDevice _device,
1702 const VkCommandBufferAllocateInfo *pAllocateInfo,
1703 VkCommandBuffer *pCommandBuffers)
1704 {
1705 TU_FROM_HANDLE(tu_device, device, _device);
1706 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1707
1708 VkResult result = VK_SUCCESS;
1709 uint32_t i;
1710
1711 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1712
1713 if (!list_is_empty(&pool->free_cmd_buffers)) {
1714 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1715 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1716
1717 list_del(&cmd_buffer->pool_link);
1718 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1719
1720 result = tu_reset_cmd_buffer(cmd_buffer);
1721 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1722 cmd_buffer->level = pAllocateInfo->level;
1723
1724 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1725 } else {
1726 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1727 &pCommandBuffers[i]);
1728 }
1729 if (result != VK_SUCCESS)
1730 break;
1731 }
1732
1733 if (result != VK_SUCCESS) {
1734 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
1735 pCommandBuffers);
1736
1737 /* From the Vulkan 1.0.66 spec:
1738 *
1739 * "vkAllocateCommandBuffers can be used to create multiple
1740 * command buffers. If the creation of any of those command
1741 * buffers fails, the implementation must destroy all
1742 * successfully created command buffer objects from this
1743 * command, set all entries of the pCommandBuffers array to
1744 * NULL and return the error."
1745 */
1746 memset(pCommandBuffers, 0,
1747 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
1748 }
1749
1750 return result;
1751 }
1752
1753 void
1754 tu_FreeCommandBuffers(VkDevice device,
1755 VkCommandPool commandPool,
1756 uint32_t commandBufferCount,
1757 const VkCommandBuffer *pCommandBuffers)
1758 {
1759 for (uint32_t i = 0; i < commandBufferCount; i++) {
1760 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1761
1762 if (cmd_buffer) {
1763 if (cmd_buffer->pool) {
1764 list_del(&cmd_buffer->pool_link);
1765 list_addtail(&cmd_buffer->pool_link,
1766 &cmd_buffer->pool->free_cmd_buffers);
1767 } else
1768 tu_cmd_buffer_destroy(cmd_buffer);
1769 }
1770 }
1771 }
1772
1773 VkResult
1774 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1775 VkCommandBufferResetFlags flags)
1776 {
1777 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1778 return tu_reset_cmd_buffer(cmd_buffer);
1779 }
1780
1781 /* Initialize the cache, assuming all necessary flushes have happened but *not*
1782 * invalidations.
1783 */
1784 static void
1785 tu_cache_init(struct tu_cache_state *cache)
1786 {
1787 cache->flush_bits = 0;
1788 cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
1789 }
1790
1791 VkResult
1792 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1793 const VkCommandBufferBeginInfo *pBeginInfo)
1794 {
1795 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1796 VkResult result = VK_SUCCESS;
1797
1798 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1799       /* If the command buffer has already been reset with
1800 * vkResetCommandBuffer, no need to do it again.
1801 */
1802 result = tu_reset_cmd_buffer(cmd_buffer);
1803 if (result != VK_SUCCESS)
1804 return result;
1805 }
1806
1807 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
1808 tu_cache_init(&cmd_buffer->state.cache);
1809 tu_cache_init(&cmd_buffer->state.renderpass_cache);
1810 cmd_buffer->usage_flags = pBeginInfo->flags;
1811
1812 tu_cs_begin(&cmd_buffer->cs);
1813 tu_cs_begin(&cmd_buffer->draw_cs);
1814 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1815
1816 /* setup initial configuration into command buffer */
1817 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1818 switch (cmd_buffer->queue_family_index) {
1819 case TU_QUEUE_GENERAL:
1820 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1821 break;
1822 default:
1823 break;
1824 }
1825 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1826 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1827 assert(pBeginInfo->pInheritanceInfo);
1828 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
1829 cmd_buffer->state.subpass =
1830 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1831 } else {
1832 /* When executing in the middle of another command buffer, the CCU
1833 * state is unknown.
1834 */
1835 cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
1836 }
1837 }
1838
1839 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1840
1841 return VK_SUCCESS;
1842 }
1843
1844 /* Sets vertex buffers to HW binding points. We emit VBs in SDS (so that bin
1845 * rendering can skip over unused state), so we need to collect all the
1846 * bindings together into a single state emit at draw time.
1847 */
1848 void
1849 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
1850 uint32_t firstBinding,
1851 uint32_t bindingCount,
1852 const VkBuffer *pBuffers,
1853 const VkDeviceSize *pOffsets)
1854 {
1855 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1856
1857 assert(firstBinding + bindingCount <= MAX_VBS);
1858
1859 for (uint32_t i = 0; i < bindingCount; i++) {
1860 struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
1861
1862 cmd->state.vb.buffers[firstBinding + i] = buf;
1863 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
1864
1865 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1866 }
1867
1868 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
1869 }
1870
1871 void
1872 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1873 VkBuffer buffer,
1874 VkDeviceSize offset,
1875 VkIndexType indexType)
1876 {
1877 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1878 TU_FROM_HANDLE(tu_buffer, buf, buffer);
1879
1880 /* initialize/update the restart index */
1881 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
1882 struct tu_cs *draw_cs = &cmd->draw_cs;
1883
1884 tu6_emit_restart_index(
1885 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
1886
1887 tu_cs_sanity_check(draw_cs);
1888 }
1889
1890 /* track the BO */
1891 if (cmd->state.index_buffer != buf)
1892 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1893
1894 cmd->state.index_buffer = buf;
1895 cmd->state.index_offset = offset;
1896 cmd->state.index_type = indexType;
1897 }
1898
1899 void
1900 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
1901 VkPipelineBindPoint pipelineBindPoint,
1902 VkPipelineLayout _layout,
1903 uint32_t firstSet,
1904 uint32_t descriptorSetCount,
1905 const VkDescriptorSet *pDescriptorSets,
1906 uint32_t dynamicOffsetCount,
1907 const uint32_t *pDynamicOffsets)
1908 {
1909 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1910 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
1911 unsigned dyn_idx = 0;
1912
1913 struct tu_descriptor_state *descriptors_state =
1914 tu_get_descriptors_state(cmd, pipelineBindPoint);
1915
1916 for (unsigned i = 0; i < descriptorSetCount; ++i) {
1917 unsigned idx = i + firstSet;
1918 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
1919
1920 descriptors_state->sets[idx] = set;
1921 descriptors_state->valid |= (1u << idx);
1922
1923 /* Note: the actual input attachment indices come from the shader
1924 * itself, so we can't generate the patched versions of these until
1925 * draw time when both the pipeline and descriptors are bound and
1926 * we're inside the render pass.
1927 */
1928 unsigned dst_idx = layout->set[idx].input_attachment_start;
1929 memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
1930 set->dynamic_descriptors,
1931 set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
1932
1933 for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
1934 /* Dynamic buffers come after input attachments in the descriptor set
1935 * itself, but due to how the Vulkan descriptor set binding works, we
1936 * have to put input attachments and dynamic buffers in separate
1937 * buffers in the descriptor_state and then combine them at draw
1938 * time. Binding a descriptor set only invalidates the descriptor
1939 * sets after it, but if we try to tightly pack the descriptors after
1940 * the input attachments then we could corrupt dynamic buffers in the
1941 * descriptor set before it, or we'd have to move all the dynamic
1942 * buffers over. We just put them into separate buffers to make
1943 * binding as well as the later patching of input attachments easy.
1944 */
1945 unsigned src_idx = j + set->layout->input_attachment_count;
1946 unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
1947 assert(dyn_idx < dynamicOffsetCount);
1948
1949 uint32_t *dst =
1950 &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
1951 uint32_t *src =
1952 &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
1953 uint32_t offset = pDynamicOffsets[dyn_idx];
1954
1955 /* Patch the storage/uniform descriptors right away. */
1956 if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
1957 /* Note: we can assume here that the addition won't roll over and
1958 * change the SIZE field.
1959 */
1960 uint64_t va = src[0] | ((uint64_t)src[1] << 32);
1961 va += offset;
1962 dst[0] = va;
1963 dst[1] = va >> 32;
1964 } else {
1965 memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
1966 /* Note: A6XX_IBO_5_DEPTH is always 0 */
1967 uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
1968 va += offset;
1969 dst[4] = va;
1970 dst[5] = va >> 32;
1971 }
1972 }
1973
1974 for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
1975 if (set->buffers[j]) {
1976 tu_bo_list_add(&cmd->bo_list, set->buffers[j],
1977 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1978 }
1979 }
1980
1981 if (set->size > 0) {
1982 tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
1983 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
1984 }
1985 }
1986
1987 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
1988 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
1989 else
1990 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
1991 }
1992
1993 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
1994 uint32_t firstBinding,
1995 uint32_t bindingCount,
1996 const VkBuffer *pBuffers,
1997 const VkDeviceSize *pOffsets,
1998 const VkDeviceSize *pSizes)
1999 {
2000 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2001 assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS);
2002
2003 for (uint32_t i = 0; i < bindingCount; i++) {
2004 uint32_t idx = firstBinding + i;
2005 TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
2006
2007 if (pOffsets[i] != 0)
2008 cmd->state.streamout_reset |= 1 << idx;
2009
2010 cmd->state.streamout_buf.buffers[idx] = buf;
2011 cmd->state.streamout_buf.offsets[idx] = pOffsets[i];
2012 cmd->state.streamout_buf.sizes[idx] = pSizes[i];
2013
2014 cmd->state.streamout_enabled |= 1 << idx;
2015 }
2016
2017 cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS;
2018 }
2019
2020 void tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
2021 uint32_t firstCounterBuffer,
2022 uint32_t counterBufferCount,
2023 const VkBuffer *pCounterBuffers,
2024 const VkDeviceSize *pCounterBufferOffsets)
2025 {
2026 assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
2027 /* TODO do something with counter buffer? */
2028 }
2029
2030 void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
2031 uint32_t firstCounterBuffer,
2032 uint32_t counterBufferCount,
2033 const VkBuffer *pCounterBuffers,
2034 const VkDeviceSize *pCounterBufferOffsets)
2035 {
2036 assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
2037 /* TODO do something with counter buffer? */
2038
2039 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2040 cmd->state.streamout_enabled = 0;
2041 }
2042
2043 void
2044 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2045 VkPipelineLayout layout,
2046 VkShaderStageFlags stageFlags,
2047 uint32_t offset,
2048 uint32_t size,
2049 const void *pValues)
2050 {
2051 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2052 memcpy((void*) cmd->push_constants + offset, pValues, size);
2053 cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2054 }
2055
2056 /* Flush everything which has been made available but we haven't actually
2057 * flushed yet.
2058 */
2059 static void
2060 tu_flush_all_pending(struct tu_cache_state *cache)
2061 {
2062 cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2063 cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
2064 }
2065
2066 VkResult
2067 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2068 {
2069 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2070
2071 /* We currently flush CCU at the end of the command buffer, like
2072 * what the blob does. There's implicit synchronization around every
2073 * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
2074 * know yet if this command buffer will be the last in the submit so we
2075 * have to defensively flush everything else.
2076 *
2077 * TODO: We could definitely do better than this, since these flushes
2078 * aren't required by Vulkan, but we'd need kernel support to do that.
2079 * Ideally, we'd like the kernel to flush everything afterwards, so that we
2080 * wouldn't have to do any flushes here, and when submitting multiple
2081 * command buffers there wouldn't be any unnecessary flushes in between.
2082 */
2083 if (cmd_buffer->state.pass) {
2084 tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
2085 tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
2086 } else {
2087 tu_flush_all_pending(&cmd_buffer->state.cache);
2088 cmd_buffer->state.cache.flush_bits |=
2089 TU_CMD_FLAG_CCU_FLUSH_COLOR |
2090 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
2091 tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
2092 }
2093
2094 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2095 MSM_SUBMIT_BO_WRITE);
2096
2097 if (cmd_buffer->use_vsc_data) {
2098 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_draw_strm,
2099 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2100 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_prim_strm,
2101 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2102 }
2103
2104 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->border_color,
2105 MSM_SUBMIT_BO_READ);
2106
2107 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2108 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2109 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2110 }
2111
2112 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2113 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2114 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2115 }
2116
2117 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2118 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2119 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2120 }
2121
2122 tu_cs_end(&cmd_buffer->cs);
2123 tu_cs_end(&cmd_buffer->draw_cs);
2124 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2125
2126 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2127
2128 return cmd_buffer->record_result;
2129 }
2130
2131 void
2132 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2133 VkPipelineBindPoint pipelineBindPoint,
2134 VkPipeline _pipeline)
2135 {
2136 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2137 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2138
2139 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2140 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2141 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2142 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2143 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2144 }
2145
2146 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
2147 cmd->state.compute_pipeline = pipeline;
2148 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2149 return;
2150 }
2151
2152 assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
2153
2154 cmd->state.pipeline = pipeline;
2155 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_SHADER_CONSTS;
2156
2157 /* If the new pipeline requires more VBs than we had previously set up, we
2158 * need to re-emit them in SDS. If it requires the same set or fewer, we
2159 * can just re-use the old SDS.
2160 */
2161 if (pipeline->vi.bindings_used & ~cmd->vertex_bindings_set)
2162 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2163
2164 /* If the pipeline needs a dynamic descriptor, re-emit descriptor sets */
2165 if (pipeline->layout->dynamic_offset_count + pipeline->layout->input_attachment_count)
2166 cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2167 }
2168
2169 void
2170 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2171 uint32_t firstViewport,
2172 uint32_t viewportCount,
2173 const VkViewport *pViewports)
2174 {
2175 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2176
2177 assert(firstViewport == 0 && viewportCount == 1);
2178 cmd->state.dynamic.viewport.viewports[0] = pViewports[0];
2179 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_VIEWPORT;
2180 }
2181
2182 void
2183 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2184 uint32_t firstScissor,
2185 uint32_t scissorCount,
2186 const VkRect2D *pScissors)
2187 {
2188 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2189
2190 assert(firstScissor == 0 && scissorCount == 1);
2191 cmd->state.dynamic.scissor.scissors[0] = pScissors[0];
2192 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2193 }
2194
2195 void
2196 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2197 {
2198 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2199
2200 cmd->state.dynamic.line_width = lineWidth;
2201
2202 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2203 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2204 }
2205
2206 void
2207 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2208 float depthBiasConstantFactor,
2209 float depthBiasClamp,
2210 float depthBiasSlopeFactor)
2211 {
2212 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2213 struct tu_cs *draw_cs = &cmd->draw_cs;
2214
2215 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2216 depthBiasSlopeFactor);
2217
2218 tu_cs_sanity_check(draw_cs);
2219 }
2220
2221 void
2222 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2223 const float blendConstants[4])
2224 {
2225 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2226 struct tu_cs *draw_cs = &cmd->draw_cs;
2227
2228 tu6_emit_blend_constants(draw_cs, blendConstants);
2229
2230 tu_cs_sanity_check(draw_cs);
2231 }
2232
2233 void
2234 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2235 float minDepthBounds,
2236 float maxDepthBounds)
2237 {
2238 }
2239
2240 void
2241 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2242 VkStencilFaceFlags faceMask,
2243 uint32_t compareMask)
2244 {
2245 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2246
2247 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2248 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2249 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2250 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2251
2252 /* the front/back compare masks must be updated together */
2253 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2254 }
2255
2256 void
2257 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2258 VkStencilFaceFlags faceMask,
2259 uint32_t writeMask)
2260 {
2261 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2262
2263 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2264 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2265 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2266 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2267
2268 /* the front/back write masks must be updated together */
2269 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2270 }
2271
2272 void
2273 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2274 VkStencilFaceFlags faceMask,
2275 uint32_t reference)
2276 {
2277 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2278
2279 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2280 cmd->state.dynamic.stencil_reference.front = reference;
2281 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2282 cmd->state.dynamic.stencil_reference.back = reference;
2283
2284 /* the front/back references must be updated together */
2285 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2286 }
2287
2288 void
2289 tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2290 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2291 {
2292 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2293
2294 tu6_emit_sample_locations(&cmd->draw_cs, pSampleLocationsInfo);
2295 }
2296
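     /* Compute the flushes/invalidates needed so that writes described by
      * src_mask become visible to accesses described by dst_mask. A write
      * records a flush of its own cache plus an invalidate of every other
      * cache as "pending"; a later access from a different domain then pulls
      * the matching bits into flush_bits. For example (a sketch of how the
      * masks below combine), a shader write followed by a read by the CP:
      *
      *    tu_flush_for_access(cache, TU_ACCESS_UCHE_WRITE,
      *                        TU_ACCESS_SYSMEM_READ);
      *
      * turns the pending CACHE_FLUSH into an actual flush, while a
      * same-domain write followed by a read needs no maintenance at all.
      */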
2297 static void
2298 tu_flush_for_access(struct tu_cache_state *cache,
2299 enum tu_cmd_access_mask src_mask,
2300 enum tu_cmd_access_mask dst_mask)
2301 {
2302 enum tu_cmd_flush_bits flush_bits = 0;
2303
2304 if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
2305 cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
2306 }
2307
2308 #define SRC_FLUSH(domain, flush, invalidate) \
2309 if (src_mask & TU_ACCESS_##domain##_WRITE) { \
2310 cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
2311 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2312 }
2313
2314 SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2315 SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2316 SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2317
2318 #undef SRC_FLUSH
2319
2320 #define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
2321 if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
2322 flush_bits |= TU_CMD_FLAG_##flush; \
2323 cache->pending_flush_bits |= \
2324 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2325 }
2326
2327 SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2328 SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2329
2330 #undef SRC_INCOHERENT_FLUSH
2331
2332 if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
2333 flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2334 }
2335
2336 #define DST_FLUSH(domain, flush, invalidate) \
2337 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2338 TU_ACCESS_##domain##_WRITE)) { \
2339 flush_bits |= cache->pending_flush_bits & \
2340 (TU_CMD_FLAG_##invalidate | \
2341 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2342 }
2343
2344 DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2345 DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2346 DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2347
2348 #undef DST_FLUSH
2349
2350 #define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
2351 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2352 TU_ACCESS_##domain##_WRITE)) { \
2353 flush_bits |= TU_CMD_FLAG_##invalidate | \
2354 (cache->pending_flush_bits & \
2355 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2356 }
2357
2358 DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2359 DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2360
2361 #undef DST_INCOHERENT_FLUSH
2362
2363 if (dst_mask & TU_ACCESS_WFI_READ) {
2364 flush_bits |= TU_CMD_FLAG_WFI;
2365 }
2366
2367 cache->flush_bits |= flush_bits;
2368 cache->pending_flush_bits &= ~flush_bits;
2369 }
2370
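     /* Translate a Vulkan access mask into the driver's internal access
      * mask. The gmem flag indicates that attachment accesses go through
      * GMEM rather than the CCU, in which case they are treated like sysmem
      * accesses here (see the comment about GMEM flushing below).
      */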
2371 static enum tu_cmd_access_mask
2372 vk2tu_access(VkAccessFlags flags, bool gmem)
2373 {
2374 enum tu_cmd_access_mask mask = 0;
2375
2376 /* If the GPU writes a buffer that is then read by an indirect draw
2377 * command, we theoretically need a WFI + WAIT_FOR_ME combination to
2378 * wait for the writes to complete. The WAIT_FOR_ME is performed as part
2379 * of the draw by the firmware, so we just need to execute a WFI.
2380 */
2381 if (flags &
2382 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
2383 VK_ACCESS_MEMORY_READ_BIT)) {
2384 mask |= TU_ACCESS_WFI_READ;
2385 }
2386
2387 if (flags &
2388 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
2389 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
2390 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
2391 VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
2392 VK_ACCESS_MEMORY_READ_BIT)) {
2393 mask |= TU_ACCESS_SYSMEM_READ;
2394 }
2395
2396 if (flags &
2397 (VK_ACCESS_HOST_WRITE_BIT |
2398 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
2399 VK_ACCESS_MEMORY_WRITE_BIT)) {
2400 mask |= TU_ACCESS_SYSMEM_WRITE;
2401 }
2402
2403 if (flags &
2404 (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
2405 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
2406 VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
2407 /* TODO: Is there a no-cache bit for textures so that we can ignore
2408 * these?
2409 */
2410 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
2411        VK_ACCESS_SHADER_READ_BIT |                  /* Read performed by SP/TP */
2412 VK_ACCESS_MEMORY_READ_BIT)) {
2413 mask |= TU_ACCESS_UCHE_READ;
2414 }
2415
2416 if (flags &
2417 (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
2418 VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
2419 VK_ACCESS_MEMORY_WRITE_BIT)) {
2420 mask |= TU_ACCESS_UCHE_WRITE;
2421 }
2422
2423 /* When using GMEM, the CCU is always flushed automatically to GMEM, and
2424 * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
2425 * previous writes in sysmem mode when transitioning to GMEM. Therefore we
2426 * can ignore CCU and pretend that color attachments and transfers use
2427 * sysmem directly.
2428 */
2429
2430 if (flags &
2431 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
2432 VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
2433 VK_ACCESS_MEMORY_READ_BIT)) {
2434 if (gmem)
2435 mask |= TU_ACCESS_SYSMEM_READ;
2436 else
2437 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
2438 }
2439
2440 if (flags &
2441 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
2442 VK_ACCESS_MEMORY_READ_BIT)) {
2443 if (gmem)
2444 mask |= TU_ACCESS_SYSMEM_READ;
2445 else
2446 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
2447 }
2448
2449 if (flags &
2450 (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
2451 VK_ACCESS_MEMORY_WRITE_BIT)) {
2452 if (gmem) {
2453 mask |= TU_ACCESS_SYSMEM_WRITE;
2454 } else {
2455 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2456 }
2457 }
2458
2459 if (flags &
2460 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
2461 VK_ACCESS_MEMORY_WRITE_BIT)) {
2462 if (gmem) {
2463 mask |= TU_ACCESS_SYSMEM_WRITE;
2464 } else {
2465 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2466 }
2467 }
2468
2469 /* When the dst access is a transfer read/write, it seems we sometimes need
2470 * to insert a WFI after any flushes, to guarantee that the flushes finish
2471 * before the 2D engine starts. However the opposite (i.e. a WFI after
2472 * CP_BLIT and before any subsequent flush) does not seem to be needed, and
2473 * the blob doesn't emit such a WFI.
2474 */
2475
2476 if (flags &
2477 (VK_ACCESS_TRANSFER_WRITE_BIT |
2478 VK_ACCESS_MEMORY_WRITE_BIT)) {
2479 if (gmem) {
2480 mask |= TU_ACCESS_SYSMEM_WRITE;
2481 } else {
2482 mask |= TU_ACCESS_CCU_COLOR_WRITE;
2483 }
2484 mask |= TU_ACCESS_WFI_READ;
2485 }
2486
2487 if (flags &
2488 (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
2489 VK_ACCESS_MEMORY_READ_BIT)) {
2490 mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
2491 }
2492
2493 return mask;
2494 }
2495
2496
2497 void
2498 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2499 uint32_t commandBufferCount,
2500 const VkCommandBuffer *pCmdBuffers)
2501 {
2502 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2503 VkResult result;
2504
2505 assert(commandBufferCount > 0);
2506
2507 /* Emit any pending flushes. */
2508 if (cmd->state.pass) {
2509 tu_flush_all_pending(&cmd->state.renderpass_cache);
2510 tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
2511 } else {
2512 tu_flush_all_pending(&cmd->state.cache);
2513 tu_emit_cache_flush(cmd, &cmd->cs);
2514 }
2515
2516 for (uint32_t i = 0; i < commandBufferCount; i++) {
2517 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2518
2519 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2520 if (result != VK_SUCCESS) {
2521 cmd->record_result = result;
2522 break;
2523 }
2524
2525 if (secondary->usage_flags &
2526 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2527 assert(tu_cs_is_empty(&secondary->cs));
2528
2529 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2530 if (result != VK_SUCCESS) {
2531 cmd->record_result = result;
2532 break;
2533 }
2534
2535 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2536 &secondary->draw_epilogue_cs);
2537 if (result != VK_SUCCESS) {
2538 cmd->record_result = result;
2539 break;
2540 }
2541 } else {
2542 assert(tu_cs_is_empty(&secondary->draw_cs));
2543 assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2544
2545 for (uint32_t j = 0; j < secondary->cs.bo_count; j++) {
2546 tu_bo_list_add(&cmd->bo_list, secondary->cs.bos[j],
2547 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2548 }
2549
2550 tu_cs_add_entries(&cmd->cs, &secondary->cs);
2551 }
2552 }
2553 cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
2554
2555 /* After executing secondary command buffers, there may have been arbitrary
2556 * flushes executed, so when we encounter a pipeline barrier with a
2557 * srcMask, we have to assume that we need to invalidate. Therefore we need
2558 * to re-initialize the cache with all pending invalidate bits set.
2559 */
2560 if (cmd->state.pass) {
2561 tu_cache_init(&cmd->state.renderpass_cache);
2562 } else {
2563 tu_cache_init(&cmd->state.cache);
2564 }
2565 }
2566
2567 VkResult
2568 tu_CreateCommandPool(VkDevice _device,
2569 const VkCommandPoolCreateInfo *pCreateInfo,
2570 const VkAllocationCallbacks *pAllocator,
2571 VkCommandPool *pCmdPool)
2572 {
2573 TU_FROM_HANDLE(tu_device, device, _device);
2574 struct tu_cmd_pool *pool;
2575
2576 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2577 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2578 if (pool == NULL)
2579 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2580
2581 if (pAllocator)
2582 pool->alloc = *pAllocator;
2583 else
2584 pool->alloc = device->alloc;
2585
2586 list_inithead(&pool->cmd_buffers);
2587 list_inithead(&pool->free_cmd_buffers);
2588
2589 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2590
2591 *pCmdPool = tu_cmd_pool_to_handle(pool);
2592
2593 return VK_SUCCESS;
2594 }
2595
2596 void
2597 tu_DestroyCommandPool(VkDevice _device,
2598 VkCommandPool commandPool,
2599 const VkAllocationCallbacks *pAllocator)
2600 {
2601 TU_FROM_HANDLE(tu_device, device, _device);
2602 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2603
2604 if (!pool)
2605 return;
2606
2607 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2608 &pool->cmd_buffers, pool_link)
2609 {
2610 tu_cmd_buffer_destroy(cmd_buffer);
2611 }
2612
2613 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2614 &pool->free_cmd_buffers, pool_link)
2615 {
2616 tu_cmd_buffer_destroy(cmd_buffer);
2617 }
2618
2619 vk_free2(&device->alloc, pAllocator, pool);
2620 }
2621
2622 VkResult
2623 tu_ResetCommandPool(VkDevice device,
2624 VkCommandPool commandPool,
2625 VkCommandPoolResetFlags flags)
2626 {
2627 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2628 VkResult result;
2629
2630 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2631 pool_link)
2632 {
2633 result = tu_reset_cmd_buffer(cmd_buffer);
2634 if (result != VK_SUCCESS)
2635 return result;
2636 }
2637
2638 return VK_SUCCESS;
2639 }
2640
2641 void
2642 tu_TrimCommandPool(VkDevice device,
2643 VkCommandPool commandPool,
2644 VkCommandPoolTrimFlags flags)
2645 {
2646 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2647
2648 if (!pool)
2649 return;
2650
2651 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2652 &pool->free_cmd_buffers, pool_link)
2653 {
2654 tu_cmd_buffer_destroy(cmd_buffer);
2655 }
2656 }
2657
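     /* Apply a subpass dependency to either the external cache state or the
      * render pass cache state. Incoherent CCU writes recorded on the
      * barrier are folded into the source access mask.
      */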
2658 static void
2659 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
2660 const struct tu_subpass_barrier *barrier,
2661 bool external)
2662 {
2663 /* Note: we don't know until the end of the subpass whether we'll use
2664 * sysmem, so assume sysmem here to be safe.
2665 */
2666 struct tu_cache_state *cache =
2667 external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
2668 enum tu_cmd_access_mask src_flags =
2669 vk2tu_access(barrier->src_access_mask, false);
2670 enum tu_cmd_access_mask dst_flags =
2671 vk2tu_access(barrier->dst_access_mask, false);
2672
2673 if (barrier->incoherent_ccu_color)
2674 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2675 if (barrier->incoherent_ccu_depth)
2676 src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2677
2678 tu_flush_for_access(cache, src_flags, dst_flags);
2679 }
2680
2681 void
2682 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2683 const VkRenderPassBeginInfo *pRenderPassBegin,
2684 VkSubpassContents contents)
2685 {
2686 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2687 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2688 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2689
2690 cmd->state.pass = pass;
2691 cmd->state.subpass = pass->subpasses;
2692 cmd->state.framebuffer = fb;
2693
2694 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2695 tu_cmd_prepare_tile_store_ib(cmd);
2696
2697 /* Note: because this is external, any flushes will happen before draw_cs
2698 * gets called. However deferred flushes could have to happen later as part
2699 * of the subpass.
2700 */
2701 tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
2702 cmd->state.renderpass_cache.pending_flush_bits =
2703 cmd->state.cache.pending_flush_bits;
2704 cmd->state.renderpass_cache.flush_bits = 0;
2705
2706 tu_emit_load_clear(cmd, pRenderPassBegin);
2707
2708 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2709 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2710 tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);
2711 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2712
2713 /* note: use_hw_binning only checks tiling config */
2714 if (use_hw_binning(cmd))
2715 cmd->use_vsc_data = true;
2716
2717 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2718 const struct tu_image_view *iview = fb->attachments[i].attachment;
2719 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2720 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2721 }
2722 }
2723
2724 void
2725 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2726 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2727 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2728 {
2729 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2730 pSubpassBeginInfo->contents);
2731 }
2732
2733 void
2734 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2735 {
2736 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2737 const struct tu_render_pass *pass = cmd->state.pass;
2738 struct tu_cs *cs = &cmd->draw_cs;
2739
2740 const struct tu_subpass *subpass = cmd->state.subpass++;
2741
2742 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2743
2744 if (subpass->resolve_attachments) {
2745 tu6_emit_blit_scissor(cmd, cs, true);
2746
2747 for (unsigned i = 0; i < subpass->color_count; i++) {
2748 uint32_t a = subpass->resolve_attachments[i].attachment;
2749 if (a == VK_ATTACHMENT_UNUSED)
2750 continue;
2751
2752 tu_store_gmem_attachment(cmd, cs, a,
2753 subpass->color_attachments[i].attachment);
2754
2755 if (pass->attachments[a].gmem_offset < 0)
2756 continue;
2757
2758 /* TODO:
2759 * check if the resolved attachment is needed by later subpasses,
2760           * if it is, we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.
2761 */
2762 tu_finishme("missing GMEM->GMEM resolve path\n");
2763 tu_load_gmem_attachment(cmd, cs, a, true);
2764 }
2765 }
2766
2767 tu_cond_exec_end(cs);
2768
2769 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2770
2771 tu6_emit_sysmem_resolves(cmd, cs, subpass);
2772
2773 tu_cond_exec_end(cs);
2774
2775 /* Handle dependencies for the next subpass */
2776 tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
2777
2778 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2779 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2780 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2781 tu6_emit_msaa(cs, cmd->state.subpass->samples);
2782 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2783 }
2784
2785 void
2786 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2787 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2788 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2789 {
2790 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2791 }
2792
2793 struct tu_draw_info
2794 {
2795 /**
2796 * Number of vertices.
2797 */
2798 uint32_t count;
2799
2800 /**
2801 * Index of the first vertex.
2802 */
2803 int32_t vertex_offset;
2804
2805 /**
2806 * First instance id.
2807 */
2808 uint32_t first_instance;
2809
2810 /**
2811 * Number of instances.
2812 */
2813 uint32_t instance_count;
2814
2815 /**
2816 * First index (indexed draws only).
2817 */
2818 uint32_t first_index;
2819
2820 /**
2821 * Whether it's an indexed draw.
2822 */
2823 bool indexed;
2824
2825 /**
2826 * Indirect draw parameters resource.
2827 */
2828 struct tu_buffer *indirect;
2829 uint64_t indirect_offset;
2830 uint32_t stride;
2831
2832 /**
2833 * Draw count parameters resource.
2834 */
2835 struct tu_buffer *count_buffer;
2836 uint64_t count_buffer_offset;
2837
2838 /**
2839 * Stream output parameters resource.
2840 */
2841 struct tu_buffer *streamout_buffer;
2842 uint64_t streamout_buffer_offset;
2843 };
2844
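     /* Enable masks for CP_SET_DRAW_STATE groups: they select in which
      * passes (binning, GMEM rendering, sysmem rendering) a group is
      * executed.
      */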
2845 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2846 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2847 #define ENABLE_NON_GMEM (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_SYSMEM)
2848
2849 enum tu_draw_state_group_id
2850 {
2851 TU_DRAW_STATE_PROGRAM,
2852 TU_DRAW_STATE_PROGRAM_BINNING,
2853 TU_DRAW_STATE_VB,
2854 TU_DRAW_STATE_VI,
2855 TU_DRAW_STATE_VI_BINNING,
2856 TU_DRAW_STATE_VP,
2857 TU_DRAW_STATE_RAST,
2858 TU_DRAW_STATE_DS,
2859 TU_DRAW_STATE_BLEND,
2860 TU_DRAW_STATE_VS_CONST,
2861 TU_DRAW_STATE_GS_CONST,
2862 TU_DRAW_STATE_FS_CONST,
2863 TU_DRAW_STATE_DESC_SETS,
2864 TU_DRAW_STATE_DESC_SETS_GMEM,
2865 TU_DRAW_STATE_DESC_SETS_LOAD,
2866 TU_DRAW_STATE_VS_PARAMS,
2867
2868 TU_DRAW_STATE_COUNT,
2869 };
2870
2871 struct tu_draw_state_group
2872 {
2873 enum tu_draw_state_group_id id;
2874 uint32_t enable_mask;
2875 struct tu_cs_entry ib;
2876 };
2877
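     /* Emit one stage's constants: push constants are written inline with
      * CP_LOAD_STATE6, and UBO ranges that the compiler promoted to
      * constants (ubo_state) are loaded indirectly from the UBO
      * descriptor's VA.
      */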
2878 static void
2879 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2880 struct tu_descriptor_state *descriptors_state,
2881 gl_shader_stage type,
2882 uint32_t *push_constants)
2883 {
2884 const struct tu_program_descriptor_linkage *link =
2885 &pipeline->program.link[type];
2886 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2887
2888 if (link->push_consts.count > 0) {
2889 unsigned num_units = link->push_consts.count;
2890 unsigned offset = link->push_consts.lo;
2891 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
2892 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
2893 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2894 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2895 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2896 CP_LOAD_STATE6_0_NUM_UNIT(num_units));
2897 tu_cs_emit(cs, 0);
2898 tu_cs_emit(cs, 0);
2899 for (unsigned i = 0; i < num_units * 4; i++)
2900 tu_cs_emit(cs, push_constants[i + offset * 4]);
2901 }
2902
2903 for (uint32_t i = 0; i < state->num_enabled; i++) {
2904 uint32_t size = state->range[i].end - state->range[i].start;
2905 uint32_t offset = state->range[i].start;
2906
2907       /* Even if the start of the range fits within the shader's constlen,
2908        * the end may not, so clamp the size accordingly:
2909        */
2910 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2911
2912 if (size == 0)
2913 continue;
2914
2915 /* things should be aligned to vec4: */
2916 debug_assert((state->range[i].offset % 16) == 0);
2917 debug_assert((size % 16) == 0);
2918 debug_assert((offset % 16) == 0);
2919
2920 /* Dig out the descriptor from the descriptor state and read the VA from
2921 * it.
2922 */
2923 assert(state->range[i].bindless);
2924 uint32_t *base = state->range[i].bindless_base == MAX_SETS ?
2925 descriptors_state->dynamic_descriptors :
2926 descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
2927 unsigned block = state->range[i].block;
2928 /* If the block in the shader here is in the dynamic descriptor set, it
2929 * is an index into the dynamic descriptor set which is combined from
2930 * dynamic descriptors and input attachments on-the-fly, and we don't
2931 * have access to it here. Instead we work backwards to get the index
2932 * into dynamic_descriptors.
2933 */
2934 if (state->range[i].bindless_base == MAX_SETS)
2935 block -= pipeline->layout->input_attachment_count;
2936 uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
2937 uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
2938 assert(va);
2939
2940 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2941 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2942 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2943 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2944 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2945 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2946 tu_cs_emit_qw(cs, va + offset);
2947 }
2948 }
2949
2950 static struct tu_cs_entry
2951 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2952 const struct tu_pipeline *pipeline,
2953 struct tu_descriptor_state *descriptors_state,
2954 gl_shader_stage type)
2955 {
2956 struct tu_cs cs;
2957 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2958
2959 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2960
2961 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2962 }
2963
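     /* Emit the VS driver params (currently only the base instance, at
      * IR3_DP_INSTID_BASE), or an empty entry when the shader doesn't read
      * driver params, i.e. driver_param lies beyond constlen.
      */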
2964 static VkResult
2965 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
2966 const struct tu_draw_info *draw,
2967 struct tu_cs_entry *entry)
2968 {
2969 /* TODO: fill out more than just base instance */
2970 const struct tu_program_descriptor_linkage *link =
2971 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
2972 const struct ir3_const_state *const_state = &link->const_state;
2973 struct tu_cs cs;
2974
2975 if (const_state->offsets.driver_param >= link->constlen) {
2976 *entry = (struct tu_cs_entry) {};
2977 return VK_SUCCESS;
2978 }
2979
2980 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
2981 if (result != VK_SUCCESS)
2982 return result;
2983
2984 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2985 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
2986 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2987 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2988 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
2989 CP_LOAD_STATE6_0_NUM_UNIT(1));
2990 tu_cs_emit(&cs, 0);
2991 tu_cs_emit(&cs, 0);
2992
2993 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
2994
2995 tu_cs_emit(&cs, 0);
2996 tu_cs_emit(&cs, 0);
2997 tu_cs_emit(&cs, draw->first_instance);
2998 tu_cs_emit(&cs, 0);
2999
3000 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3001 return VK_SUCCESS;
3002 }
3003
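     /* Emit VFD_FETCH base/size for every binding used by the pipeline, and
      * record which bindings have been set so tu_CmdBindPipeline can tell
      * when this state must be re-emitted.
      */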
3004 static struct tu_cs_entry
3005 tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
3006 const struct tu_pipeline *pipeline)
3007 {
3008 struct tu_cs cs;
3009 tu_cs_begin_sub_stream(&cmd->sub_cs, 4 * MAX_VBS, &cs);
3010
3011 int binding;
3012 for_each_bit(binding, pipeline->vi.bindings_used) {
3013 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3014 const VkDeviceSize offset = buf->bo_offset +
3015 cmd->state.vb.offsets[binding];
3016
3017 tu_cs_emit_regs(&cs,
3018 A6XX_VFD_FETCH_BASE(binding, .bo = buf->bo, .bo_offset = offset),
3019 A6XX_VFD_FETCH_SIZE(binding, buf->size - offset));
3020
3021 }
3022
3023 cmd->vertex_bindings_set = pipeline->vi.bindings_used;
3024
3025 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3026 }
3027
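     /* Emit the SP/HLSQ bindless base registers for all bound descriptor
      * sets. Input attachments and dynamic descriptors are combined into an
      * extra driver-owned set placed in the last slot; when gmem is set, the
      * input attachment descriptors in that set are patched to point at the
      * attachment's location in GMEM.
      */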
3028 static VkResult
3029 tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
3030 const struct tu_pipeline *pipeline,
3031 VkPipelineBindPoint bind_point,
3032 struct tu_cs_entry *entry,
3033 bool gmem)
3034 {
3035 struct tu_cs *draw_state = &cmd->sub_cs;
3036 struct tu_pipeline_layout *layout = pipeline->layout;
3037 struct tu_descriptor_state *descriptors_state =
3038 tu_get_descriptors_state(cmd, bind_point);
3039 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
3040 const uint32_t *input_attachment_idx =
3041 pipeline->program.input_attachment_idx;
3042 uint32_t num_dynamic_descs = layout->dynamic_offset_count +
3043 layout->input_attachment_count;
3044 struct ts_cs_memory dynamic_desc_set;
3045 VkResult result;
3046
3047 if (num_dynamic_descs > 0) {
3048 /* allocate and fill out dynamic descriptor set */
3049 result = tu_cs_alloc(draw_state, num_dynamic_descs,
3050 A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
3051 if (result != VK_SUCCESS)
3052 return result;
3053
3054 memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
3055 layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
3056
3057 if (gmem) {
3058 /* Patch input attachments to refer to GMEM instead */
3059 for (unsigned i = 0; i < layout->input_attachment_count; i++) {
3060 uint32_t *dst =
3061 &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
3062
3063 /* The compiler has already laid out input_attachment_idx in the
3064 * final order of input attachments, so there's no need to go
3065 * through the pipeline layout finding input attachments.
3066 */
3067 unsigned attachment_idx = input_attachment_idx[i];
3068
3069 /* It's possible for the pipeline layout to include an input
3070 * attachment which doesn't actually exist for the current
3071 * subpass. Of course, this is only valid so long as the pipeline
3072 * doesn't try to actually load that attachment. Just skip
3073 * patching in that scenario to avoid out-of-bounds accesses.
3074 */
3075 if (attachment_idx >= cmd->state.subpass->input_count)
3076 continue;
3077
3078 uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
3079 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3080
3081 assert(att->gmem_offset >= 0);
3082
3083 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
3084 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
3085 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
3086 dst[2] |=
3087 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
3088 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
3089 dst[3] = 0;
3090 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
3091 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
3092 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
3093 dst[i] = 0;
3094
3095 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
3096 tu_finishme("patch input attachment pitch for secondary cmd buffer");
3097 }
3098 }
3099
3100 memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
3101 descriptors_state->dynamic_descriptors,
3102 layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
3103 }
3104
3105 uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
3106 uint32_t hlsq_update_value;
3107 switch (bind_point) {
3108 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3109 sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
3110 hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
3111 hlsq_update_value = 0x7c000;
3112 break;
3113 case VK_PIPELINE_BIND_POINT_COMPUTE:
3114 sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
3115 hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
3116 hlsq_update_value = 0x3e00;
3117 break;
3118 default:
3119 unreachable("bad bind point");
3120 }
3121
3122 /* Be careful here to *not* refer to the pipeline, so that if only the
3123 * pipeline changes we don't have to emit this again (except if there are
3124 * dynamic descriptors in the pipeline layout). This means always emitting
3125 * all the valid descriptors, which means that we always have to put the
3126     * dynamic descriptor set in the driver-only slot at the end.
3127 */
3128 uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
3129 uint32_t num_sets = num_user_sets;
3130 if (num_dynamic_descs > 0) {
3131 num_user_sets = MAX_SETS;
3132 num_sets = num_user_sets + 1;
3133 }
3134
3135 unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
3136
3137 struct tu_cs cs;
3138 result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
3139 if (result != VK_SUCCESS)
3140 return result;
3141
3142 if (num_sets > 0) {
3143 for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
3144 tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
3145 for (unsigned j = 0; j < num_user_sets; j++) {
3146 if (descriptors_state->valid & (1 << j)) {
3147 /* magic | 3 copied from the blob */
3148 tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
3149 } else {
3150 tu_cs_emit_qw(&cs, 0 | 3);
3151 }
3152 }
3153 if (num_dynamic_descs > 0) {
3154 tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
3155 }
3156 }
3157
3158 tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
3159 }
3160
3161 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3162 return VK_SUCCESS;
3163 }
3164
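     /* Emit stream-output state: program each bound buffer's base/size and
      * either write the application-provided offset directly or reload the
      * offset the previous flush saved in the scratch BO, then enable or
      * disable SO according to streamout_enabled.
      */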
3165 static void
3166 tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
3167 {
3168 struct tu_streamout_state *tf = &cmd->state.pipeline->streamout;
3169
3170 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3171 struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
3172 if (!buf)
3173 continue;
3174
3175 uint32_t offset;
3176 offset = cmd->state.streamout_buf.offsets[i];
3177
3178 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo,
3179 .bo_offset = buf->bo_offset));
3180 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size));
3181
3182 if (cmd->state.streamout_reset & (1 << i)) {
3183 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset));
3184 cmd->state.streamout_reset &= ~(1 << i);
3185 } else {
3186 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
3187 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
3188 CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
3189 CP_MEM_TO_REG_0_CNT(0));
3190 tu_cs_emit_qw(cs, cmd->scratch_bo.iova +
3191 ctrl_offset(flush_base[i].offset));
3192 }
3193
3194 tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo,
3195 .bo_offset =
3196 ctrl_offset(flush_base[i])));
3197 }
3198
3199 if (cmd->state.streamout_enabled) {
3200 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
3201 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
3202 tu_cs_emit(cs, tf->vpc_so_buf_cntl);
3203 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0));
3204 tu_cs_emit(cs, tf->ncomp[0]);
3205 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1));
3206 tu_cs_emit(cs, tf->ncomp[1]);
3207 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2));
3208 tu_cs_emit(cs, tf->ncomp[2]);
3209 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3));
3210 tu_cs_emit(cs, tf->ncomp[3]);
3211 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
3212 tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
3213 for (unsigned i = 0; i < tf->prog_count; i++) {
3214 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
3215 tu_cs_emit(cs, tf->prog[i]);
3216 }
3217 } else {
3218 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
3219 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
3220 tu_cs_emit(cs, 0);
3221 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
3222 tu_cs_emit(cs, 0);
3223 }
3224 }
3225
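     /* Collect every piece of draw state that is dirty for this draw into
      * CP_SET_DRAW_STATE groups and emit them, along with any dynamic
      * register state that isn't baked into the pipeline.
      */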
3226 static VkResult
3227 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3228 struct tu_cs *cs,
3229 const struct tu_draw_info *draw)
3230 {
3231 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3232 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3233 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3234 uint32_t draw_state_group_count = 0;
3235 VkResult result;
3236
3237 struct tu_descriptor_state *descriptors_state =
3238 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3239
3240 /* TODO lrz */
3241
3242 tu_cs_emit_regs(cs,
3243 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3244 pipeline->ia.primitive_restart && draw->indexed));
3245
3246 if (cmd->state.dirty &
3247 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3248 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3249 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3250 dynamic->line_width);
3251 }
3252
3253 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3254 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3255 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3256 dynamic->stencil_compare_mask.back);
3257 }
3258
3259 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3260 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3261 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3262 dynamic->stencil_write_mask.back);
3263 }
3264
3265 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3266 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3267 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3268 dynamic->stencil_reference.back);
3269 }
3270
3271 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3272 (pipeline->dynamic_state.mask & TU_DYNAMIC_VIEWPORT)) {
3273 tu6_emit_viewport(cs, &cmd->state.dynamic.viewport.viewports[0]);
3274 }
3275
3276 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_SCISSOR) &&
3277 (pipeline->dynamic_state.mask & TU_DYNAMIC_SCISSOR)) {
3278 tu6_emit_scissor(cs, &cmd->state.dynamic.scissor.scissors[0]);
3279 }
3280
3281 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3282 draw_state_groups[draw_state_group_count++] =
3283 (struct tu_draw_state_group) {
3284 .id = TU_DRAW_STATE_PROGRAM,
3285 .enable_mask = ENABLE_DRAW,
3286 .ib = pipeline->program.state_ib,
3287 };
3288 draw_state_groups[draw_state_group_count++] =
3289 (struct tu_draw_state_group) {
3290 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3291 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3292 .ib = pipeline->program.binning_state_ib,
3293 };
3294 draw_state_groups[draw_state_group_count++] =
3295 (struct tu_draw_state_group) {
3296 .id = TU_DRAW_STATE_VI,
3297 .enable_mask = ENABLE_DRAW,
3298 .ib = pipeline->vi.state_ib,
3299 };
3300 draw_state_groups[draw_state_group_count++] =
3301 (struct tu_draw_state_group) {
3302 .id = TU_DRAW_STATE_VI_BINNING,
3303 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3304 .ib = pipeline->vi.binning_state_ib,
3305 };
3306 draw_state_groups[draw_state_group_count++] =
3307 (struct tu_draw_state_group) {
3308 .id = TU_DRAW_STATE_VP,
3309 .enable_mask = ENABLE_ALL,
3310 .ib = pipeline->vp.state_ib,
3311 };
3312 draw_state_groups[draw_state_group_count++] =
3313 (struct tu_draw_state_group) {
3314 .id = TU_DRAW_STATE_RAST,
3315 .enable_mask = ENABLE_ALL,
3316 .ib = pipeline->rast.state_ib,
3317 };
3318 draw_state_groups[draw_state_group_count++] =
3319 (struct tu_draw_state_group) {
3320 .id = TU_DRAW_STATE_DS,
3321 .enable_mask = ENABLE_ALL,
3322 .ib = pipeline->ds.state_ib,
3323 };
3324 draw_state_groups[draw_state_group_count++] =
3325 (struct tu_draw_state_group) {
3326 .id = TU_DRAW_STATE_BLEND,
3327 .enable_mask = ENABLE_ALL,
3328 .ib = pipeline->blend.state_ib,
3329 };
3330 }
3331
3332 if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3333 draw_state_groups[draw_state_group_count++] =
3334 (struct tu_draw_state_group) {
3335 .id = TU_DRAW_STATE_VS_CONST,
3336 .enable_mask = ENABLE_ALL,
3337 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3338 };
3339 draw_state_groups[draw_state_group_count++] =
3340 (struct tu_draw_state_group) {
3341 .id = TU_DRAW_STATE_GS_CONST,
3342 .enable_mask = ENABLE_ALL,
3343 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_GEOMETRY)
3344 };
3345 draw_state_groups[draw_state_group_count++] =
3346 (struct tu_draw_state_group) {
3347 .id = TU_DRAW_STATE_FS_CONST,
3348 .enable_mask = ENABLE_DRAW,
3349 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3350 };
3351 }
3352
3353 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3354 draw_state_groups[draw_state_group_count++] =
3355 (struct tu_draw_state_group) {
3356 .id = TU_DRAW_STATE_VB,
3357 .enable_mask = ENABLE_ALL,
3358 .ib = tu6_emit_vertex_buffers(cmd, pipeline)
3359 };
3360 }
3361
3362 if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
3363 tu6_emit_streamout(cmd, cs);
3364
3365    /* If there are any dynamic descriptors, then we may need to re-emit
3366 * them after every pipeline change in case the number of input attachments
3367 * changes. We also always need to re-emit after a pipeline change if there
3368 * are any input attachments, because the input attachment index comes from
3369 * the pipeline. Finally, it can also happen that the subpass changes
3370 * without the pipeline changing, in which case the GMEM descriptors need
3371 * to be patched differently.
3372 *
3373 * TODO: We could probably be clever and avoid re-emitting state on
3374 * pipeline changes if the number of input attachments is always 0. We
3375 * could also only re-emit dynamic state.
3376 */
3377 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3378 struct tu_cs_entry desc_sets, desc_sets_gmem;
3379 bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
3380
3381 result = tu6_emit_descriptor_sets(cmd, pipeline,
3382 VK_PIPELINE_BIND_POINT_GRAPHICS,
3383 &desc_sets, false);
3384 if (result != VK_SUCCESS)
3385 return result;
3386
3387 draw_state_groups[draw_state_group_count++] =
3388 (struct tu_draw_state_group) {
3389 .id = TU_DRAW_STATE_DESC_SETS,
3390 .enable_mask = need_gmem_desc_set ? ENABLE_NON_GMEM : ENABLE_ALL,
3391 .ib = desc_sets,
3392 };
3393
3394 if (need_gmem_desc_set) {
3395 result = tu6_emit_descriptor_sets(cmd, pipeline,
3396 VK_PIPELINE_BIND_POINT_GRAPHICS,
3397 &desc_sets_gmem, true);
3398 if (result != VK_SUCCESS)
3399 return result;
3400
3401 draw_state_groups[draw_state_group_count++] =
3402 (struct tu_draw_state_group) {
3403 .id = TU_DRAW_STATE_DESC_SETS_GMEM,
3404 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3405 .ib = desc_sets_gmem,
3406 };
3407 }
3408
3409 /* We need to reload the descriptors every time the descriptor sets
3410 * change. However, the commands we send only depend on the pipeline
3411 * because the whole point is to cache descriptors which are used by the
3412 * pipeline. There's a problem here, in that the firmware has an
3413 * "optimization" which skips executing groups that are set to the same
3414 * value as the last draw. This means that if the descriptor sets change
3415 * but not the pipeline, we'd try to re-execute the same buffer which
3416 * the firmware would ignore and we wouldn't pre-load the new
3417 * descriptors. The blob seems to re-emit the LOAD_STATE group whenever
3418 * the descriptor sets change, which we emulate here by copying the
3419 * pre-prepared buffer.
3420 */
3421 const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib;
3422 if (load_entry->size > 0) {
3423 struct tu_cs load_cs;
3424 result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs);
3425 if (result != VK_SUCCESS)
3426 return result;
3427 tu_cs_emit_array(&load_cs,
3428 (uint32_t *)((char *)load_entry->bo->map + load_entry->offset),
3429 load_entry->size / 4);
3430 struct tu_cs_entry load_copy = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs);
3431
3432 draw_state_groups[draw_state_group_count++] =
3433 (struct tu_draw_state_group) {
3434 .id = TU_DRAW_STATE_DESC_SETS_LOAD,
3435 /* The blob seems to not enable this for binning, even when
3436 * resources would actually be used in the binning shader.
3437 * Presumably the overhead of prefetching the resources isn't
3438 * worth it.
3439 */
3440 .enable_mask = ENABLE_DRAW,
3441 .ib = load_copy,
3442 };
3443 }
3444 }
3445
3446 struct tu_cs_entry vs_params;
3447 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3448 if (result != VK_SUCCESS)
3449 return result;
3450
3451 draw_state_groups[draw_state_group_count++] =
3452 (struct tu_draw_state_group) {
3453 .id = TU_DRAW_STATE_VS_PARAMS,
3454 .enable_mask = ENABLE_ALL,
3455 .ib = vs_params,
3456 };
3457
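   /* Emit all accumulated groups in a single CP_SET_DRAW_STATE packet: three
    * dwords per group, i.e. a header with the size in dwords, the enable mask
    * and the group id, followed by the 64-bit address of the group's IB.
    * Illustrative example: a 64-byte state IB is encoded as
    * COUNT(16) | enable_mask | GROUP_ID(id) plus the two iova dwords, while a
    * zero-sized IB gets the DISABLE bit and a null address so that group is
    * turned off instead of pointing at stale state.
    */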
3458 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3459 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3460 const struct tu_draw_state_group *group = &draw_state_groups[i];
3461 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3462 uint32_t cp_set_draw_state =
3463 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3464 group->enable_mask |
3465 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3466 uint64_t iova;
3467 if (group->ib.size) {
3468 iova = group->ib.bo->iova + group->ib.offset;
3469 } else {
3470 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3471 iova = 0;
3472 }
3473
3474 tu_cs_emit(cs, cp_set_draw_state);
3475 tu_cs_emit_qw(cs, iova);
3476 }
3477
3478 tu_cs_sanity_check(cs);
3479
3480 /* track BOs */
3481 if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
3482 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3483 const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
3484 if (buf) {
3485 tu_bo_list_add(&cmd->bo_list, buf->bo,
3486 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3487 }
3488 }
3489 }
3490
3491 /* There are too many graphics dirty bits to list here, so just list the
3492 * bits to preserve instead. The only things not emitted here are
3493 * compute-related state.
3494 */
3495 cmd->state.dirty &= (TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3496 return VK_SUCCESS;
3497 }
3498
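/* Indirect draws: the draw parameters (vertex/index count, instance count,
 * offsets) live in a buffer that the CP reads at draw time, so only the
 * indirect buffer address is emitted here, plus, for indexed draws, the index
 * buffer base and the maximum index count, presumably so the CP can bound the
 * index DMA. The indirect BO is added to the submit's BO list for CP reads.
 */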
3499 static void
3500 tu6_emit_draw_indirect(struct tu_cmd_buffer *cmd,
3501 struct tu_cs *cs,
3502 const struct tu_draw_info *draw)
3503 {
3504 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3505 bool has_gs = cmd->state.pipeline->active_stages &
3506 VK_SHADER_STAGE_GEOMETRY_BIT;
3507
3508 tu_cs_emit_regs(cs,
3509 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3510 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3511
3512 if (draw->indexed) {
3513 const enum a4xx_index_size index_size =
3514 tu6_index_size(cmd->state.index_type);
3515 const uint32_t index_bytes =
3516 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3517 const struct tu_buffer *index_buf = cmd->state.index_buffer;
3518       unsigned max_indices =
3519 (index_buf->size - cmd->state.index_offset) / index_bytes;
3520
3521 const uint32_t cp_draw_indx =
3522 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3523 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3524 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3525 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3526 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3527
3528 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_INDIRECT, 6);
3529 tu_cs_emit(cs, cp_draw_indx);
3530 tu_cs_emit_qw(cs, index_buf->bo->iova + cmd->state.index_offset);
3531       tu_cs_emit(cs, A5XX_CP_DRAW_INDX_INDIRECT_3_MAX_INDICES(max_indices));
3532 tu_cs_emit_qw(cs, draw->indirect->bo->iova + draw->indirect_offset);
3533 } else {
3534 const uint32_t cp_draw_indx =
3535 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3536 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3537 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3538 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3539
3540 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT, 3);
3541 tu_cs_emit(cs, cp_draw_indx);
3542 tu_cs_emit_qw(cs, draw->indirect->bo->iova + draw->indirect_offset);
3543 }
3544
3545 tu_bo_list_add(&cmd->bo_list, draw->indirect->bo, MSM_SUBMIT_BO_READ);
3546 }
3547
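/* Direct draws: the parameters are known at record time and are emitted
 * inline in a CP_DRAW_INDX_OFFSET packet (instance count, vertex/index count,
 * and for indexed draws also the index buffer address and size in bytes).
 */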
3548 static void
3549 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3550 struct tu_cs *cs,
3551 const struct tu_draw_info *draw)
3552 {
3554 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3555 bool has_gs = cmd->state.pipeline->active_stages &
3556 VK_SHADER_STAGE_GEOMETRY_BIT;
3557
3558 tu_cs_emit_regs(cs,
3559 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3560 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3561
3562 /* TODO hw binning */
3563 if (draw->indexed) {
3564 const enum a4xx_index_size index_size =
3565 tu6_index_size(cmd->state.index_type);
3566 const uint32_t index_bytes =
3567 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3568 const struct tu_buffer *buf = cmd->state.index_buffer;
3569 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3570 index_bytes * draw->first_index;
3571 const uint32_t size = index_bytes * draw->count;
3572
3573 const uint32_t cp_draw_indx =
3574 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3575 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3576 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3577 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3578 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3579
3580 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3581 tu_cs_emit(cs, cp_draw_indx);
3582 tu_cs_emit(cs, draw->instance_count);
3583 tu_cs_emit(cs, draw->count);
3584 tu_cs_emit(cs, 0x0); /* XXX */
3585 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3586 tu_cs_emit(cs, size);
3587 } else {
3588 const uint32_t cp_draw_indx =
3589 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3590 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3591 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3592 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3593
3594 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3595 tu_cs_emit(cs, cp_draw_indx);
3596 tu_cs_emit(cs, draw->instance_count);
3597 tu_cs_emit(cs, draw->count);
3598 }
3599 }
3600
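/* Common draw path: flush any caches needed inside the render pass, bind all
 * dirty draw state, emit the direct or indirect draw packet, and then emit a
 * FLUSH_SO_n event for each enabled streamout buffer.
 */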
3601 static void
3602 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3603 {
3604 struct tu_cs *cs = &cmd->draw_cs;
3605 VkResult result;
3606
3607 tu_emit_cache_flush_renderpass(cmd, cs);
3608
3609 result = tu6_bind_draw_states(cmd, cs, draw);
3610 if (result != VK_SUCCESS) {
3611 cmd->record_result = result;
3612 return;
3613 }
3614
3615 if (draw->indirect)
3616 tu6_emit_draw_indirect(cmd, cs, draw);
3617 else
3618 tu6_emit_draw_direct(cmd, cs, draw);
3619
3620 if (cmd->state.streamout_enabled) {
3621 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3622 if (cmd->state.streamout_enabled & (1 << i))
3623 tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
3624 }
3625 }
3626
3627 tu_cs_sanity_check(cs);
3628 }
3629
3630 void
3631 tu_CmdDraw(VkCommandBuffer commandBuffer,
3632 uint32_t vertexCount,
3633 uint32_t instanceCount,
3634 uint32_t firstVertex,
3635 uint32_t firstInstance)
3636 {
3637 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3638 struct tu_draw_info info = {};
3639
3640 info.count = vertexCount;
3641 info.instance_count = instanceCount;
3642 info.first_instance = firstInstance;
3643 info.vertex_offset = firstVertex;
3644
3645 tu_draw(cmd_buffer, &info);
3646 }
3647
3648 void
3649 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3650 uint32_t indexCount,
3651 uint32_t instanceCount,
3652 uint32_t firstIndex,
3653 int32_t vertexOffset,
3654 uint32_t firstInstance)
3655 {
3656 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3657 struct tu_draw_info info = {};
3658
3659 info.indexed = true;
3660 info.count = indexCount;
3661 info.instance_count = instanceCount;
3662 info.first_index = firstIndex;
3663 info.vertex_offset = vertexOffset;
3664 info.first_instance = firstInstance;
3665
3666 tu_draw(cmd_buffer, &info);
3667 }
3668
3669 void
3670 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3671 VkBuffer _buffer,
3672 VkDeviceSize offset,
3673 uint32_t drawCount,
3674 uint32_t stride)
3675 {
3676 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3677 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3678 struct tu_draw_info info = {};
3679
3680 info.count = drawCount;
3681 info.indirect = buffer;
3682 info.indirect_offset = offset;
3683 info.stride = stride;
3684
3685 tu_draw(cmd_buffer, &info);
3686 }
3687
3688 void
3689 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3690 VkBuffer _buffer,
3691 VkDeviceSize offset,
3692 uint32_t drawCount,
3693 uint32_t stride)
3694 {
3695 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3696 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3697 struct tu_draw_info info = {};
3698
3699 info.indexed = true;
3700 info.count = drawCount;
3701 info.indirect = buffer;
3702 info.indirect_offset = offset;
3703 info.stride = stride;
3704
3705 tu_draw(cmd_buffer, &info);
3706 }
3707
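/* VK_EXT_transform_feedback draw: the vertex count is not given directly but
 * is derived from the byte count stored in the counter buffer (written by a
 * previous CmdEndTransformFeedbackEXT) divided by vertexStride, so the
 * counter buffer, its offset and the stride are passed down to tu_draw via
 * the streamout_* and stride fields.
 */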
3708 void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3709 uint32_t instanceCount,
3710 uint32_t firstInstance,
3711 VkBuffer _counterBuffer,
3712 VkDeviceSize counterBufferOffset,
3713 uint32_t counterOffset,
3714 uint32_t vertexStride)
3715 {
3716 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3717 TU_FROM_HANDLE(tu_buffer, buffer, _counterBuffer);
3718
3719 struct tu_draw_info info = {};
3720
3721 info.instance_count = instanceCount;
3722 info.first_instance = firstInstance;
3723 info.streamout_buffer = buffer;
3724 info.streamout_buffer_offset = counterBufferOffset;
3725 info.stride = vertexStride;
3726
3727 tu_draw(cmd_buffer, &info);
3728 }
3729
3730 struct tu_dispatch_info
3731 {
3732 /**
3733 * Determine the layout of the grid (in block units) to be used.
3734 */
3735 uint32_t blocks[3];
3736
3737 /**
3738 * A starting offset for the grid. If unaligned is set, the offset
3739 * must still be aligned.
3740 */
3741 uint32_t offsets[3];
3742 /**
3743 * Whether it's an unaligned compute dispatch.
3744 */
3745 bool unaligned;
3746
3747 /**
3748 * Indirect compute parameters resource.
3749 */
3750 struct tu_buffer *indirect;
3751 uint64_t indirect_offset;
3752 };
3753
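/* Upload the compute "driver params" (workgroup counts and local group size)
 * as constants at the constant-file offset ir3 reserved for them; this is
 * skipped entirely when the shader doesn't use them (constlen ends before
 * that offset). Illustrative example: vkCmdDispatch(4, 2, 1) with an 8x8x1
 * local size uploads workgroup counts (4, 2, 1) and local sizes (8, 8, 1).
 */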
3754 static void
3755 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3756 const struct tu_dispatch_info *info)
3757 {
3758 gl_shader_stage type = MESA_SHADER_COMPUTE;
3759 const struct tu_program_descriptor_linkage *link =
3760 &pipeline->program.link[type];
3761 const struct ir3_const_state *const_state = &link->const_state;
3762 uint32_t offset = const_state->offsets.driver_param;
3763
3764 if (link->constlen <= offset)
3765 return;
3766
3767 if (!info->indirect) {
3768 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3769 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3770 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3771 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3772 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3773 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3774 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3775 };
3776
3777 uint32_t num_consts = MIN2(const_state->num_driver_params,
3778 (link->constlen - offset) * 4);
3779 /* push constants */
3780 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3781 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3782 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3783 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3784 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3785 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3786 tu_cs_emit(cs, 0);
3787 tu_cs_emit(cs, 0);
3788 uint32_t i;
3789 for (i = 0; i < num_consts; i++)
3790 tu_cs_emit(cs, driver_params[i]);
3791 } else {
3792 tu_finishme("Indirect driver params");
3793 }
3794 }
3795
3796 static void
3797 tu_dispatch(struct tu_cmd_buffer *cmd,
3798 const struct tu_dispatch_info *info)
3799 {
3800 struct tu_cs *cs = &cmd->cs;
3801 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3802 struct tu_descriptor_state *descriptors_state =
3803 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3804 VkResult result;
3805
3806 /* TODO: We could probably flush less if we add a compute_flush_bits
3807 * bitfield.
3808 */
3809 tu_emit_cache_flush(cmd, cs);
3810
3811 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3812 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3813
3814 struct tu_cs_entry ib;
3815
3816 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3817 if (ib.size)
3818 tu_cs_emit_ib(cs, &ib);
3819
3820 tu_emit_compute_driver_params(cs, pipeline, info);
3821
3822 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
3823 result = tu6_emit_descriptor_sets(cmd, pipeline,
3824 VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
3825 false);
3826 if (result != VK_SUCCESS) {
3827 cmd->record_result = result;
3828 return;
3829 }
3830 }
3831
3832 if (ib.size)
3833 tu_cs_emit_ib(cs, &ib);
3834
3835 if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
3836 pipeline->load_state.state_ib.size > 0) {
3837 tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
3838 }
3839
3840 cmd->state.dirty &=
3841 ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3842
3843 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3844 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3845
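   /* Program the compute grid: the HLSQ_CS_NDRANGE registers take the local
    * size minus one per dimension and the global size in invocations, i.e.
    * local_size[i] * num_groups[i]. Illustrative example: an 8x8x1 local size
    * with 4x2x1 groups gives localsize fields 7,7,0 and global sizes 32,16,1.
    */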
3846 const uint32_t *local_size = pipeline->compute.local_size;
3847 const uint32_t *num_groups = info->blocks;
3848 tu_cs_emit_regs(cs,
3849 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3850 .localsizex = local_size[0] - 1,
3851 .localsizey = local_size[1] - 1,
3852 .localsizez = local_size[2] - 1),
3853 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3854 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3855 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3856 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3857 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3858 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3859
3860 tu_cs_emit_regs(cs,
3861 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3862 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3863 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3864
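   /* For an indirect dispatch the CP reads the x/y/z group counts from the
    * indirect buffer (CP_EXEC_CS_INDIRECT), with the local size supplied in
    * the packet; for a direct dispatch the group counts are emitted inline
    * with CP_EXEC_CS.
    */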
3865 if (info->indirect) {
3866 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3867
3868 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3869 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3870
3871 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3872 tu_cs_emit(cs, 0x00000000);
3873 tu_cs_emit_qw(cs, iova);
3874 tu_cs_emit(cs,
3875 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3876 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3877 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3878 } else {
3879 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3880 tu_cs_emit(cs, 0x00000000);
3881 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3882 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3883 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3884 }
3885
3886 tu_cs_emit_wfi(cs);
3887 }
3888
3889 void
3890 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3891 uint32_t base_x,
3892 uint32_t base_y,
3893 uint32_t base_z,
3894 uint32_t x,
3895 uint32_t y,
3896 uint32_t z)
3897 {
3898 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3899 struct tu_dispatch_info info = {};
3900
3901 info.blocks[0] = x;
3902 info.blocks[1] = y;
3903 info.blocks[2] = z;
3904
3905 info.offsets[0] = base_x;
3906 info.offsets[1] = base_y;
3907 info.offsets[2] = base_z;
3908 tu_dispatch(cmd_buffer, &info);
3909 }
3910
3911 void
3912 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3913 uint32_t x,
3914 uint32_t y,
3915 uint32_t z)
3916 {
3917 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3918 }
3919
3920 void
3921 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3922 VkBuffer _buffer,
3923 VkDeviceSize offset)
3924 {
3925 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3926 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3927 struct tu_dispatch_info info = {};
3928
3929 info.indirect = buffer;
3930 info.indirect_offset = offset;
3931
3932 tu_dispatch(cmd_buffer, &info);
3933 }
3934
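/* Ending the render pass is where the recorded draw_cs actually gets
 * executed: once for sysmem rendering, or replayed per tile when rendering
 * to GMEM. Afterwards the per-renderpass command streams are discarded and
 * re-opened for the next pass, any cache flushes accumulated during the pass
 * are carried over to the outside-renderpass cache state, and the render
 * pass end barrier is applied.
 */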
3935 void
3936 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3937 {
3938 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3939
3940 tu_cs_end(&cmd_buffer->draw_cs);
3941 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3942
3943 if (use_sysmem_rendering(cmd_buffer))
3944 tu_cmd_render_sysmem(cmd_buffer);
3945 else
3946 tu_cmd_render_tiles(cmd_buffer);
3947
3948 /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
3949 rendered */
3950 tu_cs_discard_entries(&cmd_buffer->draw_cs);
3951 tu_cs_begin(&cmd_buffer->draw_cs);
3952 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
3953 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
3954
3955 cmd_buffer->state.cache.pending_flush_bits |=
3956 cmd_buffer->state.renderpass_cache.pending_flush_bits;
3957 tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
3958
3959 cmd_buffer->state.pass = NULL;
3960 cmd_buffer->state.subpass = NULL;
3961 cmd_buffer->state.framebuffer = NULL;
3962 }
3963
3964 void
3965 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
3966 const VkSubpassEndInfoKHR *pSubpassEndInfo)
3967 {
3968 tu_CmdEndRenderPass(commandBuffer);
3969 }
3970
3971 struct tu_barrier_info
3972 {
3973 uint32_t eventCount;
3974 const VkEvent *pEvents;
3975 VkPipelineStageFlags srcStageMask;
3976 };
3977
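/* Shared implementation of CmdPipelineBarrier and CmdWaitEvents: the src/dst
 * access masks from all the barrier structs are OR'd together, translated
 * into internal cache-access flags with vk2tu_access() and recorded in the
 * relevant cache state (the renderpass cache inside a render pass, the global
 * one otherwise). For WaitEvents, a CP_WAIT_REG_MEM is additionally emitted
 * per event to stall the CP until the event's BO contains 1, i.e. until a
 * prior CmdSetEvent has landed.
 */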
3978 static void
3979 tu_barrier(struct tu_cmd_buffer *cmd,
3980 uint32_t memoryBarrierCount,
3981 const VkMemoryBarrier *pMemoryBarriers,
3982 uint32_t bufferMemoryBarrierCount,
3983 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
3984 uint32_t imageMemoryBarrierCount,
3985 const VkImageMemoryBarrier *pImageMemoryBarriers,
3986 const struct tu_barrier_info *info)
3987 {
3988 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
3989 VkAccessFlags srcAccessMask = 0;
3990 VkAccessFlags dstAccessMask = 0;
3991
3992 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3993 srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
3994 dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
3995 }
3996
3997 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
3998 srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
3999 dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
4000 }
4001
4002 enum tu_cmd_access_mask src_flags = 0;
4003 enum tu_cmd_access_mask dst_flags = 0;
4004
4005 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
4006 TU_FROM_HANDLE(tu_image, image, pImageMemoryBarriers[i].image);
4007 VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
4008 /* For non-linear images, PREINITIALIZED is the same as UNDEFINED */
4009 if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
4010 (image->tiling != VK_IMAGE_TILING_LINEAR &&
4011 old_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
4012 /* The underlying memory for this image may have been used earlier
4013 * within the same queue submission for a different image, which
4014 * means that there may be old, stale cache entries which are in the
4015 * "wrong" location, which could cause problems later after writing
4016 * to the image. We don't want these entries being flushed later and
4017 * overwriting the actual image, so we need to flush the CCU.
4018 */
4019 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4020 }
4021 srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
4022 dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
4023 }
4024
4025 /* Inside a renderpass, we don't know yet whether we'll be using sysmem
4026 * so we have to use the sysmem flushes.
4027 */
4028 bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
4029 !cmd->state.pass;
4030 src_flags |= vk2tu_access(srcAccessMask, gmem);
4031 dst_flags |= vk2tu_access(dstAccessMask, gmem);
4032
4033 struct tu_cache_state *cache =
4034 cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
4035 tu_flush_for_access(cache, src_flags, dst_flags);
4036
4037 for (uint32_t i = 0; i < info->eventCount; i++) {
4038 TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
4039
4040 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4041
4042 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4043 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4044 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4045 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4046 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4047 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4048 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4049 }
4050 }
4051
4052 void
4053 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4054 VkPipelineStageFlags srcStageMask,
4055 VkPipelineStageFlags dstStageMask,
4056 VkDependencyFlags dependencyFlags,
4057 uint32_t memoryBarrierCount,
4058 const VkMemoryBarrier *pMemoryBarriers,
4059 uint32_t bufferMemoryBarrierCount,
4060 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4061 uint32_t imageMemoryBarrierCount,
4062 const VkImageMemoryBarrier *pImageMemoryBarriers)
4063 {
4064 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4065 struct tu_barrier_info info;
4066
4067 info.eventCount = 0;
4068 info.pEvents = NULL;
4069 info.srcStageMask = srcStageMask;
4070
4071 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4072 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4073 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4074 }
4075
4076 static void
4077 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
4078 VkPipelineStageFlags stageMask, unsigned value)
4079 {
4080 struct tu_cs *cs = &cmd->cs;
4081
4082 /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
4083 assert(!cmd->state.pass);
4084
4085 tu_emit_cache_flush(cmd, cs);
4086
4087 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4088
4089 /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
4090 * read by the CP, so the draw indirect stage counts as top-of-pipe too.
4091 */
4092 VkPipelineStageFlags top_of_pipe_flags =
4093 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
4094 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
4095
4096 if (!(stageMask & ~top_of_pipe_flags)) {
4097 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4098 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4099 tu_cs_emit(cs, value);
4100 } else {
4101 /* Use a RB_DONE_TS event to wait for everything to complete. */
4102 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
4103 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
4104 tu_cs_emit_qw(cs, event->bo.iova);
4105 tu_cs_emit(cs, value);
4106 }
4107 }
4108
4109 void
4110 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4111 VkEvent _event,
4112 VkPipelineStageFlags stageMask)
4113 {
4114 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4115 TU_FROM_HANDLE(tu_event, event, _event);
4116
4117 write_event(cmd, event, stageMask, 1);
4118 }
4119
4120 void
4121 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4122 VkEvent _event,
4123 VkPipelineStageFlags stageMask)
4124 {
4125 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4126 TU_FROM_HANDLE(tu_event, event, _event);
4127
4128 write_event(cmd, event, stageMask, 0);
4129 }
4130
4131 void
4132 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4133 uint32_t eventCount,
4134 const VkEvent *pEvents,
4135 VkPipelineStageFlags srcStageMask,
4136 VkPipelineStageFlags dstStageMask,
4137 uint32_t memoryBarrierCount,
4138 const VkMemoryBarrier *pMemoryBarriers,
4139 uint32_t bufferMemoryBarrierCount,
4140 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4141 uint32_t imageMemoryBarrierCount,
4142 const VkImageMemoryBarrier *pImageMemoryBarriers)
4143 {
4144 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4145 struct tu_barrier_info info;
4146
4147 info.eventCount = eventCount;
4148 info.pEvents = pEvents;
4149 info.srcStageMask = 0;
4150
4151 tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
4152 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4153 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4154 }
4155
4156 void
4157 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4158 {
4159 /* No-op */
4160 }