turnip: remove duplicated stage2opcode and stage2shaderdb
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36
37 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
38
39 void
40 tu_bo_list_init(struct tu_bo_list *list)
41 {
42 list->count = list->capacity = 0;
43 list->bo_infos = NULL;
44 }
45
46 void
47 tu_bo_list_destroy(struct tu_bo_list *list)
48 {
49 free(list->bo_infos);
50 }
51
52 void
53 tu_bo_list_reset(struct tu_bo_list *list)
54 {
55 list->count = 0;
56 }
57
58 /**
59 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
60 */
61 static uint32_t
62 tu_bo_list_add_info(struct tu_bo_list *list,
63 const struct drm_msm_gem_submit_bo *bo_info)
64 {
65 assert(bo_info->handle != 0);
66
67 for (uint32_t i = 0; i < list->count; ++i) {
68 if (list->bo_infos[i].handle == bo_info->handle) {
69 assert(list->bo_infos[i].presumed == bo_info->presumed);
70 list->bo_infos[i].flags |= bo_info->flags;
71 return i;
72 }
73 }
74
75 /* grow list->bo_infos if needed */
76 if (list->count == list->capacity) {
77 uint32_t new_capacity = MAX2(2 * list->count, 16);
78 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
79 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
80 if (!new_bo_infos)
81 return TU_BO_LIST_FAILED;
82 list->bo_infos = new_bo_infos;
83 list->capacity = new_capacity;
84 }
85
86 list->bo_infos[list->count] = *bo_info;
87 return list->count++;
88 }
89
90 uint32_t
91 tu_bo_list_add(struct tu_bo_list *list,
92 const struct tu_bo *bo,
93 uint32_t flags)
94 {
95 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
96 .flags = flags,
97 .handle = bo->gem_handle,
98 .presumed = bo->iova,
99 });
100 }
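/* Example: because tu_bo_list_add_info() de-duplicates on the GEM handle and
 * ORs the submit flags together, adding the same BO first for reading and
 * then for writing yields a single entry with both flags set (a sketch; flag
 * names from msm_drm.h, error handling omitted):
 *
 *    uint32_t first  = tu_bo_list_add(&list, bo, MSM_SUBMIT_BO_READ);
 *    uint32_t second = tu_bo_list_add(&list, bo, MSM_SUBMIT_BO_WRITE);
 *    assert(first == second);   // same slot, flags are now READ | WRITE
 */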
101
102 VkResult
103 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
104 {
105 for (uint32_t i = 0; i < other->count; i++) {
106 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
107 return VK_ERROR_OUT_OF_HOST_MEMORY;
108 }
109
110 return VK_SUCCESS;
111 }
112
113 static void
114 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
115 const struct tu_device *dev,
116 const struct tu_render_pass *pass)
117 {
118 const uint32_t tile_align_w = pass->tile_align_w;
119 const uint32_t max_tile_width = 1024;
120
121 /* note: don't offset the tiling config by render_area.offset,
122 * because the binning pass can't deal with it.
123 * This means we might end up with more tiles than necessary,
124 * but load/store/etc are still scissored to the render_area.
125 */
126 tiling->tile0.offset = (VkOffset2D) {};
127
128 const uint32_t ra_width =
129 tiling->render_area.extent.width +
130 (tiling->render_area.offset.x - tiling->tile0.offset.x);
131 const uint32_t ra_height =
132 tiling->render_area.extent.height +
133 (tiling->render_area.offset.y - tiling->tile0.offset.y);
134
135 /* start from 1 tile */
136 tiling->tile_count = (VkExtent2D) {
137 .width = 1,
138 .height = 1,
139 };
140 tiling->tile0.extent = (VkExtent2D) {
141 .width = util_align_npot(ra_width, tile_align_w),
142 .height = align(ra_height, TILE_ALIGN_H),
143 };
144
145 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
146 /* start with 2x2 tiles */
147 tiling->tile_count.width = 2;
148 tiling->tile_count.height = 2;
149 tiling->tile0.extent.width = util_align_npot(DIV_ROUND_UP(ra_width, 2), tile_align_w);
150 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), TILE_ALIGN_H);
151 }
152
153 /* do not exceed max tile width */
154 while (tiling->tile0.extent.width > max_tile_width) {
155 tiling->tile_count.width++;
156 tiling->tile0.extent.width =
157 util_align_npot(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
158 }
159
160 /* zero gmem_pixels will force sysmem rendering, so don't bother trying to have a valid tile config
161 * TODO: just skip all GMEM stuff when sysmem is forced?
162 */
163 if (!pass->gmem_pixels)
164 return;
165
166 /* do not exceed gmem size */
167 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pass->gmem_pixels) {
168 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
169 tiling->tile_count.width++;
170 tiling->tile0.extent.width =
171 util_align_npot(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
172 } else {
173 /* if this assert fails then layout is impossible.. */
174 assert(tiling->tile0.extent.height > TILE_ALIGN_H);
175 tiling->tile_count.height++;
176 tiling->tile0.extent.height =
177 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), TILE_ALIGN_H);
178 }
179 }
180 }
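/* Worked example (illustrative numbers): for a 1920x1080 render area with
 * tile_align_w = 32, TILE_ALIGN_H = 16 and gmem_pixels = 262144 (all assumed
 * values), the loops above proceed roughly as follows:
 *
 *    1 tile of 1920x1088    -> width > 1024, split width -> 2x1 tiles of 960x1088
 *    960*1088 > gmem_pixels, width not > height          -> 2x2 tiles of 960x544
 *    960*544  > gmem_pixels, width > height               -> 3x2 tiles of 640x544
 *    640*544  > gmem_pixels, width > height               -> 4x2 tiles of 480x544
 *    480*544 <= gmem_pixels                               -> done: 4x2 tiles of 480x544
 */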
181
182 static void
183 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
184 const struct tu_device *dev)
185 {
186 const uint32_t max_pipe_count = 32; /* A6xx */
187
188 /* start from 1 tile per pipe */
189 tiling->pipe0 = (VkExtent2D) {
190 .width = 1,
191 .height = 1,
192 };
193 tiling->pipe_count = tiling->tile_count;
194
195 while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
196 if (tiling->pipe0.width < tiling->pipe0.height) {
197 tiling->pipe0.width += 1;
198 tiling->pipe_count.width =
199 DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
200 } else {
201 tiling->pipe0.height += 1;
202 tiling->pipe_count.height =
203 DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
204 }
205 }
206 }
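/* Another illustrative example: a 9x7 tile grid needs 63 pipes at one tile
 * per pipe, so the loop above grows pipe0 until the pipe grid fits in 32:
 *
 *    pipe0 1x1 -> 9x7 = 63 pipes   (too many; width == height, grow height)
 *    pipe0 1x2 -> 9x4 = 36 pipes   (too many; width < height, grow width)
 *    pipe0 2x2 -> 5x4 = 20 pipes   (fits)
 */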
207
208 static void
209 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
210 const struct tu_device *dev)
211 {
212 const uint32_t max_pipe_count = 32; /* A6xx */
213 const uint32_t used_pipe_count =
214 tiling->pipe_count.width * tiling->pipe_count.height;
215 const VkExtent2D last_pipe = {
216 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
217 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
218 };
219
220 assert(used_pipe_count <= max_pipe_count);
221 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
222
223 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
224 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
225 const uint32_t pipe_x = tiling->pipe0.width * x;
226 const uint32_t pipe_y = tiling->pipe0.height * y;
227 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
228 ? last_pipe.width
229 : tiling->pipe0.width;
230 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
231 ? last_pipe.height
232 : tiling->pipe0.height;
233 const uint32_t n = tiling->pipe_count.width * y + x;
234
235 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
236 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
237 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
238 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
239 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
240 }
241 }
242
243 memset(tiling->pipe_config + used_pipe_count, 0,
244 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
245 }
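/* With the 9x7 / pipe0 = 2x2 example above, last_pipe works out to 1x1:
 * pipes in the right-most column cover only one tile horizontally and pipes
 * in the bottom row only one tile vertically, so their VSC_PIPE_CONFIG
 * entries get the smaller width/height and a correspondingly smaller
 * pipe_sizes value.
 */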
246
247 static void
248 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
249 const struct tu_device *dev,
250 uint32_t tx,
251 uint32_t ty,
252 struct tu_tile *tile)
253 {
254 /* find the pipe and the slot for tile (tx, ty) */
255 const uint32_t px = tx / tiling->pipe0.width;
256 const uint32_t py = ty / tiling->pipe0.height;
257 const uint32_t sx = tx - tiling->pipe0.width * px;
258 const uint32_t sy = ty - tiling->pipe0.height * py;
259 /* last pipe has different width */
260 const uint32_t pipe_width =
261 MIN2(tiling->pipe0.width,
262 tiling->tile_count.width - px * tiling->pipe0.width);
263
264 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
265 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
266 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
267
268 /* convert to 1D indices */
269 tile->pipe = tiling->pipe_count.width * py + px;
270 tile->slot = pipe_width * sy + sx;
271
272 /* get the blit area for the tile */
273 tile->begin = (VkOffset2D) {
274 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
275 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
276 };
277 tile->end.x =
278 (tx == tiling->tile_count.width - 1)
279 ? tiling->render_area.offset.x + tiling->render_area.extent.width
280 : tile->begin.x + tiling->tile0.extent.width;
281 tile->end.y =
282 (ty == tiling->tile_count.height - 1)
283 ? tiling->render_area.offset.y + tiling->render_area.extent.height
284 : tile->begin.y + tiling->tile0.extent.height;
285 }
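/* Example lookup with the numbers above (tile_count 9x7, pipe0 2x2,
 * pipe_count 5x4): tile (tx=3, ty=2) lands in pipe (px=1, py=1), i.e.
 * tile->pipe = 5*1 + 1 = 6, with in-pipe coordinates (sx=1, sy=0); that
 * pipe still spans the full 2 tiles horizontally, so tile->slot = 2*0 + 1 = 1.
 */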
286
287 enum a3xx_msaa_samples
288 tu_msaa_samples(uint32_t samples)
289 {
290 switch (samples) {
291 case 1:
292 return MSAA_ONE;
293 case 2:
294 return MSAA_TWO;
295 case 4:
296 return MSAA_FOUR;
297 case 8:
298 return MSAA_EIGHT;
299 default:
300 assert(!"invalid sample count");
301 return MSAA_ONE;
302 }
303 }
304
305 static enum a4xx_index_size
306 tu6_index_size(VkIndexType type)
307 {
308 switch (type) {
309 case VK_INDEX_TYPE_UINT16:
310 return INDEX4_SIZE_16_BIT;
311 case VK_INDEX_TYPE_UINT32:
312 return INDEX4_SIZE_32_BIT;
313 default:
314 unreachable("invalid VkIndexType");
315 return INDEX4_SIZE_8_BIT;
316 }
317 }
318
319 void
320 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
321 struct tu_cs *cs,
322 enum vgt_event_type event)
323 {
324 bool need_seqno = false;
325 switch (event) {
326 case CACHE_FLUSH_TS:
327 case WT_DONE_TS:
328 case RB_DONE_TS:
329 case PC_CCU_FLUSH_DEPTH_TS:
330 case PC_CCU_FLUSH_COLOR_TS:
331 case PC_CCU_RESOLVE_TS:
332 need_seqno = true;
333 break;
334 default:
335 break;
336 }
337
338 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
339 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
340 if (need_seqno) {
341 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
342 tu_cs_emit(cs, 0);
343 }
344 }
345
346 static void
347 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
348 struct tu_cs *cs,
349 enum tu_cmd_flush_bits flushes)
350 {
351 /* Experiments show that invalidating CCU while it still has data in it
352 * doesn't work, so make sure to always flush before invalidating in case
353 * any data remains that hasn't yet been made available through a barrier.
354 * However it does seem to work for UCHE.
355 */
356 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
357 TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
358 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
359 if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
360 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
361 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
362 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
363 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
364 if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
365 tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
366 if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
367 tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
368 if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
369 tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
370 if (flushes & TU_CMD_FLAG_WFI)
371 tu_cs_emit_wfi(cs);
372 }
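/* As an example of how these bits get used: a barrier that makes color
 * attachment writes visible to texture fetches would typically leave
 * TU_CMD_FLAG_CCU_FLUSH_COLOR and TU_CMD_FLAG_CACHE_INVALIDATE (and possibly
 * TU_CMD_FLAG_WFI) accumulated in the relevant cache state, to be emitted
 * here the next time a flush point is reached.
 */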
373
374 /* "Normal" cache flushes, that don't require any special handling */
375
376 static void
377 tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
378 struct tu_cs *cs)
379 {
380 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
381 cmd_buffer->state.cache.flush_bits = 0;
382 }
383
384 /* Renderpass cache flushes */
385
386 void
387 tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
388 struct tu_cs *cs)
389 {
390 tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
391 cmd_buffer->state.renderpass_cache.flush_bits = 0;
392 }
393
394 /* Cache flushes for things that use the color/depth read/write path (i.e.
395 * blits and draws). This deals with changing CCU state as well as the usual
396 * cache flushing.
397 */
398
399 void
400 tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
401 struct tu_cs *cs,
402 enum tu_cmd_ccu_state ccu_state)
403 {
404 enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;
405
406 assert(ccu_state != TU_CMD_CCU_UNKNOWN);
407
408 /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
409 * the CCU may also contain data that we haven't flushed out yet, so we
410 * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
411 * emit a WFI as it isn't pipelined.
412 */
413 if (ccu_state != cmd_buffer->state.ccu_state) {
414 if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
415 flushes |=
416 TU_CMD_FLAG_CCU_FLUSH_COLOR |
417 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
418 cmd_buffer->state.cache.pending_flush_bits &= ~(
419 TU_CMD_FLAG_CCU_FLUSH_COLOR |
420 TU_CMD_FLAG_CCU_FLUSH_DEPTH);
421 }
422 flushes |=
423 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
424 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
425 TU_CMD_FLAG_WFI;
426 cmd_buffer->state.cache.pending_flush_bits &= ~(
427 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
428 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH);
429 }
430
431 tu6_emit_flushes(cmd_buffer, cs, flushes);
432 cmd_buffer->state.cache.flush_bits = 0;
433
434 if (ccu_state != cmd_buffer->state.ccu_state) {
435 struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
436 tu_cs_emit_regs(cs,
437 A6XX_RB_CCU_CNTL(.offset =
438 ccu_state == TU_CMD_CCU_GMEM ?
439 phys_dev->ccu_offset_gmem :
440 phys_dev->ccu_offset_bypass,
441 .gmem = ccu_state == TU_CMD_CCU_GMEM));
442 cmd_buffer->state.ccu_state = ccu_state;
443 }
444 }
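/* Callers switch the CCU state at render-pass granularity: the GMEM path
 * calls tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM) in
 * tu6_tile_render_begin() and the sysmem path uses TU_CMD_CCU_SYSMEM in
 * tu6_sysmem_render_begin(), both further down in this file.
 */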
445
446 static void
447 tu6_emit_zs(struct tu_cmd_buffer *cmd,
448 const struct tu_subpass *subpass,
449 struct tu_cs *cs)
450 {
451 const struct tu_framebuffer *fb = cmd->state.framebuffer;
452
453 const uint32_t a = subpass->depth_stencil_attachment.attachment;
454 if (a == VK_ATTACHMENT_UNUSED) {
455 tu_cs_emit_regs(cs,
456 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
457 A6XX_RB_DEPTH_BUFFER_PITCH(0),
458 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
459 A6XX_RB_DEPTH_BUFFER_BASE(0),
460 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
461
462 tu_cs_emit_regs(cs,
463 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
464
465 tu_cs_emit_regs(cs,
466 A6XX_GRAS_LRZ_BUFFER_BASE(0),
467 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
468 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
469
470 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
471
472 return;
473 }
474
475 const struct tu_image_view *iview = fb->attachments[a].attachment;
476 const struct tu_render_pass_attachment *attachment =
477 &cmd->state.pass->attachments[a];
478 enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
479
480 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
481 tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
482 tu_cs_image_ref(cs, iview, 0);
483 tu_cs_emit(cs, attachment->gmem_offset);
484
485 tu_cs_emit_regs(cs,
486 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
487
488 tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3);
489 tu_cs_image_flag_ref(cs, iview, 0);
490
491 tu_cs_emit_regs(cs,
492 A6XX_GRAS_LRZ_BUFFER_BASE(0),
493 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
494 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
495
496 if (attachment->format == VK_FORMAT_S8_UINT) {
497 tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
498 tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
499 tu_cs_image_ref(cs, iview, 0);
500 tu_cs_emit(cs, attachment->gmem_offset);
501 } else {
502 tu_cs_emit_regs(cs,
503 A6XX_RB_STENCIL_INFO(0));
504 }
505 }
506
507 static void
508 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
509 const struct tu_subpass *subpass,
510 struct tu_cs *cs)
511 {
512 const struct tu_framebuffer *fb = cmd->state.framebuffer;
513
514 for (uint32_t i = 0; i < subpass->color_count; ++i) {
515 uint32_t a = subpass->color_attachments[i].attachment;
516 if (a == VK_ATTACHMENT_UNUSED)
517 continue;
518
519 const struct tu_image_view *iview = fb->attachments[a].attachment;
520
521 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
522 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
523 tu_cs_image_ref(cs, iview, 0);
524 tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
525
526 tu_cs_emit_regs(cs,
527 A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));
528
529 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR_LO(i), 3);
530 tu_cs_image_flag_ref(cs, iview, 0);
531 }
532
533 tu_cs_emit_regs(cs,
534 A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
535 tu_cs_emit_regs(cs,
536 A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
537
538 tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(fb->layers - 1));
539 }
540
541 void
542 tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
543 {
544 const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
545 bool msaa_disable = samples == MSAA_ONE;
546
547 tu_cs_emit_regs(cs,
548 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
549 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
550 .msaa_disable = msaa_disable));
551
552 tu_cs_emit_regs(cs,
553 A6XX_GRAS_RAS_MSAA_CNTL(samples),
554 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
555 .msaa_disable = msaa_disable));
556
557 tu_cs_emit_regs(cs,
558 A6XX_RB_RAS_MSAA_CNTL(samples),
559 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
560 .msaa_disable = msaa_disable));
561
562 tu_cs_emit_regs(cs,
563 A6XX_RB_MSAA_CNTL(samples));
564 }
565
566 static void
567 tu6_emit_bin_size(struct tu_cs *cs,
568 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
569 {
570 tu_cs_emit_regs(cs,
571 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
572 .binh = bin_h,
573 .dword = flags));
574
575 tu_cs_emit_regs(cs,
576 A6XX_RB_BIN_CONTROL(.binw = bin_w,
577 .binh = bin_h,
578 .dword = flags));
579
580 /* no flag for RB_BIN_CONTROL2... */
581 tu_cs_emit_regs(cs,
582 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
583 .binh = bin_h));
584 }
585
586 static void
587 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
588 const struct tu_subpass *subpass,
589 struct tu_cs *cs,
590 bool binning)
591 {
592 const struct tu_framebuffer *fb = cmd->state.framebuffer;
593 uint32_t cntl = 0;
594 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
595 if (binning) {
596 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
597 } else {
598 uint32_t mrts_ubwc_enable = 0;
599 for (uint32_t i = 0; i < subpass->color_count; ++i) {
600 uint32_t a = subpass->color_attachments[i].attachment;
601 if (a == VK_ATTACHMENT_UNUSED)
602 continue;
603
604 const struct tu_image_view *iview = fb->attachments[a].attachment;
605 if (iview->ubwc_enabled)
606 mrts_ubwc_enable |= 1 << i;
607 }
608
609 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
610
611 const uint32_t a = subpass->depth_stencil_attachment.attachment;
612 if (a != VK_ATTACHMENT_UNUSED) {
613 const struct tu_image_view *iview = fb->attachments[a].attachment;
614 if (iview->ubwc_enabled)
615 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
616 }
617
618 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
619 * in order to set it correctly for the different subpasses. However,
620 * that means the packets we're emitting also happen during binning. So
621 * we need to guard the write on !BINNING at CP execution time.
622 */
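/* The region guarded here is the 4-dword CP_REG_WRITE packet emitted after
 * the if/else (a pkt7 header plus three payload dwords), which is what the
 * 3 + 4 reservation and CP_COND_REG_EXEC_1_DWORDS(4) below account for.
 */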
623 tu_cs_reserve(cs, 3 + 4);
624 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
625 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
626 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
627 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
628 }
629
630 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
631 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
632 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
633 tu_cs_emit(cs, cntl);
634 }
635
636 static void
637 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
638 {
639 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
640 uint32_t x1 = render_area->offset.x;
641 uint32_t y1 = render_area->offset.y;
642 uint32_t x2 = x1 + render_area->extent.width - 1;
643 uint32_t y2 = y1 + render_area->extent.height - 1;
644
645 if (align) {
646 x1 = x1 & ~(GMEM_ALIGN_W - 1);
647 y1 = y1 & ~(GMEM_ALIGN_H - 1);
648 x2 = ALIGN_POT(x2 + 1, GMEM_ALIGN_W) - 1;
649 y2 = ALIGN_POT(y2 + 1, GMEM_ALIGN_H) - 1;
650 }
651
652 tu_cs_emit_regs(cs,
653 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
654 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
655 }
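/* For instance, assuming GMEM_ALIGN_W/GMEM_ALIGN_H of 16/4 (illustrative
 * values), a render area at offset (5, 7) with extent 100x50 gives an
 * unaligned scissor of (5, 7)..(104, 56); with align=true this widens to
 * (0, 4)..(111, 59) so that GMEM loads/stores operate on aligned blocks.
 */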
656
657 void
658 tu6_emit_window_scissor(struct tu_cs *cs,
659 uint32_t x1,
660 uint32_t y1,
661 uint32_t x2,
662 uint32_t y2)
663 {
664 tu_cs_emit_regs(cs,
665 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
666 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
667
668 tu_cs_emit_regs(cs,
669 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
670 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
671 }
672
673 void
674 tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
675 {
676 tu_cs_emit_regs(cs,
677 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
678
679 tu_cs_emit_regs(cs,
680 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
681
682 tu_cs_emit_regs(cs,
683 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
684
685 tu_cs_emit_regs(cs,
686 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
687 }
688
689 static bool
690 use_hw_binning(struct tu_cmd_buffer *cmd)
691 {
692 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
693
694 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
695 return false;
696
697 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
698 return true;
699
700 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
701 }
702
703 static bool
704 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
705 {
706 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
707 return true;
708
709 /* can't fit attachments into gmem */
710 if (!cmd->state.pass->gmem_pixels)
711 return true;
712
713 if (cmd->state.framebuffer->layers > 1)
714 return true;
715
716 return cmd->state.tiling_config.force_sysmem;
717 }
718
719 static void
720 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
721 struct tu_cs *cs,
722 const struct tu_tile *tile)
723 {
724 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
725 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
726
727 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
728 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
729
730 const uint32_t x1 = tile->begin.x;
731 const uint32_t y1 = tile->begin.y;
732 const uint32_t x2 = tile->end.x - 1;
733 const uint32_t y2 = tile->end.y - 1;
734 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
735 tu6_emit_window_offset(cs, x1, y1);
736
737 tu_cs_emit_regs(cs,
738 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
739
740 if (use_hw_binning(cmd)) {
741 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
742
743 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
744 tu_cs_emit(cs, 0x0);
745
746 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
747 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
748 A6XX_CP_REG_TEST_0_BIT(0) |
749 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
750
751 tu_cs_reserve(cs, 3 + 11);
752 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
753 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
754 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
755
756 /* if (no overflow) */ {
757 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
758 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
759 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
760 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + tile->pipe * cmd->vsc_draw_strm_pitch);
761 tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + (tile->pipe * 4) + (32 * cmd->vsc_draw_strm_pitch));
762 tu_cs_emit_qw(cs, cmd->vsc_prim_strm.iova + (tile->pipe * cmd->vsc_prim_strm_pitch));
763
764 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
765 tu_cs_emit(cs, 0x0);
766
767 /* use a NOP packet to skip over the 'else' side: */
768 tu_cs_emit_pkt7(cs, CP_NOP, 2);
769 } /* else */ {
770 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
771 tu_cs_emit(cs, 0x1);
772 }
773
774 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
775 tu_cs_emit(cs, 0x0);
776 } else {
777 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
778 tu_cs_emit(cs, 0x1);
779
780 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
781 tu_cs_emit(cs, 0x0);
782 }
783 }
784
785 static void
786 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
787 struct tu_cs *cs,
788 uint32_t a,
789 uint32_t gmem_a)
790 {
791 const struct tu_framebuffer *fb = cmd->state.framebuffer;
792 struct tu_image_view *dst = fb->attachments[a].attachment;
793 struct tu_image_view *src = fb->attachments[gmem_a].attachment;
794
795 tu_resolve_sysmem(cmd, cs, src, dst, fb->layers, &cmd->state.tiling_config.render_area);
796 }
797
798 static void
799 tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
800 struct tu_cs *cs,
801 const struct tu_subpass *subpass)
802 {
803 if (subpass->resolve_attachments) {
804 /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
805 * Commands":
806 *
807 * End-of-subpass multisample resolves are treated as color
808 * attachment writes for the purposes of synchronization. That is,
809 * they are considered to execute in the
810 * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
811 * their writes are synchronized with
812 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
813 * rendering within a subpass and any resolve operations at the end
814 * of the subpass occurs automatically, without need for explicit
815 * dependencies or pipeline barriers. However, if the resolve
816 * attachment is also used in a different subpass, an explicit
817 * dependency is needed.
818 *
819 * We use the CP_BLIT path for sysmem resolves, which is really a
820 * transfer command, so we have to manually flush similar to the gmem
821 * resolve case. However, a flush afterwards isn't needed because of the
822 * last sentence and the fact that we're in sysmem mode.
823 */
824 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
825 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
826
827 /* Wait for the flushes to land before using the 2D engine */
828 tu_cs_emit_wfi(cs);
829
830 for (unsigned i = 0; i < subpass->color_count; i++) {
831 uint32_t a = subpass->resolve_attachments[i].attachment;
832 if (a == VK_ATTACHMENT_UNUSED)
833 continue;
834
835 tu6_emit_sysmem_resolve(cmd, cs, a,
836 subpass->color_attachments[i].attachment);
837 }
838 }
839 }
840
841 static void
842 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
843 {
844 const struct tu_render_pass *pass = cmd->state.pass;
845 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
846
847 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
848 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
849 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
850 CP_SET_DRAW_STATE__0_GROUP_ID(0));
851 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
852 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
853
854 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
855 tu_cs_emit(cs, 0x0);
856
857 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
858 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
859
860 tu6_emit_blit_scissor(cmd, cs, true);
861
862 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
863 if (pass->attachments[a].gmem_offset >= 0)
864 tu_store_gmem_attachment(cmd, cs, a, a);
865 }
866
867 if (subpass->resolve_attachments) {
868 for (unsigned i = 0; i < subpass->color_count; i++) {
869 uint32_t a = subpass->resolve_attachments[i].attachment;
870 if (a != VK_ATTACHMENT_UNUSED)
871 tu_store_gmem_attachment(cmd, cs, a,
872 subpass->color_attachments[i].attachment);
873 }
874 }
875 }
876
877 static void
878 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
879 {
880 tu_cs_emit_regs(cs,
881 A6XX_PC_RESTART_INDEX(restart_index));
882 }
883
884 static void
885 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
886 {
887 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
888
889 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
890
891 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
892
893 tu_cs_emit_regs(cs,
894 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
895 cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
896 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
897 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
898 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
899 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
900 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
901 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
902 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
903 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
904
905 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
906 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
907 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
908 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
909 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
910 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
911 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
912 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
913 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
914 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
915 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
916 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
917 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
918 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
919
920 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
921 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
922 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
923
924 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
925
926 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
927
928 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
929 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
930 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
931 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
932 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
933 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
934 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
935 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
936 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
937 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
938 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
939
940 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
941 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
942
943 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236,
944 A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
945 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
946
947 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
948 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
949
950 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
951 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
952 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
953
954 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
955 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
956
957 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
958
959 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
960
961 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
962 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
963 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
964 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
965 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
966 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
967 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
968 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
969 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
970 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
971 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
972 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
973 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
974 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
975
976 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
977
978 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
979
980 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
981
982 /* we don't use this yet.. probably best to disable.. */
983 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
984 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
985 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
986 CP_SET_DRAW_STATE__0_GROUP_ID(0));
987 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
988 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
989
990 /* Disable streamout by default: */
991 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
992 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
993 tu_cs_emit(cs, 0);
994 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
995 tu_cs_emit(cs, 0);
996
997 tu_cs_emit_regs(cs,
998 A6XX_SP_HS_CTRL_REG0(0));
999
1000 tu_cs_emit_regs(cs,
1001 A6XX_SP_GS_CTRL_REG0(0));
1002
1003 tu_cs_emit_regs(cs,
1004 A6XX_GRAS_LRZ_CNTL(0));
1005
1006 tu_cs_emit_regs(cs,
1007 A6XX_RB_LRZ_CNTL(0));
1008
1009 tu_cs_emit_regs(cs,
1010 A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
1011 tu_cs_emit_regs(cs,
1012 A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
1013
1014 tu_cs_sanity_check(cs);
1015 }
1016
1017 static void
1018 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1019 {
1020 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1021
1022 tu_cs_emit_regs(cs,
1023 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1024 .height = tiling->tile0.extent.height),
1025 A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = &cmd->vsc_draw_strm,
1026 .bo_offset = 32 * cmd->vsc_draw_strm_pitch));
1027
1028 tu_cs_emit_regs(cs,
1029 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1030 .ny = tiling->tile_count.height));
1031
1032 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1033 for (unsigned i = 0; i < 32; i++)
1034 tu_cs_emit(cs, tiling->pipe_config[i]);
1035
1036 tu_cs_emit_regs(cs,
1037 A6XX_VSC_PRIM_STRM_ADDRESS(.bo = &cmd->vsc_prim_strm),
1038 A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
1039 A6XX_VSC_PRIM_STRM_ARRAY_PITCH(cmd->vsc_prim_strm.size));
1040
1041 tu_cs_emit_regs(cs,
1042 A6XX_VSC_DRAW_STRM_ADDRESS(.bo = &cmd->vsc_draw_strm),
1043 A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
1044 A6XX_VSC_DRAW_STRM_ARRAY_PITCH(cmd->vsc_draw_strm.size));
1045 }
1046
1047 static void
1048 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1049 {
1050 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1051 const uint32_t used_pipe_count =
1052 tiling->pipe_count.width * tiling->pipe_count.height;
1053
1054 /* Clear vsc_scratch: */
1055 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1056 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1057 tu_cs_emit(cs, 0x0);
1058
1059 /* Check for overflow, write vsc_scratch if detected: */
1060 for (int i = 0; i < used_pipe_count; i++) {
1061 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1062 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1063 CP_COND_WRITE5_0_WRITE_MEMORY);
1064 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
1065 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1066 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch));
1067 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1068 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1069 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_draw_strm_pitch));
1070
1071 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1072 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1073 CP_COND_WRITE5_0_WRITE_MEMORY);
1074 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
1075 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1076 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch));
1077 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1078 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1079 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_prim_strm_pitch));
1080 }
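/* The value written on overflow (1 + vsc_draw_strm_pitch or
 * 3 + vsc_prim_strm_pitch) presumably lets the CPU-side check both detect
 * the overflow and tell which of the two streams overflowed and roughly how
 * much space it wanted.
 */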
1081
1082 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1083
1084 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1085
1086 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1087 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1088 CP_MEM_TO_REG_0_CNT(1 - 1));
1089 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
1090
1091 /*
1092 * This is a bit awkward: we really want a way to invert the
1093 * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1094 * execute cmds to use hwbinning when a bit is *not* set. This
1095 * dance is what inverts OVERFLOW_FLAG_REG.
1096 *
1097 * A CP_NOP packet is used to skip executing the 'else' clause
1098 * when b0 is set.
1099 */
1100
1101 /* b0 will be set if VSC_DRAW_STRM or VSC_PRIM_STRM overflow: */
1102 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1103 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1104 A6XX_CP_REG_TEST_0_BIT(0) |
1105 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1106
1107 tu_cs_reserve(cs, 3 + 7);
1108 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1109 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1110 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1111
1112 /* if (b0 set) */ {
1113 /*
1114 * On overflow, mirror the value to control->vsc_overflow
1115 * which CPU is checking to detect overflow (see
1116 * check_vsc_overflow())
1117 */
1118 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1119 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1120 CP_REG_TO_MEM_0_CNT(0));
1121 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_overflow));
1122
1123 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1124 tu_cs_emit(cs, 0x0);
1125
1126 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1127 } /* else */ {
1128 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1129 tu_cs_emit(cs, 0x1);
1130 }
1131 }
1132
1133 static void
1134 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1135 {
1136 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1137 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1138
1139 uint32_t x1 = tiling->tile0.offset.x;
1140 uint32_t y1 = tiling->tile0.offset.y;
1141 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1142 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1143
1144 tu6_emit_window_scissor(cs, x1, y1, x2, y2);
1145
1146 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1147 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1148
1149 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1150 tu_cs_emit(cs, 0x1);
1151
1152 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1153 tu_cs_emit(cs, 0x1);
1154
1155 tu_cs_emit_wfi(cs);
1156
1157 tu_cs_emit_regs(cs,
1158 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1159
1160 update_vsc_pipe(cmd, cs);
1161
1162 tu_cs_emit_regs(cs,
1163 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1164
1165 tu_cs_emit_regs(cs,
1166 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1167
1168 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1169 tu_cs_emit(cs, UNK_2C);
1170
1171 tu_cs_emit_regs(cs,
1172 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1173
1174 tu_cs_emit_regs(cs,
1175 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1176
1177 /* emit IB to binning drawcmds: */
1178 tu_cs_emit_call(cs, &cmd->draw_cs);
1179
1180 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1181 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1182 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1183 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1184 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1185 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1186
1187 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1188 tu_cs_emit(cs, UNK_2D);
1189
1190 /* This flush is probably required because the VSC, which produces the
1191 * visibility stream, is a client of UCHE, whereas the CP needs to read the
1192 * visibility stream (without caching) to do draw skipping. The
1193 * WFI+WAIT_FOR_ME combination guarantees that the binning commands
1194 * submitted are finished before reading the VSC regs (in
1195 * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
1196 * part of draws).
1197 */
1198 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
1199
1200 tu_cs_emit_wfi(cs);
1201
1202 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1203
1204 emit_vsc_overflow_test(cmd, cs);
1205
1206 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1207 tu_cs_emit(cs, 0x0);
1208
1209 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1210 tu_cs_emit(cs, 0x0);
1211 }
1212
1213 static void
1214 tu_emit_load_clear(struct tu_cmd_buffer *cmd,
1215 const VkRenderPassBeginInfo *info)
1216 {
1217 struct tu_cs *cs = &cmd->draw_cs;
1218
1219 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1220
1221 tu6_emit_blit_scissor(cmd, cs, true);
1222
1223 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1224 tu_load_gmem_attachment(cmd, cs, i, false);
1225
1226 tu6_emit_blit_scissor(cmd, cs, false);
1227
1228 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1229 tu_clear_gmem_attachment(cmd, cs, i, info);
1230
1231 tu_cond_exec_end(cs);
1232
1233 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1234
1235 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1236 tu_clear_sysmem_attachment(cmd, cs, i, info);
1237
1238 tu_cond_exec_end(cs);
1239 }
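/* Note that draw_cs is replayed once per tile on the GMEM path and once for
 * the whole pass on the sysmem path, so the loads/clears emitted above are
 * wrapped in conditional-execution regions keyed on the render mode; only
 * the variant matching the active mode actually runs.
 */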
1240
1241 static void
1242 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1243 const struct VkRect2D *renderArea)
1244 {
1245 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1246
1247 assert(fb->width > 0 && fb->height > 0);
1248 tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
1249 tu6_emit_window_offset(cs, 0, 0);
1250
1251 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1252
1253 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1254
1255 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1256 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1257
1258 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1259 tu_cs_emit(cs, 0x0);
1260
1261 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1262
1263 /* enable stream-out; with sysmem there is only one pass: */
1264 tu_cs_emit_regs(cs,
1265 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1266
1267 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1268 tu_cs_emit(cs, 0x1);
1269
1270 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1271 tu_cs_emit(cs, 0x0);
1272
1273 tu_cs_sanity_check(cs);
1274 }
1275
1276 static void
1277 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1278 {
1279 /* Do any resolves of the last subpass. These are handled in the
1280 * tile_store_ib in the gmem path.
1281 */
1282 tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
1283
1284 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1285
1286 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1287 tu_cs_emit(cs, 0x0);
1288
1289 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1290
1291 tu_cs_sanity_check(cs);
1292 }
1293
1294
1295 static void
1296 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1297 {
1298 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1299
1300 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1301
1302 /* lrz clear? */
1303
1304 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1305 tu_cs_emit(cs, 0x0);
1306
1307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
1308
1309 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1310 if (use_hw_binning(cmd)) {
1311 /* enable stream-out during binning pass: */
1312 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1313
1314 tu6_emit_bin_size(cs,
1315 tiling->tile0.extent.width,
1316 tiling->tile0.extent.height,
1317 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1318
1319 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1320
1321 tu6_emit_binning_pass(cmd, cs);
1322
1323 /* and disable stream-out for draw pass: */
1324 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true));
1325
1326 tu6_emit_bin_size(cs,
1327 tiling->tile0.extent.width,
1328 tiling->tile0.extent.height,
1329 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1330
1331 tu_cs_emit_regs(cs,
1332 A6XX_VFD_MODE_CNTL(0));
1333
1334 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1335
1336 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1337
1338 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1339 tu_cs_emit(cs, 0x1);
1340 } else {
1341 /* no binning pass, so enable stream-out for draw pass: */
1342 tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false));
1343
1344 tu6_emit_bin_size(cs,
1345 tiling->tile0.extent.width,
1346 tiling->tile0.extent.height,
1347 0x6000000);
1348 }
1349
1350 tu_cs_sanity_check(cs);
1351 }
1352
1353 static void
1354 tu6_render_tile(struct tu_cmd_buffer *cmd,
1355 struct tu_cs *cs,
1356 const struct tu_tile *tile)
1357 {
1358 tu6_emit_tile_select(cmd, cs, tile);
1359
1360 tu_cs_emit_call(cs, &cmd->draw_cs);
1361
1362 if (use_hw_binning(cmd)) {
1363 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1364 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1365 A6XX_CP_REG_TEST_0_BIT(0) |
1366 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1367
1368 tu_cs_reserve(cs, 3 + 2);
1369 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1370 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1371 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1372
1373 /* if (no overflow) */ {
1374 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1375 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1376 }
1377 }
1378
1379 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1380
1381 tu_cs_sanity_check(cs);
1382 }
1383
1384 static void
1385 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1386 {
1387 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1388
1389 tu_cs_emit_regs(cs,
1390 A6XX_GRAS_LRZ_CNTL(0));
1391
1392 tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1393
1394 tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
1395
1396 tu_cs_sanity_check(cs);
1397 }
1398
1399 static void
1400 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1401 {
1402 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1403
1404 tu6_tile_render_begin(cmd, &cmd->cs);
1405
1406 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1407 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1408 struct tu_tile tile;
1409 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1410 tu6_render_tile(cmd, &cmd->cs, &tile);
1411 }
1412 }
1413
1414 tu6_tile_render_end(cmd, &cmd->cs);
1415 }
1416
1417 static void
1418 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1419 {
1420 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1421
1422 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1423
1424 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1425
1426 tu6_sysmem_render_end(cmd, &cmd->cs);
1427 }
1428
1429 static void
1430 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1431 {
1432 const uint32_t tile_store_space = 11 + (35 * 2) * cmd->state.pass->attachment_count;
1433 struct tu_cs sub_cs;
1434
1435 VkResult result =
1436 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1437 if (result != VK_SUCCESS) {
1438 cmd->record_result = result;
1439 return;
1440 }
1441
1442 /* emit to tile-store sub_cs */
1443 tu6_emit_tile_store(cmd, &sub_cs);
1444
1445 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1446 }
1447
1448 static void
1449 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1450 const VkRect2D *render_area)
1451 {
1452 const struct tu_device *dev = cmd->device;
1453 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1454
1455 tiling->render_area = *render_area;
1456 tiling->force_sysmem = false;
1457
1458 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass);
1459 tu_tiling_config_update_pipe_layout(tiling, dev);
1460 tu_tiling_config_update_pipes(tiling, dev);
1461 }
1462
1463 const struct tu_dynamic_state default_dynamic_state = {
1464 .viewport =
1465 {
1466 .count = 0,
1467 },
1468 .scissor =
1469 {
1470 .count = 0,
1471 },
1472 .line_width = 1.0f,
1473 .depth_bias =
1474 {
1475 .bias = 0.0f,
1476 .clamp = 0.0f,
1477 .slope = 0.0f,
1478 },
1479 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1480 .depth_bounds =
1481 {
1482 .min = 0.0f,
1483 .max = 1.0f,
1484 },
1485 .stencil_compare_mask =
1486 {
1487 .front = ~0u,
1488 .back = ~0u,
1489 },
1490 .stencil_write_mask =
1491 {
1492 .front = ~0u,
1493 .back = ~0u,
1494 },
1495 .stencil_reference =
1496 {
1497 .front = 0u,
1498 .back = 0u,
1499 },
1500 };
1501
1502 static void UNUSED /* FINISHME */
1503 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1504 const struct tu_dynamic_state *src)
1505 {
1506 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1507 uint32_t copy_mask = src->mask;
1508 uint32_t dest_mask = 0;
1509
1510 tu_use_args(cmd_buffer); /* FINISHME */
1511
1512 /* Make sure to copy the number of viewports/scissors because they can
1513 * only be specified at pipeline creation time.
1514 */
1515 dest->viewport.count = src->viewport.count;
1516 dest->scissor.count = src->scissor.count;
1517 dest->discard_rectangle.count = src->discard_rectangle.count;
1518
1519 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1520 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1521 src->viewport.count * sizeof(VkViewport))) {
1522 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1523 src->viewport.count);
1524 dest_mask |= TU_DYNAMIC_VIEWPORT;
1525 }
1526 }
1527
1528 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1529 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1530 src->scissor.count * sizeof(VkRect2D))) {
1531 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1532 src->scissor.count);
1533 dest_mask |= TU_DYNAMIC_SCISSOR;
1534 }
1535 }
1536
1537 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1538 if (dest->line_width != src->line_width) {
1539 dest->line_width = src->line_width;
1540 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1541 }
1542 }
1543
1544 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1545 if (memcmp(&dest->depth_bias, &src->depth_bias,
1546 sizeof(src->depth_bias))) {
1547 dest->depth_bias = src->depth_bias;
1548 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1549 }
1550 }
1551
1552 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1553 if (memcmp(&dest->blend_constants, &src->blend_constants,
1554 sizeof(src->blend_constants))) {
1555 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1556 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1557 }
1558 }
1559
1560 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1561 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1562 sizeof(src->depth_bounds))) {
1563 dest->depth_bounds = src->depth_bounds;
1564 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1565 }
1566 }
1567
1568 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1569 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1570 sizeof(src->stencil_compare_mask))) {
1571 dest->stencil_compare_mask = src->stencil_compare_mask;
1572 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1573 }
1574 }
1575
1576 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1577 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1578 sizeof(src->stencil_write_mask))) {
1579 dest->stencil_write_mask = src->stencil_write_mask;
1580 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1581 }
1582 }
1583
1584 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1585 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1586 sizeof(src->stencil_reference))) {
1587 dest->stencil_reference = src->stencil_reference;
1588 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1589 }
1590 }
1591
1592 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1593 if (memcmp(&dest->discard_rectangle.rectangles,
1594 &src->discard_rectangle.rectangles,
1595 src->discard_rectangle.count * sizeof(VkRect2D))) {
1596 typed_memcpy(dest->discard_rectangle.rectangles,
1597 src->discard_rectangle.rectangles,
1598 src->discard_rectangle.count);
1599 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1600 }
1601 }
1602 }
1603
1604 static VkResult
1605 tu_create_cmd_buffer(struct tu_device *device,
1606 struct tu_cmd_pool *pool,
1607 VkCommandBufferLevel level,
1608 VkCommandBuffer *pCommandBuffer)
1609 {
1610 struct tu_cmd_buffer *cmd_buffer;
1611 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1612 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1613 if (cmd_buffer == NULL)
1614 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1615
1616 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1617 cmd_buffer->device = device;
1618 cmd_buffer->pool = pool;
1619 cmd_buffer->level = level;
1620
1621 if (pool) {
1622 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1623 cmd_buffer->queue_family_index = pool->queue_family_index;
1624
1625 } else {
1626 /* Init the pool_link so we can safely call list_del when we destroy
1627 * the command buffer
1628 */
1629 list_inithead(&cmd_buffer->pool_link);
1630 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1631 }
1632
1633 tu_bo_list_init(&cmd_buffer->bo_list);
1634 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1635 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1636 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1637 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1638
1639 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1640
1641 list_inithead(&cmd_buffer->upload.list);
1642
1643 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1644 if (result != VK_SUCCESS)
1645 goto fail_scratch_bo;
1646
1647 /* TODO: resize on overflow */
1648 cmd_buffer->vsc_draw_strm_pitch = device->vsc_draw_strm_pitch;
1649 cmd_buffer->vsc_prim_strm_pitch = device->vsc_prim_strm_pitch;
1650 cmd_buffer->vsc_draw_strm = device->vsc_draw_strm;
1651 cmd_buffer->vsc_prim_strm = device->vsc_prim_strm;
1652
1653 return VK_SUCCESS;
1654
1655 fail_scratch_bo:
1656 list_del(&cmd_buffer->pool_link);
1657 return result;
1658 }
1659
1660 static void
1661 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1662 {
1663 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1664
1665 list_del(&cmd_buffer->pool_link);
1666
1667 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
1668 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
1669
1670 tu_cs_finish(&cmd_buffer->cs);
1671 tu_cs_finish(&cmd_buffer->draw_cs);
1672 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1673 tu_cs_finish(&cmd_buffer->sub_cs);
1674
1675 tu_bo_list_destroy(&cmd_buffer->bo_list);
1676 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1677 }
1678
1679 static VkResult
1680 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1681 {
1682 cmd_buffer->record_result = VK_SUCCESS;
1683
1684 tu_bo_list_reset(&cmd_buffer->bo_list);
1685 tu_cs_reset(&cmd_buffer->cs);
1686 tu_cs_reset(&cmd_buffer->draw_cs);
1687 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1688 tu_cs_reset(&cmd_buffer->sub_cs);
1689
1690 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
1691 cmd_buffer->descriptors[i].valid = 0;
1692 cmd_buffer->descriptors[i].push_dirty = false;
1693 }
1694
1695 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1696
1697 return cmd_buffer->record_result;
1698 }
1699
1700 VkResult
1701 tu_AllocateCommandBuffers(VkDevice _device,
1702 const VkCommandBufferAllocateInfo *pAllocateInfo,
1703 VkCommandBuffer *pCommandBuffers)
1704 {
1705 TU_FROM_HANDLE(tu_device, device, _device);
1706 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1707
1708 VkResult result = VK_SUCCESS;
1709 uint32_t i;
1710
1711 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1712
1713 if (!list_is_empty(&pool->free_cmd_buffers)) {
1714 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1715 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1716
1717 list_del(&cmd_buffer->pool_link);
1718 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1719
1720 result = tu_reset_cmd_buffer(cmd_buffer);
1721 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1722 cmd_buffer->level = pAllocateInfo->level;
1723
1724 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1725 } else {
1726 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1727 &pCommandBuffers[i]);
1728 }
1729 if (result != VK_SUCCESS)
1730 break;
1731 }
1732
1733 if (result != VK_SUCCESS) {
1734 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
1735 pCommandBuffers);
1736
1737 /* From the Vulkan 1.0.66 spec:
1738 *
1739 * "vkAllocateCommandBuffers can be used to create multiple
1740 * command buffers. If the creation of any of those command
1741 * buffers fails, the implementation must destroy all
1742 * successfully created command buffer objects from this
1743 * command, set all entries of the pCommandBuffers array to
1744 * NULL and return the error."
1745 */
1746 memset(pCommandBuffers, 0,
1747 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
1748 }
1749
1750 return result;
1751 }
1752
1753 void
1754 tu_FreeCommandBuffers(VkDevice device,
1755 VkCommandPool commandPool,
1756 uint32_t commandBufferCount,
1757 const VkCommandBuffer *pCommandBuffers)
1758 {
1759 for (uint32_t i = 0; i < commandBufferCount; i++) {
1760 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1761
1762 if (cmd_buffer) {
1763 if (cmd_buffer->pool) {
1764 list_del(&cmd_buffer->pool_link);
1765 list_addtail(&cmd_buffer->pool_link,
1766 &cmd_buffer->pool->free_cmd_buffers);
1767 } else
1768 tu_cmd_buffer_destroy(cmd_buffer);
1769 }
1770 }
1771 }
1772
1773 VkResult
1774 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1775 VkCommandBufferResetFlags flags)
1776 {
1777 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1778 return tu_reset_cmd_buffer(cmd_buffer);
1779 }
1780
1781 /* Initialize the cache, assuming all necessary flushes have happened but *not*
1782 * invalidations.
1783 */
1784 static void
1785 tu_cache_init(struct tu_cache_state *cache)
1786 {
1787 cache->flush_bits = 0;
1788 cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
1789 }
1790
1791 VkResult
1792 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1793 const VkCommandBufferBeginInfo *pBeginInfo)
1794 {
1795 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1796 VkResult result = VK_SUCCESS;
1797
1798 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1799 /* If the command buffer has already been reset with
1800 * vkResetCommandBuffer, there is no need to do it again.
1801 */
1802 result = tu_reset_cmd_buffer(cmd_buffer);
1803 if (result != VK_SUCCESS)
1804 return result;
1805 }
1806
1807 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
1808 tu_cache_init(&cmd_buffer->state.cache);
1809 tu_cache_init(&cmd_buffer->state.renderpass_cache);
1810 cmd_buffer->usage_flags = pBeginInfo->flags;
1811
1812 tu_cs_begin(&cmd_buffer->cs);
1813 tu_cs_begin(&cmd_buffer->draw_cs);
1814 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1815
1816 /* set up the initial configuration for the command buffer */
1817 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1818 switch (cmd_buffer->queue_family_index) {
1819 case TU_QUEUE_GENERAL:
1820 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1821 break;
1822 default:
1823 break;
1824 }
1825 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1826 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1827 assert(pBeginInfo->pInheritanceInfo);
1828 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
1829 cmd_buffer->state.subpass =
1830 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1831 } else {
1832 /* When executing in the middle of another command buffer, the CCU
1833 * state is unknown.
1834 */
1835 cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
1836 }
1837 }
1838
1839 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1840
1841 return VK_SUCCESS;
1842 }
1843
1844 /* Sets vertex buffers to HW binding points. We emit VBs in SDS (so that bin
1845 * rendering can skip over unused state), so we need to collect all the
1846 * bindings together into a single state emit at draw time.
1847 */
1848 void
1849 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
1850 uint32_t firstBinding,
1851 uint32_t bindingCount,
1852 const VkBuffer *pBuffers,
1853 const VkDeviceSize *pOffsets)
1854 {
1855 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1856
1857 assert(firstBinding + bindingCount <= MAX_VBS);
1858
1859 for (uint32_t i = 0; i < bindingCount; i++) {
1860 struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
1861
1862 cmd->state.vb.buffers[firstBinding + i] = buf;
1863 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
1864
1865 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1866 }
1867
1868 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
1869 }
1870
1871 void
1872 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1873 VkBuffer buffer,
1874 VkDeviceSize offset,
1875 VkIndexType indexType)
1876 {
1877 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1878 TU_FROM_HANDLE(tu_buffer, buf, buffer);
1879
1880 /* initialize/update the restart index */
1881 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
1882 struct tu_cs *draw_cs = &cmd->draw_cs;
1883
1884 tu6_emit_restart_index(
1885 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
1886
1887 tu_cs_sanity_check(draw_cs);
1888 }
1889
1890 /* track the BO */
1891 if (cmd->state.index_buffer != buf)
1892 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
1893
1894 cmd->state.index_buffer = buf;
1895 cmd->state.index_offset = offset;
1896 cmd->state.index_type = indexType;
1897 }
1898
1899 void
1900 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
1901 VkPipelineBindPoint pipelineBindPoint,
1902 VkPipelineLayout _layout,
1903 uint32_t firstSet,
1904 uint32_t descriptorSetCount,
1905 const VkDescriptorSet *pDescriptorSets,
1906 uint32_t dynamicOffsetCount,
1907 const uint32_t *pDynamicOffsets)
1908 {
1909 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1910 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
1911 unsigned dyn_idx = 0;
1912
1913 struct tu_descriptor_state *descriptors_state =
1914 tu_get_descriptors_state(cmd_buffer, pipelineBindPoint);
1915
1916 for (unsigned i = 0; i < descriptorSetCount; ++i) {
1917 unsigned idx = i + firstSet;
1918 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
1919
1920 descriptors_state->sets[idx] = set;
1921 descriptors_state->valid |= (1u << idx);
1922
1923 /* Note: the actual input attachment indices come from the shader
1924 * itself, so we can't generate the patched versions of these until
1925 * draw time when both the pipeline and descriptors are bound and
1926 * we're inside the render pass.
1927 */
1928 unsigned dst_idx = layout->set[idx].input_attachment_start;
1929 memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
1930 set->dynamic_descriptors,
1931 set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
1932
1933 for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
1934 /* Dynamic buffers come after input attachments in the descriptor set
1935 * itself, but due to how the Vulkan descriptor set binding works, we
1936 * have to put input attachments and dynamic buffers in separate
1937 * buffers in the descriptor_state and then combine them at draw
1938 * time. Binding a descriptor set only invalidates the descriptor
1939 * sets after it, but if we try to tightly pack the descriptors after
1940 * the input attachments then we could corrupt dynamic buffers in the
1941 * descriptor set before it, or we'd have to move all the dynamic
1942 * buffers over. We just put them into separate buffers to make
1943 * binding as well as the later patching of input attachments easy.
1944 */
1945 unsigned src_idx = j + set->layout->input_attachment_count;
1946 unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
1947 assert(dyn_idx < dynamicOffsetCount);
1948
1949 uint32_t *dst =
1950 &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
1951 uint32_t *src =
1952 &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
1953 uint32_t offset = pDynamicOffsets[dyn_idx];
1954
1955 /* Patch the storage/uniform descriptors right away. */
1956 if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
1957 /* Note: we can assume here that the addition won't roll over and
1958 * change the SIZE field.
1959 */
1960 uint64_t va = src[0] | ((uint64_t)src[1] << 32);
1961 va += offset;
1962 dst[0] = va;
1963 dst[1] = va >> 32;
1964 } else {
1965 memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
1966 /* Note: A6XX_IBO_5_DEPTH is always 0 */
1967 uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
1968 va += offset;
1969 dst[4] = va;
1970 dst[5] = va >> 32;
1971 }
1972 }
1973 }
1974
1975 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
1976 cmd_buffer->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
1977 else
1978 cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
1979 }
1980
1981 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
1982 uint32_t firstBinding,
1983 uint32_t bindingCount,
1984 const VkBuffer *pBuffers,
1985 const VkDeviceSize *pOffsets,
1986 const VkDeviceSize *pSizes)
1987 {
1988 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1989 assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS);
1990
1991 for (uint32_t i = 0; i < bindingCount; i++) {
1992 uint32_t idx = firstBinding + i;
1993 TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
1994
1995 if (pOffsets[i] != 0)
1996 cmd->state.streamout_reset |= 1 << idx;
1997
1998 cmd->state.streamout_buf.buffers[idx] = buf;
1999 cmd->state.streamout_buf.offsets[idx] = pOffsets[i];
2000 cmd->state.streamout_buf.sizes[idx] = pSizes[i];
2001
2002 cmd->state.streamout_enabled |= 1 << idx;
2003 }
2004
2005 cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS;
2006 }
2007
2008 void tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
2009 uint32_t firstCounterBuffer,
2010 uint32_t counterBufferCount,
2011 const VkBuffer *pCounterBuffers,
2012 const VkDeviceSize *pCounterBufferOffsets)
2013 {
2014 assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
2015 /* TODO do something with counter buffer? */
2016 }
2017
2018 void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
2019 uint32_t firstCounterBuffer,
2020 uint32_t counterBufferCount,
2021 const VkBuffer *pCounterBuffers,
2022 const VkDeviceSize *pCounterBufferOffsets)
2023 {
2024 assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
2025 /* TODO do something with counter buffer? */
2026
2027 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2028 cmd->state.streamout_enabled = 0;
2029 }
2030
2031 void
2032 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2033 VkPipelineLayout layout,
2034 VkShaderStageFlags stageFlags,
2035 uint32_t offset,
2036 uint32_t size,
2037 const void *pValues)
2038 {
2039 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2040 memcpy((void*) cmd->push_constants + offset, pValues, size);
2041 cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS;
2042 }
2043
2044 /* Flush everything that has been made available but that we haven't
2045 * actually flushed yet.
2046 */
2047 static void
2048 tu_flush_all_pending(struct tu_cache_state *cache)
2049 {
2050 cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2051 cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
2052 }
2053
2054 VkResult
2055 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2056 {
2057 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2058
2059 /* We currently flush the CCU at the end of the command buffer, as the
2060 * blob does. There's implicit synchronization around every
2061 * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
2062 * know yet if this command buffer will be the last in the submit so we
2063 * have to defensively flush everything else.
2064 *
2065 * TODO: We could definitely do better than this, since these flushes
2066 * aren't required by Vulkan, but we'd need kernel support to do that.
2067 * Ideally, we'd like the kernel to flush everything afterwards, so that we
2068 * wouldn't have to do any flushes here, and when submitting multiple
2069 * command buffers there wouldn't be any unnecessary flushes in between.
2070 */
2071 if (cmd_buffer->state.pass) {
2072 tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
2073 tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
2074 } else {
2075 tu_flush_all_pending(&cmd_buffer->state.cache);
2076 cmd_buffer->state.cache.flush_bits |=
2077 TU_CMD_FLAG_CCU_FLUSH_COLOR |
2078 TU_CMD_FLAG_CCU_FLUSH_DEPTH;
2079 tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
2080 }
2081
2082 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2083 MSM_SUBMIT_BO_WRITE);
2084
2085 if (cmd_buffer->use_vsc_data) {
2086 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_draw_strm,
2087 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2088 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_prim_strm,
2089 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2090 }
2091
2092 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->border_color,
2093 MSM_SUBMIT_BO_READ);
2094
2095 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2096 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2097 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2098 }
2099
2100 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2101 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2102 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2103 }
2104
2105 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2106 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2107 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2108 }
2109
2110 tu_cs_end(&cmd_buffer->cs);
2111 tu_cs_end(&cmd_buffer->draw_cs);
2112 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2113
2114 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2115
2116 return cmd_buffer->record_result;
2117 }
2118
2119 void
2120 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2121 VkPipelineBindPoint pipelineBindPoint,
2122 VkPipeline _pipeline)
2123 {
2124 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2125 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2126
2127 switch (pipelineBindPoint) {
2128 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2129 cmd->state.pipeline = pipeline;
2130 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
2131 break;
2132 case VK_PIPELINE_BIND_POINT_COMPUTE:
2133 cmd->state.compute_pipeline = pipeline;
2134 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2135 break;
2136 default:
2137 unreachable("unrecognized pipeline bind point");
2138 break;
2139 }
2140
2141 /* If the new pipeline requires more VBs than we had previously set up, we
2142 * need to re-emit them in SDS. If it requires the same set or fewer, we
2143 * can just re-use the old SDS.
2144 */
2145 if (pipeline->vi.bindings_used & ~cmd->vertex_bindings_set)
2146 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2147
2148 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2149 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2150 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2151 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2152 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2153 }
2154 }
2155
2156 void
2157 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2158 uint32_t firstViewport,
2159 uint32_t viewportCount,
2160 const VkViewport *pViewports)
2161 {
2162 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2163
2164 assert(firstViewport == 0 && viewportCount == 1);
2165 cmd->state.dynamic.viewport.viewports[0] = pViewports[0];
2166 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_VIEWPORT;
2167 }
2168
2169 void
2170 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2171 uint32_t firstScissor,
2172 uint32_t scissorCount,
2173 const VkRect2D *pScissors)
2174 {
2175 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2176
2177 assert(firstScissor == 0 && scissorCount == 1);
2178 cmd->state.dynamic.scissor.scissors[0] = pScissors[0];
2179 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2180 }
2181
2182 void
2183 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2184 {
2185 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2186
2187 cmd->state.dynamic.line_width = lineWidth;
2188
2189 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2190 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2191 }
2192
2193 void
2194 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2195 float depthBiasConstantFactor,
2196 float depthBiasClamp,
2197 float depthBiasSlopeFactor)
2198 {
2199 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2200 struct tu_cs *draw_cs = &cmd->draw_cs;
2201
2202 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2203 depthBiasSlopeFactor);
2204
2205 tu_cs_sanity_check(draw_cs);
2206 }
2207
2208 void
2209 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2210 const float blendConstants[4])
2211 {
2212 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2213 struct tu_cs *draw_cs = &cmd->draw_cs;
2214
2215 tu6_emit_blend_constants(draw_cs, blendConstants);
2216
2217 tu_cs_sanity_check(draw_cs);
2218 }
2219
2220 void
2221 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2222 float minDepthBounds,
2223 float maxDepthBounds)
2224 {
2225 }
2226
2227 void
2228 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2229 VkStencilFaceFlags faceMask,
2230 uint32_t compareMask)
2231 {
2232 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2233
2234 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2235 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2236 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2237 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2238
2239 /* the front/back compare masks must be updated together */
2240 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2241 }
2242
2243 void
2244 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2245 VkStencilFaceFlags faceMask,
2246 uint32_t writeMask)
2247 {
2248 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2249
2250 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2251 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2252 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2253 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2254
2255 /* the front/back write masks must be updated together */
2256 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2257 }
2258
2259 void
2260 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2261 VkStencilFaceFlags faceMask,
2262 uint32_t reference)
2263 {
2264 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2265
2266 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2267 cmd->state.dynamic.stencil_reference.front = reference;
2268 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2269 cmd->state.dynamic.stencil_reference.back = reference;
2270
2271 /* the front/back references must be updated together */
2272 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2273 }
2274
2275 void
2276 tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2277 const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2278 {
2279 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2280
2281 tu6_emit_sample_locations(&cmd->draw_cs, pSampleLocationsInfo);
2282 }
2283
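/* Turn a (src, dst) pair of access masks into cache maintenance operations.
 * Writes in a source domain schedule a flush of that domain (immediately for
 * incoherent writes, otherwise as a pending bit) together with pending
 * invalidates of the other domains; reads/writes in the destination domain
 * then pull the relevant pending bits into flush_bits so they get emitted
 * before the next command.
 *
 * A rough worked example (assuming no other pending bits): a barrier from
 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT to VK_ACCESS_SHADER_READ_BIT in
 * sysmem mode maps (via vk2tu_access below) to
 * TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE -> TU_ACCESS_UCHE_READ, which ends up
 * setting TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE in
 * flush_bits, i.e. flush the color cache and invalidate UCHE.
 */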
2284 static void
2285 tu_flush_for_access(struct tu_cache_state *cache,
2286 enum tu_cmd_access_mask src_mask,
2287 enum tu_cmd_access_mask dst_mask)
2288 {
2289 enum tu_cmd_flush_bits flush_bits = 0;
2290
2291 if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
2292 cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
2293 }
2294
2295 #define SRC_FLUSH(domain, flush, invalidate) \
2296 if (src_mask & TU_ACCESS_##domain##_WRITE) { \
2297 cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
2298 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2299 }
2300
2301 SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2302 SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2303 SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2304
2305 #undef SRC_FLUSH
2306
2307 #define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
2308 if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
2309 flush_bits |= TU_CMD_FLAG_##flush; \
2310 cache->pending_flush_bits |= \
2311 (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2312 }
2313
2314 SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2315 SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2316
2317 #undef SRC_INCOHERENT_FLUSH
2318
2319 if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
2320 flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2321 }
2322
2323 #define DST_FLUSH(domain, flush, invalidate) \
2324 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2325 TU_ACCESS_##domain##_WRITE)) { \
2326 flush_bits |= cache->pending_flush_bits & \
2327 (TU_CMD_FLAG_##invalidate | \
2328 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2329 }
2330
2331 DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2332 DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2333 DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2334
2335 #undef DST_FLUSH
2336
2337 #define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
2338 if (dst_mask & (TU_ACCESS_##domain##_READ | \
2339 TU_ACCESS_##domain##_WRITE)) { \
2340 flush_bits |= TU_CMD_FLAG_##invalidate | \
2341 (cache->pending_flush_bits & \
2342 (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2343 }
2344
2345 DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2346 DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2347
2348 #undef DST_INCOHERENT_FLUSH
2349
2350 if (dst_mask & TU_ACCESS_WFI_READ) {
2351 flush_bits |= TU_CMD_FLAG_WFI;
2352 }
2353
2354 cache->flush_bits |= flush_bits;
2355 cache->pending_flush_bits &= ~flush_bits;
2356 }
2357
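/* Map Vulkan access flags onto the internal cache domains (sysmem, UCHE,
 * CCU color/depth, plus a WFI where one is needed). When "gmem" is set,
 * attachment accesses go through GMEM and are treated as sysmem, as
 * explained in the comment below.
 */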
2358 static enum tu_cmd_access_mask
2359 vk2tu_access(VkAccessFlags flags, bool gmem)
2360 {
2361 enum tu_cmd_access_mask mask = 0;
2362
2363 /* If the GPU writes a buffer that is then read by an indirect draw
2364 * command, we theoretically need a WFI + WAIT_FOR_ME combination to
2365 * wait for the writes to complete. The WAIT_FOR_ME is performed as part
2366 * of the draw by the firmware, so we just need to execute a WFI.
2367 */
2368 if (flags &
2369 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
2370 VK_ACCESS_MEMORY_READ_BIT)) {
2371 mask |= TU_ACCESS_WFI_READ;
2372 }
2373
2374 if (flags &
2375 (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
2376 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP, I think */
2377 VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
2378 VK_ACCESS_HOST_READ_BIT | /* sysmem by definition */
2379 VK_ACCESS_MEMORY_READ_BIT)) {
2380 mask |= TU_ACCESS_SYSMEM_READ;
2381 }
2382
2383 if (flags &
2384 (VK_ACCESS_HOST_WRITE_BIT |
2385 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | /* Write performed by CP, I think */
2386 VK_ACCESS_MEMORY_WRITE_BIT)) {
2387 mask |= TU_ACCESS_SYSMEM_WRITE;
2388 }
2389
2390 if (flags &
2391 (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
2392 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
2393 VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
2394 /* TODO: Is there a no-cache bit for textures so that we can ignore
2395 * these?
2396 */
2397 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
2398 VK_ACCESS_SHADER_READ_BIT | /* Read performed by SP/TP */
2399 VK_ACCESS_MEMORY_READ_BIT)) {
2400 mask |= TU_ACCESS_UCHE_READ;
2401 }
2402
2403 if (flags &
2404 (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
2405 VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
2406 VK_ACCESS_MEMORY_WRITE_BIT)) {
2407 mask |= TU_ACCESS_UCHE_WRITE;
2408 }
2409
2410 /* When using GMEM, the CCU is always flushed automatically to GMEM, and
2411 * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
2412 * previous writes in sysmem mode when transitioning to GMEM. Therefore we
2413 * can ignore CCU and pretend that color attachments and transfers use
2414 * sysmem directly.
2415 */
2416
2417 if (flags &
2418 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
2419 VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
2420 VK_ACCESS_MEMORY_READ_BIT)) {
2421 if (gmem)
2422 mask |= TU_ACCESS_SYSMEM_READ;
2423 else
2424 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
2425 }
2426
2427 if (flags &
2428 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
2429 VK_ACCESS_MEMORY_READ_BIT)) {
2430 if (gmem)
2431 mask |= TU_ACCESS_SYSMEM_READ;
2432 else
2433 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
2434 }
2435
2436 if (flags &
2437 (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
2438 VK_ACCESS_MEMORY_WRITE_BIT)) {
2439 if (gmem) {
2440 mask |= TU_ACCESS_SYSMEM_WRITE;
2441 } else {
2442 mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2443 }
2444 }
2445
2446 if (flags &
2447 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
2448 VK_ACCESS_MEMORY_WRITE_BIT)) {
2449 if (gmem) {
2450 mask |= TU_ACCESS_SYSMEM_WRITE;
2451 } else {
2452 mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2453 }
2454 }
2455
2456 /* When the dst access is a transfer read/write, it seems we sometimes need
2457 * to insert a WFI after any flushes, to guarantee that the flushes finish
2458 * before the 2D engine starts. However the opposite (i.e. a WFI after
2459 * CP_BLIT and before any subsequent flush) does not seem to be needed, and
2460 * the blob doesn't emit such a WFI.
2461 */
2462
2463 if (flags &
2464 (VK_ACCESS_TRANSFER_WRITE_BIT |
2465 VK_ACCESS_MEMORY_WRITE_BIT)) {
2466 if (gmem) {
2467 mask |= TU_ACCESS_SYSMEM_WRITE;
2468 } else {
2469 mask |= TU_ACCESS_CCU_COLOR_WRITE;
2470 }
2471 mask |= TU_ACCESS_WFI_READ;
2472 }
2473
2474 if (flags &
2475 (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
2476 VK_ACCESS_MEMORY_READ_BIT)) {
2477 mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
2478 }
2479
2480 return mask;
2481 }
2482
2483
2484 void
2485 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2486 uint32_t commandBufferCount,
2487 const VkCommandBuffer *pCmdBuffers)
2488 {
2489 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2490 VkResult result;
2491
2492 assert(commandBufferCount > 0);
2493
2494 /* Emit any pending flushes. */
2495 if (cmd->state.pass) {
2496 tu_flush_all_pending(&cmd->state.renderpass_cache);
2497 tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
2498 } else {
2499 tu_flush_all_pending(&cmd->state.cache);
2500 tu_emit_cache_flush(cmd, &cmd->cs);
2501 }
2502
2503 for (uint32_t i = 0; i < commandBufferCount; i++) {
2504 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2505
2506 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2507 if (result != VK_SUCCESS) {
2508 cmd->record_result = result;
2509 break;
2510 }
2511
2512 if (secondary->usage_flags &
2513 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2514 assert(tu_cs_is_empty(&secondary->cs));
2515
2516 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2517 if (result != VK_SUCCESS) {
2518 cmd->record_result = result;
2519 break;
2520 }
2521
2522 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2523 &secondary->draw_epilogue_cs);
2524 if (result != VK_SUCCESS) {
2525 cmd->record_result = result;
2526 break;
2527 }
2528 } else {
2529 assert(tu_cs_is_empty(&secondary->draw_cs));
2530 assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2531
2532 for (uint32_t j = 0; j < secondary->cs.bo_count; j++) {
2533 tu_bo_list_add(&cmd->bo_list, secondary->cs.bos[j],
2534 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2535 }
2536
2537 tu_cs_add_entries(&cmd->cs, &secondary->cs);
2538 }
2539 }
2540 cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
2541
2542 /* After executing secondary command buffers, there may have been arbitrary
2543 * flushes executed, so when we encounter a pipeline barrier with a
2544 * srcMask, we have to assume that we need to invalidate. Therefore we need
2545 * to re-initialize the cache with all pending invalidate bits set.
2546 */
2547 if (cmd->state.pass) {
2548 tu_cache_init(&cmd->state.renderpass_cache);
2549 } else {
2550 tu_cache_init(&cmd->state.cache);
2551 }
2552 }
2553
2554 VkResult
2555 tu_CreateCommandPool(VkDevice _device,
2556 const VkCommandPoolCreateInfo *pCreateInfo,
2557 const VkAllocationCallbacks *pAllocator,
2558 VkCommandPool *pCmdPool)
2559 {
2560 TU_FROM_HANDLE(tu_device, device, _device);
2561 struct tu_cmd_pool *pool;
2562
2563 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2564 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2565 if (pool == NULL)
2566 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2567
2568 if (pAllocator)
2569 pool->alloc = *pAllocator;
2570 else
2571 pool->alloc = device->alloc;
2572
2573 list_inithead(&pool->cmd_buffers);
2574 list_inithead(&pool->free_cmd_buffers);
2575
2576 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2577
2578 *pCmdPool = tu_cmd_pool_to_handle(pool);
2579
2580 return VK_SUCCESS;
2581 }
2582
2583 void
2584 tu_DestroyCommandPool(VkDevice _device,
2585 VkCommandPool commandPool,
2586 const VkAllocationCallbacks *pAllocator)
2587 {
2588 TU_FROM_HANDLE(tu_device, device, _device);
2589 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2590
2591 if (!pool)
2592 return;
2593
2594 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2595 &pool->cmd_buffers, pool_link)
2596 {
2597 tu_cmd_buffer_destroy(cmd_buffer);
2598 }
2599
2600 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2601 &pool->free_cmd_buffers, pool_link)
2602 {
2603 tu_cmd_buffer_destroy(cmd_buffer);
2604 }
2605
2606 vk_free2(&device->alloc, pAllocator, pool);
2607 }
2608
2609 VkResult
2610 tu_ResetCommandPool(VkDevice device,
2611 VkCommandPool commandPool,
2612 VkCommandPoolResetFlags flags)
2613 {
2614 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2615 VkResult result;
2616
2617 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2618 pool_link)
2619 {
2620 result = tu_reset_cmd_buffer(cmd_buffer);
2621 if (result != VK_SUCCESS)
2622 return result;
2623 }
2624
2625 return VK_SUCCESS;
2626 }
2627
2628 void
2629 tu_TrimCommandPool(VkDevice device,
2630 VkCommandPool commandPool,
2631 VkCommandPoolTrimFlags flags)
2632 {
2633 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2634
2635 if (!pool)
2636 return;
2637
2638 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2639 &pool->free_cmd_buffers, pool_link)
2640 {
2641 tu_cmd_buffer_destroy(cmd_buffer);
2642 }
2643 }
2644
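/* Apply a subpass dependency: convert the barrier's access masks with
 * vk2tu_access() and accumulate the resulting flushes into either the outer
 * cache state (external dependencies) or the render-pass cache state.
 */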
2645 static void
2646 tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
2647 const struct tu_subpass_barrier *barrier,
2648 bool external)
2649 {
2650 /* Note: we don't know until the end of the subpass whether we'll use
2651 * sysmem, so assume sysmem here to be safe.
2652 */
2653 struct tu_cache_state *cache =
2654 external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
2655 enum tu_cmd_access_mask src_flags =
2656 vk2tu_access(barrier->src_access_mask, false);
2657 enum tu_cmd_access_mask dst_flags =
2658 vk2tu_access(barrier->dst_access_mask, false);
2659
2660 if (barrier->incoherent_ccu_color)
2661 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2662 if (barrier->incoherent_ccu_depth)
2663 src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2664
2665 tu_flush_for_access(cache, src_flags, dst_flags);
2666 }
2667
2668 void
2669 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2670 const VkRenderPassBeginInfo *pRenderPassBegin,
2671 VkSubpassContents contents)
2672 {
2673 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2674 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2675 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2676
2677 cmd->state.pass = pass;
2678 cmd->state.subpass = pass->subpasses;
2679 cmd->state.framebuffer = fb;
2680
2681 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2682 tu_cmd_prepare_tile_store_ib(cmd);
2683
2684 /* Note: because this is external, any flushes will happen before draw_cs
2685 * gets called. However, deferred flushes may still have to happen later
2686 * as part of the subpass.
2687 */
2688 tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
2689 cmd->state.renderpass_cache.pending_flush_bits =
2690 cmd->state.cache.pending_flush_bits;
2691 cmd->state.renderpass_cache.flush_bits = 0;
2692
2693 tu_emit_load_clear(cmd, pRenderPassBegin);
2694
2695 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2696 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2697 tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);
2698 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2699
2700 /* note: use_hw_binning only checks tiling config */
2701 if (use_hw_binning(cmd))
2702 cmd->use_vsc_data = true;
2703
2704 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2705 const struct tu_image_view *iview = fb->attachments[i].attachment;
2706 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2707 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2708 }
2709
2710 /* Flag input attachment descriptors for re-emission if necessary */
2711 cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
2712 }
2713
2714 void
2715 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2716 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2717 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2718 {
2719 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2720 pSubpassBeginInfo->contents);
2721 }
2722
2723 void
2724 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2725 {
2726 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2727 const struct tu_render_pass *pass = cmd->state.pass;
2728 struct tu_cs *cs = &cmd->draw_cs;
2729
2730 const struct tu_subpass *subpass = cmd->state.subpass++;
2731
2732 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2733
2734 if (subpass->resolve_attachments) {
2735 tu6_emit_blit_scissor(cmd, cs, true);
2736
2737 for (unsigned i = 0; i < subpass->color_count; i++) {
2738 uint32_t a = subpass->resolve_attachments[i].attachment;
2739 if (a == VK_ATTACHMENT_UNUSED)
2740 continue;
2741
2742 tu_store_gmem_attachment(cmd, cs, a,
2743 subpass->color_attachments[i].attachment);
2744
2745 if (pass->attachments[a].gmem_offset < 0)
2746 continue;
2747
2748 /* TODO:
2749 * check whether the resolved attachment is needed by later subpasses;
2750 * if it is, we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.
2751 */
2752 tu_finishme("missing GMEM->GMEM resolve path\n");
2753 tu_load_gmem_attachment(cmd, cs, a, true);
2754 }
2755 }
2756
2757 tu_cond_exec_end(cs);
2758
2759 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2760
2761 tu6_emit_sysmem_resolves(cmd, cs, subpass);
2762
2763 tu_cond_exec_end(cs);
2764
2765 /* Handle dependencies for the next subpass */
2766 tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
2767
2768 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2769 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2770 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2771 tu6_emit_msaa(cs, cmd->state.subpass->samples);
2772 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2773
2774 /* Flag input attachment descriptors for re-emission if necessary */
2775 cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
2776 }
2777
2778 void
2779 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2780 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2781 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2782 {
2783 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2784 }
2785
2786 struct tu_draw_info
2787 {
2788 /**
2789 * Number of vertices.
2790 */
2791 uint32_t count;
2792
2793 /**
2794 * Index of the first vertex.
2795 */
2796 int32_t vertex_offset;
2797
2798 /**
2799 * First instance id.
2800 */
2801 uint32_t first_instance;
2802
2803 /**
2804 * Number of instances.
2805 */
2806 uint32_t instance_count;
2807
2808 /**
2809 * First index (indexed draws only).
2810 */
2811 uint32_t first_index;
2812
2813 /**
2814 * Whether it's an indexed draw.
2815 */
2816 bool indexed;
2817
2818 /**
2819 * Indirect draw parameters resource.
2820 */
2821 struct tu_buffer *indirect;
2822 uint64_t indirect_offset;
2823 uint32_t stride;
2824
2825 /**
2826 * Draw count parameters resource.
2827 */
2828 struct tu_buffer *count_buffer;
2829 uint64_t count_buffer_offset;
2830
2831 /**
2832 * Stream output parameters resource.
2833 */
2834 struct tu_buffer *streamout_buffer;
2835 uint64_t streamout_buffer_offset;
2836 };
2837
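/* Enable masks for CP_SET_DRAW_STATE groups: they select in which passes
 * (binning, GMEM rendering, sysmem rendering) the CP executes a group. Each
 * group is emitted as three dwords in tu6_bind_draw_states() below, roughly:
 *
 *    CP_SET_DRAW_STATE__0_COUNT(size) | enable_mask |
 *       CP_SET_DRAW_STATE__0_GROUP_ID(id)
 *    <64-bit address of the group's IB>
 */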
2838 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2839 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2840 #define ENABLE_NON_GMEM (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_SYSMEM)
2841
2842 enum tu_draw_state_group_id
2843 {
2844 TU_DRAW_STATE_PROGRAM,
2845 TU_DRAW_STATE_PROGRAM_BINNING,
2846 TU_DRAW_STATE_VB,
2847 TU_DRAW_STATE_VI,
2848 TU_DRAW_STATE_VI_BINNING,
2849 TU_DRAW_STATE_VP,
2850 TU_DRAW_STATE_RAST,
2851 TU_DRAW_STATE_DS,
2852 TU_DRAW_STATE_BLEND,
2853 TU_DRAW_STATE_VS_CONST,
2854 TU_DRAW_STATE_GS_CONST,
2855 TU_DRAW_STATE_FS_CONST,
2856 TU_DRAW_STATE_DESC_SETS,
2857 TU_DRAW_STATE_DESC_SETS_GMEM,
2858 TU_DRAW_STATE_DESC_SETS_LOAD,
2859 TU_DRAW_STATE_VS_PARAMS,
2860
2861 TU_DRAW_STATE_COUNT,
2862 };
2863
2864 struct tu_draw_state_group
2865 {
2866 enum tu_draw_state_group_id id;
2867 uint32_t enable_mask;
2868 struct tu_cs_entry ib;
2869 };
2870
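/* Upload user constants for one shader stage. Push constants are emitted
 * inline with CP_LOAD_STATE6 in SS6_DIRECT mode; UBO ranges that the
 * compiler promoted to constants are emitted in SS6_INDIRECT mode, pointing
 * at the buffer address dug out of the bound descriptor set.
 */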
2871 static void
2872 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2873 struct tu_descriptor_state *descriptors_state,
2874 gl_shader_stage type,
2875 uint32_t *push_constants)
2876 {
2877 const struct tu_program_descriptor_linkage *link =
2878 &pipeline->program.link[type];
2879 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2880
2881 if (link->push_consts.count > 0) {
2882 unsigned num_units = link->push_consts.count;
2883 unsigned offset = link->push_consts.lo;
2884 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
2885 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
2886 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2887 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2888 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2889 CP_LOAD_STATE6_0_NUM_UNIT(num_units));
2890 tu_cs_emit(cs, 0);
2891 tu_cs_emit(cs, 0);
2892 for (unsigned i = 0; i < num_units * 4; i++)
2893 tu_cs_emit(cs, push_constants[i + offset * 4]);
2894 }
2895
2896 for (uint32_t i = 0; i < state->num_enabled; i++) {
2897 uint32_t size = state->range[i].end - state->range[i].start;
2898 uint32_t offset = state->range[i].start;
2899
2900 /* Even if the start of the range is within the shader's constlen,
2901 * the end may not be, so clamp the upload size:
2902 */
2903 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2904
2905 if (size == 0)
2906 continue;
2907
2908 /* things should be aligned to vec4: */
2909 debug_assert((state->range[i].offset % 16) == 0);
2910 debug_assert((size % 16) == 0);
2911 debug_assert((offset % 16) == 0);
2912
2913 /* Dig out the descriptor from the descriptor state and read the VA from
2914 * it.
2915 */
2916 assert(state->range[i].bindless);
2917 uint32_t *base = state->range[i].bindless_base == MAX_SETS ?
2918 descriptors_state->dynamic_descriptors :
2919 descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
2920 unsigned block = state->range[i].block;
2921 /* If the block in the shader here is in the dynamic descriptor set, it
2922 * is an index into the dynamic descriptor set which is combined from
2923 * dynamic descriptors and input attachments on-the-fly, and we don't
2924 * have access to it here. Instead we work backwards to get the index
2925 * into dynamic_descriptors.
2926 */
2927 if (state->range[i].bindless_base == MAX_SETS)
2928 block -= pipeline->layout->input_attachment_count;
2929 uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
2930 uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
2931 assert(va);
2932
2933 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2934 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2935 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2936 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2937 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2938 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2939 tu_cs_emit_qw(cs, va + offset);
2940 }
2941 }
2942
2943 static struct tu_cs_entry
2944 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2945 const struct tu_pipeline *pipeline,
2946 struct tu_descriptor_state *descriptors_state,
2947 gl_shader_stage type)
2948 {
2949 struct tu_cs cs;
2950 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2951
2952 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2953
2954 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2955 }
2956
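/* Emit the VS driver params (currently just the base instance) as a small
 * direct constant load, or an empty entry if the shader doesn't read them.
 */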
2957 static VkResult
2958 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
2959 const struct tu_draw_info *draw,
2960 struct tu_cs_entry *entry)
2961 {
2962 /* TODO: fill out more than just base instance */
2963 const struct tu_program_descriptor_linkage *link =
2964 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
2965 const struct ir3_const_state *const_state = &link->const_state;
2966 struct tu_cs cs;
2967
2968 if (const_state->offsets.driver_param >= link->constlen) {
2969 *entry = (struct tu_cs_entry) {};
2970 return VK_SUCCESS;
2971 }
2972
2973 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
2974 if (result != VK_SUCCESS)
2975 return result;
2976
2977 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
2978 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
2979 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2980 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2981 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
2982 CP_LOAD_STATE6_0_NUM_UNIT(1));
2983 tu_cs_emit(&cs, 0);
2984 tu_cs_emit(&cs, 0);
2985
2986 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
2987
2988 tu_cs_emit(&cs, 0);
2989 tu_cs_emit(&cs, 0);
2990 tu_cs_emit(&cs, draw->first_instance);
2991 tu_cs_emit(&cs, 0);
2992
2993 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2994 return VK_SUCCESS;
2995 }
2996
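/* Emit VFD_FETCH base/size for every binding the pipeline uses, using the
 * buffers bound with vkCmdBindVertexBuffers, into a sub-stream IB.
 */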
2997 static struct tu_cs_entry
2998 tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
2999 const struct tu_pipeline *pipeline)
3000 {
3001 struct tu_cs cs;
3002 tu_cs_begin_sub_stream(&cmd->sub_cs, 4 * MAX_VBS, &cs);
3003
3004 int binding;
3005 for_each_bit(binding, pipeline->vi.bindings_used) {
3006 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3007 const VkDeviceSize offset = buf->bo_offset +
3008 cmd->state.vb.offsets[binding];
3009
3010 tu_cs_emit_regs(&cs,
3011 A6XX_VFD_FETCH_BASE(binding, .bo = buf->bo, .bo_offset = offset),
3012 A6XX_VFD_FETCH_SIZE(binding, buf->size - offset));
3013
3014 }
3015
3016 cmd->vertex_bindings_set = pipeline->vi.bindings_used;
3017
3018 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3019 }
3020
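/* Emit the bindless base pointers for the currently bound descriptor sets.
 * If the layout has dynamic descriptors or input attachments, a combined
 * "dynamic" descriptor set is built on the fly in the driver-only slot;
 * when "gmem" is set, the input attachment descriptors in it are patched to
 * point at the attachment's location in GMEM instead of sysmem.
 */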
3021 static VkResult
3022 tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
3023 const struct tu_pipeline *pipeline,
3024 VkPipelineBindPoint bind_point,
3025 struct tu_cs_entry *entry,
3026 bool gmem)
3027 {
3028 struct tu_cs *draw_state = &cmd->sub_cs;
3029 struct tu_pipeline_layout *layout = pipeline->layout;
3030 struct tu_descriptor_state *descriptors_state =
3031 tu_get_descriptors_state(cmd, bind_point);
3032 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
3033 const uint32_t *input_attachment_idx =
3034 pipeline->program.input_attachment_idx;
3035 uint32_t num_dynamic_descs = layout->dynamic_offset_count +
3036 layout->input_attachment_count;
3037 struct ts_cs_memory dynamic_desc_set;
3038 VkResult result;
3039
3040 if (num_dynamic_descs > 0) {
3041 /* allocate and fill out dynamic descriptor set */
3042 result = tu_cs_alloc(draw_state, num_dynamic_descs,
3043 A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
3044 if (result != VK_SUCCESS)
3045 return result;
3046
3047 memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
3048 layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
3049
3050 if (gmem) {
3051 /* Patch input attachments to refer to GMEM instead */
3052 for (unsigned i = 0; i < layout->input_attachment_count; i++) {
3053 uint32_t *dst =
3054 &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
3055
3056 /* The compiler has already laid out input_attachment_idx in the
3057 * final order of input attachments, so there's no need to go
3058 * through the pipeline layout finding input attachments.
3059 */
3060 unsigned attachment_idx = input_attachment_idx[i];
3061
3062 /* It's possible for the pipeline layout to include an input
3063 * attachment which doesn't actually exist for the current
3064 * subpass. Of course, this is only valid so long as the pipeline
3065 * doesn't try to actually load that attachment. Just skip
3066 * patching in that scenario to avoid out-of-bounds accesses.
3067 */
3068 if (attachment_idx >= cmd->state.subpass->input_count)
3069 continue;
3070
3071 uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
3072 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3073
3074 assert(att->gmem_offset >= 0);
3075
3076 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
3077 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
3078 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
3079 dst[2] |=
3080 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
3081 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
3082 dst[3] = 0;
3083 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
3084 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
3085 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
3086 dst[i] = 0;
3087
3088 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
3089 tu_finishme("patch input attachment pitch for secondary cmd buffer");
3090 }
3091 }
3092
3093 memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
3094 descriptors_state->dynamic_descriptors,
3095 layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
3096 }
3097
3098 uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
3099 uint32_t hlsq_update_value;
3100 switch (bind_point) {
3101 case VK_PIPELINE_BIND_POINT_GRAPHICS:
3102 sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
3103 hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
3104 hlsq_update_value = 0x7c000;
3105 break;
3106 case VK_PIPELINE_BIND_POINT_COMPUTE:
3107 sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
3108 hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
3109 hlsq_update_value = 0x3e00;
3110 break;
3111 default:
3112 unreachable("bad bind point");
3113 }
3114
3115 /* Be careful here to *not* refer to the pipeline, so that if only the
3116 * pipeline changes we don't have to emit this again (except if there are
3117 * dynamic descriptors in the pipeline layout). This means always emitting
3118 * all the valid descriptors, which means that we always have to put the
3119 * dynamic descriptor in the driver-only slot at the end
3120 */
3121 uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
3122 uint32_t num_sets = num_user_sets;
3123 if (num_dynamic_descs > 0) {
3124 num_user_sets = MAX_SETS;
3125 num_sets = num_user_sets + 1;
3126 }
3127
3128 unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
3129
3130 struct tu_cs cs;
3131 result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
3132 if (result != VK_SUCCESS)
3133 return result;
3134
3135 if (num_sets > 0) {
3136 for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
3137 tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
3138 for (unsigned j = 0; j < num_user_sets; j++) {
3139 if (descriptors_state->valid & (1 << j)) {
3140 /* magic | 3 copied from the blob */
3141 tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
3142 } else {
3143 tu_cs_emit_qw(&cs, 0 | 3);
3144 }
3145 }
3146 if (num_dynamic_descs > 0) {
3147 tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
3148 }
3149 }
3150
3151 tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
3152 }
3153
3154 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3155 return VK_SUCCESS;
3156 }
3157
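/* Program the streamout (transform feedback) buffers. The write offset is
 * either reset to the offset given to vkCmdBindTransformFeedbackBuffersEXT
 * or reloaded with CP_MEM_TO_REG from the scratch BO, where the flush base
 * saved it, and VPC_SO is enabled or disabled to match the pipeline's
 * streamout program.
 */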
3158 static void
3159 tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
3160 {
3161 struct tu_streamout_state *tf = &cmd->state.pipeline->streamout;
3162
3163 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3164 struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
3165 if (!buf)
3166 continue;
3167
3168 uint32_t offset;
3169 offset = cmd->state.streamout_buf.offsets[i];
3170
3171 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo,
3172 .bo_offset = buf->bo_offset));
3173 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size));
3174
3175 if (cmd->state.streamout_reset & (1 << i)) {
3176 tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset));
3177 cmd->state.streamout_reset &= ~(1 << i);
3178 } else {
3179 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
3180 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
3181 CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
3182 CP_MEM_TO_REG_0_CNT(0));
3183 tu_cs_emit_qw(cs, cmd->scratch_bo.iova +
3184 ctrl_offset(flush_base[i].offset));
3185 }
3186
3187 tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo,
3188 .bo_offset =
3189 ctrl_offset(flush_base[i])));
3190 }
3191
3192 if (cmd->state.streamout_enabled) {
3193 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
3194 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
3195 tu_cs_emit(cs, tf->vpc_so_buf_cntl);
3196 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0));
3197 tu_cs_emit(cs, tf->ncomp[0]);
3198 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1));
3199 tu_cs_emit(cs, tf->ncomp[1]);
3200 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2));
3201 tu_cs_emit(cs, tf->ncomp[2]);
3202 tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3));
3203 tu_cs_emit(cs, tf->ncomp[3]);
3204 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
3205 tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
3206 for (unsigned i = 0; i < tf->prog_count; i++) {
3207 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
3208 tu_cs_emit(cs, tf->prog[i]);
3209 }
3210 } else {
3211 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
3212 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
3213 tu_cs_emit(cs, 0);
3214 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
3215 tu_cs_emit(cs, 0);
3216 }
3217 }
3218
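/* Gather all state that is dirty for this draw into draw-state groups
 * (program, vertex input, constants, descriptor sets, ...) and emit them
 * with a single CP_SET_DRAW_STATE packet, plus any register state that is
 * written directly into the draw IB.
 */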
3219 static VkResult
3220 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3221 struct tu_cs *cs,
3222 const struct tu_draw_info *draw)
3223 {
3224 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3225 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3226 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3227 uint32_t draw_state_group_count = 0;
3228 VkResult result;
3229
3230 struct tu_descriptor_state *descriptors_state =
3231 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3232
3233 /* TODO lrz */
3234
3235 tu_cs_emit_regs(cs,
3236 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3237 pipeline->ia.primitive_restart && draw->indexed));
3238
3239 if (cmd->state.dirty &
3240 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3241 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3242 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3243 dynamic->line_width);
3244 }
3245
3246 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3247 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3248 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3249 dynamic->stencil_compare_mask.back);
3250 }
3251
3252 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3253 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3254 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3255 dynamic->stencil_write_mask.back);
3256 }
3257
3258 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3259 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3260 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3261 dynamic->stencil_reference.back);
3262 }
3263
3264 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3265 (pipeline->dynamic_state.mask & TU_DYNAMIC_VIEWPORT)) {
3266 tu6_emit_viewport(cs, &cmd->state.dynamic.viewport.viewports[0]);
3267 }
3268
3269 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_SCISSOR) &&
3270 (pipeline->dynamic_state.mask & TU_DYNAMIC_SCISSOR)) {
3271 tu6_emit_scissor(cs, &cmd->state.dynamic.scissor.scissors[0]);
3272 }
3273
3274 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3275 draw_state_groups[draw_state_group_count++] =
3276 (struct tu_draw_state_group) {
3277 .id = TU_DRAW_STATE_PROGRAM,
3278 .enable_mask = ENABLE_DRAW,
3279 .ib = pipeline->program.state_ib,
3280 };
3281 draw_state_groups[draw_state_group_count++] =
3282 (struct tu_draw_state_group) {
3283 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3284 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3285 .ib = pipeline->program.binning_state_ib,
3286 };
3287 draw_state_groups[draw_state_group_count++] =
3288 (struct tu_draw_state_group) {
3289 .id = TU_DRAW_STATE_VI,
3290 .enable_mask = ENABLE_DRAW,
3291 .ib = pipeline->vi.state_ib,
3292 };
3293 draw_state_groups[draw_state_group_count++] =
3294 (struct tu_draw_state_group) {
3295 .id = TU_DRAW_STATE_VI_BINNING,
3296 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3297 .ib = pipeline->vi.binning_state_ib,
3298 };
3299 draw_state_groups[draw_state_group_count++] =
3300 (struct tu_draw_state_group) {
3301 .id = TU_DRAW_STATE_VP,
3302 .enable_mask = ENABLE_ALL,
3303 .ib = pipeline->vp.state_ib,
3304 };
3305 draw_state_groups[draw_state_group_count++] =
3306 (struct tu_draw_state_group) {
3307 .id = TU_DRAW_STATE_RAST,
3308 .enable_mask = ENABLE_ALL,
3309 .ib = pipeline->rast.state_ib,
3310 };
3311 draw_state_groups[draw_state_group_count++] =
3312 (struct tu_draw_state_group) {
3313 .id = TU_DRAW_STATE_DS,
3314 .enable_mask = ENABLE_ALL,
3315 .ib = pipeline->ds.state_ib,
3316 };
3317 draw_state_groups[draw_state_group_count++] =
3318 (struct tu_draw_state_group) {
3319 .id = TU_DRAW_STATE_BLEND,
3320 .enable_mask = ENABLE_ALL,
3321 .ib = pipeline->blend.state_ib,
3322 };
3323 }
3324
3325 if (cmd->state.dirty &
3326 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) {
3327 draw_state_groups[draw_state_group_count++] =
3328 (struct tu_draw_state_group) {
3329 .id = TU_DRAW_STATE_VS_CONST,
3330 .enable_mask = ENABLE_ALL,
3331 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3332 };
3333 draw_state_groups[draw_state_group_count++] =
3334 (struct tu_draw_state_group) {
3335 .id = TU_DRAW_STATE_GS_CONST,
3336 .enable_mask = ENABLE_ALL,
3337 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_GEOMETRY)
3338 };
3339 draw_state_groups[draw_state_group_count++] =
3340 (struct tu_draw_state_group) {
3341 .id = TU_DRAW_STATE_FS_CONST,
3342 .enable_mask = ENABLE_DRAW,
3343 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3344 };
3345 }
3346
3347 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3348 draw_state_groups[draw_state_group_count++] =
3349 (struct tu_draw_state_group) {
3350 .id = TU_DRAW_STATE_VB,
3351 .enable_mask = ENABLE_ALL,
3352 .ib = tu6_emit_vertex_buffers(cmd, pipeline)
3353 };
3354 }
3355
3356 if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
3357 tu6_emit_streamout(cmd, cs);
3358
3359 /* If there are any dynamic descriptors, then we may need to re-emit
3360 * them after every pipeline change in case the number of input attachments
3361 * changes. We also always need to re-emit after a pipeline change if there
3362 * are any input attachments, because the input attachment index comes from
3363 * the pipeline. Finally, it can also happen that the subpass changes
3364 * without the pipeline changing, in which case the GMEM descriptors need
3365 * to be patched differently.
3366 *
3367 * TODO: We could probably be clever and avoid re-emitting state on
3368 * pipeline changes if the number of input attachments is always 0. We
3369 * could also only re-emit dynamic state.
3370 */
3371 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS ||
3372 ((pipeline->layout->dynamic_offset_count +
3373 pipeline->layout->input_attachment_count > 0) &&
3374 cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) ||
3375 (pipeline->layout->input_attachment_count > 0 &&
3376 cmd->state.dirty & TU_CMD_DIRTY_INPUT_ATTACHMENTS)) {
3377 struct tu_cs_entry desc_sets, desc_sets_gmem;
3378 bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
3379
3380 result = tu6_emit_descriptor_sets(cmd, pipeline,
3381 VK_PIPELINE_BIND_POINT_GRAPHICS,
3382 &desc_sets, false);
3383 if (result != VK_SUCCESS)
3384 return result;
3385
3386 draw_state_groups[draw_state_group_count++] =
3387 (struct tu_draw_state_group) {
3388 .id = TU_DRAW_STATE_DESC_SETS,
3389 .enable_mask = need_gmem_desc_set ? ENABLE_NON_GMEM : ENABLE_ALL,
3390 .ib = desc_sets,
3391 };
3392
3393 if (need_gmem_desc_set) {
3394 result = tu6_emit_descriptor_sets(cmd, pipeline,
3395 VK_PIPELINE_BIND_POINT_GRAPHICS,
3396 &desc_sets_gmem, true);
3397 if (result != VK_SUCCESS)
3398 return result;
3399
3400 draw_state_groups[draw_state_group_count++] =
3401 (struct tu_draw_state_group) {
3402 .id = TU_DRAW_STATE_DESC_SETS_GMEM,
3403 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3404 .ib = desc_sets_gmem,
3405 };
3406 }
3407
3408 /* We need to reload the descriptors every time the descriptor sets
3409 * change. However, the commands we send only depend on the pipeline
3410 * because the whole point is to cache descriptors which are used by the
3411 * pipeline. There's a problem here, in that the firmware has an
3412 * "optimization" which skips executing groups that are set to the same
3413 * value as the last draw. This means that if the descriptor sets change
3414 * but not the pipeline, we'd try to re-execute the same buffer which
3415 * the firmware would ignore and we wouldn't pre-load the new
3416 * descriptors. The blob seems to re-emit the LOAD_STATE group whenever
3417 * the descriptor sets change, which we emulate here by copying the
3418 * pre-prepared buffer.
3419 */
3420 const struct tu_cs_entry *load_entry = &pipeline->load_state.state_ib;
3421 if (load_entry->size > 0) {
3422 struct tu_cs load_cs;
3423 result = tu_cs_begin_sub_stream(&cmd->sub_cs, load_entry->size, &load_cs);
3424 if (result != VK_SUCCESS)
3425 return result;
3426 tu_cs_emit_array(&load_cs,
3427 (uint32_t *)((char *)load_entry->bo->map + load_entry->offset),
3428 load_entry->size / 4);
3429 struct tu_cs_entry load_copy = tu_cs_end_sub_stream(&cmd->sub_cs, &load_cs);
3430
3431 draw_state_groups[draw_state_group_count++] =
3432 (struct tu_draw_state_group) {
3433 .id = TU_DRAW_STATE_DESC_SETS_LOAD,
3434 /* The blob seems to not enable this for binning, even when
3435 * resources would actually be used in the binning shader.
3436 * Presumably the overhead of prefetching the resources isn't
3437 * worth it.
3438 */
3439 .enable_mask = ENABLE_DRAW,
3440 .ib = load_copy,
3441 };
3442 }
3443 }
3444
3445 struct tu_cs_entry vs_params;
3446 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3447 if (result != VK_SUCCESS)
3448 return result;
3449
3450 draw_state_groups[draw_state_group_count++] =
3451 (struct tu_draw_state_group) {
3452 .id = TU_DRAW_STATE_VS_PARAMS,
3453 .enable_mask = ENABLE_ALL,
3454 .ib = vs_params,
3455 };
3456
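   /* Emit a single CP_SET_DRAW_STATE packet covering all groups: each group
    * takes three dwords -- a header with the dword count, the enable mask and
    * the group id, followed by the 64-bit IOVA of its state IB.  Groups with
    * an empty IB are emitted with the DISABLE bit and a zero address so that
    * group is turned off.
    */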
3457 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3458 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3459 const struct tu_draw_state_group *group = &draw_state_groups[i];
3460 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3461 uint32_t cp_set_draw_state =
3462 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3463 group->enable_mask |
3464 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3465 uint64_t iova;
3466 if (group->ib.size) {
3467 iova = group->ib.bo->iova + group->ib.offset;
3468 } else {
3469 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3470 iova = 0;
3471 }
3472
3473 tu_cs_emit(cs, cp_set_draw_state);
3474 tu_cs_emit_qw(cs, iova);
3475 }
3476
3477 tu_cs_sanity_check(cs);
3478
3479 /* track BOs */
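   /* (Every BO the GPU will touch for this draw has to appear in the kernel
    * submit's BO list; MSM_SUBMIT_BO_DUMP additionally asks the kernel to
    * include the descriptor pool contents in GPU crash dumps.)
    */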
3480 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3481 unsigned i;
3482 for_each_bit(i, descriptors_state->valid) {
3483 struct tu_descriptor_set *set = descriptors_state->sets[i];
3484 for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
3485 if (set->buffers[j]) {
3486 tu_bo_list_add(&cmd->bo_list, set->buffers[j],
3487 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3488 }
3489 }
3490 if (set->size > 0) {
3491 tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
3492 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
3493 }
3494 }
3495 }
3496 if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
3497 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3498 const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
3499 if (buf) {
3500 tu_bo_list_add(&cmd->bo_list, buf->bo,
3501 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3502 }
3503 }
3504 }
3505
3506 /* There are too many graphics dirty bits to list here, so just list the
3507 * bits to preserve instead. The only things not emitted here are
3508 * compute-related state.
3509 */
3510 cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
3511
3512 /* Fragment shader state overwrites compute shader state, so flag the
3513 * compute pipeline for re-emit.
3514 */
3515 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
3516 return VK_SUCCESS;
3517 }
3518
3519 static void
3520 tu6_emit_draw_indirect(struct tu_cmd_buffer *cmd,
3521 struct tu_cs *cs,
3522 const struct tu_draw_info *draw)
3523 {
3524 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3525 bool has_gs = cmd->state.pipeline->active_stages &
3526 VK_SHADER_STAGE_GEOMETRY_BIT;
3527
3528 tu_cs_emit_regs(cs,
3529 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3530 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3531
3532 if (draw->indexed) {
3533 const enum a4xx_index_size index_size =
3534 tu6_index_size(cmd->state.index_type);
3535 const uint32_t index_bytes =
3536 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3537 const struct tu_buffer *index_buf = cmd->state.index_buffer;
3538       unsigned max_indices =
3539 (index_buf->size - cmd->state.index_offset) / index_bytes;
3540
3541 const uint32_t cp_draw_indx =
3542 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3543 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3544 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3545 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3546 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3547
3548 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_INDIRECT, 6);
3549 tu_cs_emit(cs, cp_draw_indx);
3550 tu_cs_emit_qw(cs, index_buf->bo->iova + cmd->state.index_offset);
3551       tu_cs_emit(cs, A5XX_CP_DRAW_INDX_INDIRECT_3_MAX_INDICES(max_indices));
3552 tu_cs_emit_qw(cs, draw->indirect->bo->iova + draw->indirect_offset);
3553 } else {
3554 const uint32_t cp_draw_indx =
3555 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3556 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3557 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3558 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3559
3560 tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT, 3);
3561 tu_cs_emit(cs, cp_draw_indx);
3562 tu_cs_emit_qw(cs, draw->indirect->bo->iova + draw->indirect_offset);
3563 }
3564
3565 tu_bo_list_add(&cmd->bo_list, draw->indirect->bo, MSM_SUBMIT_BO_READ);
3566 }
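/* For reference (core Vulkan layout, not driver-specific): the indirect
 * packets above point the CP directly at the application buffer at
 * draw->indirect_offset, i.e. at a VkDrawIndirectCommand
 * { vertexCount, instanceCount, firstVertex, firstInstance } for the
 * non-indexed path and a VkDrawIndexedIndirectCommand
 * { indexCount, instanceCount, firstIndex, vertexOffset, firstInstance } for
 * the indexed path, so no CPU-side copy of the parameters is needed.
 */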
3567
3568 static void
3569 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3570 struct tu_cs *cs,
3571 const struct tu_draw_info *draw)
3572 {
3573
3574 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3575 bool has_gs = cmd->state.pipeline->active_stages &
3576 VK_SHADER_STAGE_GEOMETRY_BIT;
3577
3578 tu_cs_emit_regs(cs,
3579 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3580 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3581
3582 /* TODO hw binning */
3583 if (draw->indexed) {
3584 const enum a4xx_index_size index_size =
3585 tu6_index_size(cmd->state.index_type);
3586 const uint32_t index_bytes =
3587 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3588 const struct tu_buffer *buf = cmd->state.index_buffer;
3589 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3590 index_bytes * draw->first_index;
3591 const uint32_t size = index_bytes * draw->count;
3592
3593 const uint32_t cp_draw_indx =
3594 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3595 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3596 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3597 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3598 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3599
3600 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3601 tu_cs_emit(cs, cp_draw_indx);
3602 tu_cs_emit(cs, draw->instance_count);
3603 tu_cs_emit(cs, draw->count);
3604 tu_cs_emit(cs, 0x0); /* XXX */
3605 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3606 tu_cs_emit(cs, size);
3607 } else {
3608 const uint32_t cp_draw_indx =
3609 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3610 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3611 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) |
3612 COND(has_gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE) | 0x2000;
3613
3614 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3615 tu_cs_emit(cs, cp_draw_indx);
3616 tu_cs_emit(cs, draw->instance_count);
3617 tu_cs_emit(cs, draw->count);
3618 }
3619 }
3620
3621 static void
3622 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3623 {
3624 struct tu_cs *cs = &cmd->draw_cs;
3625 VkResult result;
3626
3627 tu_emit_cache_flush_renderpass(cmd, cs);
3628
3629 result = tu6_bind_draw_states(cmd, cs, draw);
3630 if (result != VK_SUCCESS) {
3631 cmd->record_result = result;
3632 return;
3633 }
3634
3635 if (draw->indirect)
3636 tu6_emit_draw_indirect(cmd, cs, draw);
3637 else
3638 tu6_emit_draw_direct(cmd, cs, draw);
3639
3640 if (cmd->state.streamout_enabled) {
3641 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3642 if (cmd->state.streamout_enabled & (1 << i))
3643 tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
3644 }
3645 }
3646
3647 tu_cs_sanity_check(cs);
3648 }
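/* Rough flow of a draw, as implemented above: flush any caches dirtied
 * outside the renderpass, (re)bind whatever draw state is dirty, emit the
 * direct or indirect draw packet, and finally issue FLUSH_SO events for the
 * enabled streamout buffers -- presumably so their byte counters land in
 * memory for pause/resume and CmdDrawIndirectByteCountEXT.
 */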
3649
3650 void
3651 tu_CmdDraw(VkCommandBuffer commandBuffer,
3652 uint32_t vertexCount,
3653 uint32_t instanceCount,
3654 uint32_t firstVertex,
3655 uint32_t firstInstance)
3656 {
3657 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3658 struct tu_draw_info info = {};
3659
3660 info.count = vertexCount;
3661 info.instance_count = instanceCount;
3662 info.first_instance = firstInstance;
3663 info.vertex_offset = firstVertex;
3664
3665 tu_draw(cmd_buffer, &info);
3666 }
3667
3668 void
3669 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3670 uint32_t indexCount,
3671 uint32_t instanceCount,
3672 uint32_t firstIndex,
3673 int32_t vertexOffset,
3674 uint32_t firstInstance)
3675 {
3676 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3677 struct tu_draw_info info = {};
3678
3679 info.indexed = true;
3680 info.count = indexCount;
3681 info.instance_count = instanceCount;
3682 info.first_index = firstIndex;
3683 info.vertex_offset = vertexOffset;
3684 info.first_instance = firstInstance;
3685
3686 tu_draw(cmd_buffer, &info);
3687 }
3688
3689 void
3690 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3691 VkBuffer _buffer,
3692 VkDeviceSize offset,
3693 uint32_t drawCount,
3694 uint32_t stride)
3695 {
3696 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3697 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3698 struct tu_draw_info info = {};
3699
3700 info.count = drawCount;
3701 info.indirect = buffer;
3702 info.indirect_offset = offset;
3703 info.stride = stride;
3704
3705 tu_draw(cmd_buffer, &info);
3706 }
3707
3708 void
3709 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3710 VkBuffer _buffer,
3711 VkDeviceSize offset,
3712 uint32_t drawCount,
3713 uint32_t stride)
3714 {
3715 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3716 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3717 struct tu_draw_info info = {};
3718
3719 info.indexed = true;
3720 info.count = drawCount;
3721 info.indirect = buffer;
3722 info.indirect_offset = offset;
3723 info.stride = stride;
3724
3725 tu_draw(cmd_buffer, &info);
3726 }
3727
3728 void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3729 uint32_t instanceCount,
3730 uint32_t firstInstance,
3731 VkBuffer _counterBuffer,
3732 VkDeviceSize counterBufferOffset,
3733 uint32_t counterOffset,
3734 uint32_t vertexStride)
3735 {
3736 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3737 TU_FROM_HANDLE(tu_buffer, buffer, _counterBuffer);
3738
3739 struct tu_draw_info info = {};
3740
3741 info.instance_count = instanceCount;
3742 info.first_instance = firstInstance;
3743 info.streamout_buffer = buffer;
3744 info.streamout_buffer_offset = counterBufferOffset;
3745 info.stride = vertexStride;
3746
3747 tu_draw(cmd_buffer, &info);
3748 }
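/* Per the VK_EXT_transform_feedback spec, the vertex count for the draw
 * recorded above is derived from the byte count in the counter buffer,
 * roughly (counterBufferValue - counterOffset) / vertexStride.  The counter
 * is read on the GPU: here it is only passed along via info.streamout_buffer
 * rather than being resolved on the CPU.
 */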
3749
3750 struct tu_dispatch_info
3751 {
3752 /**
3753    * The layout of the grid (in block units) to be dispatched.
3754 */
3755 uint32_t blocks[3];
3756
3757 /**
3758 * A starting offset for the grid. If unaligned is set, the offset
3759 * must still be aligned.
3760 */
3761 uint32_t offsets[3];
3762 /**
3763 * Whether it's an unaligned compute dispatch.
3764 */
3765 bool unaligned;
3766
3767 /**
3768 * Indirect compute parameters resource.
3769 */
3770 struct tu_buffer *indirect;
3771 uint64_t indirect_offset;
3772 };
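/* When 'indirect' is set, the dispatch size is taken from a
 * VkDispatchIndirectCommand { uint32_t x, y, z; } (core Vulkan layout) at
 * 'indirect_offset' within the buffer, which the CP consumes directly in
 * tu_dispatch() below.
 */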
3773
3774 static void
3775 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3776 const struct tu_dispatch_info *info)
3777 {
3778 gl_shader_stage type = MESA_SHADER_COMPUTE;
3779 const struct tu_program_descriptor_linkage *link =
3780 &pipeline->program.link[type];
3781 const struct ir3_const_state *const_state = &link->const_state;
3782 uint32_t offset = const_state->offsets.driver_param;
3783
3784 if (link->constlen <= offset)
3785 return;
3786
3787 if (!info->indirect) {
3788 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3789 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3790 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3791 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3792 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3793 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3794 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3795 };
3796
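      /* constlen and the driver_param offset appear to be in vec4 units
       * while driver_params[] is in dwords -- hence the *4 when clamping
       * below and the /4 for NUM_UNIT.
       */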
3797 uint32_t num_consts = MIN2(const_state->num_driver_params,
3798 (link->constlen - offset) * 4);
3799 /* push constants */
3800 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3801 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3802 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3803 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3804 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3805 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3806 tu_cs_emit(cs, 0);
3807 tu_cs_emit(cs, 0);
3808 uint32_t i;
3809 for (i = 0; i < num_consts; i++)
3810 tu_cs_emit(cs, driver_params[i]);
3811 } else {
3812 tu_finishme("Indirect driver params");
3813 }
3814 }
3815
3816 static void
3817 tu_dispatch(struct tu_cmd_buffer *cmd,
3818 const struct tu_dispatch_info *info)
3819 {
3820 struct tu_cs *cs = &cmd->cs;
3821 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3822 struct tu_descriptor_state *descriptors_state =
3823 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3824 VkResult result;
3825
3826 /* TODO: We could probably flush less if we add a compute_flush_bits
3827 * bitfield.
3828 */
3829 tu_emit_cache_flush(cmd, cs);
3830
3831 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3832 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3833
3834 struct tu_cs_entry ib;
3835
3836 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3837 if (ib.size)
3838 tu_cs_emit_ib(cs, &ib);
3839
3840 tu_emit_compute_driver_params(cs, pipeline, info);
3841
3842 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
3843 result = tu6_emit_descriptor_sets(cmd, pipeline,
3844 VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
3845 false);
3846 if (result != VK_SUCCESS) {
3847 cmd->record_result = result;
3848 return;
3849 }
3850
3851 /* track BOs */
3852 unsigned i;
3853 for_each_bit(i, descriptors_state->valid) {
3854 struct tu_descriptor_set *set = descriptors_state->sets[i];
3855 for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
3856 if (set->buffers[j]) {
3857 tu_bo_list_add(&cmd->bo_list, set->buffers[j],
3858 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3859 }
3860 }
3861
3862 if (set->size > 0) {
3863 tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
3864 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
3865 }
3866 }
3867 }
3868
3869 if (ib.size)
3870 tu_cs_emit_ib(cs, &ib);
3871
3872 if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
3873 pipeline->load_state.state_ib.size > 0) {
3874 tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
3875 }
3876
3877 cmd->state.dirty &=
3878 ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
3879
3880 /* Compute shader state overwrites fragment shader state, so we flag the
3881 * graphics pipeline for re-emit.
3882 */
3883 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
3884
3885 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3886 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3887
3888 const uint32_t *local_size = pipeline->compute.local_size;
3889 const uint32_t *num_groups = info->blocks;
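   /* Program the compute NDRANGE: the local sizes are encoded minus one and
    * the global size is simply local_size * num_groups per dimension.  The
    * globaloff fields are left at zero, so the info->offsets base values are
    * not reflected here.
    */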
3890 tu_cs_emit_regs(cs,
3891 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3892 .localsizex = local_size[0] - 1,
3893 .localsizey = local_size[1] - 1,
3894 .localsizez = local_size[2] - 1),
3895 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3896 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3897 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3898 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3899 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3900 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3901
3902 tu_cs_emit_regs(cs,
3903 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3904 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3905 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3906
3907 if (info->indirect) {
3908 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3909
3910 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3911 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3912
3913 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3914 tu_cs_emit(cs, 0x00000000);
3915 tu_cs_emit_qw(cs, iova);
3916 tu_cs_emit(cs,
3917 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3918 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3919 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3920 } else {
3921 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3922 tu_cs_emit(cs, 0x00000000);
3923 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3924 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3925 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3926 }
3927
3928 tu_cs_emit_wfi(cs);
3929 }
3930
3931 void
3932 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3933 uint32_t base_x,
3934 uint32_t base_y,
3935 uint32_t base_z,
3936 uint32_t x,
3937 uint32_t y,
3938 uint32_t z)
3939 {
3940 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3941 struct tu_dispatch_info info = {};
3942
3943 info.blocks[0] = x;
3944 info.blocks[1] = y;
3945 info.blocks[2] = z;
3946
3947 info.offsets[0] = base_x;
3948 info.offsets[1] = base_y;
3949 info.offsets[2] = base_z;
3950 tu_dispatch(cmd_buffer, &info);
3951 }
3952
3953 void
3954 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3955 uint32_t x,
3956 uint32_t y,
3957 uint32_t z)
3958 {
3959 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3960 }
3961
3962 void
3963 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3964 VkBuffer _buffer,
3965 VkDeviceSize offset)
3966 {
3967 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3968 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3969 struct tu_dispatch_info info = {};
3970
3971 info.indirect = buffer;
3972 info.indirect_offset = offset;
3973
3974 tu_dispatch(cmd_buffer, &info);
3975 }
3976
3977 void
3978 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3979 {
3980 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3981
3982 tu_cs_end(&cmd_buffer->draw_cs);
3983 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3984
3985 if (use_sysmem_rendering(cmd_buffer))
3986 tu_cmd_render_sysmem(cmd_buffer);
3987 else
3988 tu_cmd_render_tiles(cmd_buffer);
3989
3990 /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
3991 rendered */
3992 tu_cs_discard_entries(&cmd_buffer->draw_cs);
3993 tu_cs_begin(&cmd_buffer->draw_cs);
3994 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
3995 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
3996
3997 cmd_buffer->state.cache.pending_flush_bits |=
3998 cmd_buffer->state.renderpass_cache.pending_flush_bits;
3999 tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
4000
4001 cmd_buffer->state.pass = NULL;
4002 cmd_buffer->state.subpass = NULL;
4003 cmd_buffer->state.framebuffer = NULL;
4004 }
4005
4006 void
4007 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4008 const VkSubpassEndInfoKHR *pSubpassEndInfo)
4009 {
4010 tu_CmdEndRenderPass(commandBuffer);
4011 }
4012
4013 struct tu_barrier_info
4014 {
4015 uint32_t eventCount;
4016 const VkEvent *pEvents;
4017 VkPipelineStageFlags srcStageMask;
4018 };
4019
4020 static void
4021 tu_barrier(struct tu_cmd_buffer *cmd,
4022 uint32_t memoryBarrierCount,
4023 const VkMemoryBarrier *pMemoryBarriers,
4024 uint32_t bufferMemoryBarrierCount,
4025 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4026 uint32_t imageMemoryBarrierCount,
4027 const VkImageMemoryBarrier *pImageMemoryBarriers,
4028 const struct tu_barrier_info *info)
4029 {
4030 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
4031 VkAccessFlags srcAccessMask = 0;
4032 VkAccessFlags dstAccessMask = 0;
4033
4034 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
4035 srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
4036 dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
4037 }
4038
4039 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
4040 srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
4041 dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
4042 }
4043
4044 enum tu_cmd_access_mask src_flags = 0;
4045 enum tu_cmd_access_mask dst_flags = 0;
4046
4047 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
4048 TU_FROM_HANDLE(tu_image, image, pImageMemoryBarriers[i].image);
4049 VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
4050 /* For non-linear images, PREINITIALIZED is the same as UNDEFINED */
4051 if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
4052 (image->tiling != VK_IMAGE_TILING_LINEAR &&
4053 old_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
4054 /* The underlying memory for this image may have been used earlier
4055 * within the same queue submission for a different image, which
4056 * means that there may be old, stale cache entries which are in the
4057 * "wrong" location, which could cause problems later after writing
4058 * to the image. We don't want these entries being flushed later and
4059 * overwriting the actual image, so we need to flush the CCU.
4060 */
4061 src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4062 }
4063 srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
4064 dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
4065 }
4066
4067    /* Inside a renderpass, we don't yet know whether we'll end up using sysmem
4068     * or GMEM rendering, so we have to use the more conservative sysmem flushes.
4069 */
4070 bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
4071 !cmd->state.pass;
4072 src_flags |= vk2tu_access(srcAccessMask, gmem);
4073 dst_flags |= vk2tu_access(dstAccessMask, gmem);
4074
4075 struct tu_cache_state *cache =
4076 cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
4077 tu_flush_for_access(cache, src_flags, dst_flags);
4078
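   /* For vkCmdWaitEvents, have the CP poll each event BO until it contains
    * the value written by vkCmdSetEvent (1), using WAIT_REG_MEM in
    * memory-poll mode with a short delay between polls.
    */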
4079 for (uint32_t i = 0; i < info->eventCount; i++) {
4080 TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
4081
4082 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4083
4084 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4085 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4086 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4087 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4088 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4089 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4090 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4091 }
4092 }
4093
4094 void
4095 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4096 VkPipelineStageFlags srcStageMask,
4097 VkPipelineStageFlags dstStageMask,
4098 VkDependencyFlags dependencyFlags,
4099 uint32_t memoryBarrierCount,
4100 const VkMemoryBarrier *pMemoryBarriers,
4101 uint32_t bufferMemoryBarrierCount,
4102 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4103 uint32_t imageMemoryBarrierCount,
4104 const VkImageMemoryBarrier *pImageMemoryBarriers)
4105 {
4106 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4107 struct tu_barrier_info info;
4108
4109 info.eventCount = 0;
4110 info.pEvents = NULL;
4111 info.srcStageMask = srcStageMask;
4112
4113 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4114 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4115 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4116 }
4117
4118 static void
4119 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
4120 VkPipelineStageFlags stageMask, unsigned value)
4121 {
4122 struct tu_cs *cs = &cmd->cs;
4123
4124 /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
4125 assert(!cmd->state.pass);
4126
4127 tu_emit_cache_flush(cmd, cs);
4128
4129 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4130
4131 /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
4132 * read by the CP, so the draw indirect stage counts as top-of-pipe too.
4133 */
4134 VkPipelineStageFlags top_of_pipe_flags =
4135 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
4136 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
4137
4138 if (!(stageMask & ~top_of_pipe_flags)) {
4139 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4140 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4141 tu_cs_emit(cs, value);
4142 } else {
4143 /* Use a RB_DONE_TS event to wait for everything to complete. */
4144 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
4145 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
4146 tu_cs_emit_qw(cs, event->bo.iova);
4147 tu_cs_emit(cs, value);
4148 }
4149 }
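/* tu_CmdSetEvent and tu_CmdResetEvent below differ only in the value written
 * (1 vs 0); the 1 is what the WAIT_REG_MEM poll in tu_barrier() compares
 * against when waiting on events.
 */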
4150
4151 void
4152 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4153 VkEvent _event,
4154 VkPipelineStageFlags stageMask)
4155 {
4156 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4157 TU_FROM_HANDLE(tu_event, event, _event);
4158
4159 write_event(cmd, event, stageMask, 1);
4160 }
4161
4162 void
4163 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4164 VkEvent _event,
4165 VkPipelineStageFlags stageMask)
4166 {
4167 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4168 TU_FROM_HANDLE(tu_event, event, _event);
4169
4170 write_event(cmd, event, stageMask, 0);
4171 }
4172
4173 void
4174 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4175 uint32_t eventCount,
4176 const VkEvent *pEvents,
4177 VkPipelineStageFlags srcStageMask,
4178 VkPipelineStageFlags dstStageMask,
4179 uint32_t memoryBarrierCount,
4180 const VkMemoryBarrier *pMemoryBarriers,
4181 uint32_t bufferMemoryBarrierCount,
4182 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4183 uint32_t imageMemoryBarrierCount,
4184 const VkImageMemoryBarrier *pImageMemoryBarriers)
4185 {
4186 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4187 struct tu_barrier_info info;
4188
4189 info.eventCount = eventCount;
4190 info.pEvents = pEvents;
4191 info.srcStageMask = 0;
4192
4193 tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
4194 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4195 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4196 }
4197
4198 void
4199 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4200 {
4201 /* No-op */
4202 }