turnip: automatically reserve cmdstream space in emit_pkt4/emit_pkt7
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36 #include "tu_blit.h"
37
38 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
39
40 void
41 tu_bo_list_init(struct tu_bo_list *list)
42 {
43 list->count = list->capacity = 0;
44 list->bo_infos = NULL;
45 }
46
47 void
48 tu_bo_list_destroy(struct tu_bo_list *list)
49 {
50 free(list->bo_infos);
51 }
52
53 void
54 tu_bo_list_reset(struct tu_bo_list *list)
55 {
56 list->count = 0;
57 }
58
59 /**
60 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
61 */
62 static uint32_t
63 tu_bo_list_add_info(struct tu_bo_list *list,
64 const struct drm_msm_gem_submit_bo *bo_info)
65 {
66 assert(bo_info->handle != 0);
67
68 for (uint32_t i = 0; i < list->count; ++i) {
69 if (list->bo_infos[i].handle == bo_info->handle) {
70 assert(list->bo_infos[i].presumed == bo_info->presumed);
71 list->bo_infos[i].flags |= bo_info->flags;
72 return i;
73 }
74 }
75
76 /* grow list->bo_infos if needed */
77 if (list->count == list->capacity) {
78 uint32_t new_capacity = MAX2(2 * list->count, 16);
79 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
80 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
81 if (!new_bo_infos)
82 return TU_BO_LIST_FAILED;
83 list->bo_infos = new_bo_infos;
84 list->capacity = new_capacity;
85 }
86
87 list->bo_infos[list->count] = *bo_info;
88 return list->count++;
89 }
90
91 uint32_t
92 tu_bo_list_add(struct tu_bo_list *list,
93 const struct tu_bo *bo,
94 uint32_t flags)
95 {
96 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
97 .flags = flags,
98 .handle = bo->gem_handle,
99 .presumed = bo->iova,
100 });
101 }
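/* Illustrative usage (editorial, not part of the original file): callers
 * record every BO the GPU will touch, e.g.
 *
 *    tu_bo_list_add(&cmd->bo_list, bo,
 *                   MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
 *
 * Adding the same GEM handle twice just ORs in the new flags (see
 * tu_bo_list_add_info() above), so a BO can safely be added once per use.
 */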
102
103 VkResult
104 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
105 {
106 for (uint32_t i = 0; i < other->count; i++) {
107 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
108 return VK_ERROR_OUT_OF_HOST_MEMORY;
109 }
110
111 return VK_SUCCESS;
112 }
113
114 static bool
115 is_linear_mipmapped(const struct tu_image_view *iview)
116 {
117 return iview->image->layout.tile_mode == TILE6_LINEAR &&
118 iview->base_mip != iview->image->level_count - 1;
119 }
120
121 static bool
122 force_sysmem(const struct tu_cmd_buffer *cmd,
123 const struct VkRect2D *render_area)
124 {
125 const struct tu_framebuffer *fb = cmd->state.framebuffer;
126 const struct tu_physical_device *device = cmd->device->physical_device;
127 bool has_linear_mipmapped_store = false;
128 const struct tu_render_pass *pass = cmd->state.pass;
129
130 /* Iterate over all the places we call tu6_emit_store_attachment() */
131 for (unsigned i = 0; i < pass->subpass_count; i++) {
132 const struct tu_subpass *subpass = &pass->subpasses[i];
133 if (subpass->resolve_attachments) {
134 for (unsigned i = 0; i < subpass->color_count; i++) {
135 uint32_t a = subpass->resolve_attachments[i].attachment;
136 if (a != VK_ATTACHMENT_UNUSED &&
137 cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
138 const struct tu_image_view *iview = fb->attachments[a].attachment;
139 if (is_linear_mipmapped(iview)) {
140 has_linear_mipmapped_store = true;
141 break;
142 }
143 }
144 }
145 }
146 }
147
148 for (unsigned i = 0; i < pass->attachment_count; i++) {
149 if (pass->attachments[i].gmem_offset >= 0 &&
150 cmd->state.pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
151 const struct tu_image_view *iview = fb->attachments[i].attachment;
152 if (is_linear_mipmapped(iview)) {
153 has_linear_mipmapped_store = true;
154 break;
155 }
156 }
157 }
158
159 /* Linear textures cannot have any padding between mipmap levels and their
160 * height isn't padded, while at the same time the GMEM->MEM resolve does
161 * not have per-pixel granularity, so if the image height isn't aligned to
162 * the resolve granularity and the render area is tall enough, we may wind
163 * up writing past the bottom of the image into the next miplevel or even
164 * past the end of the image. For the last miplevel, the layout code should
165 * insert enough padding so that the overdraw only hits that padding; for the
166 * other miplevels, we work around this by force-enabling sysmem rendering.
167 */
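/* Illustrative numbers (editorial, assuming tile_align_h = 16): an attachment
 * miplevel that is 100 pixels tall with a render area reaching y2 = 100 gives
 * aligned_y2 = ALIGN_POT(100, 16) = 112 > 100, so the GMEM resolve would
 * write 12 rows past the bottom of that miplevel.
 */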
168 const uint32_t y2 = render_area->offset.y + render_area->extent.height;
169 const uint32_t aligned_y2 = ALIGN_POT(y2, device->tile_align_h);
170
171 return has_linear_mipmapped_store && aligned_y2 > fb->height;
172 }
173
174 static void
175 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
176 const struct tu_device *dev,
177 uint32_t pixels)
178 {
179 const uint32_t tile_align_w = dev->physical_device->tile_align_w;
180 const uint32_t tile_align_h = dev->physical_device->tile_align_h;
181 const uint32_t max_tile_width = 1024; /* A6xx */
182
183 tiling->tile0.offset = (VkOffset2D) {
184 .x = tiling->render_area.offset.x & ~(tile_align_w - 1),
185 .y = tiling->render_area.offset.y & ~(tile_align_h - 1),
186 };
187
188 const uint32_t ra_width =
189 tiling->render_area.extent.width +
190 (tiling->render_area.offset.x - tiling->tile0.offset.x);
191 const uint32_t ra_height =
192 tiling->render_area.extent.height +
193 (tiling->render_area.offset.y - tiling->tile0.offset.y);
194
195 /* start from 1 tile */
196 tiling->tile_count = (VkExtent2D) {
197 .width = 1,
198 .height = 1,
199 };
200 tiling->tile0.extent = (VkExtent2D) {
201 .width = align(ra_width, tile_align_w),
202 .height = align(ra_height, tile_align_h),
203 };
204
205 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
206 /* start with 2x2 tiles */
207 tiling->tile_count.width = 2;
208 tiling->tile_count.height = 2;
209 tiling->tile0.extent.width = align(DIV_ROUND_UP(ra_width, 2), tile_align_w);
210 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), tile_align_h);
211 }
212
213 /* do not exceed max tile width */
214 while (tiling->tile0.extent.width > max_tile_width) {
215 tiling->tile_count.width++;
216 tiling->tile0.extent.width =
217 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
218 }
219
220 /* do not exceed gmem size */
221 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pixels) {
222 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
223 tiling->tile_count.width++;
224 tiling->tile0.extent.width =
225 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
226 } else {
227 /* if this assert fails then layout is impossible.. */
228 assert(tiling->tile0.extent.height > tile_align_h);
229 tiling->tile_count.height++;
230 tiling->tile0.extent.height =
231 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), tile_align_h);
232 }
233 }
234 }
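/* Worked example with assumed values (editorial; tile_align_w/h = 32,
 * max_tile_width = 1024, gmem budget "pixels" = 524288):
 *
 *   render area 1920x1080 at (0,0)  ->  tile0 1920x1088, tile_count 1x1
 *   max-width pass: 1920 > 1024     ->  tile0  960x1088, tile_count 2x1
 *   gmem pass: 960*1088 > 524288, and 960 <= MAX2(32, 1088), so the height
 *   is split instead of the width   ->  tile0  960x544,  tile_count 2x2
 */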
235
236 static void
237 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
238 const struct tu_device *dev)
239 {
240 const uint32_t max_pipe_count = 32; /* A6xx */
241
242 /* start from 1 tile per pipe */
243 tiling->pipe0 = (VkExtent2D) {
244 .width = 1,
245 .height = 1,
246 };
247 tiling->pipe_count = tiling->tile_count;
248
249 /* do not exceed max pipe count vertically */
250 while (tiling->pipe_count.height > max_pipe_count) {
251 tiling->pipe0.height += 2;
252 tiling->pipe_count.height =
253 (tiling->tile_count.height + tiling->pipe0.height - 1) /
254 tiling->pipe0.height;
255 }
256
257 /* do not exceed max pipe count */
258 while (tiling->pipe_count.width * tiling->pipe_count.height >
259 max_pipe_count) {
260 tiling->pipe0.width += 1;
261 tiling->pipe_count.width =
262 (tiling->tile_count.width + tiling->pipe0.width - 1) /
263 tiling->pipe0.width;
264 }
265 }
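/* Hypothetical example (editorial): a 10x8 tile grid starts as a 10x8 pipe
 * grid (80 pipes).  pipe_count.height = 8 is already <= 32, so only the
 * total-count loop runs: pipe0.width grows 1 -> 2 -> 3, shrinking the pipe
 * grid to 5x8 and then 4x8; 4*8 = 32 fits, so each pipe covers 3x1 tiles.
 */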
266
267 static void
268 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
269 const struct tu_device *dev)
270 {
271 const uint32_t max_pipe_count = 32; /* A6xx */
272 const uint32_t used_pipe_count =
273 tiling->pipe_count.width * tiling->pipe_count.height;
274 const VkExtent2D last_pipe = {
275 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
276 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
277 };
278
279 assert(used_pipe_count <= max_pipe_count);
280 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
281
282 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
283 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
284 const uint32_t pipe_x = tiling->pipe0.width * x;
285 const uint32_t pipe_y = tiling->pipe0.height * y;
286 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
287 ? last_pipe.width
288 : tiling->pipe0.width;
289 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
290 ? last_pipe.height
291 : tiling->pipe0.height;
292 const uint32_t n = tiling->pipe_count.width * y + x;
293
294 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
295 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
296 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
297 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
298 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
299 }
300 }
301
302 memset(tiling->pipe_config + used_pipe_count, 0,
303 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
304 }
305
306 static void
307 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
308 const struct tu_device *dev,
309 uint32_t tx,
310 uint32_t ty,
311 struct tu_tile *tile)
312 {
313 /* find the pipe and the slot for tile (tx, ty) */
314 const uint32_t px = tx / tiling->pipe0.width;
315 const uint32_t py = ty / tiling->pipe0.height;
316 const uint32_t sx = tx - tiling->pipe0.width * px;
317 const uint32_t sy = ty - tiling->pipe0.height * py;
318
319 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
320 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
321 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
322
323 /* convert to 1D indices */
324 tile->pipe = tiling->pipe_count.width * py + px;
325 tile->slot = tiling->pipe0.width * sy + sx;
326
327 /* get the blit area for the tile */
328 tile->begin = (VkOffset2D) {
329 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
330 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
331 };
332 tile->end.x =
333 (tx == tiling->tile_count.width - 1)
334 ? tiling->render_area.offset.x + tiling->render_area.extent.width
335 : tile->begin.x + tiling->tile0.extent.width;
336 tile->end.y =
337 (ty == tiling->tile_count.height - 1)
338 ? tiling->render_area.offset.y + tiling->render_area.extent.height
339 : tile->begin.y + tiling->tile0.extent.height;
340 }
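/* Continuing the hypothetical 10x8-tile / 3x1-pipe example above (editorial):
 * tile (tx=7, ty=5) maps to pipe (px=2, py=5), slot (sx=1, sy=0), i.e.
 * tile->pipe = 4*5 + 2 = 22 and tile->slot = 3*0 + 1 = 1.
 */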
341
342 enum a3xx_msaa_samples
343 tu_msaa_samples(uint32_t samples)
344 {
345 switch (samples) {
346 case 1:
347 return MSAA_ONE;
348 case 2:
349 return MSAA_TWO;
350 case 4:
351 return MSAA_FOUR;
352 case 8:
353 return MSAA_EIGHT;
354 default:
355 assert(!"invalid sample count");
356 return MSAA_ONE;
357 }
358 }
359
360 static enum a4xx_index_size
361 tu6_index_size(VkIndexType type)
362 {
363 switch (type) {
364 case VK_INDEX_TYPE_UINT16:
365 return INDEX4_SIZE_16_BIT;
366 case VK_INDEX_TYPE_UINT32:
367 return INDEX4_SIZE_32_BIT;
368 default:
369 unreachable("invalid VkIndexType");
370 return INDEX4_SIZE_8_BIT;
371 }
372 }
373
374 static void
375 tu6_emit_marker(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
376 {
377 tu_cs_emit_write_reg(cs, cmd->marker_reg, ++cmd->marker_seqno);
378 }
379
380 unsigned
381 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
382 struct tu_cs *cs,
383 enum vgt_event_type event,
384 bool need_seqno)
385 {
386 unsigned seqno = 0;
387
388 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
389 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
390 if (need_seqno) {
391 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
392 seqno = ++cmd->scratch_seqno;
393 tu_cs_emit(cs, seqno);
394 }
395
396 return seqno;
397 }
398
399 static void
400 tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
401 {
402 tu6_emit_event_write(cmd, cs, 0x31, false);
403 }
404
405 static void
406 tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
407 {
408 tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false);
409 }
410
411 static void
412 tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
413 {
414 if (cmd->wait_for_idle) {
415 tu_cs_emit_wfi(cs);
416 cmd->wait_for_idle = false;
417 }
418 }
419
420 #define tu_image_view_ubwc_pitches(iview) \
421 .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \
422 .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2
423
424 static void
425 tu6_emit_zs(struct tu_cmd_buffer *cmd,
426 const struct tu_subpass *subpass,
427 struct tu_cs *cs)
428 {
429 const struct tu_framebuffer *fb = cmd->state.framebuffer;
430
431 const uint32_t a = subpass->depth_stencil_attachment.attachment;
432 if (a == VK_ATTACHMENT_UNUSED) {
433 tu_cs_emit_regs(cs,
434 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
435 A6XX_RB_DEPTH_BUFFER_PITCH(0),
436 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
437 A6XX_RB_DEPTH_BUFFER_BASE(0),
438 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
439
440 tu_cs_emit_regs(cs,
441 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
442
443 tu_cs_emit_regs(cs,
444 A6XX_GRAS_LRZ_BUFFER_BASE(0),
445 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
446 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
447
448 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
449
450 return;
451 }
452
453 const struct tu_image_view *iview = fb->attachments[a].attachment;
454 enum a6xx_depth_format fmt = tu6_pipe2depth(iview->vk_format);
455
456 tu_cs_emit_regs(cs,
457 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt),
458 A6XX_RB_DEPTH_BUFFER_PITCH(tu_image_stride(iview->image, iview->base_mip)),
459 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(iview->image->layout.layer_size),
460 A6XX_RB_DEPTH_BUFFER_BASE(tu_image_view_base_ref(iview)),
461 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(cmd->state.pass->attachments[a].gmem_offset));
462
463 tu_cs_emit_regs(cs,
464 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
465
466 tu_cs_emit_regs(cs,
467 A6XX_RB_DEPTH_FLAG_BUFFER_BASE(tu_image_view_ubwc_base_ref(iview)),
468 A6XX_RB_DEPTH_FLAG_BUFFER_PITCH(tu_image_view_ubwc_pitches(iview)));
469
470 tu_cs_emit_regs(cs,
471 A6XX_GRAS_LRZ_BUFFER_BASE(0),
472 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
473 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
474
475 tu_cs_emit_regs(cs,
476 A6XX_RB_STENCIL_INFO(0));
477
478 /* enable zs? */
479 }
480
481 static void
482 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
483 const struct tu_subpass *subpass,
484 struct tu_cs *cs)
485 {
486 const struct tu_framebuffer *fb = cmd->state.framebuffer;
487 unsigned char mrt_comp[MAX_RTS] = { 0 };
488 unsigned srgb_cntl = 0;
489
490 for (uint32_t i = 0; i < subpass->color_count; ++i) {
491 uint32_t a = subpass->color_attachments[i].attachment;
492 if (a == VK_ATTACHMENT_UNUSED)
493 continue;
494
495 const struct tu_image_view *iview = fb->attachments[a].attachment;
496 const enum a6xx_tile_mode tile_mode =
497 tu6_get_image_tile_mode(iview->image, iview->base_mip);
498
499 mrt_comp[i] = 0xf;
500
501 if (vk_format_is_srgb(iview->vk_format))
502 srgb_cntl |= (1 << i);
503
504 const struct tu_native_format *format =
505 tu6_get_native_format(iview->vk_format);
506 assert(format && format->rb >= 0);
507
508 tu_cs_emit_regs(cs,
509 A6XX_RB_MRT_BUF_INFO(i,
510 .color_tile_mode = tile_mode,
511 .color_format = format->rb,
512 .color_swap = format->swap),
513 A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)),
514 A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size),
515 A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)),
516 A6XX_RB_MRT_BASE_GMEM(i, cmd->state.pass->attachments[a].gmem_offset));
517
518 tu_cs_emit_regs(cs,
519 A6XX_SP_FS_MRT_REG(i,
520 .color_format = format->rb,
521 .color_sint = vk_format_is_sint(iview->vk_format),
522 .color_uint = vk_format_is_uint(iview->vk_format)));
523
524 tu_cs_emit_regs(cs,
525 A6XX_RB_MRT_FLAG_BUFFER_ADDR(i, tu_image_view_ubwc_base_ref(iview)),
526 A6XX_RB_MRT_FLAG_BUFFER_PITCH(i, tu_image_view_ubwc_pitches(iview)));
527 }
528
529 tu_cs_emit_regs(cs,
530 A6XX_RB_SRGB_CNTL(srgb_cntl));
531
532 tu_cs_emit_regs(cs,
533 A6XX_SP_SRGB_CNTL(srgb_cntl));
534
535 tu_cs_emit_regs(cs,
536 A6XX_RB_RENDER_COMPONENTS(
537 .rt0 = mrt_comp[0],
538 .rt1 = mrt_comp[1],
539 .rt2 = mrt_comp[2],
540 .rt3 = mrt_comp[3],
541 .rt4 = mrt_comp[4],
542 .rt5 = mrt_comp[5],
543 .rt6 = mrt_comp[6],
544 .rt7 = mrt_comp[7]));
545
546 tu_cs_emit_regs(cs,
547 A6XX_SP_FS_RENDER_COMPONENTS(
548 .rt0 = mrt_comp[0],
549 .rt1 = mrt_comp[1],
550 .rt2 = mrt_comp[2],
551 .rt3 = mrt_comp[3],
552 .rt4 = mrt_comp[4],
553 .rt5 = mrt_comp[5],
554 .rt6 = mrt_comp[6],
555 .rt7 = mrt_comp[7]));
556 }
557
558 static void
559 tu6_emit_msaa(struct tu_cmd_buffer *cmd,
560 const struct tu_subpass *subpass,
561 struct tu_cs *cs)
562 {
563 const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples);
564 bool msaa_disable = samples == MSAA_ONE;
565
566 tu_cs_emit_regs(cs,
567 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
568 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
569 .msaa_disable = msaa_disable));
570
571 tu_cs_emit_regs(cs,
572 A6XX_GRAS_RAS_MSAA_CNTL(samples),
573 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
574 .msaa_disable = msaa_disable));
575
576 tu_cs_emit_regs(cs,
577 A6XX_RB_RAS_MSAA_CNTL(samples),
578 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
579 .msaa_disable = msaa_disable));
580
581 tu_cs_emit_regs(cs,
582 A6XX_RB_MSAA_CNTL(samples));
583 }
584
585 static void
586 tu6_emit_bin_size(struct tu_cs *cs,
587 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
588 {
589 tu_cs_emit_regs(cs,
590 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
591 .binh = bin_h,
592 .dword = flags));
593
594 tu_cs_emit_regs(cs,
595 A6XX_RB_BIN_CONTROL(.binw = bin_w,
596 .binh = bin_h,
597 .dword = flags));
598
599 /* no flag for RB_BIN_CONTROL2... */
600 tu_cs_emit_regs(cs,
601 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
602 .binh = bin_h));
603 }
604
605 static void
606 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
607 const struct tu_subpass *subpass,
608 struct tu_cs *cs,
609 bool binning)
610 {
611 const struct tu_framebuffer *fb = cmd->state.framebuffer;
612 uint32_t cntl = 0;
613 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
614 if (binning) {
615 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
616 } else {
617 uint32_t mrts_ubwc_enable = 0;
618 for (uint32_t i = 0; i < subpass->color_count; ++i) {
619 uint32_t a = subpass->color_attachments[i].attachment;
620 if (a == VK_ATTACHMENT_UNUSED)
621 continue;
622
623 const struct tu_image_view *iview = fb->attachments[a].attachment;
624 if (iview->image->layout.ubwc_layer_size != 0)
625 mrts_ubwc_enable |= 1 << i;
626 }
627
628 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
629
630 const uint32_t a = subpass->depth_stencil_attachment.attachment;
631 if (a != VK_ATTACHMENT_UNUSED) {
632 const struct tu_image_view *iview = fb->attachments[a].attachment;
633 if (iview->image->layout.ubwc_layer_size != 0)
634 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
635 }
636
637 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
638 * in order to set it correctly for the different subpasses. However,
639 * that means the packets we're emitting also happen during binning. So
640 * we need to guard the write on !BINNING at CP execution time.
641 */
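/* Editorial note: tu_cs_reserve(cs, 3 + 4) appears to cover the 3-dword
 * CP_COND_REG_EXEC plus the 4-dword CP_REG_WRITE it predicates, keeping both
 * packets contiguous so the DWORDS(4) skip count stays valid.
 */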
642 tu_cs_reserve(cs, 3 + 4);
643 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
644 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
645 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
646 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
647 }
648
649 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
650 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
651 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
652 tu_cs_emit(cs, cntl);
653 }
654
655 static void
656 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
657 {
658 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
659 uint32_t x1 = render_area->offset.x;
660 uint32_t y1 = render_area->offset.y;
661 uint32_t x2 = x1 + render_area->extent.width - 1;
662 uint32_t y2 = y1 + render_area->extent.height - 1;
663
664 /* TODO: alignment requirement seems to be less than tile_align_w/h */
665 if (align) {
666 x1 = x1 & ~(cmd->device->physical_device->tile_align_w - 1);
667 y1 = y1 & ~(cmd->device->physical_device->tile_align_h - 1);
668 x2 = ALIGN_POT(x2 + 1, cmd->device->physical_device->tile_align_w) - 1;
669 y2 = ALIGN_POT(y2 + 1, cmd->device->physical_device->tile_align_h) - 1;
670 }
671
672 tu_cs_emit_regs(cs,
673 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
674 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
675 }
676
677 static void
678 tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
679 struct tu_cs *cs,
680 const struct tu_image_view *iview,
681 uint32_t gmem_offset,
682 bool resolve)
683 {
684 tu_cs_emit_regs(cs,
685 A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve));
686
687 const struct tu_native_format *format =
688 tu6_get_native_format(iview->vk_format);
689 assert(format && format->rb >= 0);
690
691 enum a6xx_tile_mode tile_mode =
692 tu6_get_image_tile_mode(iview->image, iview->base_mip);
693 tu_cs_emit_regs(cs,
694 A6XX_RB_BLIT_DST_INFO(
695 .tile_mode = tile_mode,
696 .samples = tu_msaa_samples(iview->image->samples),
697 .color_format = format->rb,
698 .color_swap = format->swap,
699 .flags = iview->image->layout.ubwc_layer_size != 0),
700 A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
701 A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
702 A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size));
703
704 if (iview->image->layout.ubwc_layer_size) {
705 tu_cs_emit_regs(cs,
706 A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)),
707 A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview)));
708 }
709
710 tu_cs_emit_regs(cs,
711 A6XX_RB_BLIT_BASE_GMEM(gmem_offset));
712 }
713
714 static void
715 tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
716 {
717 tu6_emit_marker(cmd, cs);
718 tu6_emit_event_write(cmd, cs, BLIT, false);
719 tu6_emit_marker(cmd, cs);
720 }
721
722 static void
723 tu6_emit_window_scissor(struct tu_cmd_buffer *cmd,
724 struct tu_cs *cs,
725 uint32_t x1,
726 uint32_t y1,
727 uint32_t x2,
728 uint32_t y2)
729 {
730 tu_cs_emit_regs(cs,
731 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
732 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
733
734 tu_cs_emit_regs(cs,
735 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
736 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
737 }
738
739 static void
740 tu6_emit_window_offset(struct tu_cmd_buffer *cmd,
741 struct tu_cs *cs,
742 uint32_t x1,
743 uint32_t y1)
744 {
745 tu_cs_emit_regs(cs,
746 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
747
748 tu_cs_emit_regs(cs,
749 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
750
751 tu_cs_emit_regs(cs,
752 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
753
754 tu_cs_emit_regs(cs,
755 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
756 }
757
758 static bool
759 use_hw_binning(struct tu_cmd_buffer *cmd)
760 {
761 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
762
763 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
764 return false;
765
766 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
767 return true;
768
769 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
770 }
771
772 static bool
773 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
774 {
775 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
776 return true;
777
778 return cmd->state.tiling_config.force_sysmem;
779 }
780
781 static void
782 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
783 struct tu_cs *cs,
784 const struct tu_tile *tile)
785 {
786 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
787 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
788
789 tu6_emit_marker(cmd, cs);
790 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
791 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
792 tu6_emit_marker(cmd, cs);
793
794 const uint32_t x1 = tile->begin.x;
795 const uint32_t y1 = tile->begin.y;
796 const uint32_t x2 = tile->end.x - 1;
797 const uint32_t y2 = tile->end.y - 1;
798 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
799 tu6_emit_window_offset(cmd, cs, x1, y1);
800
801 tu_cs_emit_regs(cs,
802 A6XX_VPC_SO_OVERRIDE(.so_disable = true));
803
804 if (use_hw_binning(cmd)) {
805 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
806
807 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
808 tu_cs_emit(cs, 0x0);
809
810 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
811 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
812 A6XX_CP_REG_TEST_0_BIT(0) |
813 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
814
815 tu_cs_reserve(cs, 3 + 11);
816 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
817 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
818 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
819
820 /* if (no overflow) */ {
821 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
822 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
823 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
824 tu_cs_emit_qw(cs, cmd->vsc_data.iova + tile->pipe * cmd->vsc_data_pitch);
825 tu_cs_emit_qw(cs, cmd->vsc_data.iova + (tile->pipe * 4) + (32 * cmd->vsc_data_pitch));
826 tu_cs_emit_qw(cs, cmd->vsc_data2.iova + (tile->pipe * cmd->vsc_data2_pitch));
827
828 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
829 tu_cs_emit(cs, 0x0);
830
831 /* use a NOP packet to skip over the 'else' side: */
832 tu_cs_emit_pkt7(cs, CP_NOP, 2);
833 } /* else */ {
834 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
835 tu_cs_emit(cs, 0x1);
836 }
837
838 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
839 tu_cs_emit(cs, 0x0);
840
841 tu_cs_emit_regs(cs,
842 A6XX_RB_UNKNOWN_8804(0));
843
844 tu_cs_emit_regs(cs,
845 A6XX_SP_TP_UNKNOWN_B304(0));
846
847 tu_cs_emit_regs(cs,
848 A6XX_GRAS_UNKNOWN_80A4(0));
849 } else {
850 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
851 tu_cs_emit(cs, 0x1);
852
853 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
854 tu_cs_emit(cs, 0x0);
855 }
856 }
857
858 static void
859 tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
860 {
861 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
862 const struct tu_framebuffer *fb = cmd->state.framebuffer;
863 const struct tu_image_view *iview = fb->attachments[a].attachment;
864 const struct tu_render_pass_attachment *attachment =
865 &cmd->state.pass->attachments[a];
866
867 if (attachment->gmem_offset < 0)
868 return;
869
870 const uint32_t x1 = tiling->render_area.offset.x;
871 const uint32_t y1 = tiling->render_area.offset.y;
872 const uint32_t x2 = x1 + tiling->render_area.extent.width;
873 const uint32_t y2 = y1 + tiling->render_area.extent.height;
874 const uint32_t tile_x2 =
875 tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width;
876 const uint32_t tile_y2 =
877 tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height;
878 bool need_load =
879 x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) ||
880 y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2);
881
882 if (need_load)
883 tu_finishme("improve handling of unaligned render area");
884
885 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
886 need_load = true;
887
888 if (vk_format_has_stencil(iview->vk_format) &&
889 attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
890 need_load = true;
891
892 if (need_load) {
893 tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false);
894 tu6_emit_blit(cmd, cs);
895 }
896 }
897
898 static void
899 tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
900 uint32_t a,
901 const VkRenderPassBeginInfo *info)
902 {
903 const struct tu_framebuffer *fb = cmd->state.framebuffer;
904 const struct tu_image_view *iview = fb->attachments[a].attachment;
905 const struct tu_render_pass_attachment *attachment =
906 &cmd->state.pass->attachments[a];
907 unsigned clear_mask = 0;
908
909 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
910 if (attachment->gmem_offset < 0)
911 return;
912
913 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
914 clear_mask = 0xf;
915
916 if (vk_format_has_stencil(iview->vk_format)) {
917 clear_mask &= 0x1;
918 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
919 clear_mask |= 0x2;
920 }
921 if (!clear_mask)
922 return;
923
924 tu_clear_gmem_attachment(cmd, cs, a, clear_mask,
925 &info->pClearValues[a]);
926 }
927
928 static void
929 tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd,
930 struct tu_cs *cs,
931 uint32_t a,
932 uint32_t gmem_a,
933 bool resolve)
934 {
935 const uint32_t space = 14 + 6;
936 struct tu_cond_exec_state state;
937
938 VkResult result = tu_cond_exec_start(cmd->device, cs, &state,
939 CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
940 CP_COND_REG_EXEC_0_GMEM,
941 space);
942 if (result != VK_SUCCESS) {
943 cmd->record_result = result;
944 return;
945 }
946
947 tu6_emit_blit_info(cmd, cs,
948 cmd->state.framebuffer->attachments[a].attachment,
949 cmd->state.pass->attachments[gmem_a].gmem_offset, resolve);
950 tu6_emit_blit(cmd, cs);
951
952 tu_cond_exec_end(cs, &state);
953 }
954
955 static void
956 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
957 struct tu_cs *cs,
958 uint32_t a,
959 uint32_t gmem_a)
960 {
961 const struct tu_framebuffer *fb = cmd->state.framebuffer;
962 const struct tu_image_view *dst = fb->attachments[a].attachment;
963 const struct tu_image_view *src = fb->attachments[gmem_a].attachment;
964
965 tu_blit(cmd, cs, &(struct tu_blit) {
966 .dst = sysmem_attachment_surf(dst, dst->base_layer,
967 &cmd->state.tiling_config.render_area),
968 .src = sysmem_attachment_surf(src, src->base_layer,
969 &cmd->state.tiling_config.render_area),
970 .layers = fb->layers,
971 });
972 }
973
974
975 /* Emit a MSAA resolve operation, with both gmem and sysmem paths. */
976 static void tu6_emit_resolve(struct tu_cmd_buffer *cmd,
977 struct tu_cs *cs,
978 uint32_t a,
979 uint32_t gmem_a)
980 {
981 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
982 return;
983
984 tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true);
985
986 const struct tu_framebuffer *fb = cmd->state.framebuffer;
987 const uint32_t space = 25 + 66 * fb->layers + 17;
988 struct tu_cond_exec_state state;
989
990 VkResult result = tu_cond_exec_start(cmd->device, cs, &state,
991 CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
992 CP_COND_REG_EXEC_0_SYSMEM,
993 space);
994 if (result != VK_SUCCESS) {
995 cmd->record_result = result;
996 return;
997 }
998
999 tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a);
1000 tu_cond_exec_end(cs, &state);
1001 }
1002
1003 static void
1004 tu6_emit_store_attachment(struct tu_cmd_buffer *cmd,
1005 struct tu_cs *cs,
1006 uint32_t a,
1007 uint32_t gmem_a)
1008 {
1009 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
1010 return;
1011
1012 tu6_emit_blit_info(cmd, cs,
1013 cmd->state.framebuffer->attachments[a].attachment,
1014 cmd->state.pass->attachments[gmem_a].gmem_offset, true);
1015 tu6_emit_blit(cmd, cs);
1016 }
1017
1018 static void
1019 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1020 {
1021 const struct tu_render_pass *pass = cmd->state.pass;
1022 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
1023
1024 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1025 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1026 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1027 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1028 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1029 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1030
1031 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1032 tu_cs_emit(cs, 0x0);
1033
1034 tu6_emit_marker(cmd, cs);
1035 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1036 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
1037 tu6_emit_marker(cmd, cs);
1038
1039 tu6_emit_blit_scissor(cmd, cs, true);
1040
1041 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
1042 if (pass->attachments[a].gmem_offset >= 0)
1043 tu6_emit_store_attachment(cmd, cs, a, a);
1044 }
1045
1046 if (subpass->resolve_attachments) {
1047 for (unsigned i = 0; i < subpass->color_count; i++) {
1048 uint32_t a = subpass->resolve_attachments[i].attachment;
1049 if (a != VK_ATTACHMENT_UNUSED)
1050 tu6_emit_store_attachment(cmd, cs, a,
1051 subpass->color_attachments[i].attachment);
1052 }
1053 }
1054 }
1055
1056 static void
1057 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
1058 {
1059 tu_cs_emit_regs(cs,
1060 A6XX_PC_RESTART_INDEX(restart_index));
1061 }
1062
1063 static void
1064 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1065 {
1066 tu6_emit_cache_flush(cmd, cs);
1067
1068 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
1069
1070 tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x10000000);
1071 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
1072 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
1073 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
1074 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
1075 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
1076 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
1077 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1078 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1079
1080 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
1081 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
1082 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
1083 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
1084 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
1085 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
1086 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
1087 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1088 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
1089 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
1090 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
1091 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1092 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1093 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
1094
1095 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
1096
1097 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8101, 0);
1098 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 0);
1099 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
1100
1101 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
1102 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
1103 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
1104 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SAMPLE_CNTL, 0);
1105 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
1106 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
1107 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
1108 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
1109 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
1110 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
1111 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
1112 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
1113
1114 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
1115 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
1116
1117 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1);
1118 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
1119
1120 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
1121 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
1122
1123 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
1124 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
1125 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
1126
1127 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
1128 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
1129
1130 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
1131
1132 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
1133
1134 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
1135 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
1136 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
1137 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1138 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
1139 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
1140 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
1141 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
1142 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
1143 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
1144 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0);
1145 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
1146 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0);
1147 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0);
1148 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0);
1149 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0);
1150 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0);
1151 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0);
1152 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
1153 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
1154 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1155
1156 tu6_emit_marker(cmd, cs);
1157
1158 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
1159
1160 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
1161
1162 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
1163
1164 /* we don't use this yet.. probably best to disable.. */
1165 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1166 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1167 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1168 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1169 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1170 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1171
1172 tu_cs_emit_regs(cs,
1173 A6XX_VPC_SO_BUFFER_BASE(0),
1174 A6XX_VPC_SO_BUFFER_SIZE(0));
1175
1176 tu_cs_emit_regs(cs,
1177 A6XX_VPC_SO_FLUSH_BASE(0));
1178
1179 tu_cs_emit_regs(cs,
1180 A6XX_VPC_SO_BUF_CNTL(0));
1181
1182 tu_cs_emit_regs(cs,
1183 A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
1184
1185 tu_cs_emit_regs(cs,
1186 A6XX_VPC_SO_BUFFER_BASE(1, 0),
1187 A6XX_VPC_SO_BUFFER_SIZE(1, 0));
1188
1189 tu_cs_emit_regs(cs,
1190 A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
1191 A6XX_VPC_SO_FLUSH_BASE(1, 0),
1192 A6XX_VPC_SO_BUFFER_BASE(2, 0),
1193 A6XX_VPC_SO_BUFFER_SIZE(2, 0));
1194
1195 tu_cs_emit_regs(cs,
1196 A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
1197 A6XX_VPC_SO_FLUSH_BASE(2, 0),
1198 A6XX_VPC_SO_BUFFER_BASE(3, 0),
1199 A6XX_VPC_SO_BUFFER_SIZE(3, 0));
1200
1201 tu_cs_emit_regs(cs,
1202 A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
1203 A6XX_VPC_SO_FLUSH_BASE(3, 0));
1204
1205 tu_cs_emit_regs(cs,
1206 A6XX_SP_HS_CTRL_REG0(0));
1207
1208 tu_cs_emit_regs(cs,
1209 A6XX_SP_GS_CTRL_REG0(0));
1210
1211 tu_cs_emit_regs(cs,
1212 A6XX_GRAS_LRZ_CNTL(0));
1213
1214 tu_cs_emit_regs(cs,
1215 A6XX_RB_LRZ_CNTL(0));
1216
1217 tu_cs_sanity_check(cs);
1218 }
1219
1220 static void
1221 tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1222 {
1223 unsigned seqno;
1224
1225 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_AND_INV_EVENT, true);
1226
1227 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1228 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1229 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1230 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1231 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(seqno));
1232 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1233 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1234
1235 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1236
1237 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4);
1238 tu_cs_emit(cs, CP_WAIT_MEM_GTE_0_RESERVED(0));
1239 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1240 tu_cs_emit(cs, CP_WAIT_MEM_GTE_3_REF(seqno));
1241 }
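/* Editorial note: each event above returns a fresh seqno that the GPU writes
 * to scratch_bo once the corresponding flush completes; the CP_WAIT_REG_MEM
 * and CP_WAIT_MEM_GTE packets poll that location, stalling the CP until both
 * flushes have landed.
 */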
1242
1243 static void
1244 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1245 {
1246 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1247
1248 tu_cs_emit_regs(cs,
1249 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1250 .height = tiling->tile0.extent.height),
1251 A6XX_VSC_SIZE_ADDRESS(.bo = &cmd->vsc_data,
1252 .bo_offset = 32 * cmd->vsc_data_pitch));
1253
1254 tu_cs_emit_regs(cs,
1255 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1256 .ny = tiling->tile_count.height));
1257
1258 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1259 for (unsigned i = 0; i < 32; i++)
1260 tu_cs_emit(cs, tiling->pipe_config[i]);
1261
1262 tu_cs_emit_regs(cs,
1263 A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = &cmd->vsc_data2),
1264 A6XX_VSC_PIPE_DATA2_PITCH(cmd->vsc_data2_pitch),
1265 A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(cmd->vsc_data2.size));
1266
1267 tu_cs_emit_regs(cs,
1268 A6XX_VSC_PIPE_DATA_ADDRESS(.bo = &cmd->vsc_data),
1269 A6XX_VSC_PIPE_DATA_PITCH(cmd->vsc_data_pitch),
1270 A6XX_VSC_PIPE_DATA_ARRAY_PITCH(cmd->vsc_data.size));
1271 }
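/* Editorial sketch of the vsc_data layout implied by the offsets above and by
 * CP_SET_BIN_DATA5 in tu6_emit_tile_select():
 *
 *   vsc_data:  32 per-pipe visibility streams of vsc_data_pitch bytes each,
 *              followed by 32 per-pipe size dwords at 32 * vsc_data_pitch
 *   vsc_data2: 32 per-pipe streams of vsc_data2_pitch bytes each
 */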
1272
1273 static void
1274 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1275 {
1276 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1277 const uint32_t used_pipe_count =
1278 tiling->pipe_count.width * tiling->pipe_count.height;
1279
1280 /* Clear vsc_scratch: */
1281 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1282 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1283 tu_cs_emit(cs, 0x0);
1284
1285 /* Check for overflow, write vsc_scratch if detected: */
1286 for (int i = 0; i < used_pipe_count; i++) {
1287 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1288 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1289 CP_COND_WRITE5_0_WRITE_MEMORY);
1290 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i)));
1291 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1292 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data_pitch));
1293 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1294 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1295 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_data_pitch));
1296
1297 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1298 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1299 CP_COND_WRITE5_0_WRITE_MEMORY);
1300 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i)));
1301 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1302 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data2_pitch));
1303 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1304 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1305 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_data2_pitch));
1306 }
1307
1308 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1309
1310 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1311
1312 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1313 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1314 CP_MEM_TO_REG_0_CNT(1 - 1));
1315 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1316
1317 /*
1318 * This is a bit awkward: we really want a way to invert the
1319 * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1320 * execute cmds to use hwbinning when a bit is *not* set. This
1321 * dance is to invert OVERFLOW_FLAG_REG.
1322 *
1323 * A CP_NOP packet is used to skip executing the 'else' clause
1324 * if (b0 set)..
1325 */
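/* Editorial sketch of the CP-side flow emitted below (pseudocode only):
 *
 *   if (OVERFLOW_FLAG_REG & 1) {                  // overflow was recorded
 *      *(scratch_bo + VSC_OVERFLOW) = OVERFLOW_FLAG_REG;
 *      OVERFLOW_FLAG_REG = 0;
 *      // CP_NOP skips the else branch
 *   } else {
 *      OVERFLOW_FLAG_REG = 1;
 *   }
 *
 * so afterwards bit 0 of OVERFLOW_FLAG_REG means "no overflow", which is what
 * the per-tile CP_REG_TEST in tu6_emit_tile_select() checks.
 */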
1326
1327 /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */
1328 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1329 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1330 A6XX_CP_REG_TEST_0_BIT(0) |
1331 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1332
1333 tu_cs_reserve(cs, 3 + 7);
1334 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1335 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1336 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1337
1338 /* if (b0 set) */ {
1339 /*
1340 * On overflow, mirror the value to control->vsc_overflow
1341 * which the CPU checks to detect overflow (see
1342 * check_vsc_overflow())
1343 */
1344 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1345 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1346 CP_REG_TO_MEM_0_CNT(0));
1347 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW);
1348
1349 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1350 tu_cs_emit(cs, 0x0);
1351
1352 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1353 } /* else */ {
1354 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1355 tu_cs_emit(cs, 0x1);
1356 }
1357 }
1358
1359 static void
1360 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1361 {
1362 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1363 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1364
1365 uint32_t x1 = tiling->tile0.offset.x;
1366 uint32_t y1 = tiling->tile0.offset.y;
1367 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1368 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1369
1370 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
1371
1372 tu6_emit_marker(cmd, cs);
1373 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1374 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1375 tu6_emit_marker(cmd, cs);
1376
1377 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1378 tu_cs_emit(cs, 0x1);
1379
1380 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1381 tu_cs_emit(cs, 0x1);
1382
1383 tu_cs_emit_wfi(cs);
1384
1385 tu_cs_emit_regs(cs,
1386 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1387
1388 update_vsc_pipe(cmd, cs);
1389
1390 tu_cs_emit_regs(cs,
1391 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1392
1393 tu_cs_emit_regs(cs,
1394 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1395
1396 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1397 tu_cs_emit(cs, UNK_2C);
1398
1399 tu_cs_emit_regs(cs,
1400 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1401
1402 tu_cs_emit_regs(cs,
1403 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1404
1405 /* emit IB to binning drawcmds: */
1406 tu_cs_emit_call(cs, &cmd->draw_cs);
1407
1408 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1409 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1410 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1411 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1412 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1413 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1414
1415 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1416 tu_cs_emit(cs, UNK_2D);
1417
1418 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1419 tu6_cache_flush(cmd, cs);
1420
1421 tu_cs_emit_wfi(cs);
1422
1423 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1424
1425 emit_vsc_overflow_test(cmd, cs);
1426
1427 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1428 tu_cs_emit(cs, 0x0);
1429
1430 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1431 tu_cs_emit(cs, 0x0);
1432
1433 tu_cs_emit_wfi(cs);
1434
1435 tu_cs_emit_regs(cs,
1436 A6XX_RB_CCU_CNTL(.unknown = phys_dev->magic.RB_CCU_CNTL_gmem));
1437
1438 cmd->wait_for_idle = false;
1439 }
1440
1441 static void
1442 tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1443 uint32_t a,
1444 const VkRenderPassBeginInfo *info)
1445 {
1446 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1447 const struct tu_image_view *iview = fb->attachments[a].attachment;
1448 const struct tu_render_pass_attachment *attachment =
1449 &cmd->state.pass->attachments[a];
1450 unsigned clear_mask = 0;
1451
1452 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
1453 if (attachment->gmem_offset < 0)
1454 return;
1455
1456 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1457 clear_mask = 0xf;
1458 }
1459
1460 if (vk_format_has_stencil(iview->vk_format)) {
1461 clear_mask &= 0x1;
1462 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
1463 clear_mask |= 0x2;
1464 if (clear_mask != 0x3)
1465 tu_finishme("depth/stencil only load op");
1466 }
1467
1468 if (!clear_mask)
1469 return;
1470
1471 tu_clear_sysmem_attachment(cmd, cs, a,
1472 &info->pClearValues[a], &(struct VkClearRect) {
1473 .rect = info->renderArea,
1474 .baseArrayLayer = iview->base_layer,
1475 .layerCount = iview->layer_count,
1476 });
1477 }
1478
1479 static void
1480 tu_cmd_prepare_sysmem_clear_ib(struct tu_cmd_buffer *cmd,
1481 const VkRenderPassBeginInfo *info)
1482 {
1483 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1484 const uint32_t blit_cmd_space = 25 + 66 * fb->layers + 17;
1485 const uint32_t clear_space =
1486 blit_cmd_space * cmd->state.pass->attachment_count + 5;
1487
1488 struct tu_cs sub_cs;
1489
1490 VkResult result =
1491 tu_cs_begin_sub_stream(&cmd->sub_cs, clear_space, &sub_cs);
1492 if (result != VK_SUCCESS) {
1493 cmd->record_result = result;
1494 return;
1495 }
1496
1497 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1498 tu_emit_sysmem_clear_attachment(cmd, &sub_cs, i, info);
1499
1500 /* TODO: We shouldn't need this flush, but without it we'd have an empty IB
1501 * when nothing clears, which we currently can't handle.
1502 */
1503 tu6_emit_event_write(cmd, &sub_cs, PC_CCU_FLUSH_COLOR_TS, true);
1504
1505 cmd->state.sysmem_clear_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1506 }
1507
1508 static void
1509 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1510 const struct VkRect2D *renderArea)
1511 {
1512 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1513 if (fb->width > 0 && fb->height > 0) {
1514 tu6_emit_window_scissor(cmd, cs,
1515 0, 0, fb->width - 1, fb->height - 1);
1516 } else {
1517 tu6_emit_window_scissor(cmd, cs, 0, 0, 0, 0);
1518 }
1519
1520 tu6_emit_window_offset(cmd, cs, 0, 0);
1521
1522 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1523
1524 tu_cs_emit_ib(cs, &cmd->state.sysmem_clear_ib);
1525
1526 tu6_emit_lrz_flush(cmd, cs);
1527
1528 tu6_emit_marker(cmd, cs);
1529 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1530 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1531 tu6_emit_marker(cmd, cs);
1532
1533 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1534 tu_cs_emit(cs, 0x0);
1535
1536 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1537 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1538 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1539
1540 tu6_emit_wfi(cmd, cs);
1541 tu_cs_emit_regs(cs,
1542 A6XX_RB_CCU_CNTL(0x10000000));
1543
1544 /* enable stream-out; with sysmem there is only one pass: */
1545 tu_cs_emit_regs(cs,
1546 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1547
1548 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1549 tu_cs_emit(cs, 0x1);
1550
1551 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1552 tu_cs_emit(cs, 0x0);
1553
1554 tu_cs_sanity_check(cs);
1555 }
1556
1557 static void
1558 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1559 {
1560 /* Do any resolves of the last subpass. These are handled in the
1561 * tile_store_ib in the gmem path.
1562 */
1563
1564 const struct tu_subpass *subpass = cmd->state.subpass;
1565 if (subpass->resolve_attachments) {
1566 for (unsigned i = 0; i < subpass->color_count; i++) {
1567 uint32_t a = subpass->resolve_attachments[i].attachment;
1568 if (a != VK_ATTACHMENT_UNUSED)
1569 tu6_emit_sysmem_resolve(cmd, cs, a,
1570 subpass->color_attachments[i].attachment);
1571 }
1572 }
1573
1574 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1575
1576 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1577 tu_cs_emit(cs, 0x0);
1578
1579 tu6_emit_lrz_flush(cmd, cs);
1580
1581 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1582 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1583
1584 tu_cs_sanity_check(cs);
1585 }
1586
1587
1588 static void
1589 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1590 {
1591 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1592
1593 tu6_emit_lrz_flush(cmd, cs);
1594
1595 /* lrz clear? */
1596
1597 tu6_emit_cache_flush(cmd, cs);
1598
1599 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1600 tu_cs_emit(cs, 0x0);
1601
1602 /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
1603 tu6_emit_wfi(cmd, cs);
1604 tu_cs_emit_regs(cs,
1605 A6XX_RB_CCU_CNTL(phys_dev->magic.RB_CCU_CNTL_gmem));
1606
1607 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1608 if (use_hw_binning(cmd)) {
1609 tu6_emit_bin_size(cs,
1610 tiling->tile0.extent.width,
1611 tiling->tile0.extent.height,
1612 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1613
1614 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1615
1616 tu6_emit_binning_pass(cmd, cs);
1617
1618 tu6_emit_bin_size(cs,
1619 tiling->tile0.extent.width,
1620 tiling->tile0.extent.height,
1621 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1622
1623 tu_cs_emit_regs(cs,
1624 A6XX_VFD_MODE_CNTL(0));
1625
1626 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1627
1628 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1629
1630 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1631 tu_cs_emit(cs, 0x1);
1632 } else {
1633 tu6_emit_bin_size(cs,
1634 tiling->tile0.extent.width,
1635 tiling->tile0.extent.height,
1636 0x6000000);
1637 }
1638
1639 tu_cs_sanity_check(cs);
1640 }
1641
1642 static void
1643 tu6_render_tile(struct tu_cmd_buffer *cmd,
1644 struct tu_cs *cs,
1645 const struct tu_tile *tile)
1646 {
1647 tu6_emit_tile_select(cmd, cs, tile);
1648 tu_cs_emit_ib(cs, &cmd->state.tile_load_ib);
1649
1650 tu_cs_emit_call(cs, &cmd->draw_cs);
1651 cmd->wait_for_idle = true;
1652
1653 if (use_hw_binning(cmd)) {
1654 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1655 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1656 A6XX_CP_REG_TEST_0_BIT(0) |
1657 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1658
1659 tu_cs_reserve(cs, 3 + 2);
1660 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1661 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1662 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1663
1664 /* if (no overflow) */ {
1665 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1666 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1667 }
1668 }
1669
1670 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1671
1672 tu_cs_sanity_check(cs);
1673 }
1674
1675 static void
1676 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1677 {
1678 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1679
1680 tu_cs_emit_regs(cs,
1681 A6XX_GRAS_LRZ_CNTL(0));
1682
1683 tu6_emit_lrz_flush(cmd, cs);
1684
1685 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1686
1687 tu_cs_sanity_check(cs);
1688 }
1689
1690 static void
1691 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1692 {
1693 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1694
1695 tu6_tile_render_begin(cmd, &cmd->cs);
1696
1697 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1698 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1699 struct tu_tile tile;
1700 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1701 tu6_render_tile(cmd, &cmd->cs, &tile);
1702 }
1703 }
1704
1705 tu6_tile_render_end(cmd, &cmd->cs);
1706 }
1707
1708 static void
1709 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1710 {
1711 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1712
1713 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1714
1715 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1716 cmd->wait_for_idle = true;
1717
1718 tu6_sysmem_render_end(cmd, &cmd->cs);
1719 }
1720
1721 static void
1722 tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd,
1723 const VkRenderPassBeginInfo *info)
1724 {
1725 const uint32_t tile_load_space =
1726 2 * 3 /* blit_scissor */ +
1727 (20 /* load */ + 19 /* clear */) * cmd->state.pass->attachment_count +
1728 2 /* cache invalidate */;
1729
1730 struct tu_cs sub_cs;
1731
1732 VkResult result =
1733 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_load_space, &sub_cs);
1734 if (result != VK_SUCCESS) {
1735 cmd->record_result = result;
1736 return;
1737 }
1738
1739 tu6_emit_blit_scissor(cmd, &sub_cs, true);
1740
1741 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1742 tu6_emit_load_attachment(cmd, &sub_cs, i);
1743
1744 tu6_emit_blit_scissor(cmd, &sub_cs, false);
1745
1746 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1747 tu6_emit_clear_attachment(cmd, &sub_cs, i, info);
1748
1749 /* invalidate because reading input attachments will cache GMEM and
1750     * the cache isn't updated when GMEM is written
1751 * TODO: is there a no-cache bit for textures?
1752 */
1753 if (cmd->state.subpass->input_count)
1754 tu6_emit_event_write(cmd, &sub_cs, CACHE_INVALIDATE, false);
1755
1756 cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1757 }
1758
1759 static void
1760 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1761 {
1762 const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count;
1763 struct tu_cs sub_cs;
1764
1765 VkResult result =
1766 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1767 if (result != VK_SUCCESS) {
1768 cmd->record_result = result;
1769 return;
1770 }
1771
1772 /* emit to tile-store sub_cs */
1773 tu6_emit_tile_store(cmd, &sub_cs);
1774
1775 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1776 }
1777
1778 static void
1779 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1780 const VkRect2D *render_area)
1781 {
1782 const struct tu_device *dev = cmd->device;
1783 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1784
1785 tiling->render_area = *render_area;
1786 tiling->force_sysmem = force_sysmem(cmd, render_area);
1787
1788 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels);
1789 tu_tiling_config_update_pipe_layout(tiling, dev);
1790 tu_tiling_config_update_pipes(tiling, dev);
1791 }
1792
1793 const struct tu_dynamic_state default_dynamic_state = {
1794 .viewport =
1795 {
1796 .count = 0,
1797 },
1798 .scissor =
1799 {
1800 .count = 0,
1801 },
1802 .line_width = 1.0f,
1803 .depth_bias =
1804 {
1805 .bias = 0.0f,
1806 .clamp = 0.0f,
1807 .slope = 0.0f,
1808 },
1809 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1810 .depth_bounds =
1811 {
1812 .min = 0.0f,
1813 .max = 1.0f,
1814 },
1815 .stencil_compare_mask =
1816 {
1817 .front = ~0u,
1818 .back = ~0u,
1819 },
1820 .stencil_write_mask =
1821 {
1822 .front = ~0u,
1823 .back = ~0u,
1824 },
1825 .stencil_reference =
1826 {
1827 .front = 0u,
1828 .back = 0u,
1829 },
1830 };
1831
1832 static void UNUSED /* FINISHME */
1833 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1834 const struct tu_dynamic_state *src)
1835 {
1836 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1837 uint32_t copy_mask = src->mask;
1838 uint32_t dest_mask = 0;
1839
1840 tu_use_args(cmd_buffer); /* FINISHME */
1841
1842 /* Make sure to copy the number of viewports/scissors because they can
1843 * only be specified at pipeline creation time.
1844 */
1845 dest->viewport.count = src->viewport.count;
1846 dest->scissor.count = src->scissor.count;
1847 dest->discard_rectangle.count = src->discard_rectangle.count;
1848
1849 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1850 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1851 src->viewport.count * sizeof(VkViewport))) {
1852 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1853 src->viewport.count);
1854 dest_mask |= TU_DYNAMIC_VIEWPORT;
1855 }
1856 }
1857
1858 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1859 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1860 src->scissor.count * sizeof(VkRect2D))) {
1861 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1862 src->scissor.count);
1863 dest_mask |= TU_DYNAMIC_SCISSOR;
1864 }
1865 }
1866
1867 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1868 if (dest->line_width != src->line_width) {
1869 dest->line_width = src->line_width;
1870 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1871 }
1872 }
1873
1874 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1875 if (memcmp(&dest->depth_bias, &src->depth_bias,
1876 sizeof(src->depth_bias))) {
1877 dest->depth_bias = src->depth_bias;
1878 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1879 }
1880 }
1881
1882 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1883 if (memcmp(&dest->blend_constants, &src->blend_constants,
1884 sizeof(src->blend_constants))) {
1885 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1886 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1887 }
1888 }
1889
1890 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1891 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1892 sizeof(src->depth_bounds))) {
1893 dest->depth_bounds = src->depth_bounds;
1894 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1895 }
1896 }
1897
1898 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1899 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1900 sizeof(src->stencil_compare_mask))) {
1901 dest->stencil_compare_mask = src->stencil_compare_mask;
1902 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1903 }
1904 }
1905
1906 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1907 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1908 sizeof(src->stencil_write_mask))) {
1909 dest->stencil_write_mask = src->stencil_write_mask;
1910 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1911 }
1912 }
1913
1914 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1915 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1916 sizeof(src->stencil_reference))) {
1917 dest->stencil_reference = src->stencil_reference;
1918 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1919 }
1920 }
1921
1922 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1923 if (memcmp(&dest->discard_rectangle.rectangles,
1924 &src->discard_rectangle.rectangles,
1925 src->discard_rectangle.count * sizeof(VkRect2D))) {
1926 typed_memcpy(dest->discard_rectangle.rectangles,
1927 src->discard_rectangle.rectangles,
1928 src->discard_rectangle.count);
1929 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1930 }
1931 }
1932 }
1933
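/* Create a command buffer.  The main, draw and draw-epilogue streams grow
 * on demand, while sub_cs hands out sub-streams for indirect state.  The
 * marker scratch register differs between primary (7) and secondary (6)
 * command buffers, presumably so nested execution doesn't clobber the
 * primary's marker, and the VSC buffers are shared with the device (see
 * the resize-on-overflow TODO below).
 */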
1934 static VkResult
1935 tu_create_cmd_buffer(struct tu_device *device,
1936 struct tu_cmd_pool *pool,
1937 VkCommandBufferLevel level,
1938 VkCommandBuffer *pCommandBuffer)
1939 {
1940 struct tu_cmd_buffer *cmd_buffer;
1941 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1942 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1943 if (cmd_buffer == NULL)
1944 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1945
1946 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1947 cmd_buffer->device = device;
1948 cmd_buffer->pool = pool;
1949 cmd_buffer->level = level;
1950
1951 if (pool) {
1952 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1953 cmd_buffer->queue_family_index = pool->queue_family_index;
1954
1955 } else {
1956 /* Init the pool_link so we can safely call list_del when we destroy
1957 * the command buffer
1958 */
1959 list_inithead(&cmd_buffer->pool_link);
1960 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1961 }
1962
1963 tu_bo_list_init(&cmd_buffer->bo_list);
1964 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1965 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1966 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1967 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1968
1969 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1970
1971 list_inithead(&cmd_buffer->upload.list);
1972
1973 cmd_buffer->marker_reg = REG_A6XX_CP_SCRATCH_REG(
1974 cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ? 7 : 6);
1975
1976 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1977 if (result != VK_SUCCESS)
1978 goto fail_scratch_bo;
1979
1980 /* TODO: resize on overflow */
1981 cmd_buffer->vsc_data_pitch = device->vsc_data_pitch;
1982 cmd_buffer->vsc_data2_pitch = device->vsc_data2_pitch;
1983 cmd_buffer->vsc_data = device->vsc_data;
1984 cmd_buffer->vsc_data2 = device->vsc_data2;
1985
1986 return VK_SUCCESS;
1987
1988 fail_scratch_bo:
1989 list_del(&cmd_buffer->pool_link);
1990 return result;
1991 }
1992
1993 static void
1994 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1995 {
1996 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1997
1998 list_del(&cmd_buffer->pool_link);
1999
2000 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
2001 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
2002
2003 tu_cs_finish(&cmd_buffer->cs);
2004 tu_cs_finish(&cmd_buffer->draw_cs);
2005 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
2006 tu_cs_finish(&cmd_buffer->sub_cs);
2007
2008 tu_bo_list_destroy(&cmd_buffer->bo_list);
2009 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
2010 }
2011
2012 static VkResult
2013 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
2014 {
2015 cmd_buffer->wait_for_idle = true;
2016
2017 cmd_buffer->record_result = VK_SUCCESS;
2018
2019 tu_bo_list_reset(&cmd_buffer->bo_list);
2020 tu_cs_reset(&cmd_buffer->cs);
2021 tu_cs_reset(&cmd_buffer->draw_cs);
2022 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
2023 tu_cs_reset(&cmd_buffer->sub_cs);
2024
2025 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
2026 cmd_buffer->descriptors[i].valid = 0;
2027 cmd_buffer->descriptors[i].push_dirty = false;
2028 }
2029
2030 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
2031
2032 return cmd_buffer->record_result;
2033 }
2034
2035 VkResult
2036 tu_AllocateCommandBuffers(VkDevice _device,
2037 const VkCommandBufferAllocateInfo *pAllocateInfo,
2038 VkCommandBuffer *pCommandBuffers)
2039 {
2040 TU_FROM_HANDLE(tu_device, device, _device);
2041 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
2042
2043 VkResult result = VK_SUCCESS;
2044 uint32_t i;
2045
2046 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
2047
2048 if (!list_is_empty(&pool->free_cmd_buffers)) {
2049 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
2050 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
2051
2052 list_del(&cmd_buffer->pool_link);
2053 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
2054
2055 result = tu_reset_cmd_buffer(cmd_buffer);
2056 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
2057 cmd_buffer->level = pAllocateInfo->level;
2058
2059 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
2060 } else {
2061 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
2062 &pCommandBuffers[i]);
2063 }
2064 if (result != VK_SUCCESS)
2065 break;
2066 }
2067
2068 if (result != VK_SUCCESS) {
2069 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
2070 pCommandBuffers);
2071
2072 /* From the Vulkan 1.0.66 spec:
2073 *
2074 * "vkAllocateCommandBuffers can be used to create multiple
2075 * command buffers. If the creation of any of those command
2076 * buffers fails, the implementation must destroy all
2077 * successfully created command buffer objects from this
2078 * command, set all entries of the pCommandBuffers array to
2079 * NULL and return the error."
2080 */
2081 memset(pCommandBuffers, 0,
2082 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
2083 }
2084
2085 return result;
2086 }
2087
2088 void
2089 tu_FreeCommandBuffers(VkDevice device,
2090 VkCommandPool commandPool,
2091 uint32_t commandBufferCount,
2092 const VkCommandBuffer *pCommandBuffers)
2093 {
2094 for (uint32_t i = 0; i < commandBufferCount; i++) {
2095 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
2096
2097 if (cmd_buffer) {
2098 if (cmd_buffer->pool) {
2099 list_del(&cmd_buffer->pool_link);
2100 list_addtail(&cmd_buffer->pool_link,
2101 &cmd_buffer->pool->free_cmd_buffers);
2102 } else
2103 tu_cmd_buffer_destroy(cmd_buffer);
2104 }
2105 }
2106 }
2107
2108 VkResult
2109 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
2110 VkCommandBufferResetFlags flags)
2111 {
2112 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2113 return tu_reset_cmd_buffer(cmd_buffer);
2114 }
2115
2116 VkResult
2117 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2118 const VkCommandBufferBeginInfo *pBeginInfo)
2119 {
2120 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2121 VkResult result = VK_SUCCESS;
2122
2123 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
2124    /* If the command buffer has already been reset with
2125 * vkResetCommandBuffer, no need to do it again.
2126 */
2127 result = tu_reset_cmd_buffer(cmd_buffer);
2128 if (result != VK_SUCCESS)
2129 return result;
2130 }
2131
2132 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
2133 cmd_buffer->usage_flags = pBeginInfo->flags;
2134
2135 tu_cs_begin(&cmd_buffer->cs);
2136 tu_cs_begin(&cmd_buffer->draw_cs);
2137 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2138
2139 cmd_buffer->marker_seqno = 0;
2140 cmd_buffer->scratch_seqno = 0;
2141
2142    /* set up the initial configuration in the command buffer */
2143 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2144 switch (cmd_buffer->queue_family_index) {
2145 case TU_QUEUE_GENERAL:
2146 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
2147 break;
2148 default:
2149 break;
2150 }
2151 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2152 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
2153 assert(pBeginInfo->pInheritanceInfo);
2154 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
2155 cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
2156 }
2157
2158 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
2159
2160 return VK_SUCCESS;
2161 }
2162
2163 void
2164 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2165 uint32_t firstBinding,
2166 uint32_t bindingCount,
2167 const VkBuffer *pBuffers,
2168 const VkDeviceSize *pOffsets)
2169 {
2170 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2171
2172 assert(firstBinding + bindingCount <= MAX_VBS);
2173
2174 for (uint32_t i = 0; i < bindingCount; i++) {
2175 cmd->state.vb.buffers[firstBinding + i] =
2176 tu_buffer_from_handle(pBuffers[i]);
2177 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
2178 }
2179
2180 /* VB states depend on VkPipelineVertexInputStateCreateInfo */
2181 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2182 }
2183
2184 void
2185 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2186 VkBuffer buffer,
2187 VkDeviceSize offset,
2188 VkIndexType indexType)
2189 {
2190 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2191 TU_FROM_HANDLE(tu_buffer, buf, buffer);
2192
2193 /* initialize/update the restart index */
2194 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
2195 struct tu_cs *draw_cs = &cmd->draw_cs;
2196
2197 tu6_emit_restart_index(
2198 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
2199
2200 tu_cs_sanity_check(draw_cs);
2201 }
2202
2203 /* track the BO */
2204 if (cmd->state.index_buffer != buf)
2205 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
2206
2207 cmd->state.index_buffer = buf;
2208 cmd->state.index_offset = offset;
2209 cmd->state.index_type = indexType;
2210 }
2211
2212 void
2213 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2214 VkPipelineBindPoint pipelineBindPoint,
2215 VkPipelineLayout _layout,
2216 uint32_t firstSet,
2217 uint32_t descriptorSetCount,
2218 const VkDescriptorSet *pDescriptorSets,
2219 uint32_t dynamicOffsetCount,
2220 const uint32_t *pDynamicOffsets)
2221 {
2222 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2223 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
2224 unsigned dyn_idx = 0;
2225
2226 struct tu_descriptor_state *descriptors_state =
2227 tu_get_descriptors_state(cmd_buffer, pipelineBindPoint);
2228
2229 for (unsigned i = 0; i < descriptorSetCount; ++i) {
2230 unsigned idx = i + firstSet;
2231 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
2232
2233 descriptors_state->sets[idx] = set;
2234 descriptors_state->valid |= (1u << idx);
2235
2236 for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
2237 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
2238 assert(dyn_idx < dynamicOffsetCount);
2239
2240 descriptors_state->dynamic_buffers[idx] =
2241 set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
2242 }
2243 }
2244
2245 cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2246 }
2247
2248 void
2249 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2250 VkPipelineLayout layout,
2251 VkShaderStageFlags stageFlags,
2252 uint32_t offset,
2253 uint32_t size,
2254 const void *pValues)
2255 {
2256 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2257 memcpy((void*) cmd->push_constants + offset, pValues, size);
2258 cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS;
2259 }
2260
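/* Finish recording.  Every BO backing the draw, draw-epilogue and
 * sub-stream command streams (plus the scratch and VSC buffers when used)
 * is added to the submit BO list, presumably because those streams are
 * only referenced indirectly from the primary cs.
 */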
2261 VkResult
2262 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2263 {
2264 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2265
2266 if (cmd_buffer->scratch_seqno) {
2267 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2268 MSM_SUBMIT_BO_WRITE);
2269 }
2270
2271 if (cmd_buffer->use_vsc_data) {
2272 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data,
2273 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2274 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data2,
2275 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2276 }
2277
2278 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2279 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2280 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2281 }
2282
2283 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2284 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2285 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2286 }
2287
2288 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2289 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2290 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2291 }
2292
2293 tu_cs_end(&cmd_buffer->cs);
2294 tu_cs_end(&cmd_buffer->draw_cs);
2295 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2296
2297 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2298
2299 return cmd_buffer->record_result;
2300 }
2301
2302 void
2303 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2304 VkPipelineBindPoint pipelineBindPoint,
2305 VkPipeline _pipeline)
2306 {
2307 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2308 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2309
2310 switch (pipelineBindPoint) {
2311 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2312 cmd->state.pipeline = pipeline;
2313 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
2314 break;
2315 case VK_PIPELINE_BIND_POINT_COMPUTE:
2316 cmd->state.compute_pipeline = pipeline;
2317 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2318 break;
2319 default:
2320 unreachable("unrecognized pipeline bind point");
2321 break;
2322 }
2323
2324 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2325 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2326 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2327 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2328 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2329 }
2330 }
2331
2332 void
2333 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2334 uint32_t firstViewport,
2335 uint32_t viewportCount,
2336 const VkViewport *pViewports)
2337 {
2338 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2339 struct tu_cs *draw_cs = &cmd->draw_cs;
2340
2341 assert(firstViewport == 0 && viewportCount == 1);
2342 tu6_emit_viewport(draw_cs, pViewports);
2343
2344 tu_cs_sanity_check(draw_cs);
2345 }
2346
2347 void
2348 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2349 uint32_t firstScissor,
2350 uint32_t scissorCount,
2351 const VkRect2D *pScissors)
2352 {
2353 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2354 struct tu_cs *draw_cs = &cmd->draw_cs;
2355
2356 assert(firstScissor == 0 && scissorCount == 1);
2357 tu6_emit_scissor(draw_cs, pScissors);
2358
2359 tu_cs_sanity_check(draw_cs);
2360 }
2361
2362 void
2363 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2364 {
2365 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2366
2367 cmd->state.dynamic.line_width = lineWidth;
2368
2369 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2370 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2371 }
2372
2373 void
2374 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2375 float depthBiasConstantFactor,
2376 float depthBiasClamp,
2377 float depthBiasSlopeFactor)
2378 {
2379 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2380 struct tu_cs *draw_cs = &cmd->draw_cs;
2381
2382 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2383 depthBiasSlopeFactor);
2384
2385 tu_cs_sanity_check(draw_cs);
2386 }
2387
2388 void
2389 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2390 const float blendConstants[4])
2391 {
2392 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2393 struct tu_cs *draw_cs = &cmd->draw_cs;
2394
2395 tu6_emit_blend_constants(draw_cs, blendConstants);
2396
2397 tu_cs_sanity_check(draw_cs);
2398 }
2399
2400 void
2401 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2402 float minDepthBounds,
2403 float maxDepthBounds)
2404 {
2405 }
2406
2407 void
2408 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2409 VkStencilFaceFlags faceMask,
2410 uint32_t compareMask)
2411 {
2412 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2413
2414 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2415 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2416 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2417 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2418
2419 /* the front/back compare masks must be updated together */
2420 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2421 }
2422
2423 void
2424 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2425 VkStencilFaceFlags faceMask,
2426 uint32_t writeMask)
2427 {
2428 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2429
2430 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2431 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2432 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2433 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2434
2435 /* the front/back write masks must be updated together */
2436 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2437 }
2438
2439 void
2440 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2441 VkStencilFaceFlags faceMask,
2442 uint32_t reference)
2443 {
2444 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2445
2446 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2447 cmd->state.dynamic.stencil_reference.front = reference;
2448 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2449 cmd->state.dynamic.stencil_reference.back = reference;
2450
2451 /* the front/back references must be updated together */
2452 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2453 }
2454
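/* Execute secondary command buffers by merging their BO lists and
 * appending their draw_cs / draw_epilogue_cs entries onto the primary's.
 * All state is marked dirty afterwards since the secondary's state
 * tracking isn't carried over (see the TODO below).
 */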
2455 void
2456 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2457 uint32_t commandBufferCount,
2458 const VkCommandBuffer *pCmdBuffers)
2459 {
2460 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2461 VkResult result;
2462
2463 assert(commandBufferCount > 0);
2464
2465 for (uint32_t i = 0; i < commandBufferCount; i++) {
2466 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2467
2468 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2469 if (result != VK_SUCCESS) {
2470 cmd->record_result = result;
2471 break;
2472 }
2473
2474 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2475 if (result != VK_SUCCESS) {
2476 cmd->record_result = result;
2477 break;
2478 }
2479
2480 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2481 &secondary->draw_epilogue_cs);
2482 if (result != VK_SUCCESS) {
2483 cmd->record_result = result;
2484 break;
2485 }
2486 }
2487    cmd->state.dirty = ~0u; /* TODO: only mark the state that actually needs it dirty */
2488 }
2489
2490 VkResult
2491 tu_CreateCommandPool(VkDevice _device,
2492 const VkCommandPoolCreateInfo *pCreateInfo,
2493 const VkAllocationCallbacks *pAllocator,
2494 VkCommandPool *pCmdPool)
2495 {
2496 TU_FROM_HANDLE(tu_device, device, _device);
2497 struct tu_cmd_pool *pool;
2498
2499 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2500 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2501 if (pool == NULL)
2502 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2503
2504 if (pAllocator)
2505 pool->alloc = *pAllocator;
2506 else
2507 pool->alloc = device->alloc;
2508
2509 list_inithead(&pool->cmd_buffers);
2510 list_inithead(&pool->free_cmd_buffers);
2511
2512 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2513
2514 *pCmdPool = tu_cmd_pool_to_handle(pool);
2515
2516 return VK_SUCCESS;
2517 }
2518
2519 void
2520 tu_DestroyCommandPool(VkDevice _device,
2521 VkCommandPool commandPool,
2522 const VkAllocationCallbacks *pAllocator)
2523 {
2524 TU_FROM_HANDLE(tu_device, device, _device);
2525 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2526
2527 if (!pool)
2528 return;
2529
2530 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2531 &pool->cmd_buffers, pool_link)
2532 {
2533 tu_cmd_buffer_destroy(cmd_buffer);
2534 }
2535
2536 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2537 &pool->free_cmd_buffers, pool_link)
2538 {
2539 tu_cmd_buffer_destroy(cmd_buffer);
2540 }
2541
2542 vk_free2(&device->alloc, pAllocator, pool);
2543 }
2544
2545 VkResult
2546 tu_ResetCommandPool(VkDevice device,
2547 VkCommandPool commandPool,
2548 VkCommandPoolResetFlags flags)
2549 {
2550 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2551 VkResult result;
2552
2553 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2554 pool_link)
2555 {
2556 result = tu_reset_cmd_buffer(cmd_buffer);
2557 if (result != VK_SUCCESS)
2558 return result;
2559 }
2560
2561 return VK_SUCCESS;
2562 }
2563
2564 void
2565 tu_TrimCommandPool(VkDevice device,
2566 VkCommandPool commandPool,
2567 VkCommandPoolTrimFlags flags)
2568 {
2569 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2570
2571 if (!pool)
2572 return;
2573
2574 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2575 &pool->free_cmd_buffers, pool_link)
2576 {
2577 tu_cmd_buffer_destroy(cmd_buffer);
2578 }
2579 }
2580
2581 void
2582 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2583 const VkRenderPassBeginInfo *pRenderPassBegin,
2584 VkSubpassContents contents)
2585 {
2586 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2587 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2588 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2589
2590 cmd->state.pass = pass;
2591 cmd->state.subpass = pass->subpasses;
2592 cmd->state.framebuffer = fb;
2593
2594 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2595 tu_cmd_prepare_sysmem_clear_ib(cmd, pRenderPassBegin);
2596 tu_cmd_prepare_tile_load_ib(cmd, pRenderPassBegin);
2597 tu_cmd_prepare_tile_store_ib(cmd);
2598
2599 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2600 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2601 tu6_emit_msaa(cmd, cmd->state.subpass, &cmd->draw_cs);
2602 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2603
2604 /* note: use_hw_binning only checks tiling config */
2605 if (use_hw_binning(cmd))
2606 cmd->use_vsc_data = true;
2607
2608 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2609 const struct tu_image_view *iview = fb->attachments[i].attachment;
2610 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2611 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2612 }
2613 }
2614
2615 void
2616 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2617 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2618 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2619 {
2620 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2621 pSubpassBeginInfo->contents);
2622 }
2623
2624 void
2625 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2626 {
2627 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2628 const struct tu_render_pass *pass = cmd->state.pass;
2629 struct tu_cs *cs = &cmd->draw_cs;
2630
2631 const struct tu_subpass *subpass = cmd->state.subpass++;
2632 /* TODO:
2633 * if msaa samples change between subpasses,
2634 * attachment store is broken for some attachments
2635 */
2636 if (subpass->resolve_attachments) {
2637 tu6_emit_blit_scissor(cmd, cs, true);
2638 for (unsigned i = 0; i < subpass->color_count; i++) {
2639 uint32_t a = subpass->resolve_attachments[i].attachment;
2640 if (a != VK_ATTACHMENT_UNUSED) {
2641 tu6_emit_resolve(cmd, cs, a,
2642 subpass->color_attachments[i].attachment);
2643 }
2644 }
2645 }
2646
2647 /* invalidate because reading input attachments will cache GMEM and
2648     * the cache isn't updated when GMEM is written
2649 * TODO: is there a no-cache bit for textures?
2650 */
2651 if (cmd->state.subpass->input_count)
2652 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2653
2654 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2655 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2656 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2657 tu6_emit_msaa(cmd, cmd->state.subpass, cs);
2658 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2659
2660 /* Emit flushes so that input attachments will read the correct value. This
2661 * is for sysmem only, although it shouldn't do much harm on gmem.
2662 */
2663 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2664 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2665
2666 /* TODO:
2667 * since we don't know how to do GMEM->GMEM resolve,
2668     * resolve attachments are resolved to system memory and then loaded back into GMEM if needed
2669 */
2670 if (subpass->resolve_attachments) {
2671 for (unsigned i = 0; i < subpass->color_count; i++) {
2672 uint32_t a = subpass->resolve_attachments[i].attachment;
2673 if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) {
2674 tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n");
2675 tu6_emit_predicated_blit(cmd, cs, a, a, false);
2676 }
2677 }
2678 }
2679 }
2680
2681 void
2682 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2683 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2684 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2685 {
2686 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2687 }
2688
2689 struct tu_draw_info
2690 {
2691 /**
2692 * Number of vertices.
2693 */
2694 uint32_t count;
2695
2696 /**
2697 * Index of the first vertex.
2698 */
2699 int32_t vertex_offset;
2700
2701 /**
2702 * First instance id.
2703 */
2704 uint32_t first_instance;
2705
2706 /**
2707 * Number of instances.
2708 */
2709 uint32_t instance_count;
2710
2711 /**
2712 * First index (indexed draws only).
2713 */
2714 uint32_t first_index;
2715
2716 /**
2717 * Whether it's an indexed draw.
2718 */
2719 bool indexed;
2720
2721 /**
2722 * Indirect draw parameters resource.
2723 */
2724 struct tu_buffer *indirect;
2725 uint64_t indirect_offset;
2726 uint32_t stride;
2727
2728 /**
2729 * Draw count parameters resource.
2730 */
2731 struct tu_buffer *count_buffer;
2732 uint64_t count_buffer_offset;
2733 };
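/* A minimal, hypothetical sketch of how a draw entry point could fill this
 * in; the tu_draw() helper named here is an assumption for illustration,
 * not necessarily the actual dispatch path:
 *
 *    struct tu_draw_info info = {
 *       .count = vertexCount,
 *       .instance_count = instanceCount,
 *       .first_instance = firstInstance,
 *       .vertex_offset = 0,
 *       .indexed = false,
 *    };
 *    tu_draw(cmd, &info);
 */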
2734
2735 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2736 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2737
2738 enum tu_draw_state_group_id
2739 {
2740 TU_DRAW_STATE_PROGRAM,
2741 TU_DRAW_STATE_PROGRAM_BINNING,
2742 TU_DRAW_STATE_VI,
2743 TU_DRAW_STATE_VI_BINNING,
2744 TU_DRAW_STATE_VP,
2745 TU_DRAW_STATE_RAST,
2746 TU_DRAW_STATE_DS,
2747 TU_DRAW_STATE_BLEND,
2748 TU_DRAW_STATE_VS_CONST,
2749 TU_DRAW_STATE_FS_CONST,
2750 TU_DRAW_STATE_VS_TEX,
2751 TU_DRAW_STATE_FS_TEX_SYSMEM,
2752 TU_DRAW_STATE_FS_TEX_GMEM,
2753 TU_DRAW_STATE_FS_IBO,
2754 TU_DRAW_STATE_VS_PARAMS,
2755
2756 TU_DRAW_STATE_COUNT,
2757 };
2758
2759 struct tu_draw_state_group
2760 {
2761 enum tu_draw_state_group_id id;
2762 uint32_t enable_mask;
2763 struct tu_cs_entry ib;
2764 };
2765
2766 const static struct tu_sampler*
2767 sampler_ptr(struct tu_descriptor_state *descriptors_state,
2768 const struct tu_descriptor_map *map, unsigned i,
2769 unsigned array_index)
2770 {
2771 assert(descriptors_state->valid & (1 << map->set[i]));
2772
2773 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2774 assert(map->binding[i] < set->layout->binding_count);
2775
2776 const struct tu_descriptor_set_binding_layout *layout =
2777 &set->layout->binding[map->binding[i]];
2778
2779 if (layout->immutable_samplers_offset) {
2780 const struct tu_sampler *immutable_samplers =
2781 tu_immutable_samplers(set->layout, layout);
2782
2783 return &immutable_samplers[array_index];
2784 }
2785
2786 switch (layout->type) {
2787 case VK_DESCRIPTOR_TYPE_SAMPLER:
2788 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4];
2789 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2790 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
2791 array_index *
2792 (A6XX_TEX_CONST_DWORDS +
2793 sizeof(struct tu_sampler) / 4)];
2794 default:
2795 unreachable("unimplemented descriptor type");
2796 break;
2797 }
2798 }
2799
2800 static void
2801 write_tex_const(struct tu_cmd_buffer *cmd,
2802 uint32_t *dst,
2803 struct tu_descriptor_state *descriptors_state,
2804 const struct tu_descriptor_map *map,
2805 unsigned i, unsigned array_index, bool is_sysmem)
2806 {
2807 assert(descriptors_state->valid & (1 << map->set[i]));
2808
2809 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2810 assert(map->binding[i] < set->layout->binding_count);
2811
2812 const struct tu_descriptor_set_binding_layout *layout =
2813 &set->layout->binding[map->binding[i]];
2814
2815 switch (layout->type) {
2816 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2817 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2818 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2819 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2820 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2821 array_index * A6XX_TEX_CONST_DWORDS],
2822 A6XX_TEX_CONST_DWORDS * 4);
2823 break;
2824 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2825 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2826 array_index *
2827 (A6XX_TEX_CONST_DWORDS +
2828 sizeof(struct tu_sampler) / 4)],
2829 A6XX_TEX_CONST_DWORDS * 4);
2830 break;
2831 default:
2832 unreachable("unimplemented descriptor type");
2833 break;
2834 }
2835
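   /* For GMEM rendering, patch the input-attachment descriptor so it reads
    * straight from GMEM: a 2D texture with the GMEM tiling (TILE6_2), a
    * pitch of tile-width * cpp, and a base of gmem_offset (0x100000
    * presumably being the GMEM aperture base in the GPU address map).
    */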
2836 if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
2837 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2838 uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
2839 array_index].attachment;
2840 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2841
2842 assert(att->gmem_offset >= 0);
2843
2844 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
2845 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
2846 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
2847 dst[2] |=
2848 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
2849 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
2850 dst[3] = 0;
2851 dst[4] = 0x100000 + att->gmem_offset;
2852 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
2853 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
2854 dst[i] = 0;
2855
2856 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2857 tu_finishme("patch input attachment pitch for secondary cmd buffer");
2858 }
2859 }
2860
2861 static void
2862 write_image_ibo(struct tu_cmd_buffer *cmd,
2863 uint32_t *dst,
2864 struct tu_descriptor_state *descriptors_state,
2865 const struct tu_descriptor_map *map,
2866 unsigned i, unsigned array_index)
2867 {
2868 assert(descriptors_state->valid & (1 << map->set[i]));
2869
2870 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2871 assert(map->binding[i] < set->layout->binding_count);
2872
2873 const struct tu_descriptor_set_binding_layout *layout =
2874 &set->layout->binding[map->binding[i]];
2875
2876 assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
2877
2878 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2879 (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
2880 A6XX_TEX_CONST_DWORDS * 4);
2881 }
2882
2883 static uint64_t
2884 buffer_ptr(struct tu_descriptor_state *descriptors_state,
2885 const struct tu_descriptor_map *map,
2886 unsigned i, unsigned array_index)
2887 {
2888 assert(descriptors_state->valid & (1 << map->set[i]));
2889
2890 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2891 assert(map->binding[i] < set->layout->binding_count);
2892
2893 const struct tu_descriptor_set_binding_layout *layout =
2894 &set->layout->binding[map->binding[i]];
2895
2896 switch (layout->type) {
2897 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2898 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2899 return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
2900 array_index];
2901 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2902 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2903 return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
2904 set->mapped_ptr[layout->offset / 4 + array_index * 2];
2905 default:
2906 unreachable("unimplemented descriptor type");
2907 break;
2908 }
2909 }
2910
2911 static inline uint32_t
2912 tu6_stage2opcode(gl_shader_stage type)
2913 {
2914 switch (type) {
2915 case MESA_SHADER_VERTEX:
2916 case MESA_SHADER_TESS_CTRL:
2917 case MESA_SHADER_TESS_EVAL:
2918 case MESA_SHADER_GEOMETRY:
2919 return CP_LOAD_STATE6_GEOM;
2920 case MESA_SHADER_FRAGMENT:
2921 case MESA_SHADER_COMPUTE:
2922 case MESA_SHADER_KERNEL:
2923 return CP_LOAD_STATE6_FRAG;
2924 default:
2925 unreachable("bad shader type");
2926 }
2927 }
2928
2929 static inline enum a6xx_state_block
2930 tu6_stage2shadersb(gl_shader_stage type)
2931 {
2932 switch (type) {
2933 case MESA_SHADER_VERTEX:
2934 return SB6_VS_SHADER;
2935 case MESA_SHADER_FRAGMENT:
2936 return SB6_FS_SHADER;
2937 case MESA_SHADER_COMPUTE:
2938 case MESA_SHADER_KERNEL:
2939 return SB6_CS_SHADER;
2940 default:
2941 unreachable("bad shader type");
2942 return ~0;
2943 }
2944 }
2945
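/* Emit the stage's constant data.  Range 0 of the UBO analysis state holds
 * the push constants and is emitted inline (SS6_DIRECT); the remaining
 * ranges are slices of real UBOs, loaded indirectly (SS6_INDIRECT) from
 * the buffer VA looked up through the descriptor state.
 */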
2946 static void
2947 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2948 struct tu_descriptor_state *descriptors_state,
2949 gl_shader_stage type,
2950 uint32_t *push_constants)
2951 {
2952 const struct tu_program_descriptor_linkage *link =
2953 &pipeline->program.link[type];
2954 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2955
2956 for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
2957 if (state->range[i].start < state->range[i].end) {
2958 uint32_t size = state->range[i].end - state->range[i].start;
2959 uint32_t offset = state->range[i].start;
2960
2961 /* and even if the start of the const buffer is before
2962 * first_immediate, the end may not be:
2963 */
2964 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2965
2966 if (size == 0)
2967 continue;
2968
2969 /* things should be aligned to vec4: */
2970 debug_assert((state->range[i].offset % 16) == 0);
2971 debug_assert((size % 16) == 0);
2972 debug_assert((offset % 16) == 0);
2973
2974 if (i == 0) {
2975 /* push constants */
2976 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (size / 4));
2977 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2978 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2979 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2980 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2981 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2982 tu_cs_emit(cs, 0);
2983 tu_cs_emit(cs, 0);
2984 for (unsigned i = 0; i < size / 4; i++)
2985 tu_cs_emit(cs, push_constants[i + offset / 4]);
2986 continue;
2987 }
2988
2989 /* Look through the UBO map to find our UBO index, and get the VA for
2990 * that UBO.
2991 */
2992 uint64_t va = 0;
2993 uint32_t ubo_idx = i - 1;
2994 uint32_t ubo_map_base = 0;
2995 for (int j = 0; j < link->ubo_map.num; j++) {
2996 if (ubo_idx >= ubo_map_base &&
2997 ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
2998 va = buffer_ptr(descriptors_state, &link->ubo_map, j,
2999 ubo_idx - ubo_map_base);
3000 break;
3001 }
3002 ubo_map_base += link->ubo_map.array_size[j];
3003 }
3004 assert(va);
3005
3006 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
3007 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
3008 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3009 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3010 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3011 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
3012 tu_cs_emit_qw(cs, va + offset);
3013 }
3014 }
3015 }
3016
3017 static void
3018 tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
3019 struct tu_descriptor_state *descriptors_state,
3020 gl_shader_stage type)
3021 {
3022 const struct tu_program_descriptor_linkage *link =
3023 &pipeline->program.link[type];
3024
3025 uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
3026 uint32_t anum = align(num, 2);
3027
3028 if (!num)
3029 return;
3030
3031 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
3032 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
3033 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3034 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3035 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3036 CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
3037 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
3038 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
3039
3040 unsigned emitted = 0;
3041 for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
3042 for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
3043 tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
3044 emitted++;
3045 }
3046 }
3047
3048 for (; emitted < anum; emitted++) {
3049 tu_cs_emit(cs, 0xffffffff);
3050 tu_cs_emit(cs, 0xffffffff);
3051 }
3052 }
3053
3054 static struct tu_cs_entry
3055 tu6_emit_consts(struct tu_cmd_buffer *cmd,
3056 const struct tu_pipeline *pipeline,
3057 struct tu_descriptor_state *descriptors_state,
3058 gl_shader_stage type)
3059 {
3060 struct tu_cs cs;
3061 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
3062
3063 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3064 tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
3065
3066 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3067 }
3068
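/* Emit the VS driver params.  Per the TODO below, only the instance-id
 * base (slot 2, matching the STATIC_ASSERT on IR3_DP_INSTID_BASE) is
 * filled in with draw->first_instance; the other slots stay zero.  Nothing
 * is emitted when the driver-param range lies past the shader's constlen.
 */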
3069 static VkResult
3070 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3071 const struct tu_draw_info *draw,
3072 struct tu_cs_entry *entry)
3073 {
3074 /* TODO: fill out more than just base instance */
3075 const struct tu_program_descriptor_linkage *link =
3076 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3077 const struct ir3_const_state *const_state = &link->const_state;
3078 struct tu_cs cs;
3079
3080 if (const_state->offsets.driver_param >= link->constlen) {
3081 *entry = (struct tu_cs_entry) {};
3082 return VK_SUCCESS;
3083 }
3084
3085 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
3086 if (result != VK_SUCCESS)
3087 return result;
3088
3089 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3090 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
3091 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3092 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3093 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3094 CP_LOAD_STATE6_0_NUM_UNIT(1));
3095 tu_cs_emit(&cs, 0);
3096 tu_cs_emit(&cs, 0);
3097
3098 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3099
3100 tu_cs_emit(&cs, 0);
3101 tu_cs_emit(&cs, 0);
3102 tu_cs_emit(&cs, draw->first_instance);
3103 tu_cs_emit(&cs, 0);
3104
3105 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3106 return VK_SUCCESS;
3107 }
3108
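/* Build the texture and sampler descriptor arrays for one stage in sub_cs
 * memory, then emit CP_LOAD_STATE6 indirect loads for both along with the
 * stage's SP_..._TEX_CONST/SAMP/COUNT registers.  is_sysmem is forwarded
 * so input-attachment descriptors are only patched for the GMEM case.
 */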
3109 static VkResult
3110 tu6_emit_textures(struct tu_cmd_buffer *cmd,
3111 const struct tu_pipeline *pipeline,
3112 struct tu_descriptor_state *descriptors_state,
3113 gl_shader_stage type,
3114 struct tu_cs_entry *entry,
3115 bool *needs_border,
3116 bool is_sysmem)
3117 {
3118 struct tu_cs *draw_state = &cmd->sub_cs;
3119 const struct tu_program_descriptor_linkage *link =
3120 &pipeline->program.link[type];
3121 VkResult result;
3122
3123 if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
3124 *entry = (struct tu_cs_entry) {};
3125 return VK_SUCCESS;
3126 }
3127
3128 /* allocate and fill texture state */
3129 struct ts_cs_memory tex_const;
3130 result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
3131 A6XX_TEX_CONST_DWORDS, &tex_const);
3132 if (result != VK_SUCCESS)
3133 return result;
3134
3135 int tex_index = 0;
3136 for (unsigned i = 0; i < link->texture_map.num; i++) {
3137 for (int j = 0; j < link->texture_map.array_size[i]; j++) {
3138 write_tex_const(cmd,
3139 &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
3140 descriptors_state, &link->texture_map, i, j,
3141 is_sysmem);
3142 }
3143 }
3144
3145 /* allocate and fill sampler state */
3146 struct ts_cs_memory tex_samp = { 0 };
3147 if (link->sampler_map.num_desc) {
3148 result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
3149 A6XX_TEX_SAMP_DWORDS, &tex_samp);
3150 if (result != VK_SUCCESS)
3151 return result;
3152
3153 int sampler_index = 0;
3154 for (unsigned i = 0; i < link->sampler_map.num; i++) {
3155 for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
3156 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3157 &link->sampler_map,
3158 i, j);
3159 memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
3160 sampler->state, sizeof(sampler->state));
3161 *needs_border |= sampler->needs_border;
3162 }
3163 }
3164 }
3165
3166 unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
3167 enum a6xx_state_block sb;
3168
3169 switch (type) {
3170 case MESA_SHADER_VERTEX:
3171 sb = SB6_VS_TEX;
3172 tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
3173 tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
3174 tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
3175 break;
3176 case MESA_SHADER_FRAGMENT:
3177 sb = SB6_FS_TEX;
3178 tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
3179 tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
3180 tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
3181 break;
3182 case MESA_SHADER_COMPUTE:
3183 sb = SB6_CS_TEX;
3184 tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
3185 tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
3186 tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
3187 break;
3188 default:
3189 unreachable("bad state block");
3190 }
3191
3192 struct tu_cs cs;
3193 result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
3194 if (result != VK_SUCCESS)
3195 return result;
3196
3197 if (link->sampler_map.num_desc) {
3198 /* output sampler state: */
3199 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3200 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3201 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
3202 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3203 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3204 CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
3205 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3206
3207 tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
3208 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3209 }
3210
3211 /* emit texture state: */
3212 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3213 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3214 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3215 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3216 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3217 CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
3218 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3219
3220 tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
3221 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3222
3223 tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
3224 tu_cs_emit(&cs, link->texture_map.num_desc);
3225
3226 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3227 return VK_SUCCESS;
3228 }
3229
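/* Build the IBO (SSBO + storage image) descriptors.  SSBOs are described
 * as effectively unbounded 32-bit UINT buffers since robustBufferAccess
 * isn't exposed, and storage images copy the second of the two descriptors
 * the descriptor set wrote for them.  Everything is then loaded with a
 * single CP_LOAD_STATE6 plus the stage's IBO base registers.
 */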
3230 static VkResult
3231 tu6_emit_ibo(struct tu_cmd_buffer *cmd,
3232 const struct tu_pipeline *pipeline,
3233 struct tu_descriptor_state *descriptors_state,
3234 gl_shader_stage type,
3235 struct tu_cs_entry *entry)
3236 {
3237 struct tu_cs *draw_state = &cmd->sub_cs;
3238 const struct tu_program_descriptor_linkage *link =
3239 &pipeline->program.link[type];
3240 VkResult result;
3241
3242 unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
3243
3244 if (num_desc == 0) {
3245 *entry = (struct tu_cs_entry) {};
3246 return VK_SUCCESS;
3247 }
3248
3249 struct ts_cs_memory ibo_const;
3250 result = tu_cs_alloc(draw_state, num_desc,
3251 A6XX_TEX_CONST_DWORDS, &ibo_const);
3252 if (result != VK_SUCCESS)
3253 return result;
3254
3255 int ssbo_index = 0;
3256 for (unsigned i = 0; i < link->ssbo_map.num; i++) {
3257 for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
3258 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3259
3260 uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
3261 /* We don't expose robustBufferAccess, so leave the size unlimited. */
3262 uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
3263
3264 dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
3265 dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
3266 A6XX_IBO_1_HEIGHT(sz >> 15);
3267 dst[2] = A6XX_IBO_2_UNK4 |
3268 A6XX_IBO_2_UNK31 |
3269 A6XX_IBO_2_TYPE(A6XX_TEX_1D);
3270 dst[3] = 0;
3271 dst[4] = va;
3272 dst[5] = va >> 32;
3273 for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
3274 dst[i] = 0;
3275
3276 ssbo_index++;
3277 }
3278 }
3279
3280 for (unsigned i = 0; i < link->image_map.num; i++) {
3281 for (int j = 0; j < link->image_map.array_size[i]; j++) {
3282 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3283
3284 write_image_ibo(cmd, dst,
3285 descriptors_state, &link->image_map, i, j);
3286
3287 ssbo_index++;
3288 }
3289 }
3290
3291 assert(ssbo_index == num_desc);
3292
3293 struct tu_cs cs;
3294 result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
3295 if (result != VK_SUCCESS)
3296 return result;
3297
3298 uint32_t opcode, ibo_addr_reg;
3299 enum a6xx_state_block sb;
3300 enum a6xx_state_type st;
3301
3302 switch (type) {
3303 case MESA_SHADER_FRAGMENT:
3304 opcode = CP_LOAD_STATE6;
3305 st = ST6_SHADER;
3306 sb = SB6_IBO;
3307 ibo_addr_reg = REG_A6XX_SP_IBO_LO;
3308 break;
3309 case MESA_SHADER_COMPUTE:
3310 opcode = CP_LOAD_STATE6_FRAG;
3311 st = ST6_IBO;
3312 sb = SB6_CS_SHADER;
3313 ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
3314 break;
3315 default:
3316 unreachable("unsupported stage for ibos");
3317 }
3318
3319    /* emit the IBO descriptors: */
3320 tu_cs_emit_pkt7(&cs, opcode, 3);
3321 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3322 CP_LOAD_STATE6_0_STATE_TYPE(st) |
3323 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3324 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3325 CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
3326 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3327
3328 tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
3329 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3330
3331 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3332 return VK_SUCCESS;
3333 }
3334
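/* Hardware border-color table: one 128-byte entry per VkBorderColor that
 * appears to hold the border value pre-converted to every format class a
 * sampler might read (the size is checked by the STATIC_ASSERT in
 * tu6_emit_border_color below).
 */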
3335 struct PACKED bcolor_entry {
3336 uint32_t fp32[4];
3337 uint16_t ui16[4];
3338 int16_t si16[4];
3339 uint16_t fp16[4];
3340 uint16_t rgb565;
3341 uint16_t rgb5a1;
3342 uint16_t rgba4;
3343 uint8_t __pad0[2];
3344 uint8_t ui8[4];
3345 int8_t si8[4];
3346 uint32_t rgb10a2;
3347 uint32_t z24; /* also s8? */
3348 uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
3349 uint8_t __pad1[56];
3350 } border_color[] = {
3351 [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = {},
3352 [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = {},
3353 [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = {
3354 .fp32[3] = 0x3f800000,
3355 .ui16[3] = 0xffff,
3356 .si16[3] = 0x7fff,
3357 .fp16[3] = 0x3c00,
3358 .rgb5a1 = 0x8000,
3359 .rgba4 = 0xf000,
3360 .ui8[3] = 0xff,
3361 .si8[3] = 0x7f,
3362 .rgb10a2 = 0xc0000000,
3363 .srgb[3] = 0x3c00,
3364 },
3365 [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = {
3366 .fp32[3] = 1,
3367 .fp16[3] = 1,
3368 },
3369 [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = {
3370 .fp32[0 ... 3] = 0x3f800000,
3371 .ui16[0 ... 3] = 0xffff,
3372 .si16[0 ... 3] = 0x7fff,
3373 .fp16[0 ... 3] = 0x3c00,
3374 .rgb565 = 0xffff,
3375 .rgb5a1 = 0xffff,
3376 .rgba4 = 0xffff,
3377 .ui8[0 ... 3] = 0xff,
3378 .si8[0 ... 3] = 0x7f,
3379 .rgb10a2 = 0xffffffff,
3380 .z24 = 0xffffff,
3381 .srgb[0 ... 3] = 0x3c00,
3382 },
3383 [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = {
3384 .fp32[0 ... 3] = 1,
3385 .fp16[0 ... 3] = 1,
3386 },
3387 };
3388
3389 static VkResult
3390 tu6_emit_border_color(struct tu_cmd_buffer *cmd,
3391 struct tu_cs *cs)
3392 {
3393 STATIC_ASSERT(sizeof(struct bcolor_entry) == 128);
3394
3395 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3396 struct tu_descriptor_state *descriptors_state =
3397 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3398 const struct tu_descriptor_map *vs_sampler =
3399 &pipeline->program.link[MESA_SHADER_VERTEX].sampler_map;
3400 const struct tu_descriptor_map *fs_sampler =
3401 &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map;
3402 struct ts_cs_memory ptr;
3403
3404 VkResult result = tu_cs_alloc(&cmd->sub_cs,
3405 vs_sampler->num_desc + fs_sampler->num_desc,
3406 128 / 4,
3407 &ptr);
3408 if (result != VK_SUCCESS)
3409 return result;
3410
3411 for (unsigned i = 0; i < vs_sampler->num; i++) {
3412 for (unsigned j = 0; j < vs_sampler->array_size[i]; j++) {
3413 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3414 vs_sampler, i, j);
3415 memcpy(ptr.map, &border_color[sampler->border], 128);
3416 ptr.map += 128 / 4;
3417 }
3418 }
3419
3420 for (unsigned i = 0; i < fs_sampler->num; i++) {
3421 for (unsigned j = 0; j < fs_sampler->array_size[i]; j++) {
3422 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3423 fs_sampler, i, j);
3424 memcpy(ptr.map, &border_color[sampler->border], 128);
3425 ptr.map += 128 / 4;
3426 }
3427 }
3428
3429 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
3430 tu_cs_emit_qw(cs, ptr.iova);
3431 return VK_SUCCESS;
3432 }
3433
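/* Gather all state needed for a draw.  Dynamic state and vertex buffers
 * are written directly to the draw cs, while pipeline state, constants,
 * textures and IBOs are collected as tu_draw_state_group entries whose
 * enable masks select the binning / GMEM / sysmem passes they apply to,
 * presumably to be emitted as CP_SET_DRAW_STATE groups afterwards.
 */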
3434 static VkResult
3435 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3436 struct tu_cs *cs,
3437 const struct tu_draw_info *draw)
3438 {
3439 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3440 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3441 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3442 uint32_t draw_state_group_count = 0;
3443 VkResult result;
3444
3445 struct tu_descriptor_state *descriptors_state =
3446 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3447
3448 /* TODO lrz */
3449
3450 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
3451 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
3452 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
3453
3454 tu_cs_emit_regs(cs,
3455 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3456 pipeline->ia.primitive_restart && draw->indexed));
3457
3458 if (cmd->state.dirty &
3459 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3460 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3461 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3462 dynamic->line_width);
3463 }
3464
3465 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3466 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3467 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3468 dynamic->stencil_compare_mask.back);
3469 }
3470
3471 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3472 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3473 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3474 dynamic->stencil_write_mask.back);
3475 }
3476
3477 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3478 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3479 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3480 dynamic->stencil_reference.back);
3481 }
3482
3483 if (cmd->state.dirty &
3484 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) {
3485 for (uint32_t i = 0; i < pipeline->vi.count; i++) {
3486 const uint32_t binding = pipeline->vi.bindings[i];
3487 const uint32_t stride = pipeline->vi.strides[i];
3488 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3489 const VkDeviceSize offset = buf->bo_offset +
3490 cmd->state.vb.offsets[binding] +
3491 pipeline->vi.offsets[i];
3492 const VkDeviceSize size =
3493 offset < buf->bo->size ? buf->bo->size - offset : 0;
3494
3495 tu_cs_emit_regs(cs,
3496 A6XX_VFD_FETCH_BASE(i, .bo = buf->bo, .bo_offset = offset),
3497 A6XX_VFD_FETCH_SIZE(i, size),
3498 A6XX_VFD_FETCH_STRIDE(i, stride));
3499 }
3500 }
3501
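/* The pipeline's static state lives in per-group IBs built at pipeline
 * creation time; when the pipeline changes, all of those groups are added
 * to the CP_SET_DRAW_STATE packet again. The binning variants of the
 * program and vertex-input state carry the BINNING enable mask so they
 * only apply to the binning pass.
 */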
3502 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3503 draw_state_groups[draw_state_group_count++] =
3504 (struct tu_draw_state_group) {
3505 .id = TU_DRAW_STATE_PROGRAM,
3506 .enable_mask = ENABLE_DRAW,
3507 .ib = pipeline->program.state_ib,
3508 };
3509 draw_state_groups[draw_state_group_count++] =
3510 (struct tu_draw_state_group) {
3511 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3512 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3513 .ib = pipeline->program.binning_state_ib,
3514 };
3515 draw_state_groups[draw_state_group_count++] =
3516 (struct tu_draw_state_group) {
3517 .id = TU_DRAW_STATE_VI,
3518 .enable_mask = ENABLE_DRAW,
3519 .ib = pipeline->vi.state_ib,
3520 };
3521 draw_state_groups[draw_state_group_count++] =
3522 (struct tu_draw_state_group) {
3523 .id = TU_DRAW_STATE_VI_BINNING,
3524 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3525 .ib = pipeline->vi.binning_state_ib,
3526 };
3527 draw_state_groups[draw_state_group_count++] =
3528 (struct tu_draw_state_group) {
3529 .id = TU_DRAW_STATE_VP,
3530 .enable_mask = ENABLE_ALL,
3531 .ib = pipeline->vp.state_ib,
3532 };
3533 draw_state_groups[draw_state_group_count++] =
3534 (struct tu_draw_state_group) {
3535 .id = TU_DRAW_STATE_RAST,
3536 .enable_mask = ENABLE_ALL,
3537 .ib = pipeline->rast.state_ib,
3538 };
3539 draw_state_groups[draw_state_group_count++] =
3540 (struct tu_draw_state_group) {
3541 .id = TU_DRAW_STATE_DS,
3542 .enable_mask = ENABLE_ALL,
3543 .ib = pipeline->ds.state_ib,
3544 };
3545 draw_state_groups[draw_state_group_count++] =
3546 (struct tu_draw_state_group) {
3547 .id = TU_DRAW_STATE_BLEND,
3548 .enable_mask = ENABLE_ALL,
3549 .ib = pipeline->blend.state_ib,
3550 };
3551 }
3552
3553 if (cmd->state.dirty &
3554 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) {
3555 draw_state_groups[draw_state_group_count++] =
3556 (struct tu_draw_state_group) {
3557 .id = TU_DRAW_STATE_VS_CONST,
3558 .enable_mask = ENABLE_ALL,
3559 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3560 };
3561 draw_state_groups[draw_state_group_count++] =
3562 (struct tu_draw_state_group) {
3563 .id = TU_DRAW_STATE_FS_CONST,
3564 .enable_mask = ENABLE_DRAW,
3565 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3566 };
3567 }
3568
3569 if (cmd->state.dirty &
3570 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
3571 bool needs_border = false;
3572 struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
3573
3574 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3575 MESA_SHADER_VERTEX, &vs_tex, &needs_border,
3576 false);
3577 if (result != VK_SUCCESS)
3578 return result;
3579
3580 /* TODO: we could emit just one texture descriptor draw state when there
3581 * are no input attachments, which is the most common case. We could
3582 * also split out the sampler state, which doesn't change even for input
3583 * attachments.
3584 */
3585 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3586 MESA_SHADER_FRAGMENT, &fs_tex_sysmem,
3587 &needs_border, true);
3588 if (result != VK_SUCCESS)
3589 return result;
3590
3591 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3592 MESA_SHADER_FRAGMENT, &fs_tex_gmem,
3593 &needs_border, false);
3594 if (result != VK_SUCCESS)
3595 return result;
3596
3597 result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
3598 MESA_SHADER_FRAGMENT, &fs_ibo);
3599 if (result != VK_SUCCESS)
3600 return result;
3601
3602 draw_state_groups[draw_state_group_count++] =
3603 (struct tu_draw_state_group) {
3604 .id = TU_DRAW_STATE_VS_TEX,
3605 .enable_mask = ENABLE_ALL,
3606 .ib = vs_tex,
3607 };
3608 draw_state_groups[draw_state_group_count++] =
3609 (struct tu_draw_state_group) {
3610 .id = TU_DRAW_STATE_FS_TEX_GMEM,
3611 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3612 .ib = fs_tex_gmem,
3613 };
3614 draw_state_groups[draw_state_group_count++] =
3615 (struct tu_draw_state_group) {
3616 .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
3617 .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
3618 .ib = fs_tex_sysmem,
3619 };
3620 draw_state_groups[draw_state_group_count++] =
3621 (struct tu_draw_state_group) {
3622 .id = TU_DRAW_STATE_FS_IBO,
3623 .enable_mask = ENABLE_DRAW,
3624 .ib = fs_ibo,
3625 };
3626
3627 if (needs_border) {
3628 result = tu6_emit_border_color(cmd, cs);
3629 if (result != VK_SUCCESS)
3630 return result;
3631 }
3632 }
3633
3634 struct tu_cs_entry vs_params;
3635 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3636 if (result != VK_SUCCESS)
3637 return result;
3638
3639 draw_state_groups[draw_state_group_count++] =
3640 (struct tu_draw_state_group) {
3641 .id = TU_DRAW_STATE_VS_PARAMS,
3642 .enable_mask = ENABLE_ALL,
3643 .ib = vs_params,
3644 };
3645
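/* Each group takes 3 dwords in the packet:
 *   dword 0: state size in dwords, enable mask and group id
 *   dwords 1-2: 64-bit iova of the group's IB
 * A group with an empty IB is emitted with the DISABLE bit so that any
 * previously set state for that group id is turned off.
 */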
3646 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3647 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3648 const struct tu_draw_state_group *group = &draw_state_groups[i];
3649 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3650 uint32_t cp_set_draw_state =
3651 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3652 group->enable_mask |
3653 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3654 uint64_t iova;
3655 if (group->ib.size) {
3656 iova = group->ib.bo->iova + group->ib.offset;
3657 } else {
3658 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3659 iova = 0;
3660 }
3661
3662 tu_cs_emit(cs, cp_set_draw_state);
3663 tu_cs_emit_qw(cs, iova);
3664 }
3665
3666 tu_cs_sanity_check(cs);
3667
3668 /* track BOs */
3669 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3670 for (uint32_t i = 0; i < MAX_VBS; i++) {
3671 const struct tu_buffer *buf = cmd->state.vb.buffers[i];
3672 if (buf)
3673 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3674 }
3675 }
3676 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3677 unsigned i;
3678 for_each_bit(i, descriptors_state->valid) {
3679 struct tu_descriptor_set *set = descriptors_state->sets[i];
3680 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3681 if (set->descriptors[j]) {
3682 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3683 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3684 }
3685 }
3686 }
3687
3688 /* Fragment shader state overwrites compute shader state, so flag the
3689 * compute pipeline for re-emit.
3690 */
3691 cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
3692 return VK_SUCCESS;
3693 }
3694
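/* Emit the actual draw packet. Indexed draws use CP_DRAW_INDX_OFFSET with
 * DMA-sourced indices (iova and size of the bound index buffer range),
 * non-indexed draws use the same packet with auto-generated indices.
 */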
3695 static void
3696 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3697 struct tu_cs *cs,
3698 const struct tu_draw_info *draw)
3699 {
3701 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3702
3703 tu_cs_emit_regs(cs,
3704 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3705 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3706
3707 /* TODO hw binning */
3708 if (draw->indexed) {
3709 const enum a4xx_index_size index_size =
3710 tu6_index_size(cmd->state.index_type);
3711 const uint32_t index_bytes =
3712 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3713 const struct tu_buffer *buf = cmd->state.index_buffer;
3714 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3715 index_bytes * draw->first_index;
3716 const uint32_t size = index_bytes * draw->count;
3717
3718 const uint32_t cp_draw_indx =
3719 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3720 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3721 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3722 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3723
3724 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3725 tu_cs_emit(cs, cp_draw_indx);
3726 tu_cs_emit(cs, draw->instance_count);
3727 tu_cs_emit(cs, draw->count);
3728 tu_cs_emit(cs, 0x0); /* XXX */
3729 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3730 tu_cs_emit(cs, size);
3731 } else {
3732 const uint32_t cp_draw_indx =
3733 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3734 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3735 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3736
3737 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3738 tu_cs_emit(cs, cp_draw_indx);
3739 tu_cs_emit(cs, draw->instance_count);
3740 tu_cs_emit(cs, draw->count);
3741 }
3742 }
3743
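/* Common draw path: bind all draw state, then emit the draw itself into
 * draw_cs. Indirect draws are not implemented yet. Any failure is recorded
 * in cmd->record_result rather than returned directly.
 */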
3744 static void
3745 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3746 {
3747 struct tu_cs *cs = &cmd->draw_cs;
3748 VkResult result;
3749
3750 result = tu6_bind_draw_states(cmd, cs, draw);
3751 if (result != VK_SUCCESS) {
3752 cmd->record_result = result;
3753 return;
3754 }
3755
3756 if (draw->indirect) {
3757 tu_finishme("indirect draw");
3758 return;
3759 }
3760
3761 /* TODO tu6_emit_marker should pick different regs depending on cs */
3762
3763 tu6_emit_marker(cmd, cs);
3764 tu6_emit_draw_direct(cmd, cs, draw);
3765 tu6_emit_marker(cmd, cs);
3766
3767 cmd->wait_for_idle = true;
3768
3769 tu_cs_sanity_check(cs);
3770 }
3771
3772 void
3773 tu_CmdDraw(VkCommandBuffer commandBuffer,
3774 uint32_t vertexCount,
3775 uint32_t instanceCount,
3776 uint32_t firstVertex,
3777 uint32_t firstInstance)
3778 {
3779 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3780 struct tu_draw_info info = {};
3781
3782 info.count = vertexCount;
3783 info.instance_count = instanceCount;
3784 info.first_instance = firstInstance;
3785 info.vertex_offset = firstVertex;
3786
3787 tu_draw(cmd_buffer, &info);
3788 }
3789
3790 void
3791 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3792 uint32_t indexCount,
3793 uint32_t instanceCount,
3794 uint32_t firstIndex,
3795 int32_t vertexOffset,
3796 uint32_t firstInstance)
3797 {
3798 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3799 struct tu_draw_info info = {};
3800
3801 info.indexed = true;
3802 info.count = indexCount;
3803 info.instance_count = instanceCount;
3804 info.first_index = firstIndex;
3805 info.vertex_offset = vertexOffset;
3806 info.first_instance = firstInstance;
3807
3808 tu_draw(cmd_buffer, &info);
3809 }
3810
3811 void
3812 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3813 VkBuffer _buffer,
3814 VkDeviceSize offset,
3815 uint32_t drawCount,
3816 uint32_t stride)
3817 {
3818 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3819 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3820 struct tu_draw_info info = {};
3821
3822 info.count = drawCount;
3823 info.indirect = buffer;
3824 info.indirect_offset = offset;
3825 info.stride = stride;
3826
3827 tu_draw(cmd_buffer, &info);
3828 }
3829
3830 void
3831 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3832 VkBuffer _buffer,
3833 VkDeviceSize offset,
3834 uint32_t drawCount,
3835 uint32_t stride)
3836 {
3837 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3838 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3839 struct tu_draw_info info = {};
3840
3841 info.indexed = true;
3842 info.count = drawCount;
3843 info.indirect = buffer;
3844 info.indirect_offset = offset;
3845 info.stride = stride;
3846
3847 tu_draw(cmd_buffer, &info);
3848 }
3849
3850 struct tu_dispatch_info
3851 {
3852 /**
3853 * Dimensions of the dispatch grid, in workgroup (block) units.
3854 */
3855 uint32_t blocks[3];
3856
3857 /**
3858 * Starting offset of the grid, in workgroup units (vkCmdDispatchBase).
3859 * Even when unaligned is set, the offset must still be aligned.
3860 */
3861 uint32_t offsets[3];
3862 /**
3863 * Whether this is an unaligned compute dispatch.
3864 */
3865 bool unaligned;
3866
3867 /**
3868 * Indirect compute parameters resource.
3869 */
3870 struct tu_buffer *indirect;
3871 uint64_t indirect_offset;
3872 };
3873
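/* Upload the ir3 compute driver params (workgroup counts and local group
 * size) as constants, starting at the const offset the compiler reserved
 * for them. If the shader's constlen does not reach that offset the params
 * are unused and nothing is emitted. Indirect dispatch params are not
 * implemented yet.
 */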
3874 static void
3875 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3876 const struct tu_dispatch_info *info)
3877 {
3878 gl_shader_stage type = MESA_SHADER_COMPUTE;
3879 const struct tu_program_descriptor_linkage *link =
3880 &pipeline->program.link[type];
3881 const struct ir3_const_state *const_state = &link->const_state;
3882 uint32_t offset = const_state->offsets.driver_param;
3883
3884 if (link->constlen <= offset)
3885 return;
3886
3887 if (!info->indirect) {
3888 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3889 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3890 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3891 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3892 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3893 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3894 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3895 };
3896
3897 uint32_t num_consts = MIN2(const_state->num_driver_params,
3898 (link->constlen - offset) * 4);
3899 /* push constants */
3900 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3901 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3902 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3903 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3904 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3905 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3906 tu_cs_emit(cs, 0);
3907 tu_cs_emit(cs, 0);
3908 uint32_t i;
3909 for (i = 0; i < num_consts; i++)
3910 tu_cs_emit(cs, driver_params[i]);
3911 } else {
3912 tu_finishme("Indirect driver params");
3913 }
3914 }
3915
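/* Record a compute dispatch. Dispatches happen outside of render passes,
 * so the state and CP_EXEC_CS packets go straight into cmd->cs rather than
 * the tiled draw_cs: re-emit compute pipeline/descriptor state as needed,
 * program the HLSQ_CS_NDRANGE registers and kick off the grid with
 * CP_EXEC_CS or CP_EXEC_CS_INDIRECT.
 */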
3916 static void
3917 tu_dispatch(struct tu_cmd_buffer *cmd,
3918 const struct tu_dispatch_info *info)
3919 {
3920 struct tu_cs *cs = &cmd->cs;
3921 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3922 struct tu_descriptor_state *descriptors_state =
3923 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3924 VkResult result;
3925
3926 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3927 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3928
3929 struct tu_cs_entry ib;
3930
3931 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3932 if (ib.size)
3933 tu_cs_emit_ib(cs, &ib);
3934
3935 tu_emit_compute_driver_params(cs, pipeline, info);
3936
3937 bool needs_border = false;
3938 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3939 MESA_SHADER_COMPUTE, &ib, &needs_border, false);
3940 if (result != VK_SUCCESS) {
3941 cmd->record_result = result;
3942 return;
3943 }
3944
3945 if (ib.size)
3946 tu_cs_emit_ib(cs, &ib);
3947
3948 if (needs_border)
3949 tu_finishme("compute border color");
3950
3951 result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
3952 if (result != VK_SUCCESS) {
3953 cmd->record_result = result;
3954 return;
3955 }
3956
3957 if (ib.size)
3958 tu_cs_emit_ib(cs, &ib);
3959
3960 /* track BOs */
3961 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3962 unsigned i;
3963 for_each_bit(i, descriptors_state->valid) {
3964 struct tu_descriptor_set *set = descriptors_state->sets[i];
3965 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3966 if (set->descriptors[j]) {
3967 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3968 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3969 }
3970 }
3971 }
3972
3973 /* Compute shader state overwrites fragment shader state, so we flag the
3974 * graphics pipeline for re-emit.
3975 */
3976 cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
3977
3978 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3979 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3980
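/* Program the compute grid: the local size fields are encoded as size - 1,
 * and the global size is the local size multiplied by the number of
 * workgroups in each dimension. Global offsets are left at zero here.
 */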
3981 const uint32_t *local_size = pipeline->compute.local_size;
3982 const uint32_t *num_groups = info->blocks;
3983 tu_cs_emit_regs(cs,
3984 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3985 .localsizex = local_size[0] - 1,
3986 .localsizey = local_size[1] - 1,
3987 .localsizez = local_size[2] - 1),
3988 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3989 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3990 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3991 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3992 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3993 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3994
3995 tu_cs_emit_regs(cs,
3996 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3997 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3998 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3999
4000 if (info->indirect) {
4001 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
4002
4003 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
4004 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
4005
4006 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
4007 tu_cs_emit(cs, 0x00000000);
4008 tu_cs_emit_qw(cs, iova);
4009 tu_cs_emit(cs,
4010 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
4011 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
4012 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
4013 } else {
4014 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
4015 tu_cs_emit(cs, 0x00000000);
4016 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
4017 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
4018 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
4019 }
4020
4021 tu_cs_emit_wfi(cs);
4022
4023 tu6_emit_cache_flush(cmd, cs);
4024 }
4025
4026 void
4027 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
4028 uint32_t base_x,
4029 uint32_t base_y,
4030 uint32_t base_z,
4031 uint32_t x,
4032 uint32_t y,
4033 uint32_t z)
4034 {
4035 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4036 struct tu_dispatch_info info = {};
4037
4038 info.blocks[0] = x;
4039 info.blocks[1] = y;
4040 info.blocks[2] = z;
4041
4042 info.offsets[0] = base_x;
4043 info.offsets[1] = base_y;
4044 info.offsets[2] = base_z;
4045 tu_dispatch(cmd_buffer, &info);
4046 }
4047
4048 void
4049 tu_CmdDispatch(VkCommandBuffer commandBuffer,
4050 uint32_t x,
4051 uint32_t y,
4052 uint32_t z)
4053 {
4054 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
4055 }
4056
4057 void
4058 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4059 VkBuffer _buffer,
4060 VkDeviceSize offset)
4061 {
4062 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4063 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
4064 struct tu_dispatch_info info = {};
4065
4066 info.indirect = buffer;
4067 info.indirect_offset = offset;
4068
4069 tu_dispatch(cmd_buffer, &info);
4070 }
4071
4072 void
4073 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
4074 {
4075 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4076
4077 tu_cs_end(&cmd_buffer->draw_cs);
4078 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
4079
4080 if (use_sysmem_rendering(cmd_buffer))
4081 tu_cmd_render_sysmem(cmd_buffer);
4082 else
4083 tu_cmd_render_tiles(cmd_buffer);
4084
4085 /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
4086 * rendered */
4087 tu_cs_discard_entries(&cmd_buffer->draw_cs);
4088 tu_cs_begin(&cmd_buffer->draw_cs);
4089 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
4090 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
4091
4092 cmd_buffer->state.pass = NULL;
4093 cmd_buffer->state.subpass = NULL;
4094 cmd_buffer->state.framebuffer = NULL;
4095 }
4096
4097 void
4098 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4099 const VkSubpassEndInfoKHR *pSubpassEndInfo)
4100 {
4101 tu_CmdEndRenderPass(commandBuffer);
4102 }
4103
4104 struct tu_barrier_info
4105 {
4106 uint32_t eventCount;
4107 const VkEvent *pEvents;
4108 VkPipelineStageFlags srcStageMask;
4109 };
4110
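/* TODO: barriers are currently a no-op; no cache flushes or stalls are
 * emitted yet.
 */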
4111 static void
4112 tu_barrier(struct tu_cmd_buffer *cmd_buffer,
4113 uint32_t memoryBarrierCount,
4114 const VkMemoryBarrier *pMemoryBarriers,
4115 uint32_t bufferMemoryBarrierCount,
4116 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4117 uint32_t imageMemoryBarrierCount,
4118 const VkImageMemoryBarrier *pImageMemoryBarriers,
4119 const struct tu_barrier_info *info)
4120 {
4121 }
4122
4123 void
4124 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4125 VkPipelineStageFlags srcStageMask,
4126 VkPipelineStageFlags destStageMask,
4127 VkBool32 byRegion,
4128 uint32_t memoryBarrierCount,
4129 const VkMemoryBarrier *pMemoryBarriers,
4130 uint32_t bufferMemoryBarrierCount,
4131 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4132 uint32_t imageMemoryBarrierCount,
4133 const VkImageMemoryBarrier *pImageMemoryBarriers)
4134 {
4135 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4136 struct tu_barrier_info info;
4137
4138 info.eventCount = 0;
4139 info.pEvents = NULL;
4140 info.srcStageMask = srcStageMask;
4141
4142 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4143 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4144 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4145 }
4146
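/* Set or reset an event from the command stream by writing the requested
 * value into the event's BO with CP_MEM_WRITE.
 */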
4147 static void
4148 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value)
4149 {
4150 struct tu_cs *cs = &cmd->cs;
4151
4152 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4153
4154 /* TODO: any flush required before/after? */
4155
4156 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4157 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4158 tu_cs_emit(cs, value);
4159 }
4160
4161 void
4162 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4163 VkEvent _event,
4164 VkPipelineStageFlags stageMask)
4165 {
4166 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4167 TU_FROM_HANDLE(tu_event, event, _event);
4168
4169 write_event(cmd, event, 1);
4170 }
4171
4172 void
4173 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4174 VkEvent _event,
4175 VkPipelineStageFlags stageMask)
4176 {
4177 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4178 TU_FROM_HANDLE(tu_event, event, _event);
4179
4180 write_event(cmd, event, 0);
4181 }
4182
4183 void
4184 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4185 uint32_t eventCount,
4186 const VkEvent *pEvents,
4187 VkPipelineStageFlags srcStageMask,
4188 VkPipelineStageFlags dstStageMask,
4189 uint32_t memoryBarrierCount,
4190 const VkMemoryBarrier *pMemoryBarriers,
4191 uint32_t bufferMemoryBarrierCount,
4192 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4193 uint32_t imageMemoryBarrierCount,
4194 const VkImageMemoryBarrier *pImageMemoryBarriers)
4195 {
4196 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4197 struct tu_cs *cs = &cmd->cs;
4198
4199 /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */
4200
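/* Wait for each event by polling its BO with CP_WAIT_REG_MEM until the
 * stored value equals 1, i.e. until the corresponding CmdSetEvent write
 * has landed.
 */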
4201 for (uint32_t i = 0; i < eventCount; i++) {
4202 const struct tu_event *event = (const struct tu_event*) pEvents[i];
4203
4204 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4205
4206 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4207 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4208 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4209 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4210 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4211 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4212 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4213 }
4214 }
4215
4216 void
4217 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4218 {
4219 /* No-op */
4220 }