turnip: Execute main cs for secondary command buffers
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36 #include "tu_blit.h"
37
38 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
39
40 void
41 tu_bo_list_init(struct tu_bo_list *list)
42 {
43 list->count = list->capacity = 0;
44 list->bo_infos = NULL;
45 }
46
47 void
48 tu_bo_list_destroy(struct tu_bo_list *list)
49 {
50 free(list->bo_infos);
51 }
52
53 void
54 tu_bo_list_reset(struct tu_bo_list *list)
55 {
56 list->count = 0;
57 }
58
59 /**
60 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
61 */
62 static uint32_t
63 tu_bo_list_add_info(struct tu_bo_list *list,
64 const struct drm_msm_gem_submit_bo *bo_info)
65 {
66 assert(bo_info->handle != 0);
67
68 for (uint32_t i = 0; i < list->count; ++i) {
69 if (list->bo_infos[i].handle == bo_info->handle) {
70 assert(list->bo_infos[i].presumed == bo_info->presumed);
71 list->bo_infos[i].flags |= bo_info->flags;
72 return i;
73 }
74 }
75
76 /* grow list->bo_infos if needed */
77 if (list->count == list->capacity) {
78 uint32_t new_capacity = MAX2(2 * list->count, 16);
79 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
80 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
81 if (!new_bo_infos)
82 return TU_BO_LIST_FAILED;
83 list->bo_infos = new_bo_infos;
84 list->capacity = new_capacity;
85 }
86
87 list->bo_infos[list->count] = *bo_info;
88 return list->count++;
89 }
90
91 uint32_t
92 tu_bo_list_add(struct tu_bo_list *list,
93 const struct tu_bo *bo,
94 uint32_t flags)
95 {
96 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
97 .flags = flags,
98 .handle = bo->gem_handle,
99 .presumed = bo->iova,
100 });
101 }
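/* A minimal usage sketch (illustrative only, not called by the driver): every
 * buffer the GPU may touch during a submit gets added with its
 * MSM_SUBMIT_BO_* access flags, and re-adding the same BO simply ORs the new
 * flags into the existing entry via tu_bo_list_add_info() above.  The buffer
 * names below are hypothetical.
 */
#if 0
static void
example_track_draw_buffers(struct tu_cmd_buffer *cmd,
                           const struct tu_bo *vertex_bo,
                           const struct tu_bo *color_bo)
{
   /* vertex data is only read by the GPU */
   tu_bo_list_add(&cmd->bo_list, vertex_bo, MSM_SUBMIT_BO_READ);

   /* the render target is both read and written; the two calls end up as a
    * single entry with READ | WRITE
    */
   tu_bo_list_add(&cmd->bo_list, color_bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, color_bo, MSM_SUBMIT_BO_WRITE);
}
#endif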
102
103 VkResult
104 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
105 {
106 for (uint32_t i = 0; i < other->count; i++) {
107 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
108 return VK_ERROR_OUT_OF_HOST_MEMORY;
109 }
110
111 return VK_SUCCESS;
112 }
113
114 static bool
115 is_linear_mipmapped(const struct tu_image_view *iview)
116 {
117 return iview->image->layout.tile_mode == TILE6_LINEAR &&
118 iview->base_mip != iview->image->level_count - 1;
119 }
120
121 static bool
122 force_sysmem(const struct tu_cmd_buffer *cmd,
123 const struct VkRect2D *render_area)
124 {
125 const struct tu_framebuffer *fb = cmd->state.framebuffer;
126 const struct tu_physical_device *device = cmd->device->physical_device;
127 bool has_linear_mipmapped_store = false;
128 const struct tu_render_pass *pass = cmd->state.pass;
129
130 /* Iterate over all the places we call tu6_emit_store_attachment() */
131 for (unsigned i = 0; i < pass->subpass_count; i++) {
132 const struct tu_subpass *subpass = &pass->subpasses[i];
133 if (subpass->resolve_attachments) {
134 for (unsigned i = 0; i < subpass->color_count; i++) {
135 uint32_t a = subpass->resolve_attachments[i].attachment;
136 if (a != VK_ATTACHMENT_UNUSED &&
137 cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
138 const struct tu_image_view *iview = fb->attachments[a].attachment;
139 if (is_linear_mipmapped(iview)) {
140 has_linear_mipmapped_store = true;
141 break;
142 }
143 }
144 }
145 }
146 }
147
148 for (unsigned i = 0; i < pass->attachment_count; i++) {
149 if (pass->attachments[i].gmem_offset >= 0 &&
150 cmd->state.pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
151 const struct tu_image_view *iview = fb->attachments[i].attachment;
152 if (is_linear_mipmapped(iview)) {
153 has_linear_mipmapped_store = true;
154 break;
155 }
156 }
157 }
158
159 /* Linear textures cannot have any padding between mipmap levels and their
160 * height isn't padded, while at the same time the GMEM->MEM resolve does
161 * not have per-pixel granularity, so if the image height isn't aligned to
162 * the resolve granularity and the render area is tall enough, we may wind
163 * up writing past the bottom of the image into the next miplevel or even
164 * past the end of the image. For the last miplevel, the layout code should
165 * insert enough padding so that the overdraw writes to the padding. To
166 * work around this, we force-enable sysmem rendering.
167 */
168 const uint32_t y2 = render_area->offset.y + render_area->extent.height;
169 const uint32_t aligned_y2 = ALIGN_POT(y2, device->tile_align_h);
170
171 return has_linear_mipmapped_store && aligned_y2 > fb->height;
172 }
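/* A worked example of the check above, with hypothetical numbers: with
 * tile_align_h = 16, a 100-pixel-tall framebuffer and a render area covering
 * y = [0, 100), we get y2 = 100 and aligned_y2 = ALIGN_POT(100, 16) = 112,
 * which is greater than fb->height, so a linear mipmapped store could write
 * 12 rows past the end of the level and we fall back to sysmem rendering.
 */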
173
174 static void
175 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
176 const struct tu_device *dev,
177 uint32_t pixels)
178 {
179 const uint32_t tile_align_w = dev->physical_device->tile_align_w;
180 const uint32_t tile_align_h = dev->physical_device->tile_align_h;
181 const uint32_t max_tile_width = 1024; /* A6xx */
182
183    /* note: don't offset the tiling config by render_area.offset,
184     * because the binning pass can't deal with it.
185     * This means we might end up with more tiles than necessary,
186     * but load/store/etc. are still scissored to the render_area.
187     */
188 tiling->tile0.offset = (VkOffset2D) {};
189
190 const uint32_t ra_width =
191 tiling->render_area.extent.width +
192 (tiling->render_area.offset.x - tiling->tile0.offset.x);
193 const uint32_t ra_height =
194 tiling->render_area.extent.height +
195 (tiling->render_area.offset.y - tiling->tile0.offset.y);
196
197 /* start from 1 tile */
198 tiling->tile_count = (VkExtent2D) {
199 .width = 1,
200 .height = 1,
201 };
202 tiling->tile0.extent = (VkExtent2D) {
203 .width = align(ra_width, tile_align_w),
204 .height = align(ra_height, tile_align_h),
205 };
206
207 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
208 /* start with 2x2 tiles */
209 tiling->tile_count.width = 2;
210 tiling->tile_count.height = 2;
211 tiling->tile0.extent.width = align(DIV_ROUND_UP(ra_width, 2), tile_align_w);
212 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), tile_align_h);
213 }
214
215 /* do not exceed max tile width */
216 while (tiling->tile0.extent.width > max_tile_width) {
217 tiling->tile_count.width++;
218 tiling->tile0.extent.width =
219 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
220 }
221
222    /* sysmem rendering will be forced, so don't bother trying to have a valid tile config
223 * TODO: just skip all GMEM stuff when sysmem is forced?
224 */
225 if (!pixels)
226 return;
227
228 /* do not exceed gmem size */
229 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pixels) {
230 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
231 tiling->tile_count.width++;
232 tiling->tile0.extent.width =
233 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
234 } else {
235 /* if this assert fails then layout is impossible.. */
236 assert(tiling->tile0.extent.height > tile_align_h);
237 tiling->tile_count.height++;
238 tiling->tile0.extent.height =
239 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), tile_align_h);
240 }
241 }
242 }
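/* A worked example of the sizing loops above, using hypothetical values of
 * tile_align_w/h = 32 and a GMEM budget of pixels = 262144: a 1920x1080
 * render area starts as a single 1920x1088 tile, is split to 2x1 (960-wide
 * tiles) to respect max_tile_width, and the GMEM loop then keeps splitting
 * the larger dimension until a 4x2 grid of 480x544 tiles fits, since
 * 480 * 544 = 261120 <= 262144.
 */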
243
244 static void
245 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
246 const struct tu_device *dev)
247 {
248 const uint32_t max_pipe_count = 32; /* A6xx */
249
250 /* start from 1 tile per pipe */
251 tiling->pipe0 = (VkExtent2D) {
252 .width = 1,
253 .height = 1,
254 };
255 tiling->pipe_count = tiling->tile_count;
256
257 /* do not exceed max pipe count vertically */
258 while (tiling->pipe_count.height > max_pipe_count) {
259 tiling->pipe0.height += 2;
260 tiling->pipe_count.height =
261 (tiling->tile_count.height + tiling->pipe0.height - 1) /
262 tiling->pipe0.height;
263 }
264
265 /* do not exceed max pipe count */
266 while (tiling->pipe_count.width * tiling->pipe_count.height >
267 max_pipe_count) {
268 tiling->pipe0.width += 1;
269 tiling->pipe_count.width =
270 (tiling->tile_count.width + tiling->pipe0.width - 1) /
271 tiling->pipe0.width;
272 }
273 }
274
275 static void
276 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
277 const struct tu_device *dev)
278 {
279 const uint32_t max_pipe_count = 32; /* A6xx */
280 const uint32_t used_pipe_count =
281 tiling->pipe_count.width * tiling->pipe_count.height;
282 const VkExtent2D last_pipe = {
283 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
284 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
285 };
286
287 assert(used_pipe_count <= max_pipe_count);
288 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
289
290 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
291 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
292 const uint32_t pipe_x = tiling->pipe0.width * x;
293 const uint32_t pipe_y = tiling->pipe0.height * y;
294 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
295 ? last_pipe.width
296 : tiling->pipe0.width;
297 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
298 ? last_pipe.height
299 : tiling->pipe0.height;
300 const uint32_t n = tiling->pipe_count.width * y + x;
301
302 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
303 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
304 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
305 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
306 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
307 }
308 }
309
310 memset(tiling->pipe_config + used_pipe_count, 0,
311 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
312 }
313
314 static void
315 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
316 const struct tu_device *dev,
317 uint32_t tx,
318 uint32_t ty,
319 struct tu_tile *tile)
320 {
321 /* find the pipe and the slot for tile (tx, ty) */
322 const uint32_t px = tx / tiling->pipe0.width;
323 const uint32_t py = ty / tiling->pipe0.height;
324 const uint32_t sx = tx - tiling->pipe0.width * px;
325 const uint32_t sy = ty - tiling->pipe0.height * py;
326
327 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
328 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
329 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
330
331 /* convert to 1D indices */
332 tile->pipe = tiling->pipe_count.width * py + px;
333 tile->slot = tiling->pipe0.width * sy + sx;
334
335 /* get the blit area for the tile */
336 tile->begin = (VkOffset2D) {
337 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
338 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
339 };
340 tile->end.x =
341 (tx == tiling->tile_count.width - 1)
342 ? tiling->render_area.offset.x + tiling->render_area.extent.width
343 : tile->begin.x + tiling->tile0.extent.width;
344 tile->end.y =
345 (ty == tiling->tile_count.height - 1)
346 ? tiling->render_area.offset.y + tiling->render_area.extent.height
347 : tile->begin.y + tiling->tile0.extent.height;
348 }
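/* A worked example of the tile -> pipe/slot mapping, with hypothetical
 * numbers: for a 10x8 tile grid, tu_tiling_config_update_pipe_layout() above
 * settles on pipe0 = 3x1 tiles and pipe_count = 4x8 (32 pipes), so tile
 * (tx=7, ty=5) falls in pipe (px=2, py=5) at slot (sx=1, sy=0), giving
 * tile->pipe = 4*5 + 2 = 22 and tile->slot = 3*0 + 1 = 1.
 */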
349
350 enum a3xx_msaa_samples
351 tu_msaa_samples(uint32_t samples)
352 {
353 switch (samples) {
354 case 1:
355 return MSAA_ONE;
356 case 2:
357 return MSAA_TWO;
358 case 4:
359 return MSAA_FOUR;
360 case 8:
361 return MSAA_EIGHT;
362 default:
363 assert(!"invalid sample count");
364 return MSAA_ONE;
365 }
366 }
367
368 static enum a4xx_index_size
369 tu6_index_size(VkIndexType type)
370 {
371 switch (type) {
372 case VK_INDEX_TYPE_UINT16:
373 return INDEX4_SIZE_16_BIT;
374 case VK_INDEX_TYPE_UINT32:
375 return INDEX4_SIZE_32_BIT;
376 default:
377 unreachable("invalid VkIndexType");
378 return INDEX4_SIZE_8_BIT;
379 }
380 }
381
382 unsigned
383 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
384 struct tu_cs *cs,
385 enum vgt_event_type event,
386 bool need_seqno)
387 {
388 unsigned seqno = 0;
389
390 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
391 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
392 if (need_seqno) {
393 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
394 seqno = ++cmd->scratch_seqno;
395 tu_cs_emit(cs, seqno);
396 }
397
398 return seqno;
399 }
400
401 static void
402 tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
403 {
404 tu6_emit_event_write(cmd, cs, 0x31, false);
405 }
406
407 static void
408 tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
409 {
410 tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false);
411 }
412
413 static void
414 tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
415 {
416 if (cmd->wait_for_idle) {
417 tu_cs_emit_wfi(cs);
418 cmd->wait_for_idle = false;
419 }
420 }
421
422 #define tu_image_view_ubwc_pitches(iview) \
423 .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \
424 .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2
425
426 static void
427 tu6_emit_zs(struct tu_cmd_buffer *cmd,
428 const struct tu_subpass *subpass,
429 struct tu_cs *cs)
430 {
431 const struct tu_framebuffer *fb = cmd->state.framebuffer;
432
433 const uint32_t a = subpass->depth_stencil_attachment.attachment;
434 if (a == VK_ATTACHMENT_UNUSED) {
435 tu_cs_emit_regs(cs,
436 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
437 A6XX_RB_DEPTH_BUFFER_PITCH(0),
438 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
439 A6XX_RB_DEPTH_BUFFER_BASE(0),
440 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
441
442 tu_cs_emit_regs(cs,
443 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
444
445 tu_cs_emit_regs(cs,
446 A6XX_GRAS_LRZ_BUFFER_BASE(0),
447 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
448 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
449
450 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
451
452 return;
453 }
454
455 const struct tu_image_view *iview = fb->attachments[a].attachment;
456 enum a6xx_depth_format fmt = tu6_pipe2depth(iview->vk_format);
457
458 tu_cs_emit_regs(cs,
459 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt),
460 A6XX_RB_DEPTH_BUFFER_PITCH(tu_image_stride(iview->image, iview->base_mip)),
461 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(iview->image->layout.layer_size),
462 A6XX_RB_DEPTH_BUFFER_BASE(tu_image_view_base_ref(iview)),
463 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(cmd->state.pass->attachments[a].gmem_offset));
464
465 tu_cs_emit_regs(cs,
466 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
467
468 tu_cs_emit_regs(cs,
469 A6XX_RB_DEPTH_FLAG_BUFFER_BASE(tu_image_view_ubwc_base_ref(iview)),
470 A6XX_RB_DEPTH_FLAG_BUFFER_PITCH(tu_image_view_ubwc_pitches(iview)));
471
472 tu_cs_emit_regs(cs,
473 A6XX_GRAS_LRZ_BUFFER_BASE(0),
474 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
475 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
476
477 tu_cs_emit_regs(cs,
478 A6XX_RB_STENCIL_INFO(0));
479
480 /* enable zs? */
481 }
482
483 static void
484 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
485 const struct tu_subpass *subpass,
486 struct tu_cs *cs)
487 {
488 const struct tu_framebuffer *fb = cmd->state.framebuffer;
489 unsigned char mrt_comp[MAX_RTS] = { 0 };
490 unsigned srgb_cntl = 0;
491
492 for (uint32_t i = 0; i < subpass->color_count; ++i) {
493 uint32_t a = subpass->color_attachments[i].attachment;
494 if (a == VK_ATTACHMENT_UNUSED)
495 continue;
496
497 const struct tu_image_view *iview = fb->attachments[a].attachment;
498 const enum a6xx_tile_mode tile_mode =
499 tu6_get_image_tile_mode(iview->image, iview->base_mip);
500
501 mrt_comp[i] = 0xf;
502
503 if (vk_format_is_srgb(iview->vk_format))
504 srgb_cntl |= (1 << i);
505
506 const struct tu_native_format format =
507 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
508
509 tu_cs_emit_regs(cs,
510 A6XX_RB_MRT_BUF_INFO(i,
511 .color_tile_mode = tile_mode,
512 .color_format = format.fmt,
513 .color_swap = format.swap),
514 A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)),
515 A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size),
516 A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)),
517 A6XX_RB_MRT_BASE_GMEM(i, cmd->state.pass->attachments[a].gmem_offset));
518
519 tu_cs_emit_regs(cs,
520 A6XX_SP_FS_MRT_REG(i,
521 .color_format = format.fmt,
522 .color_sint = vk_format_is_sint(iview->vk_format),
523 .color_uint = vk_format_is_uint(iview->vk_format)));
524
525 tu_cs_emit_regs(cs,
526 A6XX_RB_MRT_FLAG_BUFFER_ADDR(i, tu_image_view_ubwc_base_ref(iview)),
527 A6XX_RB_MRT_FLAG_BUFFER_PITCH(i, tu_image_view_ubwc_pitches(iview)));
528 }
529
530 tu_cs_emit_regs(cs,
531 A6XX_RB_SRGB_CNTL(.dword = srgb_cntl));
532
533 tu_cs_emit_regs(cs,
534 A6XX_SP_SRGB_CNTL(.dword = srgb_cntl));
535
536 tu_cs_emit_regs(cs,
537 A6XX_RB_RENDER_COMPONENTS(
538 .rt0 = mrt_comp[0],
539 .rt1 = mrt_comp[1],
540 .rt2 = mrt_comp[2],
541 .rt3 = mrt_comp[3],
542 .rt4 = mrt_comp[4],
543 .rt5 = mrt_comp[5],
544 .rt6 = mrt_comp[6],
545 .rt7 = mrt_comp[7]));
546
547 tu_cs_emit_regs(cs,
548 A6XX_SP_FS_RENDER_COMPONENTS(
549 .rt0 = mrt_comp[0],
550 .rt1 = mrt_comp[1],
551 .rt2 = mrt_comp[2],
552 .rt3 = mrt_comp[3],
553 .rt4 = mrt_comp[4],
554 .rt5 = mrt_comp[5],
555 .rt6 = mrt_comp[6],
556 .rt7 = mrt_comp[7]));
557 }
558
559 static void
560 tu6_emit_msaa(struct tu_cmd_buffer *cmd,
561 const struct tu_subpass *subpass,
562 struct tu_cs *cs)
563 {
564 const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples);
565 bool msaa_disable = samples == MSAA_ONE;
566
567 tu_cs_emit_regs(cs,
568 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
569 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
570 .msaa_disable = msaa_disable));
571
572 tu_cs_emit_regs(cs,
573 A6XX_GRAS_RAS_MSAA_CNTL(samples),
574 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
575 .msaa_disable = msaa_disable));
576
577 tu_cs_emit_regs(cs,
578 A6XX_RB_RAS_MSAA_CNTL(samples),
579 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
580 .msaa_disable = msaa_disable));
581
582 tu_cs_emit_regs(cs,
583 A6XX_RB_MSAA_CNTL(samples));
584 }
585
586 static void
587 tu6_emit_bin_size(struct tu_cs *cs,
588 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
589 {
590 tu_cs_emit_regs(cs,
591 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
592 .binh = bin_h,
593 .dword = flags));
594
595 tu_cs_emit_regs(cs,
596 A6XX_RB_BIN_CONTROL(.binw = bin_w,
597 .binh = bin_h,
598 .dword = flags));
599
600 /* no flag for RB_BIN_CONTROL2... */
601 tu_cs_emit_regs(cs,
602 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
603 .binh = bin_h));
604 }
605
606 static void
607 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
608 const struct tu_subpass *subpass,
609 struct tu_cs *cs,
610 bool binning)
611 {
612 const struct tu_framebuffer *fb = cmd->state.framebuffer;
613 uint32_t cntl = 0;
614 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
615 if (binning) {
616 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
617 } else {
618 uint32_t mrts_ubwc_enable = 0;
619 for (uint32_t i = 0; i < subpass->color_count; ++i) {
620 uint32_t a = subpass->color_attachments[i].attachment;
621 if (a == VK_ATTACHMENT_UNUSED)
622 continue;
623
624 const struct tu_image_view *iview = fb->attachments[a].attachment;
625 if (iview->image->layout.ubwc_layer_size != 0)
626 mrts_ubwc_enable |= 1 << i;
627 }
628
629 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
630
631 const uint32_t a = subpass->depth_stencil_attachment.attachment;
632 if (a != VK_ATTACHMENT_UNUSED) {
633 const struct tu_image_view *iview = fb->attachments[a].attachment;
634 if (iview->image->layout.ubwc_layer_size != 0)
635 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
636 }
637
638 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
639 * in order to set it correctly for the different subpasses. However,
640 * that means the packets we're emitting also happen during binning. So
641 * we need to guard the write on !BINNING at CP execution time.
642 */
643 tu_cs_reserve(cs, 3 + 4);
644 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
645 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
646 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
647 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
648 }
649
650 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
651 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
652 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
653 tu_cs_emit(cs, cntl);
654 }
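/* Note on the sizes used in tu6_emit_render_cntl() above: tu_cs_reserve(cs,
 * 3 + 4) keeps the 3-dword CP_COND_REG_EXEC packet and the 4-dword
 * CP_REG_WRITE packet (1 header + 3 payload dwords) in the same IB chunk,
 * and CP_COND_REG_EXEC_1_DWORDS(4) is the number of dwords the CP skips when
 * the render-mode test fails, i.e. exactly that CP_REG_WRITE packet.
 */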
655
656 static void
657 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
658 {
659 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
660 uint32_t x1 = render_area->offset.x;
661 uint32_t y1 = render_area->offset.y;
662 uint32_t x2 = x1 + render_area->extent.width - 1;
663 uint32_t y2 = y1 + render_area->extent.height - 1;
664
665 /* TODO: alignment requirement seems to be less than tile_align_w/h */
666 if (align) {
667       x1 = x1 & ~(cmd->device->physical_device->tile_align_w - 1);
668       y1 = y1 & ~(cmd->device->physical_device->tile_align_h - 1);
669 x2 = ALIGN_POT(x2 + 1, cmd->device->physical_device->tile_align_w) - 1;
670 y2 = ALIGN_POT(y2 + 1, cmd->device->physical_device->tile_align_h) - 1;
671 }
672
673 tu_cs_emit_regs(cs,
674 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
675 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
676 }
677
678 static void
679 tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
680 struct tu_cs *cs,
681 const struct tu_image_view *iview,
682 uint32_t gmem_offset,
683 bool resolve)
684 {
685 tu_cs_emit_regs(cs,
686 A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve));
687
688 const struct tu_native_format format =
689 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
690
691 enum a6xx_tile_mode tile_mode =
692 tu6_get_image_tile_mode(iview->image, iview->base_mip);
693 tu_cs_emit_regs(cs,
694 A6XX_RB_BLIT_DST_INFO(
695 .tile_mode = tile_mode,
696 .samples = tu_msaa_samples(iview->image->samples),
697 .color_format = format.fmt,
698 .color_swap = format.swap,
699 .flags = iview->image->layout.ubwc_layer_size != 0),
700 A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
701 A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
702 A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size));
703
704 if (iview->image->layout.ubwc_layer_size) {
705 tu_cs_emit_regs(cs,
706 A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)),
707 A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview)));
708 }
709
710 tu_cs_emit_regs(cs,
711 A6XX_RB_BLIT_BASE_GMEM(gmem_offset));
712 }
713
714 static void
715 tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
716 {
717 tu6_emit_event_write(cmd, cs, BLIT, false);
718 }
719
720 static void
721 tu6_emit_window_scissor(struct tu_cmd_buffer *cmd,
722 struct tu_cs *cs,
723 uint32_t x1,
724 uint32_t y1,
725 uint32_t x2,
726 uint32_t y2)
727 {
728 tu_cs_emit_regs(cs,
729 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
730 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
731
732 tu_cs_emit_regs(cs,
733 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
734 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
735 }
736
737 static void
738 tu6_emit_window_offset(struct tu_cmd_buffer *cmd,
739 struct tu_cs *cs,
740 uint32_t x1,
741 uint32_t y1)
742 {
743 tu_cs_emit_regs(cs,
744 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
745
746 tu_cs_emit_regs(cs,
747 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
748
749 tu_cs_emit_regs(cs,
750 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
751
752 tu_cs_emit_regs(cs,
753 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
754 }
755
756 static bool
757 use_hw_binning(struct tu_cmd_buffer *cmd)
758 {
759 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
760
761 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
762 return false;
763
764 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
765 return true;
766
767 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
768 }
769
770 static bool
771 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
772 {
773 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
774 return true;
775
776 /* can't fit attachments into gmem */
777 if (!cmd->state.pass->gmem_pixels)
778 return true;
779
780 return cmd->state.tiling_config.force_sysmem;
781 }
782
783 static void
784 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
785 struct tu_cs *cs,
786 const struct tu_tile *tile)
787 {
788 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
789 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
790
791 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
792 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
793
794 const uint32_t x1 = tile->begin.x;
795 const uint32_t y1 = tile->begin.y;
796 const uint32_t x2 = tile->end.x - 1;
797 const uint32_t y2 = tile->end.y - 1;
798 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
799 tu6_emit_window_offset(cmd, cs, x1, y1);
800
801 tu_cs_emit_regs(cs,
802 A6XX_VPC_SO_OVERRIDE(.so_disable = true));
803
804 if (use_hw_binning(cmd)) {
805 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
806
807 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
808 tu_cs_emit(cs, 0x0);
809
810 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
811 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
812 A6XX_CP_REG_TEST_0_BIT(0) |
813 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
814
815 tu_cs_reserve(cs, 3 + 11);
816 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
817 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
818 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
819
820 /* if (no overflow) */ {
821 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
822 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
823 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
824 tu_cs_emit_qw(cs, cmd->vsc_data.iova + tile->pipe * cmd->vsc_data_pitch);
825 tu_cs_emit_qw(cs, cmd->vsc_data.iova + (tile->pipe * 4) + (32 * cmd->vsc_data_pitch));
826 tu_cs_emit_qw(cs, cmd->vsc_data2.iova + (tile->pipe * cmd->vsc_data2_pitch));
827
828 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
829 tu_cs_emit(cs, 0x0);
830
831 /* use a NOP packet to skip over the 'else' side: */
832 tu_cs_emit_pkt7(cs, CP_NOP, 2);
833 } /* else */ {
834 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
835 tu_cs_emit(cs, 0x1);
836 }
837
838 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
839 tu_cs_emit(cs, 0x0);
840
841 tu_cs_emit_regs(cs,
842 A6XX_RB_UNKNOWN_8804(0));
843
844 tu_cs_emit_regs(cs,
845 A6XX_SP_TP_UNKNOWN_B304(0));
846
847 tu_cs_emit_regs(cs,
848 A6XX_GRAS_UNKNOWN_80A4(0));
849 } else {
850 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
851 tu_cs_emit(cs, 0x1);
852
853 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
854 tu_cs_emit(cs, 0x0);
855 }
856 }
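/* The predicate pattern used above, spelled out: CP_REG_TEST latches
 * "OVERFLOW_FLAG_REG bit 0 is set" (meaning no VSC overflow, see
 * emit_vsc_overflow_test()) into the CP predicate, and
 * CP_COND_REG_EXEC(PRED_TEST) executes the next 11 dwords only when the
 * predicate passes.  Those 11 dwords are CP_SET_BIN_DATA5 (8),
 * CP_SET_VISIBILITY_OVERRIDE (2) and a CP_NOP header (1) whose 2-dword
 * payload is the 'else' branch, so on the 'if' path the else packets are
 * swallowed as NOP data, and on the skip path the CP lands directly on them.
 */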
857
858 static void
859 tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
860 {
861 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
862 const struct tu_framebuffer *fb = cmd->state.framebuffer;
863 const struct tu_image_view *iview = fb->attachments[a].attachment;
864 const struct tu_render_pass_attachment *attachment =
865 &cmd->state.pass->attachments[a];
866
867 if (attachment->gmem_offset < 0)
868 return;
869
870 const uint32_t x1 = tiling->render_area.offset.x;
871 const uint32_t y1 = tiling->render_area.offset.y;
872 const uint32_t x2 = x1 + tiling->render_area.extent.width;
873 const uint32_t y2 = y1 + tiling->render_area.extent.height;
874 const uint32_t tile_x2 =
875 tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width;
876 const uint32_t tile_y2 =
877 tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height;
878 bool need_load =
879 x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) ||
880 y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2);
881
882 if (need_load)
883 tu_finishme("improve handling of unaligned render area");
884
885 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
886 need_load = true;
887
888 if (vk_format_has_stencil(iview->vk_format) &&
889 attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
890 need_load = true;
891
892 if (need_load) {
893 tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false);
894 tu6_emit_blit(cmd, cs);
895 }
896 }
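/* Example of when the unaligned-render-area load above triggers, with
 * hypothetical numbers: on a 128x128 framebuffer split into 64x64 tiles, a
 * render area at offset (8, 8) with extent 100x100 gives x1 = 8 !=
 * tile0.offset.x = 0, so every GMEM attachment is loaded even with
 * LOAD_OP_DONT_CARE, because the tile pixels outside the render area must be
 * written back unchanged by the tile store.
 */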
897
898 static void
899 tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
900 uint32_t a,
901 const VkRenderPassBeginInfo *info)
902 {
903 const struct tu_framebuffer *fb = cmd->state.framebuffer;
904 const struct tu_image_view *iview = fb->attachments[a].attachment;
905 const struct tu_render_pass_attachment *attachment =
906 &cmd->state.pass->attachments[a];
907 unsigned clear_mask = 0;
908
909 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
910 if (attachment->gmem_offset < 0)
911 return;
912
913 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
914 clear_mask = 0xf;
915
916 if (vk_format_has_stencil(iview->vk_format)) {
917 clear_mask &= 0x1;
918 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
919 clear_mask |= 0x2;
920 }
921 if (!clear_mask)
922 return;
923
924 tu_clear_gmem_attachment(cmd, cs, a, clear_mask,
925 &info->pClearValues[a]);
926 }
927
928 static void
929 tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd,
930 struct tu_cs *cs,
931 uint32_t a,
932 uint32_t gmem_a,
933 bool resolve)
934 {
935 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
936
937 tu6_emit_blit_info(cmd, cs,
938 cmd->state.framebuffer->attachments[a].attachment,
939 cmd->state.pass->attachments[gmem_a].gmem_offset, resolve);
940 tu6_emit_blit(cmd, cs);
941
942 tu_cond_exec_end(cs);
943 }
944
945 static void
946 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
947 struct tu_cs *cs,
948 uint32_t a,
949 uint32_t gmem_a)
950 {
951 const struct tu_framebuffer *fb = cmd->state.framebuffer;
952 const struct tu_image_view *dst = fb->attachments[a].attachment;
953 const struct tu_image_view *src = fb->attachments[gmem_a].attachment;
954
955 tu_blit(cmd, cs, &(struct tu_blit) {
956 .dst = sysmem_attachment_surf(dst, dst->base_layer,
957 &cmd->state.tiling_config.render_area),
958 .src = sysmem_attachment_surf(src, src->base_layer,
959 &cmd->state.tiling_config.render_area),
960 .layers = fb->layers,
961 });
962 }
963
964
965 /* Emit a MSAA resolve operation, with both gmem and sysmem paths. */
966 static void tu6_emit_resolve(struct tu_cmd_buffer *cmd,
967 struct tu_cs *cs,
968 uint32_t a,
969 uint32_t gmem_a)
970 {
971 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
972 return;
973
974 tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true);
975
976 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
977 tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a);
978 tu_cond_exec_end(cs);
979 }
980
981 static void
982 tu6_emit_store_attachment(struct tu_cmd_buffer *cmd,
983 struct tu_cs *cs,
984 uint32_t a,
985 uint32_t gmem_a)
986 {
987 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
988 return;
989
990 tu6_emit_blit_info(cmd, cs,
991 cmd->state.framebuffer->attachments[a].attachment,
992 cmd->state.pass->attachments[gmem_a].gmem_offset, true);
993 tu6_emit_blit(cmd, cs);
994 }
995
996 static void
997 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
998 {
999 const struct tu_render_pass *pass = cmd->state.pass;
1000 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
1001
1002 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1003 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1004 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1005 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1006 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1007 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1008
1009 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1010 tu_cs_emit(cs, 0x0);
1011
1012 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1013 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
1014
1015 tu6_emit_blit_scissor(cmd, cs, true);
1016
1017 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
1018 if (pass->attachments[a].gmem_offset >= 0)
1019 tu6_emit_store_attachment(cmd, cs, a, a);
1020 }
1021
1022 if (subpass->resolve_attachments) {
1023 for (unsigned i = 0; i < subpass->color_count; i++) {
1024 uint32_t a = subpass->resolve_attachments[i].attachment;
1025 if (a != VK_ATTACHMENT_UNUSED)
1026 tu6_emit_store_attachment(cmd, cs, a,
1027 subpass->color_attachments[i].attachment);
1028 }
1029 }
1030 }
1031
1032 static void
1033 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
1034 {
1035 tu_cs_emit_regs(cs,
1036 A6XX_PC_RESTART_INDEX(restart_index));
1037 }
1038
1039 static void
1040 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1041 {
1042 tu6_emit_cache_flush(cmd, cs);
1043
1044 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
1045
1046 tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x10000000);
1047 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
1048 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
1049 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
1050 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
1051 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
1052 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
1053 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1054 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1055
1056 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
1057 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
1058 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
1059 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
1060 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
1061 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
1062 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
1063 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1064 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
1065 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
1066 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
1067 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
1068 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
1069 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
1070
1071 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1072 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1073 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
1074
1075 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
1076
1077 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
1078
1079 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
1080 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
1081 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
1082 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
1083 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
1084 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
1085 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
1086 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
1087 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
1088 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
1089 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
1090
1091 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
1092 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
1093
1094 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1);
1095 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
1096
1097 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
1098 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
1099
1100 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
1101 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
1102 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
1103 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
1104
1105 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
1106 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
1107
1108 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
1109
1110 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
1111
1112 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
1113 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
1114 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
1115 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1116 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
1117 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
1118 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
1119 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
1120 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
1121 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
1122 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0);
1123 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
1124 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0);
1125 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0);
1126 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0);
1127 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0);
1128 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0);
1129 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0);
1130 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
1131 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
1132 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1133
1134 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
1135
1136 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
1137
1138 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
1139
1140 /* we don't use this yet.. probably best to disable.. */
1141 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1142 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1143 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1144 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1145 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1146 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1147
1148 tu_cs_emit_regs(cs,
1149 A6XX_VPC_SO_BUFFER_BASE(0),
1150 A6XX_VPC_SO_BUFFER_SIZE(0));
1151
1152 tu_cs_emit_regs(cs,
1153 A6XX_VPC_SO_FLUSH_BASE(0));
1154
1155 tu_cs_emit_regs(cs,
1156 A6XX_VPC_SO_BUF_CNTL(0));
1157
1158 tu_cs_emit_regs(cs,
1159 A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
1160
1161 tu_cs_emit_regs(cs,
1162 A6XX_VPC_SO_BUFFER_BASE(1, 0),
1163 A6XX_VPC_SO_BUFFER_SIZE(1, 0));
1164
1165 tu_cs_emit_regs(cs,
1166 A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
1167 A6XX_VPC_SO_FLUSH_BASE(1, 0),
1168 A6XX_VPC_SO_BUFFER_BASE(2, 0),
1169 A6XX_VPC_SO_BUFFER_SIZE(2, 0));
1170
1171 tu_cs_emit_regs(cs,
1172 A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
1173 A6XX_VPC_SO_FLUSH_BASE(2, 0),
1174 A6XX_VPC_SO_BUFFER_BASE(3, 0),
1175 A6XX_VPC_SO_BUFFER_SIZE(3, 0));
1176
1177 tu_cs_emit_regs(cs,
1178 A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
1179 A6XX_VPC_SO_FLUSH_BASE(3, 0));
1180
1181 tu_cs_emit_regs(cs,
1182 A6XX_SP_HS_CTRL_REG0(0));
1183
1184 tu_cs_emit_regs(cs,
1185 A6XX_SP_GS_CTRL_REG0(0));
1186
1187 tu_cs_emit_regs(cs,
1188 A6XX_GRAS_LRZ_CNTL(0));
1189
1190 tu_cs_emit_regs(cs,
1191 A6XX_RB_LRZ_CNTL(0));
1192
1193 tu_cs_sanity_check(cs);
1194 }
1195
1196 static void
1197 tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1198 {
1199 unsigned seqno;
1200
1201 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_AND_INV_EVENT, true);
1202
1203 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1204 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1205 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1206 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1207 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(seqno));
1208 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1209 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1210
1211 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1212
1213 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4);
1214 tu_cs_emit(cs, CP_WAIT_MEM_GTE_0_RESERVED(0));
1215 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1216 tu_cs_emit(cs, CP_WAIT_MEM_GTE_3_REF(seqno));
1217 }
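/* The sequence above turns the otherwise pipelined flush events into a
 * synchronous wait: tu6_emit_event_write() with need_seqno = true makes each
 * event write an incrementing seqno into the scratch bo when it retires, and
 * CP_WAIT_REG_MEM / CP_WAIT_MEM_GTE then stall the CP until that seqno shows
 * up in memory, so the caches are known to be flushed before the following
 * packets execute.
 */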
1218
1219 static void
1220 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1221 {
1222 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1223
1224 tu_cs_emit_regs(cs,
1225 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1226 .height = tiling->tile0.extent.height),
1227 A6XX_VSC_SIZE_ADDRESS(.bo = &cmd->vsc_data,
1228 .bo_offset = 32 * cmd->vsc_data_pitch));
1229
1230 tu_cs_emit_regs(cs,
1231 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1232 .ny = tiling->tile_count.height));
1233
1234 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1235 for (unsigned i = 0; i < 32; i++)
1236 tu_cs_emit(cs, tiling->pipe_config[i]);
1237
1238 tu_cs_emit_regs(cs,
1239 A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = &cmd->vsc_data2),
1240 A6XX_VSC_PIPE_DATA2_PITCH(cmd->vsc_data2_pitch),
1241 A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(cmd->vsc_data2.size));
1242
1243 tu_cs_emit_regs(cs,
1244 A6XX_VSC_PIPE_DATA_ADDRESS(.bo = &cmd->vsc_data),
1245 A6XX_VSC_PIPE_DATA_PITCH(cmd->vsc_data_pitch),
1246 A6XX_VSC_PIPE_DATA_ARRAY_PITCH(cmd->vsc_data.size));
1247 }
1248
1249 static void
1250 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1251 {
1252 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1253 const uint32_t used_pipe_count =
1254 tiling->pipe_count.width * tiling->pipe_count.height;
1255
1256 /* Clear vsc_scratch: */
1257 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1258 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1259 tu_cs_emit(cs, 0x0);
1260
1261 /* Check for overflow, write vsc_scratch if detected: */
1262 for (int i = 0; i < used_pipe_count; i++) {
1263 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1264 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1265 CP_COND_WRITE5_0_WRITE_MEMORY);
1266 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i)));
1267 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1268 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data_pitch));
1269 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1270 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1271 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_data_pitch));
1272
1273 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1274 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1275 CP_COND_WRITE5_0_WRITE_MEMORY);
1276 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i)));
1277 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1278 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data2_pitch));
1279 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1280 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1281 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_data2_pitch));
1282 }
1283
1284 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1285
1286 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1287
1288 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1289 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1290 CP_MEM_TO_REG_0_CNT(1 - 1));
1291 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1292
1293 /*
1294     * This is a bit awkward: we really want a way to invert the
1295     * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1296     * execute cmds to use hwbinning when a bit is *not* set. This
1297     * dance is what inverts OVERFLOW_FLAG_REG.
1298     *
1299     * A CP_NOP packet is used to skip executing the 'else' clause
1300     * when b0 is set.
1301 */
1302
1303 /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */
1304 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1305 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1306 A6XX_CP_REG_TEST_0_BIT(0) |
1307 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1308
1309 tu_cs_reserve(cs, 3 + 7);
1310 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1311 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1312 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1313
1314 /* if (b0 set) */ {
1315 /*
1316        * On overflow, mirror the value to VSC_OVERFLOW in the scratch bo,
1317        * which the CPU checks to detect overflow (see
1318        * check_vsc_overflow())
1319 */
1320 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1321 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1322 CP_REG_TO_MEM_0_CNT(0));
1323 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW);
1324
1325 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1326 tu_cs_emit(cs, 0x0);
1327
1328 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1329 } /* else */ {
1330 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1331 tu_cs_emit(cs, 0x1);
1332 }
1333 }
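/* How the values written above are meant to be consumed (this mirrors the
 * scheme used by the gallium freedreno driver): the CP_COND_WRITE5 packets
 * fire when a pipe's VSC_SIZE_REG/VSC_SIZE2_REG reaches the corresponding
 * buffer pitch, and the value they write encodes which stream overflowed in
 * the low bits (1 = vsc_data, 3 = vsc_data2) plus the pitch that was too
 * small, so the CPU reading VSC_OVERFLOW can tell which buffer to grow and
 * by how much before the next submit.
 */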
1334
1335 static void
1336 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1337 {
1338 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1339 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1340
1341 uint32_t x1 = tiling->tile0.offset.x;
1342 uint32_t y1 = tiling->tile0.offset.y;
1343 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1344 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1345
1346 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
1347
1348 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1349 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1350
1351 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1352 tu_cs_emit(cs, 0x1);
1353
1354 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1355 tu_cs_emit(cs, 0x1);
1356
1357 tu_cs_emit_wfi(cs);
1358
1359 tu_cs_emit_regs(cs,
1360 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1361
1362 update_vsc_pipe(cmd, cs);
1363
1364 tu_cs_emit_regs(cs,
1365 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1366
1367 tu_cs_emit_regs(cs,
1368 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1369
1370 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1371 tu_cs_emit(cs, UNK_2C);
1372
1373 tu_cs_emit_regs(cs,
1374 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1375
1376 tu_cs_emit_regs(cs,
1377 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1378
1379 /* emit IB to binning drawcmds: */
1380 tu_cs_emit_call(cs, &cmd->draw_cs);
1381
1382 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1383 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1384 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1385 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1386 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1387 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1388
1389 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1390 tu_cs_emit(cs, UNK_2D);
1391
1392 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1393 tu6_cache_flush(cmd, cs);
1394
1395 tu_cs_emit_wfi(cs);
1396
1397 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1398
1399 emit_vsc_overflow_test(cmd, cs);
1400
1401 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1402 tu_cs_emit(cs, 0x0);
1403
1404 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1405 tu_cs_emit(cs, 0x0);
1406
1407 tu_cs_emit_wfi(cs);
1408
1409 tu_cs_emit_regs(cs,
1410 A6XX_RB_CCU_CNTL(.unknown = phys_dev->magic.RB_CCU_CNTL_gmem));
1411
1412 cmd->wait_for_idle = false;
1413 }
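/* Summary of the flow above: the binning pass replays the same draw_cs that
 * each tile will execute later, but with VFD_MODE_CNTL.BINNING_PASS set and
 * the RM6_BINNING marker active, so the draws only feed the visibility
 * stream (VSC) buffers configured by update_vsc_pipe().  The cache flush,
 * WFI and emit_vsc_overflow_test() afterwards make sure the VSC results and
 * the overflow flag are in place before the per-tile passes consume them via
 * CP_SET_BIN_DATA5 in tu6_emit_tile_select().
 */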
1414
1415 static void
1416 tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1417 uint32_t a,
1418 const VkRenderPassBeginInfo *info)
1419 {
1420 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1421 const struct tu_image_view *iview = fb->attachments[a].attachment;
1422 const struct tu_render_pass_attachment *attachment =
1423 &cmd->state.pass->attachments[a];
1424 unsigned clear_mask = 0;
1425
1426 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
1427 if (attachment->gmem_offset < 0)
1428 return;
1429
1430 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1431 clear_mask = 0xf;
1432 }
1433
1434 if (vk_format_has_stencil(iview->vk_format)) {
1435 clear_mask &= 0x1;
1436 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
1437 clear_mask |= 0x2;
1438 if (clear_mask != 0x3)
1439 tu_finishme("depth/stencil only load op");
1440 }
1441
1442 if (!clear_mask)
1443 return;
1444
1445 tu_clear_sysmem_attachment(cmd, cs, a,
1446 &info->pClearValues[a], &(struct VkClearRect) {
1447 .rect = info->renderArea,
1448 .baseArrayLayer = iview->base_layer,
1449 .layerCount = iview->layer_count,
1450 });
1451 }
1452
1453 static void
1454 tu_emit_load_clear(struct tu_cmd_buffer *cmd,
1455 const VkRenderPassBeginInfo *info)
1456 {
1457 struct tu_cs *cs = &cmd->draw_cs;
1458
1459 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1460
1461 tu6_emit_blit_scissor(cmd, cs, true);
1462
1463 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1464 tu6_emit_load_attachment(cmd, cs, i);
1465
1466 tu6_emit_blit_scissor(cmd, cs, false);
1467
1468 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1469 tu6_emit_clear_attachment(cmd, cs, i, info);
1470
1471 tu_cond_exec_end(cs);
1472
1473 /* invalidate because reading input attachments will cache GMEM and
1474     * the cache isn't updated when GMEM is written
1475 * TODO: is there a no-cache bit for textures?
1476 */
1477 if (cmd->state.subpass->input_count)
1478 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1479
1480 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1481
1482 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1483 tu_emit_sysmem_clear_attachment(cmd, cs, i, info);
1484
1485 tu_cond_exec_end(cs);
1486 }
1487
1488 static void
1489 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1490 const struct VkRect2D *renderArea)
1491 {
1492 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1493
1494 assert(fb->width > 0 && fb->height > 0);
1495 tu6_emit_window_scissor(cmd, cs, 0, 0, fb->width - 1, fb->height - 1);
1496 tu6_emit_window_offset(cmd, cs, 0, 0);
1497
1498 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1499
1500 tu6_emit_lrz_flush(cmd, cs);
1501
1502 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1503 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1504
1505 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1506 tu_cs_emit(cs, 0x0);
1507
1508 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1509 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1510 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1511
1512 tu6_emit_wfi(cmd, cs);
1513 tu_cs_emit_regs(cs,
1514 A6XX_RB_CCU_CNTL(0x10000000));
1515
1516    /* enable stream-out; with sysmem there is only one pass: */
1517 tu_cs_emit_regs(cs,
1518 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1519
1520 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1521 tu_cs_emit(cs, 0x1);
1522
1523 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1524 tu_cs_emit(cs, 0x0);
1525
1526 tu_cs_sanity_check(cs);
1527 }
1528
1529 static void
1530 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1531 {
1532 /* Do any resolves of the last subpass. These are handled in the
1533 * tile_store_ib in the gmem path.
1534 */
1535
1536 const struct tu_subpass *subpass = cmd->state.subpass;
1537 if (subpass->resolve_attachments) {
1538 for (unsigned i = 0; i < subpass->color_count; i++) {
1539 uint32_t a = subpass->resolve_attachments[i].attachment;
1540 if (a != VK_ATTACHMENT_UNUSED)
1541 tu6_emit_sysmem_resolve(cmd, cs, a,
1542 subpass->color_attachments[i].attachment);
1543 }
1544 }
1545
1546 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1547
1548 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1549 tu_cs_emit(cs, 0x0);
1550
1551 tu6_emit_lrz_flush(cmd, cs);
1552
1553 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1554 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1555
1556 tu_cs_sanity_check(cs);
1557 }
1558
1559
1560 static void
1561 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1562 {
1563 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1564
1565 tu6_emit_lrz_flush(cmd, cs);
1566
1567 /* lrz clear? */
1568
1569 tu6_emit_cache_flush(cmd, cs);
1570
1571 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1572 tu_cs_emit(cs, 0x0);
1573
1574 /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
1575 tu6_emit_wfi(cmd, cs);
1576 tu_cs_emit_regs(cs,
1577 A6XX_RB_CCU_CNTL(phys_dev->magic.RB_CCU_CNTL_gmem));
1578
1579 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1580 if (use_hw_binning(cmd)) {
1581 tu6_emit_bin_size(cs,
1582 tiling->tile0.extent.width,
1583 tiling->tile0.extent.height,
1584 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1585
1586 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1587
1588 tu6_emit_binning_pass(cmd, cs);
1589
1590 tu6_emit_bin_size(cs,
1591 tiling->tile0.extent.width,
1592 tiling->tile0.extent.height,
1593 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1594
1595 tu_cs_emit_regs(cs,
1596 A6XX_VFD_MODE_CNTL(0));
1597
1598 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1599
1600 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1601
1602 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1603 tu_cs_emit(cs, 0x1);
1604 } else {
1605 tu6_emit_bin_size(cs,
1606 tiling->tile0.extent.width,
1607 tiling->tile0.extent.height,
1608 0x6000000);
1609 }
1610
1611 tu_cs_sanity_check(cs);
1612 }
1613
1614 static void
1615 tu6_render_tile(struct tu_cmd_buffer *cmd,
1616 struct tu_cs *cs,
1617 const struct tu_tile *tile)
1618 {
1619 tu6_emit_tile_select(cmd, cs, tile);
1620
1621 tu_cs_emit_call(cs, &cmd->draw_cs);
1622 cmd->wait_for_idle = true;
1623
1624 if (use_hw_binning(cmd)) {
1625 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1626 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1627 A6XX_CP_REG_TEST_0_BIT(0) |
1628 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1629
1630 tu_cs_reserve(cs, 3 + 2);
1631 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1632 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1633 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1634
1635 /* if (no overflow) */ {
1636 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1637 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1638 }
1639 }
1640
1641 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1642
1643 tu_cs_sanity_check(cs);
1644 }
1645
1646 static void
1647 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1648 {
1649 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1650
1651 tu_cs_emit_regs(cs,
1652 A6XX_GRAS_LRZ_CNTL(0));
1653
1654 tu6_emit_lrz_flush(cmd, cs);
1655
1656 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1657
1658 tu_cs_sanity_check(cs);
1659 }
1660
1661 static void
1662 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1663 {
1664 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1665
1666 tu6_tile_render_begin(cmd, &cmd->cs);
1667
1668 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1669 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1670 struct tu_tile tile;
1671 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1672 tu6_render_tile(cmd, &cmd->cs, &tile);
1673 }
1674 }
1675
1676 tu6_tile_render_end(cmd, &cmd->cs);
1677 }
1678
1679 static void
1680 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1681 {
1682 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1683
1684 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1685
1686 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1687 cmd->wait_for_idle = true;
1688
1689 tu6_sysmem_render_end(cmd, &cmd->cs);
1690 }
1691
1692 static void
1693 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1694 {
1695 const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count;
1696 struct tu_cs sub_cs;
1697
1698 VkResult result =
1699 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1700 if (result != VK_SUCCESS) {
1701 cmd->record_result = result;
1702 return;
1703 }
1704
1705 /* emit to tile-store sub_cs */
1706 tu6_emit_tile_store(cmd, &sub_cs);
1707
1708 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1709 }
1710
1711 static void
1712 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1713 const VkRect2D *render_area)
1714 {
1715 const struct tu_device *dev = cmd->device;
1716 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1717
1718 tiling->render_area = *render_area;
1719 tiling->force_sysmem = force_sysmem(cmd, render_area);
1720
1721 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels);
1722 tu_tiling_config_update_pipe_layout(tiling, dev);
1723 tu_tiling_config_update_pipes(tiling, dev);
1724 }
1725
1726 const struct tu_dynamic_state default_dynamic_state = {
1727 .viewport =
1728 {
1729 .count = 0,
1730 },
1731 .scissor =
1732 {
1733 .count = 0,
1734 },
1735 .line_width = 1.0f,
1736 .depth_bias =
1737 {
1738 .bias = 0.0f,
1739 .clamp = 0.0f,
1740 .slope = 0.0f,
1741 },
1742 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1743 .depth_bounds =
1744 {
1745 .min = 0.0f,
1746 .max = 1.0f,
1747 },
1748 .stencil_compare_mask =
1749 {
1750 .front = ~0u,
1751 .back = ~0u,
1752 },
1753 .stencil_write_mask =
1754 {
1755 .front = ~0u,
1756 .back = ~0u,
1757 },
1758 .stencil_reference =
1759 {
1760 .front = 0u,
1761 .back = 0u,
1762 },
1763 };
1764
1765 static void UNUSED /* FINISHME */
1766 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1767 const struct tu_dynamic_state *src)
1768 {
1769 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1770 uint32_t copy_mask = src->mask;
1771 uint32_t dest_mask = 0;
1772
1773 tu_use_args(cmd_buffer); /* FINISHME */
1774
1775 /* Make sure to copy the number of viewports/scissors because they can
1776 * only be specified at pipeline creation time.
1777 */
1778 dest->viewport.count = src->viewport.count;
1779 dest->scissor.count = src->scissor.count;
1780 dest->discard_rectangle.count = src->discard_rectangle.count;
1781
1782 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1783 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1784 src->viewport.count * sizeof(VkViewport))) {
1785 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1786 src->viewport.count);
1787 dest_mask |= TU_DYNAMIC_VIEWPORT;
1788 }
1789 }
1790
1791 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1792 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1793 src->scissor.count * sizeof(VkRect2D))) {
1794 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1795 src->scissor.count);
1796 dest_mask |= TU_DYNAMIC_SCISSOR;
1797 }
1798 }
1799
1800 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1801 if (dest->line_width != src->line_width) {
1802 dest->line_width = src->line_width;
1803 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1804 }
1805 }
1806
1807 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1808 if (memcmp(&dest->depth_bias, &src->depth_bias,
1809 sizeof(src->depth_bias))) {
1810 dest->depth_bias = src->depth_bias;
1811 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1812 }
1813 }
1814
1815 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1816 if (memcmp(&dest->blend_constants, &src->blend_constants,
1817 sizeof(src->blend_constants))) {
1818 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1819 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1820 }
1821 }
1822
1823 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1824 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1825 sizeof(src->depth_bounds))) {
1826 dest->depth_bounds = src->depth_bounds;
1827 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1828 }
1829 }
1830
1831 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1832 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1833 sizeof(src->stencil_compare_mask))) {
1834 dest->stencil_compare_mask = src->stencil_compare_mask;
1835 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1836 }
1837 }
1838
1839 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1840 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1841 sizeof(src->stencil_write_mask))) {
1842 dest->stencil_write_mask = src->stencil_write_mask;
1843 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1844 }
1845 }
1846
1847 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1848 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1849 sizeof(src->stencil_reference))) {
1850 dest->stencil_reference = src->stencil_reference;
1851 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1852 }
1853 }
1854
1855 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1856 if (memcmp(&dest->discard_rectangle.rectangles,
1857 &src->discard_rectangle.rectangles,
1858 src->discard_rectangle.count * sizeof(VkRect2D))) {
1859 typed_memcpy(dest->discard_rectangle.rectangles,
1860 src->discard_rectangle.rectangles,
1861 src->discard_rectangle.count);
1862 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1863 }
1864 }
1865 }
1866
1867 static VkResult
1868 tu_create_cmd_buffer(struct tu_device *device,
1869 struct tu_cmd_pool *pool,
1870 VkCommandBufferLevel level,
1871 VkCommandBuffer *pCommandBuffer)
1872 {
1873 struct tu_cmd_buffer *cmd_buffer;
1874 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1875 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1876 if (cmd_buffer == NULL)
1877 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1878
1879 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1880 cmd_buffer->device = device;
1881 cmd_buffer->pool = pool;
1882 cmd_buffer->level = level;
1883
1884 if (pool) {
1885 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1886 cmd_buffer->queue_family_index = pool->queue_family_index;
1887
1888 } else {
1889 /* Init the pool_link so we can safely call list_del when we destroy
1890 * the command buffer
1891 */
1892 list_inithead(&cmd_buffer->pool_link);
1893 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1894 }
1895
1896 tu_bo_list_init(&cmd_buffer->bo_list);
1897 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1898 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1899 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1900 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1901
1902 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1903
1904 list_inithead(&cmd_buffer->upload.list);
1905
1906 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1907 if (result != VK_SUCCESS)
1908 goto fail_scratch_bo;
1909
1910 /* TODO: resize on overflow */
1911 cmd_buffer->vsc_data_pitch = device->vsc_data_pitch;
1912 cmd_buffer->vsc_data2_pitch = device->vsc_data2_pitch;
1913 cmd_buffer->vsc_data = device->vsc_data;
1914 cmd_buffer->vsc_data2 = device->vsc_data2;
1915
1916 return VK_SUCCESS;
1917
1918 fail_scratch_bo:
1919 list_del(&cmd_buffer->pool_link);
1920 return result;
1921 }
1922
1923 static void
1924 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1925 {
1926 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1927
1928 list_del(&cmd_buffer->pool_link);
1929
1930 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
1931 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
1932
1933 tu_cs_finish(&cmd_buffer->cs);
1934 tu_cs_finish(&cmd_buffer->draw_cs);
1935 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1936 tu_cs_finish(&cmd_buffer->sub_cs);
1937
1938 tu_bo_list_destroy(&cmd_buffer->bo_list);
1939 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1940 }
1941
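/* Return a command buffer to the INITIAL state. The backing command streams,
 * BO list and descriptor state are reset so they can be reused by the next
 * recording.
 */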
1942 static VkResult
1943 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1944 {
1945 cmd_buffer->wait_for_idle = true;
1946
1947 cmd_buffer->record_result = VK_SUCCESS;
1948
1949 tu_bo_list_reset(&cmd_buffer->bo_list);
1950 tu_cs_reset(&cmd_buffer->cs);
1951 tu_cs_reset(&cmd_buffer->draw_cs);
1952 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1953 tu_cs_reset(&cmd_buffer->sub_cs);
1954
1955 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
1956 cmd_buffer->descriptors[i].valid = 0;
1957 cmd_buffer->descriptors[i].push_dirty = false;
1958 }
1959
1960 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1961
1962 return cmd_buffer->record_result;
1963 }
1964
1965 VkResult
1966 tu_AllocateCommandBuffers(VkDevice _device,
1967 const VkCommandBufferAllocateInfo *pAllocateInfo,
1968 VkCommandBuffer *pCommandBuffers)
1969 {
1970 TU_FROM_HANDLE(tu_device, device, _device);
1971 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1972
1973 VkResult result = VK_SUCCESS;
1974 uint32_t i;
1975
1976 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1977
1978 if (!list_is_empty(&pool->free_cmd_buffers)) {
1979 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1980 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1981
1982 list_del(&cmd_buffer->pool_link);
1983 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1984
1985 result = tu_reset_cmd_buffer(cmd_buffer);
1986 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1987 cmd_buffer->level = pAllocateInfo->level;
1988
1989 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1990 } else {
1991 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1992 &pCommandBuffers[i]);
1993 }
1994 if (result != VK_SUCCESS)
1995 break;
1996 }
1997
1998 if (result != VK_SUCCESS) {
1999 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
2000 pCommandBuffers);
2001
2002 /* From the Vulkan 1.0.66 spec:
2003 *
2004 * "vkAllocateCommandBuffers can be used to create multiple
2005 * command buffers. If the creation of any of those command
2006 * buffers fails, the implementation must destroy all
2007 * successfully created command buffer objects from this
2008 * command, set all entries of the pCommandBuffers array to
2009 * NULL and return the error."
2010 */
2011 memset(pCommandBuffers, 0,
2012 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
2013 }
2014
2015 return result;
2016 }
2017
2018 void
2019 tu_FreeCommandBuffers(VkDevice device,
2020 VkCommandPool commandPool,
2021 uint32_t commandBufferCount,
2022 const VkCommandBuffer *pCommandBuffers)
2023 {
2024 for (uint32_t i = 0; i < commandBufferCount; i++) {
2025 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
2026
2027 if (cmd_buffer) {
2028 if (cmd_buffer->pool) {
2029 list_del(&cmd_buffer->pool_link);
2030 list_addtail(&cmd_buffer->pool_link,
2031 &cmd_buffer->pool->free_cmd_buffers);
2032 } else
2033 tu_cmd_buffer_destroy(cmd_buffer);
2034 }
2035 }
2036 }
2037
2038 VkResult
2039 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
2040 VkCommandBufferResetFlags flags)
2041 {
2042 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2043 return tu_reset_cmd_buffer(cmd_buffer);
2044 }
2045
2046 VkResult
2047 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2048 const VkCommandBufferBeginInfo *pBeginInfo)
2049 {
2050 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2051 VkResult result = VK_SUCCESS;
2052
2053 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
2054 /* If the command buffer has already been reset with
2055 * vkResetCommandBuffer, there is no need to do it again.
2056 */
2057 result = tu_reset_cmd_buffer(cmd_buffer);
2058 if (result != VK_SUCCESS)
2059 return result;
2060 }
2061
2062 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
2063 cmd_buffer->usage_flags = pBeginInfo->flags;
2064
2065 tu_cs_begin(&cmd_buffer->cs);
2066 tu_cs_begin(&cmd_buffer->draw_cs);
2067 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2068
2069 cmd_buffer->scratch_seqno = 0;
2070
2071 /* set up the initial hardware configuration in the command buffer */
2072 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2073 switch (cmd_buffer->queue_family_index) {
2074 case TU_QUEUE_GENERAL:
2075 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
2076 break;
2077 default:
2078 break;
2079 }
2080 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2081 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
2082 assert(pBeginInfo->pInheritanceInfo);
2083 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
2084 cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
2085 }
2086
2087 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
2088
2089 return VK_SUCCESS;
2090 }
2091
2092 void
2093 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2094 uint32_t firstBinding,
2095 uint32_t bindingCount,
2096 const VkBuffer *pBuffers,
2097 const VkDeviceSize *pOffsets)
2098 {
2099 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2100
2101 assert(firstBinding + bindingCount <= MAX_VBS);
2102
2103 for (uint32_t i = 0; i < bindingCount; i++) {
2104 cmd->state.vb.buffers[firstBinding + i] =
2105 tu_buffer_from_handle(pBuffers[i]);
2106 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
2107 }
2108
2109 /* VB states depend on VkPipelineVertexInputStateCreateInfo */
2110 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2111 }
2112
2113 void
2114 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2115 VkBuffer buffer,
2116 VkDeviceSize offset,
2117 VkIndexType indexType)
2118 {
2119 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2120 TU_FROM_HANDLE(tu_buffer, buf, buffer);
2121
2122 /* initialize/update the restart index */
2123 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
2124 struct tu_cs *draw_cs = &cmd->draw_cs;
2125
2126 tu6_emit_restart_index(
2127 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
2128
2129 tu_cs_sanity_check(draw_cs);
2130 }
2131
2132 /* track the BO */
2133 if (cmd->state.index_buffer != buf)
2134 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
2135
2136 cmd->state.index_buffer = buf;
2137 cmd->state.index_offset = offset;
2138 cmd->state.index_type = indexType;
2139 }
2140
2141 void
2142 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2143 VkPipelineBindPoint pipelineBindPoint,
2144 VkPipelineLayout _layout,
2145 uint32_t firstSet,
2146 uint32_t descriptorSetCount,
2147 const VkDescriptorSet *pDescriptorSets,
2148 uint32_t dynamicOffsetCount,
2149 const uint32_t *pDynamicOffsets)
2150 {
2151 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2152 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
2153 unsigned dyn_idx = 0;
2154
2155 struct tu_descriptor_state *descriptors_state =
2156 tu_get_descriptors_state(cmd_buffer, pipelineBindPoint);
2157
2158 for (unsigned i = 0; i < descriptorSetCount; ++i) {
2159 unsigned idx = i + firstSet;
2160 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
2161
2162 descriptors_state->sets[idx] = set;
2163 descriptors_state->valid |= (1u << idx);
2164
2165 for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
2166 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
2167 assert(dyn_idx < dynamicOffsetCount);
2168
2169 descriptors_state->dynamic_buffers[idx] =
2170 set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
2171 }
2172 }
2173
2174 cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2175 }
2176
2177 void
2178 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2179 VkPipelineLayout layout,
2180 VkShaderStageFlags stageFlags,
2181 uint32_t offset,
2182 uint32_t size,
2183 const void *pValues)
2184 {
2185 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2186 memcpy((void*) cmd->push_constants + offset, pValues, size);
2187 cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS;
2188 }
2189
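/* Finish recording: every BO referenced by the recorded streams (scratch BO,
 * VSC data when HW binning is used, draw/epilogue/sub streams) is added to
 * the BO list so it is included in the kernel submission.
 */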
2190 VkResult
2191 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2192 {
2193 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2194
2195 if (cmd_buffer->scratch_seqno) {
2196 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2197 MSM_SUBMIT_BO_WRITE);
2198 }
2199
2200 if (cmd_buffer->use_vsc_data) {
2201 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data,
2202 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2203 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data2,
2204 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2205 }
2206
2207 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2208 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2209 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2210 }
2211
2212 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2213 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2214 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2215 }
2216
2217 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2218 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2219 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2220 }
2221
2222 tu_cs_end(&cmd_buffer->cs);
2223 tu_cs_end(&cmd_buffer->draw_cs);
2224 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2225
2226 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2227
2228 return cmd_buffer->record_result;
2229 }
2230
2231 void
2232 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2233 VkPipelineBindPoint pipelineBindPoint,
2234 VkPipeline _pipeline)
2235 {
2236 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2237 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2238
2239 switch (pipelineBindPoint) {
2240 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2241 cmd->state.pipeline = pipeline;
2242 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
2243 break;
2244 case VK_PIPELINE_BIND_POINT_COMPUTE:
2245 cmd->state.compute_pipeline = pipeline;
2246 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2247 break;
2248 default:
2249 unreachable("unrecognized pipeline bind point");
2250 break;
2251 }
2252
2253 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2254 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2255 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2256 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2257 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2258 }
2259 }
2260
2261 void
2262 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2263 uint32_t firstViewport,
2264 uint32_t viewportCount,
2265 const VkViewport *pViewports)
2266 {
2267 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2268 struct tu_cs *draw_cs = &cmd->draw_cs;
2269
2270 assert(firstViewport == 0 && viewportCount == 1);
2271 tu6_emit_viewport(draw_cs, pViewports);
2272
2273 tu_cs_sanity_check(draw_cs);
2274 }
2275
2276 void
2277 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2278 uint32_t firstScissor,
2279 uint32_t scissorCount,
2280 const VkRect2D *pScissors)
2281 {
2282 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2283 struct tu_cs *draw_cs = &cmd->draw_cs;
2284
2285 assert(firstScissor == 0 && scissorCount == 1);
2286 tu6_emit_scissor(draw_cs, pScissors);
2287
2288 tu_cs_sanity_check(draw_cs);
2289 }
2290
2291 void
2292 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2293 {
2294 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2295
2296 cmd->state.dynamic.line_width = lineWidth;
2297
2298 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2299 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2300 }
2301
2302 void
2303 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2304 float depthBiasConstantFactor,
2305 float depthBiasClamp,
2306 float depthBiasSlopeFactor)
2307 {
2308 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2309 struct tu_cs *draw_cs = &cmd->draw_cs;
2310
2311 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2312 depthBiasSlopeFactor);
2313
2314 tu_cs_sanity_check(draw_cs);
2315 }
2316
2317 void
2318 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2319 const float blendConstants[4])
2320 {
2321 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2322 struct tu_cs *draw_cs = &cmd->draw_cs;
2323
2324 tu6_emit_blend_constants(draw_cs, blendConstants);
2325
2326 tu_cs_sanity_check(draw_cs);
2327 }
2328
2329 void
2330 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2331 float minDepthBounds,
2332 float maxDepthBounds)
2333 {
2334 }
2335
2336 void
2337 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2338 VkStencilFaceFlags faceMask,
2339 uint32_t compareMask)
2340 {
2341 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2342
2343 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2344 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2345 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2346 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2347
2348 /* the front/back compare masks must be updated together */
2349 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2350 }
2351
2352 void
2353 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2354 VkStencilFaceFlags faceMask,
2355 uint32_t writeMask)
2356 {
2357 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2358
2359 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2360 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2361 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2362 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2363
2364 /* the front/back write masks must be updated together */
2365 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2366 }
2367
2368 void
2369 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2370 VkStencilFaceFlags faceMask,
2371 uint32_t reference)
2372 {
2373 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2374
2375 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2376 cmd->state.dynamic.stencil_reference.front = reference;
2377 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2378 cmd->state.dynamic.stencil_reference.back = reference;
2379
2380 /* the front/back references must be updated together */
2381 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2382 }
2383
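/* Execute secondary command buffers. Secondaries recorded with
 * RENDER_PASS_CONTINUE contribute their draw_cs/draw_epilogue_cs entries
 * directly to the primary's streams; all other secondaries have their main
 * cs called from the primary's cs, with their BOs merged into our BO list.
 */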
2384 void
2385 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2386 uint32_t commandBufferCount,
2387 const VkCommandBuffer *pCmdBuffers)
2388 {
2389 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2390 VkResult result;
2391
2392 assert(commandBufferCount > 0);
2393
2394 for (uint32_t i = 0; i < commandBufferCount; i++) {
2395 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2396
2397 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2398 if (result != VK_SUCCESS) {
2399 cmd->record_result = result;
2400 break;
2401 }
2402
2403 if (secondary->usage_flags &
2404 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2405 assert(tu_cs_is_empty(&secondary->cs));
2406
2407 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2408 if (result != VK_SUCCESS) {
2409 cmd->record_result = result;
2410 break;
2411 }
2412
2413 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2414 &secondary->draw_epilogue_cs);
2415 if (result != VK_SUCCESS) {
2416 cmd->record_result = result;
2417 break;
2418 }
2419 } else {
2420 assert(tu_cs_is_empty(&secondary->draw_cs));
2421 assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2422
2423 for (uint32_t j = 0; j < secondary->cs.bo_count; j++) {
2424 tu_bo_list_add(&cmd->bo_list, secondary->cs.bos[j],
2425 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2426 }
2427
2428 tu_cs_emit_call(&cmd->cs, &secondary->cs);
2429 }
2430 }
2431 cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
2432 }
2433
2434 VkResult
2435 tu_CreateCommandPool(VkDevice _device,
2436 const VkCommandPoolCreateInfo *pCreateInfo,
2437 const VkAllocationCallbacks *pAllocator,
2438 VkCommandPool *pCmdPool)
2439 {
2440 TU_FROM_HANDLE(tu_device, device, _device);
2441 struct tu_cmd_pool *pool;
2442
2443 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2444 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2445 if (pool == NULL)
2446 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2447
2448 if (pAllocator)
2449 pool->alloc = *pAllocator;
2450 else
2451 pool->alloc = device->alloc;
2452
2453 list_inithead(&pool->cmd_buffers);
2454 list_inithead(&pool->free_cmd_buffers);
2455
2456 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2457
2458 *pCmdPool = tu_cmd_pool_to_handle(pool);
2459
2460 return VK_SUCCESS;
2461 }
2462
2463 void
2464 tu_DestroyCommandPool(VkDevice _device,
2465 VkCommandPool commandPool,
2466 const VkAllocationCallbacks *pAllocator)
2467 {
2468 TU_FROM_HANDLE(tu_device, device, _device);
2469 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2470
2471 if (!pool)
2472 return;
2473
2474 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2475 &pool->cmd_buffers, pool_link)
2476 {
2477 tu_cmd_buffer_destroy(cmd_buffer);
2478 }
2479
2480 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2481 &pool->free_cmd_buffers, pool_link)
2482 {
2483 tu_cmd_buffer_destroy(cmd_buffer);
2484 }
2485
2486 vk_free2(&device->alloc, pAllocator, pool);
2487 }
2488
2489 VkResult
2490 tu_ResetCommandPool(VkDevice device,
2491 VkCommandPool commandPool,
2492 VkCommandPoolResetFlags flags)
2493 {
2494 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2495 VkResult result;
2496
2497 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2498 pool_link)
2499 {
2500 result = tu_reset_cmd_buffer(cmd_buffer);
2501 if (result != VK_SUCCESS)
2502 return result;
2503 }
2504
2505 return VK_SUCCESS;
2506 }
2507
2508 void
2509 tu_TrimCommandPool(VkDevice device,
2510 VkCommandPool commandPool,
2511 VkCommandPoolTrimFlags flags)
2512 {
2513 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2514
2515 if (!pool)
2516 return;
2517
2518 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2519 &pool->free_cmd_buffers, pool_link)
2520 {
2521 tu_cmd_buffer_destroy(cmd_buffer);
2522 }
2523 }
2524
2525 void
2526 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2527 const VkRenderPassBeginInfo *pRenderPassBegin,
2528 VkSubpassContents contents)
2529 {
2530 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2531 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2532 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2533
2534 cmd->state.pass = pass;
2535 cmd->state.subpass = pass->subpasses;
2536 cmd->state.framebuffer = fb;
2537
2538 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2539 tu_cmd_prepare_tile_store_ib(cmd);
2540
2541 tu_emit_load_clear(cmd, pRenderPassBegin);
2542
2543 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2544 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2545 tu6_emit_msaa(cmd, cmd->state.subpass, &cmd->draw_cs);
2546 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2547
2548 /* note: use_hw_binning only checks tiling config */
2549 if (use_hw_binning(cmd))
2550 cmd->use_vsc_data = true;
2551
2552 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2553 const struct tu_image_view *iview = fb->attachments[i].attachment;
2554 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2555 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2556 }
2557 }
2558
2559 void
2560 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2561 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2562 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2563 {
2564 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2565 pSubpassBeginInfo->contents);
2566 }
2567
2568 void
2569 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2570 {
2571 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2572 const struct tu_render_pass *pass = cmd->state.pass;
2573 struct tu_cs *cs = &cmd->draw_cs;
2574
2575 const struct tu_subpass *subpass = cmd->state.subpass++;
2576 /* TODO:
2577 * if msaa samples change between subpasses,
2578 * attachment store is broken for some attachments
2579 */
2580 if (subpass->resolve_attachments) {
2581 tu6_emit_blit_scissor(cmd, cs, true);
2582 for (unsigned i = 0; i < subpass->color_count; i++) {
2583 uint32_t a = subpass->resolve_attachments[i].attachment;
2584 if (a != VK_ATTACHMENT_UNUSED) {
2585 tu6_emit_resolve(cmd, cs, a,
2586 subpass->color_attachments[i].attachment);
2587 }
2588 }
2589 }
2590
2591 /* invalidate because reading input attachments pulls GMEM contents into
2592 * the texture cache, and the cache isn't updated when GMEM is written
2593 * TODO: is there a no-cache bit for textures?
2594 */
2595 if (cmd->state.subpass->input_count)
2596 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2597
2598 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2599 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2600 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2601 tu6_emit_msaa(cmd, cmd->state.subpass, cs);
2602 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2603
2604 /* Emit flushes so that input attachments will read the correct value. This
2605 * is for sysmem only, although it shouldn't do much harm on gmem.
2606 */
2607 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2608 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2609
2610 /* TODO:
2611 * since we don't know how to do GMEM->GMEM resolve,
2612 * resolve attachments are resolved to system memory and then loaded back into GMEM if needed
2613 */
2614 if (subpass->resolve_attachments) {
2615 for (unsigned i = 0; i < subpass->color_count; i++) {
2616 uint32_t a = subpass->resolve_attachments[i].attachment;
2617 if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) {
2618 tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n");
2619 tu6_emit_predicated_blit(cmd, cs, a, a, false);
2620 }
2621 }
2622 }
2623 }
2624
2625 void
2626 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2627 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2628 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2629 {
2630 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2631 }
2632
2633 struct tu_draw_info
2634 {
2635 /**
2636 * Number of vertices.
2637 */
2638 uint32_t count;
2639
2640 /**
2641 * Index of the first vertex.
2642 */
2643 int32_t vertex_offset;
2644
2645 /**
2646 * First instance id.
2647 */
2648 uint32_t first_instance;
2649
2650 /**
2651 * Number of instances.
2652 */
2653 uint32_t instance_count;
2654
2655 /**
2656 * First index (indexed draws only).
2657 */
2658 uint32_t first_index;
2659
2660 /**
2661 * Whether it's an indexed draw.
2662 */
2663 bool indexed;
2664
2665 /**
2666 * Indirect draw parameters resource.
2667 */
2668 struct tu_buffer *indirect;
2669 uint64_t indirect_offset;
2670 uint32_t stride;
2671
2672 /**
2673 * Draw count parameters resource.
2674 */
2675 struct tu_buffer *count_buffer;
2676 uint64_t count_buffer_offset;
2677 };
2678
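/* Draw state groups are tagged with the passes they apply to: ENABLE_ALL
 * covers binning plus both the GMEM and sysmem draw passes, ENABLE_DRAW only
 * the two draw passes.
 */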
2679 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2680 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2681
2682 enum tu_draw_state_group_id
2683 {
2684 TU_DRAW_STATE_PROGRAM,
2685 TU_DRAW_STATE_PROGRAM_BINNING,
2686 TU_DRAW_STATE_VI,
2687 TU_DRAW_STATE_VI_BINNING,
2688 TU_DRAW_STATE_VP,
2689 TU_DRAW_STATE_RAST,
2690 TU_DRAW_STATE_DS,
2691 TU_DRAW_STATE_BLEND,
2692 TU_DRAW_STATE_VS_CONST,
2693 TU_DRAW_STATE_FS_CONST,
2694 TU_DRAW_STATE_VS_TEX,
2695 TU_DRAW_STATE_FS_TEX_SYSMEM,
2696 TU_DRAW_STATE_FS_TEX_GMEM,
2697 TU_DRAW_STATE_FS_IBO,
2698 TU_DRAW_STATE_VS_PARAMS,
2699
2700 TU_DRAW_STATE_COUNT,
2701 };
2702
2703 struct tu_draw_state_group
2704 {
2705 enum tu_draw_state_group_id id;
2706 uint32_t enable_mask;
2707 struct tu_cs_entry ib;
2708 };
2709
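/* Look up the sampler for entry i/array_index of a descriptor map. Immutable
 * samplers come from the set layout; otherwise the sampler state is read from
 * the descriptor set's mapped memory (for combined image+sampler descriptors
 * it follows the A6XX_TEX_CONST_DWORDS of texture state).
 */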
2710 static const struct tu_sampler*
2711 sampler_ptr(struct tu_descriptor_state *descriptors_state,
2712 const struct tu_descriptor_map *map, unsigned i,
2713 unsigned array_index)
2714 {
2715 assert(descriptors_state->valid & (1 << map->set[i]));
2716
2717 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2718 assert(map->binding[i] < set->layout->binding_count);
2719
2720 const struct tu_descriptor_set_binding_layout *layout =
2721 &set->layout->binding[map->binding[i]];
2722
2723 if (layout->immutable_samplers_offset) {
2724 const struct tu_sampler *immutable_samplers =
2725 tu_immutable_samplers(set->layout, layout);
2726
2727 return &immutable_samplers[array_index];
2728 }
2729
2730 switch (layout->type) {
2731 case VK_DESCRIPTOR_TYPE_SAMPLER:
2732 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4];
2733 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2734 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
2735 array_index *
2736 (A6XX_TEX_CONST_DWORDS +
2737 sizeof(struct tu_sampler) / 4)];
2738 default:
2739 unreachable("unimplemented descriptor type");
2740 break;
2741 }
2742 }
2743
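/* Copy a texture descriptor into dst. For input attachments in the GMEM path
 * (!is_sysmem) the descriptor is patched to a tightly packed 2D view of the
 * attachment's GMEM storage, using the tile width as the pitch.
 */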
2744 static void
2745 write_tex_const(struct tu_cmd_buffer *cmd,
2746 uint32_t *dst,
2747 struct tu_descriptor_state *descriptors_state,
2748 const struct tu_descriptor_map *map,
2749 unsigned i, unsigned array_index, bool is_sysmem)
2750 {
2751 assert(descriptors_state->valid & (1 << map->set[i]));
2752
2753 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2754 assert(map->binding[i] < set->layout->binding_count);
2755
2756 const struct tu_descriptor_set_binding_layout *layout =
2757 &set->layout->binding[map->binding[i]];
2758
2759 switch (layout->type) {
2760 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2761 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2762 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2763 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2764 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2765 array_index * A6XX_TEX_CONST_DWORDS],
2766 A6XX_TEX_CONST_DWORDS * 4);
2767 break;
2768 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2769 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2770 array_index *
2771 (A6XX_TEX_CONST_DWORDS +
2772 sizeof(struct tu_sampler) / 4)],
2773 A6XX_TEX_CONST_DWORDS * 4);
2774 break;
2775 default:
2776 unreachable("unimplemented descriptor type");
2777 break;
2778 }
2779
2780 if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
2781 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2782 uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
2783 array_index].attachment;
2784 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2785
2786 assert(att->gmem_offset >= 0);
2787
2788 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
2789 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
2790 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
2791 dst[2] |=
2792 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
2793 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
2794 dst[3] = 0;
2795 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
2796 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
2797 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
2798 dst[i] = 0;
2799
2800 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2801 tu_finishme("patch input attachment pitch for secondary cmd buffer");
2802 }
2803 }
2804
2805 static void
2806 write_image_ibo(struct tu_cmd_buffer *cmd,
2807 uint32_t *dst,
2808 struct tu_descriptor_state *descriptors_state,
2809 const struct tu_descriptor_map *map,
2810 unsigned i, unsigned array_index)
2811 {
2812 assert(descriptors_state->valid & (1 << map->set[i]));
2813
2814 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2815 assert(map->binding[i] < set->layout->binding_count);
2816
2817 const struct tu_descriptor_set_binding_layout *layout =
2818 &set->layout->binding[map->binding[i]];
2819
2820 assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
2821
2822 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2823 (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
2824 A6XX_TEX_CONST_DWORDS * 4);
2825 }
2826
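/* Return the 64-bit iova of a buffer descriptor. Dynamic UBO/SSBO descriptors
 * read the address (with the dynamic offset already applied at bind time)
 * from dynamic_buffers; the rest read it out of the set's mapped memory.
 */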
2827 static uint64_t
2828 buffer_ptr(struct tu_descriptor_state *descriptors_state,
2829 const struct tu_descriptor_map *map,
2830 unsigned i, unsigned array_index)
2831 {
2832 assert(descriptors_state->valid & (1 << map->set[i]));
2833
2834 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2835 assert(map->binding[i] < set->layout->binding_count);
2836
2837 const struct tu_descriptor_set_binding_layout *layout =
2838 &set->layout->binding[map->binding[i]];
2839
2840 switch (layout->type) {
2841 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2842 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2843 return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
2844 array_index];
2845 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2846 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2847 return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
2848 set->mapped_ptr[layout->offset / 4 + array_index * 2];
2849 default:
2850 unreachable("unimplemented descriptor type");
2851 break;
2852 }
2853 }
2854
2855 static inline uint32_t
2856 tu6_stage2opcode(gl_shader_stage type)
2857 {
2858 switch (type) {
2859 case MESA_SHADER_VERTEX:
2860 case MESA_SHADER_TESS_CTRL:
2861 case MESA_SHADER_TESS_EVAL:
2862 case MESA_SHADER_GEOMETRY:
2863 return CP_LOAD_STATE6_GEOM;
2864 case MESA_SHADER_FRAGMENT:
2865 case MESA_SHADER_COMPUTE:
2866 case MESA_SHADER_KERNEL:
2867 return CP_LOAD_STATE6_FRAG;
2868 default:
2869 unreachable("bad shader type");
2870 }
2871 }
2872
2873 static inline enum a6xx_state_block
2874 tu6_stage2shadersb(gl_shader_stage type)
2875 {
2876 switch (type) {
2877 case MESA_SHADER_VERTEX:
2878 return SB6_VS_SHADER;
2879 case MESA_SHADER_FRAGMENT:
2880 return SB6_FS_SHADER;
2881 case MESA_SHADER_COMPUTE:
2882 case MESA_SHADER_KERNEL:
2883 return SB6_CS_SHADER;
2884 default:
2885 unreachable("bad shader type");
2886 return ~0;
2887 }
2888 }
2889
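/* Upload the constant ranges that ir3 wants preloaded into the constant file:
 * range 0 holds the push constants and is emitted inline, higher ranges are
 * UBO regions loaded indirectly from the UBO's iova.
 */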
2890 static void
2891 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2892 struct tu_descriptor_state *descriptors_state,
2893 gl_shader_stage type,
2894 uint32_t *push_constants)
2895 {
2896 const struct tu_program_descriptor_linkage *link =
2897 &pipeline->program.link[type];
2898 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2899
2900 for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
2901 if (state->range[i].start < state->range[i].end) {
2902 uint32_t size = state->range[i].end - state->range[i].start;
2903 uint32_t offset = state->range[i].start;
2904
2905 /* even if the start of the range is within the shader's constlen,
2906 * the end may not be, so clamp the size:
2907 */
2908 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2909
2910 if (size == 0)
2911 continue;
2912
2913 /* things should be aligned to vec4: */
2914 debug_assert((state->range[i].offset % 16) == 0);
2915 debug_assert((size % 16) == 0);
2916 debug_assert((offset % 16) == 0);
2917
2918 if (i == 0) {
2919 /* push constants */
2920 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (size / 4));
2921 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2922 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2923 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2924 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2925 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2926 tu_cs_emit(cs, 0);
2927 tu_cs_emit(cs, 0);
2928 for (unsigned i = 0; i < size / 4; i++)
2929 tu_cs_emit(cs, push_constants[i + offset / 4]);
2930 continue;
2931 }
2932
2933 /* Look through the UBO map to find our UBO index, and get the VA for
2934 * that UBO.
2935 */
2936 uint64_t va = 0;
2937 uint32_t ubo_idx = i - 1;
2938 uint32_t ubo_map_base = 0;
2939 for (int j = 0; j < link->ubo_map.num; j++) {
2940 if (ubo_idx >= ubo_map_base &&
2941 ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
2942 va = buffer_ptr(descriptors_state, &link->ubo_map, j,
2943 ubo_idx - ubo_map_base);
2944 break;
2945 }
2946 ubo_map_base += link->ubo_map.array_size[j];
2947 }
2948 assert(va);
2949
2950 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2951 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2952 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2953 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2954 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2955 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2956 tu_cs_emit_qw(cs, va + offset);
2957 }
2958 }
2959 }
2960
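/* Emit the table of UBO base addresses into the constant file at
 * const_state.offsets.ubo, padded with 0xffffffff up to an even count
 * (each vec4 unit holds two 64-bit addresses).
 */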
2961 static void
2962 tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2963 struct tu_descriptor_state *descriptors_state,
2964 gl_shader_stage type)
2965 {
2966 const struct tu_program_descriptor_linkage *link =
2967 &pipeline->program.link[type];
2968
2969 uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
2970 uint32_t anum = align(num, 2);
2971
2972 if (!num)
2973 return;
2974
2975 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
2976 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
2977 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2978 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2979 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2980 CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
2981 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2982 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2983
2984 unsigned emitted = 0;
2985 for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
2986 for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
2987 tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
2988 emitted++;
2989 }
2990 }
2991
2992 for (; emitted < anum; emitted++) {
2993 tu_cs_emit(cs, 0xffffffff);
2994 tu_cs_emit(cs, 0xffffffff);
2995 }
2996 }
2997
2998 static struct tu_cs_entry
2999 tu6_emit_consts(struct tu_cmd_buffer *cmd,
3000 const struct tu_pipeline *pipeline,
3001 struct tu_descriptor_state *descriptors_state,
3002 gl_shader_stage type)
3003 {
3004 struct tu_cs cs;
3005 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
3006
3007 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3008 tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
3009
3010 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3011 }
3012
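/* Emit the VS driver params. Only the base instance (IR3_DP_INSTID_BASE) is
 * filled in for now; the upload is skipped entirely when the shader does not
 * read driver params (offset beyond constlen).
 */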
3013 static VkResult
3014 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3015 const struct tu_draw_info *draw,
3016 struct tu_cs_entry *entry)
3017 {
3018 /* TODO: fill out more than just base instance */
3019 const struct tu_program_descriptor_linkage *link =
3020 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3021 const struct ir3_const_state *const_state = &link->const_state;
3022 struct tu_cs cs;
3023
3024 if (const_state->offsets.driver_param >= link->constlen) {
3025 *entry = (struct tu_cs_entry) {};
3026 return VK_SUCCESS;
3027 }
3028
3029 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
3030 if (result != VK_SUCCESS)
3031 return result;
3032
3033 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3034 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
3035 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3036 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3037 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3038 CP_LOAD_STATE6_0_NUM_UNIT(1));
3039 tu_cs_emit(&cs, 0);
3040 tu_cs_emit(&cs, 0);
3041
3042 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3043
3044 tu_cs_emit(&cs, 0);
3045 tu_cs_emit(&cs, 0);
3046 tu_cs_emit(&cs, draw->first_instance);
3047 tu_cs_emit(&cs, 0);
3048
3049 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3050 return VK_SUCCESS;
3051 }
3052
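/* Build the texture constant and sampler arrays for one stage in the
 * sub-stream and emit the CP_LOAD_STATE6 packets plus the per-stage
 * TEX_CONST/TEX_SAMP/TEX_COUNT registers pointing at them. is_sysmem selects
 * whether input-attachment descriptors are left as-is or patched to GMEM by
 * write_tex_const.
 */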
3053 static VkResult
3054 tu6_emit_textures(struct tu_cmd_buffer *cmd,
3055 const struct tu_pipeline *pipeline,
3056 struct tu_descriptor_state *descriptors_state,
3057 gl_shader_stage type,
3058 struct tu_cs_entry *entry,
3059 bool *needs_border,
3060 bool is_sysmem)
3061 {
3062 struct tu_cs *draw_state = &cmd->sub_cs;
3063 const struct tu_program_descriptor_linkage *link =
3064 &pipeline->program.link[type];
3065 VkResult result;
3066
3067 if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
3068 *entry = (struct tu_cs_entry) {};
3069 return VK_SUCCESS;
3070 }
3071
3072 /* allocate and fill texture state */
3073 struct ts_cs_memory tex_const;
3074 result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
3075 A6XX_TEX_CONST_DWORDS, &tex_const);
3076 if (result != VK_SUCCESS)
3077 return result;
3078
3079 int tex_index = 0;
3080 for (unsigned i = 0; i < link->texture_map.num; i++) {
3081 for (int j = 0; j < link->texture_map.array_size[i]; j++) {
3082 write_tex_const(cmd,
3083 &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
3084 descriptors_state, &link->texture_map, i, j,
3085 is_sysmem);
3086 }
3087 }
3088
3089 /* allocate and fill sampler state */
3090 struct ts_cs_memory tex_samp = { 0 };
3091 if (link->sampler_map.num_desc) {
3092 result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
3093 A6XX_TEX_SAMP_DWORDS, &tex_samp);
3094 if (result != VK_SUCCESS)
3095 return result;
3096
3097 int sampler_index = 0;
3098 for (unsigned i = 0; i < link->sampler_map.num; i++) {
3099 for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
3100 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3101 &link->sampler_map,
3102 i, j);
3103 memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
3104 sampler->state, sizeof(sampler->state));
3105 *needs_border |= sampler->needs_border;
3106 }
3107 }
3108 }
3109
3110 unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
3111 enum a6xx_state_block sb;
3112
3113 switch (type) {
3114 case MESA_SHADER_VERTEX:
3115 sb = SB6_VS_TEX;
3116 tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
3117 tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
3118 tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
3119 break;
3120 case MESA_SHADER_FRAGMENT:
3121 sb = SB6_FS_TEX;
3122 tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
3123 tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
3124 tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
3125 break;
3126 case MESA_SHADER_COMPUTE:
3127 sb = SB6_CS_TEX;
3128 tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
3129 tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
3130 tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
3131 break;
3132 default:
3133 unreachable("bad state block");
3134 }
3135
3136 struct tu_cs cs;
3137 result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
3138 if (result != VK_SUCCESS)
3139 return result;
3140
3141 if (link->sampler_map.num_desc) {
3142 /* output sampler state: */
3143 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3144 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3145 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
3146 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3147 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3148 CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
3149 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3150
3151 tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
3152 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3153 }
3154
3155 /* emit texture state: */
3156 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3157 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3158 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3159 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3160 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3161 CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
3162 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3163
3164 tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
3165 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3166
3167 tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
3168 tu_cs_emit(&cs, link->texture_map.num_desc);
3169
3170 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3171 return VK_SUCCESS;
3172 }
3173
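/* Emit the IBO descriptor array shared by SSBOs and storage images. SSBOs are
 * described as large 1D 32-bit UINT buffers (size effectively unlimited, see
 * the robustBufferAccess note below); storage images are copied from the
 * second descriptor written per element by the descriptor set code.
 */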
3174 static VkResult
3175 tu6_emit_ibo(struct tu_cmd_buffer *cmd,
3176 const struct tu_pipeline *pipeline,
3177 struct tu_descriptor_state *descriptors_state,
3178 gl_shader_stage type,
3179 struct tu_cs_entry *entry)
3180 {
3181 struct tu_cs *draw_state = &cmd->sub_cs;
3182 const struct tu_program_descriptor_linkage *link =
3183 &pipeline->program.link[type];
3184 VkResult result;
3185
3186 unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
3187
3188 if (num_desc == 0) {
3189 *entry = (struct tu_cs_entry) {};
3190 return VK_SUCCESS;
3191 }
3192
3193 struct ts_cs_memory ibo_const;
3194 result = tu_cs_alloc(draw_state, num_desc,
3195 A6XX_TEX_CONST_DWORDS, &ibo_const);
3196 if (result != VK_SUCCESS)
3197 return result;
3198
3199 int ssbo_index = 0;
3200 for (unsigned i = 0; i < link->ssbo_map.num; i++) {
3201 for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
3202 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3203
3204 uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
3205 /* We don't expose robustBufferAccess, so leave the size unlimited. */
3206 uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
3207
3208 dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
3209 dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
3210 A6XX_IBO_1_HEIGHT(sz >> 15);
3211 dst[2] = A6XX_IBO_2_UNK4 |
3212 A6XX_IBO_2_UNK31 |
3213 A6XX_IBO_2_TYPE(A6XX_TEX_1D);
3214 dst[3] = 0;
3215 dst[4] = va;
3216 dst[5] = va >> 32;
3217 for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
3218 dst[i] = 0;
3219
3220 ssbo_index++;
3221 }
3222 }
3223
3224 for (unsigned i = 0; i < link->image_map.num; i++) {
3225 for (int j = 0; j < link->image_map.array_size[i]; j++) {
3226 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3227
3228 write_image_ibo(cmd, dst,
3229 descriptors_state, &link->image_map, i, j);
3230
3231 ssbo_index++;
3232 }
3233 }
3234
3235 assert(ssbo_index == num_desc);
3236
3237 struct tu_cs cs;
3238 result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
3239 if (result != VK_SUCCESS)
3240 return result;
3241
3242 uint32_t opcode, ibo_addr_reg;
3243 enum a6xx_state_block sb;
3244 enum a6xx_state_type st;
3245
3246 switch (type) {
3247 case MESA_SHADER_FRAGMENT:
3248 opcode = CP_LOAD_STATE6;
3249 st = ST6_SHADER;
3250 sb = SB6_IBO;
3251 ibo_addr_reg = REG_A6XX_SP_IBO_LO;
3252 break;
3253 case MESA_SHADER_COMPUTE:
3254 opcode = CP_LOAD_STATE6_FRAG;
3255 st = ST6_IBO;
3256 sb = SB6_CS_SHADER;
3257 ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
3258 break;
3259 default:
3260 unreachable("unsupported stage for ibos");
3261 }
3262
3263 /* emit IBO descriptor state: */
3264 tu_cs_emit_pkt7(&cs, opcode, 3);
3265 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3266 CP_LOAD_STATE6_0_STATE_TYPE(st) |
3267 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3268 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3269 CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
3270 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3271
3272 tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
3273 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3274
3275 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3276 return VK_SUCCESS;
3277 }
3278
3279 struct PACKED bcolor_entry {
3280 uint32_t fp32[4];
3281 uint16_t ui16[4];
3282 int16_t si16[4];
3283 uint16_t fp16[4];
3284 uint16_t rgb565;
3285 uint16_t rgb5a1;
3286 uint16_t rgba4;
3287 uint8_t __pad0[2];
3288 uint8_t ui8[4];
3289 int8_t si8[4];
3290 uint32_t rgb10a2;
3291 uint32_t z24; /* also s8? */
3292 uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
3293 uint8_t __pad1[56];
3294 } border_color[] = {
3295 [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = {},
3296 [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = {},
3297 [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = {
3298 .fp32[3] = 0x3f800000,
3299 .ui16[3] = 0xffff,
3300 .si16[3] = 0x7fff,
3301 .fp16[3] = 0x3c00,
3302 .rgb5a1 = 0x8000,
3303 .rgba4 = 0xf000,
3304 .ui8[3] = 0xff,
3305 .si8[3] = 0x7f,
3306 .rgb10a2 = 0xc0000000,
3307 .srgb[3] = 0x3c00,
3308 },
3309 [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = {
3310 .fp32[3] = 1,
3311 .fp16[3] = 1,
3312 },
3313 [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = {
3314 .fp32[0 ... 3] = 0x3f800000,
3315 .ui16[0 ... 3] = 0xffff,
3316 .si16[0 ... 3] = 0x7fff,
3317 .fp16[0 ... 3] = 0x3c00,
3318 .rgb565 = 0xffff,
3319 .rgb5a1 = 0xffff,
3320 .rgba4 = 0xffff,
3321 .ui8[0 ... 3] = 0xff,
3322 .si8[0 ... 3] = 0x7f,
3323 .rgb10a2 = 0xffffffff,
3324 .z24 = 0xffffff,
3325 .srgb[0 ... 3] = 0x3c00,
3326 },
3327 [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = {
3328 .fp32[0 ... 3] = 1,
3329 .fp16[0 ... 3] = 1,
3330 },
3331 };
3332
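/* Upload one 128-byte border color entry per sampler (VS samplers first,
 * then FS, in the same order the sampler state is emitted) and point
 * SP_TP_BORDER_COLOR_BASE_ADDR at the table.
 */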
3333 static VkResult
3334 tu6_emit_border_color(struct tu_cmd_buffer *cmd,
3335 struct tu_cs *cs)
3336 {
3337 STATIC_ASSERT(sizeof(struct bcolor_entry) == 128);
3338
3339 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3340 struct tu_descriptor_state *descriptors_state =
3341 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3342 const struct tu_descriptor_map *vs_sampler =
3343 &pipeline->program.link[MESA_SHADER_VERTEX].sampler_map;
3344 const struct tu_descriptor_map *fs_sampler =
3345 &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map;
3346 struct ts_cs_memory ptr;
3347
3348 VkResult result = tu_cs_alloc(&cmd->sub_cs,
3349 vs_sampler->num_desc + fs_sampler->num_desc,
3350 128 / 4,
3351 &ptr);
3352 if (result != VK_SUCCESS)
3353 return result;
3354
3355 for (unsigned i = 0; i < vs_sampler->num; i++) {
3356 for (unsigned j = 0; j < vs_sampler->array_size[i]; j++) {
3357 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3358 vs_sampler, i, j);
3359 memcpy(ptr.map, &border_color[sampler->border], 128);
3360 ptr.map += 128 / 4;
3361 }
3362 }
3363
3364 for (unsigned i = 0; i < fs_sampler->num; i++) {
3365 for (unsigned j = 0; j < fs_sampler->array_size[i]; j++) {
3366 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3367 fs_sampler, i, j);
3368 memcpy(ptr.map, &border_color[sampler->border], 128);
3369 ptr.map += 128 / 4;
3370 }
3371 }
3372
3373 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
3374 tu_cs_emit_qw(cs, ptr.iova);
3375 return VK_SUCCESS;
3376 }
3377
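/* Gather all dirty draw state into tu_draw_state_group entries and emit a
 * single CP_SET_DRAW_STATE packet referencing their IBs. The enable mask of
 * each group tells the CP which passes (binning, GMEM draw, sysmem draw)
 * should replay that state.
 */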
3378 static VkResult
3379 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3380 struct tu_cs *cs,
3381 const struct tu_draw_info *draw)
3382 {
3383 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3384 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3385 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3386 uint32_t draw_state_group_count = 0;
3387 VkResult result;
3388
3389 struct tu_descriptor_state *descriptors_state =
3390 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3391
3392 /* TODO lrz */
3393
3394 tu_cs_emit_regs(cs,
3395 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3396 pipeline->ia.primitive_restart && draw->indexed));
3397
3398 if (cmd->state.dirty &
3399 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3400 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3401 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3402 dynamic->line_width);
3403 }
3404
3405 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3406 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3407 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3408 dynamic->stencil_compare_mask.back);
3409 }
3410
3411 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3412 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3413 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3414 dynamic->stencil_write_mask.back);
3415 }
3416
3417 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3418 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3419 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3420 dynamic->stencil_reference.back);
3421 }
3422
3423 if (cmd->state.dirty &
3424 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) {
3425 for (uint32_t i = 0; i < pipeline->vi.count; i++) {
3426 const uint32_t binding = pipeline->vi.bindings[i];
3427 const uint32_t stride = pipeline->vi.strides[i];
3428 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3429 const VkDeviceSize offset = buf->bo_offset +
3430 cmd->state.vb.offsets[binding] +
3431 pipeline->vi.offsets[i];
3432 const VkDeviceSize size =
3433 offset < buf->bo->size ? buf->bo->size - offset : 0;
3434
3435 tu_cs_emit_regs(cs,
3436 A6XX_VFD_FETCH_BASE(i, .bo = buf->bo, .bo_offset = offset),
3437 A6XX_VFD_FETCH_SIZE(i, size),
3438 A6XX_VFD_FETCH_STRIDE(i, stride));
3439 }
3440 }
3441
3442 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3443 draw_state_groups[draw_state_group_count++] =
3444 (struct tu_draw_state_group) {
3445 .id = TU_DRAW_STATE_PROGRAM,
3446 .enable_mask = ENABLE_DRAW,
3447 .ib = pipeline->program.state_ib,
3448 };
3449 draw_state_groups[draw_state_group_count++] =
3450 (struct tu_draw_state_group) {
3451 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3452 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3453 .ib = pipeline->program.binning_state_ib,
3454 };
3455 draw_state_groups[draw_state_group_count++] =
3456 (struct tu_draw_state_group) {
3457 .id = TU_DRAW_STATE_VI,
3458 .enable_mask = ENABLE_DRAW,
3459 .ib = pipeline->vi.state_ib,
3460 };
3461 draw_state_groups[draw_state_group_count++] =
3462 (struct tu_draw_state_group) {
3463 .id = TU_DRAW_STATE_VI_BINNING,
3464 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3465 .ib = pipeline->vi.binning_state_ib,
3466 };
3467 draw_state_groups[draw_state_group_count++] =
3468 (struct tu_draw_state_group) {
3469 .id = TU_DRAW_STATE_VP,
3470 .enable_mask = ENABLE_ALL,
3471 .ib = pipeline->vp.state_ib,
3472 };
3473 draw_state_groups[draw_state_group_count++] =
3474 (struct tu_draw_state_group) {
3475 .id = TU_DRAW_STATE_RAST,
3476 .enable_mask = ENABLE_ALL,
3477 .ib = pipeline->rast.state_ib,
3478 };
3479 draw_state_groups[draw_state_group_count++] =
3480 (struct tu_draw_state_group) {
3481 .id = TU_DRAW_STATE_DS,
3482 .enable_mask = ENABLE_ALL,
3483 .ib = pipeline->ds.state_ib,
3484 };
3485 draw_state_groups[draw_state_group_count++] =
3486 (struct tu_draw_state_group) {
3487 .id = TU_DRAW_STATE_BLEND,
3488 .enable_mask = ENABLE_ALL,
3489 .ib = pipeline->blend.state_ib,
3490 };
3491 }
3492
3493 if (cmd->state.dirty &
3494 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) {
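      /* The per-stage constant state includes push constants (hence the
       * TU_CMD_DIRTY_PUSH_CONSTANTS dependency), so rebuild the VS and FS
       * const IBs whenever the pipeline, descriptors or push constants
       * change.
       */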
3495 draw_state_groups[draw_state_group_count++] =
3496 (struct tu_draw_state_group) {
3497 .id = TU_DRAW_STATE_VS_CONST,
3498 .enable_mask = ENABLE_ALL,
3499 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3500 };
3501 draw_state_groups[draw_state_group_count++] =
3502 (struct tu_draw_state_group) {
3503 .id = TU_DRAW_STATE_FS_CONST,
3504 .enable_mask = ENABLE_DRAW,
3505 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3506 };
3507 }
3508
3509 if (cmd->state.dirty &
3510 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
3511 bool needs_border = false;
3512 struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
3513
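      /* Input attachment descriptors differ between GMEM (tiled) and sysmem
       * rendering, so build both fragment texture states and let the GMEM /
       * SYSMEM enable masks pick the right one when the command stream is
       * executed.
       */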
3514 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3515 MESA_SHADER_VERTEX, &vs_tex, &needs_border,
3516 false);
3517 if (result != VK_SUCCESS)
3518 return result;
3519
3520 /* TODO: we could emit just one texture descriptor draw state when there
3521 * are no input attachments, which is the most common case. We could
3522 * also split out the sampler state, which doesn't change even for input
3523 * attachments.
3524 */
3525 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3526 MESA_SHADER_FRAGMENT, &fs_tex_sysmem,
3527 &needs_border, true);
3528 if (result != VK_SUCCESS)
3529 return result;
3530
3531 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3532 MESA_SHADER_FRAGMENT, &fs_tex_gmem,
3533 &needs_border, false);
3534 if (result != VK_SUCCESS)
3535 return result;
3536
3537 result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
3538 MESA_SHADER_FRAGMENT, &fs_ibo);
3539 if (result != VK_SUCCESS)
3540 return result;
3541
3542 draw_state_groups[draw_state_group_count++] =
3543 (struct tu_draw_state_group) {
3544 .id = TU_DRAW_STATE_VS_TEX,
3545 .enable_mask = ENABLE_ALL,
3546 .ib = vs_tex,
3547 };
3548 draw_state_groups[draw_state_group_count++] =
3549 (struct tu_draw_state_group) {
3550 .id = TU_DRAW_STATE_FS_TEX_GMEM,
3551 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3552 .ib = fs_tex_gmem,
3553 };
3554 draw_state_groups[draw_state_group_count++] =
3555 (struct tu_draw_state_group) {
3556 .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
3557 .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
3558 .ib = fs_tex_sysmem,
3559 };
3560 draw_state_groups[draw_state_group_count++] =
3561 (struct tu_draw_state_group) {
3562 .id = TU_DRAW_STATE_FS_IBO,
3563 .enable_mask = ENABLE_DRAW,
3564 .ib = fs_ibo,
3565 };
3566
3567 if (needs_border) {
3568 result = tu6_emit_border_color(cmd, cs);
3569 if (result != VK_SUCCESS)
3570 return result;
3571 }
3572 }
3573
3574 struct tu_cs_entry vs_params;
3575 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3576 if (result != VK_SUCCESS)
3577 return result;
3578
3579 draw_state_groups[draw_state_group_count++] =
3580 (struct tu_draw_state_group) {
3581 .id = TU_DRAW_STATE_VS_PARAMS,
3582 .enable_mask = ENABLE_ALL,
3583 .ib = vs_params,
3584 };
3585
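   /* Each group takes 3 dwords of the CP_SET_DRAW_STATE packet: a control
    * dword packing the IB size (in dwords), the enable mask and the group
    * id, followed by the 64-bit IB address.  Empty IBs get the DISABLE bit
    * and a zero address instead.
    */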
3586 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3587 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3588 const struct tu_draw_state_group *group = &draw_state_groups[i];
3589 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3590 uint32_t cp_set_draw_state =
3591 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3592 group->enable_mask |
3593 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3594 uint64_t iova;
3595 if (group->ib.size) {
3596 iova = group->ib.bo->iova + group->ib.offset;
3597 } else {
3598 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3599 iova = 0;
3600 }
3601
3602 tu_cs_emit(cs, cp_set_draw_state);
3603 tu_cs_emit_qw(cs, iova);
3604 }
3605
3606 tu_cs_sanity_check(cs);
3607
3608 /* track BOs */
3609 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3610 for (uint32_t i = 0; i < MAX_VBS; i++) {
3611 const struct tu_buffer *buf = cmd->state.vb.buffers[i];
3612 if (buf)
3613 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3614 }
3615 }
3616 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3617 unsigned i;
3618 for_each_bit(i, descriptors_state->valid) {
3619 struct tu_descriptor_set *set = descriptors_state->sets[i];
3620 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3621 if (set->descriptors[j]) {
3622 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3623 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3624 }
3625 }
3626 }
3627
3628 /* Fragment shader state overwrites compute shader state, so flag the
3629 * compute pipeline for re-emit.
3630 */
3631 cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
3632 return VK_SUCCESS;
3633 }
3634
3635 static void
3636 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3637 struct tu_cs *cs,
3638 const struct tu_draw_info *draw)
3639 {
3641 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3642
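   /* The base vertex and first instance apply to both the indexed and
    * non-indexed paths below, so emit them up front.
    */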
3643 tu_cs_emit_regs(cs,
3644 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3645 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3646
3647 /* TODO hw binning */
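   /* The first dword of CP_DRAW_INDX_OFFSET packs the primitive type, the
    * index source and the visibility-stream usage.  The indexed path also
    * supplies the index buffer iova and size so the CP can fetch the indices
    * (DI_SRC_SEL_DMA); the non-indexed path auto-generates them
    * (DI_SRC_SEL_AUTO_INDEX).
    */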
3648 if (draw->indexed) {
3649 const enum a4xx_index_size index_size =
3650 tu6_index_size(cmd->state.index_type);
3651 const uint32_t index_bytes =
3652 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3653 const struct tu_buffer *buf = cmd->state.index_buffer;
3654 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3655 index_bytes * draw->first_index;
3656 const uint32_t size = index_bytes * draw->count;
3657
3658 const uint32_t cp_draw_indx =
3659 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3660 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3661 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3662 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3663
3664 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3665 tu_cs_emit(cs, cp_draw_indx);
3666 tu_cs_emit(cs, draw->instance_count);
3667 tu_cs_emit(cs, draw->count);
3668 tu_cs_emit(cs, 0x0); /* XXX */
3669 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3670 tu_cs_emit(cs, size);
3671 } else {
3672 const uint32_t cp_draw_indx =
3673 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3674 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3675 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3676
3677 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3678 tu_cs_emit(cs, cp_draw_indx);
3679 tu_cs_emit(cs, draw->instance_count);
3680 tu_cs_emit(cs, draw->count);
3681 }
3682 }
3683
3684 static void
3685 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3686 {
3687 struct tu_cs *cs = &cmd->draw_cs;
3688 VkResult result;
3689
3690 result = tu6_bind_draw_states(cmd, cs, draw);
3691 if (result != VK_SUCCESS) {
3692 cmd->record_result = result;
3693 return;
3694 }
3695
3696 if (draw->indirect) {
3697 tu_finishme("indirect draw");
3698 return;
3699 }
3700
3701 tu6_emit_draw_direct(cmd, cs, draw);
3702
3703 cmd->wait_for_idle = true;
3704
3705 tu_cs_sanity_check(cs);
3706 }
3707
3708 void
3709 tu_CmdDraw(VkCommandBuffer commandBuffer,
3710 uint32_t vertexCount,
3711 uint32_t instanceCount,
3712 uint32_t firstVertex,
3713 uint32_t firstInstance)
3714 {
3715 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3716 struct tu_draw_info info = {};
3717
3718 info.count = vertexCount;
3719 info.instance_count = instanceCount;
3720 info.first_instance = firstInstance;
3721 info.vertex_offset = firstVertex;
3722
3723 tu_draw(cmd_buffer, &info);
3724 }
3725
3726 void
3727 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3728 uint32_t indexCount,
3729 uint32_t instanceCount,
3730 uint32_t firstIndex,
3731 int32_t vertexOffset,
3732 uint32_t firstInstance)
3733 {
3734 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3735 struct tu_draw_info info = {};
3736
3737 info.indexed = true;
3738 info.count = indexCount;
3739 info.instance_count = instanceCount;
3740 info.first_index = firstIndex;
3741 info.vertex_offset = vertexOffset;
3742 info.first_instance = firstInstance;
3743
3744 tu_draw(cmd_buffer, &info);
3745 }
3746
3747 void
3748 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3749 VkBuffer _buffer,
3750 VkDeviceSize offset,
3751 uint32_t drawCount,
3752 uint32_t stride)
3753 {
3754 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3755 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3756 struct tu_draw_info info = {};
3757
3758 info.count = drawCount;
3759 info.indirect = buffer;
3760 info.indirect_offset = offset;
3761 info.stride = stride;
3762
3763 tu_draw(cmd_buffer, &info);
3764 }
3765
3766 void
3767 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3768 VkBuffer _buffer,
3769 VkDeviceSize offset,
3770 uint32_t drawCount,
3771 uint32_t stride)
3772 {
3773 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3774 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3775 struct tu_draw_info info = {};
3776
3777 info.indexed = true;
3778 info.count = drawCount;
3779 info.indirect = buffer;
3780 info.indirect_offset = offset;
3781 info.stride = stride;
3782
3783 tu_draw(cmd_buffer, &info);
3784 }
3785
3786 struct tu_dispatch_info
3787 {
3788 /**
3789    * The layout of the grid to dispatch, in workgroup (block) units.
3790 */
3791 uint32_t blocks[3];
3792
3793 /**
3794    * A starting offset for the grid (from vkCmdDispatchBase). Even when
3795    * \a unaligned is set, the offset must still be aligned.
3796 */
3797 uint32_t offsets[3];
3798 /**
3799 * Whether it's an unaligned compute dispatch.
3800 */
3801 bool unaligned;
3802
3803 /**
3804 * Indirect compute parameters resource.
3805 */
3806 struct tu_buffer *indirect;
3807 uint64_t indirect_offset;
3808 };
3809
3810 static void
3811 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3812 const struct tu_dispatch_info *info)
3813 {
3814 gl_shader_stage type = MESA_SHADER_COMPUTE;
3815 const struct tu_program_descriptor_linkage *link =
3816 &pipeline->program.link[type];
3817 const struct ir3_const_state *const_state = &link->const_state;
3818 uint32_t offset = const_state->offsets.driver_param;
3819
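   /* The driver-param offset and the shader's constlen are counted in vec4
    * constants; there is nothing to upload if the constant file ends before
    * the driver params.
    */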
3820 if (link->constlen <= offset)
3821 return;
3822
3823 if (!info->indirect) {
3824 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3825 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3826 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3827 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3828 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3829 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3830 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3831 };
3832
3833 uint32_t num_consts = MIN2(const_state->num_driver_params,
3834 (link->constlen - offset) * 4);
3835       /* upload the driver params as immediate (SS6_DIRECT) constants */
3836 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3837 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3838 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3839 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3840 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3841 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3842 tu_cs_emit(cs, 0);
3843 tu_cs_emit(cs, 0);
3844       for (uint32_t i = 0; i < num_consts; i++)
3845          tu_cs_emit(cs, driver_params[i]);
3847 } else {
3848 tu_finishme("Indirect driver params");
3849 }
3850 }
3851
3852 static void
3853 tu_dispatch(struct tu_cmd_buffer *cmd,
3854 const struct tu_dispatch_info *info)
3855 {
3856 struct tu_cs *cs = &cmd->cs;
3857 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3858 struct tu_descriptor_state *descriptors_state =
3859 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3860 VkResult result;
3861
3862 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3863 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3864
3865 struct tu_cs_entry ib;
3866
3867 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3868 if (ib.size)
3869 tu_cs_emit_ib(cs, &ib);
3870
3871 tu_emit_compute_driver_params(cs, pipeline, info);
3872
3873    bool needs_border = false;
3874 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3875 MESA_SHADER_COMPUTE, &ib, &needs_border, false);
3876 if (result != VK_SUCCESS) {
3877 cmd->record_result = result;
3878 return;
3879 }
3880
3881 if (ib.size)
3882 tu_cs_emit_ib(cs, &ib);
3883
3884 if (needs_border)
3885 tu_finishme("compute border color");
3886
3887 result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
3888 if (result != VK_SUCCESS) {
3889 cmd->record_result = result;
3890 return;
3891 }
3892
3893 if (ib.size)
3894 tu_cs_emit_ib(cs, &ib);
3895
3896 /* track BOs */
3897 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3898 unsigned i;
3899 for_each_bit(i, descriptors_state->valid) {
3900 struct tu_descriptor_set *set = descriptors_state->sets[i];
3901 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3902 if (set->descriptors[j]) {
3903 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3904 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3905 }
3906 }
3907 }
3908
3909 /* Compute shader state overwrites fragment shader state, so we flag the
3910 * graphics pipeline for re-emit.
3911 */
3912 cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
3913
3914 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3915 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3916
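   /* HLSQ_CS_NDRANGE takes the total global work size per dimension
    * (local size * workgroup count).  TODO: info->offsets is not applied
    * yet; the global offsets below are hard-coded to 0.
    */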
3917 const uint32_t *local_size = pipeline->compute.local_size;
3918 const uint32_t *num_groups = info->blocks;
3919 tu_cs_emit_regs(cs,
3920 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3921 .localsizex = local_size[0] - 1,
3922 .localsizey = local_size[1] - 1,
3923 .localsizez = local_size[2] - 1),
3924 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3925 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3926 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3927 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3928 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3929 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3930
3931 tu_cs_emit_regs(cs,
3932 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3933 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3934 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3935
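   /* For indirect dispatches the CP reads the three workgroup counts from
    * the indirect buffer itself, so only the buffer iova and the local size
    * are emitted here.
    */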
3936 if (info->indirect) {
3937 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3938
3939 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3940 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3941
3942 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3943 tu_cs_emit(cs, 0x00000000);
3944 tu_cs_emit_qw(cs, iova);
3945 tu_cs_emit(cs,
3946 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3947 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3948 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3949 } else {
3950 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3951 tu_cs_emit(cs, 0x00000000);
3952 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3953 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3954 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3955 }
3956
3957 tu_cs_emit_wfi(cs);
3958
3959 tu6_emit_cache_flush(cmd, cs);
3960 }
3961
3962 void
3963 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3964 uint32_t base_x,
3965 uint32_t base_y,
3966 uint32_t base_z,
3967 uint32_t x,
3968 uint32_t y,
3969 uint32_t z)
3970 {
3971 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3972 struct tu_dispatch_info info = {};
3973
3974 info.blocks[0] = x;
3975 info.blocks[1] = y;
3976 info.blocks[2] = z;
3977
3978 info.offsets[0] = base_x;
3979 info.offsets[1] = base_y;
3980 info.offsets[2] = base_z;
3981 tu_dispatch(cmd_buffer, &info);
3982 }
3983
3984 void
3985 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3986 uint32_t x,
3987 uint32_t y,
3988 uint32_t z)
3989 {
3990 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3991 }
3992
3993 void
3994 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3995 VkBuffer _buffer,
3996 VkDeviceSize offset)
3997 {
3998 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3999 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
4000 struct tu_dispatch_info info = {};
4001
4002 info.indirect = buffer;
4003 info.indirect_offset = offset;
4004
4005 tu_dispatch(cmd_buffer, &info);
4006 }
4007
4008 void
4009 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
4010 {
4011 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4012
4013 tu_cs_end(&cmd_buffer->draw_cs);
4014 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
4015
4016 if (use_sysmem_rendering(cmd_buffer))
4017 tu_cmd_render_sysmem(cmd_buffer);
4018 else
4019 tu_cmd_render_tiles(cmd_buffer);
4020
4021    /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
4022     * rendered */
4023 tu_cs_discard_entries(&cmd_buffer->draw_cs);
4024 tu_cs_begin(&cmd_buffer->draw_cs);
4025 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
4026 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
4027
4028 cmd_buffer->state.pass = NULL;
4029 cmd_buffer->state.subpass = NULL;
4030 cmd_buffer->state.framebuffer = NULL;
4031 }
4032
4033 void
4034 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4035 const VkSubpassEndInfoKHR *pSubpassEndInfo)
4036 {
4037 tu_CmdEndRenderPass(commandBuffer);
4038 }
4039
4040 struct tu_barrier_info
4041 {
4042 uint32_t eventCount;
4043 const VkEvent *pEvents;
4044 VkPipelineStageFlags srcStageMask;
4045 };
4046
4047 static void
4048 tu_barrier(struct tu_cmd_buffer *cmd_buffer,
4049 uint32_t memoryBarrierCount,
4050 const VkMemoryBarrier *pMemoryBarriers,
4051 uint32_t bufferMemoryBarrierCount,
4052 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4053 uint32_t imageMemoryBarrierCount,
4054 const VkImageMemoryBarrier *pImageMemoryBarriers,
4055 const struct tu_barrier_info *info)
4056 {
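   /* TODO: barriers are currently a no-op; the required cache flushes and
    * invalidations still need to be implemented.
    */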
4057 }
4058
4059 void
4060 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4061 VkPipelineStageFlags srcStageMask,
4062 VkPipelineStageFlags destStageMask,
4063 VkBool32 byRegion,
4064 uint32_t memoryBarrierCount,
4065 const VkMemoryBarrier *pMemoryBarriers,
4066 uint32_t bufferMemoryBarrierCount,
4067 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4068 uint32_t imageMemoryBarrierCount,
4069 const VkImageMemoryBarrier *pImageMemoryBarriers)
4070 {
4071 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4072 struct tu_barrier_info info;
4073
4074 info.eventCount = 0;
4075 info.pEvents = NULL;
4076 info.srcStageMask = srcStageMask;
4077
4078 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4079 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4080 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4081 }
4082
4083 static void
4084 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value)
4085 {
4086 struct tu_cs *cs = &cmd->cs;
4087
4088 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4089
4090    /* TODO: any flush required before/after? */
4091
4092 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4093 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4094 tu_cs_emit(cs, value);
4095 }
4096
4097 void
4098 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4099 VkEvent _event,
4100 VkPipelineStageFlags stageMask)
4101 {
4102 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4103 TU_FROM_HANDLE(tu_event, event, _event);
4104
4105 write_event(cmd, event, 1);
4106 }
4107
4108 void
4109 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4110 VkEvent _event,
4111 VkPipelineStageFlags stageMask)
4112 {
4113 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4114 TU_FROM_HANDLE(tu_event, event, _event);
4115
4116 write_event(cmd, event, 0);
4117 }
4118
4119 void
4120 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4121 uint32_t eventCount,
4122 const VkEvent *pEvents,
4123 VkPipelineStageFlags srcStageMask,
4124 VkPipelineStageFlags dstStageMask,
4125 uint32_t memoryBarrierCount,
4126 const VkMemoryBarrier *pMemoryBarriers,
4127 uint32_t bufferMemoryBarrierCount,
4128 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4129 uint32_t imageMemoryBarrierCount,
4130 const VkImageMemoryBarrier *pImageMemoryBarriers)
4131 {
4132 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4133 struct tu_cs *cs = &cmd->cs;
4134
4135 /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */
4136
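   /* Stall the CP on each event: poll the event BO until it reads back the
    * value 1 written by tu_CmdSetEvent, re-checking every 20 cycles.
    */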
4137 for (uint32_t i = 0; i < eventCount; i++) {
4138 TU_FROM_HANDLE(tu_event, event, pEvents[i]);
4139
4140 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4141
4142 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4143 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4144 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4145 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4146 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4147 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4148 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4149 }
4150 }
4151
4152 void
4153 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4154 {
4155 /* No-op */
4156 }