turnip: fix hw binning + render_area offset interaction
[mesa.git] src/freedreno/vulkan/tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36 #include "tu_blit.h"
37
38 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
39
40 void
41 tu_bo_list_init(struct tu_bo_list *list)
42 {
43 list->count = list->capacity = 0;
44 list->bo_infos = NULL;
45 }
46
47 void
48 tu_bo_list_destroy(struct tu_bo_list *list)
49 {
50 free(list->bo_infos);
51 }
52
53 void
54 tu_bo_list_reset(struct tu_bo_list *list)
55 {
56 list->count = 0;
57 }
58
59 /**
60 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
61 */
62 static uint32_t
63 tu_bo_list_add_info(struct tu_bo_list *list,
64 const struct drm_msm_gem_submit_bo *bo_info)
65 {
66 assert(bo_info->handle != 0);
67
68 for (uint32_t i = 0; i < list->count; ++i) {
69 if (list->bo_infos[i].handle == bo_info->handle) {
70 assert(list->bo_infos[i].presumed == bo_info->presumed);
71 list->bo_infos[i].flags |= bo_info->flags;
72 return i;
73 }
74 }
75
76 /* grow list->bo_infos if needed */
77 if (list->count == list->capacity) {
78 uint32_t new_capacity = MAX2(2 * list->count, 16);
79 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
80 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
81 if (!new_bo_infos)
82 return TU_BO_LIST_FAILED;
83 list->bo_infos = new_bo_infos;
84 list->capacity = new_capacity;
85 }
86
87 list->bo_infos[list->count] = *bo_info;
88 return list->count++;
89 }
90
91 uint32_t
92 tu_bo_list_add(struct tu_bo_list *list,
93 const struct tu_bo *bo,
94 uint32_t flags)
95 {
96 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
97 .flags = flags,
98 .handle = bo->gem_handle,
99 .presumed = bo->iova,
100 });
101 }
102
103 VkResult
104 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
105 {
106 for (uint32_t i = 0; i < other->count; i++) {
107 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
108 return VK_ERROR_OUT_OF_HOST_MEMORY;
109 }
110
111 return VK_SUCCESS;
112 }
113
114 static bool
115 is_linear_mipmapped(const struct tu_image_view *iview)
116 {
117 return iview->image->layout.tile_mode == TILE6_LINEAR &&
118 iview->base_mip != iview->image->level_count - 1;
119 }
120
121 static bool
122 force_sysmem(const struct tu_cmd_buffer *cmd,
123 const struct VkRect2D *render_area)
124 {
125 const struct tu_framebuffer *fb = cmd->state.framebuffer;
126 const struct tu_physical_device *device = cmd->device->physical_device;
127 bool has_linear_mipmapped_store = false;
128 const struct tu_render_pass *pass = cmd->state.pass;
129
130 /* Iterate over all the places we call tu6_emit_store_attachment() */
131 for (unsigned i = 0; i < pass->subpass_count; i++) {
132 const struct tu_subpass *subpass = &pass->subpasses[i];
133 if (subpass->resolve_attachments) {
134 for (unsigned i = 0; i < subpass->color_count; i++) {
135 uint32_t a = subpass->resolve_attachments[i].attachment;
136 if (a != VK_ATTACHMENT_UNUSED &&
137 cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
138 const struct tu_image_view *iview = fb->attachments[a].attachment;
139 if (is_linear_mipmapped(iview)) {
140 has_linear_mipmapped_store = true;
141 break;
142 }
143 }
144 }
145 }
146 }
147
148 for (unsigned i = 0; i < pass->attachment_count; i++) {
149 if (pass->attachments[i].gmem_offset >= 0 &&
150 cmd->state.pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
151 const struct tu_image_view *iview = fb->attachments[i].attachment;
152 if (is_linear_mipmapped(iview)) {
153 has_linear_mipmapped_store = true;
154 break;
155 }
156 }
157 }
158
159 /* Linear textures cannot have any padding between mipmap levels and their
160 * height isn't padded, while at the same time the GMEM->MEM resolve does
161 * not have per-pixel granularity, so if the image height isn't aligned to
162 * the resolve granularity and the render area is tall enough, we may wind
163 * up writing past the bottom of the image into the next miplevel or even
164 * past the end of the image. For the last miplevel, the layout code should
165 * insert enough padding so that the overdraw writes to the padding. To
166 * work around this, we force-enable sysmem rendering.
167 */
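/* For example (illustrative numbers only): if tile_align_h were 16 and a
 * 100px-tall framebuffer/miplevel had a render area reaching y = 100,
 * aligned_y2 below would be 112 > fb->height, so the resolve could write
 * 12 rows past the miplevel and we fall back to sysmem.
 */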
168 const uint32_t y2 = render_area->offset.y + render_area->extent.height;
169 const uint32_t aligned_y2 = ALIGN_POT(y2, device->tile_align_h);
170
171 return has_linear_mipmapped_store && aligned_y2 > fb->height;
172 }
173
174 static void
175 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
176 const struct tu_device *dev,
177 uint32_t pixels)
178 {
179 const uint32_t tile_align_w = dev->physical_device->tile_align_w;
180 const uint32_t tile_align_h = dev->physical_device->tile_align_h;
181 const uint32_t max_tile_width = 1024; /* A6xx */
182
183 /* note: don't offset the tiling config by render_area.offset,
184 * because the binning pass can't deal with it.
185 * This means we might end up with more tiles than necessary,
186 * but load/store/etc. are still scissored to the render_area.
187 */
188 tiling->tile0.offset = (VkOffset2D) {};
189
190 const uint32_t ra_width =
191 tiling->render_area.extent.width +
192 (tiling->render_area.offset.x - tiling->tile0.offset.x);
193 const uint32_t ra_height =
194 tiling->render_area.extent.height +
195 (tiling->render_area.offset.y - tiling->tile0.offset.y);
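/* Since tile0.offset is (0, 0), ra_width/ra_height span from the framebuffer
 * origin to the far edge of the render area: e.g. a 300x200 render area at
 * offset (100, 50) gives ra_width = 400 and ra_height = 250, and the tiles
 * below are sized to cover that whole span.
 */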
196
197 /* start from 1 tile */
198 tiling->tile_count = (VkExtent2D) {
199 .width = 1,
200 .height = 1,
201 };
202 tiling->tile0.extent = (VkExtent2D) {
203 .width = align(ra_width, tile_align_w),
204 .height = align(ra_height, tile_align_h),
205 };
206
207 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
208 /* start with 2x2 tiles */
209 tiling->tile_count.width = 2;
210 tiling->tile_count.height = 2;
211 tiling->tile0.extent.width = align(DIV_ROUND_UP(ra_width, 2), tile_align_w);
212 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), tile_align_h);
213 }
214
215 /* do not exceed max tile width */
216 while (tiling->tile0.extent.width > max_tile_width) {
217 tiling->tile_count.width++;
218 tiling->tile0.extent.width =
219 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
220 }
221
222 /* do not exceed gmem size */
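/* ('pixels' is the gmem budget in pixels for this pass; the loop below
 * roughly splits whichever tile dimension is currently the larger one,
 * so tiles shrink toward square until they fit.)
 */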
223 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pixels) {
224 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
225 tiling->tile_count.width++;
226 tiling->tile0.extent.width =
227 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
228 } else {
229 /* if this assert fails then layout is impossible.. */
230 assert(tiling->tile0.extent.height > tile_align_h);
231 tiling->tile_count.height++;
232 tiling->tile0.extent.height =
233 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), tile_align_h);
234 }
235 }
236 }
237
238 static void
239 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
240 const struct tu_device *dev)
241 {
242 const uint32_t max_pipe_count = 32; /* A6xx */
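/* Each VSC pipe covers a pipe0.width x pipe0.height rectangle of tiles and
 * gets its own visibility stream from the binning pass (see update_vsc_pipe);
 * a6xx exposes at most 32 pipes, so the loops below grow pipe0 until the
 * pipe grid fits.
 */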
243
244 /* start from 1 tile per pipe */
245 tiling->pipe0 = (VkExtent2D) {
246 .width = 1,
247 .height = 1,
248 };
249 tiling->pipe_count = tiling->tile_count;
250
251 /* do not exceed max pipe count vertically */
252 while (tiling->pipe_count.height > max_pipe_count) {
253 tiling->pipe0.height += 2;
254 tiling->pipe_count.height =
255 (tiling->tile_count.height + tiling->pipe0.height - 1) /
256 tiling->pipe0.height;
257 }
258
259 /* do not exceed max pipe count */
260 while (tiling->pipe_count.width * tiling->pipe_count.height >
261 max_pipe_count) {
262 tiling->pipe0.width += 1;
263 tiling->pipe_count.width =
264 (tiling->tile_count.width + tiling->pipe0.width - 1) /
265 tiling->pipe0.width;
266 }
267 }
268
269 static void
270 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
271 const struct tu_device *dev)
272 {
273 const uint32_t max_pipe_count = 32; /* A6xx */
274 const uint32_t used_pipe_count =
275 tiling->pipe_count.width * tiling->pipe_count.height;
276 const VkExtent2D last_pipe = {
277 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
278 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
279 };
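/* Pipes in the last row/column may cover fewer tiles, since tile_count is
 * not necessarily a multiple of pipe0.width/height.
 */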
280
281 assert(used_pipe_count <= max_pipe_count);
282 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
283
284 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
285 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
286 const uint32_t pipe_x = tiling->pipe0.width * x;
287 const uint32_t pipe_y = tiling->pipe0.height * y;
288 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
289 ? last_pipe.width
290 : tiling->pipe0.width;
291 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
292 ? last_pipe.height
293 : tiling->pipe0.height;
294 const uint32_t n = tiling->pipe_count.width * y + x;
295
296 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
297 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
298 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
299 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
300 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
301 }
302 }
303
304 memset(tiling->pipe_config + used_pipe_count, 0,
305 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
306 }
307
308 static void
309 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
310 const struct tu_device *dev,
311 uint32_t tx,
312 uint32_t ty,
313 struct tu_tile *tile)
314 {
315 /* find the pipe and the slot for tile (tx, ty) */
316 const uint32_t px = tx / tiling->pipe0.width;
317 const uint32_t py = ty / tiling->pipe0.height;
318 const uint32_t sx = tx - tiling->pipe0.width * px;
319 const uint32_t sy = ty - tiling->pipe0.height * py;
320
321 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
322 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
323 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
324
325 /* convert to 1D indices */
326 tile->pipe = tiling->pipe_count.width * py + px;
327 tile->slot = tiling->pipe0.width * sy + sx;
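/* e.g. with pipe0 = 2x2 and pipe_count.width = 4, tile (5, 3) maps to
 * px = 2, py = 1 -> pipe 6, and sx = 1, sy = 1 -> slot 3.
 */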
328
329 /* get the blit area for the tile */
330 tile->begin = (VkOffset2D) {
331 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
332 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
333 };
334 tile->end.x =
335 (tx == tiling->tile_count.width - 1)
336 ? tiling->render_area.offset.x + tiling->render_area.extent.width
337 : tile->begin.x + tiling->tile0.extent.width;
338 tile->end.y =
339 (ty == tiling->tile_count.height - 1)
340 ? tiling->render_area.offset.y + tiling->render_area.extent.height
341 : tile->begin.y + tiling->tile0.extent.height;
342 }
343
344 enum a3xx_msaa_samples
345 tu_msaa_samples(uint32_t samples)
346 {
347 switch (samples) {
348 case 1:
349 return MSAA_ONE;
350 case 2:
351 return MSAA_TWO;
352 case 4:
353 return MSAA_FOUR;
354 case 8:
355 return MSAA_EIGHT;
356 default:
357 assert(!"invalid sample count");
358 return MSAA_ONE;
359 }
360 }
361
362 static enum a4xx_index_size
363 tu6_index_size(VkIndexType type)
364 {
365 switch (type) {
366 case VK_INDEX_TYPE_UINT16:
367 return INDEX4_SIZE_16_BIT;
368 case VK_INDEX_TYPE_UINT32:
369 return INDEX4_SIZE_32_BIT;
370 default:
371 unreachable("invalid VkIndexType");
372 return INDEX4_SIZE_8_BIT;
373 }
374 }
375
376 unsigned
377 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
378 struct tu_cs *cs,
379 enum vgt_event_type event,
380 bool need_seqno)
381 {
382 unsigned seqno = 0;
383
384 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
385 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
386 if (need_seqno) {
387 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
388 seqno = ++cmd->scratch_seqno;
389 tu_cs_emit(cs, seqno);
390 }
391
392 return seqno;
393 }
394
395 static void
396 tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
397 {
398 tu6_emit_event_write(cmd, cs, 0x31, false);
399 }
400
401 static void
402 tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
403 {
404 tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false);
405 }
406
407 static void
408 tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
409 {
410 if (cmd->wait_for_idle) {
411 tu_cs_emit_wfi(cs);
412 cmd->wait_for_idle = false;
413 }
414 }
415
416 #define tu_image_view_ubwc_pitches(iview) \
417 .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \
418 .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2
419
420 static void
421 tu6_emit_zs(struct tu_cmd_buffer *cmd,
422 const struct tu_subpass *subpass,
423 struct tu_cs *cs)
424 {
425 const struct tu_framebuffer *fb = cmd->state.framebuffer;
426
427 const uint32_t a = subpass->depth_stencil_attachment.attachment;
428 if (a == VK_ATTACHMENT_UNUSED) {
429 tu_cs_emit_regs(cs,
430 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
431 A6XX_RB_DEPTH_BUFFER_PITCH(0),
432 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
433 A6XX_RB_DEPTH_BUFFER_BASE(0),
434 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
435
436 tu_cs_emit_regs(cs,
437 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
438
439 tu_cs_emit_regs(cs,
440 A6XX_GRAS_LRZ_BUFFER_BASE(0),
441 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
442 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
443
444 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
445
446 return;
447 }
448
449 const struct tu_image_view *iview = fb->attachments[a].attachment;
450 enum a6xx_depth_format fmt = tu6_pipe2depth(iview->vk_format);
451
452 tu_cs_emit_regs(cs,
453 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt),
454 A6XX_RB_DEPTH_BUFFER_PITCH(tu_image_stride(iview->image, iview->base_mip)),
455 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(iview->image->layout.layer_size),
456 A6XX_RB_DEPTH_BUFFER_BASE(tu_image_view_base_ref(iview)),
457 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(cmd->state.pass->attachments[a].gmem_offset));
458
459 tu_cs_emit_regs(cs,
460 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
461
462 tu_cs_emit_regs(cs,
463 A6XX_RB_DEPTH_FLAG_BUFFER_BASE(tu_image_view_ubwc_base_ref(iview)),
464 A6XX_RB_DEPTH_FLAG_BUFFER_PITCH(tu_image_view_ubwc_pitches(iview)));
465
466 tu_cs_emit_regs(cs,
467 A6XX_GRAS_LRZ_BUFFER_BASE(0),
468 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
469 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
470
471 tu_cs_emit_regs(cs,
472 A6XX_RB_STENCIL_INFO(0));
473
474 /* enable zs? */
475 }
476
477 static void
478 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
479 const struct tu_subpass *subpass,
480 struct tu_cs *cs)
481 {
482 const struct tu_framebuffer *fb = cmd->state.framebuffer;
483 unsigned char mrt_comp[MAX_RTS] = { 0 };
484 unsigned srgb_cntl = 0;
485
486 for (uint32_t i = 0; i < subpass->color_count; ++i) {
487 uint32_t a = subpass->color_attachments[i].attachment;
488 if (a == VK_ATTACHMENT_UNUSED)
489 continue;
490
491 const struct tu_image_view *iview = fb->attachments[a].attachment;
492 const enum a6xx_tile_mode tile_mode =
493 tu6_get_image_tile_mode(iview->image, iview->base_mip);
494
495 mrt_comp[i] = 0xf;
496
497 if (vk_format_is_srgb(iview->vk_format))
498 srgb_cntl |= (1 << i);
499
500 const struct tu_native_format format =
501 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
502
503 tu_cs_emit_regs(cs,
504 A6XX_RB_MRT_BUF_INFO(i,
505 .color_tile_mode = tile_mode,
506 .color_format = format.fmt,
507 .color_swap = format.swap),
508 A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)),
509 A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size),
510 A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)),
511 A6XX_RB_MRT_BASE_GMEM(i, cmd->state.pass->attachments[a].gmem_offset));
512
513 tu_cs_emit_regs(cs,
514 A6XX_SP_FS_MRT_REG(i,
515 .color_format = format.fmt,
516 .color_sint = vk_format_is_sint(iview->vk_format),
517 .color_uint = vk_format_is_uint(iview->vk_format)));
518
519 tu_cs_emit_regs(cs,
520 A6XX_RB_MRT_FLAG_BUFFER_ADDR(i, tu_image_view_ubwc_base_ref(iview)),
521 A6XX_RB_MRT_FLAG_BUFFER_PITCH(i, tu_image_view_ubwc_pitches(iview)));
522 }
523
524 tu_cs_emit_regs(cs,
525 A6XX_RB_SRGB_CNTL(srgb_cntl));
526
527 tu_cs_emit_regs(cs,
528 A6XX_SP_SRGB_CNTL(srgb_cntl));
529
530 tu_cs_emit_regs(cs,
531 A6XX_RB_RENDER_COMPONENTS(
532 .rt0 = mrt_comp[0],
533 .rt1 = mrt_comp[1],
534 .rt2 = mrt_comp[2],
535 .rt3 = mrt_comp[3],
536 .rt4 = mrt_comp[4],
537 .rt5 = mrt_comp[5],
538 .rt6 = mrt_comp[6],
539 .rt7 = mrt_comp[7]));
540
541 tu_cs_emit_regs(cs,
542 A6XX_SP_FS_RENDER_COMPONENTS(
543 .rt0 = mrt_comp[0],
544 .rt1 = mrt_comp[1],
545 .rt2 = mrt_comp[2],
546 .rt3 = mrt_comp[3],
547 .rt4 = mrt_comp[4],
548 .rt5 = mrt_comp[5],
549 .rt6 = mrt_comp[6],
550 .rt7 = mrt_comp[7]));
551 }
552
553 static void
554 tu6_emit_msaa(struct tu_cmd_buffer *cmd,
555 const struct tu_subpass *subpass,
556 struct tu_cs *cs)
557 {
558 const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples);
559 bool msaa_disable = samples == MSAA_ONE;
560
561 tu_cs_emit_regs(cs,
562 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
563 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
564 .msaa_disable = msaa_disable));
565
566 tu_cs_emit_regs(cs,
567 A6XX_GRAS_RAS_MSAA_CNTL(samples),
568 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
569 .msaa_disable = msaa_disable));
570
571 tu_cs_emit_regs(cs,
572 A6XX_RB_RAS_MSAA_CNTL(samples),
573 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
574 .msaa_disable = msaa_disable));
575
576 tu_cs_emit_regs(cs,
577 A6XX_RB_MSAA_CNTL(samples));
578 }
579
580 static void
581 tu6_emit_bin_size(struct tu_cs *cs,
582 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
583 {
584 tu_cs_emit_regs(cs,
585 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
586 .binh = bin_h,
587 .dword = flags));
588
589 tu_cs_emit_regs(cs,
590 A6XX_RB_BIN_CONTROL(.binw = bin_w,
591 .binh = bin_h,
592 .dword = flags));
593
594 /* no flag for RB_BIN_CONTROL2... */
595 tu_cs_emit_regs(cs,
596 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
597 .binh = bin_h));
598 }
599
600 static void
601 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
602 const struct tu_subpass *subpass,
603 struct tu_cs *cs,
604 bool binning)
605 {
606 const struct tu_framebuffer *fb = cmd->state.framebuffer;
607 uint32_t cntl = 0;
608 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
609 if (binning) {
610 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
611 } else {
612 uint32_t mrts_ubwc_enable = 0;
613 for (uint32_t i = 0; i < subpass->color_count; ++i) {
614 uint32_t a = subpass->color_attachments[i].attachment;
615 if (a == VK_ATTACHMENT_UNUSED)
616 continue;
617
618 const struct tu_image_view *iview = fb->attachments[a].attachment;
619 if (iview->image->layout.ubwc_layer_size != 0)
620 mrts_ubwc_enable |= 1 << i;
621 }
622
623 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
624
625 const uint32_t a = subpass->depth_stencil_attachment.attachment;
626 if (a != VK_ATTACHMENT_UNUSED) {
627 const struct tu_image_view *iview = fb->attachments[a].attachment;
628 if (iview->image->layout.ubwc_layer_size != 0)
629 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
630 }
631
632 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
633 * in order to set it correctly for the different subpasses. However,
634 * that means the packets we're emitting also happen during binning. So
635 * we need to guard the write on !BINNING at CP execution time.
636 */
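/* CP_COND_REG_EXEC_1_DWORDS(4) must cover exactly the CP_REG_WRITE packet
 * emitted below (1 header + 3 payload dwords), and tu_cs_reserve() keeps
 * the predicate and the dwords it may skip contiguous in the same chunk.
 */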
637 tu_cs_reserve(cs, 3 + 4);
638 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
639 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
640 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
641 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
642 }
643
644 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
645 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
646 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
647 tu_cs_emit(cs, cntl);
648 }
649
650 static void
651 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
652 {
653 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
654 uint32_t x1 = render_area->offset.x;
655 uint32_t y1 = render_area->offset.y;
656 uint32_t x2 = x1 + render_area->extent.width - 1;
657 uint32_t y2 = y1 + render_area->extent.height - 1;
658
659 /* TODO: alignment requirement seems to be less than tile_align_w/h */
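/* The aligned variant is used for GMEM loads/stores (tu_emit_load_clear,
 * tu6_emit_tile_store), which seem to want tile-aligned regions; clears use
 * the unaligned render area.
 */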
660 if (align) {
661 x1 = x1 & ~cmd->device->physical_device->tile_align_w;
662 y1 = y1 & ~cmd->device->physical_device->tile_align_h;
663 x2 = ALIGN_POT(x2 + 1, cmd->device->physical_device->tile_align_w) - 1;
664 y2 = ALIGN_POT(y2 + 1, cmd->device->physical_device->tile_align_h) - 1;
665 }
666
667 tu_cs_emit_regs(cs,
668 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
669 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
670 }
671
672 static void
673 tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
674 struct tu_cs *cs,
675 const struct tu_image_view *iview,
676 uint32_t gmem_offset,
677 bool resolve)
678 {
679 tu_cs_emit_regs(cs,
680 A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve));
681
682 const struct tu_native_format format =
683 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
684
685 enum a6xx_tile_mode tile_mode =
686 tu6_get_image_tile_mode(iview->image, iview->base_mip);
687 tu_cs_emit_regs(cs,
688 A6XX_RB_BLIT_DST_INFO(
689 .tile_mode = tile_mode,
690 .samples = tu_msaa_samples(iview->image->samples),
691 .color_format = format.fmt,
692 .color_swap = format.swap,
693 .flags = iview->image->layout.ubwc_layer_size != 0),
694 A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
695 A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
696 A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size));
697
698 if (iview->image->layout.ubwc_layer_size) {
699 tu_cs_emit_regs(cs,
700 A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)),
701 A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview)));
702 }
703
704 tu_cs_emit_regs(cs,
705 A6XX_RB_BLIT_BASE_GMEM(gmem_offset));
706 }
707
708 static void
709 tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
710 {
711 tu6_emit_event_write(cmd, cs, BLIT, false);
712 }
713
714 static void
715 tu6_emit_window_scissor(struct tu_cmd_buffer *cmd,
716 struct tu_cs *cs,
717 uint32_t x1,
718 uint32_t y1,
719 uint32_t x2,
720 uint32_t y2)
721 {
722 tu_cs_emit_regs(cs,
723 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
724 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
725
726 tu_cs_emit_regs(cs,
727 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
728 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
729 }
730
731 static void
732 tu6_emit_window_offset(struct tu_cmd_buffer *cmd,
733 struct tu_cs *cs,
734 uint32_t x1,
735 uint32_t y1)
736 {
737 tu_cs_emit_regs(cs,
738 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
739
740 tu_cs_emit_regs(cs,
741 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
742
743 tu_cs_emit_regs(cs,
744 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
745
746 tu_cs_emit_regs(cs,
747 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
748 }
749
750 static bool
751 use_hw_binning(struct tu_cmd_buffer *cmd)
752 {
753 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
754
755 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
756 return false;
757
758 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
759 return true;
760
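/* hw binning presumably only pays off once there are enough tiles for the
 * visibility streams to skip a meaningful amount of work: */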
761 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
762 }
763
764 static bool
765 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
766 {
767 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
768 return true;
769
770 return cmd->state.tiling_config.force_sysmem;
771 }
772
773 static void
774 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
775 struct tu_cs *cs,
776 const struct tu_tile *tile)
777 {
778 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
779 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
780
781 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
782 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
783
784 const uint32_t x1 = tile->begin.x;
785 const uint32_t y1 = tile->begin.y;
786 const uint32_t x2 = tile->end.x - 1;
787 const uint32_t y2 = tile->end.y - 1;
788 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
789 tu6_emit_window_offset(cmd, cs, x1, y1);
790
791 tu_cs_emit_regs(cs,
792 A6XX_VPC_SO_OVERRIDE(.so_disable = true));
793
794 if (use_hw_binning(cmd)) {
795 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
796
797 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
798 tu_cs_emit(cs, 0x0);
799
800 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
801 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
802 A6XX_CP_REG_TEST_0_BIT(0) |
803 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
804
805 tu_cs_reserve(cs, 3 + 11);
806 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
807 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
808 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
809
810 /* if (no overflow) */ {
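/* point the hardware at this tile's visibility data: the per-pipe stream in
 * vsc_data, the per-pipe stream size (4 bytes each, stored after the 32
 * pipes' data at 32 * vsc_data_pitch, see VSC_SIZE_ADDRESS in
 * update_vsc_pipe), and the secondary stream in vsc_data2: */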
811 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
812 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
813 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
814 tu_cs_emit_qw(cs, cmd->vsc_data.iova + tile->pipe * cmd->vsc_data_pitch);
815 tu_cs_emit_qw(cs, cmd->vsc_data.iova + (tile->pipe * 4) + (32 * cmd->vsc_data_pitch));
816 tu_cs_emit_qw(cs, cmd->vsc_data2.iova + (tile->pipe * cmd->vsc_data2_pitch));
817
818 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
819 tu_cs_emit(cs, 0x0);
820
821 /* use a NOP packet to skip over the 'else' side: */
822 tu_cs_emit_pkt7(cs, CP_NOP, 2);
823 } /* else */ {
824 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
825 tu_cs_emit(cs, 0x1);
826 }
827
828 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
829 tu_cs_emit(cs, 0x0);
830
831 tu_cs_emit_regs(cs,
832 A6XX_RB_UNKNOWN_8804(0));
833
834 tu_cs_emit_regs(cs,
835 A6XX_SP_TP_UNKNOWN_B304(0));
836
837 tu_cs_emit_regs(cs,
838 A6XX_GRAS_UNKNOWN_80A4(0));
839 } else {
840 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
841 tu_cs_emit(cs, 0x1);
842
843 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
844 tu_cs_emit(cs, 0x0);
845 }
846 }
847
848 static void
849 tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
850 {
851 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
852 const struct tu_framebuffer *fb = cmd->state.framebuffer;
853 const struct tu_image_view *iview = fb->attachments[a].attachment;
854 const struct tu_render_pass_attachment *attachment =
855 &cmd->state.pass->attachments[a];
856
857 if (attachment->gmem_offset < 0)
858 return;
859
860 const uint32_t x1 = tiling->render_area.offset.x;
861 const uint32_t y1 = tiling->render_area.offset.y;
862 const uint32_t x2 = x1 + tiling->render_area.extent.width;
863 const uint32_t y2 = y1 + tiling->render_area.extent.height;
864 const uint32_t tile_x2 =
865 tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width;
866 const uint32_t tile_y2 =
867 tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height;
868 bool need_load =
869 x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) ||
870 y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2);
871
872 if (need_load)
873 tu_finishme("improve handling of unaligned render area");
874
875 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
876 need_load = true;
877
878 if (vk_format_has_stencil(iview->vk_format) &&
879 attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
880 need_load = true;
881
882 if (need_load) {
883 tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false);
884 tu6_emit_blit(cmd, cs);
885 }
886 }
887
888 static void
889 tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
890 uint32_t a,
891 const VkRenderPassBeginInfo *info)
892 {
893 const struct tu_framebuffer *fb = cmd->state.framebuffer;
894 const struct tu_image_view *iview = fb->attachments[a].attachment;
895 const struct tu_render_pass_attachment *attachment =
896 &cmd->state.pass->attachments[a];
897 unsigned clear_mask = 0;
898
899 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
900 if (attachment->gmem_offset < 0)
901 return;
902
903 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
904 clear_mask = 0xf;
905
906 if (vk_format_has_stencil(iview->vk_format)) {
907 clear_mask &= 0x1;
908 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
909 clear_mask |= 0x2;
910 }
911 if (!clear_mask)
912 return;
913
914 tu_clear_gmem_attachment(cmd, cs, a, clear_mask,
915 &info->pClearValues[a]);
916 }
917
918 static void
919 tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd,
920 struct tu_cs *cs,
921 uint32_t a,
922 uint32_t gmem_a,
923 bool resolve)
924 {
925 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
926
927 tu6_emit_blit_info(cmd, cs,
928 cmd->state.framebuffer->attachments[a].attachment,
929 cmd->state.pass->attachments[gmem_a].gmem_offset, resolve);
930 tu6_emit_blit(cmd, cs);
931
932 tu_cond_exec_end(cs);
933 }
934
935 static void
936 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
937 struct tu_cs *cs,
938 uint32_t a,
939 uint32_t gmem_a)
940 {
941 const struct tu_framebuffer *fb = cmd->state.framebuffer;
942 const struct tu_image_view *dst = fb->attachments[a].attachment;
943 const struct tu_image_view *src = fb->attachments[gmem_a].attachment;
944
945 tu_blit(cmd, cs, &(struct tu_blit) {
946 .dst = sysmem_attachment_surf(dst, dst->base_layer,
947 &cmd->state.tiling_config.render_area),
948 .src = sysmem_attachment_surf(src, src->base_layer,
949 &cmd->state.tiling_config.render_area),
950 .layers = fb->layers,
951 });
952 }
953
954
955 /* Emit a MSAA resolve operation, with both gmem and sysmem paths. */
956 static void tu6_emit_resolve(struct tu_cmd_buffer *cmd,
957 struct tu_cs *cs,
958 uint32_t a,
959 uint32_t gmem_a)
960 {
961 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
962 return;
963
964 tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true);
965
966 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
967 tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a);
968 tu_cond_exec_end(cs);
969 }
970
971 static void
972 tu6_emit_store_attachment(struct tu_cmd_buffer *cmd,
973 struct tu_cs *cs,
974 uint32_t a,
975 uint32_t gmem_a)
976 {
977 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
978 return;
979
980 tu6_emit_blit_info(cmd, cs,
981 cmd->state.framebuffer->attachments[a].attachment,
982 cmd->state.pass->attachments[gmem_a].gmem_offset, true);
983 tu6_emit_blit(cmd, cs);
984 }
985
986 static void
987 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
988 {
989 const struct tu_render_pass *pass = cmd->state.pass;
990 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
991
992 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
993 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
994 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
995 CP_SET_DRAW_STATE__0_GROUP_ID(0));
996 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
997 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
998
999 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1000 tu_cs_emit(cs, 0x0);
1001
1002 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1003 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
1004
1005 tu6_emit_blit_scissor(cmd, cs, true);
1006
1007 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
1008 if (pass->attachments[a].gmem_offset >= 0)
1009 tu6_emit_store_attachment(cmd, cs, a, a);
1010 }
1011
1012 if (subpass->resolve_attachments) {
1013 for (unsigned i = 0; i < subpass->color_count; i++) {
1014 uint32_t a = subpass->resolve_attachments[i].attachment;
1015 if (a != VK_ATTACHMENT_UNUSED)
1016 tu6_emit_store_attachment(cmd, cs, a,
1017 subpass->color_attachments[i].attachment);
1018 }
1019 }
1020 }
1021
1022 static void
1023 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
1024 {
1025 tu_cs_emit_regs(cs,
1026 A6XX_PC_RESTART_INDEX(restart_index));
1027 }
1028
1029 static void
1030 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1031 {
1032 tu6_emit_cache_flush(cmd, cs);
1033
1034 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
1035
1036 tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x10000000);
1037 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
1038 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
1039 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
1040 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
1041 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
1042 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
1043 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1044 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1045
1046 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
1047 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
1048 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
1049 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
1050 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
1051 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
1052 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
1053 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1054 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
1055 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
1056 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
1057 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
1058 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
1059 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
1060
1061 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1062 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1063 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
1064
1065 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
1066
1067 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
1068
1069 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
1070 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
1071 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
1072 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
1073 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
1074 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
1075 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
1076 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
1077 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
1078 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
1079 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
1080
1081 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
1082 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
1083
1084 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1);
1085 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
1086
1087 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
1088 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
1089
1090 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
1091 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
1092 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
1093 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
1094
1095 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
1096 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
1097
1098 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
1099
1100 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
1101
1102 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
1103 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
1104 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
1105 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1106 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
1107 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
1108 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
1109 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
1110 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
1111 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
1112 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0);
1113 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
1114 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0);
1115 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0);
1116 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0);
1117 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0);
1118 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0);
1119 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0);
1120 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
1121 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
1122 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1123
1124 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
1125
1126 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
1127
1128 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
1129
1130 /* we don't use this yet.. probably best to disable.. */
1131 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1132 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1133 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1134 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1135 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1136 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1137
1138 tu_cs_emit_regs(cs,
1139 A6XX_VPC_SO_BUFFER_BASE(0),
1140 A6XX_VPC_SO_BUFFER_SIZE(0));
1141
1142 tu_cs_emit_regs(cs,
1143 A6XX_VPC_SO_FLUSH_BASE(0));
1144
1145 tu_cs_emit_regs(cs,
1146 A6XX_VPC_SO_BUF_CNTL(0));
1147
1148 tu_cs_emit_regs(cs,
1149 A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
1150
1151 tu_cs_emit_regs(cs,
1152 A6XX_VPC_SO_BUFFER_BASE(1, 0),
1153 A6XX_VPC_SO_BUFFER_SIZE(1, 0));
1154
1155 tu_cs_emit_regs(cs,
1156 A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
1157 A6XX_VPC_SO_FLUSH_BASE(1, 0),
1158 A6XX_VPC_SO_BUFFER_BASE(2, 0),
1159 A6XX_VPC_SO_BUFFER_SIZE(2, 0));
1160
1161 tu_cs_emit_regs(cs,
1162 A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
1163 A6XX_VPC_SO_FLUSH_BASE(2, 0),
1164 A6XX_VPC_SO_BUFFER_BASE(3, 0),
1165 A6XX_VPC_SO_BUFFER_SIZE(3, 0));
1166
1167 tu_cs_emit_regs(cs,
1168 A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
1169 A6XX_VPC_SO_FLUSH_BASE(3, 0));
1170
1171 tu_cs_emit_regs(cs,
1172 A6XX_SP_HS_CTRL_REG0(0));
1173
1174 tu_cs_emit_regs(cs,
1175 A6XX_SP_GS_CTRL_REG0(0));
1176
1177 tu_cs_emit_regs(cs,
1178 A6XX_GRAS_LRZ_CNTL(0));
1179
1180 tu_cs_emit_regs(cs,
1181 A6XX_RB_LRZ_CNTL(0));
1182
1183 tu_cs_sanity_check(cs);
1184 }
1185
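/* Flush and invalidate caches, using fenced event writes to scratch_bo plus
 * CP waits so later commands only run once the flushes have landed.
 */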
1186 static void
1187 tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1188 {
1189 unsigned seqno;
1190
1191 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_AND_INV_EVENT, true);
1192
1193 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1194 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1195 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1196 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1197 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(seqno));
1198 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1199 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1200
1201 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1202
1203 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4);
1204 tu_cs_emit(cs, CP_WAIT_MEM_GTE_0_RESERVED(0));
1205 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1206 tu_cs_emit(cs, CP_WAIT_MEM_GTE_3_REF(seqno));
1207 }
1208
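/* Program the VSC: bin size/count, the 32 pipe configs and the two output
 * buffers. The hardware writes each pipe's visibility stream at
 * vsc_data + pipe * vsc_data_pitch, and the per-pipe stream sizes at
 * vsc_data + 32 * vsc_data_pitch (VSC_SIZE_ADDRESS below).
 */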
1209 static void
1210 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1211 {
1212 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1213
1214 tu_cs_emit_regs(cs,
1215 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1216 .height = tiling->tile0.extent.height),
1217 A6XX_VSC_SIZE_ADDRESS(.bo = &cmd->vsc_data,
1218 .bo_offset = 32 * cmd->vsc_data_pitch));
1219
1220 tu_cs_emit_regs(cs,
1221 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1222 .ny = tiling->tile_count.height));
1223
1224 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1225 for (unsigned i = 0; i < 32; i++)
1226 tu_cs_emit(cs, tiling->pipe_config[i]);
1227
1228 tu_cs_emit_regs(cs,
1229 A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = &cmd->vsc_data2),
1230 A6XX_VSC_PIPE_DATA2_PITCH(cmd->vsc_data2_pitch),
1231 A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(cmd->vsc_data2.size));
1232
1233 tu_cs_emit_regs(cs,
1234 A6XX_VSC_PIPE_DATA_ADDRESS(.bo = &cmd->vsc_data),
1235 A6XX_VSC_PIPE_DATA_PITCH(cmd->vsc_data_pitch),
1236 A6XX_VSC_PIPE_DATA_ARRAY_PITCH(cmd->vsc_data.size));
1237 }
1238
1239 static void
1240 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1241 {
1242 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1243 const uint32_t used_pipe_count =
1244 tiling->pipe_count.width * tiling->pipe_count.height;
1245
1246 /* Clear vsc_scratch: */
1247 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1248 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1249 tu_cs_emit(cs, 0x0);
1250
1251 /* Check for overflow, write vsc_scratch if detected: */
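/* The value written encodes which buffer overflowed in the low bits
 * (1 = VSC_DATA, 3 = VSC_DATA2) plus the current pitch, presumably so the
 * CPU side can tell how much to grow it (see check_vsc_overflow()).
 */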
1252 for (int i = 0; i < used_pipe_count; i++) {
1253 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1254 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1255 CP_COND_WRITE5_0_WRITE_MEMORY);
1256 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i)));
1257 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1258 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data_pitch));
1259 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1260 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1261 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_data_pitch));
1262
1263 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1264 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1265 CP_COND_WRITE5_0_WRITE_MEMORY);
1266 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i)));
1267 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1268 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data2_pitch));
1269 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1270 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1271 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_data2_pitch));
1272 }
1273
1274 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1275
1276 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1277
1278 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1279 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1280 CP_MEM_TO_REG_0_CNT(1 - 1));
1281 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1282
1283 /*
1284 * This is a bit awkward; we really want a way to invert the
1285 * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1286 * execute cmds to use hwbinning when a bit is *not* set. This
1287 * dance is to invert OVERFLOW_FLAG_REG
1288 *
1289 * A CP_NOP packet is used to skip executing the 'else' clause
1290 * if (b0 set)..
1291 */
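/* After this sequence OVERFLOW_FLAG_REG ends up as 0 on overflow and 1
 * otherwise, so the later CP_REG_TEST(bit 0) checks pass only when the
 * binning data is valid.
 */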
1292
1293 /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */
1294 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1295 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1296 A6XX_CP_REG_TEST_0_BIT(0) |
1297 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1298
1299 tu_cs_reserve(cs, 3 + 7);
1300 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1301 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1302 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1303
1304 /* if (b0 set) */ {
1305 /*
1306 * On overflow, mirror the value to control->vsc_overflow
1307 * which the CPU checks to detect overflow (see
1308 * check_vsc_overflow())
1309 */
1310 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1311 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1312 CP_REG_TO_MEM_0_CNT(0));
1313 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW);
1314
1315 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1316 tu_cs_emit(cs, 0x0);
1317
1318 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1319 } /* else */ {
1320 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1321 tu_cs_emit(cs, 0x1);
1322 }
1323 }
1324
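/* Run the draw commands once in binning mode so the hardware fills the VSC
 * visibility streams consumed by the per-tile rendering passes.
 */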
1325 static void
1326 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1327 {
1328 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1329 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1330
1331 uint32_t x1 = tiling->tile0.offset.x;
1332 uint32_t y1 = tiling->tile0.offset.y;
1333 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1334 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1335
1336 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
1337
1338 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1339 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1340
1341 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1342 tu_cs_emit(cs, 0x1);
1343
1344 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1345 tu_cs_emit(cs, 0x1);
1346
1347 tu_cs_emit_wfi(cs);
1348
1349 tu_cs_emit_regs(cs,
1350 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1351
1352 update_vsc_pipe(cmd, cs);
1353
1354 tu_cs_emit_regs(cs,
1355 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1356
1357 tu_cs_emit_regs(cs,
1358 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1359
1360 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1361 tu_cs_emit(cs, UNK_2C);
1362
1363 tu_cs_emit_regs(cs,
1364 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1365
1366 tu_cs_emit_regs(cs,
1367 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1368
1369 /* emit IB to binning drawcmds: */
1370 tu_cs_emit_call(cs, &cmd->draw_cs);
1371
1372 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1373 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1374 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1375 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1376 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1377 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1378
1379 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1380 tu_cs_emit(cs, UNK_2D);
1381
1382 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1383 tu6_cache_flush(cmd, cs);
1384
1385 tu_cs_emit_wfi(cs);
1386
1387 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1388
1389 emit_vsc_overflow_test(cmd, cs);
1390
1391 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1392 tu_cs_emit(cs, 0x0);
1393
1394 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1395 tu_cs_emit(cs, 0x0);
1396
1397 tu_cs_emit_wfi(cs);
1398
1399 tu_cs_emit_regs(cs,
1400 A6XX_RB_CCU_CNTL(.unknown = phys_dev->magic.RB_CCU_CNTL_gmem));
1401
1402 cmd->wait_for_idle = false;
1403 }
1404
1405 static void
1406 tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1407 uint32_t a,
1408 const VkRenderPassBeginInfo *info)
1409 {
1410 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1411 const struct tu_image_view *iview = fb->attachments[a].attachment;
1412 const struct tu_render_pass_attachment *attachment =
1413 &cmd->state.pass->attachments[a];
1414 unsigned clear_mask = 0;
1415
1416 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
1417 if (attachment->gmem_offset < 0)
1418 return;
1419
1420 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1421 clear_mask = 0xf;
1422 }
1423
1424 if (vk_format_has_stencil(iview->vk_format)) {
1425 clear_mask &= 0x1;
1426 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
1427 clear_mask |= 0x2;
1428 if (clear_mask != 0x3)
1429 tu_finishme("depth/stencil only load op");
1430 }
1431
1432 if (!clear_mask)
1433 return;
1434
1435 tu_clear_sysmem_attachment(cmd, cs, a,
1436 &info->pClearValues[a], &(struct VkClearRect) {
1437 .rect = info->renderArea,
1438 .baseArrayLayer = iview->base_layer,
1439 .layerCount = iview->layer_count,
1440 });
1441 }
1442
1443 static void
1444 tu_emit_load_clear(struct tu_cmd_buffer *cmd,
1445 const VkRenderPassBeginInfo *info)
1446 {
1447 struct tu_cs *cs = &cmd->draw_cs;
1448
1449 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1450
1451 tu6_emit_blit_scissor(cmd, cs, true);
1452
1453 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1454 tu6_emit_load_attachment(cmd, cs, i);
1455
1456 tu6_emit_blit_scissor(cmd, cs, false);
1457
1458 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1459 tu6_emit_clear_attachment(cmd, cs, i, info);
1460
1461 tu_cond_exec_end(cs);
1462
1463 /* invalidate because reading input attachments will cache GMEM and
1464 * the cache isn't updated when GMEM is written
1465 * TODO: is there a no-cache bit for textures?
1466 */
1467 if (cmd->state.subpass->input_count)
1468 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1469
1470 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1471
1472 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1473 tu_emit_sysmem_clear_attachment(cmd, cs, i, info);
1474
1475 tu_cond_exec_end(cs);
1476 }
1477
1478 static void
1479 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1480 const struct VkRect2D *renderArea)
1481 {
1482 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1483 if (fb->width > 0 && fb->height > 0) {
1484 tu6_emit_window_scissor(cmd, cs,
1485 0, 0, fb->width - 1, fb->height - 1);
1486 } else {
1487 tu6_emit_window_scissor(cmd, cs, 0, 0, 0, 0);
1488 }
1489
1490 tu6_emit_window_offset(cmd, cs, 0, 0);
1491
1492 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1493
1494 tu6_emit_lrz_flush(cmd, cs);
1495
1496 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1497 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1498
1499 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1500 tu_cs_emit(cs, 0x0);
1501
1502 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1503 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1504 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1505
1506 tu6_emit_wfi(cmd, cs);
1507 tu_cs_emit_regs(cs,
1508 A6XX_RB_CCU_CNTL(0x10000000));
1509
1510 /* enable stream-out, with sysmem there is only one pass: */
1511 tu_cs_emit_regs(cs,
1512 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1513
1514 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1515 tu_cs_emit(cs, 0x1);
1516
1517 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1518 tu_cs_emit(cs, 0x0);
1519
1520 tu_cs_sanity_check(cs);
1521 }
1522
1523 static void
1524 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1525 {
1526 /* Do any resolves of the last subpass. These are handled in the
1527 * tile_store_ib in the gmem path.
1528 */
1529
1530 const struct tu_subpass *subpass = cmd->state.subpass;
1531 if (subpass->resolve_attachments) {
1532 for (unsigned i = 0; i < subpass->color_count; i++) {
1533 uint32_t a = subpass->resolve_attachments[i].attachment;
1534 if (a != VK_ATTACHMENT_UNUSED)
1535 tu6_emit_sysmem_resolve(cmd, cs, a,
1536 subpass->color_attachments[i].attachment);
1537 }
1538 }
1539
1540 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1541
1542 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1543 tu_cs_emit(cs, 0x0);
1544
1545 tu6_emit_lrz_flush(cmd, cs);
1546
1547 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1548 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1549
1550 tu_cs_sanity_check(cs);
1551 }
1552
1553
1554 static void
1555 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1556 {
1557 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1558
1559 tu6_emit_lrz_flush(cmd, cs);
1560
1561 /* lrz clear? */
1562
1563 tu6_emit_cache_flush(cmd, cs);
1564
1565 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1566 tu_cs_emit(cs, 0x0);
1567
1568 /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
1569 tu6_emit_wfi(cmd, cs);
1570 tu_cs_emit_regs(cs,
1571 A6XX_RB_CCU_CNTL(phys_dev->magic.RB_CCU_CNTL_gmem));
1572
1573 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1574 if (use_hw_binning(cmd)) {
1575 tu6_emit_bin_size(cs,
1576 tiling->tile0.extent.width,
1577 tiling->tile0.extent.height,
1578 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1579
1580 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1581
1582 tu6_emit_binning_pass(cmd, cs);
1583
1584 tu6_emit_bin_size(cs,
1585 tiling->tile0.extent.width,
1586 tiling->tile0.extent.height,
1587 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1588
1589 tu_cs_emit_regs(cs,
1590 A6XX_VFD_MODE_CNTL(0));
1591
1592 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1593
1594 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1595
1596 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1597 tu_cs_emit(cs, 0x1);
1598 } else {
1599 tu6_emit_bin_size(cs,
1600 tiling->tile0.extent.width,
1601 tiling->tile0.extent.height,
1602 0x6000000);
1603 }
1604
1605 tu_cs_sanity_check(cs);
1606 }
1607
1608 static void
1609 tu6_render_tile(struct tu_cmd_buffer *cmd,
1610 struct tu_cs *cs,
1611 const struct tu_tile *tile)
1612 {
1613 tu6_emit_tile_select(cmd, cs, tile);
1614
1615 tu_cs_emit_call(cs, &cmd->draw_cs);
1616 cmd->wait_for_idle = true;
1617
1618 if (use_hw_binning(cmd)) {
1619 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1620 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1621 A6XX_CP_REG_TEST_0_BIT(0) |
1622 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1623
1624 tu_cs_reserve(cs, 3 + 2);
1625 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1626 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1627 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1628
1629 /* if (no overflow) */ {
1630 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1631 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1632 }
1633 }
1634
1635 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1636
1637 tu_cs_sanity_check(cs);
1638 }
1639
1640 static void
1641 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1642 {
1643 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1644
1645 tu_cs_emit_regs(cs,
1646 A6XX_GRAS_LRZ_CNTL(0));
1647
1648 tu6_emit_lrz_flush(cmd, cs);
1649
1650 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1651
1652 tu_cs_sanity_check(cs);
1653 }
1654
1655 static void
1656 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1657 {
1658 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1659
1660 tu6_tile_render_begin(cmd, &cmd->cs);
1661
1662 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1663 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1664 struct tu_tile tile;
1665 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1666 tu6_render_tile(cmd, &cmd->cs, &tile);
1667 }
1668 }
1669
1670 tu6_tile_render_end(cmd, &cmd->cs);
1671 }
1672
1673 static void
1674 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1675 {
1676 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1677
1678 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1679
1680 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1681 cmd->wait_for_idle = true;
1682
1683 tu6_sysmem_render_end(cmd, &cmd->cs);
1684 }
1685
1686 static void
1687 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1688 {
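/* presumably a worst-case dword estimate for what tu6_emit_tile_store()
 * emits below: */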
1689 const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count;
1690 struct tu_cs sub_cs;
1691
1692 VkResult result =
1693 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1694 if (result != VK_SUCCESS) {
1695 cmd->record_result = result;
1696 return;
1697 }
1698
1699 /* emit to tile-store sub_cs */
1700 tu6_emit_tile_store(cmd, &sub_cs);
1701
1702 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1703 }
1704
1705 static void
1706 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1707 const VkRect2D *render_area)
1708 {
1709 const struct tu_device *dev = cmd->device;
1710 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1711
1712 tiling->render_area = *render_area;
1713 tiling->force_sysmem = force_sysmem(cmd, render_area);
1714
1715 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels);
1716 tu_tiling_config_update_pipe_layout(tiling, dev);
1717 tu_tiling_config_update_pipes(tiling, dev);
1718 }
1719
1720 const struct tu_dynamic_state default_dynamic_state = {
1721 .viewport =
1722 {
1723 .count = 0,
1724 },
1725 .scissor =
1726 {
1727 .count = 0,
1728 },
1729 .line_width = 1.0f,
1730 .depth_bias =
1731 {
1732 .bias = 0.0f,
1733 .clamp = 0.0f,
1734 .slope = 0.0f,
1735 },
1736 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1737 .depth_bounds =
1738 {
1739 .min = 0.0f,
1740 .max = 1.0f,
1741 },
1742 .stencil_compare_mask =
1743 {
1744 .front = ~0u,
1745 .back = ~0u,
1746 },
1747 .stencil_write_mask =
1748 {
1749 .front = ~0u,
1750 .back = ~0u,
1751 },
1752 .stencil_reference =
1753 {
1754 .front = 0u,
1755 .back = 0u,
1756 },
1757 };
1758
1759 static void UNUSED /* FINISHME */
1760 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1761 const struct tu_dynamic_state *src)
1762 {
1763 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1764 uint32_t copy_mask = src->mask;
1765 uint32_t dest_mask = 0;
1766
1767 tu_use_args(cmd_buffer); /* FINISHME */
1768
1769 /* Make sure to copy the number of viewports/scissors because they can
1770 * only be specified at pipeline creation time.
1771 */
1772 dest->viewport.count = src->viewport.count;
1773 dest->scissor.count = src->scissor.count;
1774 dest->discard_rectangle.count = src->discard_rectangle.count;
1775
1776 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1777 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1778 src->viewport.count * sizeof(VkViewport))) {
1779 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1780 src->viewport.count);
1781 dest_mask |= TU_DYNAMIC_VIEWPORT;
1782 }
1783 }
1784
1785 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1786 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1787 src->scissor.count * sizeof(VkRect2D))) {
1788 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1789 src->scissor.count);
1790 dest_mask |= TU_DYNAMIC_SCISSOR;
1791 }
1792 }
1793
1794 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1795 if (dest->line_width != src->line_width) {
1796 dest->line_width = src->line_width;
1797 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1798 }
1799 }
1800
1801 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1802 if (memcmp(&dest->depth_bias, &src->depth_bias,
1803 sizeof(src->depth_bias))) {
1804 dest->depth_bias = src->depth_bias;
1805 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1806 }
1807 }
1808
1809 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1810 if (memcmp(&dest->blend_constants, &src->blend_constants,
1811 sizeof(src->blend_constants))) {
1812 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1813 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1814 }
1815 }
1816
1817 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1818 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1819 sizeof(src->depth_bounds))) {
1820 dest->depth_bounds = src->depth_bounds;
1821 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1822 }
1823 }
1824
1825 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1826 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1827 sizeof(src->stencil_compare_mask))) {
1828 dest->stencil_compare_mask = src->stencil_compare_mask;
1829 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1830 }
1831 }
1832
1833 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1834 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1835 sizeof(src->stencil_write_mask))) {
1836 dest->stencil_write_mask = src->stencil_write_mask;
1837 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1838 }
1839 }
1840
1841 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1842 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1843 sizeof(src->stencil_reference))) {
1844 dest->stencil_reference = src->stencil_reference;
1845 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1846 }
1847 }
1848
1849 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1850 if (memcmp(&dest->discard_rectangle.rectangles,
1851 &src->discard_rectangle.rectangles,
1852 src->discard_rectangle.count * sizeof(VkRect2D))) {
1853 typed_memcpy(dest->discard_rectangle.rectangles,
1854 src->discard_rectangle.rectangles,
1855 src->discard_rectangle.count);
1856 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1857 }
1858 }
1859 }
1860
1861 static VkResult
1862 tu_create_cmd_buffer(struct tu_device *device,
1863 struct tu_cmd_pool *pool,
1864 VkCommandBufferLevel level,
1865 VkCommandBuffer *pCommandBuffer)
1866 {
1867 struct tu_cmd_buffer *cmd_buffer;
1868 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1869 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1870 if (cmd_buffer == NULL)
1871 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1872
1873 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1874 cmd_buffer->device = device;
1875 cmd_buffer->pool = pool;
1876 cmd_buffer->level = level;
1877
1878 if (pool) {
1879 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1880 cmd_buffer->queue_family_index = pool->queue_family_index;
1881
1882 } else {
1883 /* Init the pool_link so we can safely call list_del when we destroy
1884 * the command buffer
1885 */
1886 list_inithead(&cmd_buffer->pool_link);
1887 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1888 }
1889
1890 tu_bo_list_init(&cmd_buffer->bo_list);
1891 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1892 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1893 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1894 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1895
1896 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1897
1898 list_inithead(&cmd_buffer->upload.list);
1899
1900 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1901 if (result != VK_SUCCESS)
1902 goto fail_scratch_bo;
1903
1904 /* TODO: resize on overflow */
1905 cmd_buffer->vsc_data_pitch = device->vsc_data_pitch;
1906 cmd_buffer->vsc_data2_pitch = device->vsc_data2_pitch;
1907 cmd_buffer->vsc_data = device->vsc_data;
1908 cmd_buffer->vsc_data2 = device->vsc_data2;
1909
1910 return VK_SUCCESS;
1911
1912 fail_scratch_bo:
1913 list_del(&cmd_buffer->pool_link);
1914 return result;
1915 }
1916
1917 static void
1918 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1919 {
1920 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1921
1922 list_del(&cmd_buffer->pool_link);
1923
1924 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
1925 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
1926
1927 tu_cs_finish(&cmd_buffer->cs);
1928 tu_cs_finish(&cmd_buffer->draw_cs);
1929 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1930 tu_cs_finish(&cmd_buffer->sub_cs);
1931
1932 tu_bo_list_destroy(&cmd_buffer->bo_list);
1933 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1934 }
1935
1936 static VkResult
1937 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1938 {
1939 cmd_buffer->wait_for_idle = true;
1940
1941 cmd_buffer->record_result = VK_SUCCESS;
1942
1943 tu_bo_list_reset(&cmd_buffer->bo_list);
1944 tu_cs_reset(&cmd_buffer->cs);
1945 tu_cs_reset(&cmd_buffer->draw_cs);
1946 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1947 tu_cs_reset(&cmd_buffer->sub_cs);
1948
1949 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
1950 cmd_buffer->descriptors[i].valid = 0;
1951 cmd_buffer->descriptors[i].push_dirty = false;
1952 }
1953
1954 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1955
1956 return cmd_buffer->record_result;
1957 }
1958
1959 VkResult
1960 tu_AllocateCommandBuffers(VkDevice _device,
1961 const VkCommandBufferAllocateInfo *pAllocateInfo,
1962 VkCommandBuffer *pCommandBuffers)
1963 {
1964 TU_FROM_HANDLE(tu_device, device, _device);
1965 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1966
1967 VkResult result = VK_SUCCESS;
1968 uint32_t i;
1969
1970 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1971
1972 if (!list_is_empty(&pool->free_cmd_buffers)) {
1973 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1974 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1975
1976 list_del(&cmd_buffer->pool_link);
1977 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1978
1979 result = tu_reset_cmd_buffer(cmd_buffer);
1980 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1981 cmd_buffer->level = pAllocateInfo->level;
1982
1983 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1984 } else {
1985 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1986 &pCommandBuffers[i]);
1987 }
1988 if (result != VK_SUCCESS)
1989 break;
1990 }
1991
1992 if (result != VK_SUCCESS) {
1993 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
1994 pCommandBuffers);
1995
1996 /* From the Vulkan 1.0.66 spec:
1997 *
1998 * "vkAllocateCommandBuffers can be used to create multiple
1999 * command buffers. If the creation of any of those command
2000 * buffers fails, the implementation must destroy all
2001 * successfully created command buffer objects from this
2002 * command, set all entries of the pCommandBuffers array to
2003 * NULL and return the error."
2004 */
2005 memset(pCommandBuffers, 0,
2006 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
2007 }
2008
2009 return result;
2010 }
2011
2012 void
2013 tu_FreeCommandBuffers(VkDevice device,
2014 VkCommandPool commandPool,
2015 uint32_t commandBufferCount,
2016 const VkCommandBuffer *pCommandBuffers)
2017 {
2018 for (uint32_t i = 0; i < commandBufferCount; i++) {
2019 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
2020
2021 if (cmd_buffer) {
2022 if (cmd_buffer->pool) {
2023 list_del(&cmd_buffer->pool_link);
2024 list_addtail(&cmd_buffer->pool_link,
2025 &cmd_buffer->pool->free_cmd_buffers);
2026 } else
2027 tu_cmd_buffer_destroy(cmd_buffer);
2028 }
2029 }
2030 }
2031
2032 VkResult
2033 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
2034 VkCommandBufferResetFlags flags)
2035 {
2036 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2037 return tu_reset_cmd_buffer(cmd_buffer);
2038 }
2039
2040 VkResult
2041 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2042 const VkCommandBufferBeginInfo *pBeginInfo)
2043 {
2044 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2045 VkResult result = VK_SUCCESS;
2046
2047 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
2048 /* If the command buffer has already been reset with
2049 * vkResetCommandBuffer, there is no need to do it again.
2050 */
2051 result = tu_reset_cmd_buffer(cmd_buffer);
2052 if (result != VK_SUCCESS)
2053 return result;
2054 }
2055
2056 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
2057 cmd_buffer->usage_flags = pBeginInfo->flags;
2058
2059 tu_cs_begin(&cmd_buffer->cs);
2060 tu_cs_begin(&cmd_buffer->draw_cs);
2061 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2062
2063 cmd_buffer->scratch_seqno = 0;
2064
2065 /* set up the initial configuration in the command buffer */
2066 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2067 switch (cmd_buffer->queue_family_index) {
2068 case TU_QUEUE_GENERAL:
2069 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
2070 break;
2071 default:
2072 break;
2073 }
2074 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2075 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
2076 assert(pBeginInfo->pInheritanceInfo);
2077 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
2078 cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
2079 }
2080
2081 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
2082
2083 return VK_SUCCESS;
2084 }
2085
2086 void
2087 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2088 uint32_t firstBinding,
2089 uint32_t bindingCount,
2090 const VkBuffer *pBuffers,
2091 const VkDeviceSize *pOffsets)
2092 {
2093 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2094
2095 assert(firstBinding + bindingCount <= MAX_VBS);
2096
2097 for (uint32_t i = 0; i < bindingCount; i++) {
2098 cmd->state.vb.buffers[firstBinding + i] =
2099 tu_buffer_from_handle(pBuffers[i]);
2100 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
2101 }
2102
2103 /* VB states depend on VkPipelineVertexInputStateCreateInfo */
2104 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2105 }
2106
2107 void
2108 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2109 VkBuffer buffer,
2110 VkDeviceSize offset,
2111 VkIndexType indexType)
2112 {
2113 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2114 TU_FROM_HANDLE(tu_buffer, buf, buffer);
2115
2116 /* initialize/update the restart index */
2117 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
2118 struct tu_cs *draw_cs = &cmd->draw_cs;
2119
2120 tu6_emit_restart_index(
2121 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
2122
2123 tu_cs_sanity_check(draw_cs);
2124 }
2125
2126 /* track the BO */
2127 if (cmd->state.index_buffer != buf)
2128 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
2129
2130 cmd->state.index_buffer = buf;
2131 cmd->state.index_offset = offset;
2132 cmd->state.index_type = indexType;
2133 }
2134
2135 void
2136 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2137 VkPipelineBindPoint pipelineBindPoint,
2138 VkPipelineLayout _layout,
2139 uint32_t firstSet,
2140 uint32_t descriptorSetCount,
2141 const VkDescriptorSet *pDescriptorSets,
2142 uint32_t dynamicOffsetCount,
2143 const uint32_t *pDynamicOffsets)
2144 {
2145 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2146 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
2147 unsigned dyn_idx = 0;
2148
2149 struct tu_descriptor_state *descriptors_state =
2150 tu_get_descriptors_state(cmd_buffer, pipelineBindPoint);
2151
2152 for (unsigned i = 0; i < descriptorSetCount; ++i) {
2153 unsigned idx = i + firstSet;
2154 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
2155
2156 descriptors_state->sets[idx] = set;
2157 descriptors_state->valid |= (1u << idx);
2158
2159 for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
2160 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
2161 assert(dyn_idx < dynamicOffsetCount);
2162
2163 descriptors_state->dynamic_buffers[idx] =
2164 set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
2165 }
2166 }
2167
2168 cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2169 }
2170
2171 void
2172 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2173 VkPipelineLayout layout,
2174 VkShaderStageFlags stageFlags,
2175 uint32_t offset,
2176 uint32_t size,
2177 const void *pValues)
2178 {
2179 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2180 memcpy((void*) cmd->push_constants + offset, pValues, size);
2181 cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS;
2182 }
2183
2184 VkResult
2185 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2186 {
2187 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2188
2189 if (cmd_buffer->scratch_seqno) {
2190 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2191 MSM_SUBMIT_BO_WRITE);
2192 }
2193
2194 if (cmd_buffer->use_vsc_data) {
2195 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data,
2196 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2197 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data2,
2198 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2199 }
2200
2201 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2202 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2203 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2204 }
2205
2206 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2207 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2208 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2209 }
2210
2211 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2212 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2213 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2214 }
2215
2216 tu_cs_end(&cmd_buffer->cs);
2217 tu_cs_end(&cmd_buffer->draw_cs);
2218 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2219
2220 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2221
2222 return cmd_buffer->record_result;
2223 }
2224
2225 void
2226 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2227 VkPipelineBindPoint pipelineBindPoint,
2228 VkPipeline _pipeline)
2229 {
2230 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2231 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2232
2233 switch (pipelineBindPoint) {
2234 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2235 cmd->state.pipeline = pipeline;
2236 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
2237 break;
2238 case VK_PIPELINE_BIND_POINT_COMPUTE:
2239 cmd->state.compute_pipeline = pipeline;
2240 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2241 break;
2242 default:
2243 unreachable("unrecognized pipeline bind point");
2244 break;
2245 }
2246
2247 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2248 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2249 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2250 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2251 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2252 }
2253 }
2254
2255 void
2256 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2257 uint32_t firstViewport,
2258 uint32_t viewportCount,
2259 const VkViewport *pViewports)
2260 {
2261 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2262 struct tu_cs *draw_cs = &cmd->draw_cs;
2263
2264 assert(firstViewport == 0 && viewportCount == 1);
2265 tu6_emit_viewport(draw_cs, pViewports);
2266
2267 tu_cs_sanity_check(draw_cs);
2268 }
2269
2270 void
2271 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2272 uint32_t firstScissor,
2273 uint32_t scissorCount,
2274 const VkRect2D *pScissors)
2275 {
2276 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2277 struct tu_cs *draw_cs = &cmd->draw_cs;
2278
2279 assert(firstScissor == 0 && scissorCount == 1);
2280 tu6_emit_scissor(draw_cs, pScissors);
2281
2282 tu_cs_sanity_check(draw_cs);
2283 }
2284
2285 void
2286 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2287 {
2288 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2289
2290 cmd->state.dynamic.line_width = lineWidth;
2291
2292 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2293 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2294 }
2295
2296 void
2297 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2298 float depthBiasConstantFactor,
2299 float depthBiasClamp,
2300 float depthBiasSlopeFactor)
2301 {
2302 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2303 struct tu_cs *draw_cs = &cmd->draw_cs;
2304
2305 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2306 depthBiasSlopeFactor);
2307
2308 tu_cs_sanity_check(draw_cs);
2309 }
2310
2311 void
2312 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2313 const float blendConstants[4])
2314 {
2315 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2316 struct tu_cs *draw_cs = &cmd->draw_cs;
2317
2318 tu6_emit_blend_constants(draw_cs, blendConstants);
2319
2320 tu_cs_sanity_check(draw_cs);
2321 }
2322
2323 void
2324 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2325 float minDepthBounds,
2326 float maxDepthBounds)
2327 {
2328 }
2329
2330 void
2331 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2332 VkStencilFaceFlags faceMask,
2333 uint32_t compareMask)
2334 {
2335 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2336
2337 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2338 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2339 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2340 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2341
2342 /* the front/back compare masks must be updated together */
2343 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2344 }
2345
2346 void
2347 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2348 VkStencilFaceFlags faceMask,
2349 uint32_t writeMask)
2350 {
2351 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2352
2353 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2354 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2355 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2356 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2357
2358 /* the front/back write masks must be updated together */
2359 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2360 }
2361
2362 void
2363 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2364 VkStencilFaceFlags faceMask,
2365 uint32_t reference)
2366 {
2367 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2368
2369 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2370 cmd->state.dynamic.stencil_reference.front = reference;
2371 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2372 cmd->state.dynamic.stencil_reference.back = reference;
2373
2374 /* the front/back references must be updated together */
2375 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2376 }
2377
2378 void
2379 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2380 uint32_t commandBufferCount,
2381 const VkCommandBuffer *pCmdBuffers)
2382 {
2383 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2384 VkResult result;
2385
2386 assert(commandBufferCount > 0);
2387
2388 for (uint32_t i = 0; i < commandBufferCount; i++) {
2389 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2390
2391 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2392 if (result != VK_SUCCESS) {
2393 cmd->record_result = result;
2394 break;
2395 }
2396
2397 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2398 if (result != VK_SUCCESS) {
2399 cmd->record_result = result;
2400 break;
2401 }
2402
2403 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2404 &secondary->draw_epilogue_cs);
2405 if (result != VK_SUCCESS) {
2406 cmd->record_result = result;
2407 break;
2408 }
2409 }
2410 cmd->state.dirty = ~0u; /* TODO: only set the dirty bits that are actually needed */
2411 }
2412
2413 VkResult
2414 tu_CreateCommandPool(VkDevice _device,
2415 const VkCommandPoolCreateInfo *pCreateInfo,
2416 const VkAllocationCallbacks *pAllocator,
2417 VkCommandPool *pCmdPool)
2418 {
2419 TU_FROM_HANDLE(tu_device, device, _device);
2420 struct tu_cmd_pool *pool;
2421
2422 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2423 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2424 if (pool == NULL)
2425 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2426
2427 if (pAllocator)
2428 pool->alloc = *pAllocator;
2429 else
2430 pool->alloc = device->alloc;
2431
2432 list_inithead(&pool->cmd_buffers);
2433 list_inithead(&pool->free_cmd_buffers);
2434
2435 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2436
2437 *pCmdPool = tu_cmd_pool_to_handle(pool);
2438
2439 return VK_SUCCESS;
2440 }
2441
2442 void
2443 tu_DestroyCommandPool(VkDevice _device,
2444 VkCommandPool commandPool,
2445 const VkAllocationCallbacks *pAllocator)
2446 {
2447 TU_FROM_HANDLE(tu_device, device, _device);
2448 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2449
2450 if (!pool)
2451 return;
2452
2453 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2454 &pool->cmd_buffers, pool_link)
2455 {
2456 tu_cmd_buffer_destroy(cmd_buffer);
2457 }
2458
2459 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2460 &pool->free_cmd_buffers, pool_link)
2461 {
2462 tu_cmd_buffer_destroy(cmd_buffer);
2463 }
2464
2465 vk_free2(&device->alloc, pAllocator, pool);
2466 }
2467
2468 VkResult
2469 tu_ResetCommandPool(VkDevice device,
2470 VkCommandPool commandPool,
2471 VkCommandPoolResetFlags flags)
2472 {
2473 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2474 VkResult result;
2475
2476 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2477 pool_link)
2478 {
2479 result = tu_reset_cmd_buffer(cmd_buffer);
2480 if (result != VK_SUCCESS)
2481 return result;
2482 }
2483
2484 return VK_SUCCESS;
2485 }
2486
2487 void
2488 tu_TrimCommandPool(VkDevice device,
2489 VkCommandPool commandPool,
2490 VkCommandPoolTrimFlags flags)
2491 {
2492 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2493
2494 if (!pool)
2495 return;
2496
2497 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2498 &pool->free_cmd_buffers, pool_link)
2499 {
2500 tu_cmd_buffer_destroy(cmd_buffer);
2501 }
2502 }
2503
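/* Beginning a render pass: latch the pass/subpass/framebuffer, recompute the
 * tiling config for the render area, record the tile-store IB, emit the
 * loads/clears and the first subpass's ZS/MRT/MSAA state into draw_cs, and
 * add every attachment BO to the bo_list.
 */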
2504 void
2505 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2506 const VkRenderPassBeginInfo *pRenderPassBegin,
2507 VkSubpassContents contents)
2508 {
2509 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2510 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2511 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2512
2513 cmd->state.pass = pass;
2514 cmd->state.subpass = pass->subpasses;
2515 cmd->state.framebuffer = fb;
2516
2517 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2518 tu_cmd_prepare_tile_store_ib(cmd);
2519
2520 tu_emit_load_clear(cmd, pRenderPassBegin);
2521
2522 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2523 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2524 tu6_emit_msaa(cmd, cmd->state.subpass, &cmd->draw_cs);
2525 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2526
2527 /* note: use_hw_binning only checks tiling config */
2528 if (use_hw_binning(cmd))
2529 cmd->use_vsc_data = true;
2530
2531 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2532 const struct tu_image_view *iview = fb->attachments[i].attachment;
2533 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2534 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2535 }
2536 }
2537
2538 void
2539 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2540 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2541 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2542 {
2543 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2544 pSubpassBeginInfo->contents);
2545 }
2546
2547 void
2548 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2549 {
2550 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2551 const struct tu_render_pass *pass = cmd->state.pass;
2552 struct tu_cs *cs = &cmd->draw_cs;
2553
2554 const struct tu_subpass *subpass = cmd->state.subpass++;
2555 /* TODO:
2556 * if the MSAA sample count changes between subpasses,
2557 * the attachment store is broken for some attachments
2558 */
2559 if (subpass->resolve_attachments) {
2560 tu6_emit_blit_scissor(cmd, cs, true);
2561 for (unsigned i = 0; i < subpass->color_count; i++) {
2562 uint32_t a = subpass->resolve_attachments[i].attachment;
2563 if (a != VK_ATTACHMENT_UNUSED) {
2564 tu6_emit_resolve(cmd, cs, a,
2565 subpass->color_attachments[i].attachment);
2566 }
2567 }
2568 }
2569
2570 /* invalidate because reading input attachments will cache GMEM contents,
2571 * and the cache isn't updated when GMEM is written
2572 * TODO: is there a no-cache bit for textures?
2573 */
2574 if (cmd->state.subpass->input_count)
2575 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2576
2577 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2578 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2579 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2580 tu6_emit_msaa(cmd, cmd->state.subpass, cs);
2581 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2582
2583 /* Emit flushes so that input attachments will read the correct value. This
2584 * is for sysmem only, although it shouldn't do much harm on gmem.
2585 */
2586 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2587 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2588
2589 /* TODO:
2590 * since we don't know how to do a GMEM->GMEM resolve,
2591 * resolve attachments are resolved to system memory and then loaded back into GMEM if needed
2592 */
2593 if (subpass->resolve_attachments) {
2594 for (unsigned i = 0; i < subpass->color_count; i++) {
2595 uint32_t a = subpass->resolve_attachments[i].attachment;
2596 if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) {
2597 tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n");
2598 tu6_emit_predicated_blit(cmd, cs, a, a, false);
2599 }
2600 }
2601 }
2602 }
2603
2604 void
2605 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2606 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2607 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2608 {
2609 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2610 }
2611
2612 struct tu_draw_info
2613 {
2614 /**
2615 * Number of vertices.
2616 */
2617 uint32_t count;
2618
2619 /**
2620 * Index of the first vertex.
2621 */
2622 int32_t vertex_offset;
2623
2624 /**
2625 * First instance id.
2626 */
2627 uint32_t first_instance;
2628
2629 /**
2630 * Number of instances.
2631 */
2632 uint32_t instance_count;
2633
2634 /**
2635 * First index (indexed draws only).
2636 */
2637 uint32_t first_index;
2638
2639 /**
2640 * Whether it's an indexed draw.
2641 */
2642 bool indexed;
2643
2644 /**
2645 * Indirect draw parameters resource.
2646 */
2647 struct tu_buffer *indirect;
2648 uint64_t indirect_offset;
2649 uint32_t stride;
2650
2651 /**
2652 * Draw count parameters resource.
2653 */
2654 struct tu_buffer *count_buffer;
2655 uint64_t count_buffer_offset;
2656 };
2657
2658 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2659 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2660
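/* Each chunk of draw state becomes a CP_SET_DRAW_STATE group; the enable
 * mask selects whether the group applies to the binning pass, GMEM draws,
 * sysmem draws, or all of them.
 */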
2661 enum tu_draw_state_group_id
2662 {
2663 TU_DRAW_STATE_PROGRAM,
2664 TU_DRAW_STATE_PROGRAM_BINNING,
2665 TU_DRAW_STATE_VI,
2666 TU_DRAW_STATE_VI_BINNING,
2667 TU_DRAW_STATE_VP,
2668 TU_DRAW_STATE_RAST,
2669 TU_DRAW_STATE_DS,
2670 TU_DRAW_STATE_BLEND,
2671 TU_DRAW_STATE_VS_CONST,
2672 TU_DRAW_STATE_FS_CONST,
2673 TU_DRAW_STATE_VS_TEX,
2674 TU_DRAW_STATE_FS_TEX_SYSMEM,
2675 TU_DRAW_STATE_FS_TEX_GMEM,
2676 TU_DRAW_STATE_FS_IBO,
2677 TU_DRAW_STATE_VS_PARAMS,
2678
2679 TU_DRAW_STATE_COUNT,
2680 };
2681
2682 struct tu_draw_state_group
2683 {
2684 enum tu_draw_state_group_id id;
2685 uint32_t enable_mask;
2686 struct tu_cs_entry ib;
2687 };
2688
2689 static const struct tu_sampler *
2690 sampler_ptr(struct tu_descriptor_state *descriptors_state,
2691 const struct tu_descriptor_map *map, unsigned i,
2692 unsigned array_index)
2693 {
2694 assert(descriptors_state->valid & (1 << map->set[i]));
2695
2696 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2697 assert(map->binding[i] < set->layout->binding_count);
2698
2699 const struct tu_descriptor_set_binding_layout *layout =
2700 &set->layout->binding[map->binding[i]];
2701
2702 if (layout->immutable_samplers_offset) {
2703 const struct tu_sampler *immutable_samplers =
2704 tu_immutable_samplers(set->layout, layout);
2705
2706 return &immutable_samplers[array_index];
2707 }
2708
2709 switch (layout->type) {
2710 case VK_DESCRIPTOR_TYPE_SAMPLER:
2711 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4];
2712 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2713 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
2714 array_index *
2715 (A6XX_TEX_CONST_DWORDS +
2716 sizeof(struct tu_sampler) / 4)];
2717 default:
2718 unreachable("unimplemented descriptor type");
2719 break;
2720 }
2721 }
2722
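/* Copy one texture descriptor out of the descriptor set.  For input
 * attachments on the GMEM path the descriptor is rewritten to point at the
 * attachment's location in GMEM instead of its copy in system memory.
 */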
2723 static void
2724 write_tex_const(struct tu_cmd_buffer *cmd,
2725 uint32_t *dst,
2726 struct tu_descriptor_state *descriptors_state,
2727 const struct tu_descriptor_map *map,
2728 unsigned i, unsigned array_index, bool is_sysmem)
2729 {
2730 assert(descriptors_state->valid & (1 << map->set[i]));
2731
2732 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2733 assert(map->binding[i] < set->layout->binding_count);
2734
2735 const struct tu_descriptor_set_binding_layout *layout =
2736 &set->layout->binding[map->binding[i]];
2737
2738 switch (layout->type) {
2739 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2740 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2741 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2742 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2743 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2744 array_index * A6XX_TEX_CONST_DWORDS],
2745 A6XX_TEX_CONST_DWORDS * 4);
2746 break;
2747 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2748 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2749 array_index *
2750 (A6XX_TEX_CONST_DWORDS +
2751 sizeof(struct tu_sampler) / 4)],
2752 A6XX_TEX_CONST_DWORDS * 4);
2753 break;
2754 default:
2755 unreachable("unimplemented descriptor type");
2756 break;
2757 }
2758
2759 if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
2760 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2761 uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
2762 array_index].attachment;
2763 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2764
2765 assert(att->gmem_offset >= 0);
2766
2767 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
2768 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
2769 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
2770 dst[2] |=
2771 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
2772 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
2773 dst[3] = 0;
2774 dst[4] = 0x100000 + att->gmem_offset;
2775 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
2776 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
2777 dst[i] = 0;
2778
2779 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2780 tu_finishme("patch input attachment pitch for secondary cmd buffer");
2781 }
2782 }
2783
2784 static void
2785 write_image_ibo(struct tu_cmd_buffer *cmd,
2786 uint32_t *dst,
2787 struct tu_descriptor_state *descriptors_state,
2788 const struct tu_descriptor_map *map,
2789 unsigned i, unsigned array_index)
2790 {
2791 assert(descriptors_state->valid & (1 << map->set[i]));
2792
2793 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2794 assert(map->binding[i] < set->layout->binding_count);
2795
2796 const struct tu_descriptor_set_binding_layout *layout =
2797 &set->layout->binding[map->binding[i]];
2798
2799 assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
2800
2801 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2802 (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
2803 A6XX_TEX_CONST_DWORDS * 4);
2804 }
2805
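/* Return the 64-bit device address for a buffer descriptor.  Dynamic
 * UBO/SSBO descriptors come from the dynamic_buffers array (already offset
 * by vkCmdBindDescriptorSets); plain UBO/SSBO descriptors read the address
 * pair back out of the mapped descriptor set.
 */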
2806 static uint64_t
2807 buffer_ptr(struct tu_descriptor_state *descriptors_state,
2808 const struct tu_descriptor_map *map,
2809 unsigned i, unsigned array_index)
2810 {
2811 assert(descriptors_state->valid & (1 << map->set[i]));
2812
2813 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2814 assert(map->binding[i] < set->layout->binding_count);
2815
2816 const struct tu_descriptor_set_binding_layout *layout =
2817 &set->layout->binding[map->binding[i]];
2818
2819 switch (layout->type) {
2820 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2821 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2822 return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
2823 array_index];
2824 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2825 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2826 return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
2827 set->mapped_ptr[layout->offset / 4 + array_index * 2];
2828 default:
2829 unreachable("unimplemented descriptor type");
2830 break;
2831 }
2832 }
2833
2834 static inline uint32_t
2835 tu6_stage2opcode(gl_shader_stage type)
2836 {
2837 switch (type) {
2838 case MESA_SHADER_VERTEX:
2839 case MESA_SHADER_TESS_CTRL:
2840 case MESA_SHADER_TESS_EVAL:
2841 case MESA_SHADER_GEOMETRY:
2842 return CP_LOAD_STATE6_GEOM;
2843 case MESA_SHADER_FRAGMENT:
2844 case MESA_SHADER_COMPUTE:
2845 case MESA_SHADER_KERNEL:
2846 return CP_LOAD_STATE6_FRAG;
2847 default:
2848 unreachable("bad shader type");
2849 }
2850 }
2851
2852 static inline enum a6xx_state_block
2853 tu6_stage2shadersb(gl_shader_stage type)
2854 {
2855 switch (type) {
2856 case MESA_SHADER_VERTEX:
2857 return SB6_VS_SHADER;
2858 case MESA_SHADER_FRAGMENT:
2859 return SB6_FS_SHADER;
2860 case MESA_SHADER_COMPUTE:
2861 case MESA_SHADER_KERNEL:
2862 return SB6_CS_SHADER;
2863 default:
2864 unreachable("bad shader type");
2865 return ~0;
2866 }
2867 }
2868
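/* Upload the user constants for one stage.  Range 0 of ir3's UBO analysis
 * state is the push-constant range and is emitted inline (SS6_DIRECT); the
 * remaining ranges are UBOs that ir3 lowered to constants and are emitted
 * indirectly (SS6_INDIRECT) from the UBO's device address.
 */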
2869 static void
2870 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2871 struct tu_descriptor_state *descriptors_state,
2872 gl_shader_stage type,
2873 uint32_t *push_constants)
2874 {
2875 const struct tu_program_descriptor_linkage *link =
2876 &pipeline->program.link[type];
2877 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2878
2879 for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
2880 if (state->range[i].start < state->range[i].end) {
2881 uint32_t size = state->range[i].end - state->range[i].start;
2882 uint32_t offset = state->range[i].start;
2883
2884 /* even if the start of the const buffer range is before
2885 * first_immediate, the end may not be, so clamp to the shader's constlen:
2886 */
2887 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2888
2889 if (size == 0)
2890 continue;
2891
2892 /* offsets and sizes should be vec4 (16-byte) aligned: */
2893 debug_assert((state->range[i].offset % 16) == 0);
2894 debug_assert((size % 16) == 0);
2895 debug_assert((offset % 16) == 0);
2896
2897 if (i == 0) {
2898 /* push constants */
2899 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (size / 4));
2900 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2901 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2902 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2903 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2904 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2905 tu_cs_emit(cs, 0);
2906 tu_cs_emit(cs, 0);
2907 for (unsigned i = 0; i < size / 4; i++)
2908 tu_cs_emit(cs, push_constants[i + offset / 4]);
2909 continue;
2910 }
2911
2912 /* Look through the UBO map to find our UBO index, and get the VA for
2913 * that UBO.
2914 */
2915 uint64_t va = 0;
2916 uint32_t ubo_idx = i - 1;
2917 uint32_t ubo_map_base = 0;
2918 for (int j = 0; j < link->ubo_map.num; j++) {
2919 if (ubo_idx >= ubo_map_base &&
2920 ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
2921 va = buffer_ptr(descriptors_state, &link->ubo_map, j,
2922 ubo_idx - ubo_map_base);
2923 break;
2924 }
2925 ubo_map_base += link->ubo_map.array_size[j];
2926 }
2927 assert(va);
2928
2929 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2930 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2931 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2932 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2933 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2934 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2935 tu_cs_emit_qw(cs, va + offset);
2936 }
2937 }
2938 }
2939
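/* Emit the table of UBO addresses (padded to an even count with ~0) at the
 * stage's UBO constant offset, so the shader can address UBOs indirectly.
 */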
2940 static void
2941 tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2942 struct tu_descriptor_state *descriptors_state,
2943 gl_shader_stage type)
2944 {
2945 const struct tu_program_descriptor_linkage *link =
2946 &pipeline->program.link[type];
2947
2948 uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
2949 uint32_t anum = align(num, 2);
2950
2951 if (!num)
2952 return;
2953
2954 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
2955 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
2956 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2957 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2958 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2959 CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
2960 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2961 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2962
2963 unsigned emitted = 0;
2964 for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
2965 for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
2966 tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
2967 emitted++;
2968 }
2969 }
2970
2971 for (; emitted < anum; emitted++) {
2972 tu_cs_emit(cs, 0xffffffff);
2973 tu_cs_emit(cs, 0xffffffff);
2974 }
2975 }
2976
2977 static struct tu_cs_entry
2978 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2979 const struct tu_pipeline *pipeline,
2980 struct tu_descriptor_state *descriptors_state,
2981 gl_shader_stage type)
2982 {
2983 struct tu_cs cs;
2984 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2985
2986 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2987 tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
2988
2989 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2990 }
2991
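/* Emit the VS driver params as a single vec4 at the driver_param constant
 * offset; currently only the base instance is filled in.  If the shader does
 * not reserve space for driver params, an empty entry is returned instead.
 */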
2992 static VkResult
2993 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
2994 const struct tu_draw_info *draw,
2995 struct tu_cs_entry *entry)
2996 {
2997 /* TODO: fill out more than just base instance */
2998 const struct tu_program_descriptor_linkage *link =
2999 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3000 const struct ir3_const_state *const_state = &link->const_state;
3001 struct tu_cs cs;
3002
3003 if (const_state->offsets.driver_param >= link->constlen) {
3004 *entry = (struct tu_cs_entry) {};
3005 return VK_SUCCESS;
3006 }
3007
3008 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
3009 if (result != VK_SUCCESS)
3010 return result;
3011
3012 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3013 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
3014 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3015 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3016 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3017 CP_LOAD_STATE6_0_NUM_UNIT(1));
3018 tu_cs_emit(&cs, 0);
3019 tu_cs_emit(&cs, 0);
3020
3021 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3022
3023 tu_cs_emit(&cs, 0);
3024 tu_cs_emit(&cs, 0);
3025 tu_cs_emit(&cs, draw->first_instance);
3026 tu_cs_emit(&cs, 0);
3027
3028 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3029 return VK_SUCCESS;
3030 }
3031
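/* Build the texture and sampler descriptor arrays for a stage in the
 * sub-stream, then emit the CP_LOAD_STATE6 packets plus the per-stage
 * TEX_SAMP/TEX_CONST/TEX_COUNT registers that point at them.  The sysmem and
 * GMEM variants differ only in how input attachment descriptors are written.
 */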
3032 static VkResult
3033 tu6_emit_textures(struct tu_cmd_buffer *cmd,
3034 const struct tu_pipeline *pipeline,
3035 struct tu_descriptor_state *descriptors_state,
3036 gl_shader_stage type,
3037 struct tu_cs_entry *entry,
3038 bool *needs_border,
3039 bool is_sysmem)
3040 {
3041 struct tu_cs *draw_state = &cmd->sub_cs;
3042 const struct tu_program_descriptor_linkage *link =
3043 &pipeline->program.link[type];
3044 VkResult result;
3045
3046 if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
3047 *entry = (struct tu_cs_entry) {};
3048 return VK_SUCCESS;
3049 }
3050
3051 /* allocate and fill texture state */
3052 struct ts_cs_memory tex_const;
3053 result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
3054 A6XX_TEX_CONST_DWORDS, &tex_const);
3055 if (result != VK_SUCCESS)
3056 return result;
3057
3058 int tex_index = 0;
3059 for (unsigned i = 0; i < link->texture_map.num; i++) {
3060 for (int j = 0; j < link->texture_map.array_size[i]; j++) {
3061 write_tex_const(cmd,
3062 &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
3063 descriptors_state, &link->texture_map, i, j,
3064 is_sysmem);
3065 }
3066 }
3067
3068 /* allocate and fill sampler state */
3069 struct ts_cs_memory tex_samp = { 0 };
3070 if (link->sampler_map.num_desc) {
3071 result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
3072 A6XX_TEX_SAMP_DWORDS, &tex_samp);
3073 if (result != VK_SUCCESS)
3074 return result;
3075
3076 int sampler_index = 0;
3077 for (unsigned i = 0; i < link->sampler_map.num; i++) {
3078 for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
3079 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3080 &link->sampler_map,
3081 i, j);
3082 memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
3083 sampler->state, sizeof(sampler->state));
3084 *needs_border |= sampler->needs_border;
3085 }
3086 }
3087 }
3088
3089 unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
3090 enum a6xx_state_block sb;
3091
3092 switch (type) {
3093 case MESA_SHADER_VERTEX:
3094 sb = SB6_VS_TEX;
3095 tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
3096 tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
3097 tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
3098 break;
3099 case MESA_SHADER_FRAGMENT:
3100 sb = SB6_FS_TEX;
3101 tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
3102 tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
3103 tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
3104 break;
3105 case MESA_SHADER_COMPUTE:
3106 sb = SB6_CS_TEX;
3107 tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
3108 tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
3109 tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
3110 break;
3111 default:
3112 unreachable("bad state block");
3113 }
3114
3115 struct tu_cs cs;
3116 result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
3117 if (result != VK_SUCCESS)
3118 return result;
3119
3120 if (link->sampler_map.num_desc) {
3121 /* output sampler state: */
3122 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3123 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3124 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
3125 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3126 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3127 CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
3128 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3129
3130 tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
3131 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3132 }
3133
3134 /* emit texture state: */
3135 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3136 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3137 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3138 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3139 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3140 CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
3141 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3142
3143 tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
3144 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3145
3146 tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
3147 tu_cs_emit(&cs, link->texture_map.num_desc);
3148
3149 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3150 return VK_SUCCESS;
3151 }
3152
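/* Build the IBO descriptor array (SSBOs followed by storage images) for a
 * stage.  SSBOs are described as large 1D buffers of 32-bit units since
 * robustBufferAccess is not exposed; storage images copy the second of the
 * two descriptors stored per image in the set.
 */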
3153 static VkResult
3154 tu6_emit_ibo(struct tu_cmd_buffer *cmd,
3155 const struct tu_pipeline *pipeline,
3156 struct tu_descriptor_state *descriptors_state,
3157 gl_shader_stage type,
3158 struct tu_cs_entry *entry)
3159 {
3160 struct tu_cs *draw_state = &cmd->sub_cs;
3161 const struct tu_program_descriptor_linkage *link =
3162 &pipeline->program.link[type];
3163 VkResult result;
3164
3165 unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
3166
3167 if (num_desc == 0) {
3168 *entry = (struct tu_cs_entry) {};
3169 return VK_SUCCESS;
3170 }
3171
3172 struct ts_cs_memory ibo_const;
3173 result = tu_cs_alloc(draw_state, num_desc,
3174 A6XX_TEX_CONST_DWORDS, &ibo_const);
3175 if (result != VK_SUCCESS)
3176 return result;
3177
3178 int ssbo_index = 0;
3179 for (unsigned i = 0; i < link->ssbo_map.num; i++) {
3180 for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
3181 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3182
3183 uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
3184 /* We don't expose robustBufferAccess, so leave the size unlimited. */
3185 uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
3186
3187 dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
3188 dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
3189 A6XX_IBO_1_HEIGHT(sz >> 15);
3190 dst[2] = A6XX_IBO_2_UNK4 |
3191 A6XX_IBO_2_UNK31 |
3192 A6XX_IBO_2_TYPE(A6XX_TEX_1D);
3193 dst[3] = 0;
3194 dst[4] = va;
3195 dst[5] = va >> 32;
3196 for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
3197 dst[i] = 0;
3198
3199 ssbo_index++;
3200 }
3201 }
3202
3203 for (unsigned i = 0; i < link->image_map.num; i++) {
3204 for (int j = 0; j < link->image_map.array_size[i]; j++) {
3205 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3206
3207 write_image_ibo(cmd, dst,
3208 descriptors_state, &link->image_map, i, j);
3209
3210 ssbo_index++;
3211 }
3212 }
3213
3214 assert(ssbo_index == num_desc);
3215
3216 struct tu_cs cs;
3217 result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
3218 if (result != VK_SUCCESS)
3219 return result;
3220
3221 uint32_t opcode, ibo_addr_reg;
3222 enum a6xx_state_block sb;
3223 enum a6xx_state_type st;
3224
3225 switch (type) {
3226 case MESA_SHADER_FRAGMENT:
3227 opcode = CP_LOAD_STATE6;
3228 st = ST6_SHADER;
3229 sb = SB6_IBO;
3230 ibo_addr_reg = REG_A6XX_SP_IBO_LO;
3231 break;
3232 case MESA_SHADER_COMPUTE:
3233 opcode = CP_LOAD_STATE6_FRAG;
3234 st = ST6_IBO;
3235 sb = SB6_CS_SHADER;
3236 ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
3237 break;
3238 default:
3239 unreachable("unsupported stage for ibos");
3240 }
3241
3242 /* emit IBO state: */
3243 tu_cs_emit_pkt7(&cs, opcode, 3);
3244 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3245 CP_LOAD_STATE6_0_STATE_TYPE(st) |
3246 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3247 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3248 CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
3249 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3250
3251 tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
3252 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3253
3254 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3255 return VK_SUCCESS;
3256 }
3257
3258 struct PACKED bcolor_entry {
3259 uint32_t fp32[4];
3260 uint16_t ui16[4];
3261 int16_t si16[4];
3262 uint16_t fp16[4];
3263 uint16_t rgb565;
3264 uint16_t rgb5a1;
3265 uint16_t rgba4;
3266 uint8_t __pad0[2];
3267 uint8_t ui8[4];
3268 int8_t si8[4];
3269 uint32_t rgb10a2;
3270 uint32_t z24; /* also s8? */
3271 uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
3272 uint8_t __pad1[56];
3273 } border_color[] = {
3274 [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = {},
3275 [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = {},
3276 [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = {
3277 .fp32[3] = 0x3f800000,
3278 .ui16[3] = 0xffff,
3279 .si16[3] = 0x7fff,
3280 .fp16[3] = 0x3c00,
3281 .rgb5a1 = 0x8000,
3282 .rgba4 = 0xf000,
3283 .ui8[3] = 0xff,
3284 .si8[3] = 0x7f,
3285 .rgb10a2 = 0xc0000000,
3286 .srgb[3] = 0x3c00,
3287 },
3288 [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = {
3289 .fp32[3] = 1,
3290 .fp16[3] = 1,
3291 },
3292 [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = {
3293 .fp32[0 ... 3] = 0x3f800000,
3294 .ui16[0 ... 3] = 0xffff,
3295 .si16[0 ... 3] = 0x7fff,
3296 .fp16[0 ... 3] = 0x3c00,
3297 .rgb565 = 0xffff,
3298 .rgb5a1 = 0xffff,
3299 .rgba4 = 0xffff,
3300 .ui8[0 ... 3] = 0xff,
3301 .si8[0 ... 3] = 0x7f,
3302 .rgb10a2 = 0xffffffff,
3303 .z24 = 0xffffff,
3304 .srgb[0 ... 3] = 0x3c00,
3305 },
3306 [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = {
3307 .fp32[0 ... 3] = 1,
3308 .fp16[0 ... 3] = 1,
3309 },
3310 };
3311
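/* Gather a 128-byte border color entry for every VS and FS sampler into one
 * allocation and point SP_TP_BORDER_COLOR_BASE_ADDR at it, using the static
 * border_color[] table above.
 */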
3312 static VkResult
3313 tu6_emit_border_color(struct tu_cmd_buffer *cmd,
3314 struct tu_cs *cs)
3315 {
3316 STATIC_ASSERT(sizeof(struct bcolor_entry) == 128);
3317
3318 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3319 struct tu_descriptor_state *descriptors_state =
3320 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3321 const struct tu_descriptor_map *vs_sampler =
3322 &pipeline->program.link[MESA_SHADER_VERTEX].sampler_map;
3323 const struct tu_descriptor_map *fs_sampler =
3324 &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map;
3325 struct ts_cs_memory ptr;
3326
3327 VkResult result = tu_cs_alloc(&cmd->sub_cs,
3328 vs_sampler->num_desc + fs_sampler->num_desc,
3329 128 / 4,
3330 &ptr);
3331 if (result != VK_SUCCESS)
3332 return result;
3333
3334 for (unsigned i = 0; i < vs_sampler->num; i++) {
3335 for (unsigned j = 0; j < vs_sampler->array_size[i]; j++) {
3336 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3337 vs_sampler, i, j);
3338 memcpy(ptr.map, &border_color[sampler->border], 128);
3339 ptr.map += 128 / 4;
3340 }
3341 }
3342
3343 for (unsigned i = 0; i < fs_sampler->num; i++) {
3344 for (unsigned j = 0; j < fs_sampler->array_size[i]; j++) {
3345 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3346 fs_sampler, i, j);
3347 memcpy(ptr.map, &border_color[sampler->border], 128);
3348 ptr.map += 128 / 4;
3349 }
3350 }
3351
3352 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
3353 tu_cs_emit_qw(cs, ptr.iova);
3354 return VK_SUCCESS;
3355 }
3356
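/* Emit everything a draw needs: dirty dynamic state, vertex buffer bindings,
 * and the CP_SET_DRAW_STATE groups (pipeline state IBs, consts, textures,
 * IBOs, VS params), then register the referenced BOs with the bo_list.
 */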
3357 static VkResult
3358 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3359 struct tu_cs *cs,
3360 const struct tu_draw_info *draw)
3361 {
3362 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3363 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3364 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3365 uint32_t draw_state_group_count = 0;
3366 VkResult result;
3367
3368 struct tu_descriptor_state *descriptors_state =
3369 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3370
3371 /* TODO lrz */
3372
3373 tu_cs_emit_regs(cs,
3374 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3375 pipeline->ia.primitive_restart && draw->indexed));
3376
3377 if (cmd->state.dirty &
3378 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3379 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3380 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3381 dynamic->line_width);
3382 }
3383
3384 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3385 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3386 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3387 dynamic->stencil_compare_mask.back);
3388 }
3389
3390 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3391 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3392 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3393 dynamic->stencil_write_mask.back);
3394 }
3395
3396 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3397 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3398 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3399 dynamic->stencil_reference.back);
3400 }
3401
3402 if (cmd->state.dirty &
3403 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) {
3404 for (uint32_t i = 0; i < pipeline->vi.count; i++) {
3405 const uint32_t binding = pipeline->vi.bindings[i];
3406 const uint32_t stride = pipeline->vi.strides[i];
3407 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3408 const VkDeviceSize offset = buf->bo_offset +
3409 cmd->state.vb.offsets[binding] +
3410 pipeline->vi.offsets[i];
3411 const VkDeviceSize size =
3412 offset < buf->bo->size ? buf->bo->size - offset : 0;
3413
3414 tu_cs_emit_regs(cs,
3415 A6XX_VFD_FETCH_BASE(i, .bo = buf->bo, .bo_offset = offset),
3416 A6XX_VFD_FETCH_SIZE(i, size),
3417 A6XX_VFD_FETCH_STRIDE(i, stride));
3418 }
3419 }
3420
3421 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3422 draw_state_groups[draw_state_group_count++] =
3423 (struct tu_draw_state_group) {
3424 .id = TU_DRAW_STATE_PROGRAM,
3425 .enable_mask = ENABLE_DRAW,
3426 .ib = pipeline->program.state_ib,
3427 };
3428 draw_state_groups[draw_state_group_count++] =
3429 (struct tu_draw_state_group) {
3430 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3431 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3432 .ib = pipeline->program.binning_state_ib,
3433 };
3434 draw_state_groups[draw_state_group_count++] =
3435 (struct tu_draw_state_group) {
3436 .id = TU_DRAW_STATE_VI,
3437 .enable_mask = ENABLE_DRAW,
3438 .ib = pipeline->vi.state_ib,
3439 };
3440 draw_state_groups[draw_state_group_count++] =
3441 (struct tu_draw_state_group) {
3442 .id = TU_DRAW_STATE_VI_BINNING,
3443 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3444 .ib = pipeline->vi.binning_state_ib,
3445 };
3446 draw_state_groups[draw_state_group_count++] =
3447 (struct tu_draw_state_group) {
3448 .id = TU_DRAW_STATE_VP,
3449 .enable_mask = ENABLE_ALL,
3450 .ib = pipeline->vp.state_ib,
3451 };
3452 draw_state_groups[draw_state_group_count++] =
3453 (struct tu_draw_state_group) {
3454 .id = TU_DRAW_STATE_RAST,
3455 .enable_mask = ENABLE_ALL,
3456 .ib = pipeline->rast.state_ib,
3457 };
3458 draw_state_groups[draw_state_group_count++] =
3459 (struct tu_draw_state_group) {
3460 .id = TU_DRAW_STATE_DS,
3461 .enable_mask = ENABLE_ALL,
3462 .ib = pipeline->ds.state_ib,
3463 };
3464 draw_state_groups[draw_state_group_count++] =
3465 (struct tu_draw_state_group) {
3466 .id = TU_DRAW_STATE_BLEND,
3467 .enable_mask = ENABLE_ALL,
3468 .ib = pipeline->blend.state_ib,
3469 };
3470 }
3471
3472 if (cmd->state.dirty &
3473 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) {
3474 draw_state_groups[draw_state_group_count++] =
3475 (struct tu_draw_state_group) {
3476 .id = TU_DRAW_STATE_VS_CONST,
3477 .enable_mask = ENABLE_ALL,
3478 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3479 };
3480 draw_state_groups[draw_state_group_count++] =
3481 (struct tu_draw_state_group) {
3482 .id = TU_DRAW_STATE_FS_CONST,
3483 .enable_mask = ENABLE_DRAW,
3484 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3485 };
3486 }
3487
3488 if (cmd->state.dirty &
3489 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
3490 bool needs_border = false;
3491 struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
3492
3493 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3494 MESA_SHADER_VERTEX, &vs_tex, &needs_border,
3495 false);
3496 if (result != VK_SUCCESS)
3497 return result;
3498
3499 /* TODO: we could emit just one texture descriptor draw state when there
3500 * are no input attachments, which is the most common case. We could
3501 * also split out the sampler state, which doesn't change even for input
3502 * attachments.
3503 */
3504 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3505 MESA_SHADER_FRAGMENT, &fs_tex_sysmem,
3506 &needs_border, true);
3507 if (result != VK_SUCCESS)
3508 return result;
3509
3510 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3511 MESA_SHADER_FRAGMENT, &fs_tex_gmem,
3512 &needs_border, false);
3513 if (result != VK_SUCCESS)
3514 return result;
3515
3516 result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
3517 MESA_SHADER_FRAGMENT, &fs_ibo);
3518 if (result != VK_SUCCESS)
3519 return result;
3520
3521 draw_state_groups[draw_state_group_count++] =
3522 (struct tu_draw_state_group) {
3523 .id = TU_DRAW_STATE_VS_TEX,
3524 .enable_mask = ENABLE_ALL,
3525 .ib = vs_tex,
3526 };
3527 draw_state_groups[draw_state_group_count++] =
3528 (struct tu_draw_state_group) {
3529 .id = TU_DRAW_STATE_FS_TEX_GMEM,
3530 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3531 .ib = fs_tex_gmem,
3532 };
3533 draw_state_groups[draw_state_group_count++] =
3534 (struct tu_draw_state_group) {
3535 .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
3536 .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
3537 .ib = fs_tex_sysmem,
3538 };
3539 draw_state_groups[draw_state_group_count++] =
3540 (struct tu_draw_state_group) {
3541 .id = TU_DRAW_STATE_FS_IBO,
3542 .enable_mask = ENABLE_DRAW,
3543 .ib = fs_ibo,
3544 };
3545
3546 if (needs_border) {
3547 result = tu6_emit_border_color(cmd, cs);
3548 if (result != VK_SUCCESS)
3549 return result;
3550 }
3551 }
3552
3553 struct tu_cs_entry vs_params;
3554 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3555 if (result != VK_SUCCESS)
3556 return result;
3557
3558 draw_state_groups[draw_state_group_count++] =
3559 (struct tu_draw_state_group) {
3560 .id = TU_DRAW_STATE_VS_PARAMS,
3561 .enable_mask = ENABLE_ALL,
3562 .ib = vs_params,
3563 };
3564
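/* Emit every collected group in one CP_SET_DRAW_STATE packet.  Each
 * group takes three dwords: a header with the IB size in dwords, the
 * enable mask and the group id, followed by the 64-bit IB address.
 * Groups with an empty IB get the DISABLE bit and a null address.
 */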
3565 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3566 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3567 const struct tu_draw_state_group *group = &draw_state_groups[i];
3568 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3569 uint32_t cp_set_draw_state =
3570 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3571 group->enable_mask |
3572 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3573 uint64_t iova;
3574 if (group->ib.size) {
3575 iova = group->ib.bo->iova + group->ib.offset;
3576 } else {
3577 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3578 iova = 0;
3579 }
3580
3581 tu_cs_emit(cs, cp_set_draw_state);
3582 tu_cs_emit_qw(cs, iova);
3583 }
3584
3585 tu_cs_sanity_check(cs);
3586
3587 /* track BOs */
3588 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3589 for (uint32_t i = 0; i < MAX_VBS; i++) {
3590 const struct tu_buffer *buf = cmd->state.vb.buffers[i];
3591 if (buf)
3592 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3593 }
3594 }
3595 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3596 unsigned i;
3597 for_each_bit(i, descriptors_state->valid) {
3598 struct tu_descriptor_set *set = descriptors_state->sets[i];
3599 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3600 if (set->descriptors[j]) {
3601 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3602 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3603 }
3604 }
3605 }
3606
3607 /* Fragment shader state overwrites compute shader state, so flag the
3608 * compute pipeline for re-emit.
3609 */
3610 cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
3611 return VK_SUCCESS;
3612 }
3613
3614 static void
3615 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3616 struct tu_cs *cs,
3617 const struct tu_draw_info *draw)
3618 {
3620 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3621
3622 tu_cs_emit_regs(cs,
3623 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3624 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3625
3626 /* TODO hw binning */
3627 if (draw->indexed) {
3628 const enum a4xx_index_size index_size =
3629 tu6_index_size(cmd->state.index_type);
3630 const uint32_t index_bytes =
3631 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3632 const struct tu_buffer *buf = cmd->state.index_buffer;
3633 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3634 index_bytes * draw->first_index;
3635 const uint32_t size = index_bytes * draw->count;
3636
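/* Indexed draw: CP_DRAW_INDX_OFFSET fetches the indices via DMA from the
 * bound index buffer, so the packet carries the primitive type, index
 * size, instance and index counts, and the iova/size of the index data.
 * USE_VISIBILITY lets the CP cull against the visibility stream produced
 * by the binning pass.
 */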
3637 const uint32_t cp_draw_indx =
3638 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3639 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3640 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3641 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3642
3643 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3644 tu_cs_emit(cs, cp_draw_indx);
3645 tu_cs_emit(cs, draw->instance_count);
3646 tu_cs_emit(cs, draw->count);
3647 tu_cs_emit(cs, 0x0); /* XXX */
3648 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3649 tu_cs_emit(cs, size);
3650 } else {
3651 const uint32_t cp_draw_indx =
3652 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3653 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3654 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3655
3656 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3657 tu_cs_emit(cs, cp_draw_indx);
3658 tu_cs_emit(cs, draw->instance_count);
3659 tu_cs_emit(cs, draw->count);
3660 }
3661 }
3662
3663 static void
3664 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3665 {
3666 struct tu_cs *cs = &cmd->draw_cs;
3667 VkResult result;
3668
3669 result = tu6_bind_draw_states(cmd, cs, draw);
3670 if (result != VK_SUCCESS) {
3671 cmd->record_result = result;
3672 return;
3673 }
3674
3675 if (draw->indirect) {
3676 tu_finishme("indirect draw");
3677 return;
3678 }
3679
3680 tu6_emit_draw_direct(cmd, cs, draw);
3681
3682 cmd->wait_for_idle = true;
3683
3684 tu_cs_sanity_check(cs);
3685 }
3686
3687 void
3688 tu_CmdDraw(VkCommandBuffer commandBuffer,
3689 uint32_t vertexCount,
3690 uint32_t instanceCount,
3691 uint32_t firstVertex,
3692 uint32_t firstInstance)
3693 {
3694 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3695 struct tu_draw_info info = {};
3696
3697 info.count = vertexCount;
3698 info.instance_count = instanceCount;
3699 info.first_instance = firstInstance;
3700 info.vertex_offset = firstVertex;
3701
3702 tu_draw(cmd_buffer, &info);
3703 }
3704
3705 void
3706 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3707 uint32_t indexCount,
3708 uint32_t instanceCount,
3709 uint32_t firstIndex,
3710 int32_t vertexOffset,
3711 uint32_t firstInstance)
3712 {
3713 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3714 struct tu_draw_info info = {};
3715
3716 info.indexed = true;
3717 info.count = indexCount;
3718 info.instance_count = instanceCount;
3719 info.first_index = firstIndex;
3720 info.vertex_offset = vertexOffset;
3721 info.first_instance = firstInstance;
3722
3723 tu_draw(cmd_buffer, &info);
3724 }
3725
3726 void
3727 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3728 VkBuffer _buffer,
3729 VkDeviceSize offset,
3730 uint32_t drawCount,
3731 uint32_t stride)
3732 {
3733 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3734 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3735 struct tu_draw_info info = {};
3736
3737 info.count = drawCount;
3738 info.indirect = buffer;
3739 info.indirect_offset = offset;
3740 info.stride = stride;
3741
3742 tu_draw(cmd_buffer, &info);
3743 }
3744
3745 void
3746 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3747 VkBuffer _buffer,
3748 VkDeviceSize offset,
3749 uint32_t drawCount,
3750 uint32_t stride)
3751 {
3752 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3753 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3754 struct tu_draw_info info = {};
3755
3756 info.indexed = true;
3757 info.count = drawCount;
3758 info.indirect = buffer;
3759 info.indirect_offset = offset;
3760 info.stride = stride;
3761
3762 tu_draw(cmd_buffer, &info);
3763 }
3764
3765 struct tu_dispatch_info
3766 {
3767 /**
3768 * Dimensions of the dispatch grid, in workgroup (block) units.
3769 */
3770 uint32_t blocks[3];
3771
3772 /**
3773 * A starting offset for the grid. If unaligned is set, the offset
3774 * must still be aligned.
3775 */
3776 uint32_t offsets[3];
3777 /**
3778 * Whether it's an unaligned compute dispatch.
3779 */
3780 bool unaligned;
3781
3782 /**
3783 * Indirect compute parameters resource.
3784 */
3785 struct tu_buffer *indirect;
3786 uint64_t indirect_offset;
3787 };
3788
3789 static void
3790 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3791 const struct tu_dispatch_info *info)
3792 {
3793 gl_shader_stage type = MESA_SHADER_COMPUTE;
3794 const struct tu_program_descriptor_linkage *link =
3795 &pipeline->program.link[type];
3796 const struct ir3_const_state *const_state = &link->const_state;
3797 uint32_t offset = const_state->offsets.driver_param;
3798
3799 if (link->constlen <= offset)
3800 return;
3801
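/* For a direct dispatch the driver params (workgroup counts and local
 * sizes) are uploaded as constants with CP_LOAD_STATE6, starting at the
 * const file offset reserved for driver params and clamped to the
 * shader's constlen.
 */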
3802 if (!info->indirect) {
3803 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3804 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3805 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3806 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3807 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3808 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3809 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3810 };
3811
3812 uint32_t num_consts = MIN2(const_state->num_driver_params,
3813 (link->constlen - offset) * 4);
3814 /* push constants */
3815 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3816 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3817 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3818 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3819 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3820 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3821 tu_cs_emit(cs, 0);
3822 tu_cs_emit(cs, 0);
3823 uint32_t i;
3824 for (i = 0; i < num_consts; i++)
3825 tu_cs_emit(cs, driver_params[i]);
3826 } else {
3827 tu_finishme("Indirect driver params");
3828 }
3829 }
3830
3831 static void
3832 tu_dispatch(struct tu_cmd_buffer *cmd,
3833 const struct tu_dispatch_info *info)
3834 {
3835 struct tu_cs *cs = &cmd->cs;
3836 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3837 struct tu_descriptor_state *descriptors_state =
3838 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3839 VkResult result;
3840
3841 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3842 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3843
3844 struct tu_cs_entry ib;
3845
3846 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3847 if (ib.size)
3848 tu_cs_emit_ib(cs, &ib);
3849
3850 tu_emit_compute_driver_params(cs, pipeline, info);
3851
3852 bool needs_border = false;
3853 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3854 MESA_SHADER_COMPUTE, &ib, &needs_border, false);
3855 if (result != VK_SUCCESS) {
3856 cmd->record_result = result;
3857 return;
3858 }
3859
3860 if (ib.size)
3861 tu_cs_emit_ib(cs, &ib);
3862
3863 if (needs_border)
3864 tu_finishme("compute border color");
3865
3866 result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
3867 if (result != VK_SUCCESS) {
3868 cmd->record_result = result;
3869 return;
3870 }
3871
3872 if (ib.size)
3873 tu_cs_emit_ib(cs, &ib);
3874
3875 /* track BOs */
3876 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3877 unsigned i;
3878 for_each_bit(i, descriptors_state->valid) {
3879 struct tu_descriptor_set *set = descriptors_state->sets[i];
3880 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3881 if (set->descriptors[j]) {
3882 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3883 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3884 }
3885 }
3886 }
3887
3888 /* Compute shader state overwrites fragment shader state, so we flag the
3889 * graphics pipeline for re-emit.
3890 */
3891 cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
3892
3893 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3894 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3895
3896 const uint32_t *local_size = pipeline->compute.local_size;
3897 const uint32_t *num_groups = info->blocks;
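/* Program the compute NDRANGE: local workgroup sizes are encoded minus
 * one, the global size per dimension is local size times workgroup
 * count, and the global offsets are left at zero.
 */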
3898 tu_cs_emit_regs(cs,
3899 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3900 .localsizex = local_size[0] - 1,
3901 .localsizey = local_size[1] - 1,
3902 .localsizez = local_size[2] - 1),
3903 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3904 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3905 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3906 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3907 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3908 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3909
3910 tu_cs_emit_regs(cs,
3911 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3912 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3913 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3914
3915 if (info->indirect) {
3916 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3917
3918 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3919 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3920
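/* CP_EXEC_CS_INDIRECT reads the workgroup counts from the indirect
 * buffer at 'iova'; only the local workgroup size is encoded in the
 * packet itself.
 */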
3921 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3922 tu_cs_emit(cs, 0x00000000);
3923 tu_cs_emit_qw(cs, iova);
3924 tu_cs_emit(cs,
3925 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3926 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3927 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3928 } else {
3929 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3930 tu_cs_emit(cs, 0x00000000);
3931 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3932 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3933 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3934 }
3935
3936 tu_cs_emit_wfi(cs);
3937
3938 tu6_emit_cache_flush(cmd, cs);
3939 }
3940
3941 void
3942 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3943 uint32_t base_x,
3944 uint32_t base_y,
3945 uint32_t base_z,
3946 uint32_t x,
3947 uint32_t y,
3948 uint32_t z)
3949 {
3950 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3951 struct tu_dispatch_info info = {};
3952
3953 info.blocks[0] = x;
3954 info.blocks[1] = y;
3955 info.blocks[2] = z;
3956
3957 info.offsets[0] = base_x;
3958 info.offsets[1] = base_y;
3959 info.offsets[2] = base_z;
3960 tu_dispatch(cmd_buffer, &info);
3961 }
3962
3963 void
3964 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3965 uint32_t x,
3966 uint32_t y,
3967 uint32_t z)
3968 {
3969 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3970 }
3971
3972 void
3973 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3974 VkBuffer _buffer,
3975 VkDeviceSize offset)
3976 {
3977 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3978 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3979 struct tu_dispatch_info info = {};
3980
3981 info.indirect = buffer;
3982 info.indirect_offset = offset;
3983
3984 tu_dispatch(cmd_buffer, &info);
3985 }
3986
3987 void
3988 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3989 {
3990 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3991
3992 tu_cs_end(&cmd_buffer->draw_cs);
3993 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
3994
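/* Replay the recorded draw_cs: once straight to system memory, or once
 * per tile when rendering through GMEM.
 */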
3995 if (use_sysmem_rendering(cmd_buffer))
3996 tu_cmd_render_sysmem(cmd_buffer);
3997 else
3998 tu_cmd_render_tiles(cmd_buffer);
3999
4000 /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
4001 rendered */
4002 tu_cs_discard_entries(&cmd_buffer->draw_cs);
4003 tu_cs_begin(&cmd_buffer->draw_cs);
4004 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
4005 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
4006
4007 cmd_buffer->state.pass = NULL;
4008 cmd_buffer->state.subpass = NULL;
4009 cmd_buffer->state.framebuffer = NULL;
4010 }
4011
4012 void
4013 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4014 const VkSubpassEndInfoKHR *pSubpassEndInfo)
4015 {
4016 tu_CmdEndRenderPass(commandBuffer);
4017 }
4018
4019 struct tu_barrier_info
4020 {
4021 uint32_t eventCount;
4022 const VkEvent *pEvents;
4023 VkPipelineStageFlags srcStageMask;
4024 };
4025
4026 static void
4027 tu_barrier(struct tu_cmd_buffer *cmd_buffer,
4028 uint32_t memoryBarrierCount,
4029 const VkMemoryBarrier *pMemoryBarriers,
4030 uint32_t bufferMemoryBarrierCount,
4031 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4032 uint32_t imageMemoryBarrierCount,
4033 const VkImageMemoryBarrier *pImageMemoryBarriers,
4034 const struct tu_barrier_info *info)
4035 {
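/* Currently a no-op: execution and memory dependencies are not yet
 * handled here.
 */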
4036 }
4037
4038 void
4039 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4040 VkPipelineStageFlags srcStageMask,
4041 VkPipelineStageFlags dstStageMask,
4042 VkDependencyFlags dependencyFlags,
4043 uint32_t memoryBarrierCount,
4044 const VkMemoryBarrier *pMemoryBarriers,
4045 uint32_t bufferMemoryBarrierCount,
4046 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4047 uint32_t imageMemoryBarrierCount,
4048 const VkImageMemoryBarrier *pImageMemoryBarriers)
4049 {
4050 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4051 struct tu_barrier_info info;
4052
4053 info.eventCount = 0;
4054 info.pEvents = NULL;
4055 info.srcStageMask = srcStageMask;
4056
4057 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4058 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4059 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4060 }
4061
4062 static void
4063 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value)
4064 {
4065 struct tu_cs *cs = &cmd->cs;
4066
4067 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4068
4069 /* TODO: any flush required before/after? */
4070
4071 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4072 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4073 tu_cs_emit(cs, value);
4074 }
4075
4076 void
4077 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4078 VkEvent _event,
4079 VkPipelineStageFlags stageMask)
4080 {
4081 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4082 TU_FROM_HANDLE(tu_event, event, _event);
4083
4084 write_event(cmd, event, 1);
4085 }
4086
4087 void
4088 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4089 VkEvent _event,
4090 VkPipelineStageFlags stageMask)
4091 {
4092 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4093 TU_FROM_HANDLE(tu_event, event, _event);
4094
4095 write_event(cmd, event, 0);
4096 }
4097
4098 void
4099 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4100 uint32_t eventCount,
4101 const VkEvent *pEvents,
4102 VkPipelineStageFlags srcStageMask,
4103 VkPipelineStageFlags dstStageMask,
4104 uint32_t memoryBarrierCount,
4105 const VkMemoryBarrier *pMemoryBarriers,
4106 uint32_t bufferMemoryBarrierCount,
4107 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4108 uint32_t imageMemoryBarrierCount,
4109 const VkImageMemoryBarrier *pImageMemoryBarriers)
4110 {
4111 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4112 struct tu_cs *cs = &cmd->cs;
4113
4114 /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */
4115
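/* Wait on each event by polling its BO until the masked value equals the
 * reference (1), i.e. until CmdSetEvent has signaled it.
 */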
4116 for (uint32_t i = 0; i < eventCount; i++) {
4117 TU_FROM_HANDLE(tu_event, event, pEvents[i]);
4118
4119 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4120
4121 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4122 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4123 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4124 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4125 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4126 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4127 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4128 }
4129 }
4130
4131 void
4132 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4133 {
4134 /* No-op */
4135 }