turnip: fall back to sysmem when attachments don't fit into gmem
mesa.git: src/freedreno/vulkan/tu_cmd_buffer.c
1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28 #include "tu_private.h"
29
30 #include "registers/adreno_pm4.xml.h"
31 #include "registers/adreno_common.xml.h"
32
33 #include "vk_format.h"
34
35 #include "tu_cs.h"
36 #include "tu_blit.h"
37
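/* CP scratch register used to pass the VSC (visibility stream) overflow
 * status from the binning pass to the per-tile commands: after
 * emit_vsc_overflow_test() it holds 1 when the visibility data is valid and
 * 0 on overflow, and the per-tile IBs test bit 0 with CP_REG_TEST /
 * CP_COND_REG_EXEC to fall back to "all visible" when it is 0.
 */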
38 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
39
40 void
41 tu_bo_list_init(struct tu_bo_list *list)
42 {
43 list->count = list->capacity = 0;
44 list->bo_infos = NULL;
45 }
46
47 void
48 tu_bo_list_destroy(struct tu_bo_list *list)
49 {
50 free(list->bo_infos);
51 }
52
53 void
54 tu_bo_list_reset(struct tu_bo_list *list)
55 {
56 list->count = 0;
57 }
58
59 /**
60 * \a flags consists of MSM_SUBMIT_BO_FLAGS.
61 */
62 static uint32_t
63 tu_bo_list_add_info(struct tu_bo_list *list,
64 const struct drm_msm_gem_submit_bo *bo_info)
65 {
66 assert(bo_info->handle != 0);
67
68 for (uint32_t i = 0; i < list->count; ++i) {
69 if (list->bo_infos[i].handle == bo_info->handle) {
70 assert(list->bo_infos[i].presumed == bo_info->presumed);
71 list->bo_infos[i].flags |= bo_info->flags;
72 return i;
73 }
74 }
75
76 /* grow list->bo_infos if needed */
77 if (list->count == list->capacity) {
78 uint32_t new_capacity = MAX2(2 * list->count, 16);
79 struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
80 list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
81 if (!new_bo_infos)
82 return TU_BO_LIST_FAILED;
83 list->bo_infos = new_bo_infos;
84 list->capacity = new_capacity;
85 }
86
87 list->bo_infos[list->count] = *bo_info;
88 return list->count++;
89 }
90
91 uint32_t
92 tu_bo_list_add(struct tu_bo_list *list,
93 const struct tu_bo *bo,
94 uint32_t flags)
95 {
96 return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
97 .flags = flags,
98 .handle = bo->gem_handle,
99 .presumed = bo->iova,
100 });
101 }
102
103 VkResult
104 tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
105 {
106 for (uint32_t i = 0; i < other->count; i++) {
107 if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
108 return VK_ERROR_OUT_OF_HOST_MEMORY;
109 }
110
111 return VK_SUCCESS;
112 }
113
114 static bool
115 is_linear_mipmapped(const struct tu_image_view *iview)
116 {
117 return iview->image->layout.tile_mode == TILE6_LINEAR &&
118 iview->base_mip != iview->image->level_count - 1;
119 }
120
121 static bool
122 force_sysmem(const struct tu_cmd_buffer *cmd,
123 const struct VkRect2D *render_area)
124 {
125 const struct tu_framebuffer *fb = cmd->state.framebuffer;
126 const struct tu_physical_device *device = cmd->device->physical_device;
127 bool has_linear_mipmapped_store = false;
128 const struct tu_render_pass *pass = cmd->state.pass;
129
130 /* Iterate over all the places we call tu6_emit_store_attachment() */
131 for (unsigned i = 0; i < pass->subpass_count; i++) {
132 const struct tu_subpass *subpass = &pass->subpasses[i];
133 if (subpass->resolve_attachments) {
134 for (unsigned j = 0; j < subpass->color_count; j++) {
135 uint32_t a = subpass->resolve_attachments[j].attachment;
136 if (a != VK_ATTACHMENT_UNUSED &&
137 cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
138 const struct tu_image_view *iview = fb->attachments[a].attachment;
139 if (is_linear_mipmapped(iview)) {
140 has_linear_mipmapped_store = true;
141 break;
142 }
143 }
144 }
145 }
146 }
147
148 for (unsigned i = 0; i < pass->attachment_count; i++) {
149 if (pass->attachments[i].gmem_offset >= 0 &&
150 cmd->state.pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE) {
151 const struct tu_image_view *iview = fb->attachments[i].attachment;
152 if (is_linear_mipmapped(iview)) {
153 has_linear_mipmapped_store = true;
154 break;
155 }
156 }
157 }
158
159 /* Linear textures cannot have any padding between mipmap levels and their
160 * height isn't padded, while at the same time the GMEM->MEM resolve does
161 * not have per-pixel granularity, so if the image height isn't aligned to
162 * the resolve granularity and the render area is tall enough, we may wind
163 * up writing past the bottom of the image into the next miplevel or even
164 * past the end of the image. For the last miplevel this is harmless, since
165 * the layout code should insert enough padding for the overdraw to land in it;
166 * for earlier miplevels, we work around this by force-enabling sysmem rendering.
167 */
168 const uint32_t y2 = render_area->offset.y + render_area->extent.height;
169 const uint32_t aligned_y2 = ALIGN_POT(y2, device->tile_align_h);
170
171 return has_linear_mipmapped_store && aligned_y2 > fb->height;
172 }
173
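/* Compute the tile grid for the GMEM path: start from a single tile covering
 * the alignment-padded render area, split horizontally while a tile exceeds
 * the maximum tile width, then keep splitting the wider axis until
 * tile_width * tile_height fits into the pixel budget passed in ("pixels",
 * i.e. pass->gmem_pixels).
 *
 * Illustrative example (hypothetical numbers): a 1000x600 render area with
 * 32x32 tile alignment and a budget of 131072 pixels starts as one 1024x608
 * tile and ends up as a 3x2 grid of 352x320 tiles (352 * 320 = 112640 <=
 * 131072).
 */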
174 static void
175 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
176 const struct tu_device *dev,
177 uint32_t pixels)
178 {
179 const uint32_t tile_align_w = dev->physical_device->tile_align_w;
180 const uint32_t tile_align_h = dev->physical_device->tile_align_h;
181 const uint32_t max_tile_width = 1024; /* A6xx */
182
183 /* note: don't offset the tiling config by render_area.offset,
184 * because the binning pass can't deal with it.
185 * This means we might end up with more tiles than necessary,
186 * but load/store/etc. are still scissored to the render_area.
187 */
188 tiling->tile0.offset = (VkOffset2D) {};
189
190 const uint32_t ra_width =
191 tiling->render_area.extent.width +
192 (tiling->render_area.offset.x - tiling->tile0.offset.x);
193 const uint32_t ra_height =
194 tiling->render_area.extent.height +
195 (tiling->render_area.offset.y - tiling->tile0.offset.y);
196
197 /* start from 1 tile */
198 tiling->tile_count = (VkExtent2D) {
199 .width = 1,
200 .height = 1,
201 };
202 tiling->tile0.extent = (VkExtent2D) {
203 .width = align(ra_width, tile_align_w),
204 .height = align(ra_height, tile_align_h),
205 };
206
207 if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
208 /* start with 2x2 tiles */
209 tiling->tile_count.width = 2;
210 tiling->tile_count.height = 2;
211 tiling->tile0.extent.width = align(DIV_ROUND_UP(ra_width, 2), tile_align_w);
212 tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), tile_align_h);
213 }
214
215 /* do not exceed max tile width */
216 while (tiling->tile0.extent.width > max_tile_width) {
217 tiling->tile_count.width++;
218 tiling->tile0.extent.width =
219 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
220 }
221
222 /* pixels == 0 will force sysmem rendering; don't bother trying to compute
223 * a valid tile config. TODO: just skip all GMEM stuff when sysmem is forced?
224 */
225 if (!pixels)
226 return;
227
228 /* do not exceed gmem size */
229 while (tiling->tile0.extent.width * tiling->tile0.extent.height > pixels) {
230 if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) {
231 tiling->tile_count.width++;
232 tiling->tile0.extent.width =
233 align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
234 } else {
235 /* if this assert fails then layout is impossible.. */
236 assert(tiling->tile0.extent.height > tile_align_h);
237 tiling->tile_count.height++;
238 tiling->tile0.extent.height =
239 align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), tile_align_h);
240 }
241 }
242 }
243
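/* Distribute the tile grid over the (at most 32) VSC pipes: pipe0 is the
 * number of tiles one pipe covers in each direction, grown until
 * pipe_count.width * pipe_count.height fits within max_pipe_count.
 */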
244 static void
245 tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
246 const struct tu_device *dev)
247 {
248 const uint32_t max_pipe_count = 32; /* A6xx */
249
250 /* start from 1 tile per pipe */
251 tiling->pipe0 = (VkExtent2D) {
252 .width = 1,
253 .height = 1,
254 };
255 tiling->pipe_count = tiling->tile_count;
256
257 /* do not exceed max pipe count vertically */
258 while (tiling->pipe_count.height > max_pipe_count) {
259 tiling->pipe0.height += 2;
260 tiling->pipe_count.height =
261 (tiling->tile_count.height + tiling->pipe0.height - 1) /
262 tiling->pipe0.height;
263 }
264
265 /* do not exceed max pipe count */
266 while (tiling->pipe_count.width * tiling->pipe_count.height >
267 max_pipe_count) {
268 tiling->pipe0.width += 1;
269 tiling->pipe_count.width =
270 (tiling->tile_count.width + tiling->pipe0.width - 1) /
271 tiling->pipe0.width;
272 }
273 }
274
275 static void
276 tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
277 const struct tu_device *dev)
278 {
279 const uint32_t max_pipe_count = 32; /* A6xx */
280 const uint32_t used_pipe_count =
281 tiling->pipe_count.width * tiling->pipe_count.height;
282 const VkExtent2D last_pipe = {
283 .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
284 .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
285 };
286
287 assert(used_pipe_count <= max_pipe_count);
288 assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
289
290 for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
291 for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
292 const uint32_t pipe_x = tiling->pipe0.width * x;
293 const uint32_t pipe_y = tiling->pipe0.height * y;
294 const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
295 ? last_pipe.width
296 : tiling->pipe0.width;
297 const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
298 ? last_pipe.height
299 : tiling->pipe0.height;
300 const uint32_t n = tiling->pipe_count.width * y + x;
301
302 tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
303 A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
304 A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
305 A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
306 tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
307 }
308 }
309
310 memset(tiling->pipe_config + used_pipe_count, 0,
311 sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
312 }
313
314 static void
315 tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
316 const struct tu_device *dev,
317 uint32_t tx,
318 uint32_t ty,
319 struct tu_tile *tile)
320 {
321 /* find the pipe and the slot for tile (tx, ty) */
322 const uint32_t px = tx / tiling->pipe0.width;
323 const uint32_t py = ty / tiling->pipe0.height;
324 const uint32_t sx = tx - tiling->pipe0.width * px;
325 const uint32_t sy = ty - tiling->pipe0.height * py;
326
327 assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
328 assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
329 assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
330
331 /* convert to 1D indices */
332 tile->pipe = tiling->pipe_count.width * py + px;
333 tile->slot = tiling->pipe0.width * sy + sx;
334
335 /* get the blit area for the tile */
336 tile->begin = (VkOffset2D) {
337 .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
338 .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
339 };
340 tile->end.x =
341 (tx == tiling->tile_count.width - 1)
342 ? tiling->render_area.offset.x + tiling->render_area.extent.width
343 : tile->begin.x + tiling->tile0.extent.width;
344 tile->end.y =
345 (ty == tiling->tile_count.height - 1)
346 ? tiling->render_area.offset.y + tiling->render_area.extent.height
347 : tile->begin.y + tiling->tile0.extent.height;
348 }
349
350 enum a3xx_msaa_samples
351 tu_msaa_samples(uint32_t samples)
352 {
353 switch (samples) {
354 case 1:
355 return MSAA_ONE;
356 case 2:
357 return MSAA_TWO;
358 case 4:
359 return MSAA_FOUR;
360 case 8:
361 return MSAA_EIGHT;
362 default:
363 assert(!"invalid sample count");
364 return MSAA_ONE;
365 }
366 }
367
368 static enum a4xx_index_size
369 tu6_index_size(VkIndexType type)
370 {
371 switch (type) {
372 case VK_INDEX_TYPE_UINT16:
373 return INDEX4_SIZE_16_BIT;
374 case VK_INDEX_TYPE_UINT32:
375 return INDEX4_SIZE_32_BIT;
376 default:
377 unreachable("invalid VkIndexType");
378 return INDEX4_SIZE_8_BIT;
379 }
380 }
381
382 unsigned
383 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
384 struct tu_cs *cs,
385 enum vgt_event_type event,
386 bool need_seqno)
387 {
388 unsigned seqno = 0;
389
390 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
391 tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
392 if (need_seqno) {
393 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
394 seqno = ++cmd->scratch_seqno;
395 tu_cs_emit(cs, seqno);
396 }
397
398 return seqno;
399 }
400
401 static void
402 tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
403 {
404 tu6_emit_event_write(cmd, cs, 0x31, false);
405 }
406
407 static void
408 tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
409 {
410 tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false);
411 }
412
413 static void
414 tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
415 {
416 if (cmd->wait_for_idle) {
417 tu_cs_emit_wfi(cs);
418 cmd->wait_for_idle = false;
419 }
420 }
421
422 #define tu_image_view_ubwc_pitches(iview) \
423 .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \
424 .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2
425
426 static void
427 tu6_emit_zs(struct tu_cmd_buffer *cmd,
428 const struct tu_subpass *subpass,
429 struct tu_cs *cs)
430 {
431 const struct tu_framebuffer *fb = cmd->state.framebuffer;
432
433 const uint32_t a = subpass->depth_stencil_attachment.attachment;
434 if (a == VK_ATTACHMENT_UNUSED) {
435 tu_cs_emit_regs(cs,
436 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
437 A6XX_RB_DEPTH_BUFFER_PITCH(0),
438 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
439 A6XX_RB_DEPTH_BUFFER_BASE(0),
440 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
441
442 tu_cs_emit_regs(cs,
443 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
444
445 tu_cs_emit_regs(cs,
446 A6XX_GRAS_LRZ_BUFFER_BASE(0),
447 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
448 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
449
450 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
451
452 return;
453 }
454
455 const struct tu_image_view *iview = fb->attachments[a].attachment;
456 enum a6xx_depth_format fmt = tu6_pipe2depth(iview->vk_format);
457
458 tu_cs_emit_regs(cs,
459 A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt),
460 A6XX_RB_DEPTH_BUFFER_PITCH(tu_image_stride(iview->image, iview->base_mip)),
461 A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(iview->image->layout.layer_size),
462 A6XX_RB_DEPTH_BUFFER_BASE(tu_image_view_base_ref(iview)),
463 A6XX_RB_DEPTH_BUFFER_BASE_GMEM(cmd->state.pass->attachments[a].gmem_offset));
464
465 tu_cs_emit_regs(cs,
466 A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
467
468 tu_cs_emit_regs(cs,
469 A6XX_RB_DEPTH_FLAG_BUFFER_BASE(tu_image_view_ubwc_base_ref(iview)),
470 A6XX_RB_DEPTH_FLAG_BUFFER_PITCH(tu_image_view_ubwc_pitches(iview)));
471
472 tu_cs_emit_regs(cs,
473 A6XX_GRAS_LRZ_BUFFER_BASE(0),
474 A6XX_GRAS_LRZ_BUFFER_PITCH(0),
475 A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
476
477 tu_cs_emit_regs(cs,
478 A6XX_RB_STENCIL_INFO(0));
479
480 /* enable zs? */
481 }
482
483 static void
484 tu6_emit_mrt(struct tu_cmd_buffer *cmd,
485 const struct tu_subpass *subpass,
486 struct tu_cs *cs)
487 {
488 const struct tu_framebuffer *fb = cmd->state.framebuffer;
489 unsigned char mrt_comp[MAX_RTS] = { 0 };
490 unsigned srgb_cntl = 0;
491
492 for (uint32_t i = 0; i < subpass->color_count; ++i) {
493 uint32_t a = subpass->color_attachments[i].attachment;
494 if (a == VK_ATTACHMENT_UNUSED)
495 continue;
496
497 const struct tu_image_view *iview = fb->attachments[a].attachment;
498 const enum a6xx_tile_mode tile_mode =
499 tu6_get_image_tile_mode(iview->image, iview->base_mip);
500
501 mrt_comp[i] = 0xf;
502
503 if (vk_format_is_srgb(iview->vk_format))
504 srgb_cntl |= (1 << i);
505
506 const struct tu_native_format format =
507 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
508
509 tu_cs_emit_regs(cs,
510 A6XX_RB_MRT_BUF_INFO(i,
511 .color_tile_mode = tile_mode,
512 .color_format = format.fmt,
513 .color_swap = format.swap),
514 A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)),
515 A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size),
516 A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)),
517 A6XX_RB_MRT_BASE_GMEM(i, cmd->state.pass->attachments[a].gmem_offset));
518
519 tu_cs_emit_regs(cs,
520 A6XX_SP_FS_MRT_REG(i,
521 .color_format = format.fmt,
522 .color_sint = vk_format_is_sint(iview->vk_format),
523 .color_uint = vk_format_is_uint(iview->vk_format)));
524
525 tu_cs_emit_regs(cs,
526 A6XX_RB_MRT_FLAG_BUFFER_ADDR(i, tu_image_view_ubwc_base_ref(iview)),
527 A6XX_RB_MRT_FLAG_BUFFER_PITCH(i, tu_image_view_ubwc_pitches(iview)));
528 }
529
530 tu_cs_emit_regs(cs,
531 A6XX_RB_SRGB_CNTL(.dword = srgb_cntl));
532
533 tu_cs_emit_regs(cs,
534 A6XX_SP_SRGB_CNTL(.dword = srgb_cntl));
535
536 tu_cs_emit_regs(cs,
537 A6XX_RB_RENDER_COMPONENTS(
538 .rt0 = mrt_comp[0],
539 .rt1 = mrt_comp[1],
540 .rt2 = mrt_comp[2],
541 .rt3 = mrt_comp[3],
542 .rt4 = mrt_comp[4],
543 .rt5 = mrt_comp[5],
544 .rt6 = mrt_comp[6],
545 .rt7 = mrt_comp[7]));
546
547 tu_cs_emit_regs(cs,
548 A6XX_SP_FS_RENDER_COMPONENTS(
549 .rt0 = mrt_comp[0],
550 .rt1 = mrt_comp[1],
551 .rt2 = mrt_comp[2],
552 .rt3 = mrt_comp[3],
553 .rt4 = mrt_comp[4],
554 .rt5 = mrt_comp[5],
555 .rt6 = mrt_comp[6],
556 .rt7 = mrt_comp[7]));
557 }
558
559 static void
560 tu6_emit_msaa(struct tu_cmd_buffer *cmd,
561 const struct tu_subpass *subpass,
562 struct tu_cs *cs)
563 {
564 const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples);
565 bool msaa_disable = samples == MSAA_ONE;
566
567 tu_cs_emit_regs(cs,
568 A6XX_SP_TP_RAS_MSAA_CNTL(samples),
569 A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
570 .msaa_disable = msaa_disable));
571
572 tu_cs_emit_regs(cs,
573 A6XX_GRAS_RAS_MSAA_CNTL(samples),
574 A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
575 .msaa_disable = msaa_disable));
576
577 tu_cs_emit_regs(cs,
578 A6XX_RB_RAS_MSAA_CNTL(samples),
579 A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
580 .msaa_disable = msaa_disable));
581
582 tu_cs_emit_regs(cs,
583 A6XX_RB_MSAA_CNTL(samples));
584 }
585
586 static void
587 tu6_emit_bin_size(struct tu_cs *cs,
588 uint32_t bin_w, uint32_t bin_h, uint32_t flags)
589 {
590 tu_cs_emit_regs(cs,
591 A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
592 .binh = bin_h,
593 .dword = flags));
594
595 tu_cs_emit_regs(cs,
596 A6XX_RB_BIN_CONTROL(.binw = bin_w,
597 .binh = bin_h,
598 .dword = flags));
599
600 /* no flag for RB_BIN_CONTROL2... */
601 tu_cs_emit_regs(cs,
602 A6XX_RB_BIN_CONTROL2(.binw = bin_w,
603 .binh = bin_h));
604 }
605
606 static void
607 tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
608 const struct tu_subpass *subpass,
609 struct tu_cs *cs,
610 bool binning)
611 {
612 const struct tu_framebuffer *fb = cmd->state.framebuffer;
613 uint32_t cntl = 0;
614 cntl |= A6XX_RB_RENDER_CNTL_UNK4;
615 if (binning) {
616 cntl |= A6XX_RB_RENDER_CNTL_BINNING;
617 } else {
618 uint32_t mrts_ubwc_enable = 0;
619 for (uint32_t i = 0; i < subpass->color_count; ++i) {
620 uint32_t a = subpass->color_attachments[i].attachment;
621 if (a == VK_ATTACHMENT_UNUSED)
622 continue;
623
624 const struct tu_image_view *iview = fb->attachments[a].attachment;
625 if (iview->image->layout.ubwc_layer_size != 0)
626 mrts_ubwc_enable |= 1 << i;
627 }
628
629 cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
630
631 const uint32_t a = subpass->depth_stencil_attachment.attachment;
632 if (a != VK_ATTACHMENT_UNUSED) {
633 const struct tu_image_view *iview = fb->attachments[a].attachment;
634 if (iview->image->layout.ubwc_layer_size != 0)
635 cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
636 }
637
638 /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
639 * in order to set it correctly for the different subpasses. However,
640 * that means the packets we're emitting also happen during binning. So
641 * we need to guard the write on !BINNING at CP execution time.
642 */
643 tu_cs_reserve(cs, 3 + 4);
644 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
645 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
646 CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
647 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
648 }
649
650 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
651 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
652 tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
653 tu_cs_emit(cs, cntl);
654 }
655
656 static void
657 tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
658 {
659 const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
660 uint32_t x1 = render_area->offset.x;
661 uint32_t y1 = render_area->offset.y;
662 uint32_t x2 = x1 + render_area->extent.width - 1;
663 uint32_t y2 = y1 + render_area->extent.height - 1;
664
665 /* TODO: alignment requirement seems to be less than tile_align_w/h */
666 if (align) {
667 x1 = x1 & ~(cmd->device->physical_device->tile_align_w - 1);
668 y1 = y1 & ~(cmd->device->physical_device->tile_align_h - 1);
669 x2 = ALIGN_POT(x2 + 1, cmd->device->physical_device->tile_align_w) - 1;
670 y2 = ALIGN_POT(y2 + 1, cmd->device->physical_device->tile_align_h) - 1;
671 }
672
673 tu_cs_emit_regs(cs,
674 A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
675 A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
676 }
677
678 static void
679 tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
680 struct tu_cs *cs,
681 const struct tu_image_view *iview,
682 uint32_t gmem_offset,
683 bool resolve)
684 {
685 tu_cs_emit_regs(cs,
686 A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve));
687
688 const struct tu_native_format format =
689 tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
690
691 enum a6xx_tile_mode tile_mode =
692 tu6_get_image_tile_mode(iview->image, iview->base_mip);
693 tu_cs_emit_regs(cs,
694 A6XX_RB_BLIT_DST_INFO(
695 .tile_mode = tile_mode,
696 .samples = tu_msaa_samples(iview->image->samples),
697 .color_format = format.fmt,
698 .color_swap = format.swap,
699 .flags = iview->image->layout.ubwc_layer_size != 0),
700 A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
701 A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
702 A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size));
703
704 if (iview->image->layout.ubwc_layer_size) {
705 tu_cs_emit_regs(cs,
706 A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)),
707 A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview)));
708 }
709
710 tu_cs_emit_regs(cs,
711 A6XX_RB_BLIT_BASE_GMEM(gmem_offset));
712 }
713
714 static void
715 tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
716 {
717 tu6_emit_event_write(cmd, cs, BLIT, false);
718 }
719
720 static void
721 tu6_emit_window_scissor(struct tu_cmd_buffer *cmd,
722 struct tu_cs *cs,
723 uint32_t x1,
724 uint32_t y1,
725 uint32_t x2,
726 uint32_t y2)
727 {
728 tu_cs_emit_regs(cs,
729 A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
730 A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
731
732 tu_cs_emit_regs(cs,
733 A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1),
734 A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2));
735 }
736
737 static void
738 tu6_emit_window_offset(struct tu_cmd_buffer *cmd,
739 struct tu_cs *cs,
740 uint32_t x1,
741 uint32_t y1)
742 {
743 tu_cs_emit_regs(cs,
744 A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
745
746 tu_cs_emit_regs(cs,
747 A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
748
749 tu_cs_emit_regs(cs,
750 A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
751
752 tu_cs_emit_regs(cs,
753 A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
754 }
755
756 static bool
757 use_hw_binning(struct tu_cmd_buffer *cmd)
758 {
759 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
760
761 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
762 return false;
763
764 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
765 return true;
766
767 return (tiling->tile_count.width * tiling->tile_count.height) > 2;
768 }
769
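/* Choose between direct (sysmem) rendering and tiled GMEM rendering: sysmem
 * is used when the TU_DEBUG_SYSMEM flag is set, when the attachments do not
 * fit into GMEM (pass->gmem_pixels == 0), or when force_sysmem() requested
 * it to work around the linear-mipmapped store issue above.
 */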
770 static bool
771 use_sysmem_rendering(struct tu_cmd_buffer *cmd)
772 {
773 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
774 return true;
775
776 /* can't fit attachments into gmem */
777 if (!cmd->state.pass->gmem_pixels)
778 return true;
779
780 return cmd->state.tiling_config.force_sysmem;
781 }
782
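/* Per-tile preamble for the GMEM path: program the window scissor/offset for
 * this tile and, with HW binning, load the tile's visibility stream via
 * CP_SET_BIN_DATA5 -- predicated on OVERFLOW_FLAG_REG so the tile falls back
 * to "all visible" if the VSC buffers overflowed.
 */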
783 static void
784 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
785 struct tu_cs *cs,
786 const struct tu_tile *tile)
787 {
788 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
789 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
790
791 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
792 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
793
794 const uint32_t x1 = tile->begin.x;
795 const uint32_t y1 = tile->begin.y;
796 const uint32_t x2 = tile->end.x - 1;
797 const uint32_t y2 = tile->end.y - 1;
798 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
799 tu6_emit_window_offset(cmd, cs, x1, y1);
800
801 tu_cs_emit_regs(cs,
802 A6XX_VPC_SO_OVERRIDE(.so_disable = true));
803
804 if (use_hw_binning(cmd)) {
805 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
806
807 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
808 tu_cs_emit(cs, 0x0);
809
810 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
811 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
812 A6XX_CP_REG_TEST_0_BIT(0) |
813 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
814
815 tu_cs_reserve(cs, 3 + 11);
816 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
817 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
818 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
819
820 /* if (no overflow) */ {
821 tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
822 tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] |
823 CP_SET_BIN_DATA5_0_VSC_N(tile->slot));
824 tu_cs_emit_qw(cs, cmd->vsc_data.iova + tile->pipe * cmd->vsc_data_pitch);
825 tu_cs_emit_qw(cs, cmd->vsc_data.iova + (tile->pipe * 4) + (32 * cmd->vsc_data_pitch));
826 tu_cs_emit_qw(cs, cmd->vsc_data2.iova + (tile->pipe * cmd->vsc_data2_pitch));
827
828 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
829 tu_cs_emit(cs, 0x0);
830
831 /* use a NOP packet to skip over the 'else' side: */
832 tu_cs_emit_pkt7(cs, CP_NOP, 2);
833 } /* else */ {
834 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
835 tu_cs_emit(cs, 0x1);
836 }
837
838 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
839 tu_cs_emit(cs, 0x0);
840
841 tu_cs_emit_regs(cs,
842 A6XX_RB_UNKNOWN_8804(0));
843
844 tu_cs_emit_regs(cs,
845 A6XX_SP_TP_UNKNOWN_B304(0));
846
847 tu_cs_emit_regs(cs,
848 A6XX_GRAS_UNKNOWN_80A4(0));
849 } else {
850 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
851 tu_cs_emit(cs, 0x1);
852
853 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
854 tu_cs_emit(cs, 0x0);
855 }
856 }
857
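/* GMEM load: blit an attachment's contents from system memory into its GMEM
 * slot. A load is emitted for LOAD_OP_LOAD, and also whenever the render
 * area does not line up with the tile grid, since the tile-aligned store
 * could otherwise write back stale GMEM contents outside the render area.
 */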
858 static void
859 tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
860 {
861 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
862 const struct tu_framebuffer *fb = cmd->state.framebuffer;
863 const struct tu_image_view *iview = fb->attachments[a].attachment;
864 const struct tu_render_pass_attachment *attachment =
865 &cmd->state.pass->attachments[a];
866
867 if (attachment->gmem_offset < 0)
868 return;
869
870 const uint32_t x1 = tiling->render_area.offset.x;
871 const uint32_t y1 = tiling->render_area.offset.y;
872 const uint32_t x2 = x1 + tiling->render_area.extent.width;
873 const uint32_t y2 = y1 + tiling->render_area.extent.height;
874 const uint32_t tile_x2 =
875 tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width;
876 const uint32_t tile_y2 =
877 tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height;
878 bool need_load =
879 x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) ||
880 y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2);
881
882 if (need_load)
883 tu_finishme("improve handling of unaligned render area");
884
885 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
886 need_load = true;
887
888 if (vk_format_has_stencil(iview->vk_format) &&
889 attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
890 need_load = true;
891
892 if (need_load) {
893 tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false);
894 tu6_emit_blit(cmd, cs);
895 }
896 }
897
898 static void
899 tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
900 uint32_t a,
901 const VkRenderPassBeginInfo *info)
902 {
903 const struct tu_framebuffer *fb = cmd->state.framebuffer;
904 const struct tu_image_view *iview = fb->attachments[a].attachment;
905 const struct tu_render_pass_attachment *attachment =
906 &cmd->state.pass->attachments[a];
907 unsigned clear_mask = 0;
908
909 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
910 if (attachment->gmem_offset < 0)
911 return;
912
913 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
914 clear_mask = 0xf;
915
916 if (vk_format_has_stencil(iview->vk_format)) {
917 clear_mask &= 0x1;
918 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
919 clear_mask |= 0x2;
920 }
921 if (!clear_mask)
922 return;
923
924 tu_clear_gmem_attachment(cmd, cs, a, clear_mask,
925 &info->pClearValues[a]);
926 }
927
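/* GMEM-only store/resolve blit, wrapped in CP_COND_EXEC so it is skipped
 * when the render pass runs in sysmem mode.
 */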
928 static void
929 tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd,
930 struct tu_cs *cs,
931 uint32_t a,
932 uint32_t gmem_a,
933 bool resolve)
934 {
935 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
936
937 tu6_emit_blit_info(cmd, cs,
938 cmd->state.framebuffer->attachments[a].attachment,
939 cmd->state.pass->attachments[gmem_a].gmem_offset, resolve);
940 tu6_emit_blit(cmd, cs);
941
942 tu_cond_exec_end(cs);
943 }
944
945 static void
946 tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
947 struct tu_cs *cs,
948 uint32_t a,
949 uint32_t gmem_a)
950 {
951 const struct tu_framebuffer *fb = cmd->state.framebuffer;
952 const struct tu_image_view *dst = fb->attachments[a].attachment;
953 const struct tu_image_view *src = fb->attachments[gmem_a].attachment;
954
955 tu_blit(cmd, cs, &(struct tu_blit) {
956 .dst = sysmem_attachment_surf(dst, dst->base_layer,
957 &cmd->state.tiling_config.render_area),
958 .src = sysmem_attachment_surf(src, src->base_layer,
959 &cmd->state.tiling_config.render_area),
960 .layers = fb->layers,
961 });
962 }
963
964
965 /* Emit a MSAA resolve operation, with both gmem and sysmem paths. */
966 static void tu6_emit_resolve(struct tu_cmd_buffer *cmd,
967 struct tu_cs *cs,
968 uint32_t a,
969 uint32_t gmem_a)
970 {
971 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
972 return;
973
974 tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true);
975
976 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
977 tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a);
978 tu_cond_exec_end(cs);
979 }
980
981 static void
982 tu6_emit_store_attachment(struct tu_cmd_buffer *cmd,
983 struct tu_cs *cs,
984 uint32_t a,
985 uint32_t gmem_a)
986 {
987 if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
988 return;
989
990 tu6_emit_blit_info(cmd, cs,
991 cmd->state.framebuffer->attachments[a].attachment,
992 cmd->state.pass->attachments[gmem_a].gmem_offset, true);
993 tu6_emit_blit(cmd, cs);
994 }
995
996 static void
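/* Build the tile-store IB: under the RM6_RESOLVE marker, write every GMEM
 * attachment whose store op is STORE back to system memory, plus the last
 * subpass's resolve attachments.
 */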
997 tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
998 {
999 const struct tu_render_pass *pass = cmd->state.pass;
1000 const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
1001
1002 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1003 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1004 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1005 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1006 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1007 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1008
1009 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1010 tu_cs_emit(cs, 0x0);
1011
1012 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1013 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
1014
1015 tu6_emit_blit_scissor(cmd, cs, true);
1016
1017 for (uint32_t a = 0; a < pass->attachment_count; ++a) {
1018 if (pass->attachments[a].gmem_offset >= 0)
1019 tu6_emit_store_attachment(cmd, cs, a, a);
1020 }
1021
1022 if (subpass->resolve_attachments) {
1023 for (unsigned i = 0; i < subpass->color_count; i++) {
1024 uint32_t a = subpass->resolve_attachments[i].attachment;
1025 if (a != VK_ATTACHMENT_UNUSED)
1026 tu6_emit_store_attachment(cmd, cs, a,
1027 subpass->color_attachments[i].attachment);
1028 }
1029 }
1030 }
1031
1032 static void
1033 tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
1034 {
1035 tu_cs_emit_regs(cs,
1036 A6XX_PC_RESTART_INDEX(restart_index));
1037 }
1038
1039 static void
1040 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1041 {
1042 tu6_emit_cache_flush(cmd, cs);
1043
1044 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
1045
1046 tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x10000000);
1047 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
1048 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
1049 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
1050 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
1051 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
1052 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
1053 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1054 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1055
1056 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
1057 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
1058 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
1059 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
1060 tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
1061 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
1062 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
1063 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1064 tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
1065 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
1066 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A982, 0);
1067 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
1068 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
1069 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
1070
1071 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1072 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1073 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
1074
1075 tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
1076
1077 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
1078
1079 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
1080 tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
1081 tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
1082 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
1083 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
1084 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
1085 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
1086 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
1087 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
1088 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
1089 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
1090
1091 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
1092 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
1093
1094 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1);
1095 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
1096
1097 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
1098 A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
1099
1100 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
1101 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
1102 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
1103 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
1104
1105 tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0);
1106 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0);
1107
1108 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
1109
1110 tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
1111
1112 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
1113 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
1114 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
1115 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1116 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
1117 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
1118 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
1119 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
1120 tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
1121 tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
1122 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0);
1123 tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
1124 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0);
1125 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0);
1126 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0);
1127 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0);
1128 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0);
1129 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0);
1130 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
1131 tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
1132 tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1133
1134 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
1135
1136 tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
1137
1138 tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
1139
1140 /* we don't use this yet.. probably best to disable.. */
1141 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1142 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1143 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1144 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1145 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1146 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1147
1148 tu_cs_emit_regs(cs,
1149 A6XX_VPC_SO_BUFFER_BASE(0),
1150 A6XX_VPC_SO_BUFFER_SIZE(0));
1151
1152 tu_cs_emit_regs(cs,
1153 A6XX_VPC_SO_FLUSH_BASE(0));
1154
1155 tu_cs_emit_regs(cs,
1156 A6XX_VPC_SO_BUF_CNTL(0));
1157
1158 tu_cs_emit_regs(cs,
1159 A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
1160
1161 tu_cs_emit_regs(cs,
1162 A6XX_VPC_SO_BUFFER_BASE(1, 0),
1163 A6XX_VPC_SO_BUFFER_SIZE(1, 0));
1164
1165 tu_cs_emit_regs(cs,
1166 A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
1167 A6XX_VPC_SO_FLUSH_BASE(1, 0),
1168 A6XX_VPC_SO_BUFFER_BASE(2, 0),
1169 A6XX_VPC_SO_BUFFER_SIZE(2, 0));
1170
1171 tu_cs_emit_regs(cs,
1172 A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
1173 A6XX_VPC_SO_FLUSH_BASE(2, 0),
1174 A6XX_VPC_SO_BUFFER_BASE(3, 0),
1175 A6XX_VPC_SO_BUFFER_SIZE(3, 0));
1176
1177 tu_cs_emit_regs(cs,
1178 A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
1179 A6XX_VPC_SO_FLUSH_BASE(3, 0));
1180
1181 tu_cs_emit_regs(cs,
1182 A6XX_SP_HS_CTRL_REG0(0));
1183
1184 tu_cs_emit_regs(cs,
1185 A6XX_SP_GS_CTRL_REG0(0));
1186
1187 tu_cs_emit_regs(cs,
1188 A6XX_GRAS_LRZ_CNTL(0));
1189
1190 tu_cs_emit_regs(cs,
1191 A6XX_RB_LRZ_CNTL(0));
1192
1193 tu_cs_sanity_check(cs);
1194 }
1195
1196 static void
1197 tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1198 {
1199 unsigned seqno;
1200
1201 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_AND_INV_EVENT, true);
1202
1203 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1204 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1205 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1206 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1207 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(seqno));
1208 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1209 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1210
1211 seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1212
1213 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4);
1214 tu_cs_emit(cs, CP_WAIT_MEM_GTE_0_RESERVED(0));
1215 tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
1216 tu_cs_emit(cs, CP_WAIT_MEM_GTE_3_REF(seqno));
1217 }
1218
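/* Program the VSC (visibility stream compressor) for binning: bin size and
 * count, the 32 per-pipe config registers, and the addresses/pitches of the
 * vsc_data and vsc_data2 buffers that receive the per-pipe visibility
 * streams.
 */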
1219 static void
1220 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1221 {
1222 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1223
1224 tu_cs_emit_regs(cs,
1225 A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width,
1226 .height = tiling->tile0.extent.height),
1227 A6XX_VSC_SIZE_ADDRESS(.bo = &cmd->vsc_data,
1228 .bo_offset = 32 * cmd->vsc_data_pitch));
1229
1230 tu_cs_emit_regs(cs,
1231 A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
1232 .ny = tiling->tile_count.height));
1233
1234 tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
1235 for (unsigned i = 0; i < 32; i++)
1236 tu_cs_emit(cs, tiling->pipe_config[i]);
1237
1238 tu_cs_emit_regs(cs,
1239 A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = &cmd->vsc_data2),
1240 A6XX_VSC_PIPE_DATA2_PITCH(cmd->vsc_data2_pitch),
1241 A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(cmd->vsc_data2.size));
1242
1243 tu_cs_emit_regs(cs,
1244 A6XX_VSC_PIPE_DATA_ADDRESS(.bo = &cmd->vsc_data),
1245 A6XX_VSC_PIPE_DATA_PITCH(cmd->vsc_data_pitch),
1246 A6XX_VSC_PIPE_DATA_ARRAY_PITCH(cmd->vsc_data.size));
1247 }
1248
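/* Detect VSC overflow on the CP after the binning draws: compare each used
 * pipe's VSC_SIZE/VSC_SIZE2 register against the buffer pitches, record any
 * overflow in the scratch BO so the CPU can see it (check_vsc_overflow()),
 * and leave OVERFLOW_FLAG_REG set to 1 only when no overflow occurred.
 */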
1249 static void
1250 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1251 {
1252 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1253 const uint32_t used_pipe_count =
1254 tiling->pipe_count.width * tiling->pipe_count.height;
1255
1256 /* Clear vsc_scratch: */
1257 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1258 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1259 tu_cs_emit(cs, 0x0);
1260
1261 /* Check for overflow, write vsc_scratch if detected: */
1262 for (int i = 0; i < used_pipe_count; i++) {
1263 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1264 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1265 CP_COND_WRITE5_0_WRITE_MEMORY);
1266 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i)));
1267 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1268 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data_pitch));
1269 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1270 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1271 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_data_pitch));
1272
1273 tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
1274 tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
1275 CP_COND_WRITE5_0_WRITE_MEMORY);
1276 tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i)));
1277 tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
1278 tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data2_pitch));
1279 tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
1280 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1281 tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_data2_pitch));
1282 }
1283
1284 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1285
1286 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1287
1288 tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1289 tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) |
1290 CP_MEM_TO_REG_0_CNT(1 - 1));
1291 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH);
1292
1293 /*
1294 * This is a bit awkward; we really want a way to invert the
1295 * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally
1296 * execute cmds to use hwbinning when a bit is *not* set. This
1297 * dance is to invert OVERFLOW_FLAG_REG.
1298 *
1299 * A CP_NOP packet is used to skip executing the 'else' clause
1300 * if (b0 set)..
1301 */
1302
1303 /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */
1304 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1305 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1306 A6XX_CP_REG_TEST_0_BIT(0) |
1307 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1308
1309 tu_cs_reserve(cs, 3 + 7);
1310 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1311 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1312 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
1313
1314 /* if (b0 set) */ {
1315 /*
1316 * On overflow, mirror the value to control->vsc_overflow
1317 * which CPU is checking to detect overflow (see
1318 * check_vsc_overflow())
1319 */
1320 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1321 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
1322 CP_REG_TO_MEM_0_CNT(0));
1323 tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW);
1324
1325 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1326 tu_cs_emit(cs, 0x0);
1327
1328 tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */
1329 } /* else */ {
1330 tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1);
1331 tu_cs_emit(cs, 0x1);
1332 }
1333 }
1334
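/* HW binning pass: replay the draw command stream in binning mode
 * (RM6_BINNING marker, VFD_MODE_CNTL.BINNING_PASS) so the VSC fills the
 * per-pipe visibility streams, then run the overflow test above before the
 * per-tile rendering starts.
 */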
1335 static void
1336 tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1337 {
1338 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1339 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1340
1341 uint32_t x1 = tiling->tile0.offset.x;
1342 uint32_t y1 = tiling->tile0.offset.y;
1343 uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1;
1344 uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1;
1345
1346 tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
1347
1348 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1349 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
1350
1351 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1352 tu_cs_emit(cs, 0x1);
1353
1354 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1355 tu_cs_emit(cs, 0x1);
1356
1357 tu_cs_emit_wfi(cs);
1358
1359 tu_cs_emit_regs(cs,
1360 A6XX_VFD_MODE_CNTL(.binning_pass = true));
1361
1362 update_vsc_pipe(cmd, cs);
1363
1364 tu_cs_emit_regs(cs,
1365 A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1366
1367 tu_cs_emit_regs(cs,
1368 A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1369
1370 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1371 tu_cs_emit(cs, UNK_2C);
1372
1373 tu_cs_emit_regs(cs,
1374 A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
1375
1376 tu_cs_emit_regs(cs,
1377 A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
1378
1379 /* emit IB to binning drawcmds: */
1380 tu_cs_emit_call(cs, &cmd->draw_cs);
1381
1382 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
1383 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
1384 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1385 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1386 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1387 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1388
1389 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1390 tu_cs_emit(cs, UNK_2D);
1391
1392 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1393 tu6_cache_flush(cmd, cs);
1394
1395 tu_cs_emit_wfi(cs);
1396
1397 tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
1398
1399 emit_vsc_overflow_test(cmd, cs);
1400
1401 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1402 tu_cs_emit(cs, 0x0);
1403
1404 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1405 tu_cs_emit(cs, 0x0);
1406
1407 tu_cs_emit_wfi(cs);
1408
1409 tu_cs_emit_regs(cs,
1410 A6XX_RB_CCU_CNTL(.unknown = phys_dev->magic.RB_CCU_CNTL_gmem));
1411
1412 cmd->wait_for_idle = false;
1413 }
1414
1415 static void
1416 tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1417 uint32_t a,
1418 const VkRenderPassBeginInfo *info)
1419 {
1420 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1421 const struct tu_image_view *iview = fb->attachments[a].attachment;
1422 const struct tu_render_pass_attachment *attachment =
1423 &cmd->state.pass->attachments[a];
1424 unsigned clear_mask = 0;
1425
1426 /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */
1427 if (attachment->gmem_offset < 0)
1428 return;
1429
1430 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1431 clear_mask = 0xf;
1432 }
1433
1434 if (vk_format_has_stencil(iview->vk_format)) {
1435 clear_mask &= 0x1;
1436 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
1437 clear_mask |= 0x2;
1438 if (clear_mask != 0x3)
1439 tu_finishme("depth/stencil only load op");
1440 }
1441
1442 if (!clear_mask)
1443 return;
1444
1445 tu_clear_sysmem_attachment(cmd, cs, a,
1446 &info->pClearValues[a], &(struct VkClearRect) {
1447 .rect = info->renderArea,
1448 .baseArrayLayer = iview->base_layer,
1449 .layerCount = iview->layer_count,
1450 });
1451 }
1452
1453 static void
1454 tu_emit_load_clear(struct tu_cmd_buffer *cmd,
1455 const VkRenderPassBeginInfo *info)
1456 {
1457 struct tu_cs *cs = &cmd->draw_cs;
1458
1459 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1460
1461 tu6_emit_blit_scissor(cmd, cs, true);
1462
1463 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1464 tu6_emit_load_attachment(cmd, cs, i);
1465
1466 tu6_emit_blit_scissor(cmd, cs, false);
1467
1468 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1469 tu6_emit_clear_attachment(cmd, cs, i, info);
1470
1471 tu_cond_exec_end(cs);
1472
1473 /* invalidate because reading input attachments will cache GMEM and
1474 * the cache isn't updated when GMEM is written.
1475 * TODO: is there a no-cache bit for textures?
1476 */
1477 if (cmd->state.subpass->input_count)
1478 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1479
1480 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1481
1482 for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1483 tu_emit_sysmem_clear_attachment(cmd, cs, i, info);
1484
1485 tu_cond_exec_end(cs);
1486 }
1487
1488 static void
1489 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1490 const struct VkRect2D *renderArea)
1491 {
1492 const struct tu_framebuffer *fb = cmd->state.framebuffer;
1493
1494 assert(fb->width > 0 && fb->height > 0);
1495 tu6_emit_window_scissor(cmd, cs, 0, 0, fb->width - 1, fb->height - 1);
1496 tu6_emit_window_offset(cmd, cs, 0, 0);
1497
1498 tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
1499
1500 tu6_emit_lrz_flush(cmd, cs);
1501
1502 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1503 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
1504
1505 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1506 tu_cs_emit(cs, 0x0);
1507
1508 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1509 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1510 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1511
1512 tu6_emit_wfi(cmd, cs);
1513 tu_cs_emit_regs(cs,
1514 A6XX_RB_CCU_CNTL(0x10000000));
1515
1516 /* enable stream-out; with sysmem there is only one pass: */
1517 tu_cs_emit_regs(cs,
1518 A6XX_VPC_SO_OVERRIDE(.so_disable = false));
1519
1520 tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
1521 tu_cs_emit(cs, 0x1);
1522
1523 tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
1524 tu_cs_emit(cs, 0x0);
1525
1526 tu_cs_sanity_check(cs);
1527 }
1528
1529 static void
1530 tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1531 {
1532 /* Do any resolves of the last subpass. These are handled in the
1533 * tile_store_ib in the gmem path.
1534 */
1535
1536 const struct tu_subpass *subpass = cmd->state.subpass;
1537 if (subpass->resolve_attachments) {
1538 for (unsigned i = 0; i < subpass->color_count; i++) {
1539 uint32_t a = subpass->resolve_attachments[i].attachment;
1540 if (a != VK_ATTACHMENT_UNUSED)
1541 tu6_emit_sysmem_resolve(cmd, cs, a,
1542 subpass->color_attachments[i].attachment);
1543 }
1544 }
1545
1546 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1547
1548 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1549 tu_cs_emit(cs, 0x0);
1550
1551 tu6_emit_lrz_flush(cmd, cs);
1552
1553 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1554 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1555
1556 tu_cs_sanity_check(cs);
1557 }
1558
1559
1560 static void
1561 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1562 {
1563 struct tu_physical_device *phys_dev = cmd->device->physical_device;
1564
1565 tu6_emit_lrz_flush(cmd, cs);
1566
1567 /* lrz clear? */
1568
1569 tu6_emit_cache_flush(cmd, cs);
1570
1571 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1572 tu_cs_emit(cs, 0x0);
1573
1574 /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
1575 tu6_emit_wfi(cmd, cs);
1576 tu_cs_emit_regs(cs,
1577 A6XX_RB_CCU_CNTL(phys_dev->magic.RB_CCU_CNTL_gmem));
1578
1579 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1580 if (use_hw_binning(cmd)) {
1581 tu6_emit_bin_size(cs,
1582 tiling->tile0.extent.width,
1583 tiling->tile0.extent.height,
1584 A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
1585
1586 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
1587
1588 tu6_emit_binning_pass(cmd, cs);
1589
1590 tu6_emit_bin_size(cs,
1591 tiling->tile0.extent.width,
1592 tiling->tile0.extent.height,
1593 A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
1594
1595 tu_cs_emit_regs(cs,
1596 A6XX_VFD_MODE_CNTL(0));
1597
1598 tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->magic.PC_UNKNOWN_9805));
1599
1600 tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->magic.SP_UNKNOWN_A0F8));
1601
1602 tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
1603 tu_cs_emit(cs, 0x1);
1604 } else {
1605 tu6_emit_bin_size(cs,
1606 tiling->tile0.extent.width,
1607 tiling->tile0.extent.height,
1608 0x6000000);
1609 }
1610
1611 tu_cs_sanity_check(cs);
1612 }
1613
1614 static void
1615 tu6_render_tile(struct tu_cmd_buffer *cmd,
1616 struct tu_cs *cs,
1617 const struct tu_tile *tile)
1618 {
1619 tu6_emit_tile_select(cmd, cs, tile);
1620
1621 tu_cs_emit_call(cs, &cmd->draw_cs);
1622 cmd->wait_for_idle = true;
1623
1624 if (use_hw_binning(cmd)) {
1625 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
1626 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
1627 A6XX_CP_REG_TEST_0_BIT(0) |
1628 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
1629
1630 tu_cs_reserve(cs, 3 + 2);
1631 tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
1632 tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
1633 tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
1634
1635 /* if (no overflow) */ {
1636 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1637 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1638 }
1639 }
1640
1641 tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
1642
1643 tu_cs_sanity_check(cs);
1644 }
1645
1646 static void
1647 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1648 {
1649 tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1650
1651 tu_cs_emit_regs(cs,
1652 A6XX_GRAS_LRZ_CNTL(0));
1653
1654 tu6_emit_lrz_flush(cmd, cs);
1655
1656 tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
1657
1658 tu_cs_sanity_check(cs);
1659 }
1660
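/* GMEM path: for every tile, select it, replay the draw command stream and
 * append the tile-store IB that resolves GMEM back to system memory.
 */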
1661 static void
1662 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1663 {
1664 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1665
1666 tu6_tile_render_begin(cmd, &cmd->cs);
1667
1668 for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
1669 for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
1670 struct tu_tile tile;
1671 tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
1672 tu6_render_tile(cmd, &cmd->cs, &tile);
1673 }
1674 }
1675
1676 tu6_tile_render_end(cmd, &cmd->cs);
1677 }
1678
1679 static void
1680 tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1681 {
1682 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1683
1684 tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
1685
1686 tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1687 cmd->wait_for_idle = true;
1688
1689 tu6_sysmem_render_end(cmd, &cmd->cs);
1690 }
1691
1692 static void
1693 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
1694 {
1695 const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count;
1696 struct tu_cs sub_cs;
1697
1698 VkResult result =
1699 tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
1700 if (result != VK_SUCCESS) {
1701 cmd->record_result = result;
1702 return;
1703 }
1704
1705 /* emit to tile-store sub_cs */
1706 tu6_emit_tile_store(cmd, &sub_cs);
1707
1708 cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
1709 }
1710
1711 static void
1712 tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
1713 const VkRect2D *render_area)
1714 {
1715 const struct tu_device *dev = cmd->device;
1716 struct tu_tiling_config *tiling = &cmd->state.tiling_config;
1717
1718 tiling->render_area = *render_area;
1719 tiling->force_sysmem = force_sysmem(cmd, render_area);
1720
1721 tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels);
1722 tu_tiling_config_update_pipe_layout(tiling, dev);
1723 tu_tiling_config_update_pipes(tiling, dev);
1724 }
1725
1726 const struct tu_dynamic_state default_dynamic_state = {
1727 .viewport =
1728 {
1729 .count = 0,
1730 },
1731 .scissor =
1732 {
1733 .count = 0,
1734 },
1735 .line_width = 1.0f,
1736 .depth_bias =
1737 {
1738 .bias = 0.0f,
1739 .clamp = 0.0f,
1740 .slope = 0.0f,
1741 },
1742 .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
1743 .depth_bounds =
1744 {
1745 .min = 0.0f,
1746 .max = 1.0f,
1747 },
1748 .stencil_compare_mask =
1749 {
1750 .front = ~0u,
1751 .back = ~0u,
1752 },
1753 .stencil_write_mask =
1754 {
1755 .front = ~0u,
1756 .back = ~0u,
1757 },
1758 .stencil_reference =
1759 {
1760 .front = 0u,
1761 .back = 0u,
1762 },
1763 };
1764
1765 static void UNUSED /* FINISHME */
1766 tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
1767 const struct tu_dynamic_state *src)
1768 {
1769 struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
1770 uint32_t copy_mask = src->mask;
1771 uint32_t dest_mask = 0;
1772
1773 tu_use_args(cmd_buffer); /* FINISHME */
1774
1775 /* Make sure to copy the number of viewports/scissors because they can
1776 * only be specified at pipeline creation time.
1777 */
1778 dest->viewport.count = src->viewport.count;
1779 dest->scissor.count = src->scissor.count;
1780 dest->discard_rectangle.count = src->discard_rectangle.count;
1781
1782 if (copy_mask & TU_DYNAMIC_VIEWPORT) {
1783 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
1784 src->viewport.count * sizeof(VkViewport))) {
1785 typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
1786 src->viewport.count);
1787 dest_mask |= TU_DYNAMIC_VIEWPORT;
1788 }
1789 }
1790
1791 if (copy_mask & TU_DYNAMIC_SCISSOR) {
1792 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
1793 src->scissor.count * sizeof(VkRect2D))) {
1794 typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
1795 src->scissor.count);
1796 dest_mask |= TU_DYNAMIC_SCISSOR;
1797 }
1798 }
1799
1800 if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
1801 if (dest->line_width != src->line_width) {
1802 dest->line_width = src->line_width;
1803 dest_mask |= TU_DYNAMIC_LINE_WIDTH;
1804 }
1805 }
1806
1807 if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
1808 if (memcmp(&dest->depth_bias, &src->depth_bias,
1809 sizeof(src->depth_bias))) {
1810 dest->depth_bias = src->depth_bias;
1811 dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
1812 }
1813 }
1814
1815 if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
1816 if (memcmp(&dest->blend_constants, &src->blend_constants,
1817 sizeof(src->blend_constants))) {
1818 typed_memcpy(dest->blend_constants, src->blend_constants, 4);
1819 dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
1820 }
1821 }
1822
1823 if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
1824 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
1825 sizeof(src->depth_bounds))) {
1826 dest->depth_bounds = src->depth_bounds;
1827 dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
1828 }
1829 }
1830
1831 if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
1832 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
1833 sizeof(src->stencil_compare_mask))) {
1834 dest->stencil_compare_mask = src->stencil_compare_mask;
1835 dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
1836 }
1837 }
1838
1839 if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
1840 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
1841 sizeof(src->stencil_write_mask))) {
1842 dest->stencil_write_mask = src->stencil_write_mask;
1843 dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
1844 }
1845 }
1846
1847 if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
1848 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
1849 sizeof(src->stencil_reference))) {
1850 dest->stencil_reference = src->stencil_reference;
1851 dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
1852 }
1853 }
1854
1855 if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
1856 if (memcmp(&dest->discard_rectangle.rectangles,
1857 &src->discard_rectangle.rectangles,
1858 src->discard_rectangle.count * sizeof(VkRect2D))) {
1859 typed_memcpy(dest->discard_rectangle.rectangles,
1860 src->discard_rectangle.rectangles,
1861 src->discard_rectangle.count);
1862 dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
1863 }
1864 }
1865 }
1866
1867 static VkResult
1868 tu_create_cmd_buffer(struct tu_device *device,
1869 struct tu_cmd_pool *pool,
1870 VkCommandBufferLevel level,
1871 VkCommandBuffer *pCommandBuffer)
1872 {
1873 struct tu_cmd_buffer *cmd_buffer;
1874 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
1875 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1876 if (cmd_buffer == NULL)
1877 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1878
1879 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1880 cmd_buffer->device = device;
1881 cmd_buffer->pool = pool;
1882 cmd_buffer->level = level;
1883
1884 if (pool) {
1885 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1886 cmd_buffer->queue_family_index = pool->queue_family_index;
1887
1888 } else {
1889 /* Init the pool_link so we can safely call list_del when we destroy
1890 * the command buffer
1891 */
1892 list_inithead(&cmd_buffer->pool_link);
1893 cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1894 }
1895
1896 tu_bo_list_init(&cmd_buffer->bo_list);
1897 tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
1898 tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
1899 tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
1900 tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1901
1902 *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1903
1904 list_inithead(&cmd_buffer->upload.list);
1905
1906 VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
1907 if (result != VK_SUCCESS)
1908 goto fail_scratch_bo;
1909
1910 /* TODO: resize on overflow */
1911 cmd_buffer->vsc_data_pitch = device->vsc_data_pitch;
1912 cmd_buffer->vsc_data2_pitch = device->vsc_data2_pitch;
1913 cmd_buffer->vsc_data = device->vsc_data;
1914 cmd_buffer->vsc_data2 = device->vsc_data2;
1915
1916 return VK_SUCCESS;
1917
1918 fail_scratch_bo:
1919 list_del(&cmd_buffer->pool_link);
1920 return result;
1921 }
1922
1923 static void
1924 tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1925 {
1926 tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
1927
1928 list_del(&cmd_buffer->pool_link);
1929
1930 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
1931 free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
1932
1933 tu_cs_finish(&cmd_buffer->cs);
1934 tu_cs_finish(&cmd_buffer->draw_cs);
1935 tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1936 tu_cs_finish(&cmd_buffer->sub_cs);
1937
1938 tu_bo_list_destroy(&cmd_buffer->bo_list);
1939 vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
1940 }
1941
1942 static VkResult
1943 tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1944 {
1945 cmd_buffer->wait_for_idle = true;
1946
1947 cmd_buffer->record_result = VK_SUCCESS;
1948
1949 tu_bo_list_reset(&cmd_buffer->bo_list);
1950 tu_cs_reset(&cmd_buffer->cs);
1951 tu_cs_reset(&cmd_buffer->draw_cs);
1952 tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1953 tu_cs_reset(&cmd_buffer->sub_cs);
1954
1955 for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
1956 cmd_buffer->descriptors[i].valid = 0;
1957 cmd_buffer->descriptors[i].push_dirty = false;
1958 }
1959
1960 cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1961
1962 return cmd_buffer->record_result;
1963 }
1964
1965 VkResult
1966 tu_AllocateCommandBuffers(VkDevice _device,
1967 const VkCommandBufferAllocateInfo *pAllocateInfo,
1968 VkCommandBuffer *pCommandBuffers)
1969 {
1970 TU_FROM_HANDLE(tu_device, device, _device);
1971 TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
1972
1973 VkResult result = VK_SUCCESS;
1974 uint32_t i;
1975
1976 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
1977
1978 if (!list_is_empty(&pool->free_cmd_buffers)) {
1979 struct tu_cmd_buffer *cmd_buffer = list_first_entry(
1980 &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
1981
1982 list_del(&cmd_buffer->pool_link);
1983 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1984
1985 result = tu_reset_cmd_buffer(cmd_buffer);
1986 cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
1987 cmd_buffer->level = pAllocateInfo->level;
1988
1989 pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
1990 } else {
1991 result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
1992 &pCommandBuffers[i]);
1993 }
1994 if (result != VK_SUCCESS)
1995 break;
1996 }
1997
1998 if (result != VK_SUCCESS) {
1999 tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
2000 pCommandBuffers);
2001
2002 /* From the Vulkan 1.0.66 spec:
2003 *
2004 * "vkAllocateCommandBuffers can be used to create multiple
2005 * command buffers. If the creation of any of those command
2006 * buffers fails, the implementation must destroy all
2007 * successfully created command buffer objects from this
2008 * command, set all entries of the pCommandBuffers array to
2009 * NULL and return the error."
2010 */
2011 memset(pCommandBuffers, 0,
2012 sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
2013 }
2014
2015 return result;
2016 }
2017
2018 void
2019 tu_FreeCommandBuffers(VkDevice device,
2020 VkCommandPool commandPool,
2021 uint32_t commandBufferCount,
2022 const VkCommandBuffer *pCommandBuffers)
2023 {
2024 for (uint32_t i = 0; i < commandBufferCount; i++) {
2025 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
2026
2027 if (cmd_buffer) {
2028 if (cmd_buffer->pool) {
2029 list_del(&cmd_buffer->pool_link);
2030 list_addtail(&cmd_buffer->pool_link,
2031 &cmd_buffer->pool->free_cmd_buffers);
2032 } else
2033 tu_cmd_buffer_destroy(cmd_buffer);
2034 }
2035 }
2036 }
2037
2038 VkResult
2039 tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
2040 VkCommandBufferResetFlags flags)
2041 {
2042 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2043 return tu_reset_cmd_buffer(cmd_buffer);
2044 }
2045
2046 VkResult
2047 tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2048 const VkCommandBufferBeginInfo *pBeginInfo)
2049 {
2050 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2051 VkResult result = VK_SUCCESS;
2052
2053 if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
2054    /* If the command buffer has already been reset with
2055     * vkResetCommandBuffer, there is no need to do it again.
2056 */
2057 result = tu_reset_cmd_buffer(cmd_buffer);
2058 if (result != VK_SUCCESS)
2059 return result;
2060 }
2061
2062 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
2063 cmd_buffer->usage_flags = pBeginInfo->flags;
2064
2065 tu_cs_begin(&cmd_buffer->cs);
2066 tu_cs_begin(&cmd_buffer->draw_cs);
2067 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
2068
2069 cmd_buffer->scratch_seqno = 0;
2070
2071    /* set up the initial configuration in the command buffer */
2072 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2073 switch (cmd_buffer->queue_family_index) {
2074 case TU_QUEUE_GENERAL:
2075 tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
2076 break;
2077 default:
2078 break;
2079 }
2080 } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2081 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
2082 assert(pBeginInfo->pInheritanceInfo);
2083 cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
2084 cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
2085 }
2086
2087 cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
2088
2089 return VK_SUCCESS;
2090 }
2091
2092 void
2093 tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2094 uint32_t firstBinding,
2095 uint32_t bindingCount,
2096 const VkBuffer *pBuffers,
2097 const VkDeviceSize *pOffsets)
2098 {
2099 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2100
2101 assert(firstBinding + bindingCount <= MAX_VBS);
2102
2103 for (uint32_t i = 0; i < bindingCount; i++) {
2104 cmd->state.vb.buffers[firstBinding + i] =
2105 tu_buffer_from_handle(pBuffers[i]);
2106 cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
2107 }
2108
2109 /* VB states depend on VkPipelineVertexInputStateCreateInfo */
2110 cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2111 }
2112
2113 void
2114 tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2115 VkBuffer buffer,
2116 VkDeviceSize offset,
2117 VkIndexType indexType)
2118 {
2119 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2120 TU_FROM_HANDLE(tu_buffer, buf, buffer);
2121
2122 /* initialize/update the restart index */
2123 if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
2124 struct tu_cs *draw_cs = &cmd->draw_cs;
2125
2126 tu6_emit_restart_index(
2127 draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
2128
2129 tu_cs_sanity_check(draw_cs);
2130 }
2131
2132 /* track the BO */
2133 if (cmd->state.index_buffer != buf)
2134 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
2135
2136 cmd->state.index_buffer = buf;
2137 cmd->state.index_offset = offset;
2138 cmd->state.index_type = indexType;
2139 }
2140
2141 void
2142 tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2143 VkPipelineBindPoint pipelineBindPoint,
2144 VkPipelineLayout _layout,
2145 uint32_t firstSet,
2146 uint32_t descriptorSetCount,
2147 const VkDescriptorSet *pDescriptorSets,
2148 uint32_t dynamicOffsetCount,
2149 const uint32_t *pDynamicOffsets)
2150 {
2151 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2152 TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
2153 unsigned dyn_idx = 0;
2154
2155 struct tu_descriptor_state *descriptors_state =
2156 tu_get_descriptors_state(cmd_buffer, pipelineBindPoint);
2157
2158 for (unsigned i = 0; i < descriptorSetCount; ++i) {
2159 unsigned idx = i + firstSet;
2160 TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
2161
2162 descriptors_state->sets[idx] = set;
2163 descriptors_state->valid |= (1u << idx);
2164
2165       for (unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
2166          unsigned dyn_buf_idx = j + layout->set[i + firstSet].dynamic_offset_start;
2167          assert(dyn_idx < dynamicOffsetCount);
2168 
2169          descriptors_state->dynamic_buffers[dyn_buf_idx] =
2170             set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
2171       }
2172 }
2173
2174 cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
2175 }
2176
2177 void
2178 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2179 VkPipelineLayout layout,
2180 VkShaderStageFlags stageFlags,
2181 uint32_t offset,
2182 uint32_t size,
2183 const void *pValues)
2184 {
2185 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2186    memcpy((char *) cmd->push_constants + offset, pValues, size);
2187 cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS;
2188 }
2189
2190 VkResult
2191 tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2192 {
2193 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2194
2195 if (cmd_buffer->scratch_seqno) {
2196 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
2197 MSM_SUBMIT_BO_WRITE);
2198 }
2199
2200 if (cmd_buffer->use_vsc_data) {
2201 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data,
2202 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2203 tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data2,
2204 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2205 }
2206
2207 for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
2208 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
2209 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2210 }
2211
2212 for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) {
2213 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i],
2214 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2215 }
2216
2217 for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) {
2218 tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i],
2219 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2220 }
2221
2222 tu_cs_end(&cmd_buffer->cs);
2223 tu_cs_end(&cmd_buffer->draw_cs);
2224 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2225
2226 cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2227
2228 return cmd_buffer->record_result;
2229 }
2230
2231 void
2232 tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2233 VkPipelineBindPoint pipelineBindPoint,
2234 VkPipeline _pipeline)
2235 {
2236 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2237 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2238
2239 switch (pipelineBindPoint) {
2240 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2241 cmd->state.pipeline = pipeline;
2242 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
2243 break;
2244 case VK_PIPELINE_BIND_POINT_COMPUTE:
2245 cmd->state.compute_pipeline = pipeline;
2246 cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
2247 break;
2248 default:
2249 unreachable("unrecognized pipeline bind point");
2250 break;
2251 }
2252
2253 tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
2254 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2255 for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
2256 tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
2257 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
2258 }
2259 }
2260
2261 void
2262 tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2263 uint32_t firstViewport,
2264 uint32_t viewportCount,
2265 const VkViewport *pViewports)
2266 {
2267 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2268 struct tu_cs *draw_cs = &cmd->draw_cs;
2269
2270 assert(firstViewport == 0 && viewportCount == 1);
2271 tu6_emit_viewport(draw_cs, pViewports);
2272
2273 tu_cs_sanity_check(draw_cs);
2274 }
2275
2276 void
2277 tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2278 uint32_t firstScissor,
2279 uint32_t scissorCount,
2280 const VkRect2D *pScissors)
2281 {
2282 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2283 struct tu_cs *draw_cs = &cmd->draw_cs;
2284
2285 assert(firstScissor == 0 && scissorCount == 1);
2286 tu6_emit_scissor(draw_cs, pScissors);
2287
2288 tu_cs_sanity_check(draw_cs);
2289 }
2290
2291 void
2292 tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2293 {
2294 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2295
2296 cmd->state.dynamic.line_width = lineWidth;
2297
2298 /* line width depends on VkPipelineRasterizationStateCreateInfo */
2299 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
2300 }
2301
2302 void
2303 tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2304 float depthBiasConstantFactor,
2305 float depthBiasClamp,
2306 float depthBiasSlopeFactor)
2307 {
2308 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2309 struct tu_cs *draw_cs = &cmd->draw_cs;
2310
2311 tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
2312 depthBiasSlopeFactor);
2313
2314 tu_cs_sanity_check(draw_cs);
2315 }
2316
2317 void
2318 tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2319 const float blendConstants[4])
2320 {
2321 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2322 struct tu_cs *draw_cs = &cmd->draw_cs;
2323
2324 tu6_emit_blend_constants(draw_cs, blendConstants);
2325
2326 tu_cs_sanity_check(draw_cs);
2327 }
2328
2329 void
2330 tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2331 float minDepthBounds,
2332 float maxDepthBounds)
2333 {
2334 }
2335
2336 void
2337 tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2338 VkStencilFaceFlags faceMask,
2339 uint32_t compareMask)
2340 {
2341 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2342
2343 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2344 cmd->state.dynamic.stencil_compare_mask.front = compareMask;
2345 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2346 cmd->state.dynamic.stencil_compare_mask.back = compareMask;
2347
2348 /* the front/back compare masks must be updated together */
2349 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
2350 }
2351
2352 void
2353 tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2354 VkStencilFaceFlags faceMask,
2355 uint32_t writeMask)
2356 {
2357 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2358
2359 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2360 cmd->state.dynamic.stencil_write_mask.front = writeMask;
2361 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2362 cmd->state.dynamic.stencil_write_mask.back = writeMask;
2363
2364 /* the front/back write masks must be updated together */
2365 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
2366 }
2367
2368 void
2369 tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2370 VkStencilFaceFlags faceMask,
2371 uint32_t reference)
2372 {
2373 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2374
2375 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2376 cmd->state.dynamic.stencil_reference.front = reference;
2377 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2378 cmd->state.dynamic.stencil_reference.back = reference;
2379
2380 /* the front/back references must be updated together */
2381 cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
2382 }
2383
2384 void
2385 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2386 uint32_t commandBufferCount,
2387 const VkCommandBuffer *pCmdBuffers)
2388 {
2389 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2390 VkResult result;
2391
2392 assert(commandBufferCount > 0);
2393
2394 for (uint32_t i = 0; i < commandBufferCount; i++) {
2395 TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2396
2397 result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list);
2398 if (result != VK_SUCCESS) {
2399 cmd->record_result = result;
2400 break;
2401 }
2402
2403 result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2404 if (result != VK_SUCCESS) {
2405 cmd->record_result = result;
2406 break;
2407 }
2408
2409 result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2410 &secondary->draw_epilogue_cs);
2411 if (result != VK_SUCCESS) {
2412 cmd->record_result = result;
2413 break;
2414 }
2415 }
2416    cmd->state.dirty = ~0u; /* TODO: only set the dirty bits that actually need it */
2417 }
2418
2419 VkResult
2420 tu_CreateCommandPool(VkDevice _device,
2421 const VkCommandPoolCreateInfo *pCreateInfo,
2422 const VkAllocationCallbacks *pAllocator,
2423 VkCommandPool *pCmdPool)
2424 {
2425 TU_FROM_HANDLE(tu_device, device, _device);
2426 struct tu_cmd_pool *pool;
2427
2428 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
2429 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2430 if (pool == NULL)
2431 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2432
2433 if (pAllocator)
2434 pool->alloc = *pAllocator;
2435 else
2436 pool->alloc = device->alloc;
2437
2438 list_inithead(&pool->cmd_buffers);
2439 list_inithead(&pool->free_cmd_buffers);
2440
2441 pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2442
2443 *pCmdPool = tu_cmd_pool_to_handle(pool);
2444
2445 return VK_SUCCESS;
2446 }
2447
2448 void
2449 tu_DestroyCommandPool(VkDevice _device,
2450 VkCommandPool commandPool,
2451 const VkAllocationCallbacks *pAllocator)
2452 {
2453 TU_FROM_HANDLE(tu_device, device, _device);
2454 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2455
2456 if (!pool)
2457 return;
2458
2459 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2460 &pool->cmd_buffers, pool_link)
2461 {
2462 tu_cmd_buffer_destroy(cmd_buffer);
2463 }
2464
2465 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2466 &pool->free_cmd_buffers, pool_link)
2467 {
2468 tu_cmd_buffer_destroy(cmd_buffer);
2469 }
2470
2471 vk_free2(&device->alloc, pAllocator, pool);
2472 }
2473
2474 VkResult
2475 tu_ResetCommandPool(VkDevice device,
2476 VkCommandPool commandPool,
2477 VkCommandPoolResetFlags flags)
2478 {
2479 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2480 VkResult result;
2481
2482 list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2483 pool_link)
2484 {
2485 result = tu_reset_cmd_buffer(cmd_buffer);
2486 if (result != VK_SUCCESS)
2487 return result;
2488 }
2489
2490 return VK_SUCCESS;
2491 }
2492
2493 void
2494 tu_TrimCommandPool(VkDevice device,
2495 VkCommandPool commandPool,
2496 VkCommandPoolTrimFlags flags)
2497 {
2498 TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2499
2500 if (!pool)
2501 return;
2502
2503 list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2504 &pool->free_cmd_buffers, pool_link)
2505 {
2506 tu_cmd_buffer_destroy(cmd_buffer);
2507 }
2508 }
2509
2510 void
2511 tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
2512 const VkRenderPassBeginInfo *pRenderPassBegin,
2513 VkSubpassContents contents)
2514 {
2515 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2516 TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2517 TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2518
2519 cmd->state.pass = pass;
2520 cmd->state.subpass = pass->subpasses;
2521 cmd->state.framebuffer = fb;
2522
2523 tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
2524 tu_cmd_prepare_tile_store_ib(cmd);
2525
2526 tu_emit_load_clear(cmd, pRenderPassBegin);
2527
2528 tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2529 tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2530 tu6_emit_msaa(cmd, cmd->state.subpass, &cmd->draw_cs);
2531 tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2532
2533 /* note: use_hw_binning only checks tiling config */
2534 if (use_hw_binning(cmd))
2535 cmd->use_vsc_data = true;
2536
2537 for (uint32_t i = 0; i < fb->attachment_count; ++i) {
2538 const struct tu_image_view *iview = fb->attachments[i].attachment;
2539 tu_bo_list_add(&cmd->bo_list, iview->image->bo,
2540 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
2541 }
2542 }
2543
2544 void
2545 tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2546 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2547 const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
2548 {
2549 tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
2550 pSubpassBeginInfo->contents);
2551 }
2552
2553 void
2554 tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
2555 {
2556 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2557 const struct tu_render_pass *pass = cmd->state.pass;
2558 struct tu_cs *cs = &cmd->draw_cs;
2559
2560 const struct tu_subpass *subpass = cmd->state.subpass++;
2561 /* TODO:
2562    * if the MSAA sample count changes between subpasses,
2563    * the attachment store is broken for some attachments
2564 */
2565 if (subpass->resolve_attachments) {
2566 tu6_emit_blit_scissor(cmd, cs, true);
2567 for (unsigned i = 0; i < subpass->color_count; i++) {
2568 uint32_t a = subpass->resolve_attachments[i].attachment;
2569 if (a != VK_ATTACHMENT_UNUSED) {
2570 tu6_emit_resolve(cmd, cs, a,
2571 subpass->color_attachments[i].attachment);
2572 }
2573 }
2574 }
2575
2576    /* Invalidate because reading input attachments will cache GMEM and
2577     * the cache isn't updated when GMEM is written.
2578 * TODO: is there a no-cache bit for textures?
2579 */
2580 if (cmd->state.subpass->input_count)
2581 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2582
2583 /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
2584 tu6_emit_zs(cmd, cmd->state.subpass, cs);
2585 tu6_emit_mrt(cmd, cmd->state.subpass, cs);
2586 tu6_emit_msaa(cmd, cmd->state.subpass, cs);
2587 tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
2588
2589 /* Emit flushes so that input attachments will read the correct value. This
2590 * is for sysmem only, although it shouldn't do much harm on gmem.
2591 */
2592 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2593 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2594
2595 /* TODO:
2596    * since we don't know how to do a GMEM->GMEM resolve,
2597    * resolve attachments are resolved to system memory and then loaded back into GMEM if needed
2598 */
2599 if (subpass->resolve_attachments) {
2600 for (unsigned i = 0; i < subpass->color_count; i++) {
2601 uint32_t a = subpass->resolve_attachments[i].attachment;
2602 if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) {
2603 tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n");
2604 tu6_emit_predicated_blit(cmd, cs, a, a, false);
2605 }
2606 }
2607 }
2608 }
2609
2610 void
2611 tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2612 const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
2613 const VkSubpassEndInfoKHR *pSubpassEndInfo)
2614 {
2615 tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
2616 }
2617
2618 struct tu_draw_info
2619 {
2620 /**
2621 * Number of vertices.
2622 */
2623 uint32_t count;
2624
2625 /**
2626 * Index of the first vertex.
2627 */
2628 int32_t vertex_offset;
2629
2630 /**
2631 * First instance id.
2632 */
2633 uint32_t first_instance;
2634
2635 /**
2636 * Number of instances.
2637 */
2638 uint32_t instance_count;
2639
2640 /**
2641 * First index (indexed draws only).
2642 */
2643 uint32_t first_index;
2644
2645 /**
2646 * Whether it's an indexed draw.
2647 */
2648 bool indexed;
2649
2650 /**
2651 * Indirect draw parameters resource.
2652 */
2653 struct tu_buffer *indirect;
2654 uint64_t indirect_offset;
2655 uint32_t stride;
2656
2657 /**
2658 * Draw count parameters resource.
2659 */
2660 struct tu_buffer *count_buffer;
2661 uint64_t count_buffer_offset;
2662 };
2663
2664 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2665 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
2666
2667 enum tu_draw_state_group_id
2668 {
2669 TU_DRAW_STATE_PROGRAM,
2670 TU_DRAW_STATE_PROGRAM_BINNING,
2671 TU_DRAW_STATE_VI,
2672 TU_DRAW_STATE_VI_BINNING,
2673 TU_DRAW_STATE_VP,
2674 TU_DRAW_STATE_RAST,
2675 TU_DRAW_STATE_DS,
2676 TU_DRAW_STATE_BLEND,
2677 TU_DRAW_STATE_VS_CONST,
2678 TU_DRAW_STATE_FS_CONST,
2679 TU_DRAW_STATE_VS_TEX,
2680 TU_DRAW_STATE_FS_TEX_SYSMEM,
2681 TU_DRAW_STATE_FS_TEX_GMEM,
2682 TU_DRAW_STATE_FS_IBO,
2683 TU_DRAW_STATE_VS_PARAMS,
2684
2685 TU_DRAW_STATE_COUNT,
2686 };
2687
2688 struct tu_draw_state_group
2689 {
2690 enum tu_draw_state_group_id id;
2691 uint32_t enable_mask;
2692 struct tu_cs_entry ib;
2693 };
2694
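/* Return a pointer to the sampler words for element array_index of binding i
 * in the given descriptor map: either an immutable sampler stored in the set
 * layout, or the sampler data written into the mapped descriptor set.
 */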
2695 static const struct tu_sampler *
2696 sampler_ptr(struct tu_descriptor_state *descriptors_state,
2697 const struct tu_descriptor_map *map, unsigned i,
2698 unsigned array_index)
2699 {
2700 assert(descriptors_state->valid & (1 << map->set[i]));
2701
2702 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2703 assert(map->binding[i] < set->layout->binding_count);
2704
2705 const struct tu_descriptor_set_binding_layout *layout =
2706 &set->layout->binding[map->binding[i]];
2707
2708 if (layout->immutable_samplers_offset) {
2709 const struct tu_sampler *immutable_samplers =
2710 tu_immutable_samplers(set->layout, layout);
2711
2712 return &immutable_samplers[array_index];
2713 }
2714
2715 switch (layout->type) {
2716 case VK_DESCRIPTOR_TYPE_SAMPLER:
2717 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4];
2718 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2719 return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
2720 array_index *
2721 (A6XX_TEX_CONST_DWORDS +
2722 sizeof(struct tu_sampler) / 4)];
2723 default:
2724 unreachable("unimplemented descriptor type");
2725 break;
2726 }
2727 }
2728
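/* Copy the A6XX_TEX_CONST descriptor for binding i / array_index into dst.
 * For input attachments on the GMEM path (!is_sysmem), the descriptor is
 * rewritten to a tiled (TILE6_2) 2D view at gmem_base + gmem_offset with a
 * pitch of tile-width * cpp, so the shader reads the attachment from GMEM
 * instead of system memory.
 */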
2729 static void
2730 write_tex_const(struct tu_cmd_buffer *cmd,
2731 uint32_t *dst,
2732 struct tu_descriptor_state *descriptors_state,
2733 const struct tu_descriptor_map *map,
2734 unsigned i, unsigned array_index, bool is_sysmem)
2735 {
2736 assert(descriptors_state->valid & (1 << map->set[i]));
2737
2738 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2739 assert(map->binding[i] < set->layout->binding_count);
2740
2741 const struct tu_descriptor_set_binding_layout *layout =
2742 &set->layout->binding[map->binding[i]];
2743
2744 switch (layout->type) {
2745 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2746 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2747 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2748 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2749 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2750 array_index * A6XX_TEX_CONST_DWORDS],
2751 A6XX_TEX_CONST_DWORDS * 4);
2752 break;
2753 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2754 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2755 array_index *
2756 (A6XX_TEX_CONST_DWORDS +
2757 sizeof(struct tu_sampler) / 4)],
2758 A6XX_TEX_CONST_DWORDS * 4);
2759 break;
2760 default:
2761 unreachable("unimplemented descriptor type");
2762 break;
2763 }
2764
2765 if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
2766 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2767 uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
2768 array_index].attachment;
2769 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2770
2771 assert(att->gmem_offset >= 0);
2772
2773 dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
2774 dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
2775 dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
2776 dst[2] |=
2777 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
2778 A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
2779 dst[3] = 0;
2780 dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
2781 dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
2782       for (unsigned j = 6; j < A6XX_TEX_CONST_DWORDS; j++)
2783          dst[j] = 0;
2784
2785 if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2786 tu_finishme("patch input attachment pitch for secondary cmd buffer");
2787 }
2788 }
2789
2790 static void
2791 write_image_ibo(struct tu_cmd_buffer *cmd,
2792 uint32_t *dst,
2793 struct tu_descriptor_state *descriptors_state,
2794 const struct tu_descriptor_map *map,
2795 unsigned i, unsigned array_index)
2796 {
2797 assert(descriptors_state->valid & (1 << map->set[i]));
2798
2799 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2800 assert(map->binding[i] < set->layout->binding_count);
2801
2802 const struct tu_descriptor_set_binding_layout *layout =
2803 &set->layout->binding[map->binding[i]];
2804
2805 assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
2806
2807 memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
2808 (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
2809 A6XX_TEX_CONST_DWORDS * 4);
2810 }
2811
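/* Return the 64-bit iova of a UBO/SSBO descriptor. Dynamic buffers come from
 * the dynamic_buffers array (dynamic offset already applied at bind time);
 * for plain buffers the address is assembled from the two dwords stored in
 * the mapped descriptor set.
 */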
2812 static uint64_t
2813 buffer_ptr(struct tu_descriptor_state *descriptors_state,
2814 const struct tu_descriptor_map *map,
2815 unsigned i, unsigned array_index)
2816 {
2817 assert(descriptors_state->valid & (1 << map->set[i]));
2818
2819 struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
2820 assert(map->binding[i] < set->layout->binding_count);
2821
2822 const struct tu_descriptor_set_binding_layout *layout =
2823 &set->layout->binding[map->binding[i]];
2824
2825 switch (layout->type) {
2826 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2827 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2828 return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
2829 array_index];
2830 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2831 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2832 return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
2833 set->mapped_ptr[layout->offset / 4 + array_index * 2];
2834 default:
2835 unreachable("unimplemented descriptor type");
2836 break;
2837 }
2838 }
2839
2840 static inline uint32_t
2841 tu6_stage2opcode(gl_shader_stage type)
2842 {
2843 switch (type) {
2844 case MESA_SHADER_VERTEX:
2845 case MESA_SHADER_TESS_CTRL:
2846 case MESA_SHADER_TESS_EVAL:
2847 case MESA_SHADER_GEOMETRY:
2848 return CP_LOAD_STATE6_GEOM;
2849 case MESA_SHADER_FRAGMENT:
2850 case MESA_SHADER_COMPUTE:
2851 case MESA_SHADER_KERNEL:
2852 return CP_LOAD_STATE6_FRAG;
2853 default:
2854 unreachable("bad shader type");
2855 }
2856 }
2857
2858 static inline enum a6xx_state_block
2859 tu6_stage2shadersb(gl_shader_stage type)
2860 {
2861 switch (type) {
2862 case MESA_SHADER_VERTEX:
2863 return SB6_VS_SHADER;
2864 case MESA_SHADER_FRAGMENT:
2865 return SB6_FS_SHADER;
2866 case MESA_SHADER_COMPUTE:
2867 case MESA_SHADER_KERNEL:
2868 return SB6_CS_SHADER;
2869 default:
2870 unreachable("bad shader type");
2871 return ~0;
2872 }
2873 }
2874
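/* Emit the constant ranges required by the shader stage. Range 0 holds the
 * push constants and is uploaded inline with CP_LOAD_STATE6 (SS6_DIRECT);
 * the remaining ranges correspond to UBO data pushed into constant registers
 * and are loaded indirectly (SS6_INDIRECT) from the UBO's iova.
 */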
2875 static void
2876 tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2877 struct tu_descriptor_state *descriptors_state,
2878 gl_shader_stage type,
2879 uint32_t *push_constants)
2880 {
2881 const struct tu_program_descriptor_linkage *link =
2882 &pipeline->program.link[type];
2883 const struct ir3_ubo_analysis_state *state = &link->ubo_state;
2884
2885 for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
2886 if (state->range[i].start < state->range[i].end) {
2887 uint32_t size = state->range[i].end - state->range[i].start;
2888 uint32_t offset = state->range[i].start;
2889
2890          /* Even if the start of the range fits within the shader's constlen,
2891           * the end may not, so clamp the size:
2892 */
2893 size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
2894
2895 if (size == 0)
2896 continue;
2897
2898 /* things should be aligned to vec4: */
2899 debug_assert((state->range[i].offset % 16) == 0);
2900 debug_assert((size % 16) == 0);
2901 debug_assert((offset % 16) == 0);
2902
2903 if (i == 0) {
2904 /* push constants */
2905 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (size / 4));
2906 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2907 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2908 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2909 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2910 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2911 tu_cs_emit(cs, 0);
2912 tu_cs_emit(cs, 0);
2913             for (unsigned j = 0; j < size / 4; j++)
2914                tu_cs_emit(cs, push_constants[j + offset / 4]);
2915 continue;
2916 }
2917
2918 /* Look through the UBO map to find our UBO index, and get the VA for
2919 * that UBO.
2920 */
2921 uint64_t va = 0;
2922 uint32_t ubo_idx = i - 1;
2923 uint32_t ubo_map_base = 0;
2924 for (int j = 0; j < link->ubo_map.num; j++) {
2925 if (ubo_idx >= ubo_map_base &&
2926 ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
2927 va = buffer_ptr(descriptors_state, &link->ubo_map, j,
2928 ubo_idx - ubo_map_base);
2929 break;
2930 }
2931 ubo_map_base += link->ubo_map.array_size[j];
2932 }
2933 assert(va);
2934
2935 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
2936 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
2937 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2938 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
2939 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2940 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
2941 tu_cs_emit_qw(cs, va + offset);
2942 }
2943 }
2944 }
2945
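/* Emit the UBO pointer table into the const space reserved by the compiler
 * (const_state.offsets.ubo). Pointers are uploaded two per vec4, so the
 * count is aligned to 2 and padded with 0xffffffff.
 */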
2946 static void
2947 tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
2948 struct tu_descriptor_state *descriptors_state,
2949 gl_shader_stage type)
2950 {
2951 const struct tu_program_descriptor_linkage *link =
2952 &pipeline->program.link[type];
2953
2954 uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
2955 uint32_t anum = align(num, 2);
2956
2957 if (!num)
2958 return;
2959
2960 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
2961 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
2962 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2963 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2964 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
2965 CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
2966 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2967 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2968
2969 unsigned emitted = 0;
2970 for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
2971 for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
2972 tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
2973 emitted++;
2974 }
2975 }
2976
2977 for (; emitted < anum; emitted++) {
2978 tu_cs_emit(cs, 0xffffffff);
2979 tu_cs_emit(cs, 0xffffffff);
2980 }
2981 }
2982
2983 static struct tu_cs_entry
2984 tu6_emit_consts(struct tu_cmd_buffer *cmd,
2985 const struct tu_pipeline *pipeline,
2986 struct tu_descriptor_state *descriptors_state,
2987 gl_shader_stage type)
2988 {
2989 struct tu_cs cs;
2990 tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
2991
2992 tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
2993 tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
2994
2995 return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
2996 }
2997
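/* Emit the VS driver params. Only the base instance (IR3_DP_INSTID_BASE, the
 * third dword of the vec4) is currently filled in; the other slots are
 * written as zero. Nothing is emitted when the driver-param offset lies
 * beyond the shader's constlen.
 */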
2998 static VkResult
2999 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3000 const struct tu_draw_info *draw,
3001 struct tu_cs_entry *entry)
3002 {
3003 /* TODO: fill out more than just base instance */
3004 const struct tu_program_descriptor_linkage *link =
3005 &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3006 const struct ir3_const_state *const_state = &link->const_state;
3007 struct tu_cs cs;
3008
3009 if (const_state->offsets.driver_param >= link->constlen) {
3010 *entry = (struct tu_cs_entry) {};
3011 return VK_SUCCESS;
3012 }
3013
3014 VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
3015 if (result != VK_SUCCESS)
3016 return result;
3017
3018 tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3019 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
3020 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3021 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3022 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3023 CP_LOAD_STATE6_0_NUM_UNIT(1));
3024 tu_cs_emit(&cs, 0);
3025 tu_cs_emit(&cs, 0);
3026
3027 STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3028
3029 tu_cs_emit(&cs, 0);
3030 tu_cs_emit(&cs, 0);
3031 tu_cs_emit(&cs, draw->first_instance);
3032 tu_cs_emit(&cs, 0);
3033
3034 *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3035 return VK_SUCCESS;
3036 }
3037
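/* Build the texture and sampler descriptor arrays for one shader stage in
 * the sub_cs, then emit a small IB that points the stage's TEX_CONST /
 * TEX_SAMP / TEX_COUNT registers (and CP_LOAD_STATE6) at them. is_sysmem
 * selects whether input-attachment descriptors are patched for GMEM or left
 * pointing at system memory.
 */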
3038 static VkResult
3039 tu6_emit_textures(struct tu_cmd_buffer *cmd,
3040 const struct tu_pipeline *pipeline,
3041 struct tu_descriptor_state *descriptors_state,
3042 gl_shader_stage type,
3043 struct tu_cs_entry *entry,
3044 bool *needs_border,
3045 bool is_sysmem)
3046 {
3047 struct tu_cs *draw_state = &cmd->sub_cs;
3048 const struct tu_program_descriptor_linkage *link =
3049 &pipeline->program.link[type];
3050 VkResult result;
3051
3052 if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
3053 *entry = (struct tu_cs_entry) {};
3054 return VK_SUCCESS;
3055 }
3056
3057 /* allocate and fill texture state */
3058 struct ts_cs_memory tex_const;
3059 result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
3060 A6XX_TEX_CONST_DWORDS, &tex_const);
3061 if (result != VK_SUCCESS)
3062 return result;
3063
3064 int tex_index = 0;
3065 for (unsigned i = 0; i < link->texture_map.num; i++) {
3066 for (int j = 0; j < link->texture_map.array_size[i]; j++) {
3067 write_tex_const(cmd,
3068 &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
3069 descriptors_state, &link->texture_map, i, j,
3070 is_sysmem);
3071 }
3072 }
3073
3074 /* allocate and fill sampler state */
3075 struct ts_cs_memory tex_samp = { 0 };
3076 if (link->sampler_map.num_desc) {
3077 result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
3078 A6XX_TEX_SAMP_DWORDS, &tex_samp);
3079 if (result != VK_SUCCESS)
3080 return result;
3081
3082 int sampler_index = 0;
3083 for (unsigned i = 0; i < link->sampler_map.num; i++) {
3084 for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
3085 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3086 &link->sampler_map,
3087 i, j);
3088 memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
3089 sampler->state, sizeof(sampler->state));
3090 *needs_border |= sampler->needs_border;
3091 }
3092 }
3093 }
3094
3095 unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
3096 enum a6xx_state_block sb;
3097
3098 switch (type) {
3099 case MESA_SHADER_VERTEX:
3100 sb = SB6_VS_TEX;
3101 tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
3102 tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
3103 tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
3104 break;
3105 case MESA_SHADER_FRAGMENT:
3106 sb = SB6_FS_TEX;
3107 tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
3108 tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
3109 tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
3110 break;
3111 case MESA_SHADER_COMPUTE:
3112 sb = SB6_CS_TEX;
3113 tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
3114 tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
3115 tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
3116 break;
3117 default:
3118 unreachable("bad state block");
3119 }
3120
3121 struct tu_cs cs;
3122 result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
3123 if (result != VK_SUCCESS)
3124 return result;
3125
3126 if (link->sampler_map.num_desc) {
3127 /* output sampler state: */
3128 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3129 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3130 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
3131 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3132 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3133 CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
3134 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3135
3136 tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
3137 tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
3138 }
3139
3140 /* emit texture state: */
3141 tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
3142 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3143 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3144 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3145 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3146 CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
3147 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3148
3149 tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
3150 tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
3151
3152 tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
3153 tu_cs_emit(&cs, link->texture_map.num_desc);
3154
3155 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3156 return VK_SUCCESS;
3157 }
3158
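/* Build the IBO descriptor array (SSBOs followed by storage images) for a
 * stage. SSBOs are described as large 1D 32_UINT buffers since
 * robustBufferAccess is not exposed; storage images reuse the second
 * descriptor of the pair written by the descriptor-set code.
 */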
3159 static VkResult
3160 tu6_emit_ibo(struct tu_cmd_buffer *cmd,
3161 const struct tu_pipeline *pipeline,
3162 struct tu_descriptor_state *descriptors_state,
3163 gl_shader_stage type,
3164 struct tu_cs_entry *entry)
3165 {
3166 struct tu_cs *draw_state = &cmd->sub_cs;
3167 const struct tu_program_descriptor_linkage *link =
3168 &pipeline->program.link[type];
3169 VkResult result;
3170
3171 unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
3172
3173 if (num_desc == 0) {
3174 *entry = (struct tu_cs_entry) {};
3175 return VK_SUCCESS;
3176 }
3177
3178 struct ts_cs_memory ibo_const;
3179 result = tu_cs_alloc(draw_state, num_desc,
3180 A6XX_TEX_CONST_DWORDS, &ibo_const);
3181 if (result != VK_SUCCESS)
3182 return result;
3183
3184 int ssbo_index = 0;
3185 for (unsigned i = 0; i < link->ssbo_map.num; i++) {
3186 for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
3187 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3188
3189 uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
3190 /* We don't expose robustBufferAccess, so leave the size unlimited. */
3191 uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
3192
3193 dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
3194 dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
3195 A6XX_IBO_1_HEIGHT(sz >> 15);
3196 dst[2] = A6XX_IBO_2_UNK4 |
3197 A6XX_IBO_2_UNK31 |
3198 A6XX_IBO_2_TYPE(A6XX_TEX_1D);
3199 dst[3] = 0;
3200 dst[4] = va;
3201 dst[5] = va >> 32;
3202          for (int k = 6; k < A6XX_TEX_CONST_DWORDS; k++)
3203             dst[k] = 0;
3204
3205 ssbo_index++;
3206 }
3207 }
3208
3209 for (unsigned i = 0; i < link->image_map.num; i++) {
3210 for (int j = 0; j < link->image_map.array_size[i]; j++) {
3211 uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
3212
3213 write_image_ibo(cmd, dst,
3214 descriptors_state, &link->image_map, i, j);
3215
3216 ssbo_index++;
3217 }
3218 }
3219
3220 assert(ssbo_index == num_desc);
3221
3222 struct tu_cs cs;
3223 result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
3224 if (result != VK_SUCCESS)
3225 return result;
3226
3227 uint32_t opcode, ibo_addr_reg;
3228 enum a6xx_state_block sb;
3229 enum a6xx_state_type st;
3230
3231 switch (type) {
3232 case MESA_SHADER_FRAGMENT:
3233 opcode = CP_LOAD_STATE6;
3234 st = ST6_SHADER;
3235 sb = SB6_IBO;
3236 ibo_addr_reg = REG_A6XX_SP_IBO_LO;
3237 break;
3238 case MESA_SHADER_COMPUTE:
3239 opcode = CP_LOAD_STATE6_FRAG;
3240 st = ST6_IBO;
3241 sb = SB6_CS_SHADER;
3242 ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
3243 break;
3244 default:
3245 unreachable("unsupported stage for ibos");
3246 }
3247
3248    /* emit IBO state: */
3249 tu_cs_emit_pkt7(&cs, opcode, 3);
3250 tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
3251 CP_LOAD_STATE6_0_STATE_TYPE(st) |
3252 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3253 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
3254 CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
3255 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3256
3257 tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
3258 tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
3259
3260 *entry = tu_cs_end_sub_stream(draw_state, &cs);
3261 return VK_SUCCESS;
3262 }
3263
3264 struct PACKED bcolor_entry {
3265 uint32_t fp32[4];
3266 uint16_t ui16[4];
3267 int16_t si16[4];
3268 uint16_t fp16[4];
3269 uint16_t rgb565;
3270 uint16_t rgb5a1;
3271 uint16_t rgba4;
3272 uint8_t __pad0[2];
3273 uint8_t ui8[4];
3274 int8_t si8[4];
3275 uint32_t rgb10a2;
3276 uint32_t z24; /* also s8? */
3277 uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
3278 uint8_t __pad1[56];
3279 } border_color[] = {
3280 [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = {},
3281 [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = {},
3282 [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = {
3283 .fp32[3] = 0x3f800000,
3284 .ui16[3] = 0xffff,
3285 .si16[3] = 0x7fff,
3286 .fp16[3] = 0x3c00,
3287 .rgb5a1 = 0x8000,
3288 .rgba4 = 0xf000,
3289 .ui8[3] = 0xff,
3290 .si8[3] = 0x7f,
3291 .rgb10a2 = 0xc0000000,
3292 .srgb[3] = 0x3c00,
3293 },
3294 [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = {
3295 .fp32[3] = 1,
3296 .fp16[3] = 1,
3297 },
3298 [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = {
3299 .fp32[0 ... 3] = 0x3f800000,
3300 .ui16[0 ... 3] = 0xffff,
3301 .si16[0 ... 3] = 0x7fff,
3302 .fp16[0 ... 3] = 0x3c00,
3303 .rgb565 = 0xffff,
3304 .rgb5a1 = 0xffff,
3305 .rgba4 = 0xffff,
3306 .ui8[0 ... 3] = 0xff,
3307 .si8[0 ... 3] = 0x7f,
3308 .rgb10a2 = 0xffffffff,
3309 .z24 = 0xffffff,
3310 .srgb[0 ... 3] = 0x3c00,
3311 },
3312 [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = {
3313 .fp32[0 ... 3] = 1,
3314 .fp16[0 ... 3] = 1,
3315 },
3316 };
3317
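/* Upload one 128-byte bcolor_entry per sampler, VS samplers first and then
 * FS samplers, and point SP_TP_BORDER_COLOR_BASE_ADDR at the array. The
 * entry order is presumably expected to match the order in which sampler
 * state is emitted.
 */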
3318 static VkResult
3319 tu6_emit_border_color(struct tu_cmd_buffer *cmd,
3320 struct tu_cs *cs)
3321 {
3322 STATIC_ASSERT(sizeof(struct bcolor_entry) == 128);
3323
3324 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3325 struct tu_descriptor_state *descriptors_state =
3326 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3327 const struct tu_descriptor_map *vs_sampler =
3328 &pipeline->program.link[MESA_SHADER_VERTEX].sampler_map;
3329 const struct tu_descriptor_map *fs_sampler =
3330 &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map;
3331 struct ts_cs_memory ptr;
3332
3333 VkResult result = tu_cs_alloc(&cmd->sub_cs,
3334 vs_sampler->num_desc + fs_sampler->num_desc,
3335 128 / 4,
3336 &ptr);
3337 if (result != VK_SUCCESS)
3338 return result;
3339
3340 for (unsigned i = 0; i < vs_sampler->num; i++) {
3341 for (unsigned j = 0; j < vs_sampler->array_size[i]; j++) {
3342 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3343 vs_sampler, i, j);
3344 memcpy(ptr.map, &border_color[sampler->border], 128);
3345 ptr.map += 128 / 4;
3346 }
3347 }
3348
3349 for (unsigned i = 0; i < fs_sampler->num; i++) {
3350 for (unsigned j = 0; j < fs_sampler->array_size[i]; j++) {
3351 const struct tu_sampler *sampler = sampler_ptr(descriptors_state,
3352 fs_sampler, i, j);
3353 memcpy(ptr.map, &border_color[sampler->border], 128);
3354 ptr.map += 128 / 4;
3355 }
3356 }
3357
3358 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
3359 tu_cs_emit_qw(cs, ptr.iova);
3360 return VK_SUCCESS;
3361 }
3362
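/* Collect the dirty draw state into tu_draw_state_group entries and emit a
 * single CP_SET_DRAW_STATE packet (3 dwords per group: header plus 64-bit IB
 * address). The enable masks select in which passes a group is replayed:
 * binning, GMEM rendering, sysmem rendering, or any combination.
 */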
3363 static VkResult
3364 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
3365 struct tu_cs *cs,
3366 const struct tu_draw_info *draw)
3367 {
3368 const struct tu_pipeline *pipeline = cmd->state.pipeline;
3369 const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
3370 struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
3371 uint32_t draw_state_group_count = 0;
3372 VkResult result;
3373
3374 struct tu_descriptor_state *descriptors_state =
3375 &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3376
3377 /* TODO lrz */
3378
3379 tu_cs_emit_regs(cs,
3380 A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart =
3381 pipeline->ia.primitive_restart && draw->indexed));
3382
3383 if (cmd->state.dirty &
3384 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
3385 (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
3386 tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
3387 dynamic->line_width);
3388 }
3389
3390 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
3391 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
3392 tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
3393 dynamic->stencil_compare_mask.back);
3394 }
3395
3396 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
3397 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
3398 tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
3399 dynamic->stencil_write_mask.back);
3400 }
3401
3402 if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
3403 (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
3404 tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
3405 dynamic->stencil_reference.back);
3406 }
3407
3408 if (cmd->state.dirty &
3409 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) {
3410 for (uint32_t i = 0; i < pipeline->vi.count; i++) {
3411 const uint32_t binding = pipeline->vi.bindings[i];
3412 const uint32_t stride = pipeline->vi.strides[i];
3413 const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
3414 const VkDeviceSize offset = buf->bo_offset +
3415 cmd->state.vb.offsets[binding] +
3416 pipeline->vi.offsets[i];
3417 const VkDeviceSize size =
3418 offset < buf->bo->size ? buf->bo->size - offset : 0;
3419
3420 tu_cs_emit_regs(cs,
3421 A6XX_VFD_FETCH_BASE(i, .bo = buf->bo, .bo_offset = offset),
3422 A6XX_VFD_FETCH_SIZE(i, size),
3423 A6XX_VFD_FETCH_STRIDE(i, stride));
3424 }
3425 }
3426
3427 if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
3428 draw_state_groups[draw_state_group_count++] =
3429 (struct tu_draw_state_group) {
3430 .id = TU_DRAW_STATE_PROGRAM,
3431 .enable_mask = ENABLE_DRAW,
3432 .ib = pipeline->program.state_ib,
3433 };
3434 draw_state_groups[draw_state_group_count++] =
3435 (struct tu_draw_state_group) {
3436 .id = TU_DRAW_STATE_PROGRAM_BINNING,
3437 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3438 .ib = pipeline->program.binning_state_ib,
3439 };
3440 draw_state_groups[draw_state_group_count++] =
3441 (struct tu_draw_state_group) {
3442 .id = TU_DRAW_STATE_VI,
3443 .enable_mask = ENABLE_DRAW,
3444 .ib = pipeline->vi.state_ib,
3445 };
3446 draw_state_groups[draw_state_group_count++] =
3447 (struct tu_draw_state_group) {
3448 .id = TU_DRAW_STATE_VI_BINNING,
3449 .enable_mask = CP_SET_DRAW_STATE__0_BINNING,
3450 .ib = pipeline->vi.binning_state_ib,
3451 };
3452 draw_state_groups[draw_state_group_count++] =
3453 (struct tu_draw_state_group) {
3454 .id = TU_DRAW_STATE_VP,
3455 .enable_mask = ENABLE_ALL,
3456 .ib = pipeline->vp.state_ib,
3457 };
3458 draw_state_groups[draw_state_group_count++] =
3459 (struct tu_draw_state_group) {
3460 .id = TU_DRAW_STATE_RAST,
3461 .enable_mask = ENABLE_ALL,
3462 .ib = pipeline->rast.state_ib,
3463 };
3464 draw_state_groups[draw_state_group_count++] =
3465 (struct tu_draw_state_group) {
3466 .id = TU_DRAW_STATE_DS,
3467 .enable_mask = ENABLE_ALL,
3468 .ib = pipeline->ds.state_ib,
3469 };
3470 draw_state_groups[draw_state_group_count++] =
3471 (struct tu_draw_state_group) {
3472 .id = TU_DRAW_STATE_BLEND,
3473 .enable_mask = ENABLE_ALL,
3474 .ib = pipeline->blend.state_ib,
3475 };
3476 }
3477
3478 if (cmd->state.dirty &
3479 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) {
3480 draw_state_groups[draw_state_group_count++] =
3481 (struct tu_draw_state_group) {
3482 .id = TU_DRAW_STATE_VS_CONST,
3483 .enable_mask = ENABLE_ALL,
3484 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX)
3485 };
3486 draw_state_groups[draw_state_group_count++] =
3487 (struct tu_draw_state_group) {
3488 .id = TU_DRAW_STATE_FS_CONST,
3489 .enable_mask = ENABLE_DRAW,
3490 .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT)
3491 };
3492 }
3493
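   /* Fragment shader texture state is emitted in two flavours: input
    * attachments are read from GMEM during tiled rendering but from system
    * memory when the sysmem fallback is used, and the GMEM/SYSMEM enable
    * masks let the CP pick the matching descriptor state.
    */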
3494 if (cmd->state.dirty &
3495 (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
3496 bool needs_border = false;
3497 struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
3498
3499 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3500 MESA_SHADER_VERTEX, &vs_tex, &needs_border,
3501 false);
3502 if (result != VK_SUCCESS)
3503 return result;
3504
3505 /* TODO: we could emit just one texture descriptor draw state when there
3506 * are no input attachments, which is the most common case. We could
3507 * also split out the sampler state, which doesn't change even for input
3508 * attachments.
3509 */
3510 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3511 MESA_SHADER_FRAGMENT, &fs_tex_sysmem,
3512 &needs_border, true);
3513 if (result != VK_SUCCESS)
3514 return result;
3515
3516 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3517 MESA_SHADER_FRAGMENT, &fs_tex_gmem,
3518 &needs_border, false);
3519 if (result != VK_SUCCESS)
3520 return result;
3521
3522 result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
3523 MESA_SHADER_FRAGMENT, &fs_ibo);
3524 if (result != VK_SUCCESS)
3525 return result;
3526
3527 draw_state_groups[draw_state_group_count++] =
3528 (struct tu_draw_state_group) {
3529 .id = TU_DRAW_STATE_VS_TEX,
3530 .enable_mask = ENABLE_ALL,
3531 .ib = vs_tex,
3532 };
3533 draw_state_groups[draw_state_group_count++] =
3534 (struct tu_draw_state_group) {
3535 .id = TU_DRAW_STATE_FS_TEX_GMEM,
3536 .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
3537 .ib = fs_tex_gmem,
3538 };
3539 draw_state_groups[draw_state_group_count++] =
3540 (struct tu_draw_state_group) {
3541 .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
3542 .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
3543 .ib = fs_tex_sysmem,
3544 };
3545 draw_state_groups[draw_state_group_count++] =
3546 (struct tu_draw_state_group) {
3547 .id = TU_DRAW_STATE_FS_IBO,
3548 .enable_mask = ENABLE_DRAW,
3549 .ib = fs_ibo,
3550 };
3551
3552 if (needs_border) {
3553 result = tu6_emit_border_color(cmd, cs);
3554 if (result != VK_SUCCESS)
3555 return result;
3556 }
3557 }
3558
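   /* Per-draw vertex shader parameters (e.g. vertex/instance offsets) depend
    * on the draw itself, so they are re-emitted unconditionally.
    */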
3559 struct tu_cs_entry vs_params;
3560 result = tu6_emit_vs_params(cmd, draw, &vs_params);
3561 if (result != VK_SUCCESS)
3562 return result;
3563
3564 draw_state_groups[draw_state_group_count++] =
3565 (struct tu_draw_state_group) {
3566 .id = TU_DRAW_STATE_VS_PARAMS,
3567 .enable_mask = ENABLE_ALL,
3568 .ib = vs_params,
3569 };
3570
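   /* Each group takes three dwords in CP_SET_DRAW_STATE: a header with the
    * dword count, enable mask and group id, followed by the 64-bit address
    * of the group's IB (or a DISABLE header when the IB is empty).
    */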
3571 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
3572 for (uint32_t i = 0; i < draw_state_group_count; i++) {
3573 const struct tu_draw_state_group *group = &draw_state_groups[i];
3574 debug_assert((group->enable_mask & ~ENABLE_ALL) == 0);
3575 uint32_t cp_set_draw_state =
3576 CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) |
3577 group->enable_mask |
3578 CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
3579 uint64_t iova;
3580 if (group->ib.size) {
3581 iova = group->ib.bo->iova + group->ib.offset;
3582 } else {
3583 cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
3584 iova = 0;
3585 }
3586
3587 tu_cs_emit(cs, cp_set_draw_state);
3588 tu_cs_emit_qw(cs, iova);
3589 }
3590
3591 tu_cs_sanity_check(cs);
3592
3593 /* track BOs: any buffer referenced by this draw must be in the submit's BO list */
3594 if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
3595 for (uint32_t i = 0; i < MAX_VBS; i++) {
3596 const struct tu_buffer *buf = cmd->state.vb.buffers[i];
3597 if (buf)
3598 tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
3599 }
3600 }
3601 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3602 unsigned i;
3603 for_each_bit(i, descriptors_state->valid) {
3604 struct tu_descriptor_set *set = descriptors_state->sets[i];
3605 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3606 if (set->descriptors[j]) {
3607 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3608 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3609 }
3610 }
3611 }
3612
3613 /* Fragment shader state overwrites compute shader state, so flag the
3614 * compute pipeline for re-emit.
3615 */
3616 cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
3617 return VK_SUCCESS;
3618 }
3619
3620 static void
3621 tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
3622 struct tu_cs *cs,
3623 const struct tu_draw_info *draw)
3624 {
3625
3626 const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
3627
3628 tu_cs_emit_regs(cs,
3629 A6XX_VFD_INDEX_OFFSET(draw->vertex_offset),
3630 A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance));
3631
3632 /* TODO hw binning */
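   /* Indexed draws fetch the index buffer by DMA from its iova; non-indexed
    * draws use the same packet with auto-generated indices.
    */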
3633 if (draw->indexed) {
3634 const enum a4xx_index_size index_size =
3635 tu6_index_size(cmd->state.index_type);
3636 const uint32_t index_bytes =
3637 (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
3638 const struct tu_buffer *buf = cmd->state.index_buffer;
3639 const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
3640 index_bytes * draw->first_index;
3641 const uint32_t size = index_bytes * draw->count;
3642
3643 const uint32_t cp_draw_indx =
3644 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3645 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
3646 CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
3647 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3648
3649 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3650 tu_cs_emit(cs, cp_draw_indx);
3651 tu_cs_emit(cs, draw->instance_count);
3652 tu_cs_emit(cs, draw->count);
3653 tu_cs_emit(cs, 0x0); /* XXX */
3654 tu_cs_emit_qw(cs, buf->bo->iova + offset);
3655 tu_cs_emit(cs, size);
3656 } else {
3657 const uint32_t cp_draw_indx =
3658 CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3659 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
3660 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000;
3661
3662 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3663 tu_cs_emit(cs, cp_draw_indx);
3664 tu_cs_emit(cs, draw->instance_count);
3665 tu_cs_emit(cs, draw->count);
3666 }
3667 }
3668
3669 static void
3670 tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
3671 {
3672 struct tu_cs *cs = &cmd->draw_cs;
3673 VkResult result;
3674
3675 result = tu6_bind_draw_states(cmd, cs, draw);
3676 if (result != VK_SUCCESS) {
3677 cmd->record_result = result;
3678 return;
3679 }
3680
3681 if (draw->indirect) {
3682 tu_finishme("indirect draw");
3683 return;
3684 }
3685
3686 tu6_emit_draw_direct(cmd, cs, draw);
3687
3688 cmd->wait_for_idle = true;
3689
3690 tu_cs_sanity_check(cs);
3691 }
3692
3693 void
3694 tu_CmdDraw(VkCommandBuffer commandBuffer,
3695 uint32_t vertexCount,
3696 uint32_t instanceCount,
3697 uint32_t firstVertex,
3698 uint32_t firstInstance)
3699 {
3700 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3701 struct tu_draw_info info = {};
3702
3703 info.count = vertexCount;
3704 info.instance_count = instanceCount;
3705 info.first_instance = firstInstance;
3706 info.vertex_offset = firstVertex;
3707
3708 tu_draw(cmd_buffer, &info);
3709 }
3710
3711 void
3712 tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3713 uint32_t indexCount,
3714 uint32_t instanceCount,
3715 uint32_t firstIndex,
3716 int32_t vertexOffset,
3717 uint32_t firstInstance)
3718 {
3719 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3720 struct tu_draw_info info = {};
3721
3722 info.indexed = true;
3723 info.count = indexCount;
3724 info.instance_count = instanceCount;
3725 info.first_index = firstIndex;
3726 info.vertex_offset = vertexOffset;
3727 info.first_instance = firstInstance;
3728
3729 tu_draw(cmd_buffer, &info);
3730 }
3731
3732 void
3733 tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3734 VkBuffer _buffer,
3735 VkDeviceSize offset,
3736 uint32_t drawCount,
3737 uint32_t stride)
3738 {
3739 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3740 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3741 struct tu_draw_info info = {};
3742
3743 info.count = drawCount;
3744 info.indirect = buffer;
3745 info.indirect_offset = offset;
3746 info.stride = stride;
3747
3748 tu_draw(cmd_buffer, &info);
3749 }
3750
3751 void
3752 tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3753 VkBuffer _buffer,
3754 VkDeviceSize offset,
3755 uint32_t drawCount,
3756 uint32_t stride)
3757 {
3758 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3759 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3760 struct tu_draw_info info = {};
3761
3762 info.indexed = true;
3763 info.count = drawCount;
3764 info.indirect = buffer;
3765 info.indirect_offset = offset;
3766 info.stride = stride;
3767
3768 tu_draw(cmd_buffer, &info);
3769 }
3770
3771 struct tu_dispatch_info
3772 {
3773 /**
3774 * The dimensions of the dispatch grid, in workgroup (block) units.
3775 */
3776 uint32_t blocks[3];
3777
3778 /**
3779 * A starting offset for the grid, in workgroup units. Even when
3780 * unaligned is set, this offset must still be aligned.
3781 */
3782 uint32_t offsets[3];
3783 /**
3784 * Whether it's an unaligned compute dispatch.
3785 */
3786 bool unaligned;
3787
3788 /**
3789 * Indirect compute parameters resource.
3790 */
3791 struct tu_buffer *indirect;
3792 uint64_t indirect_offset;
3793 };
3794
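/* Upload the compute "driver params" (workgroup counts and local size) for
 * which the ir3 compiler may have reserved constant space; if the shader's
 * constlen does not reach the driver param offset, the shader does not use
 * them and nothing is emitted.
 */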
3795 static void
3796 tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
3797 const struct tu_dispatch_info *info)
3798 {
3799 gl_shader_stage type = MESA_SHADER_COMPUTE;
3800 const struct tu_program_descriptor_linkage *link =
3801 &pipeline->program.link[type];
3802 const struct ir3_const_state *const_state = &link->const_state;
3803 uint32_t offset = const_state->offsets.driver_param;
3804
3805 if (link->constlen <= offset)
3806 return;
3807
3808 if (!info->indirect) {
3809 uint32_t driver_params[IR3_DP_CS_COUNT] = {
3810 [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
3811 [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
3812 [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
3813 [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0],
3814 [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1],
3815 [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2],
3816 };
3817
3818 uint32_t num_consts = MIN2(const_state->num_driver_params,
3819 (link->constlen - offset) * 4);
3820 /* load the driver params as immediate constants */
3821 tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
3822 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3823 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3824 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3825 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3826 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
3827 tu_cs_emit(cs, 0);
3828 tu_cs_emit(cs, 0);
3829 uint32_t i;
3830 for (i = 0; i < num_consts; i++)
3831 tu_cs_emit(cs, driver_params[i]);
3832 } else {
3833 tu_finishme("Indirect driver params");
3834 }
3835 }
3836
3837 static void
3838 tu_dispatch(struct tu_cmd_buffer *cmd,
3839 const struct tu_dispatch_info *info)
3840 {
3841 struct tu_cs *cs = &cmd->cs;
3842 struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
3843 struct tu_descriptor_state *descriptors_state =
3844 &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
3845 VkResult result;
3846
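   /* Re-emit the compute program state if a graphics draw clobbered it. */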
3847 if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
3848 tu_cs_emit_ib(cs, &pipeline->program.state_ib);
3849
3850 struct tu_cs_entry ib;
3851
3852 ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3853 if (ib.size)
3854 tu_cs_emit_ib(cs, &ib);
3855
3856 tu_emit_compute_driver_params(cs, pipeline, info);
3857
3858 bool needs_border;
3859 result = tu6_emit_textures(cmd, pipeline, descriptors_state,
3860 MESA_SHADER_COMPUTE, &ib, &needs_border, false);
3861 if (result != VK_SUCCESS) {
3862 cmd->record_result = result;
3863 return;
3864 }
3865
3866 if (ib.size)
3867 tu_cs_emit_ib(cs, &ib);
3868
3869 if (needs_border)
3870 tu_finishme("compute border color");
3871
3872 result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
3873 if (result != VK_SUCCESS) {
3874 cmd->record_result = result;
3875 return;
3876 }
3877
3878 if (ib.size)
3879 tu_cs_emit_ib(cs, &ib);
3880
3881 /* track BOs */
3882 if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
3883 unsigned i;
3884 for_each_bit(i, descriptors_state->valid) {
3885 struct tu_descriptor_set *set = descriptors_state->sets[i];
3886 for (unsigned j = 0; j < set->layout->buffer_count; ++j)
3887 if (set->descriptors[j]) {
3888 tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
3889 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3890 }
3891 }
3892 }
3893
3894 /* Compute shader state overwrites fragment shader state, so we flag the
3895 * graphics pipeline for re-emit.
3896 */
3897 cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
3898
3899 tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
3900 tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
3901
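   /* HLSQ_CS_NDRANGE holds the local workgroup size (minus one) and the
    * total global size per dimension; the global offsets are left at zero.
    */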
3902 const uint32_t *local_size = pipeline->compute.local_size;
3903 const uint32_t *num_groups = info->blocks;
3904 tu_cs_emit_regs(cs,
3905 A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
3906 .localsizex = local_size[0] - 1,
3907 .localsizey = local_size[1] - 1,
3908 .localsizez = local_size[2] - 1),
3909 A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
3910 A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
3911 A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
3912 A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
3913 A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
3914 A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
3915
3916 tu_cs_emit_regs(cs,
3917 A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
3918 A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
3919 A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
3920
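   /* For indirect dispatches the CP reads the workgroup counts from the
    * indirect buffer via CP_EXEC_CS_INDIRECT (whose layout appears to be
    * shared with a5xx, hence the A5XX_ field names); otherwise the counts
    * are emitted directly in CP_EXEC_CS.
    */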
3921 if (info->indirect) {
3922 uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
3923
3924 tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
3925 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
3926
3927 tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
3928 tu_cs_emit(cs, 0x00000000);
3929 tu_cs_emit_qw(cs, iova);
3930 tu_cs_emit(cs,
3931 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
3932 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
3933 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
3934 } else {
3935 tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
3936 tu_cs_emit(cs, 0x00000000);
3937 tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
3938 tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
3939 tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
3940 }
3941
3942 tu_cs_emit_wfi(cs);
3943
3944 tu6_emit_cache_flush(cmd, cs);
3945 }
3946
3947 void
3948 tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
3949 uint32_t base_x,
3950 uint32_t base_y,
3951 uint32_t base_z,
3952 uint32_t x,
3953 uint32_t y,
3954 uint32_t z)
3955 {
3956 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3957 struct tu_dispatch_info info = {};
3958
3959 info.blocks[0] = x;
3960 info.blocks[1] = y;
3961 info.blocks[2] = z;
3962
3963 info.offsets[0] = base_x;
3964 info.offsets[1] = base_y;
3965 info.offsets[2] = base_z;
3966 tu_dispatch(cmd_buffer, &info);
3967 }
3968
3969 void
3970 tu_CmdDispatch(VkCommandBuffer commandBuffer,
3971 uint32_t x,
3972 uint32_t y,
3973 uint32_t z)
3974 {
3975 tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
3976 }
3977
3978 void
3979 tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3980 VkBuffer _buffer,
3981 VkDeviceSize offset)
3982 {
3983 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3984 TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
3985 struct tu_dispatch_info info = {};
3986
3987 info.indirect = buffer;
3988 info.indirect_offset = offset;
3989
3990 tu_dispatch(cmd_buffer, &info);
3991 }
3992
3993 void
3994 tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
3995 {
3996 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
3997
3998 tu_cs_end(&cmd_buffer->draw_cs);
3999 tu_cs_end(&cmd_buffer->draw_epilogue_cs);
4000
4001 if (use_sysmem_rendering(cmd_buffer))
4002 tu_cmd_render_sysmem(cmd_buffer);
4003 else
4004 tu_cmd_render_tiles(cmd_buffer);
4005
4006 /* discard draw_cs and draw_epilogue_cs entries now that they have been
4007 recorded by either the tiled or the sysmem rendering path */
4008 tu_cs_discard_entries(&cmd_buffer->draw_cs);
4009 tu_cs_begin(&cmd_buffer->draw_cs);
4010 tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
4011 tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
4012
4013 cmd_buffer->state.pass = NULL;
4014 cmd_buffer->state.subpass = NULL;
4015 cmd_buffer->state.framebuffer = NULL;
4016 }
4017
4018 void
4019 tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4020 const VkSubpassEndInfoKHR *pSubpassEndInfo)
4021 {
4022 tu_CmdEndRenderPass(commandBuffer);
4023 }
4024
4025 struct tu_barrier_info
4026 {
4027 uint32_t eventCount;
4028 const VkEvent *pEvents;
4029 VkPipelineStageFlags srcStageMask;
4030 };
4031
4032 static void
4033 tu_barrier(struct tu_cmd_buffer *cmd_buffer,
4034 uint32_t memoryBarrierCount,
4035 const VkMemoryBarrier *pMemoryBarriers,
4036 uint32_t bufferMemoryBarrierCount,
4037 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4038 uint32_t imageMemoryBarrierCount,
4039 const VkImageMemoryBarrier *pImageMemoryBarriers,
4040 const struct tu_barrier_info *info)
4041 {
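   /* TODO: no cache flushes or waits are emitted here yet, so barriers are
    * currently a no-op.
    */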
4042 }
4043
4044 void
4045 tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4046 VkPipelineStageFlags srcStageMask,
4047 VkPipelineStageFlags destStageMask,
4048 VkBool32 byRegion,
4049 uint32_t memoryBarrierCount,
4050 const VkMemoryBarrier *pMemoryBarriers,
4051 uint32_t bufferMemoryBarrierCount,
4052 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4053 uint32_t imageMemoryBarrierCount,
4054 const VkImageMemoryBarrier *pImageMemoryBarriers)
4055 {
4056 TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4057 struct tu_barrier_info info;
4058
4059 info.eventCount = 0;
4060 info.pEvents = NULL;
4061 info.srcStageMask = srcStageMask;
4062
4063 tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4064 bufferMemoryBarrierCount, pBufferMemoryBarriers,
4065 imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4066 }
4067
4068 static void
4069 write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value)
4070 {
4071 struct tu_cs *cs = &cmd->cs;
4072
4073 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
4074
4075 /* TODO: any flush required before/after? */
4076
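   /* CP_MEM_WRITE stores the value into the event BO once the CP reaches
    * this point in the command stream.
    */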
4077 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4078 tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4079 tu_cs_emit(cs, value);
4080 }
4081
4082 void
4083 tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4084 VkEvent _event,
4085 VkPipelineStageFlags stageMask)
4086 {
4087 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4088 TU_FROM_HANDLE(tu_event, event, _event);
4089
4090 write_event(cmd, event, 1);
4091 }
4092
4093 void
4094 tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4095 VkEvent _event,
4096 VkPipelineStageFlags stageMask)
4097 {
4098 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4099 TU_FROM_HANDLE(tu_event, event, _event);
4100
4101 write_event(cmd, event, 0);
4102 }
4103
4104 void
4105 tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4106 uint32_t eventCount,
4107 const VkEvent *pEvents,
4108 VkPipelineStageFlags srcStageMask,
4109 VkPipelineStageFlags dstStageMask,
4110 uint32_t memoryBarrierCount,
4111 const VkMemoryBarrier *pMemoryBarriers,
4112 uint32_t bufferMemoryBarrierCount,
4113 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4114 uint32_t imageMemoryBarrierCount,
4115 const VkImageMemoryBarrier *pImageMemoryBarriers)
4116 {
4117 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4118 struct tu_cs *cs = &cmd->cs;
4119
4120 /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */
4121
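   /* Poll each event's BO with CP_WAIT_REG_MEM until it contains the
    * signaled value (1).
    */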
4122 for (uint32_t i = 0; i < eventCount; i++) {
4123 TU_FROM_HANDLE(tu_event, event, pEvents[i]);
4124
4125 tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);
4126
4127 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4128 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4129 CP_WAIT_REG_MEM_0_POLL_MEMORY);
4130 tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4131 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4132 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4133 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4134 }
4135 }
4136
4137 void
4138 tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4139 {
4140 /* No-op */
4141 }