turnip: refactor draw states and dynamic states
src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
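/* fall through: the alpha channel of sRGB formats is not sRGB-encoded and is
 * packed like any other channel below */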
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
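/* Map a hardware color format to the 2D engine's internal format class, which
 * determines how clear values are packed (see r2d_clear_value()).
 */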
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 VkFilter filter)
350 {
351 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
352 if (filter != VK_FILTER_NEAREST)
353 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
354
355 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
356 tu_cs_emit(cs, src_info);
357 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
358 tu_cs_image_ref_2d(cs, iview, layer, true);
359
360 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
361 tu_cs_image_flag_ref(cs, iview, layer);
362 }
363
364 static void
365 r2d_src_buffer(struct tu_cmd_buffer *cmd,
366 struct tu_cs *cs,
367 VkFormat vk_format,
368 uint64_t va, uint32_t pitch,
369 uint32_t width, uint32_t height)
370 {
371 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
372
373 tu_cs_emit_regs(cs,
374 A6XX_SP_PS_2D_SRC_INFO(
375 .color_format = format.fmt,
376 .color_swap = format.swap,
377 .srgb = vk_format_is_srgb(vk_format),
378 .unk20 = 1,
379 .unk22 = 1),
380 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
381 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
382 A6XX_SP_PS_2D_SRC_HI(va >> 32),
383 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
384 }
385
386 static void
387 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
388 {
389 assert(iview->image->samples == 1);
390
391 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
392 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
393 tu_cs_image_ref_2d(cs, iview, layer, false);
394
395 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
396 tu_cs_image_flag_ref(cs, iview, layer);
397 }
398
399 static void
400 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
401 {
402 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
403
404 tu_cs_emit_regs(cs,
405 A6XX_RB_2D_DST_INFO(
406 .color_format = format.fmt,
407 .color_swap = format.swap,
408 .srgb = vk_format_is_srgb(vk_format)),
409 A6XX_RB_2D_DST_LO((uint32_t) va),
410 A6XX_RB_2D_DST_HI(va >> 32),
411 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
412 }
413
414 static void
415 r2d_setup_common(struct tu_cmd_buffer *cmd,
416 struct tu_cs *cs,
417 VkFormat vk_format,
418 enum a6xx_rotation rotation,
419 bool clear,
420 uint8_t mask,
421 bool scissor)
422 {
423 enum a6xx_format format = tu6_base_format(vk_format);
424 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
425 uint32_t unknown_8c01 = 0;
426
427 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
428 /* preserve depth channels */
429 if (mask == 0x8)
430 unknown_8c01 = 0x00084001;
431 /* preserve stencil channel */
432 if (mask == 0x7)
433 unknown_8c01 = 0x08000041;
434 }
435
436 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
437 tu_cs_emit(cs, unknown_8c01);
438
439 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
440 .scissor = scissor,
441 .rotate = rotation,
442 .solid_color = clear,
443 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
444 .color_format = format,
445 .mask = 0xf,
446 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
447 ).value;
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
453 tu_cs_emit(cs, blit_cntl);
454
455 if (format == FMT6_10_10_10_2_UNORM_DEST)
456 format = FMT6_16_16_16_16_FLOAT;
457
458 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
459 .sint = vk_format_is_sint(vk_format),
460 .uint = vk_format_is_uint(vk_format),
461 .color_format = format,
462 .srgb = vk_format_is_srgb(vk_format),
463 .mask = 0xf));
464 }
465
466 static void
467 r2d_setup(struct tu_cmd_buffer *cmd,
468 struct tu_cs *cs,
469 VkFormat vk_format,
470 enum a6xx_rotation rotation,
471 bool clear,
472 uint8_t mask)
473 {
474 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
475
476 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
477 }
478
479 static void
480 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
481 {
482 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
483 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
484 }
485
486 /* r3d_ = shader path operations */
487
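/* Emit the shader-related state for the 3D (draw-based) blit path: a few
 * hand-assembled ir3 shader variants (VS, FS, plus a GS for layered clears)
 * and the fixed VPC/GRAS state needed to draw a two-vertex RECTLIST.
 */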
488 static void
489 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
490 bool layered_clear)
491 {
492 struct ir3_shader dummy_shader = {};
493
494 struct ir3_shader_variant vs = {
495 .type = MESA_SHADER_VERTEX,
496 .instrlen = 1,
497 .constlen = 2,
498 .info.max_reg = 1,
499 .inputs_count = 1,
500 .inputs[0] = {
501 .slot = SYSTEM_VALUE_VERTEX_ID,
502 .regid = regid(0, 3),
503 .sysval = true,
504 },
505 .outputs_count = blit ? 2 : 1,
506 .outputs[0] = {
507 .slot = VARYING_SLOT_POS,
508 .regid = regid(0, 0),
509 },
510 .outputs[1] = {
511 .slot = VARYING_SLOT_VAR0,
512 .regid = regid(1, 0),
513 },
514 .shader = &dummy_shader,
515 };
516 if (layered_clear) {
517 vs = (struct ir3_shader_variant) {
518 .type = MESA_SHADER_VERTEX,
519 .instrlen = 1,
520 .info.max_reg = 0,
521 .shader = &dummy_shader,
522 };
523 }
524
525 struct ir3_shader_variant fs = {
526 .type = MESA_SHADER_FRAGMENT,
527 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
528 .constlen = num_rts,
529 .info.max_reg = MAX2(num_rts, 1) - 1,
530 .total_in = blit ? 2 : 0,
531 .num_samp = blit ? 1 : 0,
532 .inputs_count = blit ? 2 : 0,
533 .inputs[0] = {
534 .slot = VARYING_SLOT_VAR0,
535 .inloc = 0,
536 .compmask = 3,
537 .bary = true,
538 },
539 .inputs[1] = {
540 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
541 .regid = regid(0, 0),
542 .sysval = 1,
543 },
544 .num_sampler_prefetch = blit ? 1 : 0,
545 .sampler_prefetch[0] = {
546 .src = 0,
547 .wrmask = 0xf,
548 .cmd = 4,
549 },
550 .shader = &dummy_shader,
551 };
552
553 struct ir3_shader_variant gs_shader = {
554 .type = MESA_SHADER_GEOMETRY,
555 .instrlen = 1,
556 .constlen = 2,
557 .info.max_reg = 1,
558 .inputs_count = 1,
559 .inputs[0] = {
560 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
561 .regid = regid(0, 0),
562 .sysval = true,
563 },
564 .outputs_count = 3,
565 .outputs[0] = {
566 .slot = VARYING_SLOT_POS,
567 .regid = regid(0, 0),
568 },
569 .outputs[1] = {
570 .slot = VARYING_SLOT_LAYER,
571 .regid = regid(1, 1),
572 },
573 .outputs[2] = {
574 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
575 .regid = regid(1, 0),
576 },
577 .shader = &dummy_shader,
578 }, *gs = layered_clear ? &gs_shader : NULL;
579
580
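/* Helpers for hand-encoding raw ir3 instruction words for the small built-in
 * shaders below (cat1 mov, cat2 two-source ALU, cat3 three-source ALU).
 */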
581 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
582 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
583 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
584
585 static const instr_t vs_code[] = {
586 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
587 * r1.xy = r0.w ? c1.zw : c0.zw
588 * r0.w = 1.0f
589 */
590 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
591 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
592 .src2 = 3,
593 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
594 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
595 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
596 .src2 = 3,
597 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
598 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
599 { .cat0 = { .opc = OPC_END } },
600 };
601
602 static const instr_t vs_layered[] = {
603 { .cat0 = { .opc = OPC_CHMASK } },
604 { .cat0 = { .opc = OPC_CHSH } },
605 };
606
607 static const instr_t gs_code[16] = {
608 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
609 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
610 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
611 /* x = (local_id & 1) ? c1.x : c0.x */
612 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
613 /* y = (local_id & 2) ? c1.y : c0.y */
614 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
615 /* pred = (local_id >= 4), used by OPC_KILL */
616 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
617 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
618 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
619
620 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
621 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
622 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
623
624 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
625 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
626 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
627 .src2 = 0,
628 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
629
630 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
631
632 { .cat0 = { .opc = OPC_KILL } },
633 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
634 };
635 #define FS_OFFSET (16 * sizeof(instr_t))
636 #define GS_OFFSET (32 * sizeof(instr_t))
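/* The shaders are packed into one allocation in 16-instruction slots: VS at
 * offset 0, FS at FS_OFFSET and, for layered clears, GS at GS_OFFSET; hence
 * the "2 + layered_clear" units passed to tu_cs_alloc() below.
 */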
637
638 /* shaders */
639 struct ts_cs_memory shaders = { };
640 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
641 16 * sizeof(instr_t), &shaders);
642 assert(result == VK_SUCCESS);
643
644 if (layered_clear) {
645 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
646 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
647 } else {
648 memcpy(shaders.map, vs_code, sizeof(vs_code));
649 }
650
651 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
652 for (uint32_t i = 0; i < num_rts; i++) {
653 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
654 *fs_code++ = (instr_t) { .cat1 = {
655 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
656 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
657 } };
658 }
659
660 /* "bary.f (ei)r63.x, 0, r0.x" -- note the blob doesn't have this in its
661 * blit path (it's not clear what allows it to not have it)
662 */
663 if (blit) {
664 *fs_code++ = (instr_t) { .cat2 = {
665 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
666 .dst = regid(63, 0), .src1_im = 1
667 } };
668 }
669 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
670 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
671
672 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
673
674 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
675 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
676 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
677 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
678 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
679
680 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
681 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
682
683 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
684
685 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
686 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
687 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
688
689 tu6_emit_fs_inputs(cs, &fs);
690
691 tu_cs_emit_regs(cs,
692 A6XX_GRAS_CL_CNTL(
693 .persp_division_disable = 1,
694 .vp_xform_disable = 1,
695 .vp_clip_code_ignore = 1,
696 .clip_disable = 1),
697 A6XX_GRAS_UNKNOWN_8001(0));
698 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
699
700 tu_cs_emit_regs(cs,
701 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
702 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
703 tu_cs_emit_regs(cs,
704 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
705 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
706
707 tu_cs_emit_regs(cs,
708 A6XX_VFD_INDEX_OFFSET(),
709 A6XX_VFD_INSTANCE_START_OFFSET());
710 }
711
712 static void
713 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
714 {
715 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
716 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
717 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
718 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
719 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
720 CP_LOAD_STATE6_0_NUM_UNIT(2));
721 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
722 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
723 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
724 }
725
726 static void
727 r3d_coords(struct tu_cs *cs,
728 const VkOffset2D *dst,
729 const VkOffset2D *src,
730 const VkExtent2D *extent)
731 {
732 int32_t src_x1 = src ? src->x : 0;
733 int32_t src_y1 = src ? src->y : 0;
734 r3d_coords_raw(cs, false, (float[]) {
735 dst->x, dst->y,
736 src_x1, src_y1,
737 dst->x + extent->width, dst->y + extent->height,
738 src_x1 + extent->width, src_y1 + extent->height,
739 });
740 }
741
742 static void
743 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
744 {
745 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
746 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
747 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
748 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
749 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
750 CP_LOAD_STATE6_0_NUM_UNIT(1));
751 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
752 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
753 switch (format) {
754 case VK_FORMAT_X8_D24_UNORM_PACK32:
755 case VK_FORMAT_D24_UNORM_S8_UINT: {
756 /* cleared as r8g8b8a8_unorm using special format */
757 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
758 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
759 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
760 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
761 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
762 } break;
763 case VK_FORMAT_D16_UNORM:
764 case VK_FORMAT_D32_SFLOAT:
765 tu_cs_emit(cs, fui(val->depthStencil.depth));
766 tu_cs_emit(cs, 0);
767 tu_cs_emit(cs, 0);
768 tu_cs_emit(cs, 0);
769 break;
770 case VK_FORMAT_S8_UINT:
771 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
772 tu_cs_emit(cs, 0);
773 tu_cs_emit(cs, 0);
774 tu_cs_emit(cs, 0);
775 break;
776 default:
777 /* color formats use the clear value as-is */
778 assert(!vk_format_is_depth_or_stencil(format));
779 tu_cs_emit_array(cs, val->color.uint32, 4);
780 break;
781 }
782 }
783
784 static void
785 r3d_src_common(struct tu_cmd_buffer *cmd,
786 struct tu_cs *cs,
787 const uint32_t *tex_const,
788 uint32_t offset_base,
789 uint32_t offset_ubwc,
790 VkFilter filter)
791 {
792 struct ts_cs_memory texture = { };
793 VkResult result = tu_cs_alloc(&cmd->sub_cs,
794 2, /* allocate space for a sampler too */
795 A6XX_TEX_CONST_DWORDS, &texture);
796 assert(result == VK_SUCCESS);
797
798 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
799
800 /* patch addresses for layer offset */
801 *(uint64_t*) (texture.map + 4) += offset_base;
802 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
803 texture.map[7] = ubwc_addr;
804 texture.map[8] = ubwc_addr >> 32;
805
806 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
807 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
808 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
809 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
810 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
811 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
812 0x60000; /* XXX used by blob, doesn't seem necessary */
813 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
814 0x1 | /* XXX used by blob, doesn't seem necessary */
815 A6XX_TEX_SAMP_1_UNNORM_COORDS |
816 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
817 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
818 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
819
820 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
821 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
822 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
823 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
824 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
825 CP_LOAD_STATE6_0_NUM_UNIT(1));
826 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
827
828 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
829 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
830
831 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
832 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
833 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
834 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
835 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
836 CP_LOAD_STATE6_0_NUM_UNIT(1));
837 tu_cs_emit_qw(cs, texture.iova);
838
839 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
840 tu_cs_emit_qw(cs, texture.iova);
841
842 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
843 }
844
845 static void
846 r3d_src(struct tu_cmd_buffer *cmd,
847 struct tu_cs *cs,
848 const struct tu_image_view *iview,
849 uint32_t layer,
850 VkFilter filter)
851 {
852 r3d_src_common(cmd, cs, iview->descriptor,
853 iview->layer_size * layer,
854 iview->ubwc_layer_size * layer,
855 filter);
856 }
857
858 static void
859 r3d_src_buffer(struct tu_cmd_buffer *cmd,
860 struct tu_cs *cs,
861 VkFormat vk_format,
862 uint64_t va, uint32_t pitch,
863 uint32_t width, uint32_t height)
864 {
865 uint32_t desc[A6XX_TEX_CONST_DWORDS];
866
867 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
868
869 desc[0] =
870 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
871 A6XX_TEX_CONST_0_FMT(format.fmt) |
872 A6XX_TEX_CONST_0_SWAP(format.swap) |
873 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
874 // XXX to swizzle into .w for stencil buffer_to_image
875 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
876 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
877 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
878 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
879 desc[2] =
880 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
881 A6XX_TEX_CONST_2_PITCH(pitch) |
882 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
883 desc[3] = 0;
884 desc[4] = va;
885 desc[5] = va >> 32;
886 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
887 desc[i] = 0;
888
889 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
890 }
891
892 static void
893 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
894 {
895 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
896
897 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
898 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
899 tu_cs_image_ref(cs, iview, layer);
900 tu_cs_emit(cs, 0);
901
902 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
903 tu_cs_image_flag_ref(cs, iview, layer);
904
905 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
906 }
907
908 static void
909 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
910 {
911 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
912
913 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
914
915 tu_cs_emit_regs(cs,
916 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
917 A6XX_RB_MRT_PITCH(0, pitch),
918 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
919 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
920 A6XX_RB_MRT_BASE_HI(0, va >> 32),
921 A6XX_RB_MRT_BASE_GMEM(0, 0));
922
923 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
924 }
925
926 static void
927 r3d_setup(struct tu_cmd_buffer *cmd,
928 struct tu_cs *cs,
929 VkFormat vk_format,
930 enum a6xx_rotation rotation,
931 bool clear,
932 uint8_t mask)
933 {
934 if (!cmd->state.pass) {
935 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
936 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
937 }
938
939 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
940 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
941
942 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
943
944 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
945 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
946 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
947 0xfc000000);
948 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
949
950 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
951 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
952
953 tu_cs_emit_regs(cs,
954 A6XX_RB_FS_OUTPUT_CNTL0(),
955 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
956
957 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
958 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
959 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
960
961 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
962 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
963 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
964 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
965 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
966 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
967 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
968
969 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
970 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
971
972 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
973 .color_format = tu6_base_format(vk_format),
974 .color_sint = vk_format_is_sint(vk_format),
975 .color_uint = vk_format_is_uint(vk_format)));
976
977 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
978 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
979 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
980 }
981
982 static void
983 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
984 {
985 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
986 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
987 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
988 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
989 tu_cs_emit(cs, 1); /* instance count */
990 tu_cs_emit(cs, 2); /* vertex count */
991 }
992
993 /* blit ops - common interface for 2d/shader paths */
994
995 struct blit_ops {
996 void (*coords)(struct tu_cs *cs,
997 const VkOffset2D *dst,
998 const VkOffset2D *src,
999 const VkExtent2D *extent);
1000 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
1001 void (*src)(
1002 struct tu_cmd_buffer *cmd,
1003 struct tu_cs *cs,
1004 const struct tu_image_view *iview,
1005 uint32_t layer,
1006 VkFilter filter);
1007 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1008 VkFormat vk_format,
1009 uint64_t va, uint32_t pitch,
1010 uint32_t width, uint32_t height);
1011 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1012 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1013 void (*setup)(struct tu_cmd_buffer *cmd,
1014 struct tu_cs *cs,
1015 VkFormat vk_format,
1016 enum a6xx_rotation rotation,
1017 bool clear,
1018 uint8_t mask);
1019 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1020 };
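/* r2d_ops drives the dedicated 2D blit engine, while r3d_ops draws a RECTLIST
 * with the hand-built shaders above; the 3D path is used where the 2D engine
 * can't be (e.g. multisampled destinations, stencil-aspect copies, cubic
 * filtering).
 */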
1021
1022 static const struct blit_ops r2d_ops = {
1023 .coords = r2d_coords,
1024 .clear_value = r2d_clear_value,
1025 .src = r2d_src,
1026 .src_buffer = r2d_src_buffer,
1027 .dst = r2d_dst,
1028 .dst_buffer = r2d_dst_buffer,
1029 .setup = r2d_setup,
1030 .run = r2d_run,
1031 };
1032
1033 static const struct blit_ops r3d_ops = {
1034 .coords = r3d_coords,
1035 .clear_value = r3d_clear_value,
1036 .src = r3d_src,
1037 .src_buffer = r3d_src_buffer,
1038 .dst = r3d_dst,
1039 .dst_buffer = r3d_dst_buffer,
1040 .setup = r3d_setup,
1041 .run = r3d_run,
1042 };
1043
1044 /* passthrough: set coords from 3D offsets/extents (the 2D structs alias the leading members of their 3D counterparts) */
1045 static void
1046 coords(const struct blit_ops *ops,
1047 struct tu_cs *cs,
1048 const VkOffset3D *dst,
1049 const VkOffset3D *src,
1050 const VkExtent3D *extent)
1051 {
1052 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1053 }
1054
1055 static void
1056 tu_image_view_blit2(struct tu_image_view *iview,
1057 struct tu_image *image,
1058 VkFormat format,
1059 const VkImageSubresourceLayers *subres,
1060 uint32_t layer,
1061 bool stencil_read)
1062 {
1063 VkImageAspectFlags aspect_mask = subres->aspectMask;
1064
1065 /* always use the AS_R8G8B8A8 format for these */
1066 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1067 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1068 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1069 }
1070
1071 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1072 .image = tu_image_to_handle(image),
1073 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1074 .format = format,
1075 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1076 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1077 .subresourceRange = {
1078 .aspectMask = aspect_mask,
1079 .baseMipLevel = subres->mipLevel,
1080 .levelCount = 1,
1081 .baseArrayLayer = subres->baseArrayLayer + layer,
1082 .layerCount = 1,
1083 },
1084 });
1085 }
1086
1087 static void
1088 tu_image_view_blit(struct tu_image_view *iview,
1089 struct tu_image *image,
1090 const VkImageSubresourceLayers *subres,
1091 uint32_t layer)
1092 {
1093 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1094 }
1095
1096 static void
1097 tu6_blit_image(struct tu_cmd_buffer *cmd,
1098 struct tu_image *src_image,
1099 struct tu_image *dst_image,
1100 const VkImageBlit *info,
1101 VkFilter filter)
1102 {
1103 const struct blit_ops *ops = &r2d_ops;
1104 struct tu_cs *cs = &cmd->cs;
1105 uint32_t layers;
1106
1107 /* the 2D blitter can't mirror from coordinates alone, so mirroring is expressed via the rotate/flip modes below */
1108 static const enum a6xx_rotation rotate[2][2] = {
1109 {ROTATE_0, ROTATE_HFLIP},
1110 {ROTATE_VFLIP, ROTATE_180},
1111 };
1112
1113 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1114 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1115 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1116 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1117 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1118 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1119
1120 if (mirror_z) {
1121 tu_finishme("blit z mirror\n");
1122 return;
1123 }
1124
1125 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1126 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1127 tu_finishme("blit z filter\n");
1128 return;
1129 }
1130
1131 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1132 if (info->dstSubresource.layerCount > 1) {
1133 assert(layers <= 1);
1134 layers = info->dstSubresource.layerCount;
1135 }
1136
1137 uint8_t mask = 0xf;
1138 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1139 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1140 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1141 mask = 0x7;
1142 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1143 mask = 0x8;
1144 }
1145
1146 /* BC1_RGB_* formats need to have their last component overridden with 1
1147 * when sampling, which is normally handled with the texture descriptor
1148 * swizzle. The 2d path can't handle that, so use the 3d path.
1149 *
1150 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1151 * the 2d path.
1152 */
1153
1154 if (dst_image->samples > 1 ||
1155 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1156 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1157 filter == VK_FILTER_CUBIC_EXT)
1158 ops = &r3d_ops;
1159
1160 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1161 * figure out why (should be able to pass all tests with only shader path)
1162 */
1163
1164 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1165
1166 if (ops == &r3d_ops) {
1167 r3d_coords_raw(cs, false, (float[]) {
1168 info->dstOffsets[0].x, info->dstOffsets[0].y,
1169 info->srcOffsets[0].x, info->srcOffsets[0].y,
1170 info->dstOffsets[1].x, info->dstOffsets[1].y,
1171 info->srcOffsets[1].x, info->srcOffsets[1].y
1172 });
1173 } else {
1174 tu_cs_emit_regs(cs,
1175 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1176 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1177 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1178 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1179 tu_cs_emit_regs(cs,
1180 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1181 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1182 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1183 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1184 }
1185
1186 struct tu_image_view dst, src;
1187 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1188 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1189
1190 for (uint32_t i = 0; i < layers; i++) {
1191 ops->dst(cs, &dst, i);
1192 ops->src(cmd, cs, &src, i, filter);
1193 ops->run(cmd, cs);
1194 }
1195 }
1196
1197 void
1198 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1199 VkImage srcImage,
1200 VkImageLayout srcImageLayout,
1201 VkImage dstImage,
1202 VkImageLayout dstImageLayout,
1203 uint32_t regionCount,
1204 const VkImageBlit *pRegions,
1205 VkFilter filter)
1206
1207 {
1208 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1209 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1210 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1211
1212 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1213 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1214
1215 for (uint32_t i = 0; i < regionCount; ++i)
1216 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1217 }
1218
1219 static VkFormat
1220 copy_format(VkFormat format)
1221 {
1222 switch (vk_format_get_blocksize(format)) {
1223 case 1: return VK_FORMAT_R8_UINT;
1224 case 2: return VK_FORMAT_R16_UINT;
1225 case 4: return VK_FORMAT_R32_UINT;
1226 case 8: return VK_FORMAT_R32G32_UINT;
1227 case 12:return VK_FORMAT_R32G32B32_UINT;
1228 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1229 default:
1230 unreachable("unhandled format size");
1231 }
1232 }
1233
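/* For compressed formats, convert offsets and sizes from texels to block
 * units; such copies are done by reinterpreting the data with an uncompressed
 * format of the same block size (see copy_format()).
 */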
1234 static void
1235 copy_compressed(VkFormat format,
1236 VkOffset3D *offset,
1237 VkExtent3D *extent,
1238 uint32_t *width,
1239 uint32_t *height)
1240 {
1241 if (!vk_format_is_compressed(format))
1242 return;
1243
1244 uint32_t block_width = vk_format_get_blockwidth(format);
1245 uint32_t block_height = vk_format_get_blockheight(format);
1246
1247 offset->x /= block_width;
1248 offset->y /= block_height;
1249
1250 if (extent) {
1251 extent->width = DIV_ROUND_UP(extent->width, block_width);
1252 extent->height = DIV_ROUND_UP(extent->height, block_height);
1253 }
1254 if (width)
1255 *width = DIV_ROUND_UP(*width, block_width);
1256 if (height)
1257 *height = DIV_ROUND_UP(*height, block_height);
1258 }
1259
1260 static void
1261 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1262 struct tu_buffer *src_buffer,
1263 struct tu_image *dst_image,
1264 const VkBufferImageCopy *info)
1265 {
1266 struct tu_cs *cs = &cmd->cs;
1267 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1268 VkFormat dst_format = dst_image->vk_format;
1269 VkFormat src_format = dst_image->vk_format;
1270 const struct blit_ops *ops = &r2d_ops;
1271
1272 uint8_t mask = 0xf;
1273
1274 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1275 switch (info->imageSubresource.aspectMask) {
1276 case VK_IMAGE_ASPECT_STENCIL_BIT:
1277 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1278 mask = 0x8;
1279 ops = &r3d_ops;
1280 break;
1281 case VK_IMAGE_ASPECT_DEPTH_BIT:
1282 mask = 0x7;
1283 break;
1284 }
1285 }
1286
1287 VkOffset3D offset = info->imageOffset;
1288 VkExtent3D extent = info->imageExtent;
1289 uint32_t src_width = info->bufferRowLength ?: extent.width;
1290 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1291
1292 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1293 assert(src_format == dst_format);
1294 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1295 src_format = dst_format = copy_format(dst_format);
1296 }
1297
1298 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1299 uint32_t layer_size = src_height * pitch;
1300
1301 /* note: the src_va/pitch alignment of 64 is for 2D engine,
1302 * it is also valid for 1cpp format with shader path (stencil aspect path)
1303 */
1304
1305 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1306
1307 struct tu_image_view dst;
1308 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1309
1310 for (uint32_t i = 0; i < layers; i++) {
1311 ops->dst(cs, &dst, i);
1312
1313 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1314 if ((src_va & 63) || (pitch & 63)) {
1315 for (uint32_t y = 0; y < extent.height; y++) {
1316 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1317 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1318 x + extent.width, 1);
1319 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1320 &(VkExtent2D) {extent.width, 1});
1321 ops->run(cmd, cs);
1322 src_va += pitch;
1323 }
1324 } else {
1325 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1326 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1327 ops->run(cmd, cs);
1328 }
1329 }
1330 }
1331
1332 void
1333 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1334 VkBuffer srcBuffer,
1335 VkImage dstImage,
1336 VkImageLayout dstImageLayout,
1337 uint32_t regionCount,
1338 const VkBufferImageCopy *pRegions)
1339 {
1340 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1341 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1342 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1343
1344 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1345 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1346
1347 for (unsigned i = 0; i < regionCount; ++i)
1348 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1349 }
1350
1351 static void
1352 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1353 struct tu_image *src_image,
1354 struct tu_buffer *dst_buffer,
1355 const VkBufferImageCopy *info)
1356 {
1357 struct tu_cs *cs = &cmd->cs;
1358 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1359 VkFormat src_format = src_image->vk_format;
1360 VkFormat dst_format = src_image->vk_format;
1361 bool stencil_read = false;
1362
1363 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1364 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1365 dst_format = VK_FORMAT_R8_UNORM;
1366 stencil_read = true;
1367 }
1368
1369 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1370 VkOffset3D offset = info->imageOffset;
1371 VkExtent3D extent = info->imageExtent;
1372 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1373 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1374
1375 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1376 assert(src_format == dst_format);
1377 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1378 src_format = dst_format = copy_format(dst_format);
1379 }
1380
1381 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1382 uint32_t layer_size = pitch * dst_height;
1383
1384 /* note: the dst_va/pitch alignment of 64 is for 2D engine,
1385 * it is also valid for 1cpp format with shader path (stencil aspect)
1386 */
1387
1388 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1389
1390 struct tu_image_view src;
1391 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1392
1393 for (uint32_t i = 0; i < layers; i++) {
1394 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1395
1396 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1397 if ((dst_va & 63) || (pitch & 63)) {
1398 for (uint32_t y = 0; y < extent.height; y++) {
1399 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1400 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1401 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1402 &(VkExtent2D) {extent.width, 1});
1403 ops->run(cmd, cs);
1404 dst_va += pitch;
1405 }
1406 } else {
1407 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1408 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1409 ops->run(cmd, cs);
1410 }
1411 }
1412 }
1413
1414 void
1415 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1416 VkImage srcImage,
1417 VkImageLayout srcImageLayout,
1418 VkBuffer dstBuffer,
1419 uint32_t regionCount,
1420 const VkBufferImageCopy *pRegions)
1421 {
1422 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1423 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1424 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1425
1426 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1427 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1428
1429 for (unsigned i = 0; i < regionCount; ++i)
1430 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1431 }
1432
1433 /* Tiled formats don't support swapping, which means that we can't support
1434 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1435 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1436 * Currently we fake support for tiled swapped formats and use the unswapped
1437 * format instead, but this means that reinterpreting copies to and from
1438 * swapped formats can't be performed correctly unless we can swizzle the
1439 * components by reinterpreting the other image as the "correct" swapped
1440 * format, i.e. only when the other image is linear.
1441 */
1442
1443 static bool
1444 is_swapped_format(VkFormat format)
1445 {
1446 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1447 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1448 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1449 }
1450
1451 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1452 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1453 * versa). This should mirror the logic in fdl6_layout.
1454 */
1455 static bool
1456 image_is_r8g8(struct tu_image *image)
1457 {
1458 return image->layout.cpp == 2 &&
1459 vk_format_get_nr_components(image->vk_format) == 2;
1460 }
1461
1462 static void
1463 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1464 struct tu_image *src_image,
1465 struct tu_image *dst_image,
1466 const VkImageCopy *info)
1467 {
1468 const struct blit_ops *ops = &r2d_ops;
1469 struct tu_cs *cs = &cmd->cs;
1470
1471 uint8_t mask = 0xf;
1472 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1473 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1474 mask = 0x7;
1475 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1476 mask = 0x8;
1477 }
1478
1479 if (dst_image->samples > 1)
1480 ops = &r3d_ops;
1481
1482 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1483
1484 VkFormat format = VK_FORMAT_UNDEFINED;
1485 VkOffset3D src_offset = info->srcOffset;
1486 VkOffset3D dst_offset = info->dstOffset;
1487 VkExtent3D extent = info->extent;
1488
1489 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1490 * Images":
1491 *
1492 * When copying between compressed and uncompressed formats the extent
1493 * members represent the texel dimensions of the source image and not
1494 * the destination. When copying from a compressed image to an
1495 * uncompressed image the image texel dimensions written to the
1496 * uncompressed image will be source extent divided by the compressed
1497 * texel block dimensions. When copying from an uncompressed image to a
1498 * compressed image the image texel dimensions written to the compressed
1499 * image will be the source extent multiplied by the compressed texel
1500 * block dimensions.
1501 *
1502 * This means we only have to adjust the extent if the source image is
1503 * compressed.
1504 */
1505 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1506 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1507
1508 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1509 copy_format(dst_image->vk_format) : dst_image->vk_format;
1510 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1511 copy_format(src_image->vk_format) : src_image->vk_format;
1512
1513 bool use_staging_blit = false;
1514
1515 if (src_format == dst_format) {
1516 /* Images that share a format can always be copied directly because it's
1517 * the same as a blit.
1518 */
1519 format = src_format;
1520 } else if (!src_image->layout.tile_mode) {
1521 /* If an image is linear, we can always safely reinterpret it with the
1522 * other image's format and then do a regular blit.
1523 */
1524 format = dst_format;
1525 } else if (!dst_image->layout.tile_mode) {
1526 format = src_format;
1527 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1528 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1529 * due to the different tile layout.
1530 */
1531 use_staging_blit = true;
1532 } else if (is_swapped_format(src_format) ||
1533 is_swapped_format(dst_format)) {
1534 /* If either format has a non-identity swap, then we can't copy
1535 * to/from it.
1536 */
1537 use_staging_blit = true;
1538 } else if (!src_image->layout.ubwc) {
1539 format = dst_format;
1540 } else if (!dst_image->layout.ubwc) {
1541 format = src_format;
1542 } else {
1543 /* Both formats use UBWC and so neither can be reinterpreted.
1544 * TODO: We could do an in-place decompression of the dst instead.
1545 */
1546 use_staging_blit = true;
1547 }
1548
1549 struct tu_image_view dst, src;
1550
1551 if (use_staging_blit) {
1552 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1553 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1554
1555 struct tu_image staging_image = {
1556 .vk_format = src_format,
1557 .type = src_image->type,
1558 .tiling = VK_IMAGE_TILING_LINEAR,
1559 .extent = extent,
1560 .level_count = 1,
1561 .layer_count = info->srcSubresource.layerCount,
1562 .samples = src_image->samples,
1563 .bo_offset = 0,
1564 };
1565
1566 VkImageSubresourceLayers staging_subresource = {
1567 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1568 .mipLevel = 0,
1569 .baseArrayLayer = 0,
1570 .layerCount = info->srcSubresource.layerCount,
1571 };
1572
1573 VkOffset3D staging_offset = { 0 };
1574
1575 staging_image.layout.tile_mode = TILE6_LINEAR;
1576 staging_image.layout.ubwc = false;
1577
1578 fdl6_layout(&staging_image.layout,
1579 vk_format_to_pipe_format(staging_image.vk_format),
1580 staging_image.samples,
1581 staging_image.extent.width,
1582 staging_image.extent.height,
1583 staging_image.extent.depth,
1584 staging_image.level_count,
1585 staging_image.layer_count,
1586 staging_image.type == VK_IMAGE_TYPE_3D,
1587 NULL);
1588
1589 VkResult result = tu_get_scratch_bo(cmd->device,
1590 staging_image.layout.size,
1591 &staging_image.bo);
1592 if (result != VK_SUCCESS) {
1593 cmd->record_result = result;
1594 return;
1595 }
1596
1597 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1598 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1599
1600 struct tu_image_view staging;
1601 tu_image_view_blit2(&staging, &staging_image, src_format,
1602 &staging_subresource, 0, false);
1603
1604 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1605 coords(ops, cs, &staging_offset, &src_offset, &extent);
1606
1607 for (uint32_t i = 0; i < info->extent.depth; i++) {
1608 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1609 ops->dst(cs, &staging, i);
1610 ops->run(cmd, cs);
1611 }
1612
1613 /* When executed by the user there has to be a pipeline barrier here,
1614 * but since we're doing it manually we'll have to flush ourselves.
1615 */
1616 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1617 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1618
1619 tu_image_view_blit2(&staging, &staging_image, dst_format,
1620 &staging_subresource, 0, false);
1621
1622 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1623 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1624
1625 for (uint32_t i = 0; i < info->extent.depth; i++) {
1626 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1627 ops->dst(cs, &dst, i);
1628 ops->run(cmd, cs);
1629 }
1630 } else {
1631 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1632 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1633
1634 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1635 coords(ops, cs, &dst_offset, &src_offset, &extent);
1636
1637 for (uint32_t i = 0; i < info->extent.depth; i++) {
1638 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1639 ops->dst(cs, &dst, i);
1640 ops->run(cmd, cs);
1641 }
1642 }
1643 }
1644
1645 void
1646 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1647 VkImage srcImage,
1648 VkImageLayout srcImageLayout,
1649 VkImage destImage,
1650 VkImageLayout destImageLayout,
1651 uint32_t regionCount,
1652 const VkImageCopy *pRegions)
1653 {
1654 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1655 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1656 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1657
1658 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1659 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1660
1661 for (uint32_t i = 0; i < regionCount; ++i)
1662 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1663 }
1664
1665 static void
1666 copy_buffer(struct tu_cmd_buffer *cmd,
1667 uint64_t dst_va,
1668 uint64_t src_va,
1669 uint64_t size,
1670 uint32_t block_size)
1671 {
1672 const struct blit_ops *ops = &r2d_ops;
1673 struct tu_cs *cs = &cmd->cs;
1674 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1675 uint64_t blocks = size / block_size;
1676
1677 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1678
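/* Copy in chunks: base addresses given to the 2D engine are aligned down to
 * 64 bytes with the remainder folded into the x offset, and each pass is
 * capped at 0x4000 texels (presumably the engine's per-pass width limit).
 */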
1679 while (blocks) {
1680 uint32_t src_x = (src_va & 63) / block_size;
1681 uint32_t dst_x = (dst_va & 63) / block_size;
1682 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1683
1684 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1685 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1686 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1687 ops->run(cmd, cs);
1688
1689 src_va += width * block_size;
1690 dst_va += width * block_size;
1691 blocks -= width;
1692 }
1693 }
1694
1695 void
1696 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1697 VkBuffer srcBuffer,
1698 VkBuffer dstBuffer,
1699 uint32_t regionCount,
1700 const VkBufferCopy *pRegions)
1701 {
1702 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1703 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1704 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1705
1706 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1707 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1708
1709 for (unsigned i = 0; i < regionCount; ++i) {
1710 copy_buffer(cmd,
1711 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1712 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1713 pRegions[i].size, 1);
1714 }
1715 }
1716
1717 void
1718 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1719 VkBuffer dstBuffer,
1720 VkDeviceSize dstOffset,
1721 VkDeviceSize dataSize,
1722 const void *pData)
1723 {
1724 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1725 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1726
1727 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1728
1729 struct ts_cs_memory tmp;
1730 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1731 if (result != VK_SUCCESS) {
1732 cmd->record_result = result;
1733 return;
1734 }
1735
1736 memcpy(tmp.map, pData, dataSize);
1737 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1738 }
1739
1740 void
1741 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1742 VkBuffer dstBuffer,
1743 VkDeviceSize dstOffset,
1744 VkDeviceSize fillSize,
1745 uint32_t data)
1746 {
1747 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1748 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1749 const struct blit_ops *ops = &r2d_ops;
1750 struct tu_cs *cs = &cmd->cs;
1751
1752 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1753
1754 if (fillSize == VK_WHOLE_SIZE)
1755 fillSize = buffer->size - dstOffset;
1756
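   /* dstOffset and fillSize are multiples of 4 (any VK_WHOLE_SIZE remainder
    * is dropped), so the fill is done as a clear of R32_UINT blocks, split
    * into blits of at most 0x4000 blocks each.
    */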
1757 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1758 uint32_t blocks = fillSize / 4;
1759
1760 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1761 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1762
1763 while (blocks) {
1764 uint32_t dst_x = (dst_va & 63) / 4;
1765 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1766
1767 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1768 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1769 ops->run(cmd, cs);
1770
1771 dst_va += width * 4;
1772 blocks -= width;
1773 }
1774 }
1775
1776 void
1777 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1778 VkImage srcImage,
1779 VkImageLayout srcImageLayout,
1780 VkImage dstImage,
1781 VkImageLayout dstImageLayout,
1782 uint32_t regionCount,
1783 const VkImageResolve *pRegions)
1784 {
1785 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1786 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1787 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1788 const struct blit_ops *ops = &r2d_ops;
1789 struct tu_cs *cs = &cmd->cs;
1790
1791 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1792 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1793
1794 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1795
1796 for (uint32_t i = 0; i < regionCount; ++i) {
1797 const VkImageResolve *info = &pRegions[i];
1798 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1799
1800 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1801       /* TODO: aspect masks possible? */
1802
1803 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1804
1805 struct tu_image_view dst, src;
1806 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1807 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1808
1809 for (uint32_t i = 0; i < layers; i++) {
1810 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1811 ops->dst(cs, &dst, i);
1812 ops->run(cmd, cs);
1813 }
1814 }
1815 }
1816
1817 void
1818 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1819 struct tu_cs *cs,
1820 struct tu_image_view *src,
1821 struct tu_image_view *dst,
1822 uint32_t layers,
1823 const VkRect2D *rect)
1824 {
1825 const struct blit_ops *ops = &r2d_ops;
1826
1827 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1828 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1829
1830 assert(src->image->vk_format == dst->image->vk_format);
1831
1832 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1833 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1834
1835 for (uint32_t i = 0; i < layers; i++) {
1836 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1837 ops->dst(cs, dst, i);
1838 ops->run(cmd, cs);
1839 }
1840 }
1841
1842 static void
1843 clear_image(struct tu_cmd_buffer *cmd,
1844 struct tu_image *image,
1845 const VkClearValue *clear_value,
1846 const VkImageSubresourceRange *range)
1847 {
1848 uint32_t level_count = tu_get_levelCount(image, range);
1849 uint32_t layer_count = tu_get_layerCount(image, range);
1850 struct tu_cs *cs = &cmd->cs;
1851 VkFormat format = image->vk_format;
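   /* E5B9G9R9 isn't supported as a render/blit destination, so clear it as
    * raw R32_UINT data; ops->clear_value() below is still given the original
    * format so the rgb9e5 clear value gets packed correctly.
    */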
1852 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1853 format = VK_FORMAT_R32_UINT;
1854
1855 if (image->type == VK_IMAGE_TYPE_3D) {
1856 assert(layer_count == 1);
1857 assert(range->baseArrayLayer == 0);
1858 }
1859
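   /* For D24S8 clear only the requested aspects: mask 0x7 covers the packed
    * depth bits and 0x8 the stencil byte.
    */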
1860 uint8_t mask = 0xf;
1861 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1862 mask = 0;
1863 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1864 mask |= 0x7;
1865 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1866 mask |= 0x8;
1867 }
1868
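   /* The 2D engine can't clear multisampled images, so use the 3D path for
    * those.
    */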
1869 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1870
1871 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1872 ops->clear_value(cs, image->vk_format, clear_value);
1873
1874 for (unsigned j = 0; j < level_count; j++) {
1875 if (image->type == VK_IMAGE_TYPE_3D)
1876 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1877
1878 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1879 u_minify(image->extent.width, range->baseMipLevel + j),
1880 u_minify(image->extent.height, range->baseMipLevel + j)
1881 });
1882
1883 struct tu_image_view dst;
1884 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1885 .aspectMask = range->aspectMask,
1886 .mipLevel = range->baseMipLevel + j,
1887 .baseArrayLayer = range->baseArrayLayer,
1888 .layerCount = 1,
1889 }, 0, false);
1890
1891 for (uint32_t i = 0; i < layer_count; i++) {
1892 ops->dst(cs, &dst, i);
1893 ops->run(cmd, cs);
1894 }
1895 }
1896 }
1897
1898 void
1899 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1900 VkImage image_h,
1901 VkImageLayout imageLayout,
1902 const VkClearColorValue *pColor,
1903 uint32_t rangeCount,
1904 const VkImageSubresourceRange *pRanges)
1905 {
1906 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1907 TU_FROM_HANDLE(tu_image, image, image_h);
1908
1909 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1910
1911 for (unsigned i = 0; i < rangeCount; i++)
1912 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1913 }
1914
1915 void
1916 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1917 VkImage image_h,
1918 VkImageLayout imageLayout,
1919 const VkClearDepthStencilValue *pDepthStencil,
1920 uint32_t rangeCount,
1921 const VkImageSubresourceRange *pRanges)
1922 {
1923 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1924 TU_FROM_HANDLE(tu_image, image, image_h);
1925
1926 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1927
1928 for (unsigned i = 0; i < rangeCount; i++)
1929 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1930 }
1931
1932 static void
1933 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1934 uint32_t attachment_count,
1935 const VkClearAttachment *attachments,
1936 uint32_t rect_count,
1937 const VkClearRect *rects)
1938 {
1939 const struct tu_subpass *subpass = cmd->state.subpass;
1940    /* note: we cannot fall back to the shader path here; the special shader
1941     * path for sysmem clears lives in tu_clear_sysmem_attachments()
1942     */
1943 const struct blit_ops *ops = &r2d_ops;
1944 struct tu_cs *cs = &cmd->draw_cs;
1945
1946 for (uint32_t j = 0; j < attachment_count; j++) {
1947       /* The Vulkan spec, section 17.2 "Clearing Images Inside a Render
1948 * Pass Instance" says that:
1949 *
1950 * Unlike other clear commands, vkCmdClearAttachments executes as
1951 * a drawing command, rather than a transfer command, with writes
1952 * performed by it executing in rasterization order. Clears to
1953 * color attachments are executed as color attachment writes, by
1954 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1955 * Clears to depth/stencil attachments are executed as depth
1956 * writes and writes by the
1957 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1958 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1959 *
1960 * However, the 2d path here is executed the same way as a
1961 * transfer command, using the CCU color cache exclusively with
1962 * a special depth-as-color format for depth clears. This means that
1963 * we can't rely on the normal pipeline barrier mechanism here, and
1964 * have to manually flush whenever using a different cache domain
1965 * from what the 3d path would've used. This happens when we clear
1966 * depth/stencil, since normally depth attachments use CCU depth, but
1967 * we clear it using a special depth-as-color format. Since the clear
1968 * potentially uses a different attachment state we also need to
1969 * invalidate color beforehand and flush it afterwards.
1970 */
1971
1972 uint32_t a;
1973 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1974 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1975 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1976 } else {
1977 a = subpass->depth_stencil_attachment.attachment;
1978 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1979 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1980 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1981 }
1982
1983 if (a == VK_ATTACHMENT_UNUSED)
1984 continue;
1985
1986 uint8_t mask = 0xf;
1987 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1988 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1989 mask &= ~0x7;
1990 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1991 mask &= ~0x8;
1992 }
1993
1994 const struct tu_image_view *iview =
1995 cmd->state.framebuffer->attachments[a].attachment;
1996
1997 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1998 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1999
2000 /* Wait for the flushes we triggered manually to complete */
2001 tu_cs_emit_wfi(cs);
2002
2003 for (uint32_t i = 0; i < rect_count; i++) {
2004 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
2005 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2006 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
2007 ops->run(cmd, cs);
2008 }
2009 }
2010
2011 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2012 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2013 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2014 } else {
2015 /* sync color into depth */
2016 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2017 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2018 }
2019 }
2020 }
2021
2022 static void
2023 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2024 uint32_t attachment_count,
2025 const VkClearAttachment *attachments,
2026 uint32_t rect_count,
2027 const VkClearRect *rects)
2028 {
2029 /* the shader path here is special, it avoids changing MRT/etc state */
2030 const struct tu_render_pass *pass = cmd->state.pass;
2031 const struct tu_subpass *subpass = cmd->state.subpass;
2032 const uint32_t mrt_count = subpass->color_count;
2033 struct tu_cs *cs = &cmd->draw_cs;
2034 uint32_t clear_value[MAX_RTS][4];
2035 float z_clear_val = 0.0f;
2036 uint8_t s_clear_val = 0;
2037 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2038 bool z_clear = false;
2039 bool s_clear = false;
2040 bool layered_clear = false;
2041 uint32_t max_samples = 1;
2042
2043 for (uint32_t i = 0; i < attachment_count; i++) {
2044 uint32_t a;
2045 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2046 uint32_t c = attachments[i].colorAttachment;
2047 a = subpass->color_attachments[c].attachment;
2048 if (a == VK_ATTACHMENT_UNUSED)
2049 continue;
2050
2051 clear_rts |= 1 << c;
2052 clear_components |= 0xf << (c * 4);
2053 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2054 } else {
2055 a = subpass->depth_stencil_attachment.attachment;
2056 if (a == VK_ATTACHMENT_UNUSED)
2057 continue;
2058
2059 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2060 z_clear = true;
2061 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2062 }
2063
2064 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2065 s_clear = true;
2066 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2067 }
2068 }
2069
2070 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2071 }
2072
2073    /* Prefer the 2D path for clears. The 2D path can't clear separate
2074     * depth/stencil or MSAA attachments, and it needs a known framebuffer.
2075     */
2076 if (max_samples == 1 && cmd->state.framebuffer) {
2077 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2078 return;
2079 }
2080
2081 /* This clear path behaves like a draw, needs the same flush as tu_draw */
2082 tu_emit_cache_flush_renderpass(cmd, cs);
2083
2084 /* disable all draw states so they don't interfere
2085 * TODO: use and re-use draw states for this path
2086 */
2087 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
2088 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
2089 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
2090 CP_SET_DRAW_STATE__0_GROUP_ID(0));
2091 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
2092 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
2093 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2094
2095 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2096 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2097 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2098 0xfc000000);
2099 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2100
2101 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2102 for (uint32_t i = 0; i < mrt_count; i++) {
2103 if (clear_rts & (1 << i))
2104 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2105 else
2106 tu_cs_emit(cs, 0);
2107 }
2108
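   /* Check whether any rect needs a layered clear (nonzero base layer or
    * more than one layer); that selects the layered variant of the 3D path.
    */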
2109 for (uint32_t i = 0; i < rect_count; i++) {
2110 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
2111 layered_clear = true;
2112 }
2113
2114 r3d_common(cmd, cs, false, num_rts, layered_clear);
2115
2116 tu_cs_emit_regs(cs,
2117 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2118 tu_cs_emit_regs(cs,
2119 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2120
2121 tu_cs_emit_regs(cs,
2122 A6XX_RB_FS_OUTPUT_CNTL0(),
2123 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2124
2125 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2126 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2127 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2128 for (uint32_t i = 0; i < mrt_count; i++) {
2129 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2130 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2131 }
2132
2133 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2134 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2135 .z_enable = z_clear,
2136 .z_write_enable = z_clear,
2137 .zfunc = FUNC_ALWAYS));
2138 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2139 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2140 .stencil_enable = s_clear,
2141 .func = FUNC_ALWAYS,
2142 .zpass = STENCIL_REPLACE));
2143 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2144 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2145 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2146
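   /* Upload the clear colors as FS constants: one vec4 per render target
    * being cleared, in the same order the output registers were assigned
    * above.
    */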
2147 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2148 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2149 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2150 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2151 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2152 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2153 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2154 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2155 for_each_bit(b, clear_rts)
2156 tu_cs_emit_array(cs, clear_value[b], 4);
2157
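   /* One draw per rect and layer. Layered clears draw a single point per
    * layer instead of the usual blit rectangle.
    */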
2158 for (uint32_t i = 0; i < rect_count; i++) {
2159 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2160 r3d_coords_raw(cs, layered_clear, (float[]) {
2161 rects[i].rect.offset.x, rects[i].rect.offset.y,
2162 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2163 rects[i].rect.offset.x + rects[i].rect.extent.width,
2164 rects[i].rect.offset.y + rects[i].rect.extent.height,
2165 z_clear_val, 1.0f,
2166 });
2167
2168 if (layered_clear) {
2169 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2170 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2171 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2172 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2173 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2174 tu_cs_emit(cs, 1); /* instance count */
2175 tu_cs_emit(cs, 1); /* vertex count */
2176 } else {
2177 r3d_run(cmd, cs);
2178 }
2179 }
2180 }
2181 }
2182
2183 /**
2184  * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2185  * for the component order: components are always packed in WZYX order,
2186  * because GMEM is tiled and tiled formats always use the WZYX swap.
2187  */
2188 static void
2189 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2190 {
2191 const struct util_format_description *desc = vk_format_description(format);
2192
2193 switch (format) {
2194 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2195 buf[0] = float3_to_r11g11b10f(val->color.float32);
2196 return;
2197 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2198 buf[0] = float3_to_rgb9e5(val->color.float32);
2199 return;
2200 default:
2201 break;
2202 }
2203
2204 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2205
2206 /* S8_UINT is special and has no depth */
2207 const int max_components =
2208 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2209
2210 int buf_offset = 0;
2211 int bit_shift = 0;
2212 for (int comp = 0; comp < max_components; comp++) {
2213 const struct util_format_channel_description *ch =
2214 tu_get_format_channel_description(desc, comp);
2215 if (!ch) {
2216 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2217 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2218 continue;
2219 }
2220
2221 union tu_clear_component_value v = tu_get_clear_component_value(
2222 val, comp, desc->colorspace);
2223
2224 /* move to the next uint32_t when there is not enough space */
2225 assert(ch->size <= 32);
2226 if (bit_shift + ch->size > 32) {
2227 buf_offset++;
2228 bit_shift = 0;
2229 }
2230
2231 if (bit_shift == 0)
2232 buf[buf_offset] = 0;
2233
2234 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2235 bit_shift += ch->size;
2236 }
2237 }
2238
2239 static void
2240 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2241 struct tu_cs *cs,
2242 uint32_t attachment,
2243 uint8_t component_mask,
2244 const VkClearValue *value)
2245 {
2246 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2247 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2248 * because D24S8 is cleared with AS_R8G8B8A8 format
2249 */
2250
2251 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2252 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2253
2254 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2255 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2256
2257 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2258 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2259
2260 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2261 tu_cs_emit(cs, 0);
2262
2263 uint32_t clear_vals[4] = {};
2264 pack_gmem_clear_value(value, vk_format, clear_vals);
2265
2266 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2267 tu_cs_emit_array(cs, clear_vals, 4);
2268
2269 tu6_emit_event_write(cmd, cs, BLIT);
2270 }
2271
2272 static void
2273 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2274 uint32_t attachment_count,
2275 const VkClearAttachment *attachments,
2276 uint32_t rect_count,
2277 const VkClearRect *rects)
2278 {
2279 const struct tu_subpass *subpass = cmd->state.subpass;
2280 struct tu_cs *cs = &cmd->draw_cs;
2281
2282 /* TODO: swap the loops for smaller cmdstream */
2283 for (unsigned i = 0; i < rect_count; i++) {
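      /* note: the RB_BLIT_SCISSOR BR coordinates are inclusive, hence the -1 */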
2284 unsigned x1 = rects[i].rect.offset.x;
2285 unsigned y1 = rects[i].rect.offset.y;
2286 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2287 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2288
2289 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2290 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2291 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2292
2293 for (unsigned j = 0; j < attachment_count; j++) {
2294 uint32_t a;
2295 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2296 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2297 else
2298 a = subpass->depth_stencil_attachment.attachment;
2299
2300 if (a == VK_ATTACHMENT_UNUSED)
2301 continue;
2302
2303 unsigned clear_mask = 0xf;
2304 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2305 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2306 clear_mask &= ~0x7;
2307 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2308 clear_mask &= ~0x8;
2309 }
2310
2311 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2312 &attachments[j].clearValue);
2313 }
2314 }
2315 }
2316
2317 void
2318 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2319 uint32_t attachmentCount,
2320 const VkClearAttachment *pAttachments,
2321 uint32_t rectCount,
2322 const VkClearRect *pRects)
2323 {
2324 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2325 struct tu_cs *cs = &cmd->draw_cs;
2326
2327 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2328 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2329 tu_cond_exec_end(cs);
2330
2331 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2332 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2333 tu_cond_exec_end(cs);
2334 }
2335
2336 void
2337 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2338 struct tu_cs *cs,
2339 uint32_t a,
2340 const VkRenderPassBeginInfo *info)
2341 {
2342 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2343 const struct tu_image_view *iview = fb->attachments[a].attachment;
2344 const struct tu_render_pass_attachment *attachment =
2345 &cmd->state.pass->attachments[a];
2346 uint8_t mask = 0;
2347
2348 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2349 mask = 0xf;
2350 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2351 mask |= 0x7;
2352 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2353 mask |= 0x8;
2354
2355 if (!mask)
2356 return;
2357
2358 const struct blit_ops *ops = &r2d_ops;
2359 if (attachment->samples > 1)
2360 ops = &r3d_ops;
2361
2362 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2363 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2364 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2365
2366 /* Wait for any flushes at the beginning of the renderpass to complete */
2367 tu_cs_emit_wfi(cs);
2368
2369 for (uint32_t i = 0; i < fb->layers; i++) {
2370 ops->dst(cs, iview, i);
2371 ops->run(cmd, cs);
2372 }
2373
2374 /* The spec doesn't explicitly say, but presumably the initial renderpass
2375 * clear is considered part of the renderpass, and therefore barriers
2376 * aren't required inside the subpass/renderpass. Therefore we need to
2377 * flush CCU color into CCU depth here, just like with
2378 * vkCmdClearAttachments(). Note that because this only happens at the
2379 * beginning of a renderpass, and renderpass writes are considered
2380 * "incoherent", we shouldn't have to worry about syncing depth into color
2381 * beforehand as depth should already be flushed.
2382 */
2383 if (vk_format_is_depth_or_stencil(attachment->format)) {
2384 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2385 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2386 } else {
2387 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2388 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2389 }
2390 }
2391
2392 void
2393 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2394 struct tu_cs *cs,
2395 uint32_t a,
2396 const VkRenderPassBeginInfo *info)
2397 {
2398 const struct tu_render_pass_attachment *attachment =
2399 &cmd->state.pass->attachments[a];
2400 unsigned clear_mask = 0;
2401
2402 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2403 clear_mask = 0xf;
2404 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2405 clear_mask |= 0x7;
2406 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2407 clear_mask |= 0x8;
2408
2409 if (!clear_mask)
2410 return;
2411
2412 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2413
2414 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2415 &info->pClearValues[a]);
2416 }
2417
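/* Emit a CP_EVENT_WRITE::BLIT for a GMEM attachment: loads the attachment
 * from sysmem into GMEM when resolve is false, and stores/resolves it from
 * GMEM back to sysmem when resolve is true.
 */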
2418 static void
2419 tu_emit_blit(struct tu_cmd_buffer *cmd,
2420 struct tu_cs *cs,
2421 const struct tu_image_view *iview,
2422 const struct tu_render_pass_attachment *attachment,
2423 bool resolve)
2424 {
2425 tu_cs_emit_regs(cs,
2426 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2427
2428 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2429 .unk0 = !resolve,
2430 .gmem = !resolve,
2431 /* "integer" bit disables msaa resolve averaging */
2432 .integer = vk_format_is_int(attachment->format)));
2433
2434 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2435 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2436 tu_cs_image_ref_2d(cs, iview, 0, false);
2437
2438 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2439 tu_cs_image_flag_ref(cs, iview, 0);
2440
2441 tu_cs_emit_regs(cs,
2442 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2443
2444 tu6_emit_event_write(cmd, cs, BLIT);
2445 }
2446
2447 static bool
2448 blit_can_resolve(VkFormat format)
2449 {
2450 const struct util_format_description *desc = vk_format_description(format);
2451
2452 /* blit event can only do resolve for simple cases:
2453 * averaging samples as unsigned integers or choosing only one sample
2454 */
2455 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2456 return false;
2457
2458 /* can't do formats with larger channel sizes
2459 * note: this includes all float formats
2460 * note2: single channel integer formats seem OK
2461 */
2462 if (desc->channel[0].size > 10)
2463 return false;
2464
2465 switch (format) {
2466    /* for unknown reasons the blit event can't MSAA-resolve these formats when
2467     * tiled, likely because they have a different layout from other cpp=2 formats
2468     */
2469 case VK_FORMAT_R8G8_UNORM:
2470 case VK_FORMAT_R8G8_UINT:
2471 case VK_FORMAT_R8G8_SINT:
2472 /* TODO: this one should be able to work? */
2473 case VK_FORMAT_D24_UNORM_S8_UINT:
2474 return false;
2475 default:
2476 break;
2477 }
2478
2479 return true;
2480 }
2481
2482 void
2483 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2484 struct tu_cs *cs,
2485 uint32_t a,
2486 bool force_load)
2487 {
2488 const struct tu_image_view *iview =
2489 cmd->state.framebuffer->attachments[a].attachment;
2490 const struct tu_render_pass_attachment *attachment =
2491 &cmd->state.pass->attachments[a];
2492
2493 if (attachment->load || force_load)
2494 tu_emit_blit(cmd, cs, iview, attachment, false);
2495 }
2496
2497 void
2498 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2499 struct tu_cs *cs,
2500 uint32_t a,
2501 uint32_t gmem_a)
2502 {
2503 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2504 const VkRect2D *render_area = &tiling->render_area;
2505 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2506 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2507 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2508
2509 if (!dst->store)
2510 return;
2511
2512 uint32_t x1 = render_area->offset.x;
2513 uint32_t y1 = render_area->offset.y;
2514 uint32_t x2 = x1 + render_area->extent.width;
2515 uint32_t y2 = y1 + render_area->extent.height;
2516    /* x2/y2 can be unaligned if equal to the size of the image, since the
2517     * blit will then only write into padding space. The one exception is
2518     * linear levels, which don't have the required y padding in the layout
2519     * (except for the last level).
2520     */
2521 bool need_y2_align =
2522 y2 != iview->extent.height || iview->need_y2_align;
2523
2524 bool unaligned =
2525 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2526 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2527
2528 /* use fast path when render area is aligned, except for unsupported resolve cases */
2529 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2530 tu_emit_blit(cmd, cs, iview, src, true);
2531 return;
2532 }
2533
2534 if (dst->samples > 1) {
2535       /* We likely need to use the shader path in this case;
2536        * need a testcase which fails because of this.
2537        */
2538 tu_finishme("unaligned store of msaa attachment\n");
2539 return;
2540 }
2541
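   /* Fall back to the 2D engine, reading the attachment data directly from
    * GMEM (SP_PS_2D_SRC below points at the attachment's GMEM offset) and
    * writing to the destination image.
    */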
2542 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2543 r2d_dst(cs, iview, 0);
2544 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2545
2546 tu_cs_emit_regs(cs,
2547 A6XX_SP_PS_2D_SRC_INFO(
2548 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2549 .tile_mode = TILE6_2,
2550 .srgb = vk_format_is_srgb(src->format),
2551 .samples = tu_msaa_samples(src->samples),
2552 .samples_average = !vk_format_is_int(src->format),
2553 .unk20 = 1,
2554 .unk22 = 1),
2555 /* note: src size does not matter when not scaling */
2556 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2557 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2558 A6XX_SP_PS_2D_SRC_HI(),
2559 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2560
2561 /* sync GMEM writes with CACHE. */
2562 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2563
2564 /* Wait for CACHE_INVALIDATE to land */
2565 tu_cs_emit_wfi(cs);
2566
2567 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2568 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2569
2570 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2571 * sysmem, and we generally assume that GMEM renderpasses leave their
2572 * results in sysmem, so we need to flush manually here.
2573 */
2574 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2575 }