1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
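/* Worked example (derivable from the code below): tu_pack_float32_for_unorm(0.5f, 8)
 * gives max = 255, 0.5 * 255 = 127.5, which rounds-to-even to 128 (0x80);
 * out-of-range values clamp to [0, max].
 */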
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
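/* Worked example: tu_pack_float32_for_snorm(-1.0f, 8) gives max = 127, so the
 * result is -127 masked to 8 bits, i.e. 0x81 (the symmetric SNORM encoding of -1.0).
 */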
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
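/* alpha is not sRGB-encoded: fall through to the default handling below */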
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
472
473 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
474 }
475
476 static void
477 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
478 {
479 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
480 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
481 }
482
483 /* r3d_ = shader path operations */
484
485 static void
486 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
487 bool layered_clear)
488 {
489 struct ir3_shader dummy_shader = {};
490
491 struct ir3_shader_variant vs = {
492 .type = MESA_SHADER_VERTEX,
493 .instrlen = 1,
494 .constlen = 2,
495 .info.max_reg = 1,
496 .inputs_count = 1,
497 .inputs[0] = {
498 .slot = SYSTEM_VALUE_VERTEX_ID,
499 .regid = regid(0, 3),
500 .sysval = true,
501 },
502 .outputs_count = blit ? 2 : 1,
503 .outputs[0] = {
504 .slot = VARYING_SLOT_POS,
505 .regid = regid(0, 0),
506 },
507 .outputs[1] = {
508 .slot = VARYING_SLOT_VAR0,
509 .regid = regid(1, 0),
510 },
511 .shader = &dummy_shader,
512 };
513 if (layered_clear) {
514 vs = (struct ir3_shader_variant) {
515 .type = MESA_SHADER_VERTEX,
516 .instrlen = 1,
517 .info.max_reg = 0,
518 .shader = &dummy_shader,
519 };
520 }
521
522 struct ir3_shader_variant fs = {
523 .type = MESA_SHADER_FRAGMENT,
524 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
525 .constlen = num_rts,
526 .info.max_reg = MAX2(num_rts, 1) - 1,
527 .total_in = blit ? 2 : 0,
528 .num_samp = blit ? 1 : 0,
529 .inputs_count = blit ? 2 : 0,
530 .inputs[0] = {
531 .slot = VARYING_SLOT_VAR0,
532 .inloc = 0,
533 .compmask = 3,
534 .bary = true,
535 },
536 .inputs[1] = {
537 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
538 .regid = regid(0, 0),
539 .sysval = 1,
540 },
541 .num_sampler_prefetch = blit ? 1 : 0,
542 .sampler_prefetch[0] = {
543 .src = 0,
544 .wrmask = 0xf,
545 .cmd = 4,
546 },
547 .shader = &dummy_shader,
548 };
549
550 struct ir3_shader_variant gs_shader = {
551 .type = MESA_SHADER_GEOMETRY,
552 .instrlen = 1,
553 .constlen = 2,
554 .info.max_reg = 1,
555 .inputs_count = 1,
556 .inputs[0] = {
557 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
558 .regid = regid(0, 0),
559 .sysval = true,
560 },
561 .outputs_count = 3,
562 .outputs[0] = {
563 .slot = VARYING_SLOT_POS,
564 .regid = regid(0, 0),
565 },
566 .outputs[1] = {
567 .slot = VARYING_SLOT_LAYER,
568 .regid = regid(1, 1),
569 },
570 .outputs[2] = {
571 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
572 .regid = regid(1, 0),
573 },
574 .shader = &dummy_shader,
575 }, *gs = layered_clear ? &gs_shader : NULL;
576
577
578 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
579 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
580 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
581
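/* Hand-assembled vertex shader: r3d_coords_raw() loads two vec4 constants,
 * c0 = (dst.x, dst.y, src.x, src.y) for the first corner and c1 for the
 * second; the vertex id in r0.w selects between them, so the two RECTLIST
 * vertices get dst coords in the position and src coords in VAR0.
 */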
582 static const instr_t vs_code[] = {
583 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
584 * r1.xy = r0.w ? c1.zw : c0.zw
585 * r0.w = 1.0f
586 */
587 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
588 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
589 .src2 = 3,
590 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
591 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
592 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
593 .src2 = 3,
594 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
595 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
596 { .cat0 = { .opc = OPC_END } },
597 };
598
599 static const instr_t vs_layered[] = {
600 { .cat0 = { .opc = OPC_CHMASK } },
601 { .cat0 = { .opc = OPC_CHSH } },
602 };
603
604 static const instr_t gs_code[16] = {
605 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
606 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
607 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
608 /* x = (local_id & 1) ? c1.x : c0.x */
609 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
610 /* y = (local_id & 2) ? c1.y : c0.y */
611 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
612 /* pred = (local_id >= 4), used by OPC_KILL */
613 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
614 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
615 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
616
617 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
618 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
619 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
620
621 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
622 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
623 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
624 .src2 = 0,
625 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
626
627 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
628
629 { .cat0 = { .opc = OPC_KILL } },
630 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
631 };
632 #define FS_OFFSET (16 * sizeof(instr_t))
633 #define GS_OFFSET (32 * sizeof(instr_t))
634
635 /* shaders */
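/* The VS is written at offset 0 of the sub_cs allocation below, the FS at
 * FS_OFFSET and (for layered clears) the GS at GS_OFFSET; each program is
 * assumed to fit in 16 instructions.
 */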
636 struct ts_cs_memory shaders = { };
637 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
638 16 * sizeof(instr_t), &shaders);
639 assert(result == VK_SUCCESS);
640
641 if (layered_clear) {
642 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
643 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
644 } else {
645 memcpy(shaders.map, vs_code, sizeof(vs_code));
646 }
647
648 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
649 for (uint32_t i = 0; i < num_rts; i++) {
650 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
651 *fs_code++ = (instr_t) { .cat1 = {
652 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
653 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
654 } };
655 }
656
657 /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
658 * blit path (it's not clear what allows it to not have it)
659 */
660 if (blit) {
661 *fs_code++ = (instr_t) { .cat2 = {
662 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
663 .dst = regid(63, 0), .src1_im = 1
664 } };
665 }
666 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
667 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
668
669 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
670
671 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
672 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
673 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
674 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
675 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
676
677 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
678 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
679
680 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
681
682 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
683 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
684 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
685
686 tu6_emit_fs_inputs(cs, &fs);
687
688 tu_cs_emit_regs(cs,
689 A6XX_GRAS_CL_CNTL(
690 .persp_division_disable = 1,
691 .vp_xform_disable = 1,
692 .vp_clip_code_ignore = 1,
693 .clip_disable = 1),
694 A6XX_GRAS_UNKNOWN_8001(0));
695 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
696
697 tu_cs_emit_regs(cs,
698 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
699 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
700 tu_cs_emit_regs(cs,
701 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
702 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
703 }
704
705 static void
706 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
707 {
708 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
709 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
710 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
711 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
712 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
713 CP_LOAD_STATE6_0_NUM_UNIT(2));
714 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
715 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
716 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
717 }
718
719 static void
720 r3d_coords(struct tu_cs *cs,
721 const VkOffset2D *dst,
722 const VkOffset2D *src,
723 const VkExtent2D *extent)
724 {
725 int32_t src_x1 = src ? src->x : 0;
726 int32_t src_y1 = src ? src->y : 0;
727 r3d_coords_raw(cs, false, (float[]) {
728 dst->x, dst->y,
729 src_x1, src_y1,
730 dst->x + extent->width, dst->y + extent->height,
731 src_x1 + extent->width, src_y1 + extent->height,
732 });
733 }
734
735 static void
736 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
737 {
738 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
739 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
740 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
741 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
742 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
743 CP_LOAD_STATE6_0_NUM_UNIT(1));
744 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
745 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
746 switch (format) {
747 case VK_FORMAT_X8_D24_UNORM_PACK32:
748 case VK_FORMAT_D24_UNORM_S8_UINT: {
749 /* cleared as r8g8b8a8_unorm using special format */
750 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
751 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
752 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
753 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
754 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
755 } break;
756 case VK_FORMAT_D16_UNORM:
757 case VK_FORMAT_D32_SFLOAT:
758 tu_cs_emit(cs, fui(val->depthStencil.depth));
759 tu_cs_emit(cs, 0);
760 tu_cs_emit(cs, 0);
761 tu_cs_emit(cs, 0);
762 break;
763 case VK_FORMAT_S8_UINT:
764 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
765 tu_cs_emit(cs, 0);
766 tu_cs_emit(cs, 0);
767 tu_cs_emit(cs, 0);
768 break;
769 default:
770 /* as color formats use clear value as-is */
771 assert(!vk_format_is_depth_or_stencil(format));
772 tu_cs_emit_array(cs, val->color.uint32, 4);
773 break;
774 }
775 }
776
777 static void
778 r3d_src_common(struct tu_cmd_buffer *cmd,
779 struct tu_cs *cs,
780 const uint32_t *tex_const,
781 uint32_t offset_base,
782 uint32_t offset_ubwc,
783 bool linear_filter)
784 {
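/* Two A6XX_TEX_CONST_DWORDS-sized chunks are allocated: the texture
 * descriptor (copied from tex_const and patched with the layer offsets) at
 * offset 0, followed by the sampler state.
 */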
785 struct ts_cs_memory texture = { };
786 VkResult result = tu_cs_alloc(&cmd->sub_cs,
787 2, /* allocate space for a sampler too */
788 A6XX_TEX_CONST_DWORDS, &texture);
789 assert(result == VK_SUCCESS);
790
791 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
792
793 /* patch addresses for layer offset */
794 *(uint64_t*) (texture.map + 4) += offset_base;
795 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
796 texture.map[7] = ubwc_addr;
797 texture.map[8] = ubwc_addr >> 32;
798
799 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
800 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
801 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
802 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
803 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
804 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
805 0x60000; /* XXX used by blob, doesn't seem necessary */
806 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
807 0x1 | /* XXX used by blob, doesn't seem necessary */
808 A6XX_TEX_SAMP_1_UNNORM_COORDS |
809 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
810 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
811 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
812
813 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
814 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
815 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
816 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
817 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
818 CP_LOAD_STATE6_0_NUM_UNIT(1));
819 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
820
821 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
822 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
823
824 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
825 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
826 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
827 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
828 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
829 CP_LOAD_STATE6_0_NUM_UNIT(1));
830 tu_cs_emit_qw(cs, texture.iova);
831
832 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
833 tu_cs_emit_qw(cs, texture.iova);
834
835 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
836 }
837
838 static void
839 r3d_src(struct tu_cmd_buffer *cmd,
840 struct tu_cs *cs,
841 const struct tu_image_view *iview,
842 uint32_t layer,
843 bool linear_filter)
844 {
845 r3d_src_common(cmd, cs, iview->descriptor,
846 iview->layer_size * layer,
847 iview->ubwc_layer_size * layer,
848 linear_filter);
849 }
850
851 static void
852 r3d_src_buffer(struct tu_cmd_buffer *cmd,
853 struct tu_cs *cs,
854 VkFormat vk_format,
855 uint64_t va, uint32_t pitch,
856 uint32_t width, uint32_t height)
857 {
858 uint32_t desc[A6XX_TEX_CONST_DWORDS];
859
860 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
861
862 desc[0] =
863 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
864 A6XX_TEX_CONST_0_FMT(format.fmt) |
865 A6XX_TEX_CONST_0_SWAP(format.swap) |
866 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
867 // XXX to swizzle into .w for stencil buffer_to_image
868 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
869 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
870 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
871 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
872 desc[2] =
873 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
874 A6XX_TEX_CONST_2_PITCH(pitch) |
875 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
876 desc[3] = 0;
877 desc[4] = va;
878 desc[5] = va >> 32;
879 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
880 desc[i] = 0;
881
882 r3d_src_common(cmd, cs, desc, 0, 0, false);
883 }
884
885 static void
886 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
887 {
888 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
889
890 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
891 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
892 tu_cs_image_ref(cs, iview, layer);
893 tu_cs_emit(cs, 0);
894
895 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
896 tu_cs_image_flag_ref(cs, iview, layer);
897
898 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
899 }
900
901 static void
902 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
903 {
904 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
905
906 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
907
908 tu_cs_emit_regs(cs,
909 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
910 A6XX_RB_MRT_PITCH(0, pitch),
911 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
912 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
913 A6XX_RB_MRT_BASE_HI(0, va >> 32),
914 A6XX_RB_MRT_BASE_GMEM(0, 0));
915
916 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
917 }
918
919 static void
920 r3d_setup(struct tu_cmd_buffer *cmd,
921 struct tu_cs *cs,
922 VkFormat vk_format,
923 enum a6xx_rotation rotation,
924 bool clear,
925 uint8_t mask)
926 {
927 if (!cmd->state.pass) {
928 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
929 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
930 }
931
932 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
933 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
934
935 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0, false);
936
937 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
938 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
939 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
940 0xfc000000);
941 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
942
943 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
944 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
945
946 tu_cs_emit_regs(cs,
947 A6XX_RB_FS_OUTPUT_CNTL0(),
948 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
949
950 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
951 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
952 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
953
954 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
955 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
956 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
957 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
958 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
959 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
960 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
961
962 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
963 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
964
965 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
966 .color_format = tu6_base_format(vk_format),
967 .color_sint = vk_format_is_sint(vk_format),
968 .color_uint = vk_format_is_uint(vk_format)));
969
970 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
971 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
972 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
973 }
974
975 static void
976 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
977 {
978 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
979 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
980 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
981 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
982 tu_cs_emit(cs, 1); /* instance count */
983 tu_cs_emit(cs, 2); /* vertex count */
984 }
985
986 /* blit ops - common interface for 2d/shader paths */
987
988 struct blit_ops {
989 void (*coords)(struct tu_cs *cs,
990 const VkOffset2D *dst,
991 const VkOffset2D *src,
992 const VkExtent2D *extent);
993 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
994 void (*src)(
995 struct tu_cmd_buffer *cmd,
996 struct tu_cs *cs,
997 const struct tu_image_view *iview,
998 uint32_t layer,
999 bool linear_filter);
1000 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1001 VkFormat vk_format,
1002 uint64_t va, uint32_t pitch,
1003 uint32_t width, uint32_t height);
1004 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1005 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1006 void (*setup)(struct tu_cmd_buffer *cmd,
1007 struct tu_cs *cs,
1008 VkFormat vk_format,
1009 enum a6xx_rotation rotation,
1010 bool clear,
1011 uint8_t mask);
1012 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1013 };
1014
1015 static const struct blit_ops r2d_ops = {
1016 .coords = r2d_coords,
1017 .clear_value = r2d_clear_value,
1018 .src = r2d_src,
1019 .src_buffer = r2d_src_buffer,
1020 .dst = r2d_dst,
1021 .dst_buffer = r2d_dst_buffer,
1022 .setup = r2d_setup,
1023 .run = r2d_run,
1024 };
1025
1026 static const struct blit_ops r3d_ops = {
1027 .coords = r3d_coords,
1028 .clear_value = r3d_clear_value,
1029 .src = r3d_src,
1030 .src_buffer = r3d_src_buffer,
1031 .dst = r3d_dst,
1032 .dst_buffer = r3d_dst_buffer,
1033 .setup = r3d_setup,
1034 .run = r3d_run,
1035 };
1036
1037 /* passthrough set coords from 3D extents */
1038 static void
1039 coords(const struct blit_ops *ops,
1040 struct tu_cs *cs,
1041 const VkOffset3D *dst,
1042 const VkOffset3D *src,
1043 const VkExtent3D *extent)
1044 {
1045 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1046 }
1047
1048 static void
1049 tu_image_view_blit2(struct tu_image_view *iview,
1050 struct tu_image *image,
1051 VkFormat format,
1052 const VkImageSubresourceLayers *subres,
1053 uint32_t layer,
1054 bool stencil_read)
1055 {
1056 VkImageAspectFlags aspect_mask = subres->aspectMask;
1057
1058 /* always use the AS_R8G8B8A8 format for these */
1059 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1060 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1061 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1062 }
1063
1064 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1065 .image = tu_image_to_handle(image),
1066 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1067 .format = format,
1068 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1069 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1070 .subresourceRange = {
1071 .aspectMask = aspect_mask,
1072 .baseMipLevel = subres->mipLevel,
1073 .levelCount = 1,
1074 .baseArrayLayer = subres->baseArrayLayer + layer,
1075 .layerCount = 1,
1076 },
1077 });
1078 }
1079
1080 static void
1081 tu_image_view_blit(struct tu_image_view *iview,
1082 struct tu_image *image,
1083 const VkImageSubresourceLayers *subres,
1084 uint32_t layer)
1085 {
1086 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1087 }
1088
1089 static void
1090 tu6_blit_image(struct tu_cmd_buffer *cmd,
1091 struct tu_image *src_image,
1092 struct tu_image *dst_image,
1093 const VkImageBlit *info,
1094 VkFilter filter)
1095 {
1096 const struct blit_ops *ops = &r2d_ops;
1097 struct tu_cs *cs = &cmd->cs;
1098 uint32_t layers;
1099
1100 /* 2D blit can't do rotation mirroring from just coordinates */
1101 static const enum a6xx_rotation rotate[2][2] = {
1102 {ROTATE_0, ROTATE_HFLIP},
1103 {ROTATE_VFLIP, ROTATE_180},
1104 };
1105
1106 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1107 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1108 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1109 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1110 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1111 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1112
1113 if (mirror_z) {
1114 tu_finishme("blit z mirror\n");
1115 return;
1116 }
1117
1118 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1119 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1120 tu_finishme("blit z filter\n");
1121 return;
1122 }
1123
1124 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1125 if (info->dstSubresource.layerCount > 1) {
1126 assert(layers <= 1);
1127 layers = info->dstSubresource.layerCount;
1128 }
1129
1130 uint8_t mask = 0xf;
1131 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1132 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1133 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1134 mask = 0x7;
1135 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1136 mask = 0x8;
1137 }
1138
1139 /* BC1_RGB_* formats need to have their last components overridden with 1
1140 * when sampling, which is normally handled with the texture descriptor
1141 * swizzle. The 2d path can't handle that, so use the 3d path.
1142 *
1143 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1144 * the 2d path.
1145 */
1146
1147 if (dst_image->samples > 1 ||
1148 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1149 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1150 ops = &r3d_ops;
1151
1152 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1153 * figure out why (should be able to pass all tests with only shader path)
1154 */
1155
1156 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1157
1158 if (ops == &r3d_ops) {
1159 r3d_coords_raw(cs, false, (float[]) {
1160 info->dstOffsets[0].x, info->dstOffsets[0].y,
1161 info->srcOffsets[0].x, info->srcOffsets[0].y,
1162 info->dstOffsets[1].x, info->dstOffsets[1].y,
1163 info->srcOffsets[1].x, info->srcOffsets[1].y
1164 });
1165 } else {
1166 tu_cs_emit_regs(cs,
1167 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1168 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1169 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1170 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1171 tu_cs_emit_regs(cs,
1172 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1173 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1174 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1175 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1176 }
1177
1178 struct tu_image_view dst, src;
1179 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1180 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1181
1182 for (uint32_t i = 0; i < layers; i++) {
1183 ops->dst(cs, &dst, i);
1184 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1185 ops->run(cmd, cs);
1186 }
1187 }
1188
1189 void
1190 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1191 VkImage srcImage,
1192 VkImageLayout srcImageLayout,
1193 VkImage dstImage,
1194 VkImageLayout dstImageLayout,
1195 uint32_t regionCount,
1196 const VkImageBlit *pRegions,
1197 VkFilter filter)
1198
1199 {
1200 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1201 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1202 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1203
1204 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1205 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1206
1207 for (uint32_t i = 0; i < regionCount; ++i)
1208 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1209 }
1210
1211 static VkFormat
1212 copy_format(VkFormat format)
1213 {
1214 switch (vk_format_get_blocksizebits(format)) {
1215 case 8: return VK_FORMAT_R8_UINT;
1216 case 16: return VK_FORMAT_R16_UINT;
1217 case 32: return VK_FORMAT_R32_UINT;
1218 case 64: return VK_FORMAT_R32G32_UINT;
1219 case 96: return VK_FORMAT_R32G32B32_UINT;
1220 case 128:return VK_FORMAT_R32G32B32A32_UINT;
1221 default:
1222 unreachable("unhandled format size");
1223 }
1224 }
1225
1226 static void
1227 copy_compressed(VkFormat format,
1228 VkOffset3D *offset,
1229 VkExtent3D *extent,
1230 uint32_t *width,
1231 uint32_t *height)
1232 {
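/* e.g. for a BC1 source (4x4 blocks): an offset of (8, 4) becomes block
 * (2, 1), and a 10x10 texel extent rounds up to 3x3 blocks.
 */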
1233 if (!vk_format_is_compressed(format))
1234 return;
1235
1236 uint32_t block_width = vk_format_get_blockwidth(format);
1237 uint32_t block_height = vk_format_get_blockheight(format);
1238
1239 offset->x /= block_width;
1240 offset->y /= block_height;
1241
1242 if (extent) {
1243 extent->width = DIV_ROUND_UP(extent->width, block_width);
1244 extent->height = DIV_ROUND_UP(extent->height, block_height);
1245 }
1246 if (width)
1247 *width = DIV_ROUND_UP(*width, block_width);
1248 if (height)
1249 *height = DIV_ROUND_UP(*height, block_height);
1250 }
1251
1252 static void
1253 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1254 struct tu_buffer *src_buffer,
1255 struct tu_image *dst_image,
1256 const VkBufferImageCopy *info)
1257 {
1258 struct tu_cs *cs = &cmd->cs;
1259 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1260 VkFormat dst_format = dst_image->vk_format;
1261 VkFormat src_format = dst_image->vk_format;
1262 const struct blit_ops *ops = &r2d_ops;
1263
1264 uint8_t mask = 0xf;
1265
1266 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1267 switch (info->imageSubresource.aspectMask) {
1268 case VK_IMAGE_ASPECT_STENCIL_BIT:
1269 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1270 mask = 0x8;
1271 ops = &r3d_ops;
1272 break;
1273 case VK_IMAGE_ASPECT_DEPTH_BIT:
1274 mask = 0x7;
1275 break;
1276 }
1277 }
1278
1279 VkOffset3D offset = info->imageOffset;
1280 VkExtent3D extent = info->imageExtent;
1281 uint32_t src_width = info->bufferRowLength ?: extent.width;
1282 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1283
1284 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1285 assert(src_format == dst_format);
1286 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1287 src_format = dst_format = copy_format(dst_format);
1288 }
1289
1290 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1291 uint32_t layer_size = src_height * pitch;
1292
1293 /* note: the src_va/pitch alignment handling of 64 is for the 2D engine;
1294 * it is also valid for the 1cpp format used with the shader path (stencil aspect path)
1295 */
1296
1297 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1298
1299 struct tu_image_view dst;
1300 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1301
1302 for (uint32_t i = 0; i < layers; i++) {
1303 ops->dst(cs, &dst, i);
1304
1305 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1306 if ((src_va & 63) || (pitch & 63)) {
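/* Unaligned case: emit one 1-texel-high blit per row, aligning the source
 * address down to 64 bytes and folding the remainder into the source x
 * offset (in texels).
 */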
1307 for (uint32_t y = 0; y < extent.height; y++) {
1308 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1309 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1310 x + extent.width, 1);
1311 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1312 &(VkExtent2D) {extent.width, 1});
1313 ops->run(cmd, cs);
1314 src_va += pitch;
1315 }
1316 } else {
1317 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1318 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1319 ops->run(cmd, cs);
1320 }
1321 }
1322 }
1323
1324 void
1325 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1326 VkBuffer srcBuffer,
1327 VkImage dstImage,
1328 VkImageLayout dstImageLayout,
1329 uint32_t regionCount,
1330 const VkBufferImageCopy *pRegions)
1331 {
1332 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1333 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1334 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1335
1336 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1337 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1338
1339 for (unsigned i = 0; i < regionCount; ++i)
1340 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1341 }
1342
1343 static void
1344 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1345 struct tu_image *src_image,
1346 struct tu_buffer *dst_buffer,
1347 const VkBufferImageCopy *info)
1348 {
1349 struct tu_cs *cs = &cmd->cs;
1350 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1351 VkFormat src_format = src_image->vk_format;
1352 VkFormat dst_format = src_image->vk_format;
1353 bool stencil_read = false;
1354
1355 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1356 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1357 dst_format = VK_FORMAT_R8_UNORM;
1358 stencil_read = true;
1359 }
1360
1361 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1362 VkOffset3D offset = info->imageOffset;
1363 VkExtent3D extent = info->imageExtent;
1364 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1365 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1366
1367 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1368 assert(src_format == dst_format);
1369 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1370 src_format = dst_format = copy_format(dst_format);
1371 }
1372
1373 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1374 uint32_t layer_size = pitch * dst_height;
1375
1376 /* note: the dst_va/pitch alignment handling of 64 is for the 2D engine;
1377 * it is also valid for the 1cpp format used with the shader path (stencil aspect)
1378 */
1379
1380 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1381
1382 struct tu_image_view src;
1383 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1384
1385 for (uint32_t i = 0; i < layers; i++) {
1386 ops->src(cmd, cs, &src, i, false);
1387
1388 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1389 if ((dst_va & 63) || (pitch & 63)) {
1390 for (uint32_t y = 0; y < extent.height; y++) {
1391 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1392 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1393 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1394 &(VkExtent2D) {extent.width, 1});
1395 ops->run(cmd, cs);
1396 dst_va += pitch;
1397 }
1398 } else {
1399 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1400 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1401 ops->run(cmd, cs);
1402 }
1403 }
1404 }
1405
1406 void
1407 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1408 VkImage srcImage,
1409 VkImageLayout srcImageLayout,
1410 VkBuffer dstBuffer,
1411 uint32_t regionCount,
1412 const VkBufferImageCopy *pRegions)
1413 {
1414 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1415 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1416 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1417
1418 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1419 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1420
1421 for (unsigned i = 0; i < regionCount; ++i)
1422 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1423 }
1424
1425 /* Tiled formats don't support swapping, which means that we can't support
1426 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1427 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1428 * Currently we fake support for tiled swapped formats and use the unswapped
1429 * format instead, but this means that reinterpreting copies to and from
1430 * swapped formats can't be performed correctly unless we can swizzle the
1431 * components by reinterpreting the other image as the "correct" swapped
1432 * format, i.e. only when the other image is linear.
1433 */
1434
1435 static bool
1436 is_swapped_format(VkFormat format)
1437 {
1438 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1439 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1440 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1441 }
1442
1443 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1444 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1445 * versa). This should mirror the logic in fdl6_layout.
1446 */
1447 static bool
1448 image_is_r8g8(struct tu_image *image)
1449 {
1450 return image->layout.cpp == 2 &&
1451 vk_format_get_nr_components(image->vk_format) == 2;
1452 }
1453
1454 static void
1455 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1456 struct tu_image *src_image,
1457 struct tu_image *dst_image,
1458 const VkImageCopy *info)
1459 {
1460 const struct blit_ops *ops = &r2d_ops;
1461 struct tu_cs *cs = &cmd->cs;
1462
1463 uint8_t mask = 0xf;
1464 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1465 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1466 mask = 0x7;
1467 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1468 mask = 0x8;
1469 }
1470
1471 if (dst_image->samples > 1)
1472 ops = &r3d_ops;
1473
1474 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1475
1476 VkFormat format = VK_FORMAT_UNDEFINED;
1477 VkOffset3D src_offset = info->srcOffset;
1478 VkOffset3D dst_offset = info->dstOffset;
1479 VkExtent3D extent = info->extent;
1480
1481 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1482 * Images":
1483 *
1484 * When copying between compressed and uncompressed formats the extent
1485 * members represent the texel dimensions of the source image and not
1486 * the destination. When copying from a compressed image to an
1487 * uncompressed image the image texel dimensions written to the
1488 * uncompressed image will be source extent divided by the compressed
1489 * texel block dimensions. When copying from an uncompressed image to a
1490 * compressed image the image texel dimensions written to the compressed
1491 * image will be the source extent multiplied by the compressed texel
1492 * block dimensions.
1493 *
1494 * This means we only have to adjust the extent if the source image is
1495 * compressed.
1496 */
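/* e.g. an 8x8 texel region copied from a BC1 image (4x4 blocks) to an
 * uncompressed image writes a 2x2 texel region in the destination.
 */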
1497 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1498 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1499
1500 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1501 copy_format(dst_image->vk_format) : dst_image->vk_format;
1502 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1503 copy_format(src_image->vk_format) : src_image->vk_format;
1504
1505 bool use_staging_blit = false;
1506
1507 if (src_format == dst_format) {
1508 /* Images that share a format can always be copied directly because it's
1509 * the same as a blit.
1510 */
1511 format = src_format;
1512 } else if (!src_image->layout.tile_mode) {
1513 /* If an image is linear, we can always safely reinterpret it with the
1514 * other image's format and then do a regular blit.
1515 */
1516 format = dst_format;
1517 } else if (!dst_image->layout.tile_mode) {
1518 format = src_format;
1519 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1520 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1521 * due to the different tile layout.
1522 */
1523 use_staging_blit = true;
1524 } else if (is_swapped_format(src_format) ||
1525 is_swapped_format(dst_format)) {
1526 /* If either format has a non-identity swap, then we can't copy
1527 * to/from it.
1528 */
1529 use_staging_blit = true;
1530 } else if (!src_image->layout.ubwc) {
1531 format = dst_format;
1532 } else if (!dst_image->layout.ubwc) {
1533 format = src_format;
1534 } else {
1535 /* Both formats use UBWC and so neither can be reinterpreted.
1536 * TODO: We could do an in-place decompression of the dst instead.
1537 */
1538 use_staging_blit = true;
1539 }
1540
1541 struct tu_image_view dst, src;
1542
1543 if (use_staging_blit) {
1544 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1545 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1546
1547 struct tu_image staging_image = {
1548 .vk_format = src_format,
1549 .type = src_image->type,
1550 .tiling = VK_IMAGE_TILING_LINEAR,
1551 .extent = extent,
1552 .level_count = 1,
1553 .layer_count = info->srcSubresource.layerCount,
1554 .samples = src_image->samples,
1555 .bo_offset = 0,
1556 };
1557
1558 VkImageSubresourceLayers staging_subresource = {
1559 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1560 .mipLevel = 0,
1561 .baseArrayLayer = 0,
1562 .layerCount = info->srcSubresource.layerCount,
1563 };
1564
1565 VkOffset3D staging_offset = { 0 };
1566
1567 staging_image.layout.tile_mode = TILE6_LINEAR;
1568 staging_image.layout.ubwc = false;
1569
1570 fdl6_layout(&staging_image.layout,
1571 vk_format_to_pipe_format(staging_image.vk_format),
1572 staging_image.samples,
1573 staging_image.extent.width,
1574 staging_image.extent.height,
1575 staging_image.extent.depth,
1576 staging_image.level_count,
1577 staging_image.layer_count,
1578 staging_image.type == VK_IMAGE_TYPE_3D,
1579 NULL);
1580
1581 VkResult result = tu_get_scratch_bo(cmd->device,
1582 staging_image.layout.size,
1583 &staging_image.bo);
1584 if (result != VK_SUCCESS) {
1585 cmd->record_result = result;
1586 return;
1587 }
1588
1589 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1590 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1591
1592 struct tu_image_view staging;
1593 tu_image_view_blit2(&staging, &staging_image, src_format,
1594 &staging_subresource, 0, false);
1595
1596 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1597 coords(ops, cs, &staging_offset, &src_offset, &extent);
1598
1599 for (uint32_t i = 0; i < info->extent.depth; i++) {
1600 ops->src(cmd, cs, &src, i, false);
1601 ops->dst(cs, &staging, i);
1602 ops->run(cmd, cs);
1603 }
1604
1605 /* When executed by the user there has to be a pipeline barrier here,
1606 * but since we're doing it manually we'll have to flush ourselves.
1607 */
1608 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1609 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1610
1611 tu_image_view_blit2(&staging, &staging_image, dst_format,
1612 &staging_subresource, 0, false);
1613
1614 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1615 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1616
1617 for (uint32_t i = 0; i < info->extent.depth; i++) {
1618 ops->src(cmd, cs, &staging, i, false);
1619 ops->dst(cs, &dst, i);
1620 ops->run(cmd, cs);
1621 }
1622 } else {
1623 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1624 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1625
1626 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1627 coords(ops, cs, &dst_offset, &src_offset, &extent);
1628
1629 for (uint32_t i = 0; i < info->extent.depth; i++) {
1630 ops->src(cmd, cs, &src, i, false);
1631 ops->dst(cs, &dst, i);
1632 ops->run(cmd, cs);
1633 }
1634 }
1635 }
1636
1637 void
1638 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1639 VkImage srcImage,
1640 VkImageLayout srcImageLayout,
1641 VkImage destImage,
1642 VkImageLayout destImageLayout,
1643 uint32_t regionCount,
1644 const VkImageCopy *pRegions)
1645 {
1646 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1647 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1648 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1649
1650 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1651 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1652
1653 for (uint32_t i = 0; i < regionCount; ++i)
1654 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1655 }
1656
1657 static void
1658 copy_buffer(struct tu_cmd_buffer *cmd,
1659 uint64_t dst_va,
1660 uint64_t src_va,
1661 uint64_t size,
1662 uint32_t block_size)
1663 {
1664 const struct blit_ops *ops = &r2d_ops;
1665 struct tu_cs *cs = &cmd->cs;
1666 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1667 uint64_t blocks = size / block_size;
1668
1669 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1670
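/* The copy is emitted as a series of 1-texel-high blits: addresses are
 * aligned down to 64 bytes (the remainder becomes the x offset in blocks)
 * and each pass is capped at 0x4000 blocks, assumed here to be the maximum
 * 2D image width.
 */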
1671 while (blocks) {
1672 uint32_t src_x = (src_va & 63) / block_size;
1673 uint32_t dst_x = (dst_va & 63) / block_size;
1674 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1675
1676 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1677 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1678 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1679 ops->run(cmd, cs);
1680
1681 src_va += width * block_size;
1682 dst_va += width * block_size;
1683 blocks -= width;
1684 }
1685 }
1686
1687 void
1688 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1689 VkBuffer srcBuffer,
1690 VkBuffer dstBuffer,
1691 uint32_t regionCount,
1692 const VkBufferCopy *pRegions)
1693 {
1694 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1695 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1696 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1697
1698 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1699 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1700
1701 for (unsigned i = 0; i < regionCount; ++i) {
1702 copy_buffer(cmd,
1703 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1704 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1705 pRegions[i].size, 1);
1706 }
1707 }
1708
1709 void
1710 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1711 VkBuffer dstBuffer,
1712 VkDeviceSize dstOffset,
1713 VkDeviceSize dataSize,
1714 const void *pData)
1715 {
1716 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1717 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1718
1719 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1720
1721 struct ts_cs_memory tmp;
1722 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1723 if (result != VK_SUCCESS) {
1724 cmd->record_result = result;
1725 return;
1726 }
1727
1728 memcpy(tmp.map, pData, dataSize);
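/* vkCmdUpdateBuffer requires dstOffset and dataSize to be multiples of 4,
 * so copying in 4-byte blocks is safe here.
 */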
1729 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1730 }
1731
1732 void
1733 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1734 VkBuffer dstBuffer,
1735 VkDeviceSize dstOffset,
1736 VkDeviceSize fillSize,
1737 uint32_t data)
1738 {
1739 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1740 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1741 const struct blit_ops *ops = &r2d_ops;
1742 struct tu_cs *cs = &cmd->cs;
1743
1744 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1745
1746 if (fillSize == VK_WHOLE_SIZE)
1747 fillSize = buffer->size - dstOffset;
1748
1749 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1750 uint32_t blocks = fillSize / 4;
1751
1752 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1753 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1754
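/* Fill in chunks: the destination base is aligned down to 64 bytes and each
 * blit is capped at 0x4000 R32_UINT texels, so large fills take several
 * iterations. */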
1755 while (blocks) {
1756 uint32_t dst_x = (dst_va & 63) / 4;
1757 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1758
1759 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1760 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1761 ops->run(cmd, cs);
1762
1763 dst_va += width * 4;
1764 blocks -= width;
1765 }
1766 }
1767
1768 void
1769 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1770 VkImage srcImage,
1771 VkImageLayout srcImageLayout,
1772 VkImage dstImage,
1773 VkImageLayout dstImageLayout,
1774 uint32_t regionCount,
1775 const VkImageResolve *pRegions)
1776 {
1777 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1778 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1779 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1780 const struct blit_ops *ops = &r2d_ops;
1781 struct tu_cs *cs = &cmd->cs;
1782
1783 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1784 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1785
1786 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1787
1788 for (uint32_t i = 0; i < regionCount; ++i) {
1789 const VkImageResolve *info = &pRegions[i];
1790 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1791
1792 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1793 /* TODO: aspect masks possible? */
1794
1795 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1796
1797 struct tu_image_view dst, src;
1798 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1799 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1800
1801 for (uint32_t i = 0; i < layers; i++) {
1802 ops->src(cmd, cs, &src, i, false);
1803 ops->dst(cs, &dst, i);
1804 ops->run(cmd, cs);
1805 }
1806 }
1807 }
1808
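/* Resolve src into dst over the given rect, one layer at a time, with the
 * 2D blitter; both views must share the same format (asserted below).
 */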
1809 void
1810 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1811 struct tu_cs *cs,
1812 struct tu_image_view *src,
1813 struct tu_image_view *dst,
1814 uint32_t layers,
1815 const VkRect2D *rect)
1816 {
1817 const struct blit_ops *ops = &r2d_ops;
1818
1819 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1820 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1821
1822 assert(src->image->vk_format == dst->image->vk_format);
1823
1824 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1825 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1826
1827 for (uint32_t i = 0; i < layers; i++) {
1828 ops->src(cmd, cs, src, i, false);
1829 ops->dst(cs, dst, i);
1830 ops->run(cmd, cs);
1831 }
1832 }
1833
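/* Clear every level/layer in the subresource range with the blit ops
 * (3D path for MSAA images). E5B9G9R9 is cleared as R32_UINT, and for
 * D24S8 the component mask selects depth (0x7) and/or stencil (0x8).
 * For 3D images each level clears all of its (minified) depth slices.
 */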
1834 static void
1835 clear_image(struct tu_cmd_buffer *cmd,
1836 struct tu_image *image,
1837 const VkClearValue *clear_value,
1838 const VkImageSubresourceRange *range)
1839 {
1840 uint32_t level_count = tu_get_levelCount(image, range);
1841 uint32_t layer_count = tu_get_layerCount(image, range);
1842 struct tu_cs *cs = &cmd->cs;
1843 VkFormat format = image->vk_format;
1844 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1845 format = VK_FORMAT_R32_UINT;
1846
1847 if (image->type == VK_IMAGE_TYPE_3D) {
1848 assert(layer_count == 1);
1849 assert(range->baseArrayLayer == 0);
1850 }
1851
1852 uint8_t mask = 0xf;
1853 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1854 mask = 0;
1855 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1856 mask |= 0x7;
1857 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1858 mask |= 0x8;
1859 }
1860
1861 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1862
1863 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1864 ops->clear_value(cs, image->vk_format, clear_value);
1865
1866 for (unsigned j = 0; j < level_count; j++) {
1867 if (image->type == VK_IMAGE_TYPE_3D)
1868 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1869
1870 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1871 u_minify(image->extent.width, range->baseMipLevel + j),
1872 u_minify(image->extent.height, range->baseMipLevel + j)
1873 });
1874
1875 struct tu_image_view dst;
1876 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1877 .aspectMask = range->aspectMask,
1878 .mipLevel = range->baseMipLevel + j,
1879 .baseArrayLayer = range->baseArrayLayer,
1880 .layerCount = 1,
1881 }, 0, false);
1882
1883 for (uint32_t i = 0; i < layer_count; i++) {
1884 ops->dst(cs, &dst, i);
1885 ops->run(cmd, cs);
1886 }
1887 }
1888 }
1889
1890 void
1891 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1892 VkImage image_h,
1893 VkImageLayout imageLayout,
1894 const VkClearColorValue *pColor,
1895 uint32_t rangeCount,
1896 const VkImageSubresourceRange *pRanges)
1897 {
1898 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1899 TU_FROM_HANDLE(tu_image, image, image_h);
1900
1901 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1902
1903 for (unsigned i = 0; i < rangeCount; i++)
1904 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1905 }
1906
1907 void
1908 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1909 VkImage image_h,
1910 VkImageLayout imageLayout,
1911 const VkClearDepthStencilValue *pDepthStencil,
1912 uint32_t rangeCount,
1913 const VkImageSubresourceRange *pRanges)
1914 {
1915 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1916 TU_FROM_HANDLE(tu_image, image, image_h);
1917
1918 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1919
1920 for (unsigned i = 0; i < rangeCount; i++)
1921 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1922 }
1923
1924 static void
1925 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1926 uint32_t attachment_count,
1927 const VkClearAttachment *attachments,
1928 uint32_t rect_count,
1929 const VkClearRect *rects)
1930 {
1931 const struct tu_subpass *subpass = cmd->state.subpass;
1932 /* note: cannot use the shader path here; there is a special shader path
1933 * in tu_clear_sysmem_attachments()
1934 */
1935 const struct blit_ops *ops = &r2d_ops;
1936 struct tu_cs *cs = &cmd->draw_cs;
1937
1938 for (uint32_t j = 0; j < attachment_count; j++) {
1939 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1940 * Pass Instance" says that:
1941 *
1942 * Unlike other clear commands, vkCmdClearAttachments executes as
1943 * a drawing command, rather than a transfer command, with writes
1944 * performed by it executing in rasterization order. Clears to
1945 * color attachments are executed as color attachment writes, by
1946 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1947 * Clears to depth/stencil attachments are executed as depth
1948 * writes and writes by the
1949 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1950 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1951 *
1952 * However, the 2d path here is executed the same way as a
1953 * transfer command, using the CCU color cache exclusively with
1954 * a special depth-as-color format for depth clears. This means that
1955 * we can't rely on the normal pipeline barrier mechanism here, and
1956 * have to manually flush whenever using a different cache domain
1957 * from what the 3d path would've used. This happens when we clear
1958 * depth/stencil, since normally depth attachments use CCU depth, but
1959 * we clear it using a special depth-as-color format. Since the clear
1960 * potentially uses a different attachment state we also need to
1961 * invalidate color beforehand and flush it afterwards.
1962 */
1963
1964 uint32_t a;
1965 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1966 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1967 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1968 } else {
1969 a = subpass->depth_stencil_attachment.attachment;
1970 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1971 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1972 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1973 }
1974
1975 if (a == VK_ATTACHMENT_UNUSED)
1976 continue;
1977
1978 uint8_t mask = 0xf;
1979 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1980 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1981 mask &= ~0x7;
1982 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1983 mask &= ~0x8;
1984 }
1985
1986 const struct tu_image_view *iview =
1987 cmd->state.framebuffer->attachments[a].attachment;
1988
1989 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1990 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1991
1992 /* Wait for the flushes we triggered manually to complete */
1993 tu_cs_emit_wfi(cs);
1994
1995 for (uint32_t i = 0; i < rect_count; i++) {
1996 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1997 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1998 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1999 ops->run(cmd, cs);
2000 }
2001 }
2002
2003 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2004 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2005 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2006 } else {
2007 /* sync color into depth */
2008 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2009 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2010 }
2011 }
2012 }
2013
2014 static void
2015 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2016 uint32_t attachment_count,
2017 const VkClearAttachment *attachments,
2018 uint32_t rect_count,
2019 const VkClearRect *rects)
2020 {
2021 /* the shader path here is special, it avoids changing MRT/etc state */
2022 const struct tu_render_pass *pass = cmd->state.pass;
2023 const struct tu_subpass *subpass = cmd->state.subpass;
2024 const uint32_t mrt_count = subpass->color_count;
2025 struct tu_cs *cs = &cmd->draw_cs;
2026 uint32_t clear_value[MAX_RTS][4];
2027 float z_clear_val = 0.0f;
2028 uint8_t s_clear_val = 0;
2029 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2030 bool z_clear = false;
2031 bool s_clear = false;
2032 bool layered_clear = false;
2033 uint32_t max_samples = 1;
2034
2035 for (uint32_t i = 0; i < attachment_count; i++) {
2036 uint32_t a;
2037 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2038 uint32_t c = attachments[i].colorAttachment;
2039 a = subpass->color_attachments[c].attachment;
2040 if (a == VK_ATTACHMENT_UNUSED)
2041 continue;
2042
2043 clear_rts |= 1 << c;
2044 clear_components |= 0xf << (c * 4);
2045 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2046 } else {
2047 a = subpass->depth_stencil_attachment.attachment;
2048 if (a == VK_ATTACHMENT_UNUSED)
2049 continue;
2050
2051 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2052 z_clear = true;
2053 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2054 }
2055
2056 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2057 s_clear = true;
2058 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2059 }
2060 }
2061
2062 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2063 }
2064
2065 /* prefer the 2D path for clears;
2066 * the 2D path can't clear separate depth/stencil or MSAA, and it needs a known framebuffer
2067 */
2068 if (max_samples == 1 && cmd->state.framebuffer) {
2069 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2070 return;
2071 }
2072
2073 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2074 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2075 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2076 0xfc000000);
2077 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2078
2079 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2080 for (uint32_t i = 0; i < mrt_count; i++) {
2081 if (clear_rts & (1 << i))
2082 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2083 else
2084 tu_cs_emit(cs, 0);
2085 }
2086
2087 for (uint32_t i = 0; i < rect_count; i++) {
2088 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
2089 layered_clear = true;
2090 }
2091
2092 r3d_pipeline(cmd, cs, false, num_rts, layered_clear);
2093
2094 tu_cs_emit_regs(cs,
2095 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2096 tu_cs_emit_regs(cs,
2097 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2098
2099 tu_cs_emit_regs(cs,
2100 A6XX_RB_FS_OUTPUT_CNTL0(),
2101 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2102
2103 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2104 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2105 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2106 for (uint32_t i = 0; i < mrt_count; i++) {
2107 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2108 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2109 }
2110
2111 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2112 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2113 .z_enable = z_clear,
2114 .z_write_enable = z_clear,
2115 .zfunc = FUNC_ALWAYS));
2116 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2117 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2118 .stencil_enable = s_clear,
2119 .func = FUNC_ALWAYS,
2120 .zpass = STENCIL_REPLACE));
2121 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2122 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2123 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2124
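/* Upload the clear colors as fragment shader constants, one vec4 per
 * cleared render target. */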
2125 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2126 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2127 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2128 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2129 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2130 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2131 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2132 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2133 for_each_bit(b, clear_rts)
2134 tu_cs_emit_array(cs, clear_value[b], 4);
2135
2136 for (uint32_t i = 0; i < rect_count; i++) {
2137 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2138 r3d_coords_raw(cs, layered_clear, (float[]) {
2139 rects[i].rect.offset.x, rects[i].rect.offset.y,
2140 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2141 rects[i].rect.offset.x + rects[i].rect.extent.width,
2142 rects[i].rect.offset.y + rects[i].rect.extent.height,
2143 z_clear_val, 1.0f,
2144 });
2145
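/* Layered clears go through a one-vertex point-list draw with the GS
 * enabled (see the r3d_pipeline() call above); non-layered clears use the
 * regular blit draw via r3d_run(). */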
2146 if (layered_clear) {
2147 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2148 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2149 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2150 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2151 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2152 tu_cs_emit(cs, 1); /* instance count */
2153 tu_cs_emit(cs, 1); /* vertex count */
2154 } else {
2155 r3d_run(cmd, cs);
2156 }
2157 }
2158 }
2159
2160 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2161 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2162 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2163 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2164 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2165 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2166 }
2167
2168 /**
2169 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2170 * for the component order: the components are always packed in WZYX order,
2171 * because GMEM is tiled and tiled formats always use the WZYX swap.
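 * For example, clearing VK_FORMAT_R8G8B8A8_UNORM or VK_FORMAT_B8G8R8A8_UNORM
 * to (1, 0, 0, 1) packs buf[0] as 0xff0000ff in both cases (red in the low byte).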
2172 */
2173 static void
2174 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2175 {
2176 const struct util_format_description *desc = vk_format_description(format);
2177
2178 switch (format) {
2179 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2180 buf[0] = float3_to_r11g11b10f(val->color.float32);
2181 return;
2182 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2183 buf[0] = float3_to_rgb9e5(val->color.float32);
2184 return;
2185 default:
2186 break;
2187 }
2188
2189 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2190
2191 /* S8_UINT is special and has no depth */
2192 const int max_components =
2193 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2194
2195 int buf_offset = 0;
2196 int bit_shift = 0;
2197 for (int comp = 0; comp < max_components; comp++) {
2198 const struct util_format_channel_description *ch =
2199 tu_get_format_channel_description(desc, comp);
2200 if (!ch) {
2201 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2202 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2203 continue;
2204 }
2205
2206 union tu_clear_component_value v = tu_get_clear_component_value(
2207 val, comp, desc->colorspace);
2208
2209 /* move to the next uint32_t when there is not enough space */
2210 assert(ch->size <= 32);
2211 if (bit_shift + ch->size > 32) {
2212 buf_offset++;
2213 bit_shift = 0;
2214 }
2215
2216 if (bit_shift == 0)
2217 buf[buf_offset] = 0;
2218
2219 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2220 bit_shift += ch->size;
2221 }
2222 }
2223
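/* Clear an attachment's GMEM region with a BLIT event, using the packed
 * clear value; component_mask selects which components get written.
 */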
2224 static void
2225 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2226 struct tu_cs *cs,
2227 uint32_t attachment,
2228 uint8_t component_mask,
2229 const VkClearValue *value)
2230 {
2231 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2232 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2233 * because D24S8 is cleared with AS_R8G8B8A8 format
2234 */
2235
2236 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2237 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2238
2239 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2240 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2241
2242 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2243 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2244
2245 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2246 tu_cs_emit(cs, 0);
2247
2248 uint32_t clear_vals[4] = {};
2249 pack_gmem_clear_value(value, vk_format, clear_vals);
2250
2251 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2252 tu_cs_emit_array(cs, clear_vals, 4);
2253
2254 tu6_emit_event_write(cmd, cs, BLIT);
2255 }
2256
2257 static void
2258 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2259 uint32_t attachment_count,
2260 const VkClearAttachment *attachments,
2261 uint32_t rect_count,
2262 const VkClearRect *rects)
2263 {
2264 const struct tu_subpass *subpass = cmd->state.subpass;
2265 struct tu_cs *cs = &cmd->draw_cs;
2266
2267 /* TODO: swap the loops for smaller cmdstream */
2268 for (unsigned i = 0; i < rect_count; i++) {
2269 unsigned x1 = rects[i].rect.offset.x;
2270 unsigned y1 = rects[i].rect.offset.y;
2271 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2272 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2273
2274 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2275 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2276 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2277
2278 for (unsigned j = 0; j < attachment_count; j++) {
2279 uint32_t a;
2280 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2281 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2282 else
2283 a = subpass->depth_stencil_attachment.attachment;
2284
2285 if (a == VK_ATTACHMENT_UNUSED)
2286 continue;
2287
2288 unsigned clear_mask = 0xf;
2289 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2290 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2291 clear_mask &= ~0x7;
2292 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2293 clear_mask &= ~0x8;
2294 }
2295
2296 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2297 &attachments[j].clearValue);
2298 }
2299 }
2300 }
2301
2302 void
2303 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2304 uint32_t attachmentCount,
2305 const VkClearAttachment *pAttachments,
2306 uint32_t rectCount,
2307 const VkClearRect *pRects)
2308 {
2309 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2310 struct tu_cs *cs = &cmd->draw_cs;
2311
2312 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2313 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2314 tu_cond_exec_end(cs);
2315
2316 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2317 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2318 tu_cond_exec_end(cs);
2319 }
2320
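/* Clear attachment 'a' at the start of a sysmem renderpass, covering the
 * whole render area and every framebuffer layer. Uses the 2D path, or the
 * 3D path for multisampled attachments.
 */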
2321 void
2322 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2323 struct tu_cs *cs,
2324 uint32_t a,
2325 const VkRenderPassBeginInfo *info)
2326 {
2327 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2328 const struct tu_image_view *iview = fb->attachments[a].attachment;
2329 const struct tu_render_pass_attachment *attachment =
2330 &cmd->state.pass->attachments[a];
2331 uint8_t mask = 0;
2332
2333 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2334 mask = 0xf;
2335 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2336 mask |= 0x7;
2337 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2338 mask |= 0x8;
2339
2340 if (!mask)
2341 return;
2342
2343 const struct blit_ops *ops = &r2d_ops;
2344 if (attachment->samples > 1)
2345 ops = &r3d_ops;
2346
2347 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2348 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2349 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2350
2351 /* Wait for any flushes at the beginning of the renderpass to complete */
2352 tu_cs_emit_wfi(cs);
2353
2354 for (uint32_t i = 0; i < fb->layers; i++) {
2355 ops->dst(cs, iview, i);
2356 ops->run(cmd, cs);
2357 }
2358
2359 /* The spec doesn't explicitly say, but presumably the initial renderpass
2360 * clear is considered part of the renderpass, and therefore barriers
2361 * aren't required inside the subpass/renderpass. Therefore we need to
2362 * flush CCU color into CCU depth here, just like with
2363 * vkCmdClearAttachments(). Note that because this only happens at the
2364 * beginning of a renderpass, and renderpass writes are considered
2365 * "incoherent", we shouldn't have to worry about syncing depth into color
2366 * beforehand as depth should already be flushed.
2367 */
2368 if (vk_format_is_depth_or_stencil(attachment->format)) {
2369 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2370 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2371 } else {
2372 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2373 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2374 }
2375 }
2376
2377 void
2378 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2379 struct tu_cs *cs,
2380 uint32_t a,
2381 const VkRenderPassBeginInfo *info)
2382 {
2383 const struct tu_render_pass_attachment *attachment =
2384 &cmd->state.pass->attachments[a];
2385 unsigned clear_mask = 0;
2386
2387 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2388 clear_mask = 0xf;
2389 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2390 clear_mask |= 0x7;
2391 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2392 clear_mask |= 0x8;
2393
2394 if (!clear_mask)
2395 return;
2396
2397 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2398
2399 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2400 &info->pClearValues[a]);
2401 }
2402
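/* Emit the RB_BLIT state and BLIT event used to move a single attachment
 * between sysmem and GMEM: tu_load_gmem_attachment() calls this with
 * resolve=false, tu_store_gmem_attachment() with resolve=true.
 */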
2403 static void
2404 tu_emit_blit(struct tu_cmd_buffer *cmd,
2405 struct tu_cs *cs,
2406 const struct tu_image_view *iview,
2407 const struct tu_render_pass_attachment *attachment,
2408 bool resolve)
2409 {
2410 tu_cs_emit_regs(cs,
2411 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2412
2413 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2414 .unk0 = !resolve,
2415 .gmem = !resolve,
2416 /* "integer" bit disables msaa resolve averaging */
2417 .integer = vk_format_is_int(attachment->format)));
2418
2419 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2420 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2421 tu_cs_image_ref_2d(cs, iview, 0, false);
2422
2423 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2424 tu_cs_image_flag_ref(cs, iview, 0);
2425
2426 tu_cs_emit_regs(cs,
2427 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2428
2429 tu6_emit_event_write(cmd, cs, BLIT);
2430 }
2431
2432 static bool
2433 blit_can_resolve(VkFormat format)
2434 {
2435 const struct util_format_description *desc = vk_format_description(format);
2436
2437 /* blit event can only do resolve for simple cases:
2438 * averaging samples as unsigned integers or choosing only one sample
2439 */
2440 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2441 return false;
2442
2443 /* can't do formats with channel sizes larger than 10 bits
2444 * note: this includes all float formats
2445 * note2: single-channel integer formats seem OK
2446 */
2447 if (desc->channel[0].size > 10)
2448 return false;
2449
2450 switch (format) {
2451 /* for unknown reasons the blit event can't MSAA-resolve these formats when tiled,
2452 * likely because these formats have a different layout from other cpp=2 formats
2453 */
2454 case VK_FORMAT_R8G8_UNORM:
2455 case VK_FORMAT_R8G8_UINT:
2456 case VK_FORMAT_R8G8_SINT:
2457 /* TODO: this one should be able to work? */
2458 case VK_FORMAT_D24_UNORM_S8_UINT:
2459 return false;
2460 default:
2461 break;
2462 }
2463
2464 return true;
2465 }
2466
2467 void
2468 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2469 struct tu_cs *cs,
2470 uint32_t a,
2471 bool force_load)
2472 {
2473 const struct tu_image_view *iview =
2474 cmd->state.framebuffer->attachments[a].attachment;
2475 const struct tu_render_pass_attachment *attachment =
2476 &cmd->state.pass->attachments[a];
2477
2478 if (attachment->load || force_load)
2479 tu_emit_blit(cmd, cs, iview, attachment, false);
2480 }
2481
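/* Store the GMEM contents of attachment gmem_a out to the sysmem image of
 * attachment a (an MSAA resolve when the two differ). The BLIT event fast
 * path is used when the render area is suitably aligned; otherwise fall
 * back to a CP_BLIT that reads directly from GMEM via the 2D engine.
 */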
2482 void
2483 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2484 struct tu_cs *cs,
2485 uint32_t a,
2486 uint32_t gmem_a)
2487 {
2488 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2489 const VkRect2D *render_area = &tiling->render_area;
2490 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2491 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2492 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2493
2494 if (!dst->store)
2495 return;
2496
2497 uint32_t x1 = render_area->offset.x;
2498 uint32_t y1 = render_area->offset.y;
2499 uint32_t x2 = x1 + render_area->extent.width;
2500 uint32_t y2 = y1 + render_area->extent.height;
2501 /* x2/y2 can be unaligned if equal to the size of the image,
2502 * since the store will then write into padding space.
2503 * The one exception is linear levels, which don't have the
2504 * required y padding in the layout (except for the last level).
2505 */
2506 bool need_y2_align =
2507 y2 != iview->extent.height || iview->need_y2_align;
2508
2509 bool unaligned =
2510 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2511 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2512
2513 /* use fast path when render area is aligned, except for unsupported resolve cases */
2514 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2515 tu_emit_blit(cmd, cs, iview, src, true);
2516 return;
2517 }
2518
2519 if (dst->samples > 1) {
2520 /* We probably need to use the shader path in this case;
2521 * need a testcase which fails because of this.
2522 */
2523 tu_finishme("unaligned store of msaa attachment\n");
2524 return;
2525 }
2526
2527 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2528 r2d_dst(cs, iview, 0);
2529 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2530
2531 tu_cs_emit_regs(cs,
2532 A6XX_SP_PS_2D_SRC_INFO(
2533 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2534 .tile_mode = TILE6_2,
2535 .srgb = vk_format_is_srgb(src->format),
2536 .samples = tu_msaa_samples(src->samples),
2537 .samples_average = !vk_format_is_int(src->format),
2538 .unk20 = 1,
2539 .unk22 = 1),
2540 /* note: src size does not matter when not scaling */
2541 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2542 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2543 A6XX_SP_PS_2D_SRC_HI(),
2544 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2545
2546 /* sync GMEM writes with CACHE. */
2547 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2548
2549 /* Wait for CACHE_INVALIDATE to land */
2550 tu_cs_emit_wfi(cs);
2551
2552 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2553 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2554
2555 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2556 * sysmem, and we generally assume that GMEM renderpasses leave their
2557 * results in sysmem, so we need to flush manually here.
2558 */
2559 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2560 }