turnip: share code between 3D blit/clear path and tu_pipeline
[mesa.git] / src / freedreno / vulkan / tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
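/* Pack a single clear-value component into the low ch->size bits, using the
 * channel description to select unorm/snorm, uscaled/sscaled, pure integer
 * or float packing.
 */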
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
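/* fall through - alpha is not sRGB-encoded, use the raw value */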
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
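/* Map a hardware color format to the 2D engine's internal format, which
 * determines how the blitter interprets source texels and solid clear values.
 */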
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
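/* Pack a clear value into RB_2D_SRC_SOLID_C0..C3. Depth/stencil and
 * E5B9G9R9 formats are special-cased; other formats are packed per channel
 * according to the 2D engine internal format (unorm8 bytes, half floats or
 * raw 32-bit values).
 */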
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
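/* Common 2D blit state setup: RB/GRAS_2D_BLIT_CNTL, SP_2D_SRC_FORMAT and the
 * RB_UNKNOWN_8C01 value that preserves the unwritten aspect for partial
 * clears of Z24_UNORM_S8_UINT.
 */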
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
472
473 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
474 }
475
476 static void
477 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
478 {
479 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
480 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
481 }
482
483 /* r3d_ = shader path operations */
484
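/* Emit a minimal hand-assembled pipeline for the 3D path. The VS uses the
 * vertex id (r0.w) to select between two corner coordinates passed in as
 * constants c0/c1 and writes the RECTLIST position plus, for blits, the
 * source texcoord. The FS either copies per-RT clear colors from constants
 * (clear) or takes its color from the prefetched texture sample (blit).
 */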
485 static void
486 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
487 {
488 struct ir3_shader dummy_shader = {};
489
490 struct ir3_shader_variant vs = {
491 .type = MESA_SHADER_VERTEX,
492 .instrlen = 1,
493 .constlen = 2,
494 .info.max_reg = 1,
495 .inputs_count = 1,
496 .inputs[0] = {
497 .slot = SYSTEM_VALUE_VERTEX_ID,
498 .regid = regid(0, 3),
499 .sysval = true,
500 },
501 .outputs_count = blit ? 2 : 1,
502 .outputs[0] = {
503 .slot = VARYING_SLOT_POS,
504 .regid = regid(0, 0),
505 },
506 .outputs[1] = {
507 .slot = VARYING_SLOT_VAR0,
508 .regid = regid(1, 0),
509 },
510 .shader = &dummy_shader,
511 };
512
513 struct ir3_shader_variant fs = {
514 .type = MESA_SHADER_FRAGMENT,
515 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
516 .constlen = num_rts,
517 .info.max_reg = MAX2(num_rts, 1) - 1,
518 .total_in = blit ? 2 : 0,
519 .num_samp = blit ? 1 : 0,
520 .inputs_count = blit ? 2 : 0,
521 .inputs[0] = {
522 .slot = VARYING_SLOT_VAR0,
523 .inloc = 0,
524 .compmask = 3,
525 .bary = true,
526 },
527 .inputs[1] = {
528 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
529 .regid = regid(0, 0),
530 .sysval = 1,
531 },
532 .num_sampler_prefetch = blit ? 1 : 0,
533 .sampler_prefetch[0] = {
534 .src = 0,
535 .wrmask = 0xf,
536 .cmd = 4,
537 },
538 .shader = &dummy_shader,
539 };
540
541 static const instr_t vs_code[] = {
542 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
543 * r1.xy = r0.w ? c1.zw : c0.zw
544 * r0.w = 1.0f
545 */
546 { .cat3 = {
547 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
548 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
549 .src2 = 3,
550 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
551 } },
552 { .cat3 = {
553 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
554 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
555 .src2 = 3,
556 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
557 } },
558 { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
559 .src_im = 1, .fim_val = 1.0f } },
560 { .cat0 = { .opc = OPC_END } },
561 };
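/* The VS and FS binaries share a single sub_cs allocation: the VS is written
 * at offset 0 and the FS at FS_OFFSET (16 instruction slots in).
 */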
562 #define FS_OFFSET (16 * sizeof(instr_t))
563 STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
564
565 /* shaders */
566 struct ts_cs_memory shaders = { };
567 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders);
568 assert(result == VK_SUCCESS);
569
570 memcpy(shaders.map, vs_code, sizeof(vs_code));
571
572 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
573 for (uint32_t i = 0; i < num_rts; i++) {
574 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
575 *fs_code++ = (instr_t) { .cat1 = {
576 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
577 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
578 } };
579 }
580
581 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
582 * blit path (it's not clear what allows the blob to omit it)
583 */
584 if (blit) {
585 *fs_code++ = (instr_t) { .cat2 = {
586 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
587 .dst = regid(63, 0), .src1_im = 1
588 } };
589 }
590 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
591 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
592
593 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
594
595 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
596 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
597 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
598 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
599 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
600
601 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
602 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
603
604 tu6_emit_vpc(cs, &vs, NULL, &fs, NULL);
605
606 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
607 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
608 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
609
610 tu6_emit_fs_inputs(cs, &fs);
611
612 tu_cs_emit_regs(cs,
613 A6XX_GRAS_CL_CNTL(
614 .persp_division_disable = 1,
615 .vp_xform_disable = 1,
616 .vp_clip_code_ignore = 1,
617 .clip_disable = 1),
618 A6XX_GRAS_UNKNOWN_8001(0));
619 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
620
621 tu_cs_emit_regs(cs,
622 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
623 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
624 tu_cs_emit_regs(cs,
625 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
626 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
627 }
628
629 static void
630 r3d_coords_raw(struct tu_cs *cs, const float *coords)
631 {
632 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
633 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
634 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
635 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
636 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
637 CP_LOAD_STATE6_0_NUM_UNIT(2));
638 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
639 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
640 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
641 }
642
643 static void
644 r3d_coords(struct tu_cs *cs,
645 const VkOffset2D *dst,
646 const VkOffset2D *src,
647 const VkExtent2D *extent)
648 {
649 int32_t src_x1 = src ? src->x : 0;
650 int32_t src_y1 = src ? src->y : 0;
651 r3d_coords_raw(cs, (float[]) {
652 dst->x, dst->y,
653 src_x1, src_y1,
654 dst->x + extent->width, dst->y + extent->height,
655 src_x1 + extent->width, src_y1 + extent->height,
656 });
657 }
658
659 static void
660 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
661 {
662 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
663 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
664 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
665 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
666 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
667 CP_LOAD_STATE6_0_NUM_UNIT(1));
668 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
669 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
670 switch (format) {
671 case VK_FORMAT_X8_D24_UNORM_PACK32:
672 case VK_FORMAT_D24_UNORM_S8_UINT: {
673 /* cleared as r8g8b8a8_unorm using special format */
674 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
675 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
676 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
677 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
678 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
679 } break;
680 case VK_FORMAT_D16_UNORM:
681 case VK_FORMAT_D32_SFLOAT:
682 tu_cs_emit(cs, fui(val->depthStencil.depth));
683 tu_cs_emit(cs, 0);
684 tu_cs_emit(cs, 0);
685 tu_cs_emit(cs, 0);
686 break;
687 case VK_FORMAT_S8_UINT:
688 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
689 tu_cs_emit(cs, 0);
690 tu_cs_emit(cs, 0);
691 tu_cs_emit(cs, 0);
692 break;
693 default:
694 /* as color formats use clear value as-is */
695 assert(!vk_format_is_depth_or_stencil(format));
696 tu_cs_emit_array(cs, val->color.uint32, 4);
697 break;
698 }
699 }
700
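/* Upload a texture descriptor and sampler for the source into sub_cs memory,
 * patching the base and UBWC addresses for the selected layer, and bind them
 * as FS texture 0.
 */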
701 static void
702 r3d_src_common(struct tu_cmd_buffer *cmd,
703 struct tu_cs *cs,
704 const uint32_t *tex_const,
705 uint32_t offset_base,
706 uint32_t offset_ubwc,
707 bool linear_filter)
708 {
709 struct ts_cs_memory texture = { };
710 VkResult result = tu_cs_alloc(&cmd->sub_cs,
711 2, /* allocate space for a sampler too */
712 A6XX_TEX_CONST_DWORDS, &texture);
713 assert(result == VK_SUCCESS);
714
715 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
716
717 /* patch addresses for layer offset */
718 *(uint64_t*) (texture.map + 4) += offset_base;
719 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
720 texture.map[7] = ubwc_addr;
721 texture.map[8] = ubwc_addr >> 32;
722
723 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
724 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
725 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
726 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
727 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
728 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
729 0x60000; /* XXX used by blob, doesn't seem necessary */
730 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
731 0x1 | /* XXX used by blob, doesn't seem necessary */
732 A6XX_TEX_SAMP_1_UNNORM_COORDS |
733 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
734 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
735 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
736
737 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
738 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
739 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
740 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
741 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
742 CP_LOAD_STATE6_0_NUM_UNIT(1));
743 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
744
745 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
746 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
747
748 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
749 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
750 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
751 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
752 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
753 CP_LOAD_STATE6_0_NUM_UNIT(1));
754 tu_cs_emit_qw(cs, texture.iova);
755
756 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
757 tu_cs_emit_qw(cs, texture.iova);
758
759 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
760 }
761
762 static void
763 r3d_src(struct tu_cmd_buffer *cmd,
764 struct tu_cs *cs,
765 const struct tu_image_view *iview,
766 uint32_t layer,
767 bool linear_filter)
768 {
769 r3d_src_common(cmd, cs, iview->descriptor,
770 iview->layer_size * layer,
771 iview->ubwc_layer_size * layer,
772 linear_filter);
773 }
774
775 static void
776 r3d_src_buffer(struct tu_cmd_buffer *cmd,
777 struct tu_cs *cs,
778 VkFormat vk_format,
779 uint64_t va, uint32_t pitch,
780 uint32_t width, uint32_t height)
781 {
782 uint32_t desc[A6XX_TEX_CONST_DWORDS];
783
784 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
785
786 desc[0] =
787 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
788 A6XX_TEX_CONST_0_FMT(format.fmt) |
789 A6XX_TEX_CONST_0_SWAP(format.swap) |
790 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
791 // XXX replicate the single R8 channel into .yzw; .w is needed for the stencil buffer_to_image path
792 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
793 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
794 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
795 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
796 desc[2] =
797 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
798 A6XX_TEX_CONST_2_PITCH(pitch) |
799 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
800 desc[3] = 0;
801 desc[4] = va;
802 desc[5] = va >> 32;
803 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
804 desc[i] = 0;
805
806 r3d_src_common(cmd, cs, desc, 0, 0, false);
807 }
808
809 static void
810 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
811 {
812 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
813
814 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
815 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
816 tu_cs_image_ref(cs, iview, layer);
817 tu_cs_emit(cs, 0);
818
819 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
820 tu_cs_image_flag_ref(cs, iview, layer);
821
822 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
823 }
824
825 static void
826 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
827 {
828 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
829
830 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
831
832 tu_cs_emit_regs(cs,
833 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
834 A6XX_RB_MRT_PITCH(0, pitch),
835 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
836 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
837 A6XX_RB_MRT_BASE_HI(0, va >> 32),
838 A6XX_RB_MRT_BASE_GMEM(0, 0));
839
840 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
841 }
842
843 static void
844 r3d_setup(struct tu_cmd_buffer *cmd,
845 struct tu_cs *cs,
846 VkFormat vk_format,
847 enum a6xx_rotation rotation,
848 bool clear,
849 uint8_t mask)
850 {
851 if (!cmd->state.pass) {
852 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
853 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
854 }
855
856 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
857 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
858
859 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
860
861 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
862 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
863 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
864 0xfc000000);
865 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
866
867 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
868 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
869
870 tu_cs_emit_regs(cs,
871 A6XX_RB_FS_OUTPUT_CNTL0(),
872 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
873
874 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
875 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
876 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
877
878 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
879 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
880 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
881 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
882 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
883 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
884 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
885
886 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
887 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
888
889 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
890 .color_format = tu6_base_format(vk_format),
891 .color_sint = vk_format_is_sint(vk_format),
892 .color_uint = vk_format_is_uint(vk_format)));
893
894 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
895 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
896 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
897 }
898
899 static void
900 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
901 {
902 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
903 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
904 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
905 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
906 tu_cs_emit(cs, 1); /* instance count */
907 tu_cs_emit(cs, 2); /* vertex count */
908 }
909
910 /* blit ops - common interface for 2d/shader paths */
911
912 struct blit_ops {
913 void (*coords)(struct tu_cs *cs,
914 const VkOffset2D *dst,
915 const VkOffset2D *src,
916 const VkExtent2D *extent);
917 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
918 void (*src)(
919 struct tu_cmd_buffer *cmd,
920 struct tu_cs *cs,
921 const struct tu_image_view *iview,
922 uint32_t layer,
923 bool linear_filter);
924 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
925 VkFormat vk_format,
926 uint64_t va, uint32_t pitch,
927 uint32_t width, uint32_t height);
928 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
929 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
930 void (*setup)(struct tu_cmd_buffer *cmd,
931 struct tu_cs *cs,
932 VkFormat vk_format,
933 enum a6xx_rotation rotation,
934 bool clear,
935 uint8_t mask);
936 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
937 };
938
939 static const struct blit_ops r2d_ops = {
940 .coords = r2d_coords,
941 .clear_value = r2d_clear_value,
942 .src = r2d_src,
943 .src_buffer = r2d_src_buffer,
944 .dst = r2d_dst,
945 .dst_buffer = r2d_dst_buffer,
946 .setup = r2d_setup,
947 .run = r2d_run,
948 };
949
950 static const struct blit_ops r3d_ops = {
951 .coords = r3d_coords,
952 .clear_value = r3d_clear_value,
953 .src = r3d_src,
954 .src_buffer = r3d_src_buffer,
955 .dst = r3d_dst,
956 .dst_buffer = r3d_dst_buffer,
957 .setup = r3d_setup,
958 .run = r3d_run,
959 };
960
961 /* passthrough: set 2D coords from 3D offsets/extent (z components are ignored) */
962 static void
963 coords(const struct blit_ops *ops,
964 struct tu_cs *cs,
965 const VkOffset3D *dst,
966 const VkOffset3D *src,
967 const VkExtent3D *extent)
968 {
969 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
970 }
971
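/* Build a single-level, single-layer view of the given subresource,
 * optionally overriding the format (used for reinterpreting copies).
 * D24S8/X8D24 always use the AS_R8G8B8A8 reinterpretation, and stencil_read
 * maps the stencil value (stored in .a) to .r for image-to-buffer copies.
 */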
972 static void
973 tu_image_view_blit2(struct tu_image_view *iview,
974 struct tu_image *image,
975 VkFormat format,
976 const VkImageSubresourceLayers *subres,
977 uint32_t layer,
978 bool stencil_read)
979 {
980 VkImageAspectFlags aspect_mask = subres->aspectMask;
981
982 /* always use the AS_R8G8B8A8 format for these */
983 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
984 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
985 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
986 }
987
988 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
989 .image = tu_image_to_handle(image),
990 .viewType = VK_IMAGE_VIEW_TYPE_2D,
991 .format = format,
992 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
993 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
994 .subresourceRange = {
995 .aspectMask = aspect_mask,
996 .baseMipLevel = subres->mipLevel,
997 .levelCount = 1,
998 .baseArrayLayer = subres->baseArrayLayer + layer,
999 .layerCount = 1,
1000 },
1001 });
1002 }
1003
1004 static void
1005 tu_image_view_blit(struct tu_image_view *iview,
1006 struct tu_image *image,
1007 const VkImageSubresourceLayers *subres,
1008 uint32_t layer)
1009 {
1010 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1011 }
1012
1013 static void
1014 tu6_blit_image(struct tu_cmd_buffer *cmd,
1015 struct tu_image *src_image,
1016 struct tu_image *dst_image,
1017 const VkImageBlit *info,
1018 VkFilter filter)
1019 {
1020 const struct blit_ops *ops = &r2d_ops;
1021 struct tu_cs *cs = &cmd->cs;
1022 uint32_t layers;
1023
1024 /* the 2D blit engine can't mirror from coordinates alone, so mirroring is expressed via the rotation field */
1025 static const enum a6xx_rotation rotate[2][2] = {
1026 {ROTATE_0, ROTATE_HFLIP},
1027 {ROTATE_VFLIP, ROTATE_180},
1028 };
1029
1030 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1031 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1032 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1033 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1034 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1035 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1036
1037 if (mirror_z) {
1038 tu_finishme("blit z mirror\n");
1039 return;
1040 }
1041
1042 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1043 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1044 tu_finishme("blit z filter\n");
1045 return;
1046 }
1047
1048 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1049 if (info->dstSubresource.layerCount > 1) {
1050 assert(layers <= 1);
1051 layers = info->dstSubresource.layerCount;
1052 }
1053
1054 uint8_t mask = 0xf;
1055 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1056 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1057 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1058 mask = 0x7;
1059 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1060 mask = 0x8;
1061 }
1062
1063 /* BC1_RGB_* formats need to have their last components overridden with 1
1064 * when sampling, which is normally handled with the texture descriptor
1065 * swizzle. The 2d path can't handle that, so use the 3d path.
1066 *
1067 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1068 * the 2d path.
1069 */
1070
1071 if (dst_image->samples > 1 ||
1072 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1073 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1074 ops = &r3d_ops;
1075
1076 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1077 * figure out why (should be able to pass all tests with only shader path)
1078 */
1079
1080 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1081
1082 if (ops == &r3d_ops) {
1083 r3d_coords_raw(cs, (float[]) {
1084 info->dstOffsets[0].x, info->dstOffsets[0].y,
1085 info->srcOffsets[0].x, info->srcOffsets[0].y,
1086 info->dstOffsets[1].x, info->dstOffsets[1].y,
1087 info->srcOffsets[1].x, info->srcOffsets[1].y
1088 });
1089 } else {
1090 tu_cs_emit_regs(cs,
1091 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1092 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1093 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1094 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1095 tu_cs_emit_regs(cs,
1096 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1097 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1098 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1099 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1100 }
1101
1102 struct tu_image_view dst, src;
1103 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1104 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1105
1106 for (uint32_t i = 0; i < layers; i++) {
1107 ops->dst(cs, &dst, i);
1108 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1109 ops->run(cmd, cs);
1110 }
1111 }
1112
1113 void
1114 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1115 VkImage srcImage,
1116 VkImageLayout srcImageLayout,
1117 VkImage dstImage,
1118 VkImageLayout dstImageLayout,
1119 uint32_t regionCount,
1120 const VkImageBlit *pRegions,
1121 VkFilter filter)
1122
1123 {
1124 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1125 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1126 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1127
1128 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1129 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1130
1131 for (uint32_t i = 0; i < regionCount; ++i)
1132 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1133 }
1134
1135 static VkFormat
1136 copy_format(VkFormat format)
1137 {
1138 switch (vk_format_get_blocksizebits(format)) {
1139 case 8: return VK_FORMAT_R8_UINT;
1140 case 16: return VK_FORMAT_R16_UINT;
1141 case 32: return VK_FORMAT_R32_UINT;
1142 case 64: return VK_FORMAT_R32G32_UINT;
1143 case 96: return VK_FORMAT_R32G32B32_UINT;
1144 case 128: return VK_FORMAT_R32G32B32A32_UINT;
1145 default:
1146 unreachable("unhandled format size");
1147 }
1148 }
1149
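/* For compressed formats, convert the copy offset/extent and the buffer
 * row length/height from texels to blocks, since the copy itself is done
 * using the block-sized uint format from copy_format().
 */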
1150 static void
1151 copy_compressed(VkFormat format,
1152 VkOffset3D *offset,
1153 VkExtent3D *extent,
1154 uint32_t *width,
1155 uint32_t *height)
1156 {
1157 if (!vk_format_is_compressed(format))
1158 return;
1159
1160 uint32_t block_width = vk_format_get_blockwidth(format);
1161 uint32_t block_height = vk_format_get_blockheight(format);
1162
1163 offset->x /= block_width;
1164 offset->y /= block_height;
1165
1166 if (extent) {
1167 extent->width = DIV_ROUND_UP(extent->width, block_width);
1168 extent->height = DIV_ROUND_UP(extent->height, block_height);
1169 }
1170 if (width)
1171 *width = DIV_ROUND_UP(*width, block_width);
1172 if (height)
1173 *height = DIV_ROUND_UP(*height, block_height);
1174 }
1175
1176 static void
1177 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1178 struct tu_buffer *src_buffer,
1179 struct tu_image *dst_image,
1180 const VkBufferImageCopy *info)
1181 {
1182 struct tu_cs *cs = &cmd->cs;
1183 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1184 VkFormat dst_format = dst_image->vk_format;
1185 VkFormat src_format = dst_image->vk_format;
1186 const struct blit_ops *ops = &r2d_ops;
1187
1188 uint8_t mask = 0xf;
1189
1190 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1191 switch (info->imageSubresource.aspectMask) {
1192 case VK_IMAGE_ASPECT_STENCIL_BIT:
1193 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1194 mask = 0x8;
1195 ops = &r3d_ops;
1196 break;
1197 case VK_IMAGE_ASPECT_DEPTH_BIT:
1198 mask = 0x7;
1199 break;
1200 }
1201 }
1202
1203 VkOffset3D offset = info->imageOffset;
1204 VkExtent3D extent = info->imageExtent;
1205 uint32_t src_width = info->bufferRowLength ?: extent.width;
1206 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1207
1208 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1209 assert(src_format == dst_format);
1210 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1211 src_format = dst_format = copy_format(dst_format);
1212 }
1213
1214 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1215 uint32_t layer_size = src_height * pitch;
1216
1217 /* note: the 64-byte src_va/pitch alignment requirement below is for the 2D engine;
1218 * the same handling is also valid for the shader path, which is only used with a 1cpp format (stencil aspect path)
1219 */
1220
1221 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1222
1223 struct tu_image_view dst;
1224 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1225
1226 for (uint32_t i = 0; i < layers; i++) {
1227 ops->dst(cs, &dst, i);
1228
1229 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1230 if ((src_va & 63) || (pitch & 63)) {
1231 for (uint32_t y = 0; y < extent.height; y++) {
1232 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1233 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1234 x + extent.width, 1);
1235 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1236 &(VkExtent2D) {extent.width, 1});
1237 ops->run(cmd, cs);
1238 src_va += pitch;
1239 }
1240 } else {
1241 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1242 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1243 ops->run(cmd, cs);
1244 }
1245 }
1246 }
1247
1248 void
1249 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1250 VkBuffer srcBuffer,
1251 VkImage dstImage,
1252 VkImageLayout dstImageLayout,
1253 uint32_t regionCount,
1254 const VkBufferImageCopy *pRegions)
1255 {
1256 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1257 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1258 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1259
1260 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1261 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1262
1263 for (unsigned i = 0; i < regionCount; ++i)
1264 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1265 }
1266
1267 static void
1268 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1269 struct tu_image *src_image,
1270 struct tu_buffer *dst_buffer,
1271 const VkBufferImageCopy *info)
1272 {
1273 struct tu_cs *cs = &cmd->cs;
1274 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1275 VkFormat src_format = src_image->vk_format;
1276 VkFormat dst_format = src_image->vk_format;
1277 bool stencil_read = false;
1278
1279 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1280 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1281 dst_format = VK_FORMAT_R8_UNORM;
1282 stencil_read = true;
1283 }
1284
1285 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1286 VkOffset3D offset = info->imageOffset;
1287 VkExtent3D extent = info->imageExtent;
1288 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1289 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1290
1291 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1292 assert(src_format == dst_format);
1293 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1294 src_format = dst_format = copy_format(dst_format);
1295 }
1296
1297 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1298 uint32_t layer_size = pitch * dst_height;
1299
1300 /* note: the 64-byte dst_va/pitch alignment requirement below is for the 2D engine;
1301 * the same handling is also valid for the shader path, which is only used with a 1cpp format (stencil aspect)
1302 */
1303
1304 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1305
1306 struct tu_image_view src;
1307 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1308
1309 for (uint32_t i = 0; i < layers; i++) {
1310 ops->src(cmd, cs, &src, i, false);
1311
1312 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1313 if ((dst_va & 63) || (pitch & 63)) {
1314 for (uint32_t y = 0; y < extent.height; y++) {
1315 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1316 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1317 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1318 &(VkExtent2D) {extent.width, 1});
1319 ops->run(cmd, cs);
1320 dst_va += pitch;
1321 }
1322 } else {
1323 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1324 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1325 ops->run(cmd, cs);
1326 }
1327 }
1328 }
1329
1330 void
1331 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1332 VkImage srcImage,
1333 VkImageLayout srcImageLayout,
1334 VkBuffer dstBuffer,
1335 uint32_t regionCount,
1336 const VkBufferImageCopy *pRegions)
1337 {
1338 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1339 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1340 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1341
1342 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1343 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1344
1345 for (unsigned i = 0; i < regionCount; ++i)
1346 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1347 }
1348
1349 /* Tiled formats don't support swapping, which means that we can't support
1350 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1351 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1352 * Currently we fake support for tiled swapped formats and use the unswapped
1353 * format instead, but this means that reinterpreting copies to and from
1354 * swapped formats can't be performed correctly unless we can swizzle the
1355 * components by reinterpreting the other image as the "correct" swapped
1356 * format, i.e. only when the other image is linear.
1357 */
1358
1359 static bool
1360 is_swapped_format(VkFormat format)
1361 {
1362 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1363 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1364 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1365 }
1366
1367 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1368 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1369 * versa). This should mirror the logic in fdl6_layout.
1370 */
1371 static bool
1372 image_is_r8g8(struct tu_image *image)
1373 {
1374 return image->layout.cpp == 2 &&
1375 vk_format_get_nr_components(image->vk_format) == 2;
1376 }
1377
1378 static void
1379 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1380 struct tu_image *src_image,
1381 struct tu_image *dst_image,
1382 const VkImageCopy *info)
1383 {
1384 const struct blit_ops *ops = &r2d_ops;
1385 struct tu_cs *cs = &cmd->cs;
1386
1387 uint8_t mask = 0xf;
1388 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1389 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1390 mask = 0x7;
1391 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1392 mask = 0x8;
1393 }
1394
1395 if (dst_image->samples > 1)
1396 ops = &r3d_ops;
1397
1398 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1399
1400 VkFormat format = VK_FORMAT_UNDEFINED;
1401 VkOffset3D src_offset = info->srcOffset;
1402 VkOffset3D dst_offset = info->dstOffset;
1403 VkExtent3D extent = info->extent;
1404
1405 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1406 * Images":
1407 *
1408 * When copying between compressed and uncompressed formats the extent
1409 * members represent the texel dimensions of the source image and not
1410 * the destination. When copying from a compressed image to an
1411 * uncompressed image the image texel dimensions written to the
1412 * uncompressed image will be source extent divided by the compressed
1413 * texel block dimensions. When copying from an uncompressed image to a
1414 * compressed image the image texel dimensions written to the compressed
1415 * image will be the source extent multiplied by the compressed texel
1416 * block dimensions.
1417 *
1418 * This means we only have to adjust the extent if the source image is
1419 * compressed.
1420 */
1421 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1422 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1423
1424 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1425 copy_format(dst_image->vk_format) : dst_image->vk_format;
1426 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1427 copy_format(src_image->vk_format) : src_image->vk_format;
1428
1429 bool use_staging_blit = false;
1430
1431 if (src_format == dst_format) {
1432 /* Images that share a format can always be copied directly because it's
1433 * the same as a blit.
1434 */
1435 format = src_format;
1436 } else if (!src_image->layout.tile_mode) {
1437 /* If an image is linear, we can always safely reinterpret it with the
1438 * other image's format and then do a regular blit.
1439 */
1440 format = dst_format;
1441 } else if (!dst_image->layout.tile_mode) {
1442 format = src_format;
1443 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1444 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1445 * due to the different tile layout.
1446 */
1447 use_staging_blit = true;
1448 } else if (is_swapped_format(src_format) ||
1449 is_swapped_format(dst_format)) {
1450 /* If either format has a non-identity swap, then we can't copy
1451 * to/from it.
1452 */
1453 use_staging_blit = true;
1454 } else if (!src_image->layout.ubwc) {
1455 format = dst_format;
1456 } else if (!dst_image->layout.ubwc) {
1457 format = src_format;
1458 } else {
1459 /* Both formats use UBWC and so neither can be reinterpreted.
1460 * TODO: We could do an in-place decompression of the dst instead.
1461 */
1462 use_staging_blit = true;
1463 }
1464
1465 struct tu_image_view dst, src;
1466
1467 if (use_staging_blit) {
1468 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1469 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1470
1471 struct tu_image staging_image = {
1472 .vk_format = src_format,
1473 .type = src_image->type,
1474 .tiling = VK_IMAGE_TILING_LINEAR,
1475 .extent = extent,
1476 .level_count = 1,
1477 .layer_count = info->srcSubresource.layerCount,
1478 .samples = src_image->samples,
1479 .bo_offset = 0,
1480 };
1481
1482 VkImageSubresourceLayers staging_subresource = {
1483 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1484 .mipLevel = 0,
1485 .baseArrayLayer = 0,
1486 .layerCount = info->srcSubresource.layerCount,
1487 };
1488
1489 VkOffset3D staging_offset = { 0 };
1490
1491 staging_image.layout.tile_mode = TILE6_LINEAR;
1492 staging_image.layout.ubwc = false;
1493
1494 fdl6_layout(&staging_image.layout,
1495 vk_format_to_pipe_format(staging_image.vk_format),
1496 staging_image.samples,
1497 staging_image.extent.width,
1498 staging_image.extent.height,
1499 staging_image.extent.depth,
1500 staging_image.level_count,
1501 staging_image.layer_count,
1502 staging_image.type == VK_IMAGE_TYPE_3D,
1503 NULL);
1504
1505 VkResult result = tu_get_scratch_bo(cmd->device,
1506 staging_image.layout.size,
1507 &staging_image.bo);
1508 if (result != VK_SUCCESS) {
1509 cmd->record_result = result;
1510 return;
1511 }
1512
1513 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1514 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1515
1516 struct tu_image_view staging;
1517 tu_image_view_blit2(&staging, &staging_image, src_format,
1518 &staging_subresource, 0, false);
1519
1520 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1521 coords(ops, cs, &staging_offset, &src_offset, &extent);
1522
1523 for (uint32_t i = 0; i < info->extent.depth; i++) {
1524 ops->src(cmd, cs, &src, i, false);
1525 ops->dst(cs, &staging, i);
1526 ops->run(cmd, cs);
1527 }
1528
1529 /* If the application were doing this two-step copy itself, a pipeline barrier
1530 * would be required between the blits; since we do it internally we have to flush manually.
1531 */
1532 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1533 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1534
1535 tu_image_view_blit2(&staging, &staging_image, dst_format,
1536 &staging_subresource, 0, false);
1537
1538 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1539 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1540
1541 for (uint32_t i = 0; i < info->extent.depth; i++) {
1542 ops->src(cmd, cs, &staging, i, false);
1543 ops->dst(cs, &dst, i);
1544 ops->run(cmd, cs);
1545 }
1546 } else {
1547 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1548 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1549
1550 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1551 coords(ops, cs, &dst_offset, &src_offset, &extent);
1552
1553 for (uint32_t i = 0; i < info->extent.depth; i++) {
1554 ops->src(cmd, cs, &src, i, false);
1555 ops->dst(cs, &dst, i);
1556 ops->run(cmd, cs);
1557 }
1558 }
1559 }
1560
1561 void
1562 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1563 VkImage srcImage,
1564 VkImageLayout srcImageLayout,
1565 VkImage destImage,
1566 VkImageLayout destImageLayout,
1567 uint32_t regionCount,
1568 const VkImageCopy *pRegions)
1569 {
1570 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1571 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1572 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1573
1574 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1575 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1576
1577 for (uint32_t i = 0; i < regionCount; ++i)
1578 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1579 }
1580
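/* Raw buffer copy implemented as 1D blits of R8_UNORM (or R32_UINT when
 * block_size is 4). Base addresses must be 64-byte aligned, so the low
 * address bits become an x offset, and each pass is limited to 0x4000
 * texels of width.
 */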
1581 static void
1582 copy_buffer(struct tu_cmd_buffer *cmd,
1583 uint64_t dst_va,
1584 uint64_t src_va,
1585 uint64_t size,
1586 uint32_t block_size)
1587 {
1588 const struct blit_ops *ops = &r2d_ops;
1589 struct tu_cs *cs = &cmd->cs;
1590 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1591 uint64_t blocks = size / block_size;
1592
1593 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1594
1595 while (blocks) {
1596 uint32_t src_x = (src_va & 63) / block_size;
1597 uint32_t dst_x = (dst_va & 63) / block_size;
1598 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1599
1600 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1601 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1602 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1603 ops->run(cmd, cs);
1604
1605 src_va += width * block_size;
1606 dst_va += width * block_size;
1607 blocks -= width;
1608 }
1609 }
1610
1611 void
1612 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1613 VkBuffer srcBuffer,
1614 VkBuffer dstBuffer,
1615 uint32_t regionCount,
1616 const VkBufferCopy *pRegions)
1617 {
1618 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1619 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1620 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1621
1622 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1623 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1624
1625 for (unsigned i = 0; i < regionCount; ++i) {
1626 copy_buffer(cmd,
1627 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1628 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1629 pRegions[i].size, 1);
1630 }
1631 }
1632
1633 void
1634 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1635 VkBuffer dstBuffer,
1636 VkDeviceSize dstOffset,
1637 VkDeviceSize dataSize,
1638 const void *pData)
1639 {
1640 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1642
1643 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1644
1645 struct ts_cs_memory tmp;
1646 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1647 if (result != VK_SUCCESS) {
1648 cmd->record_result = result;
1649 return;
1650 }
1651
1652 memcpy(tmp.map, pData, dataSize);
1653 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1654 }
1655
1656 void
1657 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1658 VkBuffer dstBuffer,
1659 VkDeviceSize dstOffset,
1660 VkDeviceSize fillSize,
1661 uint32_t data)
1662 {
1663 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1664 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1665 const struct blit_ops *ops = &r2d_ops;
1666 struct tu_cs *cs = &cmd->cs;
1667
1668 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1669
1670 if (fillSize == VK_WHOLE_SIZE)
1671 fillSize = buffer->size - dstOffset;
1672
1673 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1674 uint32_t blocks = fillSize / 4;
1675
1676 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1677 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1678
1679 while (blocks) {
1680 uint32_t dst_x = (dst_va & 63) / 4;
1681 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1682
1683 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1684 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1685 ops->run(cmd, cs);
1686
1687 dst_va += width * 4;
1688 blocks -= width;
1689 }
1690 }
1691
1692 void
1693 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1694 VkImage srcImage,
1695 VkImageLayout srcImageLayout,
1696 VkImage dstImage,
1697 VkImageLayout dstImageLayout,
1698 uint32_t regionCount,
1699 const VkImageResolve *pRegions)
1700 {
1701 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1702 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1703 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1704 const struct blit_ops *ops = &r2d_ops;
1705 struct tu_cs *cs = &cmd->cs;
1706
1707 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1708 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1709
1710 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1711
1712 for (uint32_t i = 0; i < regionCount; ++i) {
1713 const VkImageResolve *info = &pRegions[i];
1714 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1715
1716 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1717 /* TODO: aspect masks possible ? */
1718
1719 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1720
1721 struct tu_image_view dst, src;
1722 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1723 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1724
1725 for (uint32_t i = 0; i < layers; i++) {
1726 ops->src(cmd, cs, &src, i, false);
1727 ops->dst(cs, &dst, i);
1728 ops->run(cmd, cs);
1729 }
1730 }
1731 }
1732
1733 void
1734 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1735 struct tu_cs *cs,
1736 struct tu_image_view *src,
1737 struct tu_image_view *dst,
1738 uint32_t layers,
1739 const VkRect2D *rect)
1740 {
1741 const struct blit_ops *ops = &r2d_ops;
1742
1743 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1744 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1745
1746 assert(src->image->vk_format == dst->image->vk_format);
1747
1748 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1749 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1750
1751 for (uint32_t i = 0; i < layers; i++) {
1752 ops->src(cmd, cs, src, i, false);
1753 ops->dst(cs, dst, i);
1754 ops->run(cmd, cs);
1755 }
1756 }
1757
1758 static void
1759 clear_image(struct tu_cmd_buffer *cmd,
1760 struct tu_image *image,
1761 const VkClearValue *clear_value,
1762 const VkImageSubresourceRange *range)
1763 {
1764 uint32_t level_count = tu_get_levelCount(image, range);
1765 uint32_t layer_count = tu_get_layerCount(image, range);
1766 struct tu_cs *cs = &cmd->cs;
1767 VkFormat format = image->vk_format;
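   /* E5B9G9R9 isn't renderable, so clear it as raw R32_UINT instead;
    * clear_value() below still gets the original format so it can pack the
    * rgb9e5 clear color into that single uint.
    */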
1768 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1769 format = VK_FORMAT_R32_UINT;
1770
1771 if (image->type == VK_IMAGE_TYPE_3D) {
1772 assert(layer_count == 1);
1773 assert(range->baseArrayLayer == 0);
1774 }
1775
1776 uint8_t mask = 0xf;
1777 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1778 mask = 0;
1779 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1780 mask |= 0x7;
1781 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1782 mask |= 0x8;
1783 }
1784
1785 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1786
1787 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1788 ops->clear_value(cs, image->vk_format, clear_value);
1789
1790 for (unsigned j = 0; j < level_count; j++) {
1791 if (image->type == VK_IMAGE_TYPE_3D)
1792 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1793
1794 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1795 u_minify(image->extent.width, range->baseMipLevel + j),
1796 u_minify(image->extent.height, range->baseMipLevel + j)
1797 });
1798
1799 struct tu_image_view dst;
1800 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1801 .aspectMask = range->aspectMask,
1802 .mipLevel = range->baseMipLevel + j,
1803 .baseArrayLayer = range->baseArrayLayer,
1804 .layerCount = 1,
1805 }, 0, false);
1806
1807 for (uint32_t i = 0; i < layer_count; i++) {
1808 ops->dst(cs, &dst, i);
1809 ops->run(cmd, cs);
1810 }
1811 }
1812 }
1813
1814 void
1815 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1816 VkImage image_h,
1817 VkImageLayout imageLayout,
1818 const VkClearColorValue *pColor,
1819 uint32_t rangeCount,
1820 const VkImageSubresourceRange *pRanges)
1821 {
1822 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1823 TU_FROM_HANDLE(tu_image, image, image_h);
1824
1825 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1826
1827 for (unsigned i = 0; i < rangeCount; i++)
1828 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1829 }
1830
1831 void
1832 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1833 VkImage image_h,
1834 VkImageLayout imageLayout,
1835 const VkClearDepthStencilValue *pDepthStencil,
1836 uint32_t rangeCount,
1837 const VkImageSubresourceRange *pRanges)
1838 {
1839 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1840 TU_FROM_HANDLE(tu_image, image, image_h);
1841
1842 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1843
1844 for (unsigned i = 0; i < rangeCount; i++)
1845 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1846 }
1847
1848 static void
1849 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1850 uint32_t attachment_count,
1851 const VkClearAttachment *attachments,
1852 uint32_t rect_count,
1853 const VkClearRect *rects)
1854 {
1855 const struct tu_subpass *subpass = cmd->state.subpass;
1856 /* note: cannot use the shader path here; there is a special shader path
1857 * in tu_clear_sysmem_attachments()
1858 */
1859 const struct blit_ops *ops = &r2d_ops;
1860 struct tu_cs *cs = &cmd->draw_cs;
1861
1862 for (uint32_t j = 0; j < attachment_count; j++) {
1863 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1864 * Pass Instance" says that:
1865 *
1866 * Unlike other clear commands, vkCmdClearAttachments executes as
1867 * a drawing command, rather than a transfer command, with writes
1868 * performed by it executing in rasterization order. Clears to
1869 * color attachments are executed as color attachment writes, by
1870 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1871 * Clears to depth/stencil attachments are executed as depth
1872 * writes and writes by the
1873 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1874 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1875 *
1876 * However, the 2d path here is executed the same way as a
1877 * transfer command, using the CCU color cache exclusively with
1878 * a special depth-as-color format for depth clears. This means that
1879 * we can't rely on the normal pipeline barrier mechanism here, and
1880 * have to manually flush whenever using a different cache domain
1881 * from what the 3d path would've used. This happens when we clear
1882 * depth/stencil, since normally depth attachments use CCU depth, but
1883 * we clear it using a special depth-as-color format. Since the clear
1884 * potentially uses a different attachment state we also need to
1885 * invalidate color beforehand and flush it afterwards.
1886 */
1887
1888 uint32_t a;
1889 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1890 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1891 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1892 } else {
1893 a = subpass->depth_stencil_attachment.attachment;
1894 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1895 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1896 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1897 }
1898
1899 if (a == VK_ATTACHMENT_UNUSED)
1900 continue;
1901
1902 uint8_t mask = 0xf;
1903 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1904 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1905 mask &= ~0x7;
1906 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1907 mask &= ~0x8;
1908 }
1909
1910 const struct tu_image_view *iview =
1911 cmd->state.framebuffer->attachments[a].attachment;
1912
1913 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1914 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1915
1916 /* Wait for the flushes we triggered manually to complete */
1917 tu_cs_emit_wfi(cs);
1918
1919 for (uint32_t i = 0; i < rect_count; i++) {
1920 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1921 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1922 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1923 ops->run(cmd, cs);
1924 }
1925 }
1926
1927 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1928 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1929 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1930 } else {
1931 /* sync color into depth */
1932 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1933 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1934 }
1935 }
1936 }
1937
1938 static void
1939 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1940 uint32_t attachment_count,
1941 const VkClearAttachment *attachments,
1942 uint32_t rect_count,
1943 const VkClearRect *rects)
1944 {
1945 /* the shader path here is special, it avoids changing MRT/etc state */
1946 const struct tu_render_pass *pass = cmd->state.pass;
1947 const struct tu_subpass *subpass = cmd->state.subpass;
1948 const uint32_t mrt_count = subpass->color_count;
1949 struct tu_cs *cs = &cmd->draw_cs;
1950 uint32_t clear_value[MAX_RTS][4];
1951 float z_clear_val = 0.0f;
1952 uint8_t s_clear_val = 0;
1953 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1954 bool z_clear = false;
1955 bool s_clear = false;
1956 uint32_t max_samples = 1;
1957
1958 for (uint32_t i = 0; i < attachment_count; i++) {
1959 uint32_t a;
1960 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1961 uint32_t c = attachments[i].colorAttachment;
1962 a = subpass->color_attachments[c].attachment;
1963 if (a == VK_ATTACHMENT_UNUSED)
1964 continue;
1965
1966 clear_rts |= 1 << c;
1967 clear_components |= 0xf << (c * 4);
1968 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1969 } else {
1970 a = subpass->depth_stencil_attachment.attachment;
1971 if (a == VK_ATTACHMENT_UNUSED)
1972 continue;
1973
1974 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1975 z_clear = true;
1976 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1977 }
1978
1979 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1980 s_clear = true;
1981 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1982 }
1983 }
1984
1985 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1986 }
1987
1988 /* prefer to use the 2D path for clears;
1989 * 2D can't clear separate depth/stencil or MSAA, and it needs a known framebuffer
1990 */
1991 if (max_samples == 1 && cmd->state.framebuffer) {
1992 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1993 return;
1994 }
1995
1996 /* TODO: this path doesn't take into account multilayer rendering */
1997
1998 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1999 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2000 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2001 0xfc000000);
2002 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2003
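   /* Map the clear shader's consecutive output registers onto only the MRTs
    * being cleared; MRTs that aren't cleared get no output register and are
    * additionally masked off via RB_MRT_CONTROL below.
    */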
2004 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2005 for (uint32_t i = 0; i < mrt_count; i++) {
2006 if (clear_rts & (1 << i))
2007 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2008 else
2009 tu_cs_emit(cs, 0);
2010 }
2011
2012 r3d_pipeline(cmd, cs, false, num_rts);
2013
2014 tu_cs_emit_regs(cs,
2015 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2016 tu_cs_emit_regs(cs,
2017 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2018
2019 tu_cs_emit_regs(cs,
2020 A6XX_RB_FS_OUTPUT_CNTL0(),
2021 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2022
2023 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2024 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2025 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2026 for (uint32_t i = 0; i < mrt_count; i++) {
2027 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2028 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2029 }
2030
2031 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2032 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2033 .z_enable = z_clear,
2034 .z_write_enable = z_clear,
2035 .zfunc = FUNC_ALWAYS));
2036 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2037 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2038 .stencil_enable = s_clear,
2039 .func = FUNC_ALWAYS,
2040 .zpass = STENCIL_REPLACE));
2041 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2042 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2043 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2044
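   /* Upload one vec4 of clear color per cleared RT as fragment shader
    * constants, in the same order the output registers were assigned above.
    */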
2045 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2046 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2047 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2048 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2049 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2050 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2051 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2052 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2053 for_each_bit(b, clear_rts)
2054 tu_cs_emit_array(cs, clear_value[b], 4);
2055
2056 for (uint32_t i = 0; i < rect_count; i++) {
2057 r3d_coords_raw(cs, (float[]) {
2058 rects[i].rect.offset.x, rects[i].rect.offset.y,
2059 z_clear_val, 1.0f,
2060 rects[i].rect.offset.x + rects[i].rect.extent.width,
2061 rects[i].rect.offset.y + rects[i].rect.extent.height,
2062 z_clear_val, 1.0f
2063 });
2064 r3d_run(cmd, cs);
2065 }
2066
2067 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2068 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2069 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2070 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2071 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2072 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2073 }
2074
2075 /**
2076 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2077 * for the component order: the components are always packed in WZYX order,
2078 * because gmem is tiled and tiled formats always have WZYX swap.
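 *
 * For example (assuming round-to-nearest-even for 0.5): clearing
 * VK_FORMAT_R8G8B8A8_UNORM with (1.0, 0.5, 0.0, 1.0) should pack to
 * buf[0] == 0xff0080ff, i.e. A in bits 31:24 down to R in bits 7:0.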
2079 */
2080 static void
2081 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2082 {
2083 const struct util_format_description *desc = vk_format_description(format);
2084
2085 switch (format) {
2086 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2087 buf[0] = float3_to_r11g11b10f(val->color.float32);
2088 return;
2089 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2090 buf[0] = float3_to_rgb9e5(val->color.float32);
2091 return;
2092 default:
2093 break;
2094 }
2095
2096 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2097
2098 /* S8_UINT is special and has no depth */
2099 const int max_components =
2100 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2101
2102 int buf_offset = 0;
2103 int bit_shift = 0;
2104 for (int comp = 0; comp < max_components; comp++) {
2105 const struct util_format_channel_description *ch =
2106 tu_get_format_channel_description(desc, comp);
2107 if (!ch) {
2108 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2109 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2110 continue;
2111 }
2112
2113 union tu_clear_component_value v = tu_get_clear_component_value(
2114 val, comp, desc->colorspace);
2115
2116 /* move to the next uint32_t when there is not enough space */
2117 assert(ch->size <= 32);
2118 if (bit_shift + ch->size > 32) {
2119 buf_offset++;
2120 bit_shift = 0;
2121 }
2122
2123 if (bit_shift == 0)
2124 buf[buf_offset] = 0;
2125
2126 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2127 bit_shift += ch->size;
2128 }
2129 }
2130
2131 static void
2132 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2133 struct tu_cs *cs,
2134 uint32_t attachment,
2135 uint8_t component_mask,
2136 const VkClearValue *value)
2137 {
2138 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2139 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2140 * because D24S8 is cleared with AS_R8G8B8A8 format
2141 */
2142
2143 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2144 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2145
2146 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2147 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2148
2149 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2150 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2151
2152 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2153 tu_cs_emit(cs, 0);
2154
2155 uint32_t clear_vals[4] = {};
2156 pack_gmem_clear_value(value, vk_format, clear_vals);
2157
2158 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2159 tu_cs_emit_array(cs, clear_vals, 4);
2160
2161 tu6_emit_event_write(cmd, cs, BLIT);
2162 }
2163
2164 static void
2165 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2166 uint32_t attachment_count,
2167 const VkClearAttachment *attachments,
2168 uint32_t rect_count,
2169 const VkClearRect *rects)
2170 {
2171 const struct tu_subpass *subpass = cmd->state.subpass;
2172 struct tu_cs *cs = &cmd->draw_cs;
2173
2174 /* TODO: swap the loops for smaller cmdstream */
2175 for (unsigned i = 0; i < rect_count; i++) {
2176 unsigned x1 = rects[i].rect.offset.x;
2177 unsigned y1 = rects[i].rect.offset.y;
2178 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2179 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2180
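      /* The blit scissor is in whole-framebuffer coordinates; this command
       * stream is replayed for every tile, so only the part of the rect that
       * falls inside the current tile actually gets cleared by the BLIT event.
       */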
2181 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2182 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2183 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2184
2185 for (unsigned j = 0; j < attachment_count; j++) {
2186 uint32_t a;
2187 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2188 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2189 else
2190 a = subpass->depth_stencil_attachment.attachment;
2191
2192 if (a == VK_ATTACHMENT_UNUSED)
2193 continue;
2194
2195 unsigned clear_mask = 0xf;
2196 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2197 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2198 clear_mask &= ~0x7;
2199 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2200 clear_mask &= ~0x8;
2201 }
2202
2203 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2204 &attachments[j].clearValue);
2205 }
2206 }
2207 }
2208
2209 void
2210 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2211 uint32_t attachmentCount,
2212 const VkClearAttachment *pAttachments,
2213 uint32_t rectCount,
2214 const VkClearRect *pRects)
2215 {
2216 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2217 struct tu_cs *cs = &cmd->draw_cs;
2218
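   /* Whether the render pass will execute in GMEM or sysmem mode isn't known
    * at record time, so emit both clear variants under conditional execution
    * and let the CP skip the one that doesn't apply.
    */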
2219 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2220 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2221 tu_cond_exec_end(cs);
2222
2223 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2224 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2225 tu_cond_exec_end(cs);
2226 }
2227
2228 void
2229 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2230 struct tu_cs *cs,
2231 uint32_t a,
2232 const VkRenderPassBeginInfo *info)
2233 {
2234 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2235 const struct tu_image_view *iview = fb->attachments[a].attachment;
2236 const struct tu_render_pass_attachment *attachment =
2237 &cmd->state.pass->attachments[a];
2238 uint8_t mask = 0;
2239
2240 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2241 mask = 0xf;
2242 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2243 mask |= 0x7;
2244 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2245 mask |= 0x8;
2246
2247 if (!mask)
2248 return;
2249
2250 const struct blit_ops *ops = &r2d_ops;
2251 if (attachment->samples > 1)
2252 ops = &r3d_ops;
2253
2254 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2255 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2256 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2257
2258 /* Wait for any flushes at the beginning of the renderpass to complete */
2259 tu_cs_emit_wfi(cs);
2260
2261 for (uint32_t i = 0; i < fb->layers; i++) {
2262 ops->dst(cs, iview, i);
2263 ops->run(cmd, cs);
2264 }
2265
2266 /* The spec doesn't explicitly say, but presumably the initial renderpass
2267 * clear is considered part of the renderpass, and therefore barriers
2268 * aren't required inside the subpass/renderpass. Therefore we need to
2269 * flush CCU color into CCU depth here, just like with
2270 * vkCmdClearAttachments(). Note that because this only happens at the
2271 * beginning of a renderpass, and renderpass writes are considered
2272 * "incoherent", we shouldn't have to worry about syncing depth into color
2273 * beforehand as depth should already be flushed.
2274 */
2275 if (vk_format_is_depth_or_stencil(attachment->format)) {
2276 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2277 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2278 } else {
2279 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2280 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2281 }
2282 }
2283
2284 void
2285 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2286 struct tu_cs *cs,
2287 uint32_t a,
2288 const VkRenderPassBeginInfo *info)
2289 {
2290 const struct tu_render_pass_attachment *attachment =
2291 &cmd->state.pass->attachments[a];
2292 unsigned clear_mask = 0;
2293
2294 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2295 clear_mask = 0xf;
2296 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2297 clear_mask |= 0x7;
2298 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2299 clear_mask |= 0x8;
2300
2301 if (!clear_mask)
2302 return;
2303
2304 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2305
2306 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2307 &info->pClearValues[a]);
2308 }
2309
2310 static void
2311 tu_emit_blit(struct tu_cmd_buffer *cmd,
2312 struct tu_cs *cs,
2313 const struct tu_image_view *iview,
2314 const struct tu_render_pass_attachment *attachment,
2315 bool resolve)
2316 {
2317 tu_cs_emit_regs(cs,
2318 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2319
2320 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2321 .unk0 = !resolve,
2322 .gmem = !resolve,
2323 /* "integer" bit disables msaa resolve averaging */
2324 .integer = vk_format_is_int(attachment->format)));
2325
2326 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2327 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2328 tu_cs_image_ref_2d(cs, iview, 0, false);
2329
2330 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2331 tu_cs_image_flag_ref(cs, iview, 0);
2332
2333 tu_cs_emit_regs(cs,
2334 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2335
2336 tu6_emit_event_write(cmd, cs, BLIT);
2337 }
2338
2339 static bool
2340 blit_can_resolve(VkFormat format)
2341 {
2342 const struct util_format_description *desc = vk_format_description(format);
2343
2344 /* blit event can only do resolve for simple cases:
2345 * averaging samples as unsigned integers or choosing only one sample
2346 */
2347 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2348 return false;
2349
2350 /* can't do formats with larger channel sizes
2351 * note: this includes all float formats
2352 * note2: single channel integer formats seem OK
2353 */
2354 if (desc->channel[0].size > 10)
2355 return false;
2356
2357 switch (format) {
2358 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2359 * likely related to these formats having different layout from other cpp=2 formats
2360 */
2361 case VK_FORMAT_R8G8_UNORM:
2362 case VK_FORMAT_R8G8_UINT:
2363 case VK_FORMAT_R8G8_SINT:
2364 /* TODO: this one should be able to work? */
2365 case VK_FORMAT_D24_UNORM_S8_UINT:
2366 return false;
2367 default:
2368 break;
2369 }
2370
2371 return true;
2372 }
2373
2374 void
2375 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2376 struct tu_cs *cs,
2377 uint32_t a,
2378 bool force_load)
2379 {
2380 const struct tu_image_view *iview =
2381 cmd->state.framebuffer->attachments[a].attachment;
2382 const struct tu_render_pass_attachment *attachment =
2383 &cmd->state.pass->attachments[a];
2384
2385 if (attachment->load || force_load)
2386 tu_emit_blit(cmd, cs, iview, attachment, false);
2387 }
2388
2389 void
2390 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2391 struct tu_cs *cs,
2392 uint32_t a,
2393 uint32_t gmem_a)
2394 {
2395 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2396 const VkRect2D *render_area = &tiling->render_area;
2397 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2398 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2399 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2400
2401 if (!dst->store)
2402 return;
2403
2404 uint32_t x1 = render_area->offset.x;
2405 uint32_t y1 = render_area->offset.y;
2406 uint32_t x2 = x1 + render_area->extent.width;
2407 uint32_t y2 = y1 + render_area->extent.height;
2408 /* x2/y2 can be unaligned if equal to the size of the image, since the
2409 * store will then write into padding space. The one exception is linear
2410 * levels, which don't have the required y padding in the layout (except
2411 * for the last level).
2412 */
2413 bool need_y2_align =
2414 y2 != iview->extent.height || iview->need_y2_align;
2415
2416 bool unaligned =
2417 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2418 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2419
2420 /* use fast path when render area is aligned, except for unsupported resolve cases */
2421 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2422 tu_emit_blit(cmd, cs, iview, src, true);
2423 return;
2424 }
2425
2426 if (dst->samples > 1) {
2427 /* We probably need to use the shader path in this case;
2428 * a testcase which fails because of this is needed to confirm.
2429 */
2430 tu_finishme("unaligned store of msaa attachment\n");
2431 return;
2432 }
2433
2434 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2435 r2d_dst(cs, iview, 0);
2436 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2437
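   /* Source the blit directly from GMEM: the attachment's tile contents are
    * described as a TILE6_2-tiled surface at gmem_base + gmem_offset, with a
    * pitch of one GMEM tile row.
    */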
2438 tu_cs_emit_regs(cs,
2439 A6XX_SP_PS_2D_SRC_INFO(
2440 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2441 .tile_mode = TILE6_2,
2442 .srgb = vk_format_is_srgb(src->format),
2443 .samples = tu_msaa_samples(src->samples),
2444 .samples_average = !vk_format_is_int(src->format),
2445 .unk20 = 1,
2446 .unk22 = 1),
2447 /* note: src size does not matter when not scaling */
2448 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2449 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2450 A6XX_SP_PS_2D_SRC_HI(),
2451 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2452
2453 /* sync GMEM writes with CACHE, so the CP_BLIT below sees the just-rendered GMEM contents */
2454 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2455
2456 /* Wait for CACHE_INVALIDATE to land */
2457 tu_cs_emit_wfi(cs);
2458
2459 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2460 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2461
2462 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2463 * sysmem, and we generally assume that GMEM renderpasses leave their
2464 * results in sysmem, so we need to flush manually here.
2465 */
2466 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2467 }