turnip: fix RENDER_COMPONENTS value
[mesa.git] src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
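/* These pack a single clear-value component into the low 'bits' bits of a
 * uint32_t, clamping to the representable range (round-to-nearest-even for
 * normalized formats, truncation for scaled ones).
 */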
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
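      /* fall through: alpha is not sRGB-encoded, so the default case below reuses the raw bits */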
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
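/* The r2d_* functions below drive the dedicated 2D blitter (CP_BLIT with
 * BLIT_OP_SCALE); the r3d_* functions further down implement the same
 * operations by drawing a rectlist with a small hand-assembled shader.
 * Both paths are exposed through struct blit_ops so each blit/copy/clear
 * can pick whichever path handles its case.
 */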
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
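/* successive bytes of the 24-bit depth value go to the r/g/b channels and
 the stencil value goes to alpha (presumably only the low 8 bits of each
 entry are consumed as R2D_UNORM8 values) */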
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
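/* note: the 0x00084001/0x08000041 values above are presumably mask-like
 controls matching what the blob driver programs for stencil-only and
 depth-only writes; their exact meaning is unknown (hence the UNKNOWN_8C01
 register name) */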
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
472
473 /* TODO: flushing with barriers instead of blindly always flushing */
474 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
475 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
476 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
477 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
478 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
479
480 tu_cs_emit_wfi(cs);
481 tu_cs_emit_regs(cs,
482 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
483
484 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
485 }
486
487 static void
488 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
489 {
490 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
491 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
492
493 /* TODO: flushing with barriers instead of blindly always flushing */
494 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
495 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
496 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
497 }
498
499 /* r3d_ = shader path operations */
500
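/* Emits a complete pipeline for the shader path: a hand-assembled VS that
 * selects between the two corner coordinates passed in as constants
 * (indexed by the vertex id), and an FS that either copies num_rts constant
 * colors to the render targets (clear) or samples the source texture via
 * the prefetch mechanism (blit).
 */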
501 static void
502 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
503 {
504 static const instr_t vs_code[] = {
505 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
506 * r1.xy = r0.w ? c1.zw : c0.zw
507 * r0.w = 1.0f
508 */
509 { .cat3 = {
510 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
511 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
512 .src2 = 3,
513 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
514 } },
515 { .cat3 = {
516 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
517 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
518 .src2 = 3,
519 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
520 } },
521 { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
522 .src_im = 1, .fim_val = 1.0f } },
523 { .cat0 = { .opc = OPC_END } },
524 };
525 #define FS_OFFSET (16 * sizeof(instr_t))
526 STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
527
528 /* vs inputs: only vtx id in r0.w */
529 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 7);
530 tu_cs_emit(cs, 0x00000000);
531 tu_cs_emit(cs, 0xfcfcfc00 | A6XX_VFD_CONTROL_1_REGID4VTX(3));
532 tu_cs_emit(cs, 0x0000fcfc);
533 tu_cs_emit(cs, 0xfcfcfcfc);
534 tu_cs_emit(cs, 0x000000fc);
535 tu_cs_emit(cs, 0x0000fcfc);
536 tu_cs_emit(cs, 0x00000000);
537
538 /* vs outputs: position in r0.xyzw, blit coords in r1.xy */
539 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
540 tu_cs_emit(cs, blit ? 0xffffffcf : 0xffffffff);
541 tu_cs_emit(cs, 0xffffffff);
542 tu_cs_emit(cs, 0xffffffff);
543 tu_cs_emit(cs, 0xffffffff);
544
545 tu_cs_emit_regs(cs, A6XX_SP_VS_OUT_REG(0,
546 .a_regid = 0, .a_compmask = 0xf,
547 .b_regid = 4, .b_compmask = 0x3));
548 tu_cs_emit_regs(cs, A6XX_SP_VS_VPC_DST_REG(0, .outloc0 = 0, .outloc1 = 4));
549
550 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
551 tu_cs_emit(cs, 0xff00ff00 |
552 COND(blit, A6XX_VPC_CNTL_0_VARYING) |
553 A6XX_VPC_CNTL_0_NUMNONPOSVAR(blit ? 8 : 0));
554
555 tu_cs_emit_regs(cs, A6XX_VPC_PACK(
556 .positionloc = 0,
557 .psizeloc = 0xff,
558 .stride_in_vpc = blit ? 6 : 4));
559 tu_cs_emit_regs(cs, A6XX_SP_PRIMITIVE_CNTL(.vsout = blit ? 2 : 1));
560 tu_cs_emit_regs(cs,
561 A6XX_PC_PRIMITIVE_CNTL_0(),
562 A6XX_PC_PRIMITIVE_CNTL_1(.stride_in_vpc = blit ? 6 : 4));
563
564
565 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
566 tu_cs_emit(cs, blit ? 0xe000 : 0); // I think this can just be 0
567 for (uint32_t i = 1; i < 8; i++)
568 tu_cs_emit(cs, 0);
569
570 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
571 for (uint32_t i = 0; i < 8; i++)
572 tu_cs_emit(cs, 0x99999999);
573
574 /* fs inputs: none, prefetch in blit case */
575 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + blit);
576 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(blit) |
577 A6XX_SP_FS_PREFETCH_CNTL_UNK4(0xfc) |
578 0x7000);
579 if (blit) {
580 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(4) |
581 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(0) |
582 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(0) |
583 A6XX_SP_FS_PREFETCH_CMD_DST(0) |
584 A6XX_SP_FS_PREFETCH_CMD_WRMASK(0xf) |
585 A6XX_SP_FS_PREFETCH_CMD_CMD(0x4));
586 }
587
588 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
589 tu_cs_emit(cs, 0x3); // XXX blob uses 3 in blit path
590 tu_cs_emit(cs, 0xfcfcfcfc);
591 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(blit ? 0 : 0xfc) |
592 A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(0xfc) |
593 0xfc00fc00);
594 tu_cs_emit(cs, 0xfcfcfcfc);
595 tu_cs_emit(cs, 0xfcfc);
596
597 tu_cs_emit_regs(cs, A6XX_HLSQ_UNKNOWN_B980(blit ? 3 : 1));
598 tu_cs_emit_regs(cs, A6XX_GRAS_CNTL(.varying = blit));
599 tu_cs_emit_regs(cs,
600 A6XX_RB_RENDER_CONTROL0(.varying = blit, .unk10 = blit),
601 A6XX_RB_RENDER_CONTROL1());
602
603 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_CNTL());
604 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8101());
605 tu_cs_emit_regs(cs, A6XX_GRAS_SAMPLE_CNTL());
606
607 /* shaders */
608 struct ts_cs_memory shaders = { };
609 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders);
610 assert(result == VK_SUCCESS);
611
612 memcpy(shaders.map, vs_code, sizeof(vs_code));
613
614 instr_t *fs = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
615 for (uint32_t i = 0; i < num_rts; i++) {
616 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
617 fs[i] = (instr_t) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
618 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 } };
619 }
620 fs[num_rts] = (instr_t) { .cat0 = { .opc = OPC_END } };
621 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
622
623 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
624 tu_cs_emit_regs(cs,
625 A6XX_HLSQ_VS_CNTL(.constlen = 8, .enabled = true),
626 A6XX_HLSQ_HS_CNTL(),
627 A6XX_HLSQ_DS_CNTL(),
628 A6XX_HLSQ_GS_CNTL());
629 tu_cs_emit_regs(cs, A6XX_HLSQ_FS_CNTL(.constlen = 4 * num_rts, .enabled = true));
630
631 tu_cs_emit_regs(cs,
632 A6XX_SP_VS_CONFIG(.enabled = true),
633 A6XX_SP_VS_INSTRLEN(1));
634 tu_cs_emit_regs(cs, A6XX_SP_HS_CONFIG());
635 tu_cs_emit_regs(cs, A6XX_SP_DS_CONFIG());
636 tu_cs_emit_regs(cs, A6XX_SP_GS_CONFIG());
637 tu_cs_emit_regs(cs,
638 A6XX_SP_FS_CONFIG(.enabled = true, .ntex = blit, .nsamp = blit),
639 A6XX_SP_FS_INSTRLEN(1));
640
641 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
642 .threadsize = FOUR_QUADS,
643 .fullregfootprint = 2,
644 .mergedregs = true));
645 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
646 .varying = blit,
647 .threadsize = FOUR_QUADS,
648 /* could this be 0 in !blit && !num_rts case ? */
649 .fullregfootprint = MAX2(1, num_rts),
650 .mergedregs = true)); /* note: tu_pipeline also sets 0x1000000 bit */
651
652 tu_cs_emit_regs(cs, A6XX_SP_IBO_COUNT(0));
653
654 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3);
655 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
656 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
657 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
658 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
659 CP_LOAD_STATE6_0_NUM_UNIT(1));
660 tu_cs_emit_qw(cs, shaders.iova);
661
662 tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OBJ_START_LO, 2);
663 tu_cs_emit_qw(cs, shaders.iova);
664
665 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
666 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
667 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
668 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
669 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
670 CP_LOAD_STATE6_0_NUM_UNIT(1));
671 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
672
673 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OBJ_START_LO, 2);
674 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
675
676 tu_cs_emit_regs(cs,
677 A6XX_GRAS_CL_CNTL(
678 .persp_division_disable = 1,
679 .vp_xform_disable = 1,
680 .vp_clip_code_ignore = 1,
681 .clip_disable = 1),
682 A6XX_GRAS_UNKNOWN_8001(0));
683 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
684
685 tu_cs_emit_regs(cs,
686 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
687 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
688 tu_cs_emit_regs(cs,
689 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
690 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
691 }
692
693 static void
694 r3d_coords_raw(struct tu_cs *cs, const float *coords)
695 {
696 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
697 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
698 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
699 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
700 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
701 CP_LOAD_STATE6_0_NUM_UNIT(2));
702 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
703 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
704 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
705 }
706
707 static void
708 r3d_coords(struct tu_cs *cs,
709 const VkOffset2D *dst,
710 const VkOffset2D *src,
711 const VkExtent2D *extent)
712 {
713 int32_t src_x1 = src ? src->x : 0;
714 int32_t src_y1 = src ? src->y : 0;
715 r3d_coords_raw(cs, (float[]) {
716 dst->x, dst->y,
717 src_x1, src_y1,
718 dst->x + extent->width, dst->y + extent->height,
719 src_x1 + extent->width, src_y1 + extent->height,
720 });
721 }
722
723 static void
724 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
725 {
726 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
727 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
728 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
729 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
730 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
731 CP_LOAD_STATE6_0_NUM_UNIT(1));
732 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
733 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
734 switch (format) {
735 case VK_FORMAT_X8_D24_UNORM_PACK32:
736 case VK_FORMAT_D24_UNORM_S8_UINT: {
737 /* cleared as r8g8b8a8_unorm using special format */
738 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
739 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
740 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
741 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
742 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
743 } break;
744 case VK_FORMAT_D16_UNORM:
745 case VK_FORMAT_D32_SFLOAT:
746 tu_cs_emit(cs, fui(val->depthStencil.depth));
747 tu_cs_emit(cs, 0);
748 tu_cs_emit(cs, 0);
749 tu_cs_emit(cs, 0);
750 break;
751 case VK_FORMAT_S8_UINT:
752 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
753 tu_cs_emit(cs, 0);
754 tu_cs_emit(cs, 0);
755 tu_cs_emit(cs, 0);
756 break;
757 default:
758 /* for color formats, use the clear value as-is */
759 assert(!vk_format_is_depth_or_stencil(format));
760 tu_cs_emit_array(cs, val->color.uint32, 4);
761 break;
762 }
763 }
764
765 static void
766 r3d_src_common(struct tu_cmd_buffer *cmd,
767 struct tu_cs *cs,
768 const uint32_t *tex_const,
769 uint32_t offset_base,
770 uint32_t offset_ubwc,
771 bool linear_filter)
772 {
773 struct ts_cs_memory texture = { };
774 VkResult result = tu_cs_alloc(&cmd->sub_cs,
775 2, /* allocate space for a sampler too */
776 A6XX_TEX_CONST_DWORDS, &texture);
777 assert(result == VK_SUCCESS);
778
779 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
780
781 /* patch addresses for layer offset */
782 *(uint64_t*) (texture.map + 4) += offset_base;
783 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
784 texture.map[7] = ubwc_addr;
785 texture.map[8] = ubwc_addr >> 32;
786
787 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
788 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
789 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
790 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
791 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
792 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
793 0x60000; /* XXX used by blob, doesn't seem necessary */
794 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
795 0x1 | /* XXX used by blob, doesn't seem necessary */
796 A6XX_TEX_SAMP_1_UNNORM_COORDS |
797 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
798 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
799 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
800
801 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
802 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
803 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
804 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
805 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
806 CP_LOAD_STATE6_0_NUM_UNIT(1));
807 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
808
809 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
810 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
811
812 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
813 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
814 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
815 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
816 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
817 CP_LOAD_STATE6_0_NUM_UNIT(1));
818 tu_cs_emit_qw(cs, texture.iova);
819
820 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
821 tu_cs_emit_qw(cs, texture.iova);
822
823 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
824 }
825
826 static void
827 r3d_src(struct tu_cmd_buffer *cmd,
828 struct tu_cs *cs,
829 const struct tu_image_view *iview,
830 uint32_t layer,
831 bool linear_filter)
832 {
833 r3d_src_common(cmd, cs, iview->descriptor,
834 iview->layer_size * layer,
835 iview->ubwc_layer_size * layer,
836 linear_filter);
837 }
838
839 static void
840 r3d_src_buffer(struct tu_cmd_buffer *cmd,
841 struct tu_cs *cs,
842 VkFormat vk_format,
843 uint64_t va, uint32_t pitch,
844 uint32_t width, uint32_t height)
845 {
846 uint32_t desc[A6XX_TEX_CONST_DWORDS];
847
848 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
849
850 desc[0] =
851 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
852 A6XX_TEX_CONST_0_FMT(format.fmt) |
853 A6XX_TEX_CONST_0_SWAP(format.swap) |
854 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
855 // XXX to swizzle into .w for stencil buffer_to_image
856 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
857 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
858 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
859 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
860 desc[2] =
861 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
862 A6XX_TEX_CONST_2_PITCH(pitch) |
863 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
864 desc[3] = 0;
865 desc[4] = va;
866 desc[5] = va >> 32;
867 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
868 desc[i] = 0;
869
870 r3d_src_common(cmd, cs, desc, 0, 0, false);
871 }
872
873 static void
874 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
875 {
876 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
877
878 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
879 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
880 tu_cs_image_ref(cs, iview, layer);
881 tu_cs_emit(cs, 0);
882
883 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
884 tu_cs_image_flag_ref(cs, iview, layer);
885
886 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
887 }
888
889 static void
890 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
891 {
892 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
893
894 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
895
896 tu_cs_emit_regs(cs,
897 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
898 A6XX_RB_MRT_PITCH(0, pitch),
899 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
900 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
901 A6XX_RB_MRT_BASE_HI(0, va >> 32),
902 A6XX_RB_MRT_BASE_GMEM(0, 0));
903
904 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
905 }
906
907 static void
908 r3d_setup(struct tu_cmd_buffer *cmd,
909 struct tu_cs *cs,
910 VkFormat vk_format,
911 enum a6xx_rotation rotation,
912 bool clear,
913 uint8_t mask)
914 {
915 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
916
917 if (!cmd->state.pass) {
918 /* TODO: flushing with barriers instead of blindly always flushing */
919 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
920 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
921 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
922 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
923 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
924
925 tu_cs_emit_regs(cs,
926 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
927
928 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
929 }
930 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
931 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
932
933 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
934
935 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
936 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
937 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
938 0xfc000000);
939 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
940
941 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
942 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
943
944 tu_cs_emit_regs(cs,
945 A6XX_RB_FS_OUTPUT_CNTL0(),
946 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
947
948 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
949 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
950 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
951
952 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
953 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
954 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
955 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
956 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
957 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
958 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
959
960 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
961 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
962
963 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
964 .color_format = tu6_base_format(vk_format),
965 .color_sint = vk_format_is_sint(vk_format),
966 .color_uint = vk_format_is_uint(vk_format)));
967
968 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
969 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
970 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
971 }
972
973 static void
974 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
975 {
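/* a single 2-vertex RECTLIST draw covers the rectangle set up by r3d_coords */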
976 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
977 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
978 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
979 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
980 tu_cs_emit(cs, 1); /* instance count */
981 tu_cs_emit(cs, 2); /* vertex count */
982
983 if (!cmd->state.pass) {
984 /* TODO: flushing with barriers instead of blindly always flushing */
985 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
986 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
987 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
988 }
989 }
990
991 /* blit ops - common interface for 2d/shader paths */
992
993 struct blit_ops {
994 void (*coords)(struct tu_cs *cs,
995 const VkOffset2D *dst,
996 const VkOffset2D *src,
997 const VkExtent2D *extent);
998 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
999 void (*src)(
1000 struct tu_cmd_buffer *cmd,
1001 struct tu_cs *cs,
1002 const struct tu_image_view *iview,
1003 uint32_t layer,
1004 bool linear_filter);
1005 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1006 VkFormat vk_format,
1007 uint64_t va, uint32_t pitch,
1008 uint32_t width, uint32_t height);
1009 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1010 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1011 void (*setup)(struct tu_cmd_buffer *cmd,
1012 struct tu_cs *cs,
1013 VkFormat vk_format,
1014 enum a6xx_rotation rotation,
1015 bool clear,
1016 uint8_t mask);
1017 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1018 };
1019
1020 static const struct blit_ops r2d_ops = {
1021 .coords = r2d_coords,
1022 .clear_value = r2d_clear_value,
1023 .src = r2d_src,
1024 .src_buffer = r2d_src_buffer,
1025 .dst = r2d_dst,
1026 .dst_buffer = r2d_dst_buffer,
1027 .setup = r2d_setup,
1028 .run = r2d_run,
1029 };
1030
1031 static const struct blit_ops r3d_ops = {
1032 .coords = r3d_coords,
1033 .clear_value = r3d_clear_value,
1034 .src = r3d_src,
1035 .src_buffer = r3d_src_buffer,
1036 .dst = r3d_dst,
1037 .dst_buffer = r3d_dst_buffer,
1038 .setup = r3d_setup,
1039 .run = r3d_run,
1040 };
1041
1042 /* pass-through helper: set 2D coords from 3D offsets/extent */
1043 static void
1044 coords(const struct blit_ops *ops,
1045 struct tu_cs *cs,
1046 const VkOffset3D *dst,
1047 const VkOffset3D *src,
1048 const VkExtent3D *extent)
1049 {
1050 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1051 }
1052
1053 static void
1054 tu_image_view_blit2(struct tu_image_view *iview,
1055 struct tu_image *image,
1056 VkFormat format,
1057 const VkImageSubresourceLayers *subres,
1058 uint32_t layer,
1059 bool stencil_read)
1060 {
1061 VkImageAspectFlags aspect_mask = subres->aspectMask;
1062
1063 /* always use the AS_R8G8B8A8 format for these */
1064 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1065 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1066 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1067 }
1068
1069 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1070 .image = tu_image_to_handle(image),
1071 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1072 .format = format,
1073 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1074 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1075 .subresourceRange = {
1076 .aspectMask = aspect_mask,
1077 .baseMipLevel = subres->mipLevel,
1078 .levelCount = 1,
1079 .baseArrayLayer = subres->baseArrayLayer + layer,
1080 .layerCount = 1,
1081 },
1082 });
1083 }
1084
1085 static void
1086 tu_image_view_blit(struct tu_image_view *iview,
1087 struct tu_image *image,
1088 const VkImageSubresourceLayers *subres,
1089 uint32_t layer)
1090 {
1091 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1092 }
1093
1094 static void
1095 tu6_blit_image(struct tu_cmd_buffer *cmd,
1096 struct tu_image *src_image,
1097 struct tu_image *dst_image,
1098 const VkImageBlit *info,
1099 VkFilter filter)
1100 {
1101 const struct blit_ops *ops = &r2d_ops;
1102 struct tu_cs *cs = &cmd->cs;
1103 uint32_t layers;
1104
1105 /* the 2D blit path can't mirror from coordinates alone, so mirroring is expressed with the rotate/flip modes */
1106 static const enum a6xx_rotation rotate[2][2] = {
1107 {ROTATE_0, ROTATE_HFLIP},
1108 {ROTATE_VFLIP, ROTATE_180},
1109 };
1110
1111 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1112 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1113 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1114 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1115 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1116 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1117
1118 if (mirror_z) {
1119 tu_finishme("blit z mirror\n");
1120 return;
1121 }
1122
1123 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1124 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1125 tu_finishme("blit z filter\n");
1126 return;
1127 }
1128
1129 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1130 if (info->dstSubresource.layerCount > 1) {
1131 assert(layers <= 1);
1132 layers = info->dstSubresource.layerCount;
1133 }
1134
1135 uint8_t mask = 0xf;
1136 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1137 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1138 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1139 mask = 0x7;
1140 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1141 mask = 0x8;
1142 }
1143
1144 /* BC1_RGB_* formats need to have their last component overridden with 1
1145 * when sampling, which is normally handled with the texture descriptor
1146 * swizzle. The 2d path can't handle that, so use the 3d path.
1147 *
1148 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1149 * the 2d path.
1150 */
1151
1152 if (dst_image->samples > 1 ||
1153 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1154 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1155 ops = &r3d_ops;
1156
1157 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1158 * figure out why (should be able to pass all tests with only shader path)
1159 */
1160
1161 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1162
1163 if (ops == &r3d_ops) {
1164 r3d_coords_raw(cs, (float[]) {
1165 info->dstOffsets[0].x, info->dstOffsets[0].y,
1166 info->srcOffsets[0].x, info->srcOffsets[0].y,
1167 info->dstOffsets[1].x, info->dstOffsets[1].y,
1168 info->srcOffsets[1].x, info->srcOffsets[1].y
1169 });
1170 } else {
1171 tu_cs_emit_regs(cs,
1172 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1173 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1174 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1175 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1176 tu_cs_emit_regs(cs,
1177 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1178 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1179 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1180 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1181 }
1182
1183 struct tu_image_view dst, src;
1184 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1185 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1186
1187 for (uint32_t i = 0; i < layers; i++) {
1188 ops->dst(cs, &dst, i);
1189 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1190 ops->run(cmd, cs);
1191 }
1192 }
1193
1194 void
1195 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1196 VkImage srcImage,
1197 VkImageLayout srcImageLayout,
1198 VkImage dstImage,
1199 VkImageLayout dstImageLayout,
1200 uint32_t regionCount,
1201 const VkImageBlit *pRegions,
1202 VkFilter filter)
1203
1204 {
1205 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1206 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1207 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1208
1209 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1210 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1211
1212 for (uint32_t i = 0; i < regionCount; ++i)
1213 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1214 }
1215
1216 static VkFormat
1217 copy_format(VkFormat format)
1218 {
1219 switch (vk_format_get_blocksizebits(format)) {
1220 case 8: return VK_FORMAT_R8_UINT;
1221 case 16: return VK_FORMAT_R16_UINT;
1222 case 32: return VK_FORMAT_R32_UINT;
1223 case 64: return VK_FORMAT_R32G32_UINT;
1224 case 96: return VK_FORMAT_R32G32B32_UINT;
1225 case 128:return VK_FORMAT_R32G32B32A32_UINT;
1226 default:
1227 unreachable("unhandled format size");
1228 }
1229 }
1230
1231 static void
1232 copy_compressed(VkFormat format,
1233 VkOffset3D *offset,
1234 VkExtent3D *extent,
1235 uint32_t *width,
1236 uint32_t *height)
1237 {
1238 if (!vk_format_is_compressed(format))
1239 return;
1240
1241 uint32_t block_width = vk_format_get_blockwidth(format);
1242 uint32_t block_height = vk_format_get_blockheight(format);
1243
1244 offset->x /= block_width;
1245 offset->y /= block_height;
1246
1247 if (extent) {
1248 extent->width = DIV_ROUND_UP(extent->width, block_width);
1249 extent->height = DIV_ROUND_UP(extent->height, block_height);
1250 }
1251 if (width)
1252 *width = DIV_ROUND_UP(*width, block_width);
1253 if (height)
1254 *height = DIV_ROUND_UP(*height, block_height);
1255 }
1256
1257 static void
1258 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1259 struct tu_buffer *src_buffer,
1260 struct tu_image *dst_image,
1261 const VkBufferImageCopy *info)
1262 {
1263 struct tu_cs *cs = &cmd->cs;
1264 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1265 VkFormat dst_format = dst_image->vk_format;
1266 VkFormat src_format = dst_image->vk_format;
1267 const struct blit_ops *ops = &r2d_ops;
1268
1269 uint8_t mask = 0xf;
1270
1271 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1272 switch (info->imageSubresource.aspectMask) {
1273 case VK_IMAGE_ASPECT_STENCIL_BIT:
1274 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1275 mask = 0x8;
1276 ops = &r3d_ops;
1277 break;
1278 case VK_IMAGE_ASPECT_DEPTH_BIT:
1279 mask = 0x7;
1280 break;
1281 }
1282 }
1283
1284 VkOffset3D offset = info->imageOffset;
1285 VkExtent3D extent = info->imageExtent;
1286 uint32_t src_width = info->bufferRowLength ?: extent.width;
1287 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1288
1289 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1290 assert(src_format == dst_format);
1291 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1292 src_format = dst_format = copy_format(dst_format);
1293 }
1294
1295 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1296 uint32_t layer_size = src_height * pitch;
1297
1298 /* note: the 64-byte src_va/pitch alignment is a 2D engine requirement,
1299 * but it also holds for 1cpp formats with the shader path (stencil aspect path)
1300 */
1301
1302 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1303
1304 struct tu_image_view dst;
1305 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1306
1307 for (uint32_t i = 0; i < layers; i++) {
1308 ops->dst(cs, &dst, i);
1309
1310 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1311 if ((src_va & 63) || (pitch & 63)) {
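/* unaligned case: copy one row at a time, aligning src_va down to 64 bytes
 and folding the remainder into the source x offset */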
1312 for (uint32_t y = 0; y < extent.height; y++) {
1313 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1314 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1315 x + extent.width, 1);
1316 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1317 &(VkExtent2D) {extent.width, 1});
1318 ops->run(cmd, cs);
1319 src_va += pitch;
1320 }
1321 } else {
1322 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1323 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1324 ops->run(cmd, cs);
1325 }
1326 }
1327 }
1328
1329 void
1330 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1331 VkBuffer srcBuffer,
1332 VkImage dstImage,
1333 VkImageLayout dstImageLayout,
1334 uint32_t regionCount,
1335 const VkBufferImageCopy *pRegions)
1336 {
1337 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1338 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1339 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1340
1341 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1342 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1343
1344 for (unsigned i = 0; i < regionCount; ++i)
1345 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1346 }
1347
1348 static void
1349 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1350 struct tu_image *src_image,
1351 struct tu_buffer *dst_buffer,
1352 const VkBufferImageCopy *info)
1353 {
1354 struct tu_cs *cs = &cmd->cs;
1355 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1356 VkFormat src_format = src_image->vk_format;
1357 VkFormat dst_format = src_image->vk_format;
1358 bool stencil_read = false;
1359
1360 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1361 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1362 dst_format = VK_FORMAT_R8_UNORM;
1363 stencil_read = true;
1364 }
1365
1366 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1367 VkOffset3D offset = info->imageOffset;
1368 VkExtent3D extent = info->imageExtent;
1369 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1370 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1371
1372 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1373 assert(src_format == dst_format);
1374 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1375 src_format = dst_format = copy_format(dst_format);
1376 }
1377
1378 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1379 uint32_t layer_size = pitch * dst_height;
1380
1381 /* note: the 64-byte dst_va/pitch alignment is a 2D engine requirement,
1382 * but it also holds for 1cpp formats with the shader path (stencil aspect)
1383 */
1384
1385 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1386
1387 struct tu_image_view src;
1388 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1389
1390 for (uint32_t i = 0; i < layers; i++) {
1391 ops->src(cmd, cs, &src, i, false);
1392
1393 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1394 if ((dst_va & 63) || (pitch & 63)) {
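/* same row-by-row fallback as the buffer-to-image path, applied to the
 destination address instead */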
1395 for (uint32_t y = 0; y < extent.height; y++) {
1396 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1397 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1398 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1399 &(VkExtent2D) {extent.width, 1});
1400 ops->run(cmd, cs);
1401 dst_va += pitch;
1402 }
1403 } else {
1404 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1405 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1406 ops->run(cmd, cs);
1407 }
1408 }
1409 }
1410
1411 void
1412 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1413 VkImage srcImage,
1414 VkImageLayout srcImageLayout,
1415 VkBuffer dstBuffer,
1416 uint32_t regionCount,
1417 const VkBufferImageCopy *pRegions)
1418 {
1419 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1420 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1421 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1422
1423 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1424 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1425
1426 for (unsigned i = 0; i < regionCount; ++i)
1427 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1428 }
1429
1430 /* Tiled formats don't support swapping, which means that we can't support
1431 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1432 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1433 * Currently we fake support for tiled swapped formats and use the unswapped
1434 * format instead, but this means that reinterpreting copies to and from
1435 * swapped formats can't be performed correctly unless we can swizzle the
1436 * components by reinterpreting the other image as the "correct" swapped
1437 * format, i.e. only when the other image is linear.
1438 */
1439
1440 static bool
1441 is_swapped_format(VkFormat format)
1442 {
1443 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1444 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1445 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1446 }
1447
1448 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1449 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1450 * versa). This should mirror the logic in fdl6_layout.
1451 */
1452 static bool
1453 image_is_r8g8(struct tu_image *image)
1454 {
1455 return image->layout.cpp == 2 &&
1456 vk_format_get_nr_components(image->vk_format) == 2;
1457 }
1458
1459 static void
1460 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1461 struct tu_image *src_image,
1462 struct tu_image *dst_image,
1463 const VkImageCopy *info)
1464 {
1465 const struct blit_ops *ops = &r2d_ops;
1466 struct tu_cs *cs = &cmd->cs;
1467
1468 uint8_t mask = 0xf;
1469 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1470 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1471 mask = 0x7;
1472 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1473 mask = 0x8;
1474 }
1475
1476 if (dst_image->samples > 1)
1477 ops = &r3d_ops;
1478
1479 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1480
1481 VkFormat format = VK_FORMAT_UNDEFINED;
1482 VkOffset3D src_offset = info->srcOffset;
1483 VkOffset3D dst_offset = info->dstOffset;
1484 VkExtent3D extent = info->extent;
1485
1486 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1487 * Images":
1488 *
1489 * When copying between compressed and uncompressed formats the extent
1490 * members represent the texel dimensions of the source image and not
1491 * the destination. When copying from a compressed image to an
1492 * uncompressed image the image texel dimensions written to the
1493 * uncompressed image will be source extent divided by the compressed
1494 * texel block dimensions. When copying from an uncompressed image to a
1495 * compressed image the image texel dimensions written to the compressed
1496 * image will be the source extent multiplied by the compressed texel
1497 * block dimensions.
1498 *
1499 * This means we only have to adjust the extent if the source image is
1500 * compressed.
1501 */
1502 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1503 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1504
1505 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1506 copy_format(dst_image->vk_format) : dst_image->vk_format;
1507 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1508 copy_format(src_image->vk_format) : src_image->vk_format;
1509
1510 bool use_staging_blit = false;
1511
1512 if (src_format == dst_format) {
1513 /* Images that share a format can always be copied directly because it's
1514 * the same as a blit.
1515 */
1516 format = src_format;
1517 } else if (!src_image->layout.tile_mode) {
1518 /* If an image is linear, we can always safely reinterpret it with the
1519 * other image's format and then do a regular blit.
1520 */
1521 format = dst_format;
1522 } else if (!dst_image->layout.tile_mode) {
1523 format = src_format;
1524 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1525 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1526 * due to the different tile layout.
1527 */
1528 use_staging_blit = true;
1529 } else if (is_swapped_format(src_format) ||
1530 is_swapped_format(dst_format)) {
1531 /* If either format has a non-identity swap, then we can't copy
1532 * to/from it.
1533 */
1534 use_staging_blit = true;
1535 } else if (!src_image->layout.ubwc) {
1536 format = dst_format;
1537 } else if (!dst_image->layout.ubwc) {
1538 format = src_format;
1539 } else {
1540 /* Both formats use UBWC and so neither can be reinterpreted.
1541 * TODO: We could do an in-place decompression of the dst instead.
1542 */
1543 use_staging_blit = true;
1544 }
1545
1546 struct tu_image_view dst, src;
1547
1548 if (use_staging_blit) {
1549 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1550 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1551
1552 struct tu_image staging_image = {
1553 .vk_format = src_format,
1554 .type = src_image->type,
1555 .tiling = VK_IMAGE_TILING_LINEAR,
1556 .extent = extent,
1557 .level_count = 1,
1558 .layer_count = info->srcSubresource.layerCount,
1559 .samples = src_image->samples,
1560 .bo_offset = 0,
1561 };
1562
1563 VkImageSubresourceLayers staging_subresource = {
1564 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1565 .mipLevel = 0,
1566 .baseArrayLayer = 0,
1567 .layerCount = info->srcSubresource.layerCount,
1568 };
1569
1570 VkOffset3D staging_offset = { 0 };
1571
1572 staging_image.layout.tile_mode = TILE6_LINEAR;
1573 staging_image.layout.ubwc = false;
1574
1575 fdl6_layout(&staging_image.layout,
1576 vk_format_to_pipe_format(staging_image.vk_format),
1577 staging_image.samples,
1578 staging_image.extent.width,
1579 staging_image.extent.height,
1580 staging_image.extent.depth,
1581 staging_image.level_count,
1582 staging_image.layer_count,
1583 staging_image.type == VK_IMAGE_TYPE_3D,
1584 NULL);
1585
1586 VkResult result = tu_get_scratch_bo(cmd->device,
1587 staging_image.layout.size,
1588 &staging_image.bo);
1589 if (result != VK_SUCCESS) {
1590 cmd->record_result = result;
1591 return;
1592 }
1593
1594 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1595 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1596
1597 struct tu_image_view staging;
1598 tu_image_view_blit2(&staging, &staging_image, src_format,
1599 &staging_subresource, 0, false);
1600
1601 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1602 coords(ops, cs, &staging_offset, &src_offset, &extent);
1603
1604 for (uint32_t i = 0; i < info->extent.depth; i++) {
1605 ops->src(cmd, cs, &src, i, false);
1606 ops->dst(cs, &staging, i);
1607 ops->run(cmd, cs);
1608 }
1609
1610 /* When executed by the user there has to be a pipeline barrier here,
1611 * but since we're doing it manually we'll have to flush ourselves.
1612 */
1613 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1614 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1615
1616 tu_image_view_blit2(&staging, &staging_image, dst_format,
1617 &staging_subresource, 0, false);
1618
1619 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1620 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1621
1622 for (uint32_t i = 0; i < info->extent.depth; i++) {
1623 ops->src(cmd, cs, &staging, i, false);
1624 ops->dst(cs, &dst, i);
1625 ops->run(cmd, cs);
1626 }
1627 } else {
1628 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1629 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1630
1631 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1632 coords(ops, cs, &dst_offset, &src_offset, &extent);
1633
1634 for (uint32_t i = 0; i < info->extent.depth; i++) {
1635 ops->src(cmd, cs, &src, i, false);
1636 ops->dst(cs, &dst, i);
1637 ops->run(cmd, cs);
1638 }
1639 }
1640 }
1641
1642 void
1643 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1644 VkImage srcImage,
1645 VkImageLayout srcImageLayout,
1646 VkImage destImage,
1647 VkImageLayout destImageLayout,
1648 uint32_t regionCount,
1649 const VkImageCopy *pRegions)
1650 {
1651 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1652 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1653 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1654
1655 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1656 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1657
1658 for (uint32_t i = 0; i < regionCount; ++i)
1659 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1660 }
1661
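/* Copies a linear range of memory with the 2D engine by treating it as a
 * one-row image of R32_UINT (block_size 4) or R8_UNORM (block_size 1)
 * texels. Each iteration copies at most 0x4000 texels (minus the x offsets
 * that absorb the 64-byte alignment requirement), presumably to stay within
 * the 2D engine's coordinate range.
 */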
1662 static void
1663 copy_buffer(struct tu_cmd_buffer *cmd,
1664 uint64_t dst_va,
1665 uint64_t src_va,
1666 uint64_t size,
1667 uint32_t block_size)
1668 {
1669 const struct blit_ops *ops = &r2d_ops;
1670 struct tu_cs *cs = &cmd->cs;
1671 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1672 uint64_t blocks = size / block_size;
1673
1674 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1675
1676 while (blocks) {
1677 uint32_t src_x = (src_va & 63) / block_size;
1678 uint32_t dst_x = (dst_va & 63) / block_size;
1679 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1680
1681 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1682 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1683 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1684 ops->run(cmd, cs);
1685
1686 src_va += width * block_size;
1687 dst_va += width * block_size;
1688 blocks -= width;
1689 }
1690 }
1691
1692 void
1693 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1694 VkBuffer srcBuffer,
1695 VkBuffer dstBuffer,
1696 uint32_t regionCount,
1697 const VkBufferCopy *pRegions)
1698 {
1699 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1700 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1701 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1702
1703 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1704 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1705
1706 for (unsigned i = 0; i < regionCount; ++i) {
1707 copy_buffer(cmd,
1708 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1709 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1710 pRegions[i].size, 1);
1711 }
1712 }
1713
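/* vkCmdUpdateBuffer: stage the user data in the command stream's sub-buffer,
 * then reuse copy_buffer() to blit it into the destination. A 4-byte block
 * size is safe here because the spec requires dstOffset and dataSize to be
 * multiples of 4.
 */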
1714 void
1715 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1716 VkBuffer dstBuffer,
1717 VkDeviceSize dstOffset,
1718 VkDeviceSize dataSize,
1719 const void *pData)
1720 {
1721 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1722 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1723
1724 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1725
1726 struct ts_cs_memory tmp;
1727 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1728 if (result != VK_SUCCESS) {
1729 cmd->record_result = result;
1730 return;
1731 }
1732
1733 memcpy(tmp.map, pData, dataSize);
1734 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1735 }
1736
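/* vkCmdFillBuffer: implemented as a solid-color clear of R32_UINT texels with
 * the 2D blitter, using the same 64-byte alignment trick as copy_buffer().
 * fillSize / 4 is exact for explicit sizes (the spec requires a multiple of 4);
 * for VK_WHOLE_SIZE any trailing bytes that don't form a full dword are
 * skipped, matching the spec's round-down-to-a-multiple-of-4 rule.
 */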
1737 void
1738 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1739 VkBuffer dstBuffer,
1740 VkDeviceSize dstOffset,
1741 VkDeviceSize fillSize,
1742 uint32_t data)
1743 {
1744 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1745 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1746 const struct blit_ops *ops = &r2d_ops;
1747 struct tu_cs *cs = &cmd->cs;
1748
1749 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1750
1751 if (fillSize == VK_WHOLE_SIZE)
1752 fillSize = buffer->size - dstOffset;
1753
1754 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1755 uint32_t blocks = fillSize / 4;
1756
1757 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1758 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1759
1760 while (blocks) {
1761 uint32_t dst_x = (dst_va & 63) / 4;
1762 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1763
1764 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1765 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1766 ops->run(cmd, cs);
1767
1768 dst_va += width * 4;
1769 blocks -= width;
1770 }
1771 }
1772
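/* vkCmdResolveImage: one 2D blit per layer of each region. The sample
 * averaging itself is presumably handled by the source state programmed in
 * ops->src(), so nothing MSAA-specific is needed here beyond setting up the
 * blit with the destination format.
 */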
1773 void
1774 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1775 VkImage srcImage,
1776 VkImageLayout srcImageLayout,
1777 VkImage dstImage,
1778 VkImageLayout dstImageLayout,
1779 uint32_t regionCount,
1780 const VkImageResolve *pRegions)
1781 {
1782 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1783 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1784 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1785 const struct blit_ops *ops = &r2d_ops;
1786 struct tu_cs *cs = &cmd->cs;
1787
1788 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1789 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1790
1791 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1792
1793 for (uint32_t i = 0; i < regionCount; ++i) {
1794 const VkImageResolve *info = &pRegions[i];
1795 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1796
1797 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1798       /* TODO: aspect masks possible? */
1799
1800 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1801
1802 struct tu_image_view dst, src;
1803 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1804 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1805
1806       for (uint32_t layer = 0; layer < layers; layer++) {
1807          ops->src(cmd, cs, &src, layer, false);
1808          ops->dst(cs, &dst, layer);
1809 ops->run(cmd, cs);
1810 }
1811 }
1812 }
1813
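/* Resolve an attachment into another image view while rendering in sysmem
 * mode: a per-layer 2D blit over the given rect. Source and destination are
 * expected to share the same Vulkan format (asserted below).
 */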
1814 void
1815 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1816 struct tu_cs *cs,
1817 struct tu_image_view *src,
1818 struct tu_image_view *dst,
1819 uint32_t layers,
1820 const VkRect2D *rect)
1821 {
1822 const struct blit_ops *ops = &r2d_ops;
1823
1824 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1825 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1826
1827 assert(src->image->vk_format == dst->image->vk_format);
1828
1829 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1830 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1831
1832 for (uint32_t i = 0; i < layers; i++) {
1833 ops->src(cmd, cs, src, i, false);
1834 ops->dst(cs, dst, i);
1835 ops->run(cmd, cs);
1836 }
1837 }
1838
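/* Clear an image outside of a render pass (vkCmdClearColorImage /
 * vkCmdClearDepthStencilImage). Notes on the format/mask handling below:
 * - E5B9G9R9 is cleared as R32_UINT (presumably because it isn't renderable);
 *   ops->clear_value() still receives the real format so it can pack the value.
 * - For D24S8 the depth aspect maps to a 0x7 component mask and stencil to
 *   0x8, since the image is written as R8G8B8A8.
 * - For 3D images each mip level clears u_minify(depth) slices instead of
 *   array layers.
 */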
1839 static void
1840 clear_image(struct tu_cmd_buffer *cmd,
1841 struct tu_image *image,
1842 const VkClearValue *clear_value,
1843 const VkImageSubresourceRange *range)
1844 {
1845 uint32_t level_count = tu_get_levelCount(image, range);
1846 uint32_t layer_count = tu_get_layerCount(image, range);
1847 struct tu_cs *cs = &cmd->cs;
1848 VkFormat format = image->vk_format;
1849 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1850 format = VK_FORMAT_R32_UINT;
1851
1852 if (image->type == VK_IMAGE_TYPE_3D) {
1853 assert(layer_count == 1);
1854 assert(range->baseArrayLayer == 0);
1855 }
1856
1857 uint8_t mask = 0xf;
1858 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1859 mask = 0;
1860 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1861 mask |= 0x7;
1862 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1863 mask |= 0x8;
1864 }
1865
1866 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1867
1868 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1869 ops->clear_value(cs, image->vk_format, clear_value);
1870
1871 for (unsigned j = 0; j < level_count; j++) {
1872 if (image->type == VK_IMAGE_TYPE_3D)
1873 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1874
1875 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1876 u_minify(image->extent.width, range->baseMipLevel + j),
1877 u_minify(image->extent.height, range->baseMipLevel + j)
1878 });
1879
1880 struct tu_image_view dst;
1881 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1882 .aspectMask = range->aspectMask,
1883 .mipLevel = range->baseMipLevel + j,
1884 .baseArrayLayer = range->baseArrayLayer,
1885 .layerCount = 1,
1886 }, 0, false);
1887
1888 for (uint32_t i = 0; i < layer_count; i++) {
1889 ops->dst(cs, &dst, i);
1890 ops->run(cmd, cs);
1891 }
1892 }
1893 }
1894
1895 void
1896 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1897 VkImage image_h,
1898 VkImageLayout imageLayout,
1899 const VkClearColorValue *pColor,
1900 uint32_t rangeCount,
1901 const VkImageSubresourceRange *pRanges)
1902 {
1903 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1904 TU_FROM_HANDLE(tu_image, image, image_h);
1905
1906 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1907
1908 for (unsigned i = 0; i < rangeCount; i++)
1909 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1910 }
1911
1912 void
1913 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1914 VkImage image_h,
1915 VkImageLayout imageLayout,
1916 const VkClearDepthStencilValue *pDepthStencil,
1917 uint32_t rangeCount,
1918 const VkImageSubresourceRange *pRanges)
1919 {
1920 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1921 TU_FROM_HANDLE(tu_image, image, image_h);
1922
1923 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1924
1925 for (unsigned i = 0; i < rangeCount; i++)
1926 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1927 }
1928
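/* Clear sysmem attachments with per-attachment 2D blits. The 2D engine appears
 * to go through the color CCU even for depth data, hence the flush/invalidate
 * dance around depth/stencil clears below ("sync depth into color" before the
 * blit, "sync color into depth" after it).
 */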
1929 static void
1930 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1931 uint32_t attachment_count,
1932 const VkClearAttachment *attachments,
1933 uint32_t rect_count,
1934 const VkClearRect *rects)
1935 {
1936 const struct tu_subpass *subpass = cmd->state.subpass;
1937    /* note: cannot use the shader path here; the special shader path lives
1938     * in tu_clear_sysmem_attachments()
1939     */
1940 const struct blit_ops *ops = &r2d_ops;
1941 struct tu_cs *cs = &cmd->draw_cs;
1942
1943 for (uint32_t j = 0; j < attachment_count; j++) {
1944 uint32_t a;
1945 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1946 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1947 } else {
1948 a = subpass->depth_stencil_attachment.attachment;
1949
1950 /* sync depth into color */
1951 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1952 /* also flush color to avoid losing contents from invalidate */
1953 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1954 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1955 }
1956
1957 if (a == VK_ATTACHMENT_UNUSED)
1958 continue;
1959
1960 uint8_t mask = 0xf;
1961 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1962 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1963 mask &= ~0x7;
1964 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1965 mask &= ~0x8;
1966 }
1967
1968 const struct tu_image_view *iview =
1969 cmd->state.framebuffer->attachments[a].attachment;
1970
1971 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1972 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1973
1974 for (uint32_t i = 0; i < rect_count; i++) {
1975 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1976 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1977 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1978 ops->run(cmd, cs);
1979 }
1980 }
1981
1982 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1983          /* does not use the CCU - flush
1984           * note: a cache invalidate might be needed too, it's just not covered by test cases
1985           */
1986 if (attachments[j].colorAttachment > 0)
1987 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1988 } else {
1989 /* sync color into depth */
1990 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1991 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1992 }
1993 }
1994 }
1995
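/* Shader-based sysmem clear path, used when the 2D path can't be (MSAA, or no
 * known framebuffer). It bypasses the normal pipeline state: FS outputs, blend,
 * depth/stencil and RENDER_COMPONENTS are programmed directly, the clear colors
 * are uploaded as FS constants (one vec4 per cleared RT), and one r3d draw is
 * emitted per clear rect. The touched state is marked dirty at the end so the
 * application's pipeline state gets re-emitted for the next draw.
 */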
1996 static void
1997 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1998 uint32_t attachment_count,
1999 const VkClearAttachment *attachments,
2000 uint32_t rect_count,
2001 const VkClearRect *rects)
2002 {
2003 /* the shader path here is special, it avoids changing MRT/etc state */
2004 const struct tu_render_pass *pass = cmd->state.pass;
2005 const struct tu_subpass *subpass = cmd->state.subpass;
2006 const uint32_t mrt_count = subpass->color_count;
2007 struct tu_cs *cs = &cmd->draw_cs;
2008 uint32_t clear_value[MAX_RTS][4];
2009 float z_clear_val = 0.0f;
2010 uint8_t s_clear_val = 0;
2011 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2012 bool z_clear = false;
2013 bool s_clear = false;
2014 uint32_t max_samples = 1;
2015
2016 for (uint32_t i = 0; i < attachment_count; i++) {
2017 uint32_t a;
2018 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2019 uint32_t c = attachments[i].colorAttachment;
2020 a = subpass->color_attachments[c].attachment;
2021 if (a == VK_ATTACHMENT_UNUSED)
2022 continue;
2023
2024 clear_rts |= 1 << c;
2025 clear_components |= 0xf << (c * 4);
2026 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2027 } else {
2028 a = subpass->depth_stencil_attachment.attachment;
2029 if (a == VK_ATTACHMENT_UNUSED)
2030 continue;
2031
2032 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2033 z_clear = true;
2034 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2035 }
2036
2037 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2038 s_clear = true;
2039 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2040 }
2041 }
2042
2043 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2044 }
2045
2046    /* prefer the 2D path for clears:
2047     * 2D can't clear separate depth/stencil aspects or MSAA, and needs a known framebuffer
2048     */
2049 if (max_samples == 1 && cmd->state.framebuffer) {
2050 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2051 return;
2052 }
2053
2054 /* TODO: this path doesn't take into account multilayer rendering */
2055
2056 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2057 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2058 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2059 0xfc000000);
2060 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2061
2062 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2063 for (uint32_t i = 0; i < mrt_count; i++) {
2064 if (clear_rts & (1 << i))
2065 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2066 else
2067 tu_cs_emit(cs, 0);
2068 }
2069
2070 r3d_pipeline(cmd, cs, false, num_rts);
2071
2072 tu_cs_emit_regs(cs,
2073 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2074 tu_cs_emit_regs(cs,
2075 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2076
2077 tu_cs_emit_regs(cs,
2078 A6XX_RB_FS_OUTPUT_CNTL0(),
2079 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2080
2081 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2082 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2083 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2084 for (uint32_t i = 0; i < mrt_count; i++) {
2085 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2086 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2087 }
2088
2089 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2090 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2091 .z_enable = z_clear,
2092 .z_write_enable = z_clear,
2093 .zfunc = FUNC_ALWAYS));
2094 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2095 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2096 .stencil_enable = s_clear,
2097 .func = FUNC_ALWAYS,
2098 .zpass = STENCIL_REPLACE));
2099 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2100 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2101 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2102
2103 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2104 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2105 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2106 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2107 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2108 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2109 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2110 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2111 for_each_bit(b, clear_rts)
2112 tu_cs_emit_array(cs, clear_value[b], 4);
2113
2114 for (uint32_t i = 0; i < rect_count; i++) {
2115 r3d_coords_raw(cs, (float[]) {
2116 rects[i].rect.offset.x, rects[i].rect.offset.y,
2117 z_clear_val, 1.0f,
2118 rects[i].rect.offset.x + rects[i].rect.extent.width,
2119 rects[i].rect.offset.y + rects[i].rect.extent.height,
2120 z_clear_val, 1.0f
2121 });
2122 r3d_run(cmd, cs);
2123 }
2124
2125 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2126 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2127 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2128 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2129 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2130 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2131 }
2132
2133 /**
2134  * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2135  * for the component order: components are always packed in WZYX order,
2136  * because gmem is tiled and tiled formats always have the WZYX swap.
2137  */
2138 static void
2139 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2140 {
2141 const struct util_format_description *desc = vk_format_description(format);
2142
2143 switch (format) {
2144 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2145 buf[0] = float3_to_r11g11b10f(val->color.float32);
2146 return;
2147 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2148 buf[0] = float3_to_rgb9e5(val->color.float32);
2149 return;
2150 default:
2151 break;
2152 }
2153
2154 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2155
2156 /* S8_UINT is special and has no depth */
2157 const int max_components =
2158 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2159
2160 int buf_offset = 0;
2161 int bit_shift = 0;
2162 for (int comp = 0; comp < max_components; comp++) {
2163 const struct util_format_channel_description *ch =
2164 tu_get_format_channel_description(desc, comp);
2165 if (!ch) {
2166 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2167 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2168 continue;
2169 }
2170
2171 union tu_clear_component_value v = tu_get_clear_component_value(
2172 val, comp, desc->colorspace);
2173
2174 /* move to the next uint32_t when there is not enough space */
2175 assert(ch->size <= 32);
2176 if (bit_shift + ch->size > 32) {
2177 buf_offset++;
2178 bit_shift = 0;
2179 }
2180
2181 if (bit_shift == 0)
2182 buf[buf_offset] = 0;
2183
2184 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2185 bit_shift += ch->size;
2186 }
2187 }
2188
2189 static void
2190 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2191 struct tu_cs *cs,
2192 uint32_t attachment,
2193 uint8_t component_mask,
2194 const VkClearValue *value)
2195 {
2196 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2197 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2198 * because D24S8 is cleared with AS_R8G8B8A8 format
2199 */
2200
2201 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2202 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2203
2204 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2205 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2206
2207 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2208 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2209
2210 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2211 tu_cs_emit(cs, 0);
2212
2213 uint32_t clear_vals[4] = {};
2214 pack_gmem_clear_value(value, vk_format, clear_vals);
2215
2216 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2217 tu_cs_emit_array(cs, clear_vals, 4);
2218
2219 tu6_emit_event_write(cmd, cs, BLIT, false);
2220 }
2221
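/* GMEM variant of vkCmdClearAttachments: for every clear rect, program the
 * blit scissor and emit one BLIT-event clear per affected attachment directly
 * into its GMEM allocation.
 */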
2222 static void
2223 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2224 uint32_t attachment_count,
2225 const VkClearAttachment *attachments,
2226 uint32_t rect_count,
2227 const VkClearRect *rects)
2228 {
2229 const struct tu_subpass *subpass = cmd->state.subpass;
2230 struct tu_cs *cs = &cmd->draw_cs;
2231
2232 /* TODO: swap the loops for smaller cmdstream */
2233 for (unsigned i = 0; i < rect_count; i++) {
2234 unsigned x1 = rects[i].rect.offset.x;
2235 unsigned y1 = rects[i].rect.offset.y;
2236 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2237 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2238
2239 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2240 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2241 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2242
2243 for (unsigned j = 0; j < attachment_count; j++) {
2244 uint32_t a;
2245 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2246 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2247 else
2248 a = subpass->depth_stencil_attachment.attachment;
2249
2250 if (a == VK_ATTACHMENT_UNUSED)
2251 continue;
2252
2253 unsigned clear_mask = 0xf;
2254 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2255 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2256 clear_mask &= ~0x7;
2257 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2258 clear_mask &= ~0x8;
2259 }
2260
2261 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2262 &attachments[j].clearValue);
2263 }
2264 }
2265 }
2266
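/* Whether the render pass ends up using GMEM or sysmem rendering isn't known
 * while recording, so both clear variants are emitted under CP conditional
 * execution and the GPU runs only the one matching the actual render mode.
 */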
2267 void
2268 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2269 uint32_t attachmentCount,
2270 const VkClearAttachment *pAttachments,
2271 uint32_t rectCount,
2272 const VkClearRect *pRects)
2273 {
2274 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2275 struct tu_cs *cs = &cmd->draw_cs;
2276
2277 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2278 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2279 tu_cond_exec_end(cs);
2280
2281 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2282 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2283 tu_cond_exec_end(cs);
2284 }
2285
2286 void
2287 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2288 struct tu_cs *cs,
2289 uint32_t a,
2290 const VkRenderPassBeginInfo *info)
2291 {
2292 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2293 const struct tu_image_view *iview = fb->attachments[a].attachment;
2294 const struct tu_render_pass_attachment *attachment =
2295 &cmd->state.pass->attachments[a];
2296 uint8_t mask = 0;
2297
2298 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2299 mask = 0xf;
2300 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2301 mask |= 0x7;
2302 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2303 mask |= 0x8;
2304
2305 if (!mask)
2306 return;
2307
2308 const struct blit_ops *ops = &r2d_ops;
2309 if (attachment->samples > 1)
2310 ops = &r3d_ops;
2311
2312 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2313 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2314 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2315
2316 for (uint32_t i = 0; i < fb->layers; i++) {
2317 ops->dst(cs, iview, i);
2318 ops->run(cmd, cs);
2319 }
2320 }
2321
2322 void
2323 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2324 struct tu_cs *cs,
2325 uint32_t a,
2326 const VkRenderPassBeginInfo *info)
2327 {
2328 const struct tu_render_pass_attachment *attachment =
2329 &cmd->state.pass->attachments[a];
2330 unsigned clear_mask = 0;
2331
2332 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2333 clear_mask = 0xf;
2334 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2335 clear_mask |= 0x7;
2336 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2337 clear_mask |= 0x8;
2338
2339 if (!clear_mask)
2340 return;
2341
2342 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2343
2344 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2345 &info->pClearValues[a]);
2346 }
2347
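/* Emit the fixed-function GMEM<->sysmem transfer ("BLIT" event) for an
 * attachment: with resolve=false this is the load direction (sysmem into GMEM,
 * RB_BLIT_INFO.GMEM set), with resolve=true it is the store/resolve direction
 * back into the image.
 */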
2348 static void
2349 tu_emit_blit(struct tu_cmd_buffer *cmd,
2350 struct tu_cs *cs,
2351 const struct tu_image_view *iview,
2352 const struct tu_render_pass_attachment *attachment,
2353 bool resolve)
2354 {
2355 tu_cs_emit_regs(cs,
2356 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2357
2358 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2359 .unk0 = !resolve,
2360 .gmem = !resolve,
2361 /* "integer" bit disables msaa resolve averaging */
2362 .integer = vk_format_is_int(attachment->format)));
2363
2364 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2365 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2366 tu_cs_image_ref_2d(cs, iview, 0, false);
2367
2368 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2369 tu_cs_image_flag_ref(cs, iview, 0);
2370
2371 tu_cs_emit_regs(cs,
2372 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2373
2374 tu6_emit_event_write(cmd, cs, BLIT, false);
2375 }
2376
2377 static bool
2378 blit_can_resolve(VkFormat format)
2379 {
2380 const struct util_format_description *desc = vk_format_description(format);
2381
2382 /* blit event can only do resolve for simple cases:
2383 * averaging samples as unsigned integers or choosing only one sample
2384 */
2385 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2386 return false;
2387
2388 /* can't do formats with larger channel sizes
2389 * note: this includes all float formats
2390 * note2: single channel integer formats seem OK
2391 */
2392 if (desc->channel[0].size > 10)
2393 return false;
2394
2395 switch (format) {
2396    /* for unknown reasons the blit event can't msaa resolve these formats when tiled;
2397     * likely related to these formats having a different layout from other cpp=2 formats
2398     */
2399 case VK_FORMAT_R8G8_UNORM:
2400 case VK_FORMAT_R8G8_UINT:
2401 case VK_FORMAT_R8G8_SINT:
2402 /* TODO: this one should be able to work? */
2403 case VK_FORMAT_D24_UNORM_S8_UINT:
2404 return false;
2405 default:
2406 break;
2407 }
2408
2409 return true;
2410 }
2411
2412 void
2413 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2414 struct tu_cs *cs,
2415 uint32_t a,
2416 bool force_load)
2417 {
2418 const struct tu_image_view *iview =
2419 cmd->state.framebuffer->attachments[a].attachment;
2420 const struct tu_render_pass_attachment *attachment =
2421 &cmd->state.pass->attachments[a];
2422
2423 if (attachment->load || force_load)
2424 tu_emit_blit(cmd, cs, iview, attachment, false);
2425 }
2426
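/* Store (and possibly resolve) a GMEM attachment back to its image. The fast
 * path is the BLIT event above, usable when the render area is GMEM-aligned
 * and blit_can_resolve() allows the format; otherwise fall back to a raw
 * CP_BLIT that reads GMEM directly (gmem_base + gmem_offset) as a TILE6_2
 * surface whose pitch is the tile width times the attachment's cpp.
 */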
2427 void
2428 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2429 struct tu_cs *cs,
2430 uint32_t a,
2431 uint32_t gmem_a)
2432 {
2433 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2434 const VkRect2D *render_area = &tiling->render_area;
2435 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2436 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2437 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2438
2439 if (!dst->store)
2440 return;
2441
2442 uint32_t x1 = render_area->offset.x;
2443 uint32_t y1 = render_area->offset.y;
2444 uint32_t x2 = x1 + render_area->extent.width;
2445 uint32_t y2 = y1 + render_area->extent.height;
2446    /* x2/y2 can be unaligned if equal to the size of the image, since the
2447     * store will then write into padding space.
2448     * The one exception is linear levels, which don't have the required
2449     * y padding in the layout (except for the last level).
2450     */
2451 bool need_y2_align =
2452 y2 != iview->extent.height || iview->need_y2_align;
2453
2454 bool unaligned =
2455 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2456 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2457
2458 /* use fast path when render area is aligned, except for unsupported resolve cases */
2459 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2460 tu_emit_blit(cmd, cs, iview, src, true);
2461 return;
2462 }
2463
2464 if (dst->samples > 1) {
2465       /* We probably need to use the shader path in this case;
2466        * we need a testcase which fails because of this.
2467        */
2468 tu_finishme("unaligned store of msaa attachment\n");
2469 return;
2470 }
2471
2472 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2473 r2d_dst(cs, iview, 0);
2474 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2475
2476 tu_cs_emit_regs(cs,
2477 A6XX_SP_PS_2D_SRC_INFO(
2478 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2479 .tile_mode = TILE6_2,
2480 .srgb = vk_format_is_srgb(src->format),
2481 .samples = tu_msaa_samples(src->samples),
2482 .samples_average = !vk_format_is_int(src->format),
2483 .unk20 = 1,
2484 .unk22 = 1),
2485 /* note: src size does not matter when not scaling */
2486 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2487 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2488 A6XX_SP_PS_2D_SRC_HI(),
2489 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2490
2491 /* sync GMEM writes with CACHE */
2492 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2493
2494 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2495 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2496
2497 /* TODO: flushing with barriers instead of blindly always flushing */
2498 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2499 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2500 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2501 }