[mesa.git] / src / freedreno / vulkan / tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
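
/* Illustrative self-check (not part of the original file): a few expected
 * values for the packing helpers above, assuming _mesa_lroundevenf() rounds
 * half-way cases to even and util_float_to_half() produces IEEE half-floats.
 */
static inline void
tu_pack_selftest(void)
{
   assert(tu_pack_float32_for_unorm(1.0f, 8) == 0xff);      /* full scale */
   assert(tu_pack_float32_for_unorm(0.5f, 8) == 0x80);      /* 127.5 rounds to even (128) */
   assert(tu_pack_float32_for_snorm(-1.0f, 8) == 0x81);     /* -127, masked to 8 bits */
   assert(tu_pack_float32_for_sfloat(1.0f, 16) == 0x3c00);  /* IEEE half 1.0 */
}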
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 } /* fallthrough */
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
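
/* Example (illustrative): clearing D24S8 with depth = 1.0 and stencil = 0x80
 * packs the depth to 0xffffff above; clear_value[0..2] then hold that value
 * shifted by 0, 8 and 16 bits (presumably the hardware consumes the low byte
 * of each), and clear_value[3] holds the stencil, matching the r8g8b8a8
 * layout of the special Z24 clear format.
 */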
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 struct tu_image *image,
348 VkFormat vk_format,
349 uint32_t level,
350 uint32_t layer,
351 bool linear_filter,
352 bool stencil_read)
353 {
354 struct tu_native_format format = tu6_format_image_src(image, vk_format, level);
355
356 /* stencil readout path fails with UBWC enabled (why?) */
357 assert(!stencil_read || !image->layout.ubwc_layer_size);
358
359 if (stencil_read)
360 format.swap = XYZW;
361
362 tu_cs_emit_regs(cs,
363 A6XX_SP_PS_2D_SRC_INFO(
364 .color_format = format.fmt,
365 .tile_mode = format.tile_mode,
366 .color_swap = format.swap,
367 .flags = image->layout.ubwc_layer_size != 0,
368 .srgb = vk_format_is_srgb(vk_format),
369 .samples = tu_msaa_samples(image->samples),
370 .filter = linear_filter,
371 .samples_average = image->samples > 1 &&
372 !vk_format_is_int(vk_format) &&
373 !vk_format_is_depth_or_stencil(vk_format),
374 .unk20 = 1,
375 .unk22 = 1),
376 A6XX_SP_PS_2D_SRC_SIZE(
377 .width = tu_minify(image->extent.width, level),
378 .height = tu_minify(image->extent.height, level)),
379 A6XX_SP_PS_2D_SRC(tu_image_base_ref(image, level, layer)),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tu_image_pitch(image, level)));
381
382 if (image->layout.ubwc_layer_size) {
383 tu_cs_emit_regs(cs,
384 A6XX_SP_PS_2D_SRC_FLAGS(tu_image_ubwc_base_ref(image, level, layer)),
385 A6XX_SP_PS_2D_SRC_FLAGS_PITCH(.pitch = tu_image_ubwc_pitch(image, level)));
386 }
387 }
388
389 static void
390 r2d_src_buffer(struct tu_cmd_buffer *cmd,
391 struct tu_cs *cs,
392 VkFormat vk_format,
393 uint64_t va, uint32_t pitch,
394 uint32_t width, uint32_t height)
395 {
396 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
397
398 tu_cs_emit_regs(cs,
399 A6XX_SP_PS_2D_SRC_INFO(
400 .color_format = format.fmt,
401 .color_swap = format.swap,
402 .srgb = vk_format_is_srgb(vk_format),
403 .unk20 = 1,
404 .unk22 = 1),
405 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
406 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
407 A6XX_SP_PS_2D_SRC_HI(va >> 32),
408 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
409 }
410
411 static void
412 r2d_dst(struct tu_cs *cs,
413 struct tu_image *image,
414 VkFormat vk_format,
415 uint32_t level,
416 uint32_t layer)
417 {
418 struct tu_native_format format = tu6_format_image(image, vk_format, level);
419
420 assert(image->samples == 1);
421
422 tu_cs_emit_regs(cs,
423 A6XX_RB_2D_DST_INFO(
424 .color_format = format.fmt,
425 .tile_mode = format.tile_mode,
426 .color_swap = format.swap,
427 .flags = image->layout.ubwc_layer_size != 0,
428 .srgb = vk_format_is_srgb(image->vk_format)),
429 A6XX_RB_2D_DST(tu_image_base_ref(image, level, layer)),
430 A6XX_RB_2D_DST_SIZE(.pitch = tu_image_pitch(image, level)));
431
432 if (image->layout.ubwc_layer_size) {
433 tu_cs_emit_regs(cs,
434 A6XX_RB_2D_DST_FLAGS(tu_image_ubwc_base_ref(image, level, layer)),
435 A6XX_RB_2D_DST_FLAGS_PITCH(.pitch = tu_image_ubwc_pitch(image, level)));
436 }
437 }
438
439 static void
440 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
441 {
442 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
443
444 tu_cs_emit_regs(cs,
445 A6XX_RB_2D_DST_INFO(
446 .color_format = format.fmt,
447 .color_swap = format.swap,
448 .srgb = vk_format_is_srgb(vk_format)),
449 A6XX_RB_2D_DST_LO((uint32_t) va),
450 A6XX_RB_2D_DST_HI(va >> 32),
451 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
452 }
453
454 static void
455 r2d_setup_common(struct tu_cmd_buffer *cmd,
456 struct tu_cs *cs,
457 VkFormat vk_format,
458 enum a6xx_rotation rotation,
459 bool clear,
460 uint8_t mask,
461 bool scissor)
462 {
463 enum a6xx_format format = tu6_base_format(vk_format);
464 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
465 uint32_t unknown_8c01 = 0;
466
467 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
468 /* preserve depth channels */
469 if (mask == 0x8)
470 unknown_8c01 = 0x00084001;
471 /* preserve stencil channel */
472 if (mask == 0x7)
473 unknown_8c01 = 0x08000041;
474 }
475
476 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
477 tu_cs_emit(cs, unknown_8c01);
478
479 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
480 .scissor = scissor,
481 .rotate = rotation,
482 .solid_color = clear,
483 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
484 .color_format = format,
485 .mask = 0xf,
486 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
487 ).value;
488
489 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
490 tu_cs_emit(cs, blit_cntl);
491
492 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
493 tu_cs_emit(cs, blit_cntl);
494
495 if (format == FMT6_10_10_10_2_UNORM_DEST)
496 format = FMT6_16_16_16_16_FLOAT;
497
498 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
499 .sint = vk_format_is_sint(vk_format),
500 .uint = vk_format_is_uint(vk_format),
501 .color_format = format,
502 .srgb = vk_format_is_srgb(vk_format),
503 .mask = 0xf));
504 }
505
506 static void
507 r2d_setup(struct tu_cmd_buffer *cmd,
508 struct tu_cs *cs,
509 VkFormat vk_format,
510 enum a6xx_rotation rotation,
511 bool clear,
512 uint8_t mask)
513 {
514 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
515
516 /* TODO: flushing with barriers instead of blindly always flushing */
517 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
518 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
519 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
520 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
521 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
522
523 tu_cs_emit_wfi(cs);
524 tu_cs_emit_regs(cs,
525 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
526
527 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
528 }
529
530 static void
531 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
532 {
533 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
534 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
535
536 /* TODO: flushing with barriers instead of blindly always flushing */
537 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
538 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
539 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
540 }
541
542 /* r3d_ = shader path operations */
543
544 static void
545 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
546 {
547 static const instr_t vs_code[] = {
548 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
549 * r1.xy = r0.w ? c1.zw : c0.zw
550 * r0.w = 1.0f
551 */
552 { .cat3 = {
553 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
554 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
555 .src2 = 3,
556 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
557 } },
558 { .cat3 = {
559 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
560 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
561 .src2 = 3,
562 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
563 } },
564 { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
565 .src_im = 1, .fim_val = 1.0f } },
566 { .cat0 = { .opc = OPC_END } },
567 };
568 #define FS_OFFSET (16 * sizeof(instr_t))
569 STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
570
571 /* vs inputs: only vtx id in r0.w */
572 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 7);
573 tu_cs_emit(cs, 0x00000000);
574 tu_cs_emit(cs, 0xfcfcfc00 | A6XX_VFD_CONTROL_1_REGID4VTX(3));
575 tu_cs_emit(cs, 0x0000fcfc);
576 tu_cs_emit(cs, 0xfcfcfcfc);
577 tu_cs_emit(cs, 0x000000fc);
578 tu_cs_emit(cs, 0x0000fcfc);
579 tu_cs_emit(cs, 0x00000000);
580
581 /* vs outputs: position in r0.xyzw, blit coords in r1.xy */
582 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
583 tu_cs_emit(cs, blit ? 0xffffffcf : 0xffffffff);
584 tu_cs_emit(cs, 0xffffffff);
585 tu_cs_emit(cs, 0xffffffff);
586 tu_cs_emit(cs, 0xffffffff);
587
588 tu_cs_emit_regs(cs, A6XX_SP_VS_OUT_REG(0,
589 .a_regid = 0, .a_compmask = 0xf,
590 .b_regid = 4, .b_compmask = 0x3));
591 tu_cs_emit_regs(cs, A6XX_SP_VS_VPC_DST_REG(0, .outloc0 = 0, .outloc1 = 4));
592
593 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
594 tu_cs_emit(cs, 0xff00ff00 |
595 COND(blit, A6XX_VPC_CNTL_0_VARYING) |
596 A6XX_VPC_CNTL_0_NUMNONPOSVAR(blit ? 8 : 0));
597
598 tu_cs_emit_regs(cs, A6XX_VPC_PACK(
599 .positionloc = 0,
600 .psizeloc = 0xff,
601 .stride_in_vpc = blit ? 6 : 4));
602 tu_cs_emit_regs(cs, A6XX_SP_PRIMITIVE_CNTL(.vsout = blit ? 2 : 1));
603 tu_cs_emit_regs(cs,
604 A6XX_PC_PRIMITIVE_CNTL_0(),
605 A6XX_PC_PRIMITIVE_CNTL_1(.stride_in_vpc = blit ? 6 : 4));
606
607
608 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
609 tu_cs_emit(cs, blit ? 0xe000 : 0); // I think this can just be 0
610 for (uint32_t i = 1; i < 8; i++)
611 tu_cs_emit(cs, 0);
612
613 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
614 for (uint32_t i = 0; i < 8; i++)
615 tu_cs_emit(cs, 0x99999999);
616
617 /* fs inputs: none, prefetch in blit case */
618 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + blit);
619 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(blit) |
620 A6XX_SP_FS_PREFETCH_CNTL_UNK4(0xfc) |
621 0x7000);
622 if (blit) {
623 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(4) |
624 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(0) |
625 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(0) |
626 A6XX_SP_FS_PREFETCH_CMD_DST(0) |
627 A6XX_SP_FS_PREFETCH_CMD_WRMASK(0xf) |
628 A6XX_SP_FS_PREFETCH_CMD_CMD(0x4));
629 }
630
631 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
632 tu_cs_emit(cs, 0x3); // XXX blob uses 3 in blit path
633 tu_cs_emit(cs, 0xfcfcfcfc);
634 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(blit ? 0 : 0xfc) |
635 A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(0xfc) |
636 0xfc00fc00);
637 tu_cs_emit(cs, 0xfcfcfcfc);
638 tu_cs_emit(cs, 0xfcfc);
639
640 tu_cs_emit_regs(cs, A6XX_HLSQ_UNKNOWN_B980(blit ? 3 : 1));
641 tu_cs_emit_regs(cs, A6XX_GRAS_CNTL(.varying = blit));
642 tu_cs_emit_regs(cs,
643 A6XX_RB_RENDER_CONTROL0(.varying = blit, .unk10 = blit),
644 A6XX_RB_RENDER_CONTROL1());
645
646 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_CNTL());
647 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8101());
648 tu_cs_emit_regs(cs, A6XX_GRAS_SAMPLE_CNTL());
649
650 /* shaders */
651 struct ts_cs_memory shaders = { };
652 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t) / 4, &shaders);
653 assert(result == VK_SUCCESS);
654
655 memcpy(shaders.map, vs_code, sizeof(vs_code));
656
657 instr_t *fs = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
658 for (uint32_t i = 0; i < num_rts; i++) {
659 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
660 fs[i] = (instr_t) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
661 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 } };
662 }
663 fs[num_rts] = (instr_t) { .cat0 = { .opc = OPC_END } };
664 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
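/* Sketch (assumed from the encoding above): with num_rts = 1 the generated
 * FS is just "(rpt3)mov.s32s32 r0.x, (r)c0.x" followed by "end", i.e. it
 * copies one vec4 of clear color from the constant file into r0.xyzw.
 */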
665
666 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
667 tu_cs_emit_regs(cs,
668 A6XX_HLSQ_VS_CNTL(.constlen = 8, .enabled = true),
669 A6XX_HLSQ_HS_CNTL(),
670 A6XX_HLSQ_DS_CNTL(),
671 A6XX_HLSQ_GS_CNTL());
672 tu_cs_emit_regs(cs, A6XX_HLSQ_FS_CNTL(.constlen = 4 * num_rts, .enabled = true));
673
674 tu_cs_emit_regs(cs,
675 A6XX_SP_VS_CONFIG(.enabled = true),
676 A6XX_SP_VS_INSTRLEN(1));
677 tu_cs_emit_regs(cs, A6XX_SP_HS_CONFIG());
678 tu_cs_emit_regs(cs, A6XX_SP_DS_CONFIG());
679 tu_cs_emit_regs(cs, A6XX_SP_GS_CONFIG());
680 tu_cs_emit_regs(cs,
681 A6XX_SP_FS_CONFIG(.enabled = true, .ntex = blit, .nsamp = blit),
682 A6XX_SP_FS_INSTRLEN(1));
683
684 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
685 .threadsize = FOUR_QUADS,
686 .fullregfootprint = 2,
687 .mergedregs = true));
688 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
689 .varying = blit,
690 .threadsize = FOUR_QUADS,
691 /* could this be 0 in !blit && !num_rts case ? */
692 .fullregfootprint = MAX2(1, num_rts),
693 .mergedregs = true)); /* note: tu_pipeline also sets 0x1000000 bit */
694
695 tu_cs_emit_regs(cs, A6XX_SP_IBO_COUNT(0));
696
697 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3);
698 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
699 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
700 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
701 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
702 CP_LOAD_STATE6_0_NUM_UNIT(1));
703 tu_cs_emit_qw(cs, shaders.iova);
704
705 tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OBJ_START_LO, 2);
706 tu_cs_emit_qw(cs, shaders.iova);
707
708 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
709 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
710 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
711 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
712 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
713 CP_LOAD_STATE6_0_NUM_UNIT(1));
714 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
715
716 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OBJ_START_LO, 2);
717 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
718
719 tu_cs_emit_regs(cs,
720 A6XX_GRAS_CL_CNTL(
721 .persp_division_disable = 1,
722 .vp_xform_disable = 1,
723 .vp_clip_code_ignore = 1,
724 .clip_disable = 1),
725 A6XX_GRAS_UNKNOWN_8001(0));
726 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
727
728 tu_cs_emit_regs(cs,
729 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
730 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
731 tu_cs_emit_regs(cs,
732 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
733 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
734 }
735
736 static void
737 r3d_coords_raw(struct tu_cs *cs, const float *coords)
738 {
739 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
740 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
741 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
742 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
743 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
744 CP_LOAD_STATE6_0_NUM_UNIT(2));
745 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
746 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
747 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
748 }
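
/* The eight floats above become VS constants c0.xyzw and c1.xyzw. Based on
 * the select instructions in vs_code, the assumed layout is
 * c0 = (dst_x1, dst_y1, src_x1, src_y1) and
 * c1 = (dst_x2, dst_y2, src_x2, src_y2); the vertex id in r0.w picks which
 * corner each rectlist vertex uses.
 */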
749
750 static void
751 r3d_coords(struct tu_cs *cs,
752 const VkOffset2D *dst,
753 const VkOffset2D *src,
754 const VkExtent2D *extent)
755 {
756 int32_t src_x1 = src ? src->x : 0;
757 int32_t src_y1 = src ? src->y : 0;
758 r3d_coords_raw(cs, (float[]) {
759 dst->x, dst->y,
760 src_x1, src_y1,
761 dst->x + extent->width, dst->y + extent->height,
762 src_x1 + extent->width, src_y1 + extent->height,
763 });
764 }
765
766 static void
767 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
768 {
769 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
770 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
771 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
772 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
773 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
774 CP_LOAD_STATE6_0_NUM_UNIT(1));
775 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
776 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
777 switch (format) {
778 case VK_FORMAT_X8_D24_UNORM_PACK32:
779 case VK_FORMAT_D24_UNORM_S8_UINT: {
780 /* cleared as r8g8b8a8_unorm using special format */
781 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
782 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
783 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
784 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
785 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
786 } break;
787 case VK_FORMAT_D16_UNORM:
788 case VK_FORMAT_D32_SFLOAT:
789 tu_cs_emit(cs, fui(val->depthStencil.depth));
790 tu_cs_emit(cs, 0);
791 tu_cs_emit(cs, 0);
792 tu_cs_emit(cs, 0);
793 break;
794 case VK_FORMAT_S8_UINT:
795 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
796 tu_cs_emit(cs, 0);
797 tu_cs_emit(cs, 0);
798 tu_cs_emit(cs, 0);
799 break;
800 default:
801 /* as color formats use clear value as-is */
802 assert(!vk_format_is_depth_or_stencil(format));
803 tu_cs_emit_array(cs, val->color.uint32, 4);
804 break;
805 }
806 }
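
/* Example (illustrative): D24S8 with depth = 0.5 and stencil = 0xff packs to
 * 0x800000, so the constants emitted above are (0.0, 0.0, 128/255.0, 1.0):
 * the low and middle depth bytes are zero, the high byte is 0x80, and the
 * stencil component is 255/255.
 */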
807
808 static void
809 r3d_src_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t *tex_const, bool linear_filter)
810 {
811 struct ts_cs_memory texture = { };
812 VkResult result = tu_cs_alloc(&cmd->sub_cs,
813 2, /* allocate space for a sampler too */
814 A6XX_TEX_CONST_DWORDS, &texture);
815 assert(result == VK_SUCCESS);
816
817 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
818
819 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
820 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
821 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
822 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
823 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
824 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
825 0x60000; /* XXX used by blob, doesn't seem necessary */
826 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
827 0x1 | /* XXX used by blob, doesn't seem necessary */
828 A6XX_TEX_SAMP_1_UNNORM_COORDS |
829 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
830 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
831 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
832
833 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
834 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
835 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
836 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
837 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
838 CP_LOAD_STATE6_0_NUM_UNIT(1));
839 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
840
841 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
842 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
843
844 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
845 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
846 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
847 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
848 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
849 CP_LOAD_STATE6_0_NUM_UNIT(1));
850 tu_cs_emit_qw(cs, texture.iova);
851
852 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
853 tu_cs_emit_qw(cs, texture.iova);
854
855 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
856 }
857
858 static void
859 r3d_src(struct tu_cmd_buffer *cmd,
860 struct tu_cs *cs,
861 struct tu_image *image,
862 VkFormat format,
863 uint32_t level,
864 uint32_t layer,
865 bool linear_filter,
866 bool stencil_read)
867 {
868 struct tu_image_view view;
869
870 /* use tu_image_view_init to fill out a view descriptor */
871 tu_image_view_init(&view, cmd->device, &(VkImageViewCreateInfo) {
872 .image = tu_image_to_handle(image),
873 .viewType = VK_IMAGE_VIEW_TYPE_2D,
874 .format = format,
875 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
876 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
877 .subresourceRange = {
878 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
879 .baseMipLevel = level,
880 .levelCount = 1,
881 .baseArrayLayer = layer,
882 .layerCount = 1,
883 },
884 });
885 r3d_src_common(cmd, cs, view.descriptor, linear_filter);
886 }
887
888 static void
889 r3d_src_buffer(struct tu_cmd_buffer *cmd,
890 struct tu_cs *cs,
891 VkFormat vk_format,
892 uint64_t va, uint32_t pitch,
893 uint32_t width, uint32_t height)
894 {
895 uint32_t desc[A6XX_TEX_CONST_DWORDS];
896
897 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
898
899 desc[0] =
900 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
901 A6XX_TEX_CONST_0_FMT(format.fmt) |
902 A6XX_TEX_CONST_0_SWAP(format.swap) |
903 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
904 // XXX to swizzle into .w for stencil buffer_to_image
905 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
906 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
907 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
908 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
909 desc[2] =
910 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
911 A6XX_TEX_CONST_2_PITCH(pitch) |
912 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
913 desc[3] = 0;
914 desc[4] = va;
915 desc[5] = va >> 32;
916 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
917 desc[i] = 0;
918
919 r3d_src_common(cmd, cs, desc, false);
920 }
921
922 static void
923 r3d_dst(struct tu_cs *cs,
924 struct tu_image *image,
925 VkFormat vk_format,
926 uint32_t level,
927 uint32_t layer)
928 {
929 tu6_emit_msaa(cs, image->samples); /* TODO: move to setup */
930
931 struct tu_native_format format = tu6_format_image(image, vk_format, level);
932
933 tu_cs_emit_regs(cs,
934 A6XX_RB_MRT_BUF_INFO(0,
935 .color_tile_mode = format.tile_mode,
936 .color_format = format.fmt,
937 .color_swap = format.swap),
938 A6XX_RB_MRT_PITCH(0, tu_image_pitch(image, level)),
939 A6XX_RB_MRT_ARRAY_PITCH(0, image->layout.layer_size),
940 A6XX_RB_MRT_BASE(0, tu_image_base_ref(image, level, layer)),
941 A6XX_RB_MRT_BASE_GMEM(0, 0));
942
943 tu_cs_emit_regs(cs,
944 A6XX_RB_MRT_FLAG_BUFFER_ADDR(0, tu_image_ubwc_base_ref(image, level, layer)),
945 A6XX_RB_MRT_FLAG_BUFFER_PITCH(0, .pitch = tu_image_ubwc_pitch(image, level)));
946
947 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = image->layout.ubwc_layer_size != 0));
948 }
949
950 static void
951 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
952 {
953 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
954
955 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
956
957 tu_cs_emit_regs(cs,
958 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
959 A6XX_RB_MRT_PITCH(0, pitch),
960 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
961 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
962 A6XX_RB_MRT_BASE_HI(0, va >> 32),
963 A6XX_RB_MRT_BASE_GMEM(0, 0));
964
965 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
966 }
967
968 static void
969 r3d_setup(struct tu_cmd_buffer *cmd,
970 struct tu_cs *cs,
971 VkFormat vk_format,
972 enum a6xx_rotation rotation,
973 bool clear,
974 uint8_t mask)
975 {
976 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
977
978 if (!cmd->state.pass) {
979 /* TODO: flushing with barriers instead of blindly always flushing */
980 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
981 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
982 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
983 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
984 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
985
986 tu_cs_emit_regs(cs,
987 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
988
989 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
990 }
991 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
992 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
993
994 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
995
996 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
997 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
998 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
999 0xfc000000);
1000 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
1001
1002 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
1003 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
1004
1005 tu_cs_emit_regs(cs,
1006 A6XX_RB_FS_OUTPUT_CNTL0(),
1007 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1008
1009 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1010 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1011 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1012
1013 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1014 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1015 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1016 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1017 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1018 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1019 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1020
1021 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
1022 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
1023
1024 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1025 .color_format = tu6_base_format(vk_format),
1026 .color_sint = vk_format_is_sint(vk_format),
1027 .color_uint = vk_format_is_uint(vk_format)));
1028
1029 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
1030 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
1031 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
1032 }
1033
1034 static void
1035 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1036 {
1037 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1038 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1039 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1040 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1041 tu_cs_emit(cs, 1); /* instance count */
1042 tu_cs_emit(cs, 2); /* vertex count */
1043
1044 if (!cmd->state.pass) {
1045 /* TODO: flushing with barriers instead of blindly always flushing */
1046 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1047 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1048 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1049 }
1050 }
1051
1052 /* blit ops - common interface for 2d/shader paths */
1053
1054 struct blit_ops {
1055 void (*coords)(struct tu_cs *cs,
1056 const VkOffset2D *dst,
1057 const VkOffset2D *src,
1058 const VkExtent2D *extent);
1059 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
1060 void (*src)(
1061 struct tu_cmd_buffer *cmd,
1062 struct tu_cs *cs,
1063 struct tu_image *image,
1064 VkFormat format,
1065 uint32_t level,
1066 uint32_t layer,
1067 bool linear_filter,
1068 bool stencil_read);
1069 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1070 VkFormat vk_format,
1071 uint64_t va, uint32_t pitch,
1072 uint32_t width, uint32_t height);
1073 void (*dst)(struct tu_cs *cs,
1074 struct tu_image *image,
1075 VkFormat format,
1076 uint32_t level,
1077 uint32_t layer);
1078 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1079 void (*setup)(struct tu_cmd_buffer *cmd,
1080 struct tu_cs *cs,
1081 VkFormat vk_format,
1082 enum a6xx_rotation rotation,
1083 bool clear,
1084 uint8_t mask);
1085 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1086 };
1087
1088 static const struct blit_ops r2d_ops = {
1089 .coords = r2d_coords,
1090 .clear_value = r2d_clear_value,
1091 .src = r2d_src,
1092 .src_buffer = r2d_src_buffer,
1093 .dst = r2d_dst,
1094 .dst_buffer = r2d_dst_buffer,
1095 .setup = r2d_setup,
1096 .run = r2d_run,
1097 };
1098
1099 static const struct blit_ops r3d_ops = {
1100 .coords = r3d_coords,
1101 .clear_value = r3d_clear_value,
1102 .src = r3d_src,
1103 .src_buffer = r3d_src_buffer,
1104 .dst = r3d_dst,
1105 .dst_buffer = r3d_dst_buffer,
1106 .setup = r3d_setup,
1107 .run = r3d_run,
1108 };
1109
1110 /* passthrough set coords from 3D extents */
1111 static void
1112 coords(const struct blit_ops *ops,
1113 struct tu_cs *cs,
1114 const VkOffset3D *dst,
1115 const VkOffset3D *src,
1116 const VkExtent3D *extent)
1117 {
1118 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1119 }
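
/* note: the casts in coords() above are valid because VkOffset3D and
 * VkExtent3D begin with the same x/y (width/height) members as their 2D
 * counterparts, and the coords implementations only read those leading
 * fields
 */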
1120
1121 static void
1122 tu6_blit_image(struct tu_cmd_buffer *cmd,
1123 struct tu_image *src_image,
1124 struct tu_image *dst_image,
1125 const VkImageBlit *info,
1126 VkFilter filter)
1127 {
1128 const struct blit_ops *ops = &r2d_ops;
1129 struct tu_cs *cs = &cmd->cs;
1130 uint32_t layers;
1131
1132 /* the 2D blitter can't mirror from coordinates alone, so use the rotate/flip modes */
1133 static const enum a6xx_rotation rotate[2][2] = {
1134 {ROTATE_0, ROTATE_HFLIP},
1135 {ROTATE_VFLIP, ROTATE_180},
1136 };
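
/* e.g. coordinates reversed in x only (mirror_x && !mirror_y) select
 * rotate[0][1] = ROTATE_HFLIP, and mirroring both axes is equivalent to a
 * 180-degree rotation
 */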
1137
1138 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1139 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1140 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1141 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1142 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1143 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1144
1145 if (mirror_z) {
1146 tu_finishme("blit z mirror\n");
1147 return;
1148 }
1149
1150 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1151 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1152 tu_finishme("blit z filter\n");
1153 return;
1154 }
1155
1156 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1157 if (info->dstSubresource.layerCount > 1) {
1158 assert(layers <= 1);
1159 layers = info->dstSubresource.layerCount;
1160 }
1161
1162 uint8_t mask = 0xf;
1163 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1164 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1165 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1166 mask = 0x7;
1167 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1168 mask = 0x8;
1169 }
1170
1171 if (dst_image->samples > 1)
1172 ops = &r3d_ops;
1173
1174 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1175 * figure out why (should be able to pass all tests with only shader path)
1176 */
1177
1178 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1179
1180 if (ops == &r3d_ops) {
1181 r3d_coords_raw(cs, (float[]) {
1182 info->dstOffsets[0].x, info->dstOffsets[0].y,
1183 info->srcOffsets[0].x, info->srcOffsets[0].y,
1184 info->dstOffsets[1].x, info->dstOffsets[1].y,
1185 info->srcOffsets[1].x, info->srcOffsets[1].y
1186 });
1187 } else {
1188 tu_cs_emit_regs(cs,
1189 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1190 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1191 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1192 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1193 tu_cs_emit_regs(cs,
1194 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1195 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1196 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1197 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1198 }
1199
1200 for (uint32_t i = 0; i < layers; i++) {
1201 ops->src(cmd, cs, src_image, src_image->vk_format,
1202 info->srcSubresource.mipLevel,
1203 info->srcSubresource.baseArrayLayer + info->srcOffsets[0].z + i,
1204 filter == VK_FILTER_LINEAR, false);
1205 ops->dst(cs, dst_image, dst_image->vk_format,
1206 info->dstSubresource.mipLevel,
1207 info->dstSubresource.baseArrayLayer + info->dstOffsets[0].z + i);
1208 ops->run(cmd, cs);
1209 }
1210 }
1211
1212 void
1213 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1214 VkImage srcImage,
1215 VkImageLayout srcImageLayout,
1216 VkImage dstImage,
1217 VkImageLayout dstImageLayout,
1218 uint32_t regionCount,
1219 const VkImageBlit *pRegions,
1220 VkFilter filter)
1221
1222 {
1223 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1224 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1225 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1226
1227 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1228 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1229
1230 for (uint32_t i = 0; i < regionCount; ++i)
1231 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1232 }
1233
1234 static VkFormat
1235 copy_format(VkFormat format)
1236 {
1237 switch (vk_format_get_blocksizebits(format)) {
1238 case 8: return VK_FORMAT_R8_UINT;
1239 case 16: return VK_FORMAT_R16_UINT;
1240 case 32: return VK_FORMAT_R32_UINT;
1241 case 64: return VK_FORMAT_R32G32_UINT;
1242 case 96: return VK_FORMAT_R32G32B32_UINT;
1243 case 128:return VK_FORMAT_R32G32B32A32_UINT;
1244 default:
1245 unreachable("unhandled format size");
1246 }
1247 }
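
/* e.g. BC1 (64-bit blocks) is copied as R32G32_UINT and BC7 (128-bit blocks)
 * as R32G32B32A32_UINT: copies move whole compressed blocks as opaque uint
 * texels
 */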
1248
1249 static void
1250 copy_compressed(VkFormat format,
1251 VkOffset3D *offset,
1252 VkExtent3D *extent,
1253 uint32_t *pitch,
1254 uint32_t *layer_size)
1255 {
1256 if (!vk_format_is_compressed(format))
1257 return;
1258
1259 uint32_t block_width = vk_format_get_blockwidth(format);
1260 uint32_t block_height = vk_format_get_blockheight(format);
1261
1262 offset->x /= block_width;
1263 offset->y /= block_height;
1264
1265 if (extent) {
1266 extent->width = DIV_ROUND_UP(extent->width, block_width);
1267 extent->height = DIV_ROUND_UP(extent->height, block_height);
1268 }
1269 if (pitch)
1270 *pitch /= block_width;
1271 if (layer_size)
1272 *layer_size /= (block_width * block_height);
1273 }
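
/* Example: for a 4x4-block format such as BC1, offset (8, 4) becomes block
 * coordinates (2, 1) and extent 10x6 becomes 3x2 blocks (DIV_ROUND_UP),
 * with pitch and layer_size rescaled to block units accordingly.
 */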
1274
1275 static void
1276 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1277 struct tu_buffer *src_buffer,
1278 struct tu_image *dst_image,
1279 const VkBufferImageCopy *info)
1280 {
1281 struct tu_cs *cs = &cmd->cs;
1282 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1283 VkFormat dst_format = dst_image->vk_format;
1284 VkFormat src_format = dst_image->vk_format;
1285 const struct blit_ops *ops = &r2d_ops;
1286 uint8_t mask = 0xf;
1287
1288 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1289 switch (info->imageSubresource.aspectMask) {
1290 case VK_IMAGE_ASPECT_STENCIL_BIT:
1291 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1292 mask = 0x8;
1293 ops = &r3d_ops;
1294 break;
1295 case VK_IMAGE_ASPECT_DEPTH_BIT:
1296 mask = 0x7;
1297 break;
1298 }
1299 }
1300
1301 VkOffset3D offset = info->imageOffset;
1302 VkExtent3D extent = info->imageExtent;
1303 uint32_t pitch =
1304 (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(src_format);
1305 uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch;
1306
1307 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1308 assert(src_format == dst_format);
1309 copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size);
1310 src_format = dst_format = copy_format(dst_format);
1311 }
1312
1313 /* note: the src_va/pitch alignment of 64 is a requirement of the 2D engine;
1314 * it is also valid for 1cpp formats with the shader path (stencil aspect path)
1315 */
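/* Example (illustrative): with bufferOffset = 4 and a 1cpp source format,
 * src_va & 63 = 4, so the unaligned path below programs the 64-byte-aligned
 * base (src_va & ~63) and shifts the source window right by x = 4 texels to
 * compensate, one row per blit.
 */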
1316
1317 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1318
1319 for (uint32_t i = 0; i < layers; i++) {
1320 ops->dst(cs, dst_image, dst_format,
1321 info->imageSubresource.mipLevel,
1322 info->imageSubresource.baseArrayLayer + info->imageOffset.z + i);
1323
1324 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1325 if ((src_va & 63) || (pitch & 63)) {
1326 for (uint32_t y = 0; y < extent.height; y++) {
1327 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1328 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1329 x + extent.width, 1);
1330 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1331 &(VkExtent2D) {extent.width, 1});
1332 ops->run(cmd, cs);
1333 src_va += pitch;
1334 }
1335 } else {
1336 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1337 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1338 ops->run(cmd, cs);
1339 }
1340 }
1341 }
1342
1343 void
1344 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1345 VkBuffer srcBuffer,
1346 VkImage dstImage,
1347 VkImageLayout dstImageLayout,
1348 uint32_t regionCount,
1349 const VkBufferImageCopy *pRegions)
1350 {
1351 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1352 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1353 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1354
1355 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1356 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1357
1358 for (unsigned i = 0; i < regionCount; ++i)
1359 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1360 }
1361
1362 static void
1363 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1364 struct tu_image *src_image,
1365 struct tu_buffer *dst_buffer,
1366 const VkBufferImageCopy *info)
1367 {
1368 struct tu_cs *cs = &cmd->cs;
1369 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1370 VkFormat src_format = src_image->vk_format;
1371 VkFormat dst_format = src_image->vk_format;
1372 bool stencil_read = false;
1373
1374 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1375 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1376 dst_format = VK_FORMAT_R8_UNORM;
1377 stencil_read = true;
1378 }
1379
1380 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1381 VkOffset3D offset = info->imageOffset;
1382 VkExtent3D extent = info->imageExtent;
1383 uint32_t pitch = (info->bufferRowLength ?: extent.width) * vk_format_get_blocksize(dst_format);
1384 uint32_t layer_size = (info->bufferImageHeight ?: extent.height) * pitch;
1385
1386 if (src_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1387 assert(src_format == dst_format);
1388 copy_compressed(dst_format, &offset, &extent, &pitch, &layer_size);
1389 src_format = dst_format = copy_format(dst_format);
1390 }
1391
1392 /* note: the dst_va/pitch alignment of 64 is a requirement of the 2D engine;
1393 * it is also valid for 1cpp formats with the shader path (stencil aspect)
1394 */
1395
1396 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1397
1398 for (uint32_t i = 0; i < layers; i++) {
1399 ops->src(cmd, cs, src_image, src_format,
1400 info->imageSubresource.mipLevel,
1401 info->imageSubresource.baseArrayLayer + info->imageOffset.z + i,
1402 false, stencil_read);
1403
1404 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1405 if ((dst_va & 63) || (pitch & 63)) {
1406 for (uint32_t y = 0; y < extent.height; y++) {
1407 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1408 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1409 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1410 &(VkExtent2D) {extent.width, 1});
1411 ops->run(cmd, cs);
1412 dst_va += pitch;
1413 }
1414 } else {
1415 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1416 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1417 ops->run(cmd, cs);
1418 }
1419 }
1420 }
1421
1422 void
1423 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1424 VkImage srcImage,
1425 VkImageLayout srcImageLayout,
1426 VkBuffer dstBuffer,
1427 uint32_t regionCount,
1428 const VkBufferImageCopy *pRegions)
1429 {
1430 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1431 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1432 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1433
1434 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1435 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1436
1437 for (unsigned i = 0; i < regionCount; ++i)
1438 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1439 }
1440
1441 static void
1442 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1443 struct tu_image *src_image,
1444 struct tu_image *dst_image,
1445 const VkImageCopy *info)
1446 {
1447 const struct blit_ops *ops = &r2d_ops;
1448 struct tu_cs *cs = &cmd->cs;
1449
1450 uint8_t mask = 0xf;
1451 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1452 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1453 mask = 0x7;
1454 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1455 mask = 0x8;
1456 }
1457
1458 if (dst_image->samples > 1)
1459 ops = &r3d_ops;
1460
1461 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1462
1463 VkFormat format = VK_FORMAT_UNDEFINED;
1464 VkOffset3D src_offset = info->srcOffset;
1465 VkOffset3D dst_offset = info->dstOffset;
1466 VkExtent3D extent = info->extent;
1467
1468 /* TODO: should check (ubwc || (tile_mode && swap)) instead */
1469 if (src_image->layout.tile_mode && src_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1470 format = src_image->vk_format;
1471
1472 if (dst_image->layout.tile_mode && dst_image->vk_format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
1473 if (format != VK_FORMAT_UNDEFINED && format != dst_image->vk_format) {
1474 /* we could be clever in some cases, but in general this needs an
1475 * intermediate linear buffer
1476 */
1477 tu_finishme("image copy between two tiled/ubwc images\n");
1478 return;
1479 }
1480 format = dst_image->vk_format;
1481 }
1482
1483 if (format == VK_FORMAT_UNDEFINED)
1484 format = copy_format(src_image->vk_format);
1485
1486 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1487 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1488
1489 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1490 coords(ops, cs, &dst_offset, &src_offset, &extent);
1491
1492 for (uint32_t i = 0; i < info->extent.depth; i++) {
1493 ops->src(cmd, cs, src_image, format,
1494 info->srcSubresource.mipLevel,
1495 info->srcSubresource.baseArrayLayer + info->srcOffset.z + i,
1496 false, false);
1497 ops->dst(cs, dst_image, format,
1498 info->dstSubresource.mipLevel,
1499 info->dstSubresource.baseArrayLayer + info->dstOffset.z + i);
1500 ops->run(cmd, cs);
1501 }
1502 }
1503
1504 void
1505 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1506 VkImage srcImage,
1507 VkImageLayout srcImageLayout,
1508 VkImage destImage,
1509 VkImageLayout destImageLayout,
1510 uint32_t regionCount,
1511 const VkImageCopy *pRegions)
1512 {
1513 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1514 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1515 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1516
1517 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1518 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1519
1520 for (uint32_t i = 0; i < regionCount; ++i)
1521 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1522 }
1523
1524 static void
1525 copy_buffer(struct tu_cmd_buffer *cmd,
1526 uint64_t dst_va,
1527 uint64_t src_va,
1528 uint64_t size,
1529 uint32_t block_size)
1530 {
1531 const struct blit_ops *ops = &r2d_ops;
1532 struct tu_cs *cs = &cmd->cs;
1533 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1534 uint64_t blocks = size / block_size;
1535
1536 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1537
1538 while (blocks) {
1539 uint32_t src_x = (src_va & 63) / block_size;
1540 uint32_t dst_x = (dst_va & 63) / block_size;
1541 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1542
1543 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1544 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1545 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1546 ops->run(cmd, cs);
1547
1548 src_va += width * block_size;
1549 dst_va += width * block_size;
1550 blocks -= width;
1551 }
1552 }
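
/* Example (illustrative): copying 100 bytes when src_va ends in 0x20 and
 * dst_va is 64-byte aligned runs a single iteration with src_x = 32,
 * dst_x = 0 and width = 100; the MIN2 against 0x4000 keeps x + width within
 * the assumed 16384-texel limit of the 2D engine.
 */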
1553
1554 void
1555 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1556 VkBuffer srcBuffer,
1557 VkBuffer dstBuffer,
1558 uint32_t regionCount,
1559 const VkBufferCopy *pRegions)
1560 {
1561 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1562 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1563 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1564
1565 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1566 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1567
1568 for (unsigned i = 0; i < regionCount; ++i) {
1569 copy_buffer(cmd,
1570 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1571 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1572 pRegions[i].size, 1);
1573 }
1574 }
1575
1576 void
1577 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1578 VkBuffer dstBuffer,
1579 VkDeviceSize dstOffset,
1580 VkDeviceSize dataSize,
1581 const void *pData)
1582 {
1583 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1584 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1585
1586 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1587
1588 struct ts_cs_memory tmp;
1589 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1590 if (result != VK_SUCCESS) {
1591 cmd->record_result = result;
1592 return;
1593 }
1594
1595 memcpy(tmp.map, pData, dataSize);
1596 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1597 }
1598
1599 void
1600 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1601 VkBuffer dstBuffer,
1602 VkDeviceSize dstOffset,
1603 VkDeviceSize fillSize,
1604 uint32_t data)
1605 {
1606 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1607 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1608 const struct blit_ops *ops = &r2d_ops;
1609 struct tu_cs *cs = &cmd->cs;
1610
1611 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1612
1613 if (fillSize == VK_WHOLE_SIZE)
1614 fillSize = buffer->size - dstOffset;
1615
1616 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1617 uint32_t blocks = fillSize / 4;
1618
1619 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1620 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1621
1622 while (blocks) {
1623 uint32_t dst_x = (dst_va & 63) / 4;
1624 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1625
1626 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1627 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1628 ops->run(cmd, cs);
1629
1630 dst_va += width * 4;
1631 blocks -= width;
1632 }
1633 }
1634
1635 void
1636 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1637 VkImage srcImage,
1638 VkImageLayout srcImageLayout,
1639 VkImage dstImage,
1640 VkImageLayout dstImageLayout,
1641 uint32_t regionCount,
1642 const VkImageResolve *pRegions)
1643 {
1644 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1645 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1646 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1647 const struct blit_ops *ops = &r2d_ops;
1648 struct tu_cs *cs = &cmd->cs;
1649
1650 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1651 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1652
1653 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1654
1655 for (uint32_t i = 0; i < regionCount; ++i) {
1656 const VkImageResolve *info = &pRegions[i];
1657 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1658
1659 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1660 /* TODO: aspect masks possible ? */
1661
1662 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1663
1664 for (uint32_t i = 0; i < layers; i++) {
1665 ops->src(cmd, cs, src_image, src_image->vk_format,
1666 info->srcSubresource.mipLevel,
1667 info->srcSubresource.baseArrayLayer + info->srcOffset.z + i,
1668 false, false);
1669 ops->dst(cs, dst_image, dst_image->vk_format,
1670 info->dstSubresource.mipLevel,
1671 info->dstSubresource.baseArrayLayer + info->dstOffset.z + i);
1672 ops->run(cmd, cs);
1673 }
1674 }
1675 }
1676
1677 void
1678 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1679 struct tu_cs *cs,
1680 struct tu_image_view *src,
1681 struct tu_image_view *dst,
1682 uint32_t layers,
1683 const VkRect2D *rect)
1684 {
1685 const struct blit_ops *ops = &r2d_ops;
1686
1687 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1688 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1689
1690 assert(src->vk_format == dst->vk_format);
1691
1692 ops->setup(cmd, cs, dst->vk_format, ROTATE_0, false, 0xf);
1693 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1694
1695 for (uint32_t i = 0; i < layers; i++) {
1696 ops->src(cmd, cs, src->image, src->vk_format,
1697 src->base_mip,
1698 src->base_layer + i,
1699 false, false);
1700 ops->dst(cs, dst->image, dst->vk_format,
1701 dst->base_mip,
1702 dst->base_layer + i);
1703 ops->run(cmd, cs);
1704 }
1705 }
1706
1707 static void
1708 clear_image(struct tu_cmd_buffer *cmd,
1709 struct tu_image *image,
1710 const VkClearValue *clear_value,
1711 const VkImageSubresourceRange *range)
1712 {
1713 uint32_t level_count = tu_get_levelCount(image, range);
1714 uint32_t layer_count = tu_get_layerCount(image, range);
1715 struct tu_cs *cs = &cmd->cs;
1716 VkFormat format = image->vk_format;
1717 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1718 format = VK_FORMAT_R32_UINT;
1719
1720 if (image->type == VK_IMAGE_TYPE_3D) {
1721 assert(layer_count == 1);
1722 assert(range->baseArrayLayer == 0);
1723 }
1724
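/* D24S8 is apparently cleared as if it were RGBA8 (compare
 * tu_emit_clear_gmem_attachment below): depth occupies components 0-2
 * (mask 0x7) and stencil component 3 (mask 0x8), which lets the two
 * aspects be cleared independently.
 */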
1725 uint8_t mask = 0xf;
1726 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1727 mask = 0;
1728 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1729 mask |= 0x7;
1730 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1731 mask |= 0x8;
1732 }
1733
1734 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1735
1736 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1737 ops->clear_value(cs, image->vk_format, clear_value);
1738
1739 for (unsigned j = 0; j < level_count; j++) {
1740 if (image->type == VK_IMAGE_TYPE_3D)
1741 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1742
1743 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1744 u_minify(image->extent.width, range->baseMipLevel + j),
1745 u_minify(image->extent.height, range->baseMipLevel + j)
1746 });
1747
1748 for (uint32_t i = 0; i < layer_count; i++) {
1749 ops->dst(cs, image, format, range->baseMipLevel + j, range->baseArrayLayer + i);
1750 ops->run(cmd, cs);
1751 }
1752 }
1753 }
1754
1755 void
1756 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1757 VkImage image_h,
1758 VkImageLayout imageLayout,
1759 const VkClearColorValue *pColor,
1760 uint32_t rangeCount,
1761 const VkImageSubresourceRange *pRanges)
1762 {
1763 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1764 TU_FROM_HANDLE(tu_image, image, image_h);
1765
1766 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1767
1768 for (unsigned i = 0; i < rangeCount; i++)
1769 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1770 }
1771
1772 void
1773 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1774 VkImage image_h,
1775 VkImageLayout imageLayout,
1776 const VkClearDepthStencilValue *pDepthStencil,
1777 uint32_t rangeCount,
1778 const VkImageSubresourceRange *pRanges)
1779 {
1780 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1781 TU_FROM_HANDLE(tu_image, image, image_h);
1782
1783 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1784
1785 for (unsigned i = 0; i < rangeCount; i++)
1786 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1787 }
1788
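/* Clear sysmem attachments with the 2D engine. The 2D engine apparently
 * writes depth through the color CCU, so clearing the depth/stencil
 * attachment is bracketed by cache maintenance: depth is flushed and the
 * color cache invalidated before the clear, and the reverse sync is done
 * once the clear has run.
 */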
1789 static void
1790 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1791 uint32_t attachment_count,
1792 const VkClearAttachment *attachments,
1793 uint32_t rect_count,
1794 const VkClearRect *rects)
1795 {
1796 const struct tu_subpass *subpass = cmd->state.subpass;
1797 /* note: cannot use the shader path here; the special shader path lives
1798 * in tu_clear_sysmem_attachments()
1799 */
1800 const struct blit_ops *ops = &r2d_ops;
1801 struct tu_cs *cs = &cmd->draw_cs;
1802
1803 for (uint32_t j = 0; j < attachment_count; j++) {
1804 uint32_t a;
1805 uint8_t mask = 0xf;
1806
1807 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1808 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1809 } else {
1810 a = subpass->depth_stencil_attachment.attachment;
1811
1812 /* sync depth into color */
1813 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1814 /* also flush color to avoid losing contents from invalidate */
1815 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1816 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1817
1818
1819 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1820 mask &= ~0x7;
1821 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1822 mask &= ~0x8;
1823 }
1824
1825 if (a == VK_ATTACHMENT_UNUSED)
1826 continue;
1827
1828 const struct tu_image_view *iview =
1829 cmd->state.framebuffer->attachments[a].attachment;
1830
1831 ops->setup(cmd, cs, iview->vk_format, ROTATE_0, true, mask);
1832 ops->clear_value(cs, iview->vk_format, &attachments[j].clearValue);
1833
1834 for (uint32_t i = 0; i < rect_count; i++) {
1835 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1836 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1837 ops->dst(cs, iview->image, iview->vk_format, iview->base_mip,
1838 iview->base_layer + rects[i].baseArrayLayer + layer);
1839 ops->run(cmd, cs);
1840 }
1841 }
1842
1843 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1844 /* does not use CCU - flush
1845 * note: a cache invalidate might be needed too, it's just not covered by test cases
1846 */
1847 if (attachments[j].colorAttachment > 0)
1848 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1849 } else {
1850 /* sync color into depth */
1851 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1852 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1853 }
1854 }
1855 }
1856
1857 static void
1858 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1859 uint32_t attachment_count,
1860 const VkClearAttachment *attachments,
1861 uint32_t rect_count,
1862 const VkClearRect *rects)
1863 {
1864 /* the shader path here is special, it avoids changing MRT/etc state */
1865 const struct tu_render_pass *pass = cmd->state.pass;
1866 const struct tu_subpass *subpass = cmd->state.subpass;
1867 const uint32_t mrt_count = subpass->color_count;
1868 struct tu_cs *cs = &cmd->draw_cs;
1869 uint32_t clear_value[MAX_RTS][4];
1870 float z_clear_val = 0.0f;
1871 uint8_t s_clear_val = 0;
1872 uint32_t clear_rts = 0, num_rts = 0, b;
1873 bool z_clear = false;
1874 bool s_clear = false;
1875 uint32_t max_samples = 1;
1876
1877 for (uint32_t i = 0; i < attachment_count; i++) {
1878 uint32_t a;
1879 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1880 uint32_t c = attachments[i].colorAttachment;
1881 a = subpass->color_attachments[c].attachment;
1882 if (a == VK_ATTACHMENT_UNUSED)
1883 continue;
1884
1885 clear_rts |= 1 << c;
1886 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1887 } else {
1888 a = subpass->depth_stencil_attachment.attachment;
1889 if (a == VK_ATTACHMENT_UNUSED)
1890 continue;
1891
1892 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1893 z_clear = true;
1894 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1895 }
1896
1897 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1898 s_clear = true;
1899 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1900 }
1901 }
1902
1903 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1904 }
1905
1906 /* prefer to use the 2D path for clears;
1907 * 2D can't clear separate depth/stencil or MSAA, and needs a known framebuffer
1908 */
1909 if (max_samples == 1 && cmd->state.framebuffer) {
1910 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1911 return;
1912 }
1913
1914 /* TODO: this path doesn't take into account multilayer rendering */
1915
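/* regid 0xfc marks an output as unused: the clear shader exports neither
 * depth nor sample mask, only the color MRTs set up below.
 */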
1916 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1917 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1918 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1919 0xfc000000);
1920 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1921
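/* map each cleared MRT to the next consecutive FS output register; regids
 * count scalar registers, hence num_rts++ * 4 per vec4 output, while RTs
 * not being cleared get no output register at all.
 */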
1922 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1923 for (uint32_t i = 0; i < mrt_count; i++) {
1924 if (clear_rts & (1 << i))
1925 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1926 else
1927 tu_cs_emit(cs, 0);
1928 }
1929
1930 r3d_pipeline(cmd, cs, false, num_rts);
1931
1932 tu_cs_emit_regs(cs,
1933 A6XX_RB_FS_OUTPUT_CNTL0(),
1934 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1935
1936 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1937 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1938 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1939 for (uint32_t i = 0; i < mrt_count; i++) {
1940 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1941 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1942 }
1943
1944 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1945 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1946 .z_enable = z_clear,
1947 .z_write_enable = z_clear,
1948 .zfunc = FUNC_ALWAYS));
1949 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
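/* note: VK_STENCIL_OP_REPLACE is used directly below; its value happens to
 * match the hardware STENCIL_REPLACE encoding, so no translation is needed.
 */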
1950 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1951 .stencil_enable = s_clear,
1952 .func = FUNC_ALWAYS,
1953 .zpass = VK_STENCIL_OP_REPLACE));
1954 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1955 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1956 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1957
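/* upload the clear colors as FS constants, one vec4 per cleared RT, in
 * clear_rts bit order to match the output regid assignment above; the
 * clear shader presumably just copies constant N to output N.
 */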
1958 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1959 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1960 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1961 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1962 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1963 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1964 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1965 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1966 for_each_bit(b, clear_rts)
1967 tu_cs_emit_array(cs, clear_value[b], 4);
1968
1969 for (uint32_t i = 0; i < rect_count; i++) {
1970 r3d_coords_raw(cs, (float[]) {
1971 rects[i].rect.offset.x, rects[i].rect.offset.y,
1972 z_clear_val, 1.0f,
1973 rects[i].rect.offset.x + rects[i].rect.extent.width,
1974 rects[i].rect.offset.y + rects[i].rect.extent.height,
1975 z_clear_val, 1.0f
1976 });
1977 r3d_run(cmd, cs);
1978 }
1979
1980 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
1981 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
1982 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
1983 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
1984 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
1985 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
1986 }
1987
1988 /**
1989 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
1990 * for the component order: the components are always packed in WZYX order,
1991 * because gmem is tiled and tiled formats always have the WZYX swap.
1992 */
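/* A minimal worked example, assuming VK_FORMAT_R8G8B8A8_UNORM and a clear
 * color of (1.0, 0.5, 0.0, 1.0): each channel is packed with
 * tu_pack_float32_for_unorm(), giving R=0xff, G=0x80 (127.5 rounds to
 * even), B=0x00, A=0xff, packed low bits first, so buf[0] = 0xff0080ff and
 * buf[1..3] are left untouched.
 */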
1993 static void
1994 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
1995 {
1996 const struct util_format_description *desc = vk_format_description(format);
1997
1998 switch (format) {
1999 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2000 buf[0] = float3_to_r11g11b10f(val->color.float32);
2001 return;
2002 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2003 buf[0] = float3_to_rgb9e5(val->color.float32);
2004 return;
2005 default:
2006 break;
2007 }
2008
2009 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2010
2011 /* S8_UINT is special and has no depth */
2012 const int max_components =
2013 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2014
2015 int buf_offset = 0;
2016 int bit_shift = 0;
2017 for (int comp = 0; comp < max_components; comp++) {
2018 const struct util_format_channel_description *ch =
2019 tu_get_format_channel_description(desc, comp);
2020 if (!ch) {
2021 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2022 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2023 continue;
2024 }
2025
2026 union tu_clear_component_value v = tu_get_clear_component_value(
2027 val, comp, desc->colorspace);
2028
2029 /* move to the next uint32_t when there is not enough space */
2030 assert(ch->size <= 32);
2031 if (bit_shift + ch->size > 32) {
2032 buf_offset++;
2033 bit_shift = 0;
2034 }
2035
2036 if (bit_shift == 0)
2037 buf[buf_offset] = 0;
2038
2039 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2040 bit_shift += ch->size;
2041 }
2042 }
2043
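/* Clear a single gmem attachment using the blit event: RB_BLIT_INFO_GMEM
 * plus a CLEAR_MASK makes the BLIT event write the raw packed clear value
 * from pack_gmem_clear_value() at the attachment's gmem offset instead of
 * copying from sysmem.
 */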
2044 static void
2045 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2046 struct tu_cs *cs,
2047 uint32_t attachment,
2048 uint8_t component_mask,
2049 const VkClearValue *value)
2050 {
2051 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2052 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2053 * because D24S8 is cleared with AS_R8G8B8A8 format
2054 */
2055
2056 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2057 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2058
2059 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2060 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2061
2062 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2063 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2064
2065 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2066 tu_cs_emit(cs, 0);
2067
2068 uint32_t clear_vals[4] = {};
2069 pack_gmem_clear_value(value, vk_format, clear_vals);
2070
2071 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2072 tu_cs_emit_array(cs, clear_vals, 4);
2073
2074 tu6_emit_event_write(cmd, cs, BLIT, false);
2075 }
2076
2077 static void
2078 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2079 uint32_t attachment_count,
2080 const VkClearAttachment *attachments,
2081 uint32_t rect_count,
2082 const VkClearRect *rects)
2083 {
2084 const struct tu_subpass *subpass = cmd->state.subpass;
2085 struct tu_cs *cs = &cmd->draw_cs;
2086
2087 /* TODO: swap the loops for smaller cmdstream */
2088 for (unsigned i = 0; i < rect_count; i++) {
2089 unsigned x1 = rects[i].rect.offset.x;
2090 unsigned y1 = rects[i].rect.offset.y;
2091 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2092 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2093
2094 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2095 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2096 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2097
2098 for (unsigned j = 0; j < attachment_count; j++) {
2099 uint32_t a;
2100 unsigned clear_mask = 0;
2101 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2102 clear_mask = 0xf;
2103 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2104 } else {
2105 a = subpass->depth_stencil_attachment.attachment;
2106 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
2107 clear_mask |= 0x7;
2108 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
2109 clear_mask |= 0x8;
2110 }
2111
2112 if (a == VK_ATTACHMENT_UNUSED)
2113 continue;
2114
2115 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2116 &attachments[j].clearValue);
2117 }
2118 }
2119 }
2120
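/* Whether this render pass ends up rendering in gmem or sysmem mode isn't
 * known at record time, so both clear variants are emitted under
 * CP_COND_EXEC and only the matching one executes.
 */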
2121 void
2122 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2123 uint32_t attachmentCount,
2124 const VkClearAttachment *pAttachments,
2125 uint32_t rectCount,
2126 const VkClearRect *pRects)
2127 {
2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129 struct tu_cs *cs = &cmd->draw_cs;
2130
2131 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2132 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2133 tu_cond_exec_end(cs);
2134
2135 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2136 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2137 tu_cond_exec_end(cs);
2138 }
2139
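/* Clear an attachment at render pass begin on the sysmem path. Only the
 * aspects with VK_ATTACHMENT_LOAD_OP_CLEAR are cleared; for D24S8 the
 * depth and stencil aspects are selected independently via the component
 * mask.
 */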
2140 void
2141 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2142 struct tu_cs *cs,
2143 uint32_t a,
2144 const VkRenderPassBeginInfo *info)
2145 {
2146 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2147 const struct tu_image_view *iview = fb->attachments[a].attachment;
2148 const struct tu_render_pass_attachment *attachment =
2149 &cmd->state.pass->attachments[a];
2150 uint8_t mask = 0;
2151
2152 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
2153 mask = 0xf;
2154
2155 if (iview->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
2156 mask &= 0x7;
2157 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
2158 mask |= 0x8;
2159 }
2160
2161 /* gmem_offset < 0 means the attachment isn't used by any subpass and shouldn't be cleared */
2162 if (attachment->gmem_offset < 0 || !mask)
2163 return;
2164
2165 const struct blit_ops *ops = &r2d_ops;
2166 if (attachment->samples > 1)
2167 ops = &r3d_ops;
2168
2169 ops->setup(cmd, cs, iview->vk_format, ROTATE_0, true, mask);
2170 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2171 ops->clear_value(cs, iview->vk_format, &info->pClearValues[a]);
2172
2173 for (uint32_t i = 0; i < fb->layers; i++) {
2174 ops->dst(cs, iview->image, iview->vk_format, iview->base_mip, iview->base_layer + i);
2175 ops->run(cmd, cs);
2176 }
2177 }
2178
2179 void
2180 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2181 struct tu_cs *cs,
2182 uint32_t a,
2183 const VkRenderPassBeginInfo *info)
2184 {
2185 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2186 const struct tu_image_view *iview = fb->attachments[a].attachment;
2187 const struct tu_render_pass_attachment *attachment =
2188 &cmd->state.pass->attachments[a];
2189 unsigned clear_mask = 0;
2190
2191 /* note: gmem_offset < 0 means the attachment isn't used by any subpass and shouldn't be cleared anyway */
2192 if (attachment->gmem_offset < 0)
2193 return;
2194
2195 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
2196 clear_mask = 0xf;
2197
2198 if (vk_format_has_stencil(iview->vk_format)) {
2199 clear_mask &= 0x7;
2200 if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR)
2201 clear_mask |= 0x8;
2202 }
2203 if (!clear_mask)
2204 return;
2205
2206 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2207
2208 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2209 &info->pClearValues[a]);
2210 }
2211
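/* Emit a blit event for a gmem attachment: resolve=false loads the
 * attachment from sysmem into gmem, resolve=true stores/resolves it back
 * out (the unk0/gmem bits appear to be what selects the direction).
 */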
2212 static void
2213 tu_emit_blit(struct tu_cmd_buffer *cmd,
2214 struct tu_cs *cs,
2215 const struct tu_image_view *iview,
2216 struct tu_render_pass_attachment *attachment,
2217 bool resolve)
2218 {
2219 const struct tu_native_format format =
2220 tu6_format_image(iview->image, iview->vk_format, iview->base_mip);
2221
2222 tu_cs_emit_regs(cs,
2223 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2224
2225 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2226 .unk0 = !resolve,
2227 .gmem = !resolve,
2228 /* "integer" bit disables msaa resolve averaging */
2229 .integer = vk_format_is_int(iview->vk_format)));
2230
2231 tu_cs_emit_regs(cs,
2232 A6XX_RB_BLIT_DST_INFO(
2233 .tile_mode = format.tile_mode,
2234 .samples = tu_msaa_samples(iview->image->samples),
2235 .color_format = format.fmt,
2236 .color_swap = format.swap,
2237 .flags = iview->image->layout.ubwc_layer_size != 0),
2238 A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
2239 A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
2240 A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size));
2241
2242 if (iview->image->layout.ubwc_layer_size) {
2243 tu_cs_emit_regs(cs,
2244 A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)),
2245 A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview)));
2246 }
2247
2248 tu_cs_emit_regs(cs,
2249 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2250
2251 tu6_emit_event_write(cmd, cs, BLIT, false);
2252 }
2253
2254 static bool
2255 blit_can_resolve(VkFormat format)
2256 {
2257 const struct util_format_description *desc = vk_format_description(format);
2258
2259 /* blit event can only do resolve for simple cases:
2260 * averaging samples as unsigned integers or choosing only one sample
2261 */
2262 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2263 return false;
2264
2265 /* can't do formats with larger channel sizes
2266 * note: this includes all float formats
2267 * note2: single channel integer formats seem OK
2268 */
2269 if (desc->channel[0].size > 10)
2270 return false;
2271
2272 switch (format) {
2273 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2274 * likely related to these formats having different layout from other cpp=2 formats
2275 */
2276 case VK_FORMAT_R8G8_UNORM:
2277 case VK_FORMAT_R8G8_UINT:
2278 case VK_FORMAT_R8G8_SINT:
2279 /* TODO: this one should be able to work? */
2280 case VK_FORMAT_D24_UNORM_S8_UINT:
2281 return false;
2282 default:
2283 break;
2284 }
2285
2286 return true;
2287 }
2288
2289 void
2290 tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
2291 {
2292 tu_emit_blit(cmd, cs,
2293 cmd->state.framebuffer->attachments[a].attachment,
2294 &cmd->state.pass->attachments[a],
2295 false);
2296 }
2297
2298 void
2299 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
2300 {
2301 const struct tu_render_pass_attachment *attachment =
2302 &cmd->state.pass->attachments[a];
2303
2304 if (attachment->gmem_offset < 0)
2305 return;
2306
2307 if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD ||
2308 (vk_format_has_stencil(attachment->format) &&
2309 attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD)) {
2310 tu_emit_load_gmem_attachment(cmd, cs, a);
2311 }
2312 }
2313
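/* Store a gmem attachment out to sysmem at the end of the tile pass. a is
 * the destination attachment and gmem_a the gmem source; they differ when
 * an MSAA attachment is being resolved into its resolve target.
 */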
2314 void
2315 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2316 struct tu_cs *cs,
2317 uint32_t a,
2318 uint32_t gmem_a)
2319 {
2320 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2321 const VkRect2D *render_area = &tiling->render_area;
2322 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2323 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2324 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2325
2326 if (dst->store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE)
2327 return;
2328
2329 uint32_t x1 = render_area->offset.x;
2330 uint32_t y1 = render_area->offset.y;
2331 uint32_t x2 = x1 + render_area->extent.width;
2332 uint32_t y2 = y1 + render_area->extent.height;
2333 /* x2/y2 can be unaligned if equal to the size of the image, since the
2334 * store will then write into padding space;
2335 * the one exception is linear mip levels, which don't have the
2336 * required y padding in the layout (except for the last level)
2337 */
2338 bool need_y2_align =
2339 y2 != iview->extent.height ||
2340 (tu6_get_image_tile_mode(iview->image, iview->base_mip) == TILE6_LINEAR &&
2341 iview->base_mip != iview->image->level_count - 1);
2342
2343 bool unaligned =
2344 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2345 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2346
2347 /* use fast path when render area is aligned, except for unsupported resolve cases */
2348 if (!unaligned && (a == gmem_a || blit_can_resolve(iview->vk_format))) {
2349 tu_emit_blit(cmd, cs, iview, src, true);
2350 return;
2351 }
2352
2353 if (dst->samples > 1) {
2354 /* we probably need to use the shader path in this case;
2355 * need a testcase which fails because of this
2356 */
2357 tu_finishme("unaligned store of msaa attachment\n");
2358 return;
2359 }
2360
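/* Slow path: point the 2D engine's source directly at gmem (gmem_base +
 * gmem_offset, in the tiled TILE6_2 layout) and CP_BLIT it into the image,
 * averaging samples on the way for non-integer formats.
 */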
2361 r2d_setup_common(cmd, cs, iview->vk_format, ROTATE_0, false, 0xf, true);
2362 r2d_dst(cs, iview->image, iview->vk_format, iview->base_mip, iview->base_layer);
2363 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2364
2365 tu_cs_emit_regs(cs,
2366 A6XX_SP_PS_2D_SRC_INFO(
2367 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2368 .tile_mode = TILE6_2,
2369 .srgb = vk_format_is_srgb(src->format),
2370 .samples = tu_msaa_samples(src->samples),
2371 .samples_average = !vk_format_is_int(src->format),
2372 .unk20 = 1,
2373 .unk22 = 1),
2374 /* note: src size does not matter when not scaling */
2375 A6XX_SP_PS_2D_SRC_SIZE(.width = 0x3fff, .height = 0x3fff),
2376 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2377 A6XX_SP_PS_2D_SRC_HI(),
2378 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2379
2380 /* sync GMEM writes with CACHE */
2381 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2382
2383 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2384 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2385
2386 /* TODO: flush with barriers instead of blindly always flushing */
2387 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2388 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2389 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2390 }