turnip: add support for D32_SFLOAT_S8_UINT
[mesa.git] / src / freedreno / vulkan / tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
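/* Quantize a float in [0,1] to an n-bit unorm value using round-half-to-even,
 * e.g. tu_pack_float32_for_unorm(0.5f, 8) == _mesa_lroundevenf(127.5) == 128.
 */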
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
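/* Map an a6xx hardware format to the 2D engine's intermediate format, which
 * determines how r2d_clear_value and r2d_setup_common below pack and
 * interpret source values.
 */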
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(src->x),
111 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
114 }
115
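/* Pack a VkClearValue into the four RB_2D_SRC_SOLID_Cn dwords according to
 * the format's intermediate format: D24S8 as r8g8b8a8_unorm bytes, E5B9G9R9
 * as a raw uint32, floats as f32/f16, and integer formats as-is.
 */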
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
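/* 2D destination setup for an image's separate stencil plane (as used for
 * VK_FORMAT_D32_SFLOAT_S8_UINT); the stencil plane carries no UBWC flag
 * data, so the FLAGS bit is masked off.
 */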
231 static void
232 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
233 {
234 assert(iview->image->samples == 1);
235
236 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
237 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
238 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
239 tu_cs_emit(cs, iview->stencil_PITCH);
240 }
241
242 static void
243 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
244 {
245 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
246
247 tu_cs_emit_regs(cs,
248 A6XX_RB_2D_DST_INFO(
249 .color_format = format.fmt,
250 .color_swap = format.swap,
251 .srgb = vk_format_is_srgb(vk_format)),
252 A6XX_RB_2D_DST_LO((uint32_t) va),
253 A6XX_RB_2D_DST_HI(va >> 32),
254 A6XX_RB_2D_DST_PITCH(pitch));
255 }
256
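/* Program the 2D blit state shared by clears and blits: the 8c01 channel
 * mask used for partial D24S8 clears, RB/GRAS_2D_BLIT_CNTL, and the
 * destination format conversion in SP_2D_DST_FORMAT.
 */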
257 static void
258 r2d_setup_common(struct tu_cmd_buffer *cmd,
259 struct tu_cs *cs,
260 VkFormat vk_format,
261 VkImageAspectFlags aspect_mask,
262 enum a6xx_rotation rotation,
263 bool clear,
264 bool ubwc,
265 bool scissor)
266 {
267 enum a6xx_format format = tu6_base_format(vk_format);
268 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
269 uint32_t unknown_8c01 = 0;
270
271 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
272 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
273 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
274 }
275
276 /* note: the only format with partial clearing is D24S8 */
277 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
278 /* preserve stencil channel */
279 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
280 unknown_8c01 = 0x08000041;
281 /* preserve depth channels */
282 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
283 unknown_8c01 = 0x00084001;
284 }
285
286 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
287 tu_cs_emit(cs, unknown_8c01);
288
289 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
290 .scissor = scissor,
291 .rotate = rotation,
292 .solid_color = clear,
293 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
294 .color_format = format,
295 .mask = 0xf,
296 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
297 ).value;
298
299 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
300 tu_cs_emit(cs, blit_cntl);
301
302 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
303 tu_cs_emit(cs, blit_cntl);
304
305 if (format == FMT6_10_10_10_2_UNORM_DEST)
306 format = FMT6_16_16_16_16_FLOAT;
307
308 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
309 .sint = vk_format_is_sint(vk_format),
310 .uint = vk_format_is_uint(vk_format),
311 .color_format = format,
312 .srgb = vk_format_is_srgb(vk_format),
313 .mask = 0xf));
314 }
315
316 static void
317 r2d_setup(struct tu_cmd_buffer *cmd,
318 struct tu_cs *cs,
319 VkFormat vk_format,
320 VkImageAspectFlags aspect_mask,
321 enum a6xx_rotation rotation,
322 bool clear,
323 bool ubwc)
324 {
325 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
326
327 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
328 }
329
330 static void
331 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
332 {
333 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
334 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
335 }
336
337 /* r3d_ = shader path operations */
338
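/* Hand-assemble the tiny shaders used by the r3d_ path and store them in the
 * global bo: a VS that selects one of two const-supplied corners per vertex
 * (for a two-vertex RECTLIST), a blit FS that only needs a bary.f (the
 * texture read itself comes from the sampler prefetch set up in r3d_common),
 * and one clear FS per MRT count that movs clear colors from consts into
 * r0..r(n-1).
 */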
339 void
340 tu_init_clear_blit_shaders(struct tu6_global *global)
341 {
342 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
343 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
344 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
345
346 static const instr_t vs_code[] = {
347 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
348 * r1.xy = r0.w ? c1.zw : c0.zw
349 * r0.w = 1.0f
350 */
351 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
352 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
353 .src2 = 3,
354 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
355 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
356 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
357 .src2 = 3,
358 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
359 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
360 { .cat0 = { .opc = OPC_END } },
361 };
362
363 static const instr_t fs_blit[] = {
364 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
365 * blit path (its not clear what allows it to not have it)
366 */
367 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
368 { .cat0 = { .opc = OPC_END } },
369 };
370
371 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
372 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
373
374 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
375 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
376 for (uint32_t i = 0; i < num_rts; i++) {
377 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
378 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
379 }
380 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
381 }
382 }
383
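/* Emit the shader-related state for the 3D path: invalidate stale state,
 * bind the hand-written VS/FS from the global bo, and configure VPC/GRAS for
 * a RECTLIST draw with viewport transform and clipping disabled.
 */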
384 static void
385 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
386 bool layered_clear)
387 {
388 struct ir3_const_state dummy_const_state = {};
389 struct ir3_shader dummy_shader = {};
390
391 struct ir3_shader_variant vs = {
392 .type = MESA_SHADER_VERTEX,
393 .instrlen = 1,
394 .constlen = 4,
395 .info.max_reg = 1,
396 .inputs_count = 1,
397 .inputs[0] = {
398 .slot = SYSTEM_VALUE_VERTEX_ID,
399 .regid = regid(0, 3),
400 .sysval = true,
401 },
402 .outputs_count = blit ? 2 : 1,
403 .outputs[0] = {
404 .slot = VARYING_SLOT_POS,
405 .regid = regid(0, 0),
406 },
407 .outputs[1] = {
408 .slot = VARYING_SLOT_VAR0,
409 .regid = regid(1, 0),
410 },
411 .shader = &dummy_shader,
412 .const_state = &dummy_const_state,
413 };
414 if (layered_clear) {
415 vs.outputs[1].slot = VARYING_SLOT_LAYER;
416 vs.outputs[1].regid = regid(1, 1);
417 vs.outputs_count = 2;
418 }
419
420 struct ir3_shader_variant fs = {
421 .type = MESA_SHADER_FRAGMENT,
422 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
423 .constlen = align(num_rts, 4),
424 .info.max_reg = MAX2(num_rts, 1) - 1,
425 .total_in = blit ? 2 : 0,
426 .num_samp = blit ? 1 : 0,
427 .inputs_count = blit ? 2 : 0,
428 .inputs[0] = {
429 .slot = VARYING_SLOT_VAR0,
430 .inloc = 0,
431 .compmask = 3,
432 .bary = true,
433 },
434 .inputs[1] = {
435 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
436 .regid = regid(0, 0),
437 .sysval = 1,
438 },
439 .num_sampler_prefetch = blit ? 1 : 0,
440 .sampler_prefetch[0] = {
441 .src = 0,
442 .wrmask = 0xf,
443 .cmd = 4,
444 },
445 .shader = &dummy_shader,
446 .const_state = &dummy_const_state,
447 };
448
449 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
450 .vs_state = true,
451 .hs_state = true,
452 .ds_state = true,
453 .gs_state = true,
454 .fs_state = true,
455 .cs_state = true,
456 .gfx_ibo = true,
457 .cs_ibo = true,
458 .gfx_shared_const = true,
459 .gfx_bindless = 0x1f,
460 .cs_bindless = 0x1f));
461
462 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
463 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
464 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
465 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
466 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
467 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
468
469 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
470 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
471
472 tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
473
474 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
475 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
476 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
477
478 tu6_emit_fs_inputs(cs, &fs);
479
480 tu_cs_emit_regs(cs,
481 A6XX_GRAS_CL_CNTL(
482 .persp_division_disable = 1,
483 .vp_xform_disable = 1,
484 .vp_clip_code_ignore = 1,
485 .clip_disable = 1));
486 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
487
488 tu_cs_emit_regs(cs,
489 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
490 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
491 tu_cs_emit_regs(cs,
492 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
493 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
494
495 tu_cs_emit_regs(cs,
496 A6XX_VFD_INDEX_OFFSET(),
497 A6XX_VFD_INSTANCE_START_OFFSET());
498 }
499
500 static void
501 r3d_coords_raw(struct tu_cs *cs, const float *coords)
502 {
503 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
504 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
505 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
506 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
507 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
508 CP_LOAD_STATE6_0_NUM_UNIT(2));
509 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
510 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
511 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
512 }
513
514 static void
515 r3d_coords(struct tu_cs *cs,
516 const VkOffset2D *dst,
517 const VkOffset2D *src,
518 const VkExtent2D *extent)
519 {
520 int32_t src_x1 = src ? src->x : 0;
521 int32_t src_y1 = src ? src->y : 0;
522 r3d_coords_raw(cs, (float[]) {
523 dst->x, dst->y,
524 src_x1, src_y1,
525 dst->x + extent->width, dst->y + extent->height,
526 src_x1 + extent->width, src_y1 + extent->height,
527 });
528 }
529
530 static void
531 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
532 {
533 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
534 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
535 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
536 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
537 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
538 CP_LOAD_STATE6_0_NUM_UNIT(1));
539 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
540 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
541 switch (format) {
542 case VK_FORMAT_X8_D24_UNORM_PACK32:
543 case VK_FORMAT_D24_UNORM_S8_UINT: {
544 /* cleared as r8g8b8a8_unorm using special format */
545 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
546 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
547 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
548 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
549 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
550 } break;
551 case VK_FORMAT_D16_UNORM:
552 case VK_FORMAT_D32_SFLOAT:
553 tu_cs_emit(cs, fui(val->depthStencil.depth));
554 tu_cs_emit(cs, 0);
555 tu_cs_emit(cs, 0);
556 tu_cs_emit(cs, 0);
557 break;
558 case VK_FORMAT_S8_UINT:
559 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
560 tu_cs_emit(cs, 0);
561 tu_cs_emit(cs, 0);
562 tu_cs_emit(cs, 0);
563 break;
564 default:
565 /* as color formats use clear value as-is */
566 assert(!vk_format_is_depth_or_stencil(format));
567 tu_cs_emit_array(cs, val->color.uint32, 4);
568 break;
569 }
570 }
571
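/* Emit the FS texture + sampler for the 3D path: copy the given descriptor
 * into sub_cs memory, patch the base and UBWC addresses by the layer
 * offsets, append a clamp-to-edge sampler with unnormalized coords, and
 * point SP_FS_TEX_CONST/SAMP at the allocation.
 */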
572 static void
573 r3d_src_common(struct tu_cmd_buffer *cmd,
574 struct tu_cs *cs,
575 const uint32_t *tex_const,
576 uint32_t offset_base,
577 uint32_t offset_ubwc,
578 VkFilter filter)
579 {
580 struct tu_cs_memory texture = { };
581 VkResult result = tu_cs_alloc(&cmd->sub_cs,
582 2, /* allocate space for a sampler too */
583 A6XX_TEX_CONST_DWORDS, &texture);
584 assert(result == VK_SUCCESS);
585
586 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
587
588 /* patch addresses for layer offset */
589 *(uint64_t*) (texture.map + 4) += offset_base;
590 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
591 texture.map[7] = ubwc_addr;
592 texture.map[8] = ubwc_addr >> 32;
593
594 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
595 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
596 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
597 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
598 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
599 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
600 0x60000; /* XXX used by blob, doesn't seem necessary */
601 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
602 0x1 | /* XXX used by blob, doesn't seem necessary */
603 A6XX_TEX_SAMP_1_UNNORM_COORDS |
604 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
605 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
606 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
607
608 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
609 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
610 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
611 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
612 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
613 CP_LOAD_STATE6_0_NUM_UNIT(1));
614 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
615
616 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
617 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
618
619 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
620 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
621 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
622 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
623 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
624 CP_LOAD_STATE6_0_NUM_UNIT(1));
625 tu_cs_emit_qw(cs, texture.iova);
626
627 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
628 tu_cs_emit_qw(cs, texture.iova);
629
630 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
631 }
632
633 static void
634 r3d_src(struct tu_cmd_buffer *cmd,
635 struct tu_cs *cs,
636 const struct tu_image_view *iview,
637 uint32_t layer,
638 VkFilter filter)
639 {
640 r3d_src_common(cmd, cs, iview->descriptor,
641 iview->layer_size * layer,
642 iview->ubwc_layer_size * layer,
643 filter);
644 }
645
646 static void
647 r3d_src_buffer(struct tu_cmd_buffer *cmd,
648 struct tu_cs *cs,
649 VkFormat vk_format,
650 uint64_t va, uint32_t pitch,
651 uint32_t width, uint32_t height)
652 {
653 uint32_t desc[A6XX_TEX_CONST_DWORDS];
654
655 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
656
657 desc[0] =
658 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
659 A6XX_TEX_CONST_0_FMT(format.fmt) |
660 A6XX_TEX_CONST_0_SWAP(format.swap) |
661 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
662 // XXX to swizzle into .w for stencil buffer_to_image
663 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
664 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
665 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
666 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
667 desc[2] =
668 A6XX_TEX_CONST_2_PITCH(pitch) |
669 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
670 desc[3] = 0;
671 desc[4] = va;
672 desc[5] = va >> 32;
673 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
674 desc[i] = 0;
675
676 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
677 }
678
679 static void
680 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
681 {
682 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
683
684 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
685 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
686 tu_cs_image_ref(cs, iview, layer);
687 tu_cs_emit(cs, 0);
688
689 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
690 tu_cs_image_flag_ref(cs, iview, layer);
691
692 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
693 }
694
695 static void
696 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
697 {
698 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
699
700 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
701 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
702 tu_cs_image_stencil_ref(cs, iview, layer);
703 tu_cs_emit(cs, 0);
704
705 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
706 }
707
708 static void
709 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
710 {
711 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
712
713 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
714
715 tu_cs_emit_regs(cs,
716 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
717 A6XX_RB_MRT_PITCH(0, pitch),
718 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
719 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
720 A6XX_RB_MRT_BASE_HI(0, va >> 32),
721 A6XX_RB_MRT_BASE_GMEM(0, 0));
722
723 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
724 }
725
726 static uint8_t
727 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
728 {
729 uint8_t mask = 0xf;
730 assert(aspect_mask);
731 /* note: the only format with partial writing is D24S8,
732 * clear/blit uses the _AS_R8G8B8A8 format to access it
733 */
734 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
735 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
736 mask = 0x7;
737 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
738 mask = 0x8;
739 }
740 return mask;
741 }
742
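/* Set up the 3D path for a clear or blit to a single MRT: outside a render
 * pass this also flushes caches and resets the window scissor; then all
 * depth/stencil/blend state is disabled and the MRT format and write mask
 * are programmed.
 */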
743 static void
744 r3d_setup(struct tu_cmd_buffer *cmd,
745 struct tu_cs *cs,
746 VkFormat vk_format,
747 VkImageAspectFlags aspect_mask,
748 enum a6xx_rotation rotation,
749 bool clear,
750 bool ubwc)
751 {
752 enum a6xx_format format = tu6_base_format(vk_format);
753
754 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
755 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
756 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
757 }
758
759 if (!cmd->state.pass) {
760 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
761 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
762 }
763
764 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
765 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
766
767 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
768
769 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
770 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
771 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
772 0xfc000000);
773 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
774
775 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
776 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
777
778 tu_cs_emit_regs(cs,
779 A6XX_RB_FS_OUTPUT_CNTL0(),
780 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
781
782 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
783 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
784 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
785
786 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
787 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
788 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
789 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
790 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
791 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
792 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
793
794 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
795 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
796
797 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
798 .color_format = format,
799 .color_sint = vk_format_is_sint(vk_format),
800 .color_uint = vk_format_is_uint(vk_format)));
801
802 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
803 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
804 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
805 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
806 }
807
808 static void
809 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
810 {
811 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
812 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
813 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
814 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
815 tu_cs_emit(cs, 1); /* instance count */
816 tu_cs_emit(cs, 2); /* vertex count */
817 }
818
819 /* blit ops - common interface for 2d/shader paths */
820
821 struct blit_ops {
822 void (*coords)(struct tu_cs *cs,
823 const VkOffset2D *dst,
824 const VkOffset2D *src,
825 const VkExtent2D *extent);
826 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
827 void (*src)(
828 struct tu_cmd_buffer *cmd,
829 struct tu_cs *cs,
830 const struct tu_image_view *iview,
831 uint32_t layer,
832 VkFilter filter);
833 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
834 VkFormat vk_format,
835 uint64_t va, uint32_t pitch,
836 uint32_t width, uint32_t height);
837 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
838 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
839 void (*setup)(struct tu_cmd_buffer *cmd,
840 struct tu_cs *cs,
841 VkFormat vk_format,
842 VkImageAspectFlags aspect_mask,
843 enum a6xx_rotation rotation,
844 bool clear,
845 bool ubwc);
846 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
847 };
848
849 static const struct blit_ops r2d_ops = {
850 .coords = r2d_coords,
851 .clear_value = r2d_clear_value,
852 .src = r2d_src,
853 .src_buffer = r2d_src_buffer,
854 .dst = r2d_dst,
855 .dst_buffer = r2d_dst_buffer,
856 .setup = r2d_setup,
857 .run = r2d_run,
858 };
859
860 static const struct blit_ops r3d_ops = {
861 .coords = r3d_coords,
862 .clear_value = r3d_clear_value,
863 .src = r3d_src,
864 .src_buffer = r3d_src_buffer,
865 .dst = r3d_dst,
866 .dst_buffer = r3d_dst_buffer,
867 .setup = r3d_setup,
868 .run = r3d_run,
869 };
870
871 /* passthrough: set coords from 3D offsets/extents */
872 static void
873 coords(const struct blit_ops *ops,
874 struct tu_cs *cs,
875 const VkOffset3D *dst,
876 const VkOffset3D *src,
877 const VkExtent3D *extent)
878 {
879 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
880 }
881
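/* Pick the format a copy is actually performed with: compressed formats are
 * copied raw through a uint format of the same block size, while multi-plane
 * and packed depth/stencil formats are reduced to the per-aspect format.
 */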
882 static VkFormat
883 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
884 {
885 if (vk_format_is_compressed(format)) {
886 switch (vk_format_get_blocksize(format)) {
887 case 1: return VK_FORMAT_R8_UINT;
888 case 2: return VK_FORMAT_R16_UINT;
889 case 4: return VK_FORMAT_R32_UINT;
890 case 8: return VK_FORMAT_R32G32_UINT;
891 case 16:return VK_FORMAT_R32G32B32A32_UINT;
892 default:
893 unreachable("unhandled format size");
894 }
895 }
896
897 switch (format) {
898 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
899 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
900 return VK_FORMAT_R8G8_UNORM;
901 /* fallthrough */
902 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
903 return VK_FORMAT_R8_UNORM;
904 case VK_FORMAT_D24_UNORM_S8_UINT:
905 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
906 return VK_FORMAT_R8_UNORM;
907 /* fallthrough */
908 default:
909 return format;
910 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
911 return VK_FORMAT_R32_UINT;
912 case VK_FORMAT_D32_SFLOAT_S8_UINT:
913 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
914 return VK_FORMAT_S8_UINT;
915 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
916 return VK_FORMAT_D32_SFLOAT;
917 }
918 }
919
920 static void
921 tu_image_view_copy_blit(struct tu_image_view *iview,
922 struct tu_image *image,
923 VkFormat format,
924 const VkImageSubresourceLayers *subres,
925 uint32_t layer,
926 bool stencil_read)
927 {
928 VkImageAspectFlags aspect_mask = subres->aspectMask;
929
930 /* always use the AS_R8G8B8A8 format for these */
931 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
932 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
933 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
934 }
935
936 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
937 .image = tu_image_to_handle(image),
938 .viewType = VK_IMAGE_VIEW_TYPE_2D,
939 .format = format,
940 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
941 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
942 .subresourceRange = {
943 .aspectMask = aspect_mask,
944 .baseMipLevel = subres->mipLevel,
945 .levelCount = 1,
946 .baseArrayLayer = subres->baseArrayLayer + layer,
947 .layerCount = 1,
948 },
949 }, false);
950 }
951
952 static void
953 tu_image_view_copy(struct tu_image_view *iview,
954 struct tu_image *image,
955 VkFormat format,
956 const VkImageSubresourceLayers *subres,
957 uint32_t layer,
958 bool stencil_read)
959 {
960 format = copy_format(format, subres->aspectMask, false);
961 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
962 }
963
964 static void
965 tu_image_view_blit(struct tu_image_view *iview,
966 struct tu_image *image,
967 const VkImageSubresourceLayers *subres,
968 uint32_t layer)
969 {
970 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
971 }
972
973 static void
974 tu6_blit_image(struct tu_cmd_buffer *cmd,
975 struct tu_image *src_image,
976 struct tu_image *dst_image,
977 const VkImageBlit *info,
978 VkFilter filter)
979 {
980 const struct blit_ops *ops = &r2d_ops;
981 struct tu_cs *cs = &cmd->cs;
982 uint32_t layers;
983
984 /* the 2D blitter can't mirror from coordinates alone, so use rotate/flip modes */
985 static const enum a6xx_rotation rotate[2][2] = {
986 {ROTATE_0, ROTATE_HFLIP},
987 {ROTATE_VFLIP, ROTATE_180},
988 };
989
990 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
991 (info->dstOffsets[1].x < info->dstOffsets[0].x);
992 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
993 (info->dstOffsets[1].y < info->dstOffsets[0].y);
994 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
995 (info->dstOffsets[1].z < info->dstOffsets[0].z);
996
997 if (mirror_z) {
998 tu_finishme("blit z mirror\n");
999 return;
1000 }
1001
1002 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1003 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1004 tu_finishme("blit z filter\n");
1005 return;
1006 }
1007
1008 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1009 if (info->dstSubresource.layerCount > 1) {
1010 assert(layers <= 1);
1011 layers = info->dstSubresource.layerCount;
1012 }
1013
1014 /* BC1_RGB_* formats need to have their last component overridden with 1
1015 * when sampling, which is normally handled with the texture descriptor
1016 * swizzle. The 2d path can't handle that, so use the 3d path.
1017 *
1018 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1019 * the 2d path.
1020 */
1021
1022 if (dst_image->samples > 1 ||
1023 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1024 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1025 filter == VK_FILTER_CUBIC_EXT)
1026 ops = &r3d_ops;
1027
1028 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1029 * figure out why (should be able to pass all tests with only shader path)
1030 */
1031
1032 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
1033 rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
1034
1035 if (ops == &r3d_ops) {
1036 r3d_coords_raw(cs, (float[]) {
1037 info->dstOffsets[0].x, info->dstOffsets[0].y,
1038 info->srcOffsets[0].x, info->srcOffsets[0].y,
1039 info->dstOffsets[1].x, info->dstOffsets[1].y,
1040 info->srcOffsets[1].x, info->srcOffsets[1].y
1041 });
1042 } else {
1043 tu_cs_emit_regs(cs,
1044 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1045 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1046 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1047 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1048 tu_cs_emit_regs(cs,
1049 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1050 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1051 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1052 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1053 }
1054
1055 struct tu_image_view dst, src;
1056 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1057 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1058
1059 for (uint32_t i = 0; i < layers; i++) {
1060 ops->dst(cs, &dst, i);
1061 ops->src(cmd, cs, &src, i, filter);
1062 ops->run(cmd, cs);
1063 }
1064 }
1065
1066 void
1067 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1068 VkImage srcImage,
1069 VkImageLayout srcImageLayout,
1070 VkImage dstImage,
1071 VkImageLayout dstImageLayout,
1072 uint32_t regionCount,
1073 const VkImageBlit *pRegions,
1074 VkFilter filter)
1075
1076 {
1077 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1078 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1079 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1080
1081 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1082 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1083
1084 for (uint32_t i = 0; i < regionCount; ++i)
1085 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1086 }
1087
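/* Convert texel offsets/extents to block units for compressed formats, e.g.
 * a BC1 (4x4 blocks) region at offset (8,4) with extent 16x15 becomes offset
 * (2,1) and extent 4x4 after DIV_ROUND_UP.
 */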
1088 static void
1089 copy_compressed(VkFormat format,
1090 VkOffset3D *offset,
1091 VkExtent3D *extent,
1092 uint32_t *width,
1093 uint32_t *height)
1094 {
1095 if (!vk_format_is_compressed(format))
1096 return;
1097
1098 uint32_t block_width = vk_format_get_blockwidth(format);
1099 uint32_t block_height = vk_format_get_blockheight(format);
1100
1101 offset->x /= block_width;
1102 offset->y /= block_height;
1103
1104 if (extent) {
1105 extent->width = DIV_ROUND_UP(extent->width, block_width);
1106 extent->height = DIV_ROUND_UP(extent->height, block_height);
1107 }
1108 if (width)
1109 *width = DIV_ROUND_UP(*width, block_width);
1110 if (height)
1111 *height = DIV_ROUND_UP(*height, block_height);
1112 }
1113
1114 static void
1115 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1116 struct tu_buffer *src_buffer,
1117 struct tu_image *dst_image,
1118 const VkBufferImageCopy *info)
1119 {
1120 struct tu_cs *cs = &cmd->cs;
1121 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1122 VkFormat src_format =
1123 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1124 const struct blit_ops *ops = &r2d_ops;
1125
1126 /* special case for buffer to stencil */
1127 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1128 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1129 ops = &r3d_ops;
1130 }
1131
1132 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1133 * which matters for UBWC. buffer_to_image/etc can fail because of this
1134 */
1135
1136 VkOffset3D offset = info->imageOffset;
1137 VkExtent3D extent = info->imageExtent;
1138 uint32_t src_width = info->bufferRowLength ?: extent.width;
1139 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1140
1141 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1142
1143 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1144 uint32_t layer_size = src_height * pitch;
1145
1146 ops->setup(cmd, cs,
1147 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1148 info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
1149
1150 struct tu_image_view dst;
1151 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1152
1153 for (uint32_t i = 0; i < layers; i++) {
1154 ops->dst(cs, &dst, i);
1155
1156 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
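/* Source addresses and pitches that aren't 64-byte aligned are handled by
 * blitting one row at a time from a rounded-down base with a texel offset
 * in x.
 */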
1157 if ((src_va & 63) || (pitch & 63)) {
1158 for (uint32_t y = 0; y < extent.height; y++) {
1159 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1160 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1161 x + extent.width, 1);
1162 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1163 &(VkExtent2D) {extent.width, 1});
1164 ops->run(cmd, cs);
1165 src_va += pitch;
1166 }
1167 } else {
1168 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1169 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1170 ops->run(cmd, cs);
1171 }
1172 }
1173 }
1174
1175 void
1176 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1177 VkBuffer srcBuffer,
1178 VkImage dstImage,
1179 VkImageLayout dstImageLayout,
1180 uint32_t regionCount,
1181 const VkBufferImageCopy *pRegions)
1182 {
1183 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1184 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1185 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1186
1187 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1188 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1189
1190 for (unsigned i = 0; i < regionCount; ++i)
1191 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1192 }
1193
1194 static void
1195 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1196 struct tu_image *src_image,
1197 struct tu_buffer *dst_buffer,
1198 const VkBufferImageCopy *info)
1199 {
1200 struct tu_cs *cs = &cmd->cs;
1201 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1202 VkFormat dst_format =
1203 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1204 bool stencil_read = false;
1205
1206 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1207 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1208 stencil_read = true;
1209 }
1210
1211 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1212 VkOffset3D offset = info->imageOffset;
1213 VkExtent3D extent = info->imageExtent;
1214 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1215 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1216
1217 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1218
1219 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1220 uint32_t layer_size = pitch * dst_height;
1221
1222 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1223
1224 struct tu_image_view src;
1225 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1226
1227 for (uint32_t i = 0; i < layers; i++) {
1228 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1229
1230 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1231 if ((dst_va & 63) || (pitch & 63)) {
1232 for (uint32_t y = 0; y < extent.height; y++) {
1233 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1234 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1235 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1236 &(VkExtent2D) {extent.width, 1});
1237 ops->run(cmd, cs);
1238 dst_va += pitch;
1239 }
1240 } else {
1241 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1242 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1243 ops->run(cmd, cs);
1244 }
1245 }
1246 }
1247
1248 void
1249 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1250 VkImage srcImage,
1251 VkImageLayout srcImageLayout,
1252 VkBuffer dstBuffer,
1253 uint32_t regionCount,
1254 const VkBufferImageCopy *pRegions)
1255 {
1256 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1257 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1258 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1259
1260 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1261 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1262
1263 for (unsigned i = 0; i < regionCount; ++i)
1264 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1265 }
1266
1267 /* Tiled formats don't support swapping, which means that we can't support
1268 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1269 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1270 * Currently we fake support for tiled swapped formats and use the unswapped
1271 * format instead, but this means that reinterpreting copies to and from
1272 * swapped formats can't be performed correctly unless we can swizzle the
1273 * components by reinterpreting the other image as the "correct" swapped
1274 * format, i.e. only when the other image is linear.
1275 */
1276
1277 static bool
1278 is_swapped_format(VkFormat format)
1279 {
1280 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1281 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1282 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1283 }
1284
1285 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1286 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1287 * versa). This should mirror the logic in fdl6_layout.
1288 */
1289 static bool
1290 image_is_r8g8(struct tu_image *image)
1291 {
1292 return image->layout[0].cpp == 2 &&
1293 vk_format_get_nr_components(image->vk_format) == 2;
1294 }
1295
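/* Copy between two images, reinterpreting formats when they differ: reusing
 * one image's format is only safe when the other image is linear or
 * non-UBWC, and otherwise the copy goes through a linear staging image with
 * a flush in between the two blits.
 */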
1296 static void
1297 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1298 struct tu_image *src_image,
1299 struct tu_image *dst_image,
1300 const VkImageCopy *info)
1301 {
1302 const struct blit_ops *ops = &r2d_ops;
1303 struct tu_cs *cs = &cmd->cs;
1304
1305 if (dst_image->samples > 1)
1306 ops = &r3d_ops;
1307
1308 VkFormat format = VK_FORMAT_UNDEFINED;
1309 VkOffset3D src_offset = info->srcOffset;
1310 VkOffset3D dst_offset = info->dstOffset;
1311 VkExtent3D extent = info->extent;
1312
1313 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1314 * Images":
1315 *
1316 * When copying between compressed and uncompressed formats the extent
1317 * members represent the texel dimensions of the source image and not
1318 * the destination. When copying from a compressed image to an
1319 * uncompressed image the image texel dimensions written to the
1320 * uncompressed image will be source extent divided by the compressed
1321 * texel block dimensions. When copying from an uncompressed image to a
1322 * compressed image the image texel dimensions written to the compressed
1323 * image will be the source extent multiplied by the compressed texel
1324 * block dimensions.
1325 *
1326 * This means we only have to adjust the extent if the source image is
1327 * compressed.
1328 */
1329 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1330 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1331
1332 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1333 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1334
1335 bool use_staging_blit = false;
1336
1337 if (src_format == dst_format) {
1338 /* Images that share a format can always be copied directly because it's
1339 * the same as a blit.
1340 */
1341 format = src_format;
1342 } else if (!src_image->layout[0].tile_mode) {
1343 /* If an image is linear, we can always safely reinterpret it with the
1344 * other image's format and then do a regular blit.
1345 */
1346 format = dst_format;
1347 } else if (!dst_image->layout[0].tile_mode) {
1348 format = src_format;
1349 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1350 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1351 * due to the different tile layout.
1352 */
1353 use_staging_blit = true;
1354 } else if (is_swapped_format(src_format) ||
1355 is_swapped_format(dst_format)) {
1356 /* If either format has a non-identity swap, then we can't copy
1357 * to/from it.
1358 */
1359 use_staging_blit = true;
1360 } else if (!src_image->layout[0].ubwc) {
1361 format = dst_format;
1362 } else if (!dst_image->layout[0].ubwc) {
1363 format = src_format;
1364 } else {
1365 /* Both formats use UBWC and so neither can be reinterpreted.
1366 * TODO: We could do an in-place decompression of the dst instead.
1367 */
1368 use_staging_blit = true;
1369 }
1370
1371 struct tu_image_view dst, src;
1372
1373 if (use_staging_blit) {
1374 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1375 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1376
1377 struct tu_image staging_image = {
1378 .vk_format = src_format,
1379 .type = src_image->type,
1380 .tiling = VK_IMAGE_TILING_LINEAR,
1381 .extent = extent,
1382 .level_count = 1,
1383 .layer_count = info->srcSubresource.layerCount,
1384 .samples = src_image->samples,
1385 .bo_offset = 0,
1386 };
1387
1388 VkImageSubresourceLayers staging_subresource = {
1389 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1390 .mipLevel = 0,
1391 .baseArrayLayer = 0,
1392 .layerCount = info->srcSubresource.layerCount,
1393 };
1394
1395 VkOffset3D staging_offset = { 0 };
1396
1397 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1398 staging_image.layout[0].ubwc = false;
1399
1400 fdl6_layout(&staging_image.layout[0],
1401 vk_format_to_pipe_format(staging_image.vk_format),
1402 staging_image.samples,
1403 staging_image.extent.width,
1404 staging_image.extent.height,
1405 staging_image.extent.depth,
1406 staging_image.level_count,
1407 staging_image.layer_count,
1408 staging_image.type == VK_IMAGE_TYPE_3D,
1409 NULL);
1410
1411 VkResult result = tu_get_scratch_bo(cmd->device,
1412 staging_image.layout[0].size,
1413 &staging_image.bo);
1414 if (result != VK_SUCCESS) {
1415 cmd->record_result = result;
1416 return;
1417 }
1418
1419 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1420 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1421
1422 struct tu_image_view staging;
1423 tu_image_view_copy(&staging, &staging_image, src_format,
1424 &staging_subresource, 0, false);
1425
1426 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1427 coords(ops, cs, &staging_offset, &src_offset, &extent);
1428
1429 for (uint32_t i = 0; i < info->extent.depth; i++) {
1430 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1431 ops->dst(cs, &staging, i);
1432 ops->run(cmd, cs);
1433 }
1434
1435 /* If the application did this copy itself it would need a pipeline barrier
1436 * here, but since we're doing it internally we have to flush ourselves.
1437 */
1438 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1439 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1440
1441 tu_image_view_copy(&staging, &staging_image, dst_format,
1442 &staging_subresource, 0, false);
1443
1444 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1445 ROTATE_0, false, dst_image->layout[0].ubwc);
1446 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1447
1448 for (uint32_t i = 0; i < info->extent.depth; i++) {
1449 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1450 ops->dst(cs, &dst, i);
1451 ops->run(cmd, cs);
1452 }
1453 } else {
1454 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1455 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1456
1457 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1458 ROTATE_0, false, dst_image->layout[0].ubwc);
1459 coords(ops, cs, &dst_offset, &src_offset, &extent);
1460
1461 for (uint32_t i = 0; i < info->extent.depth; i++) {
1462 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1463 ops->dst(cs, &dst, i);
1464 ops->run(cmd, cs);
1465 }
1466 }
1467 }
1468
1469 void
1470 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1471 VkImage srcImage,
1472 VkImageLayout srcImageLayout,
1473 VkImage destImage,
1474 VkImageLayout destImageLayout,
1475 uint32_t regionCount,
1476 const VkImageCopy *pRegions)
1477 {
1478 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1479 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1480 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1481
1482 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1483 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1484
1485 for (uint32_t i = 0; i < regionCount; ++i)
1486 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1487 }
1488
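/* Copy raw bytes (used by CmdCopyBuffer and CmdUpdateBuffer) as a series of
 * one-row 2D blits of up to 0x4000 blocks each, handling 64-byte-unaligned
 * addresses via an x offset into a rounded-down base.
 */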
1489 static void
1490 copy_buffer(struct tu_cmd_buffer *cmd,
1491 uint64_t dst_va,
1492 uint64_t src_va,
1493 uint64_t size,
1494 uint32_t block_size)
1495 {
1496 const struct blit_ops *ops = &r2d_ops;
1497 struct tu_cs *cs = &cmd->cs;
1498 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1499 uint64_t blocks = size / block_size;
1500
1501 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1502
1503 while (blocks) {
1504 uint32_t src_x = (src_va & 63) / block_size;
1505 uint32_t dst_x = (dst_va & 63) / block_size;
1506 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1507
1508 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1509 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1510 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1511 ops->run(cmd, cs);
1512
1513 src_va += width * block_size;
1514 dst_va += width * block_size;
1515 blocks -= width;
1516 }
1517 }
1518
1519 void
1520 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1521 VkBuffer srcBuffer,
1522 VkBuffer dstBuffer,
1523 uint32_t regionCount,
1524 const VkBufferCopy *pRegions)
1525 {
1526 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1527 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1528 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1529
1530 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1531 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1532
1533 for (unsigned i = 0; i < regionCount; ++i) {
1534 copy_buffer(cmd,
1535 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1536 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1537 pRegions[i].size, 1);
1538 }
1539 }
1540
1541 void
1542 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1543 VkBuffer dstBuffer,
1544 VkDeviceSize dstOffset,
1545 VkDeviceSize dataSize,
1546 const void *pData)
1547 {
1548 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1549 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1550
1551 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1552
1553 struct tu_cs_memory tmp;
1554 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1555 if (result != VK_SUCCESS) {
1556 cmd->record_result = result;
1557 return;
1558 }
1559
1560 memcpy(tmp.map, pData, dataSize);
1561 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1562 }
1563
1564 void
1565 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1566 VkBuffer dstBuffer,
1567 VkDeviceSize dstOffset,
1568 VkDeviceSize fillSize,
1569 uint32_t data)
1570 {
1571 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1572 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1573 const struct blit_ops *ops = &r2d_ops;
1574 struct tu_cs *cs = &cmd->cs;
1575
1576 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1577
1578 if (fillSize == VK_WHOLE_SIZE)
1579 fillSize = buffer->size - dstOffset;
1580
1581 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1582 uint32_t blocks = fillSize / 4;
1583
1584 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
1585 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1586
1587 while (blocks) {
1588 uint32_t dst_x = (dst_va & 63) / 4;
1589 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1590
1591 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1592 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1593 ops->run(cmd, cs);
1594
1595 dst_va += width * 4;
1596 blocks -= width;
1597 }
1598 }
1599
1600 void
1601 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1602 VkImage srcImage,
1603 VkImageLayout srcImageLayout,
1604 VkImage dstImage,
1605 VkImageLayout dstImageLayout,
1606 uint32_t regionCount,
1607 const VkImageResolve *pRegions)
1608 {
1609 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1610 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1611 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1612 const struct blit_ops *ops = &r2d_ops;
1613 struct tu_cs *cs = &cmd->cs;
1614
1615 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1616 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1617
1618 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1619 ROTATE_0, false, dst_image->layout[0].ubwc);
1620
1621 for (uint32_t i = 0; i < regionCount; ++i) {
1622 const VkImageResolve *info = &pRegions[i];
1623 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1624
1625 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1626 /* TODO: aspect masks possible? */
1627
1628 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1629
1630 struct tu_image_view dst, src;
1631 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1632 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1633
1634 for (uint32_t i = 0; i < layers; i++) {
1635 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1636 ops->dst(cs, &dst, i);
1637 ops->run(cmd, cs);
1638 }
1639 }
1640 }
1641
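/* Resolve src into dst over the given rect and layer count with the 2D
 * engine; used by the sysmem render-pass resolve path.
 */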
1642 void
1643 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1644 struct tu_cs *cs,
1645 struct tu_image_view *src,
1646 struct tu_image_view *dst,
1647 uint32_t layers,
1648 const VkRect2D *rect)
1649 {
1650 const struct blit_ops *ops = &r2d_ops;
1651
1652 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1653 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1654
1655 assert(src->image->vk_format == dst->image->vk_format);
1656
1657 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1658 ROTATE_0, false, dst->ubwc_enabled);
1659 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1660
1661 for (uint32_t i = 0; i < layers; i++) {
1662 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1663 ops->dst(cs, dst, i);
1664 ops->run(cmd, cs);
1665 }
1666 }
1667
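/* Clear every requested level/layer of an image (3D path for multisampled
 * images, 2D otherwise). D32S8 and E5B9G9R9 are cleared through their copy
 * formats, with the E5B9G9R9 clear color still packed from the float values.
 */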
1668 static void
1669 clear_image(struct tu_cmd_buffer *cmd,
1670 struct tu_image *image,
1671 const VkClearValue *clear_value,
1672 const VkImageSubresourceRange *range,
1673 VkImageAspectFlags aspect_mask)
1674 {
1675 uint32_t level_count = tu_get_levelCount(image, range);
1676 uint32_t layer_count = tu_get_layerCount(image, range);
1677 struct tu_cs *cs = &cmd->cs;
1678 VkFormat format = image->vk_format;
1679 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1680 format = copy_format(format, aspect_mask, false);
1681
1682 if (image->type == VK_IMAGE_TYPE_3D) {
1683 assert(layer_count == 1);
1684 assert(range->baseArrayLayer == 0);
1685 }
1686
1687 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1688
1689 ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
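/* for E5B9G9R9 the destination uses the raw copy format, but the clear color
 * still has to be packed as rgb9e5, so pass the original format to clear_value()
 */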
1690 if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1691 ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
1692 else
1693 ops->clear_value(cs, format, clear_value);
1694
1695 for (unsigned j = 0; j < level_count; j++) {
1696 if (image->type == VK_IMAGE_TYPE_3D)
1697 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1698
1699 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1700 u_minify(image->extent.width, range->baseMipLevel + j),
1701 u_minify(image->extent.height, range->baseMipLevel + j)
1702 });
1703
1704 struct tu_image_view dst;
1705 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1706 .aspectMask = aspect_mask,
1707 .mipLevel = range->baseMipLevel + j,
1708 .baseArrayLayer = range->baseArrayLayer,
1709 .layerCount = 1,
1710 }, 0, false);
1711
1712 for (uint32_t i = 0; i < layer_count; i++) {
1713 ops->dst(cs, &dst, i);
1714 ops->run(cmd, cs);
1715 }
1716 }
1717 }
1718
1719 void
1720 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1721 VkImage image_h,
1722 VkImageLayout imageLayout,
1723 const VkClearColorValue *pColor,
1724 uint32_t rangeCount,
1725 const VkImageSubresourceRange *pRanges)
1726 {
1727 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1728 TU_FROM_HANDLE(tu_image, image, image_h);
1729
1730 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1731
1732 for (unsigned i = 0; i < rangeCount; i++)
1733 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
1734 }
1735
1736 void
1737 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1738 VkImage image_h,
1739 VkImageLayout imageLayout,
1740 const VkClearDepthStencilValue *pDepthStencil,
1741 uint32_t rangeCount,
1742 const VkImageSubresourceRange *pRanges)
1743 {
1744 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1745 TU_FROM_HANDLE(tu_image, image, image_h);
1746
1747 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1748
1749 for (unsigned i = 0; i < rangeCount; i++) {
1750 const VkImageSubresourceRange *range = &pRanges[i];
1751
1752 if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1753 /* can't clear both depth and stencil at once, split up the aspect mask */
1754 uint32_t b;
1755 for_each_bit(b, range->aspectMask)
1756 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
1757 continue;
1758 }
1759
1760 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
1761 }
1762 }
1763
1764 static void
1765 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1766 uint32_t attachment_count,
1767 const VkClearAttachment *attachments,
1768 uint32_t rect_count,
1769 const VkClearRect *rects)
1770 {
1771 /* the shader path here is special: it avoids changing MRT/etc state */
1772 const struct tu_render_pass *pass = cmd->state.pass;
1773 const struct tu_subpass *subpass = cmd->state.subpass;
1774 const uint32_t mrt_count = subpass->color_count;
1775 struct tu_cs *cs = &cmd->draw_cs;
1776 uint32_t clear_value[MAX_RTS][4];
1777 float z_clear_val = 0.0f;
1778 uint8_t s_clear_val = 0;
1779 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1780 bool z_clear = false;
1781 bool s_clear = false;
1782 bool layered_clear = false;
1783 uint32_t max_samples = 1;
1784
1785 for (uint32_t i = 0; i < attachment_count; i++) {
1786 uint32_t a;
1787 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1788 uint32_t c = attachments[i].colorAttachment;
1789 a = subpass->color_attachments[c].attachment;
1790 if (a == VK_ATTACHMENT_UNUSED)
1791 continue;
1792
1793 clear_rts |= 1 << c;
1794 clear_components |= 0xf << (c * 4);
1795 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1796 } else {
1797 a = subpass->depth_stencil_attachment.attachment;
1798 if (a == VK_ATTACHMENT_UNUSED)
1799 continue;
1800
1801 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1802 z_clear = true;
1803 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1804 }
1805
1806 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1807 s_clear = true;
1808 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1809 }
1810 }
1811
1812 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1813 }
1814
1815 /* disable all draw states so they don't interfere
1816 * TODO: use and re-use draw states
1817 * we have to disable draw states individually to preserve
1818 * input attachment states, because a secondary command buffer
1819 * won't be able to restore them
1820 */
1821 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1822 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1823 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1824 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1825 continue;
1826 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1827 CP_SET_DRAW_STATE__0_DISABLE);
1828 tu_cs_emit_qw(cs, 0);
1829 }
1830 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1831
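/* regid 0xfc appears to be the "unused" sentinel: the clear shader writes no
 * depth or sample mask output
 */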
1832 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1833 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1834 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1835 0xfc000000);
1836 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1837
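/* map each cleared MRT to the next consecutive vec4 of FS output registers;
 * RTs that aren't being cleared get no output register
 */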
1838 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1839 for (uint32_t i = 0; i < mrt_count; i++) {
1840 if (clear_rts & (1 << i))
1841 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1842 else
1843 tu_cs_emit(cs, 0);
1844 }
1845
1846 for (uint32_t i = 0; i < rect_count; i++) {
1847 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1848 layered_clear = true;
1849 }
1850
1851 r3d_common(cmd, cs, false, num_rts, layered_clear);
1852
1853 tu_cs_emit_regs(cs,
1854 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1855 tu_cs_emit_regs(cs,
1856 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1857
1858 tu_cs_emit_regs(cs,
1859 A6XX_RB_FS_OUTPUT_CNTL0(),
1860 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1861
1862 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1863 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1864 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1865 for (uint32_t i = 0; i < mrt_count; i++) {
1866 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1867 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1868 }
1869
1870 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1871 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1872 .z_enable = z_clear,
1873 .z_write_enable = z_clear,
1874 .zfunc = FUNC_ALWAYS));
1875 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1876 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1877 .stencil_enable = s_clear,
1878 .func = FUNC_ALWAYS,
1879 .zpass = STENCIL_REPLACE));
1880 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1881 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1882 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1883
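/* upload the packed clear colors as FS constants, one vec4 per cleared RT, in
 * the same order the output registers were assigned above
 */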
1884 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1885 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1886 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1887 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1888 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1889 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1890 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1891 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1892 for_each_bit(b, clear_rts)
1893 tu_cs_emit_array(cs, clear_value[b], 4);
1894
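/* raw coords are two (x, y, z, w) corners: z carries the depth clear value and
 * the first corner's w appears to carry the destination layer as raw bits (uif)
 * for layered clears
 */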
1895 for (uint32_t i = 0; i < rect_count; i++) {
1896 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1897 r3d_coords_raw(cs, (float[]) {
1898 rects[i].rect.offset.x, rects[i].rect.offset.y,
1899 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1900 rects[i].rect.offset.x + rects[i].rect.extent.width,
1901 rects[i].rect.offset.y + rects[i].rect.extent.height,
1902 z_clear_val, 1.0f,
1903 });
1904 r3d_run(cmd, cs);
1905 }
1906 }
1907 }
1908
1909 static void
1910 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1911 {
1912 enum pipe_format pformat = vk_format_to_pipe_format(format);
1913
1914 switch (format) {
1915 case VK_FORMAT_X8_D24_UNORM_PACK32:
1916 case VK_FORMAT_D24_UNORM_S8_UINT:
1917 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1918 val->depthStencil.stencil << 24;
1919 return;
1920 case VK_FORMAT_D16_UNORM:
1921 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1922 return;
1923 case VK_FORMAT_D32_SFLOAT:
1924 clear_value[0] = fui(val->depthStencil.depth);
1925 return;
1926 case VK_FORMAT_S8_UINT:
1927 clear_value[0] = val->depthStencil.stencil;
1928 return;
1929 /* these formats use a different base format when tiled; the same pipe format
1930 * can be used for both of them because GMEM is always in WZYX order
1931 */
1932 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1933 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1934 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
1935 default:
1936 break;
1937 }
1938
1939 VkClearColorValue color;
1940
1941 /*
1942 * GMEM is tiled and wants the components in WZYX order,
1943 * so apply the swizzle to the color before packing, to counteract
1944 * the deswizzling applied by the packing functions
1945 */
1946 pipe_swizzle_4f(color.float32, val->color.float32,
1947 util_format_description(pformat)->swizzle);
1948
1949 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
1950 }
1951
1952 static void
1953 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1954 struct tu_cs *cs,
1955 VkFormat format,
1956 uint8_t clear_mask,
1957 uint32_t gmem_offset,
1958 const VkClearValue *value)
1959 {
1960 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
1961 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
1962
1963 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
1964
1965 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1966 tu_cs_emit(cs, gmem_offset);
1967
1968 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1969 tu_cs_emit(cs, 0);
1970
1971 uint32_t clear_vals[4] = {};
1972 pack_gmem_clear_value(value, format, clear_vals);
1973
1974 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
1975 tu_cs_emit_array(cs, clear_vals, 4);
1976
1977 tu6_emit_event_write(cmd, cs, BLIT);
1978 }
1979
1980 static void
1981 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1982 struct tu_cs *cs,
1983 uint32_t attachment,
1984 VkImageAspectFlags mask,
1985 const VkClearValue *value)
1986 {
1987 const struct tu_render_pass_attachment *att =
1988 &cmd->state.pass->attachments[attachment];
1989
1990 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1991 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
1992 clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
1993 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
1994 clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
1995 return;
1996 }
1997
1998 clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
1999 }
2000
2001 static void
2002 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2003 uint32_t attachment_count,
2004 const VkClearAttachment *attachments,
2005 uint32_t rect_count,
2006 const VkClearRect *rects)
2007 {
2008 const struct tu_subpass *subpass = cmd->state.subpass;
2009 struct tu_cs *cs = &cmd->draw_cs;
2010
2011 /* TODO: swap the loops for smaller cmdstream */
2012 for (unsigned i = 0; i < rect_count; i++) {
2013 unsigned x1 = rects[i].rect.offset.x;
2014 unsigned y1 = rects[i].rect.offset.y;
2015 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2016 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2017
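/* RB_BLIT_SCISSOR coordinates are inclusive, hence the -1 on x2/y2 above */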
2018 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2019 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2020 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2021
2022 for (unsigned j = 0; j < attachment_count; j++) {
2023 uint32_t a;
2024 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2025 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2026 else
2027 a = subpass->depth_stencil_attachment.attachment;
2028
2029 if (a == VK_ATTACHMENT_UNUSED)
2030 continue;
2031
2032 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2033 &attachments[j].clearValue);
2034 }
2035 }
2036 }
2037
2038 void
2039 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2040 uint32_t attachmentCount,
2041 const VkClearAttachment *pAttachments,
2042 uint32_t rectCount,
2043 const VkClearRect *pRects)
2044 {
2045 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2046 struct tu_cs *cs = &cmd->draw_cs;
2047
2048 /* the sysmem path behaves like a draw; note we don't have a way of using
2049 * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec
2050 */
2051 tu_emit_cache_flush_renderpass(cmd, cs);
2052
2053 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2054 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2055 tu_cond_exec_end(cs);
2056
2057 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2058 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2059 tu_cond_exec_end(cs);
2060 }
2061
2062 static void
2063 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2064 struct tu_cs *cs,
2065 VkFormat format,
2066 VkImageAspectFlags clear_mask,
2067 const VkRenderPassBeginInfo *info,
2068 uint32_t a,
2069 bool separate_stencil)
2070 {
2071 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2072 const struct tu_image_view *iview = fb->attachments[a].attachment;
2073 const struct blit_ops *ops = &r2d_ops;
2074 if (cmd->state.pass->attachments[a].samples > 1)
2075 ops = &r3d_ops;
2076
2077 ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
2078 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2079 ops->clear_value(cs, format, &info->pClearValues[a]);
2080
2081 for (uint32_t i = 0; i < fb->layers; i++) {
2082 if (separate_stencil) {
2083 if (ops == &r3d_ops)
2084 r3d_dst_stencil(cs, iview, i);
2085 else
2086 r2d_dst_stencil(cs, iview, i);
2087 } else {
2088 ops->dst(cs, iview, i);
2089 }
2090 ops->run(cmd, cs);
2091 }
2092 }
2093
2094 void
2095 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2096 struct tu_cs *cs,
2097 uint32_t a,
2098 const VkRenderPassBeginInfo *info)
2099 {
2100 const struct tu_render_pass_attachment *attachment =
2101 &cmd->state.pass->attachments[a];
2102
2103 if (!attachment->clear_mask)
2104 return;
2105
2106 /* Wait for any flushes at the beginning of the renderpass to complete */
2107 tu_cs_emit_wfi(cs);
2108
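/* D32_S8 is stored as separate depth and stencil planes, so clear each plane
 * with its own blit (separate_stencil selects the stencil plane); the planes
 * are treated as color for the purpose of the blit
 */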
2109 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2110 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2111 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2112 info, a, false);
2113 }
2114 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2115 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2116 info, a, true);
2117 }
2118 } else {
2119 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2120 info, a, false);
2121 }
2122
2123 /* The spec doesn't explicitly say, but presumably the initial renderpass
2124 * clear is considered part of the renderpass, and therefore barriers
2125 * aren't required inside the subpass/renderpass. Therefore we need to
2126 * flush CCU color into CCU depth here, just like with
2127 * vkCmdClearAttachments(). Note that because this only happens at the
2128 * beginning of a renderpass, and renderpass writes are considered
2129 * "incoherent", we shouldn't have to worry about syncing depth into color
2130 * beforehand as depth should already be flushed.
2131 */
2132 if (vk_format_is_depth_or_stencil(attachment->format)) {
2133 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2134 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2135 } else {
2136 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2137 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2138 }
2139 }
2140
2141 void
2142 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2143 struct tu_cs *cs,
2144 uint32_t a,
2145 const VkRenderPassBeginInfo *info)
2146 {
2147 const struct tu_render_pass_attachment *attachment =
2148 &cmd->state.pass->attachments[a];
2149
2150 if (!attachment->clear_mask)
2151 return;
2152
2153 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2154
2155 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2156 &info->pClearValues[a]);
2157 }
2158
2159 static void
2160 tu_emit_blit(struct tu_cmd_buffer *cmd,
2161 struct tu_cs *cs,
2162 const struct tu_image_view *iview,
2163 const struct tu_render_pass_attachment *attachment,
2164 bool resolve,
2165 bool separate_stencil)
2166 {
2167 tu_cs_emit_regs(cs,
2168 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2169
2170 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2171 .unk0 = !resolve,
2172 .gmem = !resolve,
2173 /* "integer" bit disables msaa resolve averaging */
2174 .integer = vk_format_is_int(attachment->format)));
2175
2176 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2177 if (separate_stencil) {
2178 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2179 tu_cs_emit_qw(cs, iview->stencil_base_addr);
2180 tu_cs_emit(cs, iview->stencil_PITCH);
2181
2182 tu_cs_emit_regs(cs,
2183 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
2184 } else {
2185 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2186 tu_cs_image_ref_2d(cs, iview, 0, false);
2187
2188 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2189 tu_cs_image_flag_ref(cs, iview, 0);
2190
2191 tu_cs_emit_regs(cs,
2192 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2193 }
2194
2195 tu6_emit_event_write(cmd, cs, BLIT);
2196 }
2197
2198 static bool
2199 blit_can_resolve(VkFormat format)
2200 {
2201 const struct util_format_description *desc = vk_format_description(format);
2202
2203 /* blit event can only do resolve for simple cases:
2204 * averaging samples as unsigned integers or choosing only one sample
2205 */
2206 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2207 return false;
2208
2209 /* can't do formats with larger channel sizes
2210 * note: this includes all float formats
2211 * note2: single channel integer formats seem OK
2212 */
2213 if (desc->channel[0].size > 10)
2214 return false;
2215
2216 switch (format) {
2217 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2218 * likely related to these formats having different layout from other cpp=2 formats
2219 */
2220 case VK_FORMAT_R8G8_UNORM:
2221 case VK_FORMAT_R8G8_UINT:
2222 case VK_FORMAT_R8G8_SINT:
2223 /* TODO: this one should be able to work? */
2224 case VK_FORMAT_D24_UNORM_S8_UINT:
2225 return false;
2226 default:
2227 break;
2228 }
2229
2230 return true;
2231 }
2232
2233 void
2234 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2235 struct tu_cs *cs,
2236 uint32_t a,
2237 bool force_load)
2238 {
2239 const struct tu_image_view *iview =
2240 cmd->state.framebuffer->attachments[a].attachment;
2241 const struct tu_render_pass_attachment *attachment =
2242 &cmd->state.pass->attachments[a];
2243
2244 if (attachment->load || force_load)
2245 tu_emit_blit(cmd, cs, iview, attachment, false, false);
2246
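/* D32_S8 keeps its stencil in a separate GMEM allocation, so a forced load has
 * to load that plane as well
 */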
2247 if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2248 tu_emit_blit(cmd, cs, iview, attachment, false, true);
2249 }
2250
2251 static void
2252 store_cp_blit(struct tu_cmd_buffer *cmd,
2253 struct tu_cs *cs,
2254 struct tu_image_view *iview,
2255 uint32_t samples,
2256 bool separate_stencil,
2257 VkFormat format,
2258 uint32_t gmem_offset,
2259 uint32_t cpp)
2260 {
2261 r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
2262 iview->ubwc_enabled, true);
2263 if (separate_stencil)
2264 r2d_dst_stencil(cs, iview, 0);
2265 else
2266 r2d_dst(cs, iview, 0);
2267
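/* the source is GMEM itself: the 2D engine reads it as a tiled (TILE6_2)
 * surface at the attachment's offset from the GMEM base address
 */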
2268 tu_cs_emit_regs(cs,
2269 A6XX_SP_PS_2D_SRC_INFO(
2270 .color_format = tu6_format_texture(format, TILE6_2).fmt,
2271 .tile_mode = TILE6_2,
2272 .srgb = vk_format_is_srgb(format),
2273 .samples = tu_msaa_samples(samples),
2274 .samples_average = !vk_format_is_int(format),
2275 .unk20 = 1,
2276 .unk22 = 1),
2277 /* note: src size does not matter when not scaling */
2278 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2279 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
2280 A6XX_SP_PS_2D_SRC_HI(),
2281 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
2282
2283 /* sync GMEM writes with CACHE, presumably so the CP_BLIT source reads see the rendered GMEM contents. */
2284 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2285
2286 /* Wait for CACHE_INVALIDATE to land */
2287 tu_cs_emit_wfi(cs);
2288
2289 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2290 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2291
2292 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2293 * sysmem, and we generally assume that GMEM renderpasses leave their
2294 * results in sysmem, so we need to flush manually here.
2295 */
2296 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2297 }
2298
2299 void
2300 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2301 struct tu_cs *cs,
2302 uint32_t a,
2303 uint32_t gmem_a)
2304 {
2305 const VkRect2D *render_area = &cmd->state.render_area;
2306 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2307 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2308 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2309
2310 if (!dst->store && !dst->store_stencil)
2311 return;
2312
2313 uint32_t x1 = render_area->offset.x;
2314 uint32_t y1 = render_area->offset.y;
2315 uint32_t x2 = x1 + render_area->extent.width;
2316 uint32_t y2 = y1 + render_area->extent.height;
2317 /* x2/y2 can be unaligned if equal to the size of the image,
2318 * since it will write into padding space
2319 * the one exception is linear levels which don't have the
2320 * required y padding in the layout (except for the last level)
2321 */
2322 bool need_y2_align =
2323 y2 != iview->extent.height || iview->need_y2_align;
2324
2325 bool unaligned =
2326 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2327 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2328
2329 /* use fast path when render area is aligned, except for unsupported resolve cases */
2330 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2331 if (dst->store)
2332 tu_emit_blit(cmd, cs, iview, src, true, false);
2333 if (dst->store_stencil)
2334 tu_emit_blit(cmd, cs, iview, src, true, true);
2335 return;
2336 }
2337
2338 if (dst->samples > 1) {
2339 /* I guess we need to use shader path in this case?
2340 * need a testcase which fails because of this
2341 */
2342 tu_finishme("unaligned store of msaa attachment\n");
2343 return;
2344 }
2345
2346 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2347
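/* for D32_S8, the depth plane is stored as D32_SFLOAT here and the stencil
 * plane is stored separately as S8_UINT below
 */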
2348 VkFormat format = src->format;
2349 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2350 format = VK_FORMAT_D32_SFLOAT;
2351
2352 if (dst->store) {
2353 store_cp_blit(cmd, cs, iview, src->samples, false, format,
2354 src->gmem_offset, src->cpp);
2355 }
2356 if (dst->store_stencil) {
2357 store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2358 src->gmem_offset_stencil, src->samples);
2359 }
2360 }