src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
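/* Example: packing 0.5f into 8 bits gives
 * _mesa_lroundevenf(0.5f * 255.0f) = _mesa_lroundevenf(127.5f) = 128,
 * since the tie is rounded to the nearest even integer.
 */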
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
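/* The intermediate format chosen here determines how the solid-fill clear
 * color has to be encoded: r2d_clear_value() below packs UNORM8 components
 * as 8-bit (s)norm integers, FLOAT16 as half-float bits, and passes the
 * FLOAT32/INT32/INT16/INT8 cases through as raw 32-bit values.
 */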
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(src->x),
111 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
114 }
115
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_PITCH(pitch));
244 }
245
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 VkImageAspectFlags aspect_mask,
251 enum a6xx_rotation rotation,
252 bool clear,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 /* note: the only format with partial clearing is D24S8 */
260 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
261 /* preserve stencil channel */
262 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
263 unknown_8c01 = 0x08000041;
264 /* preserve depth channels */
265 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
266 unknown_8c01 = 0x00084001;
267 }
268
269 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
270 tu_cs_emit(cs, unknown_8c01);
271
272 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
273 .scissor = scissor,
274 .rotate = rotation,
275 .solid_color = clear,
276 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
277 .color_format = format,
278 .mask = 0xf,
279 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
280 ).value;
281
282 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
283 tu_cs_emit(cs, blit_cntl);
284
285 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
286 tu_cs_emit(cs, blit_cntl);
287
288 if (format == FMT6_10_10_10_2_UNORM_DEST)
289 format = FMT6_16_16_16_16_FLOAT;
290
291 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
292 .sint = vk_format_is_sint(vk_format),
293 .uint = vk_format_is_uint(vk_format),
294 .color_format = format,
295 .srgb = vk_format_is_srgb(vk_format),
296 .mask = 0xf));
297 }
298
299 static void
300 r2d_setup(struct tu_cmd_buffer *cmd,
301 struct tu_cs *cs,
302 VkFormat vk_format,
303 VkImageAspectFlags aspect_mask,
304 enum a6xx_rotation rotation,
305 bool clear)
306 {
307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
308
309 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, false);
310 }
311
312 static void
313 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
314 {
315 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
316 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
317 }
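/* CP_BLIT with BLIT_OP_SCALE kicks off a single 2D blit using the
 * GRAS_2D_* / RB_2D_* / SP_PS_2D_* state emitted by the r2d_ helpers above.
 */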
318
319 /* r3d_ = shader path operations */
320
321 void
322 tu_init_clear_blit_shaders(struct tu6_global *global)
323 {
324 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
325 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
326 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
327
328 static const instr_t vs_code[] = {
329 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
330 * r1.xy = r0.w ? c1.zw : c0.zw
331 * r0.w = 1.0f
332 */
333 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
334 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
335 .src2 = 3,
336 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
337 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
338 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
339 .src2 = 3,
340 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
341 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
342 { .cat0 = { .opc = OPC_END } },
343 };
344
345 static const instr_t fs_blit[] = {
346 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
347 * blit path (it's not clear what allows it to omit it)
348 */
349 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
350 { .cat0 = { .opc = OPC_END } },
351 };
352
353 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
354 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
355
356 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
357 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
358 for (uint32_t i = 0; i < num_rts; i++) {
359 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
360 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
361 }
362 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
363 }
364 }
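/* Summary of the hand-assembled shaders above: the VS reads the vertex ID in
 * r0.w and uses SEL_B32 to pick between constants c0 and c1, so vertex 0 gets
 * the first coordinate record and vertex 1 the second, which is enough for a
 * two-vertex RECTLIST. The clear FS variants just copy num_rts constant vec4s
 * to the color outputs. The blit FS contains only the bary.f instruction; the
 * texture fetch itself is presumably covered by the sampler prefetch set up
 * in r3d_common() below.
 */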
365
366 static void
367 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
368 bool layered_clear)
369 {
370 struct ir3_const_state dummy_const_state = {};
371 struct ir3_shader dummy_shader = {};
372
373 struct ir3_shader_variant vs = {
374 .type = MESA_SHADER_VERTEX,
375 .instrlen = 1,
376 .constlen = 4,
377 .info.max_reg = 1,
378 .inputs_count = 1,
379 .inputs[0] = {
380 .slot = SYSTEM_VALUE_VERTEX_ID,
381 .regid = regid(0, 3),
382 .sysval = true,
383 },
384 .outputs_count = blit ? 2 : 1,
385 .outputs[0] = {
386 .slot = VARYING_SLOT_POS,
387 .regid = regid(0, 0),
388 },
389 .outputs[1] = {
390 .slot = VARYING_SLOT_VAR0,
391 .regid = regid(1, 0),
392 },
393 .shader = &dummy_shader,
394 .const_state = &dummy_const_state,
395 };
396 if (layered_clear) {
397 vs.outputs[1].slot = VARYING_SLOT_LAYER;
398 vs.outputs[1].regid = regid(1, 1);
399 vs.outputs_count = 2;
400 }
401
402 struct ir3_shader_variant fs = {
403 .type = MESA_SHADER_FRAGMENT,
404 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
405 .constlen = align(num_rts, 4),
406 .info.max_reg = MAX2(num_rts, 1) - 1,
407 .total_in = blit ? 2 : 0,
408 .num_samp = blit ? 1 : 0,
409 .inputs_count = blit ? 2 : 0,
410 .inputs[0] = {
411 .slot = VARYING_SLOT_VAR0,
412 .inloc = 0,
413 .compmask = 3,
414 .bary = true,
415 },
416 .inputs[1] = {
417 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
418 .regid = regid(0, 0),
419 .sysval = 1,
420 },
421 .num_sampler_prefetch = blit ? 1 : 0,
422 .sampler_prefetch[0] = {
423 .src = 0,
424 .wrmask = 0xf,
425 .cmd = 4,
426 },
427 .shader = &dummy_shader,
428 .const_state = &dummy_const_state,
429 };
430
431 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
432 .vs_state = true,
433 .hs_state = true,
434 .ds_state = true,
435 .gs_state = true,
436 .fs_state = true,
437 .cs_state = true,
438 .gfx_ibo = true,
439 .cs_ibo = true,
440 .gfx_shared_const = true,
441 .gfx_bindless = 0x1f,
442 .cs_bindless = 0x1f));
443
444 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
445 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
446 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
447 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
448 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
449 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
450
451 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
452 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
453
454 tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
455
456 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
457 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
458 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
459
460 tu6_emit_fs_inputs(cs, &fs);
461
462 tu_cs_emit_regs(cs,
463 A6XX_GRAS_CL_CNTL(
464 .persp_division_disable = 1,
465 .vp_xform_disable = 1,
466 .vp_clip_code_ignore = 1,
467 .clip_disable = 1));
468 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
469
470 tu_cs_emit_regs(cs,
471 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
472 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
473 tu_cs_emit_regs(cs,
474 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
475 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
476
477 tu_cs_emit_regs(cs,
478 A6XX_VFD_INDEX_OFFSET(),
479 A6XX_VFD_INSTANCE_START_OFFSET());
480 }
481
482 static void
483 r3d_coords_raw(struct tu_cs *cs, const float *coords)
484 {
485 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
486 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
487 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
488 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
489 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
490 CP_LOAD_STATE6_0_NUM_UNIT(2));
491 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
492 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
493 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
494 }
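/* NUM_UNIT(2) uploads two vec4 constants: c0 = (dst.x, dst.y, src.x, src.y)
 * for the first rectlist corner and c1 the same record for the opposite
 * corner, which is exactly what the SEL_B32 instructions in the built-in VS
 * select between.
 */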
495
496 static void
497 r3d_coords(struct tu_cs *cs,
498 const VkOffset2D *dst,
499 const VkOffset2D *src,
500 const VkExtent2D *extent)
501 {
502 int32_t src_x1 = src ? src->x : 0;
503 int32_t src_y1 = src ? src->y : 0;
504 r3d_coords_raw(cs, (float[]) {
505 dst->x, dst->y,
506 src_x1, src_y1,
507 dst->x + extent->width, dst->y + extent->height,
508 src_x1 + extent->width, src_y1 + extent->height,
509 });
510 }
511
512 static void
513 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
514 {
515 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
516 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
517 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
518 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
519 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
520 CP_LOAD_STATE6_0_NUM_UNIT(1));
521 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
522 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
523 switch (format) {
524 case VK_FORMAT_X8_D24_UNORM_PACK32:
525 case VK_FORMAT_D24_UNORM_S8_UINT: {
526 /* cleared as r8g8b8a8_unorm using special format */
527 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
528 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
529 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
530 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
531 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
532 } break;
533 case VK_FORMAT_D16_UNORM:
534 case VK_FORMAT_D32_SFLOAT:
535 tu_cs_emit(cs, fui(val->depthStencil.depth));
536 tu_cs_emit(cs, 0);
537 tu_cs_emit(cs, 0);
538 tu_cs_emit(cs, 0);
539 break;
540 case VK_FORMAT_S8_UINT:
541 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
542 tu_cs_emit(cs, 0);
543 tu_cs_emit(cs, 0);
544 tu_cs_emit(cs, 0);
545 break;
546 default:
547 /* as color formats use clear value as-is */
548 assert(!vk_format_is_depth_or_stencil(format));
549 tu_cs_emit_array(cs, val->color.uint32, 4);
550 break;
551 }
552 }
553
554 static void
555 r3d_src_common(struct tu_cmd_buffer *cmd,
556 struct tu_cs *cs,
557 const uint32_t *tex_const,
558 uint32_t offset_base,
559 uint32_t offset_ubwc,
560 VkFilter filter)
561 {
562 struct tu_cs_memory texture = { };
563 VkResult result = tu_cs_alloc(&cmd->sub_cs,
564 2, /* allocate space for a sampler too */
565 A6XX_TEX_CONST_DWORDS, &texture);
566 assert(result == VK_SUCCESS);
567
568 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
569
570 /* patch addresses for layer offset */
571 *(uint64_t*) (texture.map + 4) += offset_base;
572 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
573 texture.map[7] = ubwc_addr;
574 texture.map[8] = ubwc_addr >> 32;
575
576 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
577 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
578 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
579 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
580 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
581 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
582 0x60000; /* XXX used by blob, doesn't seem necessary */
583 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
584 0x1 | /* XXX used by blob, doesn't seem necessary */
585 A6XX_TEX_SAMP_1_UNNORM_COORDS |
586 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
587 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
588 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
589
590 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
591 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
592 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
593 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
594 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
595 CP_LOAD_STATE6_0_NUM_UNIT(1));
596 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
597
598 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
599 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
600
601 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
602 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
603 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
604 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
605 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
606 CP_LOAD_STATE6_0_NUM_UNIT(1));
607 tu_cs_emit_qw(cs, texture.iova);
608
609 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
610 tu_cs_emit_qw(cs, texture.iova);
611
612 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
613 }
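/* tu_cs_alloc() above reserves two A6XX_TEX_CONST_DWORDS-sized slots back to
 * back: the first holds the (patched) texture descriptor and the second the
 * sampler, which is why the sampler pointers use
 * texture.iova + A6XX_TEX_CONST_DWORDS * 4.
 */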
614
615 static void
616 r3d_src(struct tu_cmd_buffer *cmd,
617 struct tu_cs *cs,
618 const struct tu_image_view *iview,
619 uint32_t layer,
620 VkFilter filter)
621 {
622 r3d_src_common(cmd, cs, iview->descriptor,
623 iview->layer_size * layer,
624 iview->ubwc_layer_size * layer,
625 filter);
626 }
627
628 static void
629 r3d_src_buffer(struct tu_cmd_buffer *cmd,
630 struct tu_cs *cs,
631 VkFormat vk_format,
632 uint64_t va, uint32_t pitch,
633 uint32_t width, uint32_t height)
634 {
635 uint32_t desc[A6XX_TEX_CONST_DWORDS];
636
637 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
638
639 desc[0] =
640 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
641 A6XX_TEX_CONST_0_FMT(format.fmt) |
642 A6XX_TEX_CONST_0_SWAP(format.swap) |
643 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
644 // XXX to swizzle into .w for stencil buffer_to_image
645 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
646 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
647 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
648 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
649 desc[2] =
650 A6XX_TEX_CONST_2_PITCH(pitch) |
651 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
652 desc[3] = 0;
653 desc[4] = va;
654 desc[5] = va >> 32;
655 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
656 desc[i] = 0;
657
658 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
659 }
660
661 static void
662 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
663 {
664 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
665
666 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
667 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
668 tu_cs_image_ref(cs, iview, layer);
669 tu_cs_emit(cs, 0);
670
671 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
672 tu_cs_image_flag_ref(cs, iview, layer);
673
674 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
675 }
676
677 static void
678 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
679 {
680 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
681
682 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
683
684 tu_cs_emit_regs(cs,
685 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
686 A6XX_RB_MRT_PITCH(0, pitch),
687 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
688 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
689 A6XX_RB_MRT_BASE_HI(0, va >> 32),
690 A6XX_RB_MRT_BASE_GMEM(0, 0));
691
692 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
693 }
694
695 static uint8_t
696 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
697 {
698 uint8_t mask = 0xf;
699 assert(aspect_mask);
700 /* note: the only format with partial writing is D24S8,
701 * clear/blit uses the _AS_R8G8B8A8 format to access it
702 */
703 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
704 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
705 mask = 0x7;
706 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
707 mask = 0x8;
708 }
709 return mask;
710 }
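/* With the Z24S8-as-R8G8B8A8 view used by clears/blits, the depth bits land
 * in .rgb and stencil in .a, hence mask 0x7 for depth-only writes and 0x8 for
 * stencil-only writes.
 */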
711
712 static void
713 r3d_setup(struct tu_cmd_buffer *cmd,
714 struct tu_cs *cs,
715 VkFormat vk_format,
716 VkImageAspectFlags aspect_mask,
717 enum a6xx_rotation rotation,
718 bool clear)
719 {
720 if (!cmd->state.pass) {
721 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
722 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
723 }
724
725 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
726 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
727
728 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
729
730 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
731 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
732 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
733 0xfc000000);
734 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
735
736 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
737 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
738
739 tu_cs_emit_regs(cs,
740 A6XX_RB_FS_OUTPUT_CNTL0(),
741 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
742
743 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
744 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
745 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
746
747 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
748 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
749 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
750 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
751 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
752 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
753 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
754
755 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
756 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
757
758 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
759 .color_format = tu6_base_format(vk_format),
760 .color_sint = vk_format_is_sint(vk_format),
761 .color_uint = vk_format_is_uint(vk_format)));
762
763 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
764 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
765 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
766 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
767 }
768
769 static void
770 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
771 {
772 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
773 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
774 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
775 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
776 tu_cs_emit(cs, 1); /* instance count */
777 tu_cs_emit(cs, 2); /* vertex count */
778 }
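/* The 3D path draws a two-vertex RECTLIST with auto-generated indices; the
 * two vertices are the opposite corners produced by the built-in VS.
 */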
779
780 /* blit ops - common interface for 2d/shader paths */
781
782 struct blit_ops {
783 void (*coords)(struct tu_cs *cs,
784 const VkOffset2D *dst,
785 const VkOffset2D *src,
786 const VkExtent2D *extent);
787 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
788 void (*src)(
789 struct tu_cmd_buffer *cmd,
790 struct tu_cs *cs,
791 const struct tu_image_view *iview,
792 uint32_t layer,
793 VkFilter filter);
794 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
795 VkFormat vk_format,
796 uint64_t va, uint32_t pitch,
797 uint32_t width, uint32_t height);
798 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
799 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
800 void (*setup)(struct tu_cmd_buffer *cmd,
801 struct tu_cs *cs,
802 VkFormat vk_format,
803 VkImageAspectFlags aspect_mask,
804 enum a6xx_rotation rotation,
805 bool clear);
806 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
807 };
808
809 static const struct blit_ops r2d_ops = {
810 .coords = r2d_coords,
811 .clear_value = r2d_clear_value,
812 .src = r2d_src,
813 .src_buffer = r2d_src_buffer,
814 .dst = r2d_dst,
815 .dst_buffer = r2d_dst_buffer,
816 .setup = r2d_setup,
817 .run = r2d_run,
818 };
819
820 static const struct blit_ops r3d_ops = {
821 .coords = r3d_coords,
822 .clear_value = r3d_clear_value,
823 .src = r3d_src,
824 .src_buffer = r3d_src_buffer,
825 .dst = r3d_dst,
826 .dst_buffer = r3d_dst_buffer,
827 .setup = r3d_setup,
828 .run = r3d_run,
829 };
830
831 /* passthrough: set coords from 3D offsets and extents */
832 static void
833 coords(const struct blit_ops *ops,
834 struct tu_cs *cs,
835 const VkOffset3D *dst,
836 const VkOffset3D *src,
837 const VkExtent3D *extent)
838 {
839 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
840 }
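/* The casts are valid because VkOffset3D starts with (x, y) and VkExtent3D
 * with (width, height); the z/depth members are simply ignored.
 */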
841
842 static VkFormat
843 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
844 {
845 if (vk_format_is_compressed(format)) {
846 switch (vk_format_get_blocksize(format)) {
847 case 1: return VK_FORMAT_R8_UINT;
848 case 2: return VK_FORMAT_R16_UINT;
849 case 4: return VK_FORMAT_R32_UINT;
850 case 8: return VK_FORMAT_R32G32_UINT;
851 case 16:return VK_FORMAT_R32G32B32A32_UINT;
852 default:
853 unreachable("unhandled format size");
854 }
855 }
856
857 switch (format) {
858 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
859 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
860 return VK_FORMAT_R8G8_UNORM;
861 /* fallthrough */
862 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
863 return VK_FORMAT_R8_UNORM;
864 case VK_FORMAT_D24_UNORM_S8_UINT:
865 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
866 return VK_FORMAT_R8_UNORM;
867 /* fallthrough */
868 default:
869 return format;
870 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
871 return VK_FORMAT_R32_UINT;
872 }
873 }
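/* Compressed images are copied through an uncompressed format with the same
 * block size, e.g. BC1 (8-byte blocks) is copied as VK_FORMAT_R32G32_UINT;
 * copy_compressed() below converts offsets and extents to block units to
 * match.
 */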
874
875 static void
876 tu_image_view_copy_blit(struct tu_image_view *iview,
877 struct tu_image *image,
878 VkFormat format,
879 const VkImageSubresourceLayers *subres,
880 uint32_t layer,
881 bool stencil_read)
882 {
883 VkImageAspectFlags aspect_mask = subres->aspectMask;
884
885 /* always use the AS_R8G8B8A8 format for these */
886 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
887 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
888 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
889 }
890
891 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
892 .image = tu_image_to_handle(image),
893 .viewType = VK_IMAGE_VIEW_TYPE_2D,
894 .format = format,
895 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
896 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
897 .subresourceRange = {
898 .aspectMask = aspect_mask,
899 .baseMipLevel = subres->mipLevel,
900 .levelCount = 1,
901 .baseArrayLayer = subres->baseArrayLayer + layer,
902 .layerCount = 1,
903 },
904 });
905 }
906
907 static void
908 tu_image_view_copy(struct tu_image_view *iview,
909 struct tu_image *image,
910 VkFormat format,
911 const VkImageSubresourceLayers *subres,
912 uint32_t layer,
913 bool stencil_read)
914 {
915 format = copy_format(format, subres->aspectMask, false);
916 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
917 }
918
919 static void
920 tu_image_view_blit(struct tu_image_view *iview,
921 struct tu_image *image,
922 const VkImageSubresourceLayers *subres,
923 uint32_t layer)
924 {
925 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
926 }
927
928 static void
929 tu6_blit_image(struct tu_cmd_buffer *cmd,
930 struct tu_image *src_image,
931 struct tu_image *dst_image,
932 const VkImageBlit *info,
933 VkFilter filter)
934 {
935 const struct blit_ops *ops = &r2d_ops;
936 struct tu_cs *cs = &cmd->cs;
937 uint32_t layers;
938
939 /* the 2D blit path can't mirror just by swapping coordinates; mirroring has to go through the rotate field */
940 static const enum a6xx_rotation rotate[2][2] = {
941 {ROTATE_0, ROTATE_HFLIP},
942 {ROTATE_VFLIP, ROTATE_180},
943 };
944
945 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
946 (info->dstOffsets[1].x < info->dstOffsets[0].x);
947 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
948 (info->dstOffsets[1].y < info->dstOffsets[0].y);
949 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
950 (info->dstOffsets[1].z < info->dstOffsets[0].z);
951
952 if (mirror_z) {
953 tu_finishme("blit z mirror\n");
954 return;
955 }
956
957 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
958 info->dstOffsets[1].z - info->dstOffsets[0].z) {
959 tu_finishme("blit z filter\n");
960 return;
961 }
962
963 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
964 if (info->dstSubresource.layerCount > 1) {
965 assert(layers <= 1);
966 layers = info->dstSubresource.layerCount;
967 }
968
969 /* BC1_RGB_* formats need to have their last component overridden with 1
970 * when sampling, which is normally handled with the texture descriptor
971 * swizzle. The 2d path can't handle that, so use the 3d path.
972 *
973 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
974 * the 2d path.
975 */
976
977 if (dst_image->samples > 1 ||
978 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
979 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
980 filter == VK_FILTER_CUBIC_EXT)
981 ops = &r3d_ops;
982
983 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
984 * figure out why (should be able to pass all tests with only shader path)
985 */
986
987 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
988 rotate[mirror_y][mirror_x], false);
989
990 if (ops == &r3d_ops) {
991 r3d_coords_raw(cs, (float[]) {
992 info->dstOffsets[0].x, info->dstOffsets[0].y,
993 info->srcOffsets[0].x, info->srcOffsets[0].y,
994 info->dstOffsets[1].x, info->dstOffsets[1].y,
995 info->srcOffsets[1].x, info->srcOffsets[1].y
996 });
997 } else {
998 tu_cs_emit_regs(cs,
999 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1000 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1001 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1002 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1003 tu_cs_emit_regs(cs,
1004 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1005 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1006 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1007 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1008 }
1009
1010 struct tu_image_view dst, src;
1011 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1012 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1013
1014 for (uint32_t i = 0; i < layers; i++) {
1015 ops->dst(cs, &dst, i);
1016 ops->src(cmd, cs, &src, i, filter);
1017 ops->run(cmd, cs);
1018 }
1019 }
1020
1021 void
1022 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1023 VkImage srcImage,
1024 VkImageLayout srcImageLayout,
1025 VkImage dstImage,
1026 VkImageLayout dstImageLayout,
1027 uint32_t regionCount,
1028 const VkImageBlit *pRegions,
1029 VkFilter filter)
1030
1031 {
1032 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1033 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1034 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1035
1036 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1037 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1038
1039 for (uint32_t i = 0; i < regionCount; ++i)
1040 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1041 }
1042
1043 static void
1044 copy_compressed(VkFormat format,
1045 VkOffset3D *offset,
1046 VkExtent3D *extent,
1047 uint32_t *width,
1048 uint32_t *height)
1049 {
1050 if (!vk_format_is_compressed(format))
1051 return;
1052
1053 uint32_t block_width = vk_format_get_blockwidth(format);
1054 uint32_t block_height = vk_format_get_blockheight(format);
1055
1056 offset->x /= block_width;
1057 offset->y /= block_height;
1058
1059 if (extent) {
1060 extent->width = DIV_ROUND_UP(extent->width, block_width);
1061 extent->height = DIV_ROUND_UP(extent->height, block_height);
1062 }
1063 if (width)
1064 *width = DIV_ROUND_UP(*width, block_width);
1065 if (height)
1066 *height = DIV_ROUND_UP(*height, block_height);
1067 }
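/* Example: for BC1 (4x4 texel blocks), an imageOffset of (8, 8) with a 16x16
 * extent becomes offset (2, 2) and extent 4x4 in block units.
 */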
1068
1069 static void
1070 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1071 struct tu_buffer *src_buffer,
1072 struct tu_image *dst_image,
1073 const VkBufferImageCopy *info)
1074 {
1075 struct tu_cs *cs = &cmd->cs;
1076 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1077 VkFormat src_format =
1078 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1079 const struct blit_ops *ops = &r2d_ops;
1080
1081 /* special case for buffer to stencil */
1082 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1083 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1084 ops = &r3d_ops;
1085 }
1086
1087 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1088 * which matters for UBWC. buffer_to_image/etc can fail because of this
1089 */
1090
1091 VkOffset3D offset = info->imageOffset;
1092 VkExtent3D extent = info->imageExtent;
1093 uint32_t src_width = info->bufferRowLength ?: extent.width;
1094 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1095
1096 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1097
1098 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1099 uint32_t layer_size = src_height * pitch;
1100
1101 ops->setup(cmd, cs,
1102 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1103 info->imageSubresource.aspectMask, ROTATE_0, false);
1104
1105 struct tu_image_view dst;
1106 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1107
1108 for (uint32_t i = 0; i < layers; i++) {
1109 ops->dst(cs, &dst, i);
1110
1111 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1112 if ((src_va & 63) || (pitch & 63)) {
1113 for (uint32_t y = 0; y < extent.height; y++) {
1114 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1115 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1116 x + extent.width, 1);
1117 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1118 &(VkExtent2D) {extent.width, 1});
1119 ops->run(cmd, cs);
1120 src_va += pitch;
1121 }
1122 } else {
1123 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1124 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1125 ops->run(cmd, cs);
1126 }
1127 }
1128 }
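/* The unaligned path above works around what appears to be a 64-byte
 * alignment requirement on the blit source base address and pitch: the base
 * is aligned down to 64 bytes, the x coordinate is offset by the remainder
 * (in texels), and the copy is emitted one row at a time.
 * tu_copy_image_to_buffer() and copy_buffer() below use the same trick on the
 * destination side.
 */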
1129
1130 void
1131 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1132 VkBuffer srcBuffer,
1133 VkImage dstImage,
1134 VkImageLayout dstImageLayout,
1135 uint32_t regionCount,
1136 const VkBufferImageCopy *pRegions)
1137 {
1138 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1139 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1140 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1141
1142 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1143 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1144
1145 for (unsigned i = 0; i < regionCount; ++i)
1146 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1147 }
1148
1149 static void
1150 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1151 struct tu_image *src_image,
1152 struct tu_buffer *dst_buffer,
1153 const VkBufferImageCopy *info)
1154 {
1155 struct tu_cs *cs = &cmd->cs;
1156 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1157 VkFormat dst_format =
1158 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1159 bool stencil_read = false;
1160
1161 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1162 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1163 stencil_read = true;
1164 }
1165
1166 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1167 VkOffset3D offset = info->imageOffset;
1168 VkExtent3D extent = info->imageExtent;
1169 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1170 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1171
1172 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1173
1174 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1175 uint32_t layer_size = pitch * dst_height;
1176
1177 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1178
1179 struct tu_image_view src;
1180 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1181
1182 for (uint32_t i = 0; i < layers; i++) {
1183 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1184
1185 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1186 if ((dst_va & 63) || (pitch & 63)) {
1187 for (uint32_t y = 0; y < extent.height; y++) {
1188 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1189 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1190 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1191 &(VkExtent2D) {extent.width, 1});
1192 ops->run(cmd, cs);
1193 dst_va += pitch;
1194 }
1195 } else {
1196 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1197 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1198 ops->run(cmd, cs);
1199 }
1200 }
1201 }
1202
1203 void
1204 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1205 VkImage srcImage,
1206 VkImageLayout srcImageLayout,
1207 VkBuffer dstBuffer,
1208 uint32_t regionCount,
1209 const VkBufferImageCopy *pRegions)
1210 {
1211 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1212 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1213 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1214
1215 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1216 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1217
1218 for (unsigned i = 0; i < regionCount; ++i)
1219 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1220 }
1221
1222 /* Tiled formats don't support swapping, which means that we can't support
1223 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1224 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1225 * Currently we fake support for tiled swapped formats and use the unswapped
1226 * format instead, but this means that reinterpreting copies to and from
1227 * swapped formats can't be performed correctly unless we can swizzle the
1228 * components by reinterpreting the other image as the "correct" swapped
1229 * format, i.e. only when the other image is linear.
1230 */
1231
1232 static bool
1233 is_swapped_format(VkFormat format)
1234 {
1235 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1236 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1237 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1238 }
1239
1240 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1241 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1242 * versa). This should mirror the logic in fdl6_layout.
1243 */
1244 static bool
1245 image_is_r8g8(struct tu_image *image)
1246 {
1247 return image->layout[0].cpp == 2 &&
1248 vk_format_get_nr_components(image->vk_format) == 2;
1249 }
1250
1251 static void
1252 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1253 struct tu_image *src_image,
1254 struct tu_image *dst_image,
1255 const VkImageCopy *info)
1256 {
1257 const struct blit_ops *ops = &r2d_ops;
1258 struct tu_cs *cs = &cmd->cs;
1259
1260 if (dst_image->samples > 1)
1261 ops = &r3d_ops;
1262
1263 VkFormat format = VK_FORMAT_UNDEFINED;
1264 VkOffset3D src_offset = info->srcOffset;
1265 VkOffset3D dst_offset = info->dstOffset;
1266 VkExtent3D extent = info->extent;
1267
1268 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1269 * Images":
1270 *
1271 * When copying between compressed and uncompressed formats the extent
1272 * members represent the texel dimensions of the source image and not
1273 * the destination. When copying from a compressed image to an
1274 * uncompressed image the image texel dimensions written to the
1275 * uncompressed image will be source extent divided by the compressed
1276 * texel block dimensions. When copying from an uncompressed image to a
1277 * compressed image the image texel dimensions written to the compressed
1278 * image will be the source extent multiplied by the compressed texel
1279 * block dimensions.
1280 *
1281 * This means we only have to adjust the extent if the source image is
1282 * compressed.
1283 */
1284 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1285 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1286
1287 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1288 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1289
1290 bool use_staging_blit = false;
1291
1292 if (src_format == dst_format) {
1293 /* Images that share a format can always be copied directly because it's
1294 * the same as a blit.
1295 */
1296 format = src_format;
1297 } else if (!src_image->layout[0].tile_mode) {
1298 /* If an image is linear, we can always safely reinterpret it with the
1299 * other image's format and then do a regular blit.
1300 */
1301 format = dst_format;
1302 } else if (!dst_image->layout[0].tile_mode) {
1303 format = src_format;
1304 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1305 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1306 * due to the different tile layout.
1307 */
1308 use_staging_blit = true;
1309 } else if (is_swapped_format(src_format) ||
1310 is_swapped_format(dst_format)) {
1311 /* If either format has a non-identity swap, then we can't copy
1312 * to/from it.
1313 */
1314 use_staging_blit = true;
1315 } else if (!src_image->layout[0].ubwc) {
1316 format = dst_format;
1317 } else if (!dst_image->layout[0].ubwc) {
1318 format = src_format;
1319 } else {
1320 /* Both formats use UBWC and so neither can be reinterpreted.
1321 * TODO: We could do an in-place decompression of the dst instead.
1322 */
1323 use_staging_blit = true;
1324 }
1325
1326 struct tu_image_view dst, src;
1327
1328 if (use_staging_blit) {
1329 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1330 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1331
1332 struct tu_image staging_image = {
1333 .vk_format = src_format,
1334 .type = src_image->type,
1335 .tiling = VK_IMAGE_TILING_LINEAR,
1336 .extent = extent,
1337 .level_count = 1,
1338 .layer_count = info->srcSubresource.layerCount,
1339 .samples = src_image->samples,
1340 .bo_offset = 0,
1341 };
1342
1343 VkImageSubresourceLayers staging_subresource = {
1344 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1345 .mipLevel = 0,
1346 .baseArrayLayer = 0,
1347 .layerCount = info->srcSubresource.layerCount,
1348 };
1349
1350 VkOffset3D staging_offset = { 0 };
1351
1352 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1353 staging_image.layout[0].ubwc = false;
1354
1355 fdl6_layout(&staging_image.layout[0],
1356 vk_format_to_pipe_format(staging_image.vk_format),
1357 staging_image.samples,
1358 staging_image.extent.width,
1359 staging_image.extent.height,
1360 staging_image.extent.depth,
1361 staging_image.level_count,
1362 staging_image.layer_count,
1363 staging_image.type == VK_IMAGE_TYPE_3D,
1364 NULL);
1365
1366 VkResult result = tu_get_scratch_bo(cmd->device,
1367 staging_image.layout[0].size,
1368 &staging_image.bo);
1369 if (result != VK_SUCCESS) {
1370 cmd->record_result = result;
1371 return;
1372 }
1373
1374 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1375 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1376
1377 struct tu_image_view staging;
1378 tu_image_view_copy(&staging, &staging_image, src_format,
1379 &staging_subresource, 0, false);
1380
1381 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1382 coords(ops, cs, &staging_offset, &src_offset, &extent);
1383
1384 for (uint32_t i = 0; i < info->extent.depth; i++) {
1385 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1386 ops->dst(cs, &staging, i);
1387 ops->run(cmd, cs);
1388 }
1389
1390 /* When executed by the user there has to be a pipeline barrier here,
1391 * but since we're doing it manually we'll have to flush ourselves.
1392 */
1393 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1394 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1395
1396 tu_image_view_copy(&staging, &staging_image, dst_format,
1397 &staging_subresource, 0, false);
1398
1399 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, ROTATE_0, false);
1400 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1401
1402 for (uint32_t i = 0; i < info->extent.depth; i++) {
1403 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1404 ops->dst(cs, &dst, i);
1405 ops->run(cmd, cs);
1406 }
1407 } else {
1408 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1409 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1410
1411 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, ROTATE_0, false);
1412 coords(ops, cs, &dst_offset, &src_offset, &extent);
1413
1414 for (uint32_t i = 0; i < info->extent.depth; i++) {
1415 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1416 ops->dst(cs, &dst, i);
1417 ops->run(cmd, cs);
1418 }
1419 }
1420 }
1421
1422 void
1423 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1424 VkImage srcImage,
1425 VkImageLayout srcImageLayout,
1426 VkImage destImage,
1427 VkImageLayout destImageLayout,
1428 uint32_t regionCount,
1429 const VkImageCopy *pRegions)
1430 {
1431 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1432 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1433 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1434
1435 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1436 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1437
1438 for (uint32_t i = 0; i < regionCount; ++i)
1439 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1440 }
1441
1442 static void
1443 copy_buffer(struct tu_cmd_buffer *cmd,
1444 uint64_t dst_va,
1445 uint64_t src_va,
1446 uint64_t size,
1447 uint32_t block_size)
1448 {
1449 const struct blit_ops *ops = &r2d_ops;
1450 struct tu_cs *cs = &cmd->cs;
1451 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1452 uint64_t blocks = size / block_size;
1453
1454 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1455
1456 while (blocks) {
1457 uint32_t src_x = (src_va & 63) / block_size;
1458 uint32_t dst_x = (dst_va & 63) / block_size;
1459 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1460
1461 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1462 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1463 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1464 ops->run(cmd, cs);
1465
1466 src_va += width * block_size;
1467 dst_va += width * block_size;
1468 blocks -= width;
1469 }
1470 }
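/* Buffer-to-buffer copies are expressed as 1-texel-high 2D blits: each pass
 * covers at most 0x4000 texels minus the unaligned x offsets, with both
 * addresses aligned down to 64 bytes as in the unaligned image copy paths
 * above.
 */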
1471
1472 void
1473 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1474 VkBuffer srcBuffer,
1475 VkBuffer dstBuffer,
1476 uint32_t regionCount,
1477 const VkBufferCopy *pRegions)
1478 {
1479 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1480 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1481 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1482
1483 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1484 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1485
1486 for (unsigned i = 0; i < regionCount; ++i) {
1487 copy_buffer(cmd,
1488 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1489 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1490 pRegions[i].size, 1);
1491 }
1492 }
1493
1494 void
1495 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1496 VkBuffer dstBuffer,
1497 VkDeviceSize dstOffset,
1498 VkDeviceSize dataSize,
1499 const void *pData)
1500 {
1501 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1502 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1503
1504 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1505
1506 struct tu_cs_memory tmp;
1507 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1508 if (result != VK_SUCCESS) {
1509 cmd->record_result = result;
1510 return;
1511 }
1512
1513 memcpy(tmp.map, pData, dataSize);
1514 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1515 }
1516
1517 void
1518 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1519 VkBuffer dstBuffer,
1520 VkDeviceSize dstOffset,
1521 VkDeviceSize fillSize,
1522 uint32_t data)
1523 {
1524 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1525 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1526 const struct blit_ops *ops = &r2d_ops;
1527 struct tu_cs *cs = &cmd->cs;
1528
1529 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1530
1531 if (fillSize == VK_WHOLE_SIZE)
1532 fillSize = buffer->size - dstOffset;
1533
1534 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1535 uint32_t blocks = fillSize / 4;
1536
1537 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true);
1538 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1539
1540 while (blocks) {
1541 uint32_t dst_x = (dst_va & 63) / 4;
1542 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1543
1544 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1545 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1546 ops->run(cmd, cs);
1547
1548 dst_va += width * 4;
1549 blocks -= width;
1550 }
1551 }
1552
1553 void
1554 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1555 VkImage srcImage,
1556 VkImageLayout srcImageLayout,
1557 VkImage dstImage,
1558 VkImageLayout dstImageLayout,
1559 uint32_t regionCount,
1560 const VkImageResolve *pRegions)
1561 {
1562 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1563 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1564 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1565 const struct blit_ops *ops = &r2d_ops;
1566 struct tu_cs *cs = &cmd->cs;
1567
1568 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1569 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1570
1571 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1572
1573 for (uint32_t i = 0; i < regionCount; ++i) {
1574 const VkImageResolve *info = &pRegions[i];
1575 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1576
1577 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1578 /* TODO: aspect masks possible ? */
1579
1580 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1581
1582 struct tu_image_view dst, src;
1583 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1584 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1585
1586 for (uint32_t i = 0; i < layers; i++) {
1587 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1588 ops->dst(cs, &dst, i);
1589 ops->run(cmd, cs);
1590 }
1591 }
1592 }
1593
1594 void
1595 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1596 struct tu_cs *cs,
1597 struct tu_image_view *src,
1598 struct tu_image_view *dst,
1599 uint32_t layers,
1600 const VkRect2D *rect)
1601 {
1602 const struct blit_ops *ops = &r2d_ops;
1603
1604 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1605 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1606
1607 assert(src->image->vk_format == dst->image->vk_format);
1608
1609 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1610 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1611
1612 for (uint32_t i = 0; i < layers; i++) {
1613 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1614 ops->dst(cs, dst, i);
1615 ops->run(cmd, cs);
1616 }
1617 }
1618
1619 static void
1620 clear_image(struct tu_cmd_buffer *cmd,
1621 struct tu_image *image,
1622 const VkClearValue *clear_value,
1623 const VkImageSubresourceRange *range)
1624 {
1625 uint32_t level_count = tu_get_levelCount(image, range);
1626 uint32_t layer_count = tu_get_layerCount(image, range);
1627 struct tu_cs *cs = &cmd->cs;
1628 VkFormat format = image->vk_format;
1629 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1630 format = VK_FORMAT_R32_UINT;
1631
1632 if (image->type == VK_IMAGE_TYPE_3D) {
1633 assert(layer_count == 1);
1634 assert(range->baseArrayLayer == 0);
1635 }
1636
1637 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1638
1639 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true);
1640 ops->clear_value(cs, image->vk_format, clear_value);
1641
1642 for (unsigned j = 0; j < level_count; j++) {
1643 if (image->type == VK_IMAGE_TYPE_3D)
1644 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1645
1646 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1647 u_minify(image->extent.width, range->baseMipLevel + j),
1648 u_minify(image->extent.height, range->baseMipLevel + j)
1649 });
1650
1651 struct tu_image_view dst;
1652 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1653 .aspectMask = range->aspectMask,
1654 .mipLevel = range->baseMipLevel + j,
1655 .baseArrayLayer = range->baseArrayLayer,
1656 .layerCount = 1,
1657 }, 0, false);
1658
1659 for (uint32_t i = 0; i < layer_count; i++) {
1660 ops->dst(cs, &dst, i);
1661 ops->run(cmd, cs);
1662 }
1663 }
1664 }
1665
1666 void
1667 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1668 VkImage image_h,
1669 VkImageLayout imageLayout,
1670 const VkClearColorValue *pColor,
1671 uint32_t rangeCount,
1672 const VkImageSubresourceRange *pRanges)
1673 {
1674 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1675 TU_FROM_HANDLE(tu_image, image, image_h);
1676
1677 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1678
1679 for (unsigned i = 0; i < rangeCount; i++)
1680 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1681 }
1682
1683 void
1684 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1685 VkImage image_h,
1686 VkImageLayout imageLayout,
1687 const VkClearDepthStencilValue *pDepthStencil,
1688 uint32_t rangeCount,
1689 const VkImageSubresourceRange *pRanges)
1690 {
1691 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1692 TU_FROM_HANDLE(tu_image, image, image_h);
1693
1694 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1695
1696 for (unsigned i = 0; i < rangeCount; i++)
1697 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1698 }
1699
1700 static void
1701 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1702 uint32_t attachment_count,
1703 const VkClearAttachment *attachments,
1704 uint32_t rect_count,
1705 const VkClearRect *rects)
1706 {
1707 const struct tu_subpass *subpass = cmd->state.subpass;
1708 /* note: cannot use the shader path here; the special shader path
1709 * lives in tu_clear_sysmem_attachments()
1710 */
1711 const struct blit_ops *ops = &r2d_ops;
1712 struct tu_cs *cs = &cmd->draw_cs;
1713
1714 for (uint32_t j = 0; j < attachment_count; j++) {
1715 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1716 * Pass Instance" says that:
1717 *
1718 * Unlike other clear commands, vkCmdClearAttachments executes as
1719 * a drawing command, rather than a transfer command, with writes
1720 * performed by it executing in rasterization order. Clears to
1721 * color attachments are executed as color attachment writes, by
1722 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1723 * Clears to depth/stencil attachments are executed as depth
1724 * writes and writes by the
1725 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1726 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1727 *
1728 * However, the 2d path here is executed the same way as a
1729 * transfer command, using the CCU color cache exclusively with
1730 * a special depth-as-color format for depth clears. This means that
1731 * we can't rely on the normal pipeline barrier mechanism here, and
1732 * have to manually flush whenever using a different cache domain
1733 * from what the 3d path would've used. This happens when we clear
1734 * depth/stencil, since normally depth attachments use CCU depth, but
1735 * we clear it using a special depth-as-color format. Since the clear
1736 * potentially uses a different attachment state we also need to
1737 * invalidate color beforehand and flush it afterwards.
1738 */
1739
1740 uint32_t a;
1741 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1742 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1743 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1744 } else {
1745 a = subpass->depth_stencil_attachment.attachment;
1746 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1747 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1748 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1749 }
1750
1751 if (a == VK_ATTACHMENT_UNUSED)
1752 continue;
1753
1754 const struct tu_image_view *iview =
1755 cmd->state.framebuffer->attachments[a].attachment;
1756
1757 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask, ROTATE_0, true);
1758 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1759
1760 /* Wait for the flushes we triggered manually to complete */
1761 tu_cs_emit_wfi(cs);
1762
1763 for (uint32_t i = 0; i < rect_count; i++) {
1764 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1765 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1766 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1767 ops->run(cmd, cs);
1768 }
1769 }
1770
1771 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1772 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1773 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1774 } else {
1775 /* sync color into depth */
1776 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1777 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1778 }
1779 }
1780 }
1781
1782 static void
1783 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1784 uint32_t attachment_count,
1785 const VkClearAttachment *attachments,
1786 uint32_t rect_count,
1787 const VkClearRect *rects)
1788 {
1789 /* the shader path here is special, it avoids changing MRT/etc state */
1790 const struct tu_render_pass *pass = cmd->state.pass;
1791 const struct tu_subpass *subpass = cmd->state.subpass;
1792 const uint32_t mrt_count = subpass->color_count;
1793 struct tu_cs *cs = &cmd->draw_cs;
1794 uint32_t clear_value[MAX_RTS][4];
1795 float z_clear_val = 0.0f;
1796 uint8_t s_clear_val = 0;
1797 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1798 bool z_clear = false;
1799 bool s_clear = false;
1800 bool layered_clear = false;
1801 uint32_t max_samples = 1;
1802
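/* first pass over the attachments: record which MRTs/aspects are cleared and
 * their clear values, and track the maximum sample count to decide between
 * the 2D path and the shader-based path below
 */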
1803 for (uint32_t i = 0; i < attachment_count; i++) {
1804 uint32_t a;
1805 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1806 uint32_t c = attachments[i].colorAttachment;
1807 a = subpass->color_attachments[c].attachment;
1808 if (a == VK_ATTACHMENT_UNUSED)
1809 continue;
1810
1811 clear_rts |= 1 << c;
1812 clear_components |= 0xf << (c * 4);
1813 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1814 } else {
1815 a = subpass->depth_stencil_attachment.attachment;
1816 if (a == VK_ATTACHMENT_UNUSED)
1817 continue;
1818
1819 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1820 z_clear = true;
1821 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1822 }
1823
1824 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1825 s_clear = true;
1826 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1827 }
1828 }
1829
1830 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1831 }
1832
1833 /* prefer the 2D path for clears:
1834 * 2D can't clear separate depth/stencil or MSAA, and needs a known framebuffer
1835 */
1836 if (max_samples == 1 && cmd->state.framebuffer) {
1837 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1838 return;
1839 }
1840
1841 /* This clear path behaves like a draw, needs the same flush as tu_draw */
1842 tu_emit_cache_flush_renderpass(cmd, cs);
1843
1844 /* disable all draw states so they don't interfere with this path
1845 * TODO: use and re-use draw states for this path
1846 * the draw states have to be disabled individually to preserve the
1847 * input attachment states, because a secondary command buffer
1848 * would not be able to restore them
1849 */
1850 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1851 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1852 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1853 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1854 continue;
1855 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1856 CP_SET_DRAW_STATE__0_DISABLE);
1857 tu_cs_emit_qw(cs, 0);
1858 }
1859 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1860
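/* the clear shader writes no depth or sample-mask output (regid 0xfc marks an
 * output register as unused)
 */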
1861 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1862 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1863 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1864 0xfc000000);
1865 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1866
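/* assign consecutive FS output registers (one vec4 each) to the MRTs being
 * cleared; MRTs that aren't cleared keep regid 0 and have their writes masked
 * off via RB_MRT_CONTROL below
 */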
1867 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1868 for (uint32_t i = 0; i < mrt_count; i++) {
1869 if (clear_rts & (1 << i))
1870 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1871 else
1872 tu_cs_emit(cs, 0);
1873 }
1874
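/* if any rect targets a non-zero layer, use the layered-clear variant of the
 * 3D path; the destination layer is passed through the raw coordinates below
 */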
1875 for (uint32_t i = 0; i < rect_count; i++) {
1876 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1877 layered_clear = true;
1878 }
1879
1880 r3d_common(cmd, cs, false, num_rts, layered_clear);
1881
1882 tu_cs_emit_regs(cs,
1883 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1884 tu_cs_emit_regs(cs,
1885 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1886
1887 tu_cs_emit_regs(cs,
1888 A6XX_RB_FS_OUTPUT_CNTL0(),
1889 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1890
1891 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1892 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1893 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1894 for (uint32_t i = 0; i < mrt_count; i++) {
1895 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1896 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1897 }
1898
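/* always-pass depth/stencil state: write the depth clear value when z_clear is
 * set and replace stencil with the reference value when s_clear is set
 */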
1899 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1900 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1901 .z_enable = z_clear,
1902 .z_write_enable = z_clear,
1903 .zfunc = FUNC_ALWAYS));
1904 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1905 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1906 .stencil_enable = s_clear,
1907 .func = FUNC_ALWAYS,
1908 .zpass = STENCIL_REPLACE));
1909 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1910 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1911 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1912
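/* upload the clear colors as FS constants, one vec4 per cleared MRT, in the
 * same order as the output registers assigned above
 */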
1913 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1914 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1915 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1916 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1917 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1918 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1919 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1920 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1921 for_each_bit(b, clear_rts)
1922 tu_cs_emit_array(cs, clear_value[b], 4);
1923
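/* one rectangle draw per clear rect and layer: z carries the depth clear value
 * and the layer index is passed through the raw coordinates
 */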
1924 for (uint32_t i = 0; i < rect_count; i++) {
1925 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1926 r3d_coords_raw(cs, (float[]) {
1927 rects[i].rect.offset.x, rects[i].rect.offset.y,
1928 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1929 rects[i].rect.offset.x + rects[i].rect.extent.width,
1930 rects[i].rect.offset.y + rects[i].rect.extent.height,
1931 z_clear_val, 1.0f,
1932 });
1933 r3d_run(cmd, cs);
1934 }
1935 }
1936 }
1937
1938 static void
1939 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1940 {
1941 enum pipe_format pformat = vk_format_to_pipe_format(format);
1942
1943 switch (format) {
1944 case VK_FORMAT_X8_D24_UNORM_PACK32:
1945 case VK_FORMAT_D24_UNORM_S8_UINT:
1946 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1947 val->depthStencil.stencil << 24;
1948 return;
1949 case VK_FORMAT_D16_UNORM:
1950 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1951 return;
1952 case VK_FORMAT_D32_SFLOAT:
1953 clear_value[0] = fui(val->depthStencil.depth);
1954 return;
1955 case VK_FORMAT_S8_UINT:
1956 clear_value[0] = val->depthStencil.stencil;
1957 return;
1958 /* these formats use a different base format when tiled;
1959 * the same format can be used for both layouts because GMEM is always in WZYX order
1960 */
1961 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1962 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1963 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
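/* fallthrough to the generic packing path below */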
1964 default:
1965 break;
1966 }
1967
1968 VkClearColorValue color;
1969
1970 /*
1971 * GMEM is tiled and wants the components in WZYX order;
1972 * apply the swizzle to the color before packing, to counteract the
1973 * deswizzling applied by the packing functions
1974 */
1975 pipe_swizzle_4f(color.float32, val->color.float32,
1976 util_format_description(pformat)->swizzle);
1977
1978 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
1979 }
1980
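/* Clear a single attachment in GMEM: point the blit destination at the
 * attachment's GMEM offset, program the packed clear value, and trigger the
 * clear with a BLIT event.
 */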
1981 static void
1982 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1983 struct tu_cs *cs,
1984 uint32_t attachment,
1985 VkImageAspectFlags mask,
1986 const VkClearValue *value)
1987 {
1988 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
1989
1991 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
1992 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
1993
1994 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
1995 .clear_mask = aspect_write_mask(vk_format, mask)));
1996
1997 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1998 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
1999
2000 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2001 tu_cs_emit(cs, 0);
2002
2003 uint32_t clear_vals[4] = {};
2004 pack_gmem_clear_value(value, vk_format, clear_vals);
2005
2006 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2007 tu_cs_emit_array(cs, clear_vals, 4);
2008
2009 tu6_emit_event_write(cmd, cs, BLIT);
2010 }
2011
2012 static void
2013 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2014 uint32_t attachment_count,
2015 const VkClearAttachment *attachments,
2016 uint32_t rect_count,
2017 const VkClearRect *rects)
2018 {
2019 const struct tu_subpass *subpass = cmd->state.subpass;
2020 struct tu_cs *cs = &cmd->draw_cs;
2021
2022 /* TODO: swap the loops for smaller cmdstream */
2023 for (unsigned i = 0; i < rect_count; i++) {
2024 unsigned x1 = rects[i].rect.offset.x;
2025 unsigned y1 = rects[i].rect.offset.y;
2026 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2027 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2028
2029 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2030 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2031 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2032
2033 for (unsigned j = 0; j < attachment_count; j++) {
2034 uint32_t a;
2035 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2036 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2037 else
2038 a = subpass->depth_stencil_attachment.attachment;
2039
2040 if (a == VK_ATTACHMENT_UNUSED)
2041 continue;
2042
2043 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2044 &attachments[j].clearValue);
2045 }
2046 }
2047 }
2048
2049 void
2050 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2051 uint32_t attachmentCount,
2052 const VkClearAttachment *pAttachments,
2053 uint32_t rectCount,
2054 const VkClearRect *pRects)
2055 {
2056 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2057 struct tu_cs *cs = &cmd->draw_cs;
2058
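/* emit both the GMEM and the sysmem variants; CP_COND_EXEC makes the GPU
 * execute only the one matching the rendering mode eventually chosen for this
 * render pass
 */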
2059 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2060 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2061 tu_cond_exec_end(cs);
2062
2063 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2064 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2065 tu_cond_exec_end(cs);
2066 }
2067
2068 void
2069 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2070 struct tu_cs *cs,
2071 uint32_t a,
2072 const VkRenderPassBeginInfo *info)
2073 {
2074 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2075 const struct tu_image_view *iview = fb->attachments[a].attachment;
2076 const struct tu_render_pass_attachment *attachment =
2077 &cmd->state.pass->attachments[a];
2078
2079 if (!attachment->clear_mask)
2080 return;
2081
2082 const struct blit_ops *ops = &r2d_ops;
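/* the 2D path can't clear MSAA attachments, so fall back to the 3D path */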
2083 if (attachment->samples > 1)
2084 ops = &r3d_ops;
2085
2086 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, true);
2087 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2088 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2089
2090 /* Wait for any flushes at the beginning of the renderpass to complete */
2091 tu_cs_emit_wfi(cs);
2092
2093 for (uint32_t i = 0; i < fb->layers; i++) {
2094 ops->dst(cs, iview, i);
2095 ops->run(cmd, cs);
2096 }
2097
2098 /* The spec doesn't explicitly say, but presumably the initial renderpass
2099 * clear is considered part of the renderpass, so barriers
2100 * aren't required inside the subpass/renderpass. We therefore need to
2101 * flush CCU color into CCU depth here, just like with
2102 * vkCmdClearAttachments(). Note that because this only happens at the
2103 * beginning of a renderpass, and renderpass writes are considered
2104 * "incoherent", we shouldn't have to worry about syncing depth into color
2105 * beforehand, as depth should already be flushed.
2106 */
2107 if (vk_format_is_depth_or_stencil(attachment->format)) {
2108 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2109 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2110 } else {
2111 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2112 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2113 }
2114 }
2115
2116 void
2117 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2118 struct tu_cs *cs,
2119 uint32_t a,
2120 const VkRenderPassBeginInfo *info)
2121 {
2122 const struct tu_render_pass_attachment *attachment =
2123 &cmd->state.pass->attachments[a];
2124
2125 if (!attachment->clear_mask)
2126 return;
2127
2128 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2129
2130 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2131 &info->pClearValues[a]);
2132 }
2133
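/* Emit a GMEM load or store via the BLIT event: with resolve=false the
 * attachment is loaded from the image into GMEM, with resolve=true GMEM is
 * resolved/stored back into the image.
 */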
2134 static void
2135 tu_emit_blit(struct tu_cmd_buffer *cmd,
2136 struct tu_cs *cs,
2137 const struct tu_image_view *iview,
2138 const struct tu_render_pass_attachment *attachment,
2139 bool resolve)
2140 {
2141 tu_cs_emit_regs(cs,
2142 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2143
2144 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2145 .unk0 = !resolve,
2146 .gmem = !resolve,
2147 /* "integer" bit disables msaa resolve averaging */
2148 .integer = vk_format_is_int(attachment->format)));
2149
2150 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2151 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2152 tu_cs_image_ref_2d(cs, iview, 0, false);
2153
2154 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2155 tu_cs_image_flag_ref(cs, iview, 0);
2156
2157 tu_cs_emit_regs(cs,
2158 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2159
2160 tu6_emit_event_write(cmd, cs, BLIT);
2161 }
2162
2163 static bool
2164 blit_can_resolve(VkFormat format)
2165 {
2166 const struct util_format_description *desc = vk_format_description(format);
2167
2168 /* the blit event can only resolve simple cases:
2169 * averaging samples as unsigned integers or choosing only one sample
2170 */
2171 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2172 return false;
2173
2174 /* can't handle formats with channels larger than 10 bits
2175 * note: this includes all float formats
2176 * note2: single-channel integer formats seem OK
2177 */
2178 if (desc->channel[0].size > 10)
2179 return false;
2180
2181 switch (format) {
2182 /* for unknown reasons the blit event can't msaa-resolve these formats when tiled,
2183 * likely because these formats have a different layout from other cpp=2 formats
2184 */
2185 case VK_FORMAT_R8G8_UNORM:
2186 case VK_FORMAT_R8G8_UINT:
2187 case VK_FORMAT_R8G8_SINT:
2188 /* TODO: this one should be able to work? */
2189 case VK_FORMAT_D24_UNORM_S8_UINT:
2190 return false;
2191 default:
2192 break;
2193 }
2194
2195 return true;
2196 }
2197
2198 void
2199 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2200 struct tu_cs *cs,
2201 uint32_t a,
2202 bool force_load)
2203 {
2204 const struct tu_image_view *iview =
2205 cmd->state.framebuffer->attachments[a].attachment;
2206 const struct tu_render_pass_attachment *attachment =
2207 &cmd->state.pass->attachments[a];
2208
2209 if (attachment->load || force_load)
2210 tu_emit_blit(cmd, cs, iview, attachment, false);
2211 }
2212
2213 void
2214 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2215 struct tu_cs *cs,
2216 uint32_t a,
2217 uint32_t gmem_a)
2218 {
2219 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2220 const VkRect2D *render_area = &cmd->state.render_area;
2221 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2222 struct tu_image_view *iview = fb->attachments[a].attachment;
2223 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2224
2225 if (!dst->store)
2226 return;
2227
2228 uint32_t x1 = render_area->offset.x;
2229 uint32_t y1 = render_area->offset.y;
2230 uint32_t x2 = x1 + render_area->extent.width;
2231 uint32_t y2 = y1 + render_area->extent.height;
2232 /* x2/y2 can be unaligned if equal to the size of the image, since the
2233 * blit will then only write into padding space;
2234 * the one exception is linear levels, which don't have the
2235 * required y padding in the layout (except for the last level)
2236 */
2237 bool need_y2_align =
2238 y2 != iview->extent.height || iview->need_y2_align;
2239
2240 bool unaligned =
2241 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2242 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2243
2244 /* use fast path when render area is aligned, except for unsupported resolve cases */
2245 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2246 tu_emit_blit(cmd, cs, iview, src, true);
2247 return;
2248 }
2249
2250 if (dst->samples > 1) {
2251 /* presumably the shader path is needed in this case;
2252 * a testcase which fails because of this is still needed
2253 */
2254 tu_finishme("unaligned store of msaa attachment\n");
2255 return;
2256 }
2257
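/* slow path: use the 2D engine to read the attachment contents directly out of
 * GMEM (SP_PS_2D_SRC points at the GMEM base plus the attachment offset) and
 * blit them into the destination image
 */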
2258 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, true);
2259 r2d_dst(cs, iview, 0);
2260 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2261
2262 tu_cs_emit_regs(cs,
2263 A6XX_SP_PS_2D_SRC_INFO(
2264 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2265 .tile_mode = TILE6_2,
2266 .srgb = vk_format_is_srgb(src->format),
2267 .samples = tu_msaa_samples(src->samples),
2268 .samples_average = !vk_format_is_int(src->format),
2269 .unk20 = 1,
2270 .unk22 = 1),
2271 /* note: src size does not matter when not scaling */
2272 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2273 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2274 A6XX_SP_PS_2D_SRC_HI(),
2275 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2276
2277 /* sync GMEM writes with CACHE. */
2278 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2279
2280 /* Wait for CACHE_INVALIDATE to land */
2281 tu_cs_emit_wfi(cs);
2282
2283 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2284 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2285
2286 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2287 * sysmem, and we generally assume that GMEM renderpasses leave their
2288 * results in sysmem, so we need to flush manually here.
2289 */
2290 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2291 }