mesa.git: src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
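/* Map a hardware format to the 2D engine's intermediate format, which
 * determines how clear values are packed below and what precision the
 * blitter converts through (unorm8, fp16, fp32 or int8/16/32).
 */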
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(src->x),
111 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
114 }
115
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
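/* the three bytes of the 24-bit depth value land in r/g/b, stencil in a */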
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_PITCH(pitch));
244 }
245
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 VkImageAspectFlags aspect_mask,
251 enum a6xx_rotation rotation,
252 bool clear,
253 bool ubwc,
254 bool scissor)
255 {
256 enum a6xx_format format = tu6_base_format(vk_format);
257 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
258 uint32_t unknown_8c01 = 0;
259
260 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
261 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
262 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
263 }
264
265 /* note: the only format with partial clearing is D24S8 */
266 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
267 /* preserve stencil channel */
268 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
269 unknown_8c01 = 0x08000041;
270 /* preserve depth channels */
271 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
272 unknown_8c01 = 0x00084001;
273 }
274
275 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
276 tu_cs_emit(cs, unknown_8c01);
277
278 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
279 .scissor = scissor,
280 .rotate = rotation,
281 .solid_color = clear,
282 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
283 .color_format = format,
284 .mask = 0xf,
285 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
286 ).value;
287
288 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
289 tu_cs_emit(cs, blit_cntl);
290
291 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
292 tu_cs_emit(cs, blit_cntl);
293
294 if (format == FMT6_10_10_10_2_UNORM_DEST)
295 format = FMT6_16_16_16_16_FLOAT;
296
297 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
298 .sint = vk_format_is_sint(vk_format),
299 .uint = vk_format_is_uint(vk_format),
300 .color_format = format,
301 .srgb = vk_format_is_srgb(vk_format),
302 .mask = 0xf));
303 }
304
305 static void
306 r2d_setup(struct tu_cmd_buffer *cmd,
307 struct tu_cs *cs,
308 VkFormat vk_format,
309 VkImageAspectFlags aspect_mask,
310 enum a6xx_rotation rotation,
311 bool clear,
312 bool ubwc)
313 {
314 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
315
316 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
317 }
318
319 static void
320 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
321 {
322 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
323 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
324 }
325
326 /* r3d_ = shader path operations */
327
328 void
329 tu_init_clear_blit_shaders(struct tu6_global *global)
330 {
331 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
332 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
333 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
334
335 static const instr_t vs_code[] = {
336 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
337 * r1.xy = r0.w ? c1.zw : c0.zw
338 * r0.w = 1.0f
339 */
340 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
341 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
342 .src2 = 3,
343 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
344 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
345 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
346 .src2 = 3,
347 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
348 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
349 { .cat0 = { .opc = OPC_END } },
350 };
351
352 static const instr_t fs_blit[] = {
353 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
354 * blit path (its not clear what allows it to not have it)
355 */
356 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
357 { .cat0 = { .opc = OPC_END } },
358 };
359
360 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
361 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
362
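/* the clear shader for N render targets just moves N vec4 clear colors from
 * consts c0..c[N-1] into r0..r[N-1]
 */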
363 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
364 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
365 for (uint32_t i = 0; i < num_rts; i++) {
366 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
367 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
368 }
369 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
370 }
371 }
372
373 static void
374 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
375 bool layered_clear)
376 {
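/* Emit all state needed to run the hardcoded blit/clear shaders: hand-built
 * ir3_shader_variant descriptions matching the shaders written by
 * tu_init_clear_blit_shaders(), plus the fixed-function setup for drawing a
 * RECTLIST.
 */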
377 struct ir3_const_state dummy_const_state = {};
378 struct ir3_shader dummy_shader = {};
379
380 struct ir3_shader_variant vs = {
381 .type = MESA_SHADER_VERTEX,
382 .instrlen = 1,
383 .constlen = 4,
384 .info.max_reg = 1,
385 .inputs_count = 1,
386 .inputs[0] = {
387 .slot = SYSTEM_VALUE_VERTEX_ID,
388 .regid = regid(0, 3),
389 .sysval = true,
390 },
391 .outputs_count = blit ? 2 : 1,
392 .outputs[0] = {
393 .slot = VARYING_SLOT_POS,
394 .regid = regid(0, 0),
395 },
396 .outputs[1] = {
397 .slot = VARYING_SLOT_VAR0,
398 .regid = regid(1, 0),
399 },
400 .shader = &dummy_shader,
401 .const_state = &dummy_const_state,
402 };
403 if (layered_clear) {
404 vs.outputs[1].slot = VARYING_SLOT_LAYER;
405 vs.outputs[1].regid = regid(1, 1);
406 vs.outputs_count = 2;
407 }
408
409 struct ir3_shader_variant fs = {
410 .type = MESA_SHADER_FRAGMENT,
411 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
412 .constlen = align(num_rts, 4),
413 .info.max_reg = MAX2(num_rts, 1) - 1,
414 .total_in = blit ? 2 : 0,
415 .num_samp = blit ? 1 : 0,
416 .inputs_count = blit ? 2 : 0,
417 .inputs[0] = {
418 .slot = VARYING_SLOT_VAR0,
419 .inloc = 0,
420 .compmask = 3,
421 .bary = true,
422 },
423 .inputs[1] = {
424 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
425 .regid = regid(0, 0),
426 .sysval = 1,
427 },
428 .num_sampler_prefetch = blit ? 1 : 0,
429 .sampler_prefetch[0] = {
430 .src = 0,
431 .wrmask = 0xf,
432 .cmd = 4,
433 },
434 .shader = &dummy_shader,
435 .const_state = &dummy_const_state,
436 };
437
438 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
439 .vs_state = true,
440 .hs_state = true,
441 .ds_state = true,
442 .gs_state = true,
443 .fs_state = true,
444 .cs_state = true,
445 .gfx_ibo = true,
446 .cs_ibo = true,
447 .gfx_shared_const = true,
448 .gfx_bindless = 0x1f,
449 .cs_bindless = 0x1f));
450
451 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
452 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
453 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
454 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
455 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
456 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
457
458 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
459 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
460
461 tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
462
463 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
464 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
465 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
466
467 tu6_emit_fs_inputs(cs, &fs);
468
469 tu_cs_emit_regs(cs,
470 A6XX_GRAS_CL_CNTL(
471 .persp_division_disable = 1,
472 .vp_xform_disable = 1,
473 .vp_clip_code_ignore = 1,
474 .clip_disable = 1));
475 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
476
477 tu_cs_emit_regs(cs,
478 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
479 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
480 tu_cs_emit_regs(cs,
481 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
482 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
483
484 tu_cs_emit_regs(cs,
485 A6XX_VFD_INDEX_OFFSET(),
486 A6XX_VFD_INSTANCE_START_OFFSET());
487 }
488
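/* Upload the blit rectangle as two vec4 VS constants, {dst.x, dst.y, src.x,
 * src.y} for each of the two opposite corners; the hardcoded VS selects
 * between them based on vertex id to produce the RECTLIST vertices.
 */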
489 static void
490 r3d_coords_raw(struct tu_cs *cs, const float *coords)
491 {
492 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
493 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
494 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
495 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
496 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
497 CP_LOAD_STATE6_0_NUM_UNIT(2));
498 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
499 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
500 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
501 }
502
503 static void
504 r3d_coords(struct tu_cs *cs,
505 const VkOffset2D *dst,
506 const VkOffset2D *src,
507 const VkExtent2D *extent)
508 {
509 int32_t src_x1 = src ? src->x : 0;
510 int32_t src_y1 = src ? src->y : 0;
511 r3d_coords_raw(cs, (float[]) {
512 dst->x, dst->y,
513 src_x1, src_y1,
514 dst->x + extent->width, dst->y + extent->height,
515 src_x1 + extent->width, src_y1 + extent->height,
516 });
517 }
518
519 static void
520 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
521 {
522 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
523 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
524 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
525 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
526 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
527 CP_LOAD_STATE6_0_NUM_UNIT(1));
528 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
529 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
530 switch (format) {
531 case VK_FORMAT_X8_D24_UNORM_PACK32:
532 case VK_FORMAT_D24_UNORM_S8_UINT: {
533 /* cleared as r8g8b8a8_unorm using special format */
534 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
535 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
536 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
537 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
538 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
539 } break;
540 case VK_FORMAT_D16_UNORM:
541 case VK_FORMAT_D32_SFLOAT:
542 tu_cs_emit(cs, fui(val->depthStencil.depth));
543 tu_cs_emit(cs, 0);
544 tu_cs_emit(cs, 0);
545 tu_cs_emit(cs, 0);
546 break;
547 case VK_FORMAT_S8_UINT:
548 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
549 tu_cs_emit(cs, 0);
550 tu_cs_emit(cs, 0);
551 tu_cs_emit(cs, 0);
552 break;
553 default:
554 /* as color formats use clear value as-is */
555 assert(!vk_format_is_depth_or_stencil(format));
556 tu_cs_emit_array(cs, val->color.uint32, 4);
557 break;
558 }
559 }
560
561 static void
562 r3d_src_common(struct tu_cmd_buffer *cmd,
563 struct tu_cs *cs,
564 const uint32_t *tex_const,
565 uint32_t offset_base,
566 uint32_t offset_ubwc,
567 VkFilter filter)
568 {
569 struct tu_cs_memory texture = { };
570 VkResult result = tu_cs_alloc(&cmd->sub_cs,
571 2, /* allocate space for a sampler too */
572 A6XX_TEX_CONST_DWORDS, &texture);
573 assert(result == VK_SUCCESS);
574
575 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
576
577 /* patch addresses for layer offset */
578 *(uint64_t*) (texture.map + 4) += offset_base;
579 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
580 texture.map[7] = ubwc_addr;
581 texture.map[8] = ubwc_addr >> 32;
582
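/* the sampler state lives in the dwords right after the texture descriptor
 * in the same allocation, hence the + A6XX_TEX_CONST_DWORDS * 4 offsets below
 */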
583 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
584 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
585 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
586 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
587 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
588 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
589 0x60000; /* XXX used by blob, doesn't seem necessary */
590 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
591 0x1 | /* XXX used by blob, doesn't seem necessary */
592 A6XX_TEX_SAMP_1_UNNORM_COORDS |
593 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
594 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
595 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
596
597 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
598 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
599 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
600 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
601 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
602 CP_LOAD_STATE6_0_NUM_UNIT(1));
603 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
604
605 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
606 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
607
608 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
609 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
610 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
611 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
612 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
613 CP_LOAD_STATE6_0_NUM_UNIT(1));
614 tu_cs_emit_qw(cs, texture.iova);
615
616 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
617 tu_cs_emit_qw(cs, texture.iova);
618
619 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
620 }
621
622 static void
623 r3d_src(struct tu_cmd_buffer *cmd,
624 struct tu_cs *cs,
625 const struct tu_image_view *iview,
626 uint32_t layer,
627 VkFilter filter)
628 {
629 r3d_src_common(cmd, cs, iview->descriptor,
630 iview->layer_size * layer,
631 iview->ubwc_layer_size * layer,
632 filter);
633 }
634
635 static void
636 r3d_src_buffer(struct tu_cmd_buffer *cmd,
637 struct tu_cs *cs,
638 VkFormat vk_format,
639 uint64_t va, uint32_t pitch,
640 uint32_t width, uint32_t height)
641 {
642 uint32_t desc[A6XX_TEX_CONST_DWORDS];
643
644 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
645
646 desc[0] =
647 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
648 A6XX_TEX_CONST_0_FMT(format.fmt) |
649 A6XX_TEX_CONST_0_SWAP(format.swap) |
650 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
651 // XXX to swizzle into .w for stencil buffer_to_image
652 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
653 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
654 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
655 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
656 desc[2] =
657 A6XX_TEX_CONST_2_PITCH(pitch) |
658 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
659 desc[3] = 0;
660 desc[4] = va;
661 desc[5] = va >> 32;
662 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
663 desc[i] = 0;
664
665 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
666 }
667
668 static void
669 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
670 {
671 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
672
673 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
674 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
675 tu_cs_image_ref(cs, iview, layer);
676 tu_cs_emit(cs, 0);
677
678 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
679 tu_cs_image_flag_ref(cs, iview, layer);
680
681 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
682 }
683
684 static void
685 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
686 {
687 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
688
689 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
690
691 tu_cs_emit_regs(cs,
692 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
693 A6XX_RB_MRT_PITCH(0, pitch),
694 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
695 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
696 A6XX_RB_MRT_BASE_HI(0, va >> 32),
697 A6XX_RB_MRT_BASE_GMEM(0, 0));
698
699 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
700 }
701
702 static uint8_t
703 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
704 {
705 uint8_t mask = 0xf;
706 assert(aspect_mask);
707 /* note: the only format with partial writing is D24S8;
708 * clear/blit uses the _AS_R8G8B8A8 format to access it
709 */
710 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
711 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
712 mask = 0x7;
713 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
714 mask = 0x8;
715 }
716 return mask;
717 }
718
719 static void
720 r3d_setup(struct tu_cmd_buffer *cmd,
721 struct tu_cs *cs,
722 VkFormat vk_format,
723 VkImageAspectFlags aspect_mask,
724 enum a6xx_rotation rotation,
725 bool clear,
726 bool ubwc)
727 {
728 enum a6xx_format format = tu6_base_format(vk_format);
729
730 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
731 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
732 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
733 }
734
735 if (!cmd->state.pass) {
736 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
737 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
738 }
739
740 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
741 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
742
743 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
744
745 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
746 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
747 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
748 0xfc000000);
749 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
750
751 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
752 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
753
754 tu_cs_emit_regs(cs,
755 A6XX_RB_FS_OUTPUT_CNTL0(),
756 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
757
758 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
759 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
760 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
761
762 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
763 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
764 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
765 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
766 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
767 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
768 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
769
770 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
771 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
772
773 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
774 .color_format = format,
775 .color_sint = vk_format_is_sint(vk_format),
776 .color_uint = vk_format_is_uint(vk_format)));
777
778 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
779 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
780 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
781 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
782 }
783
784 static void
785 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
786 {
787 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
788 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
789 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
790 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
791 tu_cs_emit(cs, 1); /* instance count */
792 tu_cs_emit(cs, 2); /* vertex count */
793 }
794
795 /* blit ops - common interface for 2d/shader paths */
796
797 struct blit_ops {
798 void (*coords)(struct tu_cs *cs,
799 const VkOffset2D *dst,
800 const VkOffset2D *src,
801 const VkExtent2D *extent);
802 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
803 void (*src)(
804 struct tu_cmd_buffer *cmd,
805 struct tu_cs *cs,
806 const struct tu_image_view *iview,
807 uint32_t layer,
808 VkFilter filter);
809 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
810 VkFormat vk_format,
811 uint64_t va, uint32_t pitch,
812 uint32_t width, uint32_t height);
813 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
814 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
815 void (*setup)(struct tu_cmd_buffer *cmd,
816 struct tu_cs *cs,
817 VkFormat vk_format,
818 VkImageAspectFlags aspect_mask,
819 enum a6xx_rotation rotation,
820 bool clear,
821 bool ubwc);
822 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
823 };
824
825 static const struct blit_ops r2d_ops = {
826 .coords = r2d_coords,
827 .clear_value = r2d_clear_value,
828 .src = r2d_src,
829 .src_buffer = r2d_src_buffer,
830 .dst = r2d_dst,
831 .dst_buffer = r2d_dst_buffer,
832 .setup = r2d_setup,
833 .run = r2d_run,
834 };
835
836 static const struct blit_ops r3d_ops = {
837 .coords = r3d_coords,
838 .clear_value = r3d_clear_value,
839 .src = r3d_src,
840 .src_buffer = r3d_src_buffer,
841 .dst = r3d_dst,
842 .dst_buffer = r3d_dst_buffer,
843 .setup = r3d_setup,
844 .run = r3d_run,
845 };
846
847 /* passthrough: set 2D coords from 3D offsets/extent */
848 static void
849 coords(const struct blit_ops *ops,
850 struct tu_cs *cs,
851 const VkOffset3D *dst,
852 const VkOffset3D *src,
853 const VkExtent3D *extent)
854 {
855 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
856 }
857
858 static VkFormat
859 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
860 {
861 if (vk_format_is_compressed(format)) {
862 switch (vk_format_get_blocksize(format)) {
863 case 1: return VK_FORMAT_R8_UINT;
864 case 2: return VK_FORMAT_R16_UINT;
865 case 4: return VK_FORMAT_R32_UINT;
866 case 8: return VK_FORMAT_R32G32_UINT;
867 case 16: return VK_FORMAT_R32G32B32A32_UINT;
868 default:
869 unreachable("unhandled format size");
870 }
871 }
872
873 switch (format) {
874 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
875 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
876 return VK_FORMAT_R8G8_UNORM;
877 /* fallthrough */
878 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
879 return VK_FORMAT_R8_UNORM;
880 case VK_FORMAT_D24_UNORM_S8_UINT:
881 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
882 return VK_FORMAT_R8_UNORM;
883 /* fallthrough */
884 default:
885 return format;
886 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
887 return VK_FORMAT_R32_UINT;
888 }
889 }
890
891 static void
892 tu_image_view_copy_blit(struct tu_image_view *iview,
893 struct tu_image *image,
894 VkFormat format,
895 const VkImageSubresourceLayers *subres,
896 uint32_t layer,
897 bool stencil_read)
898 {
899 VkImageAspectFlags aspect_mask = subres->aspectMask;
900
901 /* always use the AS_R8G8B8A8 format for these */
902 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
903 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
904 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
905 }
906
907 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
908 .image = tu_image_to_handle(image),
909 .viewType = VK_IMAGE_VIEW_TYPE_2D,
910 .format = format,
911 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
912 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
913 .subresourceRange = {
914 .aspectMask = aspect_mask,
915 .baseMipLevel = subres->mipLevel,
916 .levelCount = 1,
917 .baseArrayLayer = subres->baseArrayLayer + layer,
918 .layerCount = 1,
919 },
920 }, false);
921 }
922
923 static void
924 tu_image_view_copy(struct tu_image_view *iview,
925 struct tu_image *image,
926 VkFormat format,
927 const VkImageSubresourceLayers *subres,
928 uint32_t layer,
929 bool stencil_read)
930 {
931 format = copy_format(format, subres->aspectMask, false);
932 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
933 }
934
935 static void
936 tu_image_view_blit(struct tu_image_view *iview,
937 struct tu_image *image,
938 const VkImageSubresourceLayers *subres,
939 uint32_t layer)
940 {
941 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
942 }
943
944 static void
945 tu6_blit_image(struct tu_cmd_buffer *cmd,
946 struct tu_image *src_image,
947 struct tu_image *dst_image,
948 const VkImageBlit *info,
949 VkFilter filter)
950 {
951 const struct blit_ops *ops = &r2d_ops;
952 struct tu_cs *cs = &cmd->cs;
953 uint32_t layers;
954
955 /* the 2D blitter can't mirror from coordinates alone, so mirroring is expressed via the rotate field */
956 static const enum a6xx_rotation rotate[2][2] = {
957 {ROTATE_0, ROTATE_HFLIP},
958 {ROTATE_VFLIP, ROTATE_180},
959 };
960
961 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
962 (info->dstOffsets[1].x < info->dstOffsets[0].x);
963 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
964 (info->dstOffsets[1].y < info->dstOffsets[0].y);
965 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
966 (info->dstOffsets[1].z < info->dstOffsets[0].z);
967
968 if (mirror_z) {
969 tu_finishme("blit z mirror\n");
970 return;
971 }
972
973 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
974 info->dstOffsets[1].z - info->dstOffsets[0].z) {
975 tu_finishme("blit z filter\n");
976 return;
977 }
978
979 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
980 if (info->dstSubresource.layerCount > 1) {
981 assert(layers <= 1);
982 layers = info->dstSubresource.layerCount;
983 }
984
985 /* BC1_RGB_* formats need to have their last component overridden with 1
986 * when sampling, which is normally handled with the texture descriptor
987 * swizzle. The 2d path can't handle that, so use the 3d path.
988 *
989 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
990 * the 2d path.
991 */
992
993 if (dst_image->samples > 1 ||
994 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
995 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
996 filter == VK_FILTER_CUBIC_EXT)
997 ops = &r3d_ops;
998
999 /* TODO: the shader path fails some of the blit_image.all_formats.generate_mipmaps.* tests;
1000 * figure out why (it should be able to pass all tests with only the shader path)
1001 */
1002
1003 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
1004 rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
1005
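/* the shader path passes the (possibly swapped) corner coordinates straight
 * through, so mirroring falls out naturally; the 2D path normalizes to a
 * TL/BR rect and relies on the rotate/flip mode chosen above
 */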
1006 if (ops == &r3d_ops) {
1007 r3d_coords_raw(cs, (float[]) {
1008 info->dstOffsets[0].x, info->dstOffsets[0].y,
1009 info->srcOffsets[0].x, info->srcOffsets[0].y,
1010 info->dstOffsets[1].x, info->dstOffsets[1].y,
1011 info->srcOffsets[1].x, info->srcOffsets[1].y
1012 });
1013 } else {
1014 tu_cs_emit_regs(cs,
1015 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1016 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1017 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1018 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1019 tu_cs_emit_regs(cs,
1020 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1021 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1022 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1023 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1024 }
1025
1026 struct tu_image_view dst, src;
1027 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1028 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1029
1030 for (uint32_t i = 0; i < layers; i++) {
1031 ops->dst(cs, &dst, i);
1032 ops->src(cmd, cs, &src, i, filter);
1033 ops->run(cmd, cs);
1034 }
1035 }
1036
1037 void
1038 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1039 VkImage srcImage,
1040 VkImageLayout srcImageLayout,
1041 VkImage dstImage,
1042 VkImageLayout dstImageLayout,
1043 uint32_t regionCount,
1044 const VkImageBlit *pRegions,
1045 VkFilter filter)
1046
1047 {
1048 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1049 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1050 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1051
1052 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1053 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1054
1055 for (uint32_t i = 0; i < regionCount; ++i)
1056 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1057 }
1058
1059 static void
1060 copy_compressed(VkFormat format,
1061 VkOffset3D *offset,
1062 VkExtent3D *extent,
1063 uint32_t *width,
1064 uint32_t *height)
1065 {
1066 if (!vk_format_is_compressed(format))
1067 return;
1068
1069 uint32_t block_width = vk_format_get_blockwidth(format);
1070 uint32_t block_height = vk_format_get_blockheight(format);
1071
1072 offset->x /= block_width;
1073 offset->y /= block_height;
1074
1075 if (extent) {
1076 extent->width = DIV_ROUND_UP(extent->width, block_width);
1077 extent->height = DIV_ROUND_UP(extent->height, block_height);
1078 }
1079 if (width)
1080 *width = DIV_ROUND_UP(*width, block_width);
1081 if (height)
1082 *height = DIV_ROUND_UP(*height, block_height);
1083 }
1084
1085 static void
1086 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1087 struct tu_buffer *src_buffer,
1088 struct tu_image *dst_image,
1089 const VkBufferImageCopy *info)
1090 {
1091 struct tu_cs *cs = &cmd->cs;
1092 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1093 VkFormat src_format =
1094 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1095 const struct blit_ops *ops = &r2d_ops;
1096
1097 /* special case for buffer to stencil */
1098 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1099 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1100 ops = &r3d_ops;
1101 }
1102
1103 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1104 * which matters for UBWC. buffer_to_image/etc can fail because of this
1105 */
1106
1107 VkOffset3D offset = info->imageOffset;
1108 VkExtent3D extent = info->imageExtent;
1109 uint32_t src_width = info->bufferRowLength ?: extent.width;
1110 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1111
1112 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1113
1114 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1115 uint32_t layer_size = src_height * pitch;
1116
1117 ops->setup(cmd, cs,
1118 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1119 info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
1120
1121 struct tu_image_view dst;
1122 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1123
1124 for (uint32_t i = 0; i < layers; i++) {
1125 ops->dst(cs, &dst, i);
1126
1127 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
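/* the blitter seems to want a 64-byte aligned base address and pitch; if
 * either isn't aligned, copy one row at a time with the misalignment folded
 * into the x offset
 */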
1128 if ((src_va & 63) || (pitch & 63)) {
1129 for (uint32_t y = 0; y < extent.height; y++) {
1130 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1131 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1132 x + extent.width, 1);
1133 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1134 &(VkExtent2D) {extent.width, 1});
1135 ops->run(cmd, cs);
1136 src_va += pitch;
1137 }
1138 } else {
1139 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1140 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1141 ops->run(cmd, cs);
1142 }
1143 }
1144 }
1145
1146 void
1147 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1148 VkBuffer srcBuffer,
1149 VkImage dstImage,
1150 VkImageLayout dstImageLayout,
1151 uint32_t regionCount,
1152 const VkBufferImageCopy *pRegions)
1153 {
1154 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1155 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1156 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1157
1158 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1159 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1160
1161 for (unsigned i = 0; i < regionCount; ++i)
1162 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1163 }
1164
1165 static void
1166 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1167 struct tu_image *src_image,
1168 struct tu_buffer *dst_buffer,
1169 const VkBufferImageCopy *info)
1170 {
1171 struct tu_cs *cs = &cmd->cs;
1172 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1173 VkFormat dst_format =
1174 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1175 bool stencil_read = false;
1176
1177 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1178 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1179 stencil_read = true;
1180 }
1181
1182 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1183 VkOffset3D offset = info->imageOffset;
1184 VkExtent3D extent = info->imageExtent;
1185 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1186 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1187
1188 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1189
1190 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1191 uint32_t layer_size = pitch * dst_height;
1192
1193 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1194
1195 struct tu_image_view src;
1196 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1197
1198 for (uint32_t i = 0; i < layers; i++) {
1199 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1200
1201 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1202 if ((dst_va & 63) || (pitch & 63)) {
1203 for (uint32_t y = 0; y < extent.height; y++) {
1204 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1205 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1206 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1207 &(VkExtent2D) {extent.width, 1});
1208 ops->run(cmd, cs);
1209 dst_va += pitch;
1210 }
1211 } else {
1212 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1213 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1214 ops->run(cmd, cs);
1215 }
1216 }
1217 }
1218
1219 void
1220 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1221 VkImage srcImage,
1222 VkImageLayout srcImageLayout,
1223 VkBuffer dstBuffer,
1224 uint32_t regionCount,
1225 const VkBufferImageCopy *pRegions)
1226 {
1227 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1228 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1229 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1230
1231 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1232 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1233
1234 for (unsigned i = 0; i < regionCount; ++i)
1235 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1236 }
1237
1238 /* Tiled formats don't support swapping, which means that we can't support
1239 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1240 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1241 * Currently we fake support for tiled swapped formats and use the unswapped
1242 * format instead, but this means that reinterpreting copies to and from
1243 * swapped formats can't be performed correctly unless we can swizzle the
1244 * components by reinterpreting the other image as the "correct" swapped
1245 * format, i.e. only when the other image is linear.
1246 */
1247
1248 static bool
1249 is_swapped_format(VkFormat format)
1250 {
1251 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1252 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1253 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1254 }
1255
1256 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1257 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1258 * versa). This should mirror the logic in fdl6_layout.
1259 */
1260 static bool
1261 image_is_r8g8(struct tu_image *image)
1262 {
1263 return image->layout[0].cpp == 2 &&
1264 vk_format_get_nr_components(image->vk_format) == 2;
1265 }
1266
1267 static void
1268 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1269 struct tu_image *src_image,
1270 struct tu_image *dst_image,
1271 const VkImageCopy *info)
1272 {
1273 const struct blit_ops *ops = &r2d_ops;
1274 struct tu_cs *cs = &cmd->cs;
1275
1276 if (dst_image->samples > 1)
1277 ops = &r3d_ops;
1278
1279 VkFormat format = VK_FORMAT_UNDEFINED;
1280 VkOffset3D src_offset = info->srcOffset;
1281 VkOffset3D dst_offset = info->dstOffset;
1282 VkExtent3D extent = info->extent;
1283
1284 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1285 * Images":
1286 *
1287 * When copying between compressed and uncompressed formats the extent
1288 * members represent the texel dimensions of the source image and not
1289 * the destination. When copying from a compressed image to an
1290 * uncompressed image the image texel dimensions written to the
1291 * uncompressed image will be source extent divided by the compressed
1292 * texel block dimensions. When copying from an uncompressed image to a
1293 * compressed image the image texel dimensions written to the compressed
1294 * image will be the source extent multiplied by the compressed texel
1295 * block dimensions.
1296 *
1297 * This means we only have to adjust the extent if the source image is
1298 * compressed.
1299 */
1300 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1301 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1302
1303 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1304 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1305
1306 bool use_staging_blit = false;
1307
1308 if (src_format == dst_format) {
1309 /* Images that share a format can always be copied directly because it's
1310 * the same as a blit.
1311 */
1312 format = src_format;
1313 } else if (!src_image->layout[0].tile_mode) {
1314 /* If an image is linear, we can always safely reinterpret it with the
1315 * other image's format and then do a regular blit.
1316 */
1317 format = dst_format;
1318 } else if (!dst_image->layout[0].tile_mode) {
1319 format = src_format;
1320 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1321 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1322 * due to the different tile layout.
1323 */
1324 use_staging_blit = true;
1325 } else if (is_swapped_format(src_format) ||
1326 is_swapped_format(dst_format)) {
1327 /* If either format has a non-identity swap, then we can't copy
1328 * to/from it.
1329 */
1330 use_staging_blit = true;
1331 } else if (!src_image->layout[0].ubwc) {
1332 format = dst_format;
1333 } else if (!dst_image->layout[0].ubwc) {
1334 format = src_format;
1335 } else {
1336 /* Both formats use UBWC and so neither can be reinterpreted.
1337 * TODO: We could do an in-place decompression of the dst instead.
1338 */
1339 use_staging_blit = true;
1340 }
1341
1342 struct tu_image_view dst, src;
1343
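/* staging path: blit the source into a linear, non-UBWC scratch image using
 * src_format, then reinterpret the scratch image as dst_format and blit it
 * into the destination
 */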
1344 if (use_staging_blit) {
1345 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1346 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1347
1348 struct tu_image staging_image = {
1349 .vk_format = src_format,
1350 .type = src_image->type,
1351 .tiling = VK_IMAGE_TILING_LINEAR,
1352 .extent = extent,
1353 .level_count = 1,
1354 .layer_count = info->srcSubresource.layerCount,
1355 .samples = src_image->samples,
1356 .bo_offset = 0,
1357 };
1358
1359 VkImageSubresourceLayers staging_subresource = {
1360 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1361 .mipLevel = 0,
1362 .baseArrayLayer = 0,
1363 .layerCount = info->srcSubresource.layerCount,
1364 };
1365
1366 VkOffset3D staging_offset = { 0 };
1367
1368 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1369 staging_image.layout[0].ubwc = false;
1370
1371 fdl6_layout(&staging_image.layout[0],
1372 vk_format_to_pipe_format(staging_image.vk_format),
1373 staging_image.samples,
1374 staging_image.extent.width,
1375 staging_image.extent.height,
1376 staging_image.extent.depth,
1377 staging_image.level_count,
1378 staging_image.layer_count,
1379 staging_image.type == VK_IMAGE_TYPE_3D,
1380 NULL);
1381
1382 VkResult result = tu_get_scratch_bo(cmd->device,
1383 staging_image.layout[0].size,
1384 &staging_image.bo);
1385 if (result != VK_SUCCESS) {
1386 cmd->record_result = result;
1387 return;
1388 }
1389
1390 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1391 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1392
1393 struct tu_image_view staging;
1394 tu_image_view_copy(&staging, &staging_image, src_format,
1395 &staging_subresource, 0, false);
1396
1397 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1398 coords(ops, cs, &staging_offset, &src_offset, &extent);
1399
1400 for (uint32_t i = 0; i < info->extent.depth; i++) {
1401 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1402 ops->dst(cs, &staging, i);
1403 ops->run(cmd, cs);
1404 }
1405
1406 /* When executed by the user there has to be a pipeline barrier here,
1407 * but since we're doing it manually we'll have to flush ourselves.
1408 */
1409 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1410 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1411
1412 tu_image_view_copy(&staging, &staging_image, dst_format,
1413 &staging_subresource, 0, false);
1414
1415 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1416 ROTATE_0, false, dst_image->layout[0].ubwc);
1417 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1418
1419 for (uint32_t i = 0; i < info->extent.depth; i++) {
1420 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1421 ops->dst(cs, &dst, i);
1422 ops->run(cmd, cs);
1423 }
1424 } else {
1425 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1426 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1427
1428 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1429 ROTATE_0, false, dst_image->layout[0].ubwc);
1430 coords(ops, cs, &dst_offset, &src_offset, &extent);
1431
1432 for (uint32_t i = 0; i < info->extent.depth; i++) {
1433 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1434 ops->dst(cs, &dst, i);
1435 ops->run(cmd, cs);
1436 }
1437 }
1438 }
1439
1440 void
1441 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1442 VkImage srcImage,
1443 VkImageLayout srcImageLayout,
1444 VkImage destImage,
1445 VkImageLayout destImageLayout,
1446 uint32_t regionCount,
1447 const VkImageCopy *pRegions)
1448 {
1449 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1450 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1451 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1452
1453 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1454 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1455
1456 for (uint32_t i = 0; i < regionCount; ++i)
1457 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1458 }
1459
1460 static void
1461 copy_buffer(struct tu_cmd_buffer *cmd,
1462 uint64_t dst_va,
1463 uint64_t src_va,
1464 uint64_t size,
1465 uint32_t block_size)
1466 {
1467 const struct blit_ops *ops = &r2d_ops;
1468 struct tu_cs *cs = &cmd->cs;
1469 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1470 uint64_t blocks = size / block_size;
1471
1472 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1473
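/* the blit base addresses are aligned down to 64 bytes with the remainder
 * folded into the x coordinate; 0x4000 appears to be the maximum blit width,
 * hence copying in chunks
 */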
1474 while (blocks) {
1475 uint32_t src_x = (src_va & 63) / block_size;
1476 uint32_t dst_x = (dst_va & 63) / block_size;
1477 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1478
1479 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1480 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1481 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1482 ops->run(cmd, cs);
1483
1484 src_va += width * block_size;
1485 dst_va += width * block_size;
1486 blocks -= width;
1487 }
1488 }
1489
1490 void
1491 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1492 VkBuffer srcBuffer,
1493 VkBuffer dstBuffer,
1494 uint32_t regionCount,
1495 const VkBufferCopy *pRegions)
1496 {
1497 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1498 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1499 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1500
1501 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1502 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1503
1504 for (unsigned i = 0; i < regionCount; ++i) {
1505 copy_buffer(cmd,
1506 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1507 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1508 pRegions[i].size, 1);
1509 }
1510 }
1511
1512 void
1513 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1514 VkBuffer dstBuffer,
1515 VkDeviceSize dstOffset,
1516 VkDeviceSize dataSize,
1517 const void *pData)
1518 {
1519 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1520 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1521
1522 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1523
1524 struct tu_cs_memory tmp;
1525 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1526 if (result != VK_SUCCESS) {
1527 cmd->record_result = result;
1528 return;
1529 }
1530
1531 memcpy(tmp.map, pData, dataSize);
1532 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1533 }
1534
1535 void
1536 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1537 VkBuffer dstBuffer,
1538 VkDeviceSize dstOffset,
1539 VkDeviceSize fillSize,
1540 uint32_t data)
1541 {
1542 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1543 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1544 const struct blit_ops *ops = &r2d_ops;
1545 struct tu_cs *cs = &cmd->cs;
1546
1547 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1548
1549 if (fillSize == VK_WHOLE_SIZE)
1550 fillSize = buffer->size - dstOffset;
1551
1552 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1553 uint32_t blocks = fillSize / 4;
1554
1555 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
1556 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1557
1558 while (blocks) {
1559 uint32_t dst_x = (dst_va & 63) / 4;
1560 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1561
1562 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1563 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1564 ops->run(cmd, cs);
1565
1566 dst_va += width * 4;
1567 blocks -= width;
1568 }
1569 }
1570
1571 void
1572 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1573 VkImage srcImage,
1574 VkImageLayout srcImageLayout,
1575 VkImage dstImage,
1576 VkImageLayout dstImageLayout,
1577 uint32_t regionCount,
1578 const VkImageResolve *pRegions)
1579 {
1580 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1581 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1582 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1583 const struct blit_ops *ops = &r2d_ops;
1584 struct tu_cs *cs = &cmd->cs;
1585
1586 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1587 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1588
1589 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1590 ROTATE_0, false, dst_image->layout[0].ubwc);
1591
1592 for (uint32_t i = 0; i < regionCount; ++i) {
1593 const VkImageResolve *info = &pRegions[i];
1594 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1595
1596 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1597 /* TODO: aspect masks possible ? */
1598
1599 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1600
1601 struct tu_image_view dst, src;
1602 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1603 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1604
1605 for (uint32_t i = 0; i < layers; i++) {
1606 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1607 ops->dst(cs, &dst, i);
1608 ops->run(cmd, cs);
1609 }
1610 }
1611 }
1612
1613 void
1614 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1615 struct tu_cs *cs,
1616 struct tu_image_view *src,
1617 struct tu_image_view *dst,
1618 uint32_t layers,
1619 const VkRect2D *rect)
1620 {
1621 const struct blit_ops *ops = &r2d_ops;
1622
1623 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1624 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1625
1626 assert(src->image->vk_format == dst->image->vk_format);
1627
1628 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1629 ROTATE_0, false, dst->ubwc_enabled);
1630 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1631
1632 for (uint32_t i = 0; i < layers; i++) {
1633 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1634 ops->dst(cs, dst, i);
1635 ops->run(cmd, cs);
1636 }
1637 }
1638
1639 static void
1640 clear_image(struct tu_cmd_buffer *cmd,
1641 struct tu_image *image,
1642 const VkClearValue *clear_value,
1643 const VkImageSubresourceRange *range)
1644 {
1645 uint32_t level_count = tu_get_levelCount(image, range);
1646 uint32_t layer_count = tu_get_layerCount(image, range);
1647 struct tu_cs *cs = &cmd->cs;
1648 VkFormat format = image->vk_format;
1649 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1650 format = VK_FORMAT_R32_UINT;
1651
1652 if (image->type == VK_IMAGE_TYPE_3D) {
1653 assert(layer_count == 1);
1654 assert(range->baseArrayLayer == 0);
1655 }
1656
1657 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1658
1659 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true, image->layout[0].ubwc);
1660 ops->clear_value(cs, image->vk_format, clear_value);
1661
1662 for (unsigned j = 0; j < level_count; j++) {
1663 if (image->type == VK_IMAGE_TYPE_3D)
1664 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1665
1666 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1667 u_minify(image->extent.width, range->baseMipLevel + j),
1668 u_minify(image->extent.height, range->baseMipLevel + j)
1669 });
1670
1671 struct tu_image_view dst;
1672 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1673 .aspectMask = range->aspectMask,
1674 .mipLevel = range->baseMipLevel + j,
1675 .baseArrayLayer = range->baseArrayLayer,
1676 .layerCount = 1,
1677 }, 0, false);
1678
1679 for (uint32_t i = 0; i < layer_count; i++) {
1680 ops->dst(cs, &dst, i);
1681 ops->run(cmd, cs);
1682 }
1683 }
1684 }
1685
1686 void
1687 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1688 VkImage image_h,
1689 VkImageLayout imageLayout,
1690 const VkClearColorValue *pColor,
1691 uint32_t rangeCount,
1692 const VkImageSubresourceRange *pRanges)
1693 {
1694 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1695 TU_FROM_HANDLE(tu_image, image, image_h);
1696
1697 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1698
1699 for (unsigned i = 0; i < rangeCount; i++)
1700 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1701 }
1702
1703 void
1704 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1705 VkImage image_h,
1706 VkImageLayout imageLayout,
1707 const VkClearDepthStencilValue *pDepthStencil,
1708 uint32_t rangeCount,
1709 const VkImageSubresourceRange *pRanges)
1710 {
1711 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1712 TU_FROM_HANDLE(tu_image, image, image_h);
1713
1714 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1715
1716 for (unsigned i = 0; i < rangeCount; i++)
1717 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1718 }
1719
1720 static void
1721 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1722 uint32_t attachment_count,
1723 const VkClearAttachment *attachments,
1724 uint32_t rect_count,
1725 const VkClearRect *rects)
1726 {
1727 const struct tu_subpass *subpass = cmd->state.subpass;
1728 /* note: cannot use the shader path here; there is a special shader path
1729 * in tu_clear_sysmem_attachments()
1730 */
1731 const struct blit_ops *ops = &r2d_ops;
1732 struct tu_cs *cs = &cmd->draw_cs;
1733
1734 for (uint32_t j = 0; j < attachment_count; j++) {
1735 /* The Vulkan spec, section 17.2 "Clearing Images Inside a Render
1736 * Pass Instance" says that:
1737 *
1738 * Unlike other clear commands, vkCmdClearAttachments executes as
1739 * a drawing command, rather than a transfer command, with writes
1740 * performed by it executing in rasterization order. Clears to
1741 * color attachments are executed as color attachment writes, by
1742 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1743 * Clears to depth/stencil attachments are executed as depth
1744 * writes and writes by the
1745 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1746 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1747 *
1748 * However, the 2d path here is executed the same way as a
1749 * transfer command, using the CCU color cache exclusively with
1750 * a special depth-as-color format for depth clears. This means that
1751 * we can't rely on the normal pipeline barrier mechanism here, and
1752 * have to manually flush whenever using a different cache domain
1753 * from what the 3d path would've used. This happens when we clear
1754 * depth/stencil, since normally depth attachments use CCU depth, but
1755 * we clear it using a special depth-as-color format. Since the clear
1756 * potentially uses a different attachment state we also need to
1757 * invalidate color beforehand and flush it afterwards.
1758 */
1759
1760 uint32_t a;
1761 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1762 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1763 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1764 } else {
1765 a = subpass->depth_stencil_attachment.attachment;
1766 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1767 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1768 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1769 }
1770
1771 if (a == VK_ATTACHMENT_UNUSED)
1772 continue;
1773
1774 const struct tu_image_view *iview =
1775 cmd->state.framebuffer->attachments[a].attachment;
1776
1777 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask,
1778 ROTATE_0, true, iview->ubwc_enabled);
1779 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1780
1781 /* Wait for the flushes we triggered manually to complete */
1782 tu_cs_emit_wfi(cs);
1783
1784 for (uint32_t i = 0; i < rect_count; i++) {
1785 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1786 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1787 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1788 ops->run(cmd, cs);
1789 }
1790 }
1791
1792 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1793 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1794 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1795 } else {
1796 /* sync color into depth */
1797 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1798 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1799 }
1800 }
1801 }
1802
1803 static void
1804 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1805 uint32_t attachment_count,
1806 const VkClearAttachment *attachments,
1807 uint32_t rect_count,
1808 const VkClearRect *rects)
1809 {
1810 /* the shader path here is special: it avoids changing MRT/etc state */
1811 const struct tu_render_pass *pass = cmd->state.pass;
1812 const struct tu_subpass *subpass = cmd->state.subpass;
1813 const uint32_t mrt_count = subpass->color_count;
1814 struct tu_cs *cs = &cmd->draw_cs;
1815 uint32_t clear_value[MAX_RTS][4];
1816 float z_clear_val = 0.0f;
1817 uint8_t s_clear_val = 0;
1818 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1819 bool z_clear = false;
1820 bool s_clear = false;
1821 bool layered_clear = false;
1822 uint32_t max_samples = 1;
1823
1824 for (uint32_t i = 0; i < attachment_count; i++) {
1825 uint32_t a;
1826 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1827 uint32_t c = attachments[i].colorAttachment;
1828 a = subpass->color_attachments[c].attachment;
1829 if (a == VK_ATTACHMENT_UNUSED)
1830 continue;
1831
1832 clear_rts |= 1 << c;
1833 clear_components |= 0xf << (c * 4);
1834 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1835 } else {
1836 a = subpass->depth_stencil_attachment.attachment;
1837 if (a == VK_ATTACHMENT_UNUSED)
1838 continue;
1839
1840 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1841 z_clear = true;
1842 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1843 }
1844
1845 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1846 s_clear = true;
1847 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1848 }
1849 }
1850
1851 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1852 }
1853
1854 /* prefer to use the 2D path for clears
1855 * 2D can't clear separate depth/stencil or msaa, and it needs a known framebuffer
1856 */
1857 if (max_samples == 1 && cmd->state.framebuffer) {
1858 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1859 return;
1860 }
1861
1862 /* This clear path behaves like a draw and needs the same flushes as tu_draw */
1863 tu_emit_cache_flush_renderpass(cmd, cs);
1864
1865 /* disable all draw states so they don't interfere
1866 * TODO: use and re-use draw states for this path
1867 * we have to disable draw states individually to preserve
1868 * input attachment states, because a secondary command buffer
1869 * won't be able to restore them
1870 */
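   /* CP_SET_DRAW_STATE takes 3 dwords per group; the two input-attachment
    * groups are skipped in the loop below, hence TU_DRAW_STATE_COUNT - 2.
    */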
1871 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1872 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1873 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1874 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1875 continue;
1876 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1877 CP_SET_DRAW_STATE__0_DISABLE);
1878 tu_cs_emit_qw(cs, 0);
1879 }
1880 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1881
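   /* regid 0xfc means "unused register": the clear shader exports neither a
    * depth output nor a sample mask (assumption based on common a6xx usage).
    */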
1882 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1883 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1884 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1885 0xfc000000);
1886 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1887
1888 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1889 for (uint32_t i = 0; i < mrt_count; i++) {
1890 if (clear_rts & (1 << i))
1891 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1892 else
1893 tu_cs_emit(cs, 0);
1894 }
1895
1896 for (uint32_t i = 0; i < rect_count; i++) {
1897 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1898 layered_clear = true;
1899 }
1900
1901 r3d_common(cmd, cs, false, num_rts, layered_clear);
1902
1903 tu_cs_emit_regs(cs,
1904 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1905 tu_cs_emit_regs(cs,
1906 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1907
1908 tu_cs_emit_regs(cs,
1909 A6XX_RB_FS_OUTPUT_CNTL0(),
1910 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1911
1912 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1913 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1914 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1915 for (uint32_t i = 0; i < mrt_count; i++) {
1916 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1917 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1918 }
1919
1920 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1921 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1922 .z_enable = z_clear,
1923 .z_write_enable = z_clear,
1924 .zfunc = FUNC_ALWAYS));
1925 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1926 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1927 .stencil_enable = s_clear,
1928 .func = FUNC_ALWAYS,
1929 .zpass = STENCIL_REPLACE));
1930 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1931 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1932 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1933
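   /* Upload the clear colors as FS constants, one vec4 per cleared RT,
    * matching the FS output registers assigned above.
    */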
1934 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1935 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1936 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1937 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1938 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1939 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1940 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1941 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1942 for_each_bit(b, clear_rts)
1943 tu_cs_emit_array(cs, clear_value[b], 4);
1944
1945 for (uint32_t i = 0; i < rect_count; i++) {
1946 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
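         /* raw coords are two (x, y, z, w) corners; the layer index is passed
          * as raw integer bits (uif) in the first w component, presumably
          * consumed by the layered-clear shader set up by r3d_common().
          */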
1947 r3d_coords_raw(cs, (float[]) {
1948 rects[i].rect.offset.x, rects[i].rect.offset.y,
1949 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1950 rects[i].rect.offset.x + rects[i].rect.extent.width,
1951 rects[i].rect.offset.y + rects[i].rect.extent.height,
1952 z_clear_val, 1.0f,
1953 });
1954 r3d_run(cmd, cs);
1955 }
1956 }
1957 }
1958
1959 static void
1960 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1961 {
1962 enum pipe_format pformat = vk_format_to_pipe_format(format);
1963
1964 switch (format) {
1965 case VK_FORMAT_X8_D24_UNORM_PACK32:
1966 case VK_FORMAT_D24_UNORM_S8_UINT:
1967 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1968 val->depthStencil.stencil << 24;
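      /* e.g. depth = 1.0, stencil = 0xff packs to
       * 0x00ffffff | (0xff << 24) == 0xffffffff
       */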
1969 return;
1970 case VK_FORMAT_D16_UNORM:
1971 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1972 return;
1973 case VK_FORMAT_D32_SFLOAT:
1974 clear_value[0] = fui(val->depthStencil.depth);
1975 return;
1976 case VK_FORMAT_S8_UINT:
1977 clear_value[0] = val->depthStencil.stencil;
1978 return;
1979 /* these formats use a different base format when tiled;
1980 * the same packed format can be used for both because GMEM is always in WZYX order
1981 */
1982 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1983 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1984 pformat = PIPE_FORMAT_B5G5R5A1_UNORM; /* fallthrough */
1985 default:
1986 break;
1987 }
1988
1989 VkClearColorValue color;
1990
1991 /*
1992 * GMEM is tiled and wants the components in WZYX order;
1993 * apply the format's swizzle to the color before packing, to counteract
1994 * the deswizzling applied by the packing functions
1995 */
1996 pipe_swizzle_4f(color.float32, val->color.float32,
1997 util_format_description(pformat)->swizzle);
1998
1999 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2000 }
2001
2002 static void
2003 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2004 struct tu_cs *cs,
2005 uint32_t attachment,
2006 VkImageAspectFlags mask,
2007 const VkClearValue *value)
2008 {
2009 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2010
2011
2012 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2013 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2014
2015 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
2016 .clear_mask = aspect_write_mask(vk_format, mask)));
2017
2018 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2019 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2020
2021 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2022 tu_cs_emit(cs, 0);
2023
2024 uint32_t clear_vals[4] = {};
2025 pack_gmem_clear_value(value, vk_format, clear_vals);
2026
2027 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2028 tu_cs_emit_array(cs, clear_vals, 4);
2029
2030 tu6_emit_event_write(cmd, cs, BLIT);
2031 }
2032
2033 static void
2034 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2035 uint32_t attachment_count,
2036 const VkClearAttachment *attachments,
2037 uint32_t rect_count,
2038 const VkClearRect *rects)
2039 {
2040 const struct tu_subpass *subpass = cmd->state.subpass;
2041 struct tu_cs *cs = &cmd->draw_cs;
2042
2043 /* TODO: swap the loops for smaller cmdstream */
2044 for (unsigned i = 0; i < rect_count; i++) {
2045 unsigned x1 = rects[i].rect.offset.x;
2046 unsigned y1 = rects[i].rect.offset.y;
2047 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2048 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2049
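      /* the blit scissor BR coordinate is inclusive, hence the -1 above */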
2050 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2051 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2052 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2053
2054 for (unsigned j = 0; j < attachment_count; j++) {
2055 uint32_t a;
2056 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2057 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2058 else
2059 a = subpass->depth_stencil_attachment.attachment;
2060
2061 if (a == VK_ATTACHMENT_UNUSED)
2062 continue;
2063
2064 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2065 &attachments[j].clearValue);
2066 }
2067 }
2068 }
2069
2070 void
2071 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2072 uint32_t attachmentCount,
2073 const VkClearAttachment *pAttachments,
2074 uint32_t rectCount,
2075 const VkClearRect *pRects)
2076 {
2077 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2078 struct tu_cs *cs = &cmd->draw_cs;
2079
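   /* Whether this render pass will execute in GMEM or sysmem mode isn't known
    * at record time, so emit both clear variants under conditional execution;
    * the CP skips whichever block doesn't match the chosen render mode.
    */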
2080 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2081 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2082 tu_cond_exec_end(cs);
2083
2084 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2085 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2086 tu_cond_exec_end(cs);
2087 }
2088
2089 void
2090 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2091 struct tu_cs *cs,
2092 uint32_t a,
2093 const VkRenderPassBeginInfo *info)
2094 {
2095 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2096 const struct tu_image_view *iview = fb->attachments[a].attachment;
2097 const struct tu_render_pass_attachment *attachment =
2098 &cmd->state.pass->attachments[a];
2099
2100 if (!attachment->clear_mask)
2101 return;
2102
2103 const struct blit_ops *ops = &r2d_ops;
2104 if (attachment->samples > 1)
2105 ops = &r3d_ops;
2106
2107 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0,
2108 true, iview->ubwc_enabled);
2109 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2110 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2111
2112 /* Wait for any flushes at the beginning of the renderpass to complete */
2113 tu_cs_emit_wfi(cs);
2114
2115 for (uint32_t i = 0; i < fb->layers; i++) {
2116 ops->dst(cs, iview, i);
2117 ops->run(cmd, cs);
2118 }
2119
2120 /* The spec doesn't explicitly say, but presumably the initial renderpass
2121 * clear is considered part of the renderpass, and therefore barriers
2122 * aren't required inside the subpass/renderpass. Therefore we need to
2123 * flush CCU color into CCU depth here, just like with
2124 * vkCmdClearAttachments(). Note that because this only happens at the
2125 * beginning of a renderpass, and renderpass writes are considered
2126 * "incoherent", we shouldn't have to worry about syncing depth into color
2127 * beforehand as depth should already be flushed.
2128 */
2129 if (vk_format_is_depth_or_stencil(attachment->format)) {
2130 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2131 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2132 } else {
2133 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2134 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2135 }
2136 }
2137
2138 void
2139 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2140 struct tu_cs *cs,
2141 uint32_t a,
2142 const VkRenderPassBeginInfo *info)
2143 {
2144 const struct tu_render_pass_attachment *attachment =
2145 &cmd->state.pass->attachments[a];
2146
2147 if (!attachment->clear_mask)
2148 return;
2149
2150 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2151
2152 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2153 &info->pClearValues[a]);
2154 }
2155
2156 static void
2157 tu_emit_blit(struct tu_cmd_buffer *cmd,
2158 struct tu_cs *cs,
2159 const struct tu_image_view *iview,
2160 const struct tu_render_pass_attachment *attachment,
2161 bool resolve)
2162 {
2163 tu_cs_emit_regs(cs,
2164 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2165
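   /* with .gmem set the blit event loads the attachment from sysmem into
    * GMEM; with it cleared it stores/resolves GMEM back out to the sysmem
    * image (assumption based on how the load/store callers use this helper)
    */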
2166 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2167 .unk0 = !resolve,
2168 .gmem = !resolve,
2169 /* "integer" bit disables msaa resolve averaging */
2170 .integer = vk_format_is_int(attachment->format)));
2171
2172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2173 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2174 tu_cs_image_ref_2d(cs, iview, 0, false);
2175
2176 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2177 tu_cs_image_flag_ref(cs, iview, 0);
2178
2179 tu_cs_emit_regs(cs,
2180 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2181
2182 tu6_emit_event_write(cmd, cs, BLIT);
2183 }
2184
2185 static bool
2186 blit_can_resolve(VkFormat format)
2187 {
2188 const struct util_format_description *desc = vk_format_description(format);
2189
2190 /* blit event can only do resolve for simple cases:
2191 * averaging samples as unsigned integers or choosing only one sample
2192 */
2193 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2194 return false;
2195
2196 /* can't do formats with channel sizes larger than 10 bits
2197 * note: this includes all float formats
2198 * note2: single channel integer formats seem OK
2199 */
2200 if (desc->channel[0].size > 10)
2201 return false;
2202
2203 switch (format) {
2204 /* for unknown reasons the blit event can't MSAA-resolve these formats when tiled,
2205 * likely because these formats have a different layout from other cpp=2 formats
2206 */
2207 case VK_FORMAT_R8G8_UNORM:
2208 case VK_FORMAT_R8G8_UINT:
2209 case VK_FORMAT_R8G8_SINT:
2210 /* TODO: this one should be able to work? */
2211 case VK_FORMAT_D24_UNORM_S8_UINT:
2212 return false;
2213 default:
2214 break;
2215 }
2216
2217 return true;
2218 }
2219
2220 void
2221 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2222 struct tu_cs *cs,
2223 uint32_t a,
2224 bool force_load)
2225 {
2226 const struct tu_image_view *iview =
2227 cmd->state.framebuffer->attachments[a].attachment;
2228 const struct tu_render_pass_attachment *attachment =
2229 &cmd->state.pass->attachments[a];
2230
2231 if (attachment->load || force_load)
2232 tu_emit_blit(cmd, cs, iview, attachment, false);
2233 }
2234
2235 void
2236 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2237 struct tu_cs *cs,
2238 uint32_t a,
2239 uint32_t gmem_a)
2240 {
2241 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2242 const VkRect2D *render_area = &cmd->state.render_area;
2243 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2244 struct tu_image_view *iview = fb->attachments[a].attachment;
2245 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2246
2247 if (!dst->store)
2248 return;
2249
2250 uint32_t x1 = render_area->offset.x;
2251 uint32_t y1 = render_area->offset.y;
2252 uint32_t x2 = x1 + render_area->extent.width;
2253 uint32_t y2 = y1 + render_area->extent.height;
2254 /* x2/y2 can be unaligned if they are equal to the size of the image,
2255 * since the store will then only write into padding space;
2256 * the one exception is linear levels, which don't have the
2257 * required y padding in the layout (except for the last level)
2258 */
2259 bool need_y2_align =
2260 y2 != iview->extent.height || iview->need_y2_align;
2261
2262 bool unaligned =
2263 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2264 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2265
2266 /* use fast path when render area is aligned, except for unsupported resolve cases */
2267 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2268 tu_emit_blit(cmd, cs, iview, src, true);
2269 return;
2270 }
2271
2272 if (dst->samples > 1) {
2273 /* we would likely need to use the shader path in this case;
2274 * a testcase which actually hits this is still needed
2275 */
2276 tu_finishme("unaligned store of msaa attachment\n");
2277 return;
2278 }
2279
2280 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT,
2281 ROTATE_0, false, iview->ubwc_enabled, true);
2282 r2d_dst(cs, iview, 0);
2283 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2284
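   /* The 2D blit sources directly from GMEM: physical GMEM base plus the
    * attachment's GMEM offset, with the tile width as the row pitch and the
    * tiled texture format programmed below.
    */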
2285 tu_cs_emit_regs(cs,
2286 A6XX_SP_PS_2D_SRC_INFO(
2287 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2288 .tile_mode = TILE6_2,
2289 .srgb = vk_format_is_srgb(src->format),
2290 .samples = tu_msaa_samples(src->samples),
2291 .samples_average = !vk_format_is_int(src->format),
2292 .unk20 = 1,
2293 .unk22 = 1),
2294 /* note: src size does not matter when not scaling */
2295 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2296 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2297 A6XX_SP_PS_2D_SRC_HI(),
2298 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2299
2300 /* sync GMEM writes with CACHE. */
2301 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2302
2303 /* Wait for CACHE_INVALIDATE to land */
2304 tu_cs_emit_wfi(cs);
2305
2306 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2307 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2308
2309 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2310 * sysmem, and we generally assume that GMEM renderpasses leave their
2311 * results in sysmem, so we need to flush manually here.
2312 */
2313 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2314 }