/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/u_half.h"

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

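/* Map a hardware format to the "ifmt" classification used by the 2D engine,
 * which selects both the RB_2D_BLIT_CNTL IFMT field and how clear values are
 * packed in r2d_clear_value() below. Formats not listed are not expected to
 * reach the 2D path.
 */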
static enum a6xx_2d_ifmt
format_to_ifmt(enum a6xx_format fmt)
{
   switch (fmt) {
   case FMT6_A8_UNORM:
   case FMT6_8_UNORM:
   case FMT6_8_SNORM:
   case FMT6_8_8_UNORM:
   case FMT6_8_8_SNORM:
   case FMT6_8_8_8_8_UNORM:
   case FMT6_8_8_8_X8_UNORM:
   case FMT6_8_8_8_8_SNORM:
   case FMT6_4_4_4_4_UNORM:
   case FMT6_5_5_5_1_UNORM:
   case FMT6_5_6_5_UNORM:
   case FMT6_Z24_UNORM_S8_UINT:
   case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
      return R2D_UNORM8;

   case FMT6_32_UINT:
   case FMT6_32_SINT:
   case FMT6_32_32_UINT:
   case FMT6_32_32_SINT:
   case FMT6_32_32_32_32_UINT:
   case FMT6_32_32_32_32_SINT:
      return R2D_INT32;

   case FMT6_16_UINT:
   case FMT6_16_SINT:
   case FMT6_16_16_UINT:
   case FMT6_16_16_SINT:
   case FMT6_16_16_16_16_UINT:
   case FMT6_16_16_16_16_SINT:
   case FMT6_10_10_10_2_UINT:
      return R2D_INT16;

   case FMT6_8_UINT:
   case FMT6_8_SINT:
   case FMT6_8_8_UINT:
   case FMT6_8_8_SINT:
   case FMT6_8_8_8_8_UINT:
   case FMT6_8_8_8_8_SINT:
      return R2D_INT8;

   case FMT6_16_UNORM:
   case FMT6_16_SNORM:
   case FMT6_16_16_UNORM:
   case FMT6_16_16_SNORM:
   case FMT6_16_16_16_16_UNORM:
   case FMT6_16_16_16_16_SNORM:
   case FMT6_32_FLOAT:
   case FMT6_32_32_FLOAT:
   case FMT6_32_32_32_32_FLOAT:
      return R2D_FLOAT32;

   case FMT6_16_FLOAT:
   case FMT6_16_16_FLOAT:
   case FMT6_16_16_16_16_FLOAT:
   case FMT6_11_11_10_FLOAT:
   case FMT6_10_10_10_2_UNORM:
   case FMT6_10_10_10_2_UNORM_DEST:
      return R2D_FLOAT16;

   default:
      unreachable("bad format");
      return 0;
   }
}

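/* note: the 2D engine takes inclusive bottom-right coordinates, hence the
 * "- 1" on the BR registers (a 1x1 blit has TL == BR)
 */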
static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
                   A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1,
                                       .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}

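/* Pack a VkClearValue into the RB_2D_SRC_SOLID_C0..C3 registers, following
 * the ifmt of the format being cleared: UNORM8 is packed per-byte (with the
 * sRGB encoding applied on the CPU, presumably because the solid-color path
 * skips the conversion applied to real sources), FLOAT16 as half floats, and
 * the 32-bit int/float ifmts pass the raw value through. D24S8 is special:
 * it is cleared through its R8G8B8A8 alias.
 */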
static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = util_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
                   A6XX_SP_PS_2D_SRC_HI(va >> 32),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   assert(iview->image->samples == 1);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, iview->RB_2D_DST_INFO);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   assert(iview->image->samples == 1);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, iview->stencil_PITCH);
}

static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format)),
                   A6XX_RB_2D_DST_LO((uint32_t) va),
                   A6XX_RB_2D_DST_HI(va >> 32),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   enum a6xx_format format = tu6_base_format(vk_format);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
   uint32_t unknown_8c01 = 0;

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* note: the only format with partial clearing is D24S8 */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
      .scissor = scissor,
      .rotate = rotation,
      .solid_color = clear,
      .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
      .color_format = format,
      .mask = 0xf,
      .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
   ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (format == FMT6_10_10_10_2_UNORM_DEST)
      format = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
      .sint = vk_format_is_sint(vk_format),
      .uint = vk_format_is_uint(vk_format),
      .color_format = format,
      .srgb = vk_format_is_srgb(vk_format),
      .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear,
          bool ubwc)
{
   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

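/* The shader path doesn't use the compiler: the few fixed instruction
 * sequences it needs are hand-encoded ir3 instructions written into the
 * global BO at device init. The VS selects between two vec4 consts (c0/c1)
 * based on the vertex ID in r0.w to expand a RECTLIST, and the clear FS
 * variants just move the per-RT clear colors from consts to the output
 * registers.
 */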
void
tu_init_clear_blit_shaders(struct tu6_global *global)
{
#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }

   static const instr_t vs_code[] = {
      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
       * r1.xy = r0.w ? c1.zw : c0.zw
       * r0.w = 1.0f
       */
      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
           .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
           .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
      { .cat0 = { .opc = OPC_END } },
   };

   static const instr_t fs_blit[] = {
      /* "bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
       * blit path (it's not clear what allows it to not have it)
       */
      CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
      { .cat0 = { .opc = OPC_END } },
   };

   memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
   memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
      for (uint32_t i = 0; i < num_rts; i++) {
         /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
         *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
      }
      *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
   }
}

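/* Hand-rolled ir3_shader_variant structs with just enough state filled in
 * (register footprint, inputs/outputs, const sizes) for the tu6_emit_*
 * helpers to emit correct shader state for the shaders above, without ever
 * running the compiler.
 */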
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
           bool layered_clear)
{
   struct ir3_const_state dummy_const_state = {};
   struct ir3_shader dummy_shader = {};

   struct ir3_shader_variant vs = {
      .type = MESA_SHADER_VERTEX,
      .instrlen = 1,
      .constlen = 4,
      .info.max_reg = 1,
      .inputs_count = 1,
      .inputs[0] = {
         .slot = SYSTEM_VALUE_VERTEX_ID,
         .regid = regid(0, 3),
         .sysval = true,
      },
      .outputs_count = blit ? 2 : 1,
      .outputs[0] = {
         .slot = VARYING_SLOT_POS,
         .regid = regid(0, 0),
      },
      .outputs[1] = {
         .slot = VARYING_SLOT_VAR0,
         .regid = regid(1, 0),
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };
   if (layered_clear) {
      vs.outputs[1].slot = VARYING_SLOT_LAYER;
      vs.outputs[1].regid = regid(1, 1);
      vs.outputs_count = 2;
   }

   struct ir3_shader_variant fs = {
      .type = MESA_SHADER_FRAGMENT,
      .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
      .constlen = align(num_rts, 4),
      .info.max_reg = MAX2(num_rts, 1) - 1,
      .total_in = blit ? 2 : 0,
      .num_samp = blit ? 1 : 0,
      .inputs_count = blit ? 2 : 0,
      .inputs[0] = {
         .slot = VARYING_SLOT_VAR0,
         .inloc = 0,
         .compmask = 3,
         .bary = true,
      },
      .inputs[1] = {
         .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
         .regid = regid(0, 0),
         .sysval = 1,
      },
      .num_sampler_prefetch = blit ? 1 : 0,
      .sampler_prefetch[0] = {
         .src = 0,
         .wrmask = 0xf,
         .cmd = 4,
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
      .vs_state = true,
      .hs_state = true,
      .ds_state = true,
      .gs_state = true,
      .fs_state = true,
      .cs_state = true,
      .gfx_ibo = true,
      .cs_ibo = true,
      .gfx_shared_const = true,
      .gfx_bindless = 0x1f,
      .cs_bindless = 0x1f));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
                      global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, &fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());
}

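/* Upload the blit coordinates as the two vec4 VS consts the hand-written VS
 * expects: c0/c1 hold the two corners, with dst coords in .xy and src coords
 * in .zw.
 */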
static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, (float[]) {
      dst->x, dst->y,
      src_x1, src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}

static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case VK_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!vk_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

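/* Emit texture and sampler state for the source. The given descriptor is
 * copied into a scratch allocation (with space for a sampler after it) so
 * that the base and UBWC addresses can be patched for the layer being
 * blitted, since the shader path draws a single layer per draw.
 */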
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   assert(result == VK_SUCCESS);

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      0x1 | /* XXX used by blob, doesn't seem necessary */
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   r3d_src_common(cmd, cs, iview->descriptor,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   desc[0] =
      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(format.fmt) |
      A6XX_TEX_CONST_0_SWAP(format.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      // XXX to swizzle into .w for stencil buffer_to_image
      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}

static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu6_emit_msaa(cs, 1); /* TODO: move to setup */

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
                   A6XX_RB_MRT_BASE_HI(0, va >> 32),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}

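/* Emit the full set of state a blit/clear draw needs, independent of
 * whatever the app has bound. When called inside a render pass
 * (cmd->state.pass), the CCU flush and window scissor setup are skipped
 * since the render pass has already configured them.
 */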
static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear,
          bool ubwc)
{
   enum a6xx_format format = tu6_base_format(vk_format);

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                   .color_format = format,
                   .color_sint = vk_format_is_sint(vk_format),
                   .color_uint = vk_format_is_uint(vk_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
                   .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));

   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}

static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cs *cs,
                  const VkOffset2D *dst,
                  const VkOffset2D *src,
                  const VkExtent2D *extent);
   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
   void (*src)(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const struct tu_image_view *iview,
               uint32_t layer,
               VkFilter filter);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      VkFormat vk_format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height);
   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear,
                 bool ubwc);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};

static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src,
   .src_buffer = r2d_src_buffer,
   .dst = r2d_dst,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup,
   .run = r2d_run,
   .teardown = r2d_teardown,
};

static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup,
   .run = r3d_run,
   .teardown = r3d_teardown,
};

/* passthrough to set 2D coords from 3D offsets/extent */
static void
coords(const struct blit_ops *ops,
       struct tu_cs *cs,
       const VkOffset3D *dst,
       const VkOffset3D *src,
       const VkExtent3D *extent)
{
   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}

static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
   if (vk_format_is_compressed(format)) {
      switch (vk_format_get_blocksize(format)) {
      case 1: return VK_FORMAT_R8_UINT;
      case 2: return VK_FORMAT_R16_UINT;
      case 4: return VK_FORMAT_R32_UINT;
      case 8: return VK_FORMAT_R32G32_UINT;
      case 16: return VK_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   switch (format) {
   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return VK_FORMAT_R8G8_UNORM;
      /* fallthrough */
   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
      return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_D24_UNORM_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
         return VK_FORMAT_R8_UNORM;
      /* fallthrough */
   default:
      return format;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_UINT;
   case VK_FORMAT_D32_SFLOAT_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         return VK_FORMAT_S8_UINT;
      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
      return VK_FORMAT_D32_SFLOAT;
   }
}

static void
tu_image_view_copy_blit(struct tu_image_view *iview,
                        struct tu_image *image,
                        VkFormat format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool stencil_read)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
      .image = tu_image_to_handle(image),
      .viewType = VK_IMAGE_VIEW_TYPE_2D,
      .format = format,
      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
      .subresourceRange = {
         .aspectMask = aspect_mask,
         .baseMipLevel = subres->mipLevel,
         .levelCount = 1,
         .baseArrayLayer = subres->baseArrayLayer + layer,
         .layerCount = 1,
      },
   }, false);
}

static void
tu_image_view_copy(struct tu_image_view *iview,
                   struct tu_image *image,
                   VkFormat format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer,
                   bool stencil_read)
{
   format = copy_format(format, subres->aspectMask, false);
   tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
}

static void
tu_image_view_blit(struct tu_image_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
}

static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
               struct tu_image *src_image,
               struct tu_image *dst_image,
               const VkImageBlit *info,
               VkFilter filter)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers;

   /* 2D blit can't do rotation mirroring from just coordinates */
   static const enum a6xx_rotation rotate[2][2] = {
      {ROTATE_0, ROTATE_HFLIP},
      {ROTATE_VFLIP, ROTATE_180},
   };

   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
                   (info->dstOffsets[1].y < info->dstOffsets[0].y);
   bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
                   (info->dstOffsets[1].z < info->dstOffsets[0].z);

   if (mirror_z) {
      tu_finishme("blit z mirror\n");
      return;
   }

   if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
       info->dstOffsets[1].z - info->dstOffsets[0].z) {
      tu_finishme("blit z filter\n");
      return;
   }

   layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
   if (info->dstSubresource.layerCount > 1) {
      assert(layers <= 1);
      layers = info->dstSubresource.layerCount;
   }

   /* BC1_RGB_* formats need to have their last component overridden with 1
    * when sampling, which is normally handled with the texture descriptor
    * swizzle. The 2d path can't handle that, so use the 3d path.
    *
    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
    * the 2d path.
    */

   if (dst_image->samples > 1 ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT)
      ops = &r3d_ops;

   ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
              rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);

   if (ops == &r3d_ops) {
      r3d_coords_raw(cs, (float[]) {
         info->dstOffsets[0].x, info->dstOffsets[0].y,
         info->srcOffsets[0].x, info->srcOffsets[0].y,
         info->dstOffsets[1].x, info->dstOffsets[1].y,
         info->srcOffsets[1].x, info->srcOffsets[1].y
      });
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                                          .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
                      A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                                          .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
                      A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
                      A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
                      A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct tu_image_view dst, src;
   tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
   tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);
      ops->src(cmd, cs, &src, i, filter);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage dstImage,
                VkImageLayout dstImageLayout,
                uint32_t regionCount,
                const VkImageBlit *pRegions,
                VkFilter filter)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (uint32_t i = 0; i < regionCount; ++i)
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
}

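/* For compressed formats, copies operate on the data reinterpreted as an
 * uncompressed format of the same block size (see copy_format()), so convert
 * texel offsets/extents into block units. For example, a 12x8 texel region
 * of a BC1 image (4x4 blocks of 8 bytes) becomes a 3x2 region of
 * R32G32_UINT "texels".
 */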
static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}

static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat src_format =
      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
   const struct blit_ops *ops = &r2d_ops;

   /* special case for buffer to stencil */
   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      ops = &r3d_ops;
   }

   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
    * which matters for UBWC. buffer_to_image/etc can fail because of this
    */

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;

   ops->setup(cmd, cs,
              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
              info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);

   struct tu_image_view dst;
   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);

      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
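      /* The blitter seems to require 64-byte-aligned addresses and pitches.
       * If the source doesn't satisfy that, copy row by row: round the va
       * down to a 64-byte boundary and shift the source x coordinate right
       * by the texels that rounding skipped.
       */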
      if ((src_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1);
            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
                        VkBuffer srcBuffer,
                        VkImage dstImage,
                        VkImageLayout dstImageLayout,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);

   tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
}

static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat dst_format =
      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
   bool stencil_read = false;

   if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      stencil_read = true;
   }

   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);

   struct tu_image_view src;
   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);

      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
      if ((dst_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         ops->dst_buffer(cs, dst_format, dst_va, pitch);
         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
                        VkImage srcImage,
                        VkImageLayout srcImageLayout,
                        VkBuffer dstBuffer,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
}

/* Tiled formats don't support swapping, which means that we can't support
 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
 * formats like B5G5R5A1 have a separate linear-only format when sampling.
 * Currently we fake support for tiled swapped formats and use the unswapped
 * format instead, but this means that reinterpreting copies to and from
 * swapped formats can't be performed correctly unless we can swizzle the
 * components by reinterpreting the other image as the "correct" swapped
 * format, i.e. only when the other image is linear.
 */

static bool
is_swapped_format(VkFormat format)
{
   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}

/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
 * versa). This should mirror the logic in fdl6_layout.
 */
static bool
image_is_r8g8(struct tu_image *image)
{
   return image->layout[0].cpp == 2 &&
          vk_format_get_nr_components(image->vk_format) == 2;
}

static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy *info)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (dst_image->samples > 1)
      ops = &r3d_ops;

   VkFormat format = VK_FORMAT_UNDEFINED;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);

   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      use_staging_blit = true;
   }

   struct tu_image_view dst, src;

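   /* Staging path: blit src into a linear scratch image using src_format,
    * then reinterpret the staging image as dst_format and blit again into
    * dst, so that no tiled/UBWC image ever has to be reinterpreted directly.
    */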
   if (use_staging_blit) {
      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);

      struct tu_image staging_image = {
         .vk_format = src_format,
         .type = src_image->type,
         .tiling = VK_IMAGE_TILING_LINEAR,
         .extent = extent,
         .level_count = 1,
         .layer_count = info->srcSubresource.layerCount,
         .samples = src_image->samples,
         .bo_offset = 0,
      };

      VkImageSubresourceLayers staging_subresource = {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = info->srcSubresource.layerCount,
      };

      VkOffset3D staging_offset = { 0 };

      staging_image.layout[0].tile_mode = TILE6_LINEAR;
      staging_image.layout[0].ubwc = false;

      fdl6_layout(&staging_image.layout[0],
                  vk_format_to_pipe_format(staging_image.vk_format),
                  staging_image.samples,
                  staging_image.extent.width,
                  staging_image.extent.height,
                  staging_image.extent.depth,
                  staging_image.level_count,
                  staging_image.layer_count,
                  staging_image.type == VK_IMAGE_TYPE_3D,
                  NULL);

      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_image.layout[0].size,
                                          &staging_image.bo);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      tu_bo_list_add(&cmd->bo_list, staging_image.bo,
                     MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);

      struct tu_image_view staging;
      tu_image_view_copy(&staging, &staging_image, src_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
      coords(ops, cs, &staging_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &staging, i);
         ops->run(cmd, cs);
      }

      /* When executed by the user there has to be a pipeline barrier here,
       * but since we're doing it manually we'll have to flush ourselves.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      tu_image_view_copy(&staging, &staging_image, dst_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
                 ROTATE_0, false, dst_image->layout[0].ubwc);
      coords(ops, cs, &dst_offset, &staging_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);

      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
                 ROTATE_0, false, dst_image->layout[0].ubwc);
      coords(ops, cs, &dst_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage destImage,
                VkImageLayout destImageLayout,
                uint32_t regionCount,
                const VkImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, destImage);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (uint32_t i = 0; i < regionCount; ++i)
      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
}

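/* Copy a linear range of bytes by blitting it as single rows of up to
 * 0x4000 blocks (presumably the maximum blit width). The engine seems to
 * require 64-byte-aligned addresses, so each va is rounded down to 64 bytes
 * and the remainder is folded into the x coordinate; e.g. with
 * block_size = 1, src_va = base + 40 becomes src_x = 40 on top of the
 * aligned base.
 */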
static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);

   while (blocks) {
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
      ops->dst_buffer(cs, format, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer srcBuffer,
                 VkBuffer dstBuffer,
                 uint32_t regionCount,
                 const VkBufferCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i) {
      copy_buffer(cmd,
                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
                  pRegions[i].size, 1);
   }
}

void
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);

   struct tu_cs_memory tmp;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(tmp.map, pData, dataSize);
   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
}

void
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer dstBuffer,
                 VkDeviceSize dstOffset,
                 VkDeviceSize fillSize,
                 uint32_t data)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);

   if (fillSize == VK_WHOLE_SIZE)
      fillSize = buffer->size - dstOffset;

   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
   uint32_t blocks = fillSize / 4;

   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});

   while (blocks) {
      uint32_t dst_x = (dst_va & 63) / 4;
      uint32_t width = MIN2(blocks, 0x4000 - dst_x);

      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      dst_va += width * 4;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdResolveImage(VkCommandBuffer commandBuffer,
                   VkImage srcImage,
                   VkImageLayout srcImageLayout,
                   VkImage dstImage,
                   VkImageLayout dstImageLayout,
                   uint32_t regionCount,
                   const VkImageResolve *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              ROTATE_0, false, dst_image->layout[0].ubwc);

   for (uint32_t i = 0; i < regionCount; ++i) {
      const VkImageResolve *info = &pRegions[i];
      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);

      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
      /* TODO: aspect masks possible ? */

      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);

      struct tu_image_view dst, src;
      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);

      for (uint32_t i = 0; i < layers; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  struct tu_image_view *src,
                  struct tu_image_view *dst,
                  uint32_t layers,
                  const VkRect2D *rect)
{
   const struct blit_ops *ops = &r2d_ops;

   tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);

   assert(src->image->vk_format == dst->image->vk_format);

   ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              ROTATE_0, false, dst->ubwc_enabled);
   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
      ops->dst(cs, dst, i);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);
}

1705 static void
1706 clear_image(struct tu_cmd_buffer *cmd,
1707 struct tu_image *image,
1708 const VkClearValue *clear_value,
1709 const VkImageSubresourceRange *range,
1710 VkImageAspectFlags aspect_mask)
1711 {
1712 uint32_t level_count = tu_get_levelCount(image, range);
1713 uint32_t layer_count = tu_get_layerCount(image, range);
1714 struct tu_cs *cs = &cmd->cs;
1715 VkFormat format = image->vk_format;
1716 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1717 format = copy_format(format, aspect_mask, false);
1718
1719 if (image->type == VK_IMAGE_TYPE_3D) {
1720 assert(layer_count == 1);
1721 assert(range->baseArrayLayer == 0);
1722 }
1723
1724 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1725
1726 ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
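   /* E5B9G9R9 is cleared through a compatible copy format (see copy_format()
    * above), but the clear value still has to be packed as rgb9e5, so pass
    * the original format to clear_value
    */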
1727 if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1728 ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
1729 else
1730 ops->clear_value(cs, format, clear_value);
1731
1732 for (unsigned j = 0; j < level_count; j++) {
1733 if (image->type == VK_IMAGE_TYPE_3D)
1734 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1735
1736 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1737 u_minify(image->extent.width, range->baseMipLevel + j),
1738 u_minify(image->extent.height, range->baseMipLevel + j)
1739 });
1740
1741 struct tu_image_view dst;
1742 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1743 .aspectMask = aspect_mask,
1744 .mipLevel = range->baseMipLevel + j,
1745 .baseArrayLayer = range->baseArrayLayer,
1746 .layerCount = 1,
1747 }, 0, false);
1748
1749 for (uint32_t i = 0; i < layer_count; i++) {
1750 ops->dst(cs, &dst, i);
1751 ops->run(cmd, cs);
1752 }
1753 }
1754
1755 ops->teardown(cmd, cs);
1756 }
1757
1758 void
1759 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1760 VkImage image_h,
1761 VkImageLayout imageLayout,
1762 const VkClearColorValue *pColor,
1763 uint32_t rangeCount,
1764 const VkImageSubresourceRange *pRanges)
1765 {
1766 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1767 TU_FROM_HANDLE(tu_image, image, image_h);
1768
1769 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1770
1771 for (unsigned i = 0; i < rangeCount; i++)
1772 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
1773 }
1774
1775 void
1776 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1777 VkImage image_h,
1778 VkImageLayout imageLayout,
1779 const VkClearDepthStencilValue *pDepthStencil,
1780 uint32_t rangeCount,
1781 const VkImageSubresourceRange *pRanges)
1782 {
1783 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1784 TU_FROM_HANDLE(tu_image, image, image_h);
1785
1786 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1787
1788 for (unsigned i = 0; i < rangeCount; i++) {
1789 const VkImageSubresourceRange *range = &pRanges[i];
1790
1791 if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1792 /* can't clear both depth and stencil at once, split up the aspect mask */
1793 uint32_t b;
1794 for_each_bit(b, range->aspectMask)
1795 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
1796 continue;
1797 }
1798
1799 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
1800 }
1801 }
1802
1803 static void
1804 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1805 uint32_t attachment_count,
1806 const VkClearAttachment *attachments,
1807 uint32_t rect_count,
1808 const VkClearRect *rects)
1809 {
1810 /* the shader path here is special, it avoids changing MRT/etc state */
1812 const struct tu_subpass *subpass = cmd->state.subpass;
1813 const uint32_t mrt_count = subpass->color_count;
1814 struct tu_cs *cs = &cmd->draw_cs;
1815 uint32_t clear_value[MAX_RTS][4];
1816 float z_clear_val = 0.0f;
1817 uint8_t s_clear_val = 0;
1818 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1819 bool z_clear = false;
1820 bool s_clear = false;
1821 bool layered_clear = false;
1823
1824 for (uint32_t i = 0; i < attachment_count; i++) {
1825 uint32_t a;
1826 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1827 uint32_t c = attachments[i].colorAttachment;
1828 a = subpass->color_attachments[c].attachment;
1829 if (a == VK_ATTACHMENT_UNUSED)
1830 continue;
1831
1832 clear_rts |= 1 << c;
1833 clear_components |= 0xf << (c * 4);
1834 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1835 } else {
1836 a = subpass->depth_stencil_attachment.attachment;
1837 if (a == VK_ATTACHMENT_UNUSED)
1838 continue;
1839
1840 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1841 z_clear = true;
1842 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1843 }
1844
1845 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1846 s_clear = true;
1847 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1848 }
1849 }
1852 }
1853
1854    /* disable all draw states so they don't interfere.
1855     * we have to disable them individually (rather than as one group) to
1856     * preserve the input attachment draw states, because a secondary
1857     * command buffer won't be able to restore them
1858     * TODO: use and re-use draw states
1859     */
1860 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1861 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1862 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1863 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1864 continue;
1865 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1866 CP_SET_DRAW_STATE__0_DISABLE);
1867 tu_cs_emit_qw(cs, 0);
1868 }
1869 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1870
1871 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1872 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1873 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1874 0xfc000000);
1875 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1876
1877 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1878 for (uint32_t i = 0; i < mrt_count; i++) {
1879 if (clear_rts & (1 << i))
1880 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1881 else
1882 tu_cs_emit(cs, 0);
1883 }
1884
1885 for (uint32_t i = 0; i < rect_count; i++) {
1886 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1887 layered_clear = true;
1888 }
1889
1890 r3d_common(cmd, cs, false, num_rts, layered_clear);
1891
1892 tu_cs_emit_regs(cs,
1893 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1894 tu_cs_emit_regs(cs,
1895 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1896
1897 tu_cs_emit_regs(cs,
1898 A6XX_RB_FS_OUTPUT_CNTL0(),
1899 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1900
1901 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1902 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1903 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1904 for (uint32_t i = 0; i < mrt_count; i++) {
1905 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1906 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1907 }
1908
1909 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1910 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1911 .z_enable = z_clear,
1912 .z_write_enable = z_clear,
1913 .zfunc = FUNC_ALWAYS));
1914 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1915 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1916 .stencil_enable = s_clear,
1917 .func = FUNC_ALWAYS,
1918 .zpass = STENCIL_REPLACE));
1919 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1920 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1921 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1922
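   /* upload the clear colors as FS constants, one vec4 per cleared RT;
    * the clear shader from r3d_common() presumably just copies them to
    * its color outputs
    */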
1923 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1924 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1925 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1926 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1927 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1928 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1929 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1930 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1931 for_each_bit(b, clear_rts)
1932 tu_cs_emit_array(cs, clear_value[b], 4);
1933
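   /* one draw per rect and layer: the raw coords are two corners in screen
    * space, with z holding the depth clear value and w smuggling the layer
    * index in as raw float bits (uif) for the layered clear path
    */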
1934 for (uint32_t i = 0; i < rect_count; i++) {
1935 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1936 r3d_coords_raw(cs, (float[]) {
1937 rects[i].rect.offset.x, rects[i].rect.offset.y,
1938 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1939 rects[i].rect.offset.x + rects[i].rect.extent.width,
1940 rects[i].rect.offset.y + rects[i].rect.extent.height,
1941 z_clear_val, 1.0f,
1942 });
1943 r3d_run(cmd, cs);
1944 }
1945 }
1946 }
1947
1948 static void
1949 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1950 {
1951 enum pipe_format pformat = vk_format_to_pipe_format(format);
1952
1953 switch (format) {
1954 case VK_FORMAT_X8_D24_UNORM_PACK32:
1955 case VK_FORMAT_D24_UNORM_S8_UINT:
1956 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1957 val->depthStencil.stencil << 24;
1958 return;
1959 case VK_FORMAT_D16_UNORM:
1960 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1961 return;
1962 case VK_FORMAT_D32_SFLOAT:
1963 clear_value[0] = fui(val->depthStencil.depth);
1964 return;
1965 case VK_FORMAT_S8_UINT:
1966 clear_value[0] = val->depthStencil.stencil;
1967 return;
1968    /* these formats use a different base format when tiled;
1969     * the same format can be used for both because GMEM is always in WZYX order
1970     */
1971    case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1972    case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1973       pformat = PIPE_FORMAT_B5G5R5A1_UNORM; /* fallthrough */
1974 default:
1975 break;
1976 }
1977
1978 VkClearColorValue color;
1979
1980    /*
1981     * GMEM is tiled and wants the components in WZYX order,
1982     * apply swizzle to the color before packing, to counteract
1983     * deswizzling applied by packing functions
1984     */
1985 pipe_swizzle_4f(color.float32, val->color.float32,
1986 util_format_description(pformat)->swizzle);
1987
1988 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
1989 }
1990
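/* clear a range of GMEM directly using the blit event: RB_BLIT_DST_INFO and
 * RB_BLIT_BASE_GMEM describe the destination (a GMEM offset, since .gmem is
 * set in RB_BLIT_INFO), the packed clear value goes in
 * RB_BLIT_CLEAR_COLOR_DW0..3, and the BLIT event then performs the clear for
 * each tile; clear_mask selects which components are written
 */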
1991 static void
1992 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1993 struct tu_cs *cs,
1994 VkFormat format,
1995 uint8_t clear_mask,
1996 uint32_t gmem_offset,
1997 const VkClearValue *value)
1998 {
1999 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2000 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2001
2002 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2003
2004 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2005 tu_cs_emit(cs, gmem_offset);
2006
2007 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2008 tu_cs_emit(cs, 0);
2009
2010 uint32_t clear_vals[4] = {};
2011 pack_gmem_clear_value(value, format, clear_vals);
2012
2013 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2014 tu_cs_emit_array(cs, clear_vals, 4);
2015
2016 tu6_emit_event_write(cmd, cs, BLIT);
2017 }
2018
2019 static void
2020 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2021 struct tu_cs *cs,
2022 uint32_t attachment,
2023 VkImageAspectFlags mask,
2024 const VkClearValue *value)
2025 {
2026 const struct tu_render_pass_attachment *att =
2027 &cmd->state.pass->attachments[attachment];
2028
2029 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2030 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2031 clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
2032 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2033 clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
2034 return;
2035 }
2036
2037 clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
2038 }
2039
2040 static void
2041 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2042 uint32_t attachment_count,
2043 const VkClearAttachment *attachments,
2044 uint32_t rect_count,
2045 const VkClearRect *rects)
2046 {
2047 const struct tu_subpass *subpass = cmd->state.subpass;
2048 struct tu_cs *cs = &cmd->draw_cs;
2049
2050 /* TODO: swap the loops for smaller cmdstream */
2051 for (unsigned i = 0; i < rect_count; i++) {
2052 unsigned x1 = rects[i].rect.offset.x;
2053 unsigned y1 = rects[i].rect.offset.y;
2054 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2055 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2056
2057 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2058 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2059 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2060
2061 for (unsigned j = 0; j < attachment_count; j++) {
2062 uint32_t a;
2063 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2064 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2065 else
2066 a = subpass->depth_stencil_attachment.attachment;
2067
2068 if (a == VK_ATTACHMENT_UNUSED)
2069 continue;
2070
2071 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2072 &attachments[j].clearValue);
2073 }
2074 }
2075 }
2076
2077 void
2078 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2079 uint32_t attachmentCount,
2080 const VkClearAttachment *pAttachments,
2081 uint32_t rectCount,
2082 const VkClearRect *pRects)
2083 {
2084 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2085 struct tu_cs *cs = &cmd->draw_cs;
2086
2087    /* the sysmem path behaves like a draw; note we don't have a way of using
2088     * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec
2089     */
2090 tu_emit_cache_flush_renderpass(cmd, cs);
2091
2092 /* vkCmdClearAttachments is supposed to respect the predicate if active.
2093 * The easiest way to do this is to always use the 3d path, which always
2094 * works even with GMEM because it's just a simple draw using the existing
2095 * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2096 * skipped in the binning pass, since otherwise they produce binning data
2097 * which isn't consumed and leads to the wrong binning data being read, so
2098 * condition on GMEM | SYSMEM.
2099 */
2100 if (cmd->state.predication_active) {
2101 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2102 CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2103 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2104 tu_cond_exec_end(cs);
2105 return;
2106 }
2107
2108 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2109 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2110 tu_cond_exec_end(cs);
2111
2112 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2113 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2114 tu_cond_exec_end(cs);
2115 }
2116
2117 static void
2118 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2119 struct tu_cs *cs,
2120 VkFormat format,
2121 VkImageAspectFlags clear_mask,
2122 const VkRenderPassBeginInfo *info,
2123 uint32_t a,
2124 bool separate_stencil)
2125 {
2126 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2127 const struct tu_image_view *iview = fb->attachments[a].attachment;
2128 const struct blit_ops *ops = &r2d_ops;
2129 if (cmd->state.pass->attachments[a].samples > 1)
2130 ops = &r3d_ops;
2131
2132 ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
2133 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2134 ops->clear_value(cs, format, &info->pClearValues[a]);
2135
2136 for (uint32_t i = 0; i < fb->layers; i++) {
2137 if (separate_stencil) {
2138 if (ops == &r3d_ops)
2139 r3d_dst_stencil(cs, iview, i);
2140 else
2141 r2d_dst_stencil(cs, iview, i);
2142 } else {
2143 ops->dst(cs, iview, i);
2144 }
2145 ops->run(cmd, cs);
2146 }
2147
2148 ops->teardown(cmd, cs);
2149 }
2150
2151 void
2152 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2153 struct tu_cs *cs,
2154 uint32_t a,
2155 const VkRenderPassBeginInfo *info)
2156 {
2157 const struct tu_render_pass_attachment *attachment =
2158 &cmd->state.pass->attachments[a];
2159
2160 if (!attachment->clear_mask)
2161 return;
2162
2163 /* Wait for any flushes at the beginning of the renderpass to complete */
2164 tu_cs_emit_wfi(cs);
2165
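   /* D32_S8 keeps depth and stencil in separate planes, so each plane is
    * cleared on its own as a color image of the matching format
    */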
2166 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2167 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2168 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2169 info, a, false);
2170 }
2171 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2172 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2173 info, a, true);
2174 }
2175 } else {
2176 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2177 info, a, false);
2178 }
2179
2180 /* The spec doesn't explicitly say, but presumably the initial renderpass
2181 * clear is considered part of the renderpass, and therefore barriers
2182 * aren't required inside the subpass/renderpass. Therefore we need to
2183 * flush CCU color into CCU depth here, just like with
2184 * vkCmdClearAttachments(). Note that because this only happens at the
2185 * beginning of a renderpass, and renderpass writes are considered
2186 * "incoherent", we shouldn't have to worry about syncing depth into color
2187 * beforehand as depth should already be flushed.
2188 */
2189 if (vk_format_is_depth_or_stencil(attachment->format)) {
2190 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2191 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2192 } else {
2193 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2194 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2195 }
2196 }
2197
2198 void
2199 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2200 struct tu_cs *cs,
2201 uint32_t a,
2202 const VkRenderPassBeginInfo *info)
2203 {
2204 const struct tu_render_pass_attachment *attachment =
2205 &cmd->state.pass->attachments[a];
2206
2207 if (!attachment->clear_mask)
2208 return;
2209
2210 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2211
2212 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2213 &info->pClearValues[a]);
2214 }
2215
2216 static void
2217 tu_emit_blit(struct tu_cmd_buffer *cmd,
2218 struct tu_cs *cs,
2219 const struct tu_image_view *iview,
2220 const struct tu_render_pass_attachment *attachment,
2221 bool resolve,
2222 bool separate_stencil)
2223 {
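   /* this emits either a gmem load (resolve=false, sysmem -> GMEM) or a
    * store/resolve (resolve=true, GMEM -> sysmem); both run per-tile via
    * the BLIT event
    */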
2224 tu_cs_emit_regs(cs,
2225 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2226
2227 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2228 .unk0 = !resolve,
2229 .gmem = !resolve,
2230 /* "integer" bit disables msaa resolve averaging */
2231 .integer = vk_format_is_int(attachment->format)));
2232
2233 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2234 if (separate_stencil) {
2235 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2236 tu_cs_emit_qw(cs, iview->stencil_base_addr);
2237 tu_cs_emit(cs, iview->stencil_PITCH);
2238
2239 tu_cs_emit_regs(cs,
2240 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
2241 } else {
2242 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2243 tu_cs_image_ref_2d(cs, iview, 0, false);
2244
2245 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2246 tu_cs_image_flag_ref(cs, iview, 0);
2247
2248 tu_cs_emit_regs(cs,
2249 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2250 }
2251
2252 tu6_emit_event_write(cmd, cs, BLIT);
2253 }
2254
2255 static bool
2256 blit_can_resolve(VkFormat format)
2257 {
2258 const struct util_format_description *desc = vk_format_description(format);
2259
2260 /* blit event can only do resolve for simple cases:
2261 * averaging samples as unsigned integers or choosing only one sample
2262 */
2263 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2264 return false;
2265
2266 /* can't do formats with larger channel sizes
2267 * note: this includes all float formats
2268 * note2: single channel integer formats seem OK
2269 */
2270 if (desc->channel[0].size > 10)
2271 return false;
2272
2273 switch (format) {
2274 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2275 * likely related to these formats having different layout from other cpp=2 formats
2276 */
2277 case VK_FORMAT_R8G8_UNORM:
2278 case VK_FORMAT_R8G8_UINT:
2279 case VK_FORMAT_R8G8_SINT:
2280 /* TODO: this one should be able to work? */
2281 case VK_FORMAT_D24_UNORM_S8_UINT:
2282 return false;
2283 default:
2284 break;
2285 }
2286
2287 return true;
2288 }
2289
2290 void
2291 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2292 struct tu_cs *cs,
2293 uint32_t a,
2294 bool force_load)
2295 {
2296 const struct tu_image_view *iview =
2297 cmd->state.framebuffer->attachments[a].attachment;
2298 const struct tu_render_pass_attachment *attachment =
2299 &cmd->state.pass->attachments[a];
2300
2301 if (attachment->load || force_load)
2302 tu_emit_blit(cmd, cs, iview, attachment, false, false);
2303
2304 if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2305 tu_emit_blit(cmd, cs, iview, attachment, false, true);
2306 }
2307
2308 static void
2309 store_cp_blit(struct tu_cmd_buffer *cmd,
2310 struct tu_cs *cs,
2311 struct tu_image_view *iview,
2312 uint32_t samples,
2313 bool separate_stencil,
2314 VkFormat format,
2315 uint32_t gmem_offset,
2316 uint32_t cpp)
2317 {
2318 r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
2319 iview->ubwc_enabled, true);
2320 if (separate_stencil)
2321 r2d_dst_stencil(cs, iview, 0);
2322 else
2323 r2d_dst(cs, iview, 0);
2324
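   /* source the blit directly from GMEM: a TILE6_2 tiled image at
    * gmem_base + gmem_offset, with a pitch of tile0.width * cpp bytes
    */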
2325 tu_cs_emit_regs(cs,
2326 A6XX_SP_PS_2D_SRC_INFO(
2327 .color_format = tu6_format_texture(format, TILE6_2).fmt,
2328 .tile_mode = TILE6_2,
2329 .srgb = vk_format_is_srgb(format),
2330 .samples = tu_msaa_samples(samples),
2331 .samples_average = !vk_format_is_int(format),
2332 .unk20 = 1,
2333 .unk22 = 1),
2334 /* note: src size does not matter when not scaling */
2335 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2336 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
2337 A6XX_SP_PS_2D_SRC_HI(),
2338 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
2339
2340    /* sync GMEM writes with CACHE, so the 2D blit doesn't read stale data */
2341 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2342
2343 /* Wait for CACHE_INVALIDATE to land */
2344 tu_cs_emit_wfi(cs);
2345
2346 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2347 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2348
2349 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2350 * sysmem, and we generally assume that GMEM renderpasses leave their
2351 * results in sysmem, so we need to flush manually here.
2352 */
2353 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2354 }
2355
2356 void
2357 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2358 struct tu_cs *cs,
2359 uint32_t a,
2360 uint32_t gmem_a)
2361 {
2362 const VkRect2D *render_area = &cmd->state.render_area;
2363 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2364 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2365 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2366
2367 if (!dst->store && !dst->store_stencil)
2368 return;
2369
2370 uint32_t x1 = render_area->offset.x;
2371 uint32_t y1 = render_area->offset.y;
2372 uint32_t x2 = x1 + render_area->extent.width;
2373 uint32_t y2 = y1 + render_area->extent.height;
2374    /* x2/y2 can be unaligned if equal to the size of the image, since the
2375     * store will then write into padding space. the one exception is linear
2376     * levels, which don't have the required y padding in the layout
2377     * (except for the last level)
2378     */
2379 bool need_y2_align =
2380 y2 != iview->extent.height || iview->need_y2_align;
2381
2382 bool unaligned =
2383 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2384 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2385
2386 /* use fast path when render area is aligned, except for unsupported resolve cases */
2387 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2388 if (dst->store)
2389 tu_emit_blit(cmd, cs, iview, src, true, false);
2390 if (dst->store_stencil)
2391 tu_emit_blit(cmd, cs, iview, src, true, true);
2392 return;
2393 }
2394
2395 if (dst->samples > 1) {
2396 /* I guess we need to use shader path in this case?
2397 * need a testcase which fails because of this
2398 */
2399 tu_finishme("unaligned store of msaa attachment\n");
2400 return;
2401 }
2402
2403 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2404
2405 VkFormat format = src->format;
2406 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2407 format = VK_FORMAT_D32_SFLOAT;
2408
2409 if (dst->store) {
2410 store_cp_blit(cmd, cs, iview, src->samples, false, format,
2411 src->gmem_offset, src->cpp);
2412 }
2413 if (dst->store_stencil) {
2414       store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2415                     src->gmem_offset_stencil, src->samples); /* cpp of S8 is 1 byte per sample */
2416 }
2417 }