1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
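/* For example, tu_pack_float32_for_unorm(0.5f, 8) gives
 * lroundeven(0.5 * 255) = lroundeven(127.5) = 128.
 */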
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(VkFormat format)
29 {
30 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
31 format == VK_FORMAT_X8_D24_UNORM_PACK32)
32 return R2D_UNORM8;
33
34 /* get_component_bits doesn't work with depth/stencil formats: */
35 if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
36 return R2D_FLOAT32;
37 if (format == VK_FORMAT_S8_UINT)
38 return R2D_INT8;
39
40 /* use the size of the red channel to find the corresponding "ifmt" */
41 bool is_int = vk_format_is_int(format);
42 switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
43 case 4: case 5: case 8:
44 return is_int ? R2D_INT8 : R2D_UNORM8;
45 case 10: case 11:
46 return is_int ? R2D_INT16 : R2D_FLOAT16;
47 case 16:
48 if (vk_format_is_float(format))
49 return R2D_FLOAT16;
50 return is_int ? R2D_INT16 : R2D_FLOAT32;
51 case 32:
52 return is_int ? R2D_INT32 : R2D_FLOAT32;
53 default:
54 unreachable("bad format");
55 return 0;
56 }
57 }
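/* For example, R8G8B8A8_UNORM (8-bit red channel) maps to R2D_UNORM8,
 * R16G16B16A16_SFLOAT to R2D_FLOAT16, and R32G32B32A32_UINT to R2D_INT32.
 */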
58
59 static void
60 r2d_coords(struct tu_cs *cs,
61 const VkOffset2D *dst,
62 const VkOffset2D *src,
63 const VkExtent2D *extent)
64 {
65 tu_cs_emit_regs(cs,
66 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
67 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
68
69 if (!src)
70 return;
71
72 tu_cs_emit_regs(cs,
73 A6XX_GRAS_2D_SRC_TL_X(src->x),
74 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
75 A6XX_GRAS_2D_SRC_TL_Y(src->y),
76 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
77 }
78
79 static void
80 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
81 {
82 uint32_t clear_value[4] = {};
83
84 switch (format) {
85 case VK_FORMAT_X8_D24_UNORM_PACK32:
86 case VK_FORMAT_D24_UNORM_S8_UINT:
87 /* cleared as r8g8b8a8_unorm using special format */
88 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
89 clear_value[1] = clear_value[0] >> 8;
90 clear_value[2] = clear_value[0] >> 16;
91 clear_value[3] = val->depthStencil.stencil;
92 break;
93 case VK_FORMAT_D16_UNORM:
94 case VK_FORMAT_D32_SFLOAT:
95 /* R2D_FLOAT32 */
96 clear_value[0] = fui(val->depthStencil.depth);
97 break;
98 case VK_FORMAT_S8_UINT:
99 clear_value[0] = val->depthStencil.stencil;
100 break;
101 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
102 /* cleared as UINT32 */
103 clear_value[0] = float3_to_rgb9e5(val->color.float32);
104 break;
105 default:
106 assert(!vk_format_is_depth_or_stencil(format));
107 const struct util_format_description *desc = vk_format_description(format);
108 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
109
110 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
111 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
112
113 for (unsigned i = 0; i < desc->nr_channels; i++) {
114 const struct util_format_channel_description *ch = &desc->channel[i];
115 if (ifmt == R2D_UNORM8) {
116 float linear = val->color.float32[i];
117 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
118 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
119
120 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
121 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
122 else
123 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
124 } else if (ifmt == R2D_FLOAT16) {
125 clear_value[i] = util_float_to_half(val->color.float32[i]);
126 } else {
127 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
128 ifmt == R2D_INT16 || ifmt == R2D_INT8);
129 clear_value[i] = val->color.uint32[i];
130 }
131 }
132 break;
133 }
134
135 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
136 tu_cs_emit_array(cs, clear_value, 4);
137 }
138
139 static void
140 r2d_src(struct tu_cmd_buffer *cmd,
141 struct tu_cs *cs,
142 const struct tu_image_view *iview,
143 uint32_t layer,
144 VkFilter filter)
145 {
146 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
147 if (filter != VK_FILTER_NEAREST)
148 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
149
150 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
151 tu_cs_emit(cs, src_info);
152 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
153 tu_cs_image_ref_2d(cs, iview, layer, true);
154
155 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
156 tu_cs_image_flag_ref(cs, iview, layer);
157 }
158
159 static void
160 r2d_src_buffer(struct tu_cmd_buffer *cmd,
161 struct tu_cs *cs,
162 VkFormat vk_format,
163 uint64_t va, uint32_t pitch,
164 uint32_t width, uint32_t height)
165 {
166 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
167
168 tu_cs_emit_regs(cs,
169 A6XX_SP_PS_2D_SRC_INFO(
170 .color_format = format.fmt,
171 .color_swap = format.swap,
172 .srgb = vk_format_is_srgb(vk_format),
173 .unk20 = 1,
174 .unk22 = 1),
175 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
176 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
177 A6XX_SP_PS_2D_SRC_HI(va >> 32),
178 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
179 }
180
181 static void
182 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
183 {
184 assert(iview->image->samples == 1);
185
186 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
187 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
188 tu_cs_image_ref_2d(cs, iview, layer, false);
189
190 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
191 tu_cs_image_flag_ref(cs, iview, layer);
192 }
193
194 static void
195 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
196 {
197 assert(iview->image->samples == 1);
198
199 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
200 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
201 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
202 tu_cs_emit(cs, iview->stencil_PITCH);
203 }
204
205 static void
206 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
207 {
208 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
209
210 tu_cs_emit_regs(cs,
211 A6XX_RB_2D_DST_INFO(
212 .color_format = format.fmt,
213 .color_swap = format.swap,
214 .srgb = vk_format_is_srgb(vk_format)),
215 A6XX_RB_2D_DST_LO((uint32_t) va),
216 A6XX_RB_2D_DST_HI(va >> 32),
217 A6XX_RB_2D_DST_PITCH(pitch));
218 }
219
220 static void
221 r2d_setup_common(struct tu_cmd_buffer *cmd,
222 struct tu_cs *cs,
223 VkFormat vk_format,
224 VkImageAspectFlags aspect_mask,
225 enum a6xx_rotation rotation,
226 bool clear,
227 bool ubwc,
228 bool scissor)
229 {
230 enum a6xx_format format = tu6_base_format(vk_format);
231 enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
232 uint32_t unknown_8c01 = 0;
233
234 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
235 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
236 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
237 }
238
239 /* note: the only format with partial clearing is D24S8 */
240 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
241 /* preserve stencil channel */
242 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
243 unknown_8c01 = 0x08000041;
244 /* preserve depth channels */
245 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
246 unknown_8c01 = 0x00084001;
247 }
248
249 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
250 tu_cs_emit(cs, unknown_8c01);
251
252 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
253 .scissor = scissor,
254 .rotate = rotation,
255 .solid_color = clear,
256 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
257 .color_format = format,
258 .mask = 0xf,
259 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
260 ).value;
261
262 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
263 tu_cs_emit(cs, blit_cntl);
264
265 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
266 tu_cs_emit(cs, blit_cntl);
267
268 if (format == FMT6_10_10_10_2_UNORM_DEST)
269 format = FMT6_16_16_16_16_FLOAT;
270
271 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
272 .sint = vk_format_is_sint(vk_format),
273 .uint = vk_format_is_uint(vk_format),
274 .color_format = format,
275 .srgb = vk_format_is_srgb(vk_format),
276 .mask = 0xf));
277 }
278
279 static void
280 r2d_setup(struct tu_cmd_buffer *cmd,
281 struct tu_cs *cs,
282 VkFormat vk_format,
283 VkImageAspectFlags aspect_mask,
284 enum a6xx_rotation rotation,
285 bool clear,
286 bool ubwc)
287 {
288 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
289
290 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
291 }
292
293 static void
294 r2d_teardown(struct tu_cmd_buffer *cmd,
295 struct tu_cs *cs)
296 {
297 /* nothing to do here */
298 }
299
300 static void
301 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
302 {
303 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
304 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
305 }
306
307 /* r3d_ = shader path operations */
308
309 void
310 tu_init_clear_blit_shaders(struct tu6_global *global)
311 {
312 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
313 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
314 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
315
316 static const instr_t vs_code[] = {
317 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
318 * r1.xy = r0.w ? c1.zw : c0.zw
319 * r0.w = 1.0f
320 */
321 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
322 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
323 .src2 = 3,
324 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
325 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
326 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
327 .src2 = 3,
328 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
329 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
330 { .cat0 = { .opc = OPC_END } },
331 };
332
333 static const instr_t fs_blit[] = {
334 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
335 * blit path (it's not clear what allows it to omit it)
336 */
337 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
338 { .cat0 = { .opc = OPC_END } },
339 };
340
341 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
342 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
343
344 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
345 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
346 for (uint32_t i = 0; i < num_rts; i++) {
347 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
348 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
349 }
350 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
351 }
352 }
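/* For example, the fragment clear shader built above for num_rts = 2 is
 * roughly:
 *    (rpt3)mov.s32s32 r0.x, (r)c[0].x
 *    (rpt3)mov.s32s32 r1.x, (r)c[1].x
 *    end
 * i.e. one repeated mov per render target loading the clear color from the
 * constant file, followed by end.
 */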
353
354 static void
355 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
356 bool layered_clear)
357 {
358 struct ir3_const_state dummy_const_state = {};
359 struct ir3_shader dummy_shader = {};
360
361 struct ir3_shader_variant vs = {
362 .type = MESA_SHADER_VERTEX,
363 .instrlen = 1,
364 .constlen = 4,
365 .info.max_reg = 1,
366 .inputs_count = 1,
367 .inputs[0] = {
368 .slot = SYSTEM_VALUE_VERTEX_ID,
369 .regid = regid(0, 3),
370 .sysval = true,
371 },
372 .outputs_count = blit ? 2 : 1,
373 .outputs[0] = {
374 .slot = VARYING_SLOT_POS,
375 .regid = regid(0, 0),
376 },
377 .outputs[1] = {
378 .slot = VARYING_SLOT_VAR0,
379 .regid = regid(1, 0),
380 },
381 .shader = &dummy_shader,
382 .const_state = &dummy_const_state,
383 };
384 if (layered_clear) {
385 vs.outputs[1].slot = VARYING_SLOT_LAYER;
386 vs.outputs[1].regid = regid(1, 1);
387 vs.outputs_count = 2;
388 }
389
390 struct ir3_shader_variant fs = {
391 .type = MESA_SHADER_FRAGMENT,
392 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
393 .constlen = align(num_rts, 4),
394 .info.max_reg = MAX2(num_rts, 1) - 1,
395 .total_in = blit ? 2 : 0,
396 .num_samp = blit ? 1 : 0,
397 .inputs_count = blit ? 2 : 0,
398 .inputs[0] = {
399 .slot = VARYING_SLOT_VAR0,
400 .inloc = 0,
401 .compmask = 3,
402 .bary = true,
403 },
404 .inputs[1] = {
405 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
406 .regid = regid(0, 0),
407 .sysval = 1,
408 },
409 .num_sampler_prefetch = blit ? 1 : 0,
410 .sampler_prefetch[0] = {
411 .src = 0,
412 .wrmask = 0xf,
413 .cmd = 4,
414 },
415 .shader = &dummy_shader,
416 .const_state = &dummy_const_state,
417 };
418
419 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
420 .vs_state = true,
421 .hs_state = true,
422 .ds_state = true,
423 .gs_state = true,
424 .fs_state = true,
425 .cs_state = true,
426 .gfx_ibo = true,
427 .cs_ibo = true,
428 .gfx_shared_const = true,
429 .gfx_bindless = 0x1f,
430 .cs_bindless = 0x1f));
431
432 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
433 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
434 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
435 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
436 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
437 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
438
439 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
440 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
441
442 /* Copy what the blob does here. This will emit an extra 0x3f
443 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
444 * this is working around yet.
445 */
446 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
447 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
448 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
449 tu_cs_emit(cs, 0);
450 tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());
451
452 tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
453
454 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
455 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
456 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
457
458 tu6_emit_fs_inputs(cs, &fs);
459
460 tu_cs_emit_regs(cs,
461 A6XX_GRAS_CL_CNTL(
462 .persp_division_disable = 1,
463 .vp_xform_disable = 1,
464 .vp_clip_code_ignore = 1,
465 .clip_disable = 1));
466 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
467
468 tu_cs_emit_regs(cs,
469 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
470 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
471 tu_cs_emit_regs(cs,
472 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
473 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
474
475 tu_cs_emit_regs(cs,
476 A6XX_VFD_INDEX_OFFSET(),
477 A6XX_VFD_INSTANCE_START_OFFSET());
478 }
479
480 static void
481 r3d_coords_raw(struct tu_cs *cs, const float *coords)
482 {
483 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
484 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
485 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
486 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
487 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
488 CP_LOAD_STATE6_0_NUM_UNIT(2));
489 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
490 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
491 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
492 }
493
494 static void
495 r3d_coords(struct tu_cs *cs,
496 const VkOffset2D *dst,
497 const VkOffset2D *src,
498 const VkExtent2D *extent)
499 {
500 int32_t src_x1 = src ? src->x : 0;
501 int32_t src_y1 = src ? src->y : 0;
502 r3d_coords_raw(cs, (float[]) {
503 dst->x, dst->y,
504 src_x1, src_y1,
505 dst->x + extent->width, dst->y + extent->height,
506 src_x1 + extent->width, src_y1 + extent->height,
507 });
508 }
509
510 static void
511 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
512 {
513 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
514 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
515 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
516 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
517 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
518 CP_LOAD_STATE6_0_NUM_UNIT(1));
519 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
520 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
521 switch (format) {
522 case VK_FORMAT_X8_D24_UNORM_PACK32:
523 case VK_FORMAT_D24_UNORM_S8_UINT: {
524 /* cleared as r8g8b8a8_unorm using special format */
525 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
526 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
527 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
528 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
529 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
530 } break;
531 case VK_FORMAT_D16_UNORM:
532 case VK_FORMAT_D32_SFLOAT:
533 tu_cs_emit(cs, fui(val->depthStencil.depth));
534 tu_cs_emit(cs, 0);
535 tu_cs_emit(cs, 0);
536 tu_cs_emit(cs, 0);
537 break;
538 case VK_FORMAT_S8_UINT:
539 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
540 tu_cs_emit(cs, 0);
541 tu_cs_emit(cs, 0);
542 tu_cs_emit(cs, 0);
543 break;
544 default:
545 /* color formats use the clear value as-is */
546 assert(!vk_format_is_depth_or_stencil(format));
547 tu_cs_emit_array(cs, val->color.uint32, 4);
548 break;
549 }
550 }
551
552 static void
553 r3d_src_common(struct tu_cmd_buffer *cmd,
554 struct tu_cs *cs,
555 const uint32_t *tex_const,
556 uint32_t offset_base,
557 uint32_t offset_ubwc,
558 VkFilter filter)
559 {
560 struct tu_cs_memory texture = { };
561 VkResult result = tu_cs_alloc(&cmd->sub_cs,
562 2, /* allocate space for a sampler too */
563 A6XX_TEX_CONST_DWORDS, &texture);
564 assert(result == VK_SUCCESS);
565
566 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
567
568 /* patch addresses for layer offset */
569 *(uint64_t*) (texture.map + 4) += offset_base;
570 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
571 texture.map[7] = ubwc_addr;
572 texture.map[8] = ubwc_addr >> 32;
573
574 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
575 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
576 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
577 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
578 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
579 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
580 0x60000; /* XXX used by blob, doesn't seem necessary */
581 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
582 0x1 | /* XXX used by blob, doesn't seem necessary */
583 A6XX_TEX_SAMP_1_UNNORM_COORDS |
584 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
585 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
586 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
587
588 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
589 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
590 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
591 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
592 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
593 CP_LOAD_STATE6_0_NUM_UNIT(1));
594 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
595
596 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
597 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
598
599 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
600 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
601 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
602 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
603 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
604 CP_LOAD_STATE6_0_NUM_UNIT(1));
605 tu_cs_emit_qw(cs, texture.iova);
606
607 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
608 tu_cs_emit_qw(cs, texture.iova);
609
610 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
611 }
612
613 static void
614 r3d_src(struct tu_cmd_buffer *cmd,
615 struct tu_cs *cs,
616 const struct tu_image_view *iview,
617 uint32_t layer,
618 VkFilter filter)
619 {
620 r3d_src_common(cmd, cs, iview->descriptor,
621 iview->layer_size * layer,
622 iview->ubwc_layer_size * layer,
623 filter);
624 }
625
626 static void
627 r3d_src_buffer(struct tu_cmd_buffer *cmd,
628 struct tu_cs *cs,
629 VkFormat vk_format,
630 uint64_t va, uint32_t pitch,
631 uint32_t width, uint32_t height)
632 {
633 uint32_t desc[A6XX_TEX_CONST_DWORDS];
634
635 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
636
637 desc[0] =
638 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
639 A6XX_TEX_CONST_0_FMT(format.fmt) |
640 A6XX_TEX_CONST_0_SWAP(format.swap) |
641 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
642 // XXX to swizzle into .w for stencil buffer_to_image
643 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
644 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
645 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
646 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
647 desc[2] =
648 A6XX_TEX_CONST_2_PITCH(pitch) |
649 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
650 desc[3] = 0;
651 desc[4] = va;
652 desc[5] = va >> 32;
653 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
654 desc[i] = 0;
655
656 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
657 }
658
659 static void
660 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
661 {
662 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
663
664 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
665 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
666 tu_cs_image_ref(cs, iview, layer);
667 tu_cs_emit(cs, 0);
668
669 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
670 tu_cs_image_flag_ref(cs, iview, layer);
671
672 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
673 }
674
675 static void
676 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
677 {
678 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
679
680 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
681 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
682 tu_cs_image_stencil_ref(cs, iview, layer);
683 tu_cs_emit(cs, 0);
684
685 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
686 }
687
688 static void
689 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
690 {
691 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
692
693 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
694
695 tu_cs_emit_regs(cs,
696 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
697 A6XX_RB_MRT_PITCH(0, pitch),
698 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
699 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
700 A6XX_RB_MRT_BASE_HI(0, va >> 32),
701 A6XX_RB_MRT_BASE_GMEM(0, 0));
702
703 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
704 }
705
706 static uint8_t
707 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
708 {
709 uint8_t mask = 0xf;
710 assert(aspect_mask);
711 /* note: the only format with partial writing is D24S8;
712 * clear/blit uses the _AS_R8G8B8A8 format to access it
713 */
714 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
715 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
716 mask = 0x7;
717 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
718 mask = 0x8;
719 }
720 return mask;
721 }
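/* For example, writing only the depth aspect of D24S8 (accessed as
 * R8G8B8A8) enables the r/g/b channels holding the 24 depth bits (0x7),
 * while writing only the stencil aspect enables just the alpha channel
 * (0x8).
 */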
722
723 static void
724 r3d_setup(struct tu_cmd_buffer *cmd,
725 struct tu_cs *cs,
726 VkFormat vk_format,
727 VkImageAspectFlags aspect_mask,
728 enum a6xx_rotation rotation,
729 bool clear,
730 bool ubwc)
731 {
732 enum a6xx_format format = tu6_base_format(vk_format);
733
734 if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
735 vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
736 format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
737 }
738
739 if (!cmd->state.pass) {
740 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
741 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
742 }
743
744 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
745 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
746
747 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
748
749 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
750 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
751 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
752 0xfc000000);
753 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
754
755 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
756 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
757
758 tu_cs_emit_regs(cs,
759 A6XX_RB_FS_OUTPUT_CNTL0(),
760 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
761
762 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
763 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
764 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
765
766 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
767 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
768 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
769 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
770 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
771 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
772 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
773
774 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
775 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
776
777 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
778 .color_format = format,
779 .color_sint = vk_format_is_sint(vk_format),
780 .color_uint = vk_format_is_uint(vk_format)));
781
782 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
783 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
784 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
785 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
786
787 if (cmd->state.predication_active) {
788 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
789 tu_cs_emit(cs, 0);
790 }
791 }
792
793 static void
794 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
795 {
796 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
797 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
798 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
799 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
800 tu_cs_emit(cs, 1); /* instance count */
801 tu_cs_emit(cs, 2); /* vertex count */
802 }
803
804 static void
805 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
806 {
807 if (cmd->state.predication_active) {
808 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
809 tu_cs_emit(cs, 1);
810 }
811 }
812
813 /* blit ops - common interface for 2d/shader paths */
814
815 struct blit_ops {
816 void (*coords)(struct tu_cs *cs,
817 const VkOffset2D *dst,
818 const VkOffset2D *src,
819 const VkExtent2D *extent);
820 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
821 void (*src)(
822 struct tu_cmd_buffer *cmd,
823 struct tu_cs *cs,
824 const struct tu_image_view *iview,
825 uint32_t layer,
826 VkFilter filter);
827 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
828 VkFormat vk_format,
829 uint64_t va, uint32_t pitch,
830 uint32_t width, uint32_t height);
831 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
832 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
833 void (*setup)(struct tu_cmd_buffer *cmd,
834 struct tu_cs *cs,
835 VkFormat vk_format,
836 VkImageAspectFlags aspect_mask,
837 enum a6xx_rotation rotation,
838 bool clear,
839 bool ubwc);
840 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
841 void (*teardown)(struct tu_cmd_buffer *cmd,
842 struct tu_cs *cs);
843 };
844
845 static const struct blit_ops r2d_ops = {
846 .coords = r2d_coords,
847 .clear_value = r2d_clear_value,
848 .src = r2d_src,
849 .src_buffer = r2d_src_buffer,
850 .dst = r2d_dst,
851 .dst_buffer = r2d_dst_buffer,
852 .setup = r2d_setup,
853 .run = r2d_run,
854 .teardown = r2d_teardown,
855 };
856
857 static const struct blit_ops r3d_ops = {
858 .coords = r3d_coords,
859 .clear_value = r3d_clear_value,
860 .src = r3d_src,
861 .src_buffer = r3d_src_buffer,
862 .dst = r3d_dst,
863 .dst_buffer = r3d_dst_buffer,
864 .setup = r3d_setup,
865 .run = r3d_run,
866 .teardown = r3d_teardown,
867 };
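/* A typical use of the blit_ops interface, as in the copy/clear helpers
 * below (a sketch, not an actual call site):
 *
 *    ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, clear, ubwc);
 *    ops->coords(cs, &dst_offset, &src_offset, &extent);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src_view, i, VK_FILTER_NEAREST);
 *       ops->dst(cs, &dst_view, i);
 *       ops->run(cmd, cs);
 *    }
 *    ops->teardown(cmd, cs);
 */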
868
869 /* passthrough: set coords from 3D extents */
870 static void
871 coords(const struct blit_ops *ops,
872 struct tu_cs *cs,
873 const VkOffset3D *dst,
874 const VkOffset3D *src,
875 const VkExtent3D *extent)
876 {
877 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
878 }
879
880 static VkFormat
881 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
882 {
883 if (vk_format_is_compressed(format)) {
884 switch (vk_format_get_blocksize(format)) {
885 case 1: return VK_FORMAT_R8_UINT;
886 case 2: return VK_FORMAT_R16_UINT;
887 case 4: return VK_FORMAT_R32_UINT;
888 case 8: return VK_FORMAT_R32G32_UINT;
889 case 16: return VK_FORMAT_R32G32B32A32_UINT;
890 default:
891 unreachable("unhandled format size");
892 }
893 }
894
895 switch (format) {
896 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
897 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
898 return VK_FORMAT_R8G8_UNORM;
899 /* fallthrough */
900 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
901 return VK_FORMAT_R8_UNORM;
902 case VK_FORMAT_D24_UNORM_S8_UINT:
903 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
904 return VK_FORMAT_R8_UNORM;
905 /* fallthrough */
906 default:
907 return format;
908 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
909 return VK_FORMAT_R32_UINT;
910 case VK_FORMAT_D32_SFLOAT_S8_UINT:
911 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
912 return VK_FORMAT_S8_UINT;
913 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
914 return VK_FORMAT_D32_SFLOAT;
915 }
916 }
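/* For example, copying the stencil aspect of D24S8 to or from a buffer is
 * done as R8_UNORM, and an 8-byte-block compressed format such as BC1 is
 * copied as R32G32_UINT.
 */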
917
918 static void
919 tu_image_view_copy_blit(struct tu_image_view *iview,
920 struct tu_image *image,
921 VkFormat format,
922 const VkImageSubresourceLayers *subres,
923 uint32_t layer,
924 bool stencil_read)
925 {
926 VkImageAspectFlags aspect_mask = subres->aspectMask;
927
928 /* always use the AS_R8G8B8A8 format for these */
929 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
930 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
931 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
932 }
933
934 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
935 .image = tu_image_to_handle(image),
936 .viewType = VK_IMAGE_VIEW_TYPE_2D,
937 .format = format,
938 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
939 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
940 .subresourceRange = {
941 .aspectMask = aspect_mask,
942 .baseMipLevel = subres->mipLevel,
943 .levelCount = 1,
944 .baseArrayLayer = subres->baseArrayLayer + layer,
945 .layerCount = 1,
946 },
947 }, false);
948 }
949
950 static void
951 tu_image_view_copy(struct tu_image_view *iview,
952 struct tu_image *image,
953 VkFormat format,
954 const VkImageSubresourceLayers *subres,
955 uint32_t layer,
956 bool stencil_read)
957 {
958 format = copy_format(format, subres->aspectMask, false);
959 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
960 }
961
962 static void
963 tu_image_view_blit(struct tu_image_view *iview,
964 struct tu_image *image,
965 const VkImageSubresourceLayers *subres,
966 uint32_t layer)
967 {
968 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
969 }
970
971 static void
972 tu6_blit_image(struct tu_cmd_buffer *cmd,
973 struct tu_image *src_image,
974 struct tu_image *dst_image,
975 const VkImageBlit *info,
976 VkFilter filter)
977 {
978 const struct blit_ops *ops = &r2d_ops;
979 struct tu_cs *cs = &cmd->cs;
980 uint32_t layers;
981
982 /* the 2D blit can't mirror from just coordinates, so mirroring is done via rotation */
983 static const enum a6xx_rotation rotate[2][2] = {
984 {ROTATE_0, ROTATE_HFLIP},
985 {ROTATE_VFLIP, ROTATE_180},
986 };
987
988 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
989 (info->dstOffsets[1].x < info->dstOffsets[0].x);
990 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
991 (info->dstOffsets[1].y < info->dstOffsets[0].y);
992 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
993 (info->dstOffsets[1].z < info->dstOffsets[0].z);
994
995 if (mirror_z) {
996 tu_finishme("blit z mirror\n");
997 return;
998 }
999
1000 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1001 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1002 tu_finishme("blit z filter\n");
1003 return;
1004 }
1005
1006 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1007 if (info->dstSubresource.layerCount > 1) {
1008 assert(layers <= 1);
1009 layers = info->dstSubresource.layerCount;
1010 }
1011
1012 /* BC1_RGB_* formats need to have their last component overridden with 1
1013 * when sampling, which is normally handled with the texture descriptor
1014 * swizzle. The 2d path can't handle that, so use the 3d path.
1015 *
1016 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1017 * the 2d path.
1018 */
1019
1020 if (dst_image->samples > 1 ||
1021 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1022 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1023 filter == VK_FILTER_CUBIC_EXT)
1024 ops = &r3d_ops;
1025
1026 /* use the right format in setup() for D32_S8
1027 * TODO: this probably should use a helper
1028 */
1029 VkFormat format = dst_image->vk_format;
1030 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1031 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1032 format = VK_FORMAT_D32_SFLOAT;
1033 else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1034 format = VK_FORMAT_S8_UINT;
1035 else
1036 unreachable("unexpected D32_S8 aspect mask in blit_image");
1037 }
1038
1039 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1040 rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
1041
1042 if (ops == &r3d_ops) {
1043 r3d_coords_raw(cs, (float[]) {
1044 info->dstOffsets[0].x, info->dstOffsets[0].y,
1045 info->srcOffsets[0].x, info->srcOffsets[0].y,
1046 info->dstOffsets[1].x, info->dstOffsets[1].y,
1047 info->srcOffsets[1].x, info->srcOffsets[1].y
1048 });
1049 } else {
1050 tu_cs_emit_regs(cs,
1051 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1052 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1053 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1054 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1055 tu_cs_emit_regs(cs,
1056 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1057 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1058 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1059 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1060 }
1061
1062 struct tu_image_view dst, src;
1063 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1064 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1065
1066 for (uint32_t i = 0; i < layers; i++) {
1067 ops->dst(cs, &dst, i);
1068 ops->src(cmd, cs, &src, i, filter);
1069 ops->run(cmd, cs);
1070 }
1071
1072 ops->teardown(cmd, cs);
1073 }
1074
1075 void
1076 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1077 VkImage srcImage,
1078 VkImageLayout srcImageLayout,
1079 VkImage dstImage,
1080 VkImageLayout dstImageLayout,
1081 uint32_t regionCount,
1082 const VkImageBlit *pRegions,
1083 VkFilter filter)
1084
1085 {
1086 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1087 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1088 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1089
1090 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1091 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1092
1093 for (uint32_t i = 0; i < regionCount; ++i) {
1094 /* can't blit both depth and stencil at once with D32_S8
1095 * TODO: more advanced 3D blit path to support it instead?
1096 */
1097 if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1098 dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1099 VkImageBlit region = pRegions[i];
1100 uint32_t b;
1101 for_each_bit(b, pRegions[i].dstSubresource.aspectMask) {
1102 region.srcSubresource.aspectMask = BIT(b);
1103 region.dstSubresource.aspectMask = BIT(b);
1104 tu6_blit_image(cmd, src_image, dst_image, &region, filter);
1105 }
1106 continue;
1107 }
1108 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1109 }
1110 }
1111
1112 static void
1113 copy_compressed(VkFormat format,
1114 VkOffset3D *offset,
1115 VkExtent3D *extent,
1116 uint32_t *width,
1117 uint32_t *height)
1118 {
1119 if (!vk_format_is_compressed(format))
1120 return;
1121
1122 uint32_t block_width = vk_format_get_blockwidth(format);
1123 uint32_t block_height = vk_format_get_blockheight(format);
1124
1125 offset->x /= block_width;
1126 offset->y /= block_height;
1127
1128 if (extent) {
1129 extent->width = DIV_ROUND_UP(extent->width, block_width);
1130 extent->height = DIV_ROUND_UP(extent->height, block_height);
1131 }
1132 if (width)
1133 *width = DIV_ROUND_UP(*width, block_width);
1134 if (height)
1135 *height = DIV_ROUND_UP(*height, block_height);
1136 }
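/* For example, a 10x10 texel region of a BC1 image (4x4 blocks) becomes a
 * 3x3 block region (DIV_ROUND_UP(10, 4) = 3), with the offset divided by
 * the block dimensions as well.
 */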
1137
1138 static void
1139 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1140 struct tu_buffer *src_buffer,
1141 struct tu_image *dst_image,
1142 const VkBufferImageCopy *info)
1143 {
1144 struct tu_cs *cs = &cmd->cs;
1145 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1146 VkFormat src_format =
1147 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1148 const struct blit_ops *ops = &r2d_ops;
1149
1150 /* special case for buffer to stencil */
1151 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1152 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1153 ops = &r3d_ops;
1154 }
1155
1156 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1157 * which matters for UBWC. buffer_to_image/etc can fail because of this
1158 */
1159
1160 VkOffset3D offset = info->imageOffset;
1161 VkExtent3D extent = info->imageExtent;
1162 uint32_t src_width = info->bufferRowLength ?: extent.width;
1163 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1164
1165 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1166
1167 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1168 uint32_t layer_size = src_height * pitch;
1169
1170 ops->setup(cmd, cs,
1171 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1172 info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
1173
1174 struct tu_image_view dst;
1175 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1176
1177 for (uint32_t i = 0; i < layers; i++) {
1178 ops->dst(cs, &dst, i);
1179
1180 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1181 if ((src_va & 63) || (pitch & 63)) {
1182 for (uint32_t y = 0; y < extent.height; y++) {
1183 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1184 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1185 x + extent.width, 1);
1186 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1187 &(VkExtent2D) {extent.width, 1});
1188 ops->run(cmd, cs);
1189 src_va += pitch;
1190 }
1191 } else {
1192 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1193 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1194 ops->run(cmd, cs);
1195 }
1196 }
1197
1198 ops->teardown(cmd, cs);
1199 }
1200
1201 void
1202 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1203 VkBuffer srcBuffer,
1204 VkImage dstImage,
1205 VkImageLayout dstImageLayout,
1206 uint32_t regionCount,
1207 const VkBufferImageCopy *pRegions)
1208 {
1209 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1210 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1211 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1212
1213 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1214 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1215
1216 for (unsigned i = 0; i < regionCount; ++i)
1217 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1218 }
1219
1220 static void
1221 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1222 struct tu_image *src_image,
1223 struct tu_buffer *dst_buffer,
1224 const VkBufferImageCopy *info)
1225 {
1226 struct tu_cs *cs = &cmd->cs;
1227 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1228 VkFormat dst_format =
1229 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1230 bool stencil_read = false;
1231
1232 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1233 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1234 stencil_read = true;
1235 }
1236
1237 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1238 VkOffset3D offset = info->imageOffset;
1239 VkExtent3D extent = info->imageExtent;
1240 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1241 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1242
1243 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1244
1245 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1246 uint32_t layer_size = pitch * dst_height;
1247
1248 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1249
1250 struct tu_image_view src;
1251 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1252
1253 for (uint32_t i = 0; i < layers; i++) {
1254 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1255
1256 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1257 if ((dst_va & 63) || (pitch & 63)) {
1258 for (uint32_t y = 0; y < extent.height; y++) {
1259 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1260 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1261 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1262 &(VkExtent2D) {extent.width, 1});
1263 ops->run(cmd, cs);
1264 dst_va += pitch;
1265 }
1266 } else {
1267 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1268 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1269 ops->run(cmd, cs);
1270 }
1271 }
1272
1273 ops->teardown(cmd, cs);
1274 }
1275
1276 void
1277 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1278 VkImage srcImage,
1279 VkImageLayout srcImageLayout,
1280 VkBuffer dstBuffer,
1281 uint32_t regionCount,
1282 const VkBufferImageCopy *pRegions)
1283 {
1284 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1285 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1286 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1287
1288 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1289 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1290
1291 for (unsigned i = 0; i < regionCount; ++i)
1292 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1293 }
1294
1295 /* Tiled formats don't support swapping, which means that we can't support
1296 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1297 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1298 * Currently we fake support for tiled swapped formats and use the unswapped
1299 * format instead, but this means that reinterpreting copies to and from
1300 * swapped formats can't be performed correctly unless we can swizzle the
1301 * components by reinterpreting the other image as the "correct" swapped
1302 * format, i.e. only when the other image is linear.
1303 */
1304
1305 static bool
1306 is_swapped_format(VkFormat format)
1307 {
1308 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1309 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1310 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1311 }
1312
1313 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1314 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1315 * versa). This should mirror the logic in fdl6_layout.
1316 */
1317 static bool
1318 image_is_r8g8(struct tu_image *image)
1319 {
1320 return image->layout[0].cpp == 2 &&
1321 vk_format_get_nr_components(image->vk_format) == 2;
1322 }
1323
1324 static void
1325 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1326 struct tu_image *src_image,
1327 struct tu_image *dst_image,
1328 const VkImageCopy *info)
1329 {
1330 const struct blit_ops *ops = &r2d_ops;
1331 struct tu_cs *cs = &cmd->cs;
1332
1333 if (dst_image->samples > 1)
1334 ops = &r3d_ops;
1335
1336 VkFormat format = VK_FORMAT_UNDEFINED;
1337 VkOffset3D src_offset = info->srcOffset;
1338 VkOffset3D dst_offset = info->dstOffset;
1339 VkExtent3D extent = info->extent;
1340
1341 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1342 * Images":
1343 *
1344 * When copying between compressed and uncompressed formats the extent
1345 * members represent the texel dimensions of the source image and not
1346 * the destination. When copying from a compressed image to an
1347 * uncompressed image the image texel dimensions written to the
1348 * uncompressed image will be source extent divided by the compressed
1349 * texel block dimensions. When copying from an uncompressed image to a
1350 * compressed image the image texel dimensions written to the compressed
1351 * image will be the source extent multiplied by the compressed texel
1352 * block dimensions.
1353 *
1354 * This means we only have to adjust the extent if the source image is
1355 * compressed.
1356 */
1357 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1358 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1359
1360 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1361 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1362
1363 bool use_staging_blit = false;
1364
1365 if (src_format == dst_format) {
1366 /* Images that share a format can always be copied directly because it's
1367 * the same as a blit.
1368 */
1369 format = src_format;
1370 } else if (!src_image->layout[0].tile_mode) {
1371 /* If an image is linear, we can always safely reinterpret it with the
1372 * other image's format and then do a regular blit.
1373 */
1374 format = dst_format;
1375 } else if (!dst_image->layout[0].tile_mode) {
1376 format = src_format;
1377 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1378 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1379 * due to the different tile layout.
1380 */
1381 use_staging_blit = true;
1382 } else if (is_swapped_format(src_format) ||
1383 is_swapped_format(dst_format)) {
1384 /* If either format has a non-identity swap, then we can't copy
1385 * to/from it.
1386 */
1387 use_staging_blit = true;
1388 } else if (!src_image->layout[0].ubwc) {
1389 format = dst_format;
1390 } else if (!dst_image->layout[0].ubwc) {
1391 format = src_format;
1392 } else {
1393 /* Both formats use UBWC and so neither can be reinterpreted.
1394 * TODO: We could do an in-place decompression of the dst instead.
1395 */
1396 use_staging_blit = true;
1397 }
1398
1399 struct tu_image_view dst, src;
1400
1401 if (use_staging_blit) {
1402 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1403 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1404
1405 struct tu_image staging_image = {
1406 .vk_format = src_format,
1407 .type = src_image->type,
1408 .tiling = VK_IMAGE_TILING_LINEAR,
1409 .extent = extent,
1410 .level_count = 1,
1411 .layer_count = info->srcSubresource.layerCount,
1412 .samples = src_image->samples,
1413 .bo_offset = 0,
1414 };
1415
1416 VkImageSubresourceLayers staging_subresource = {
1417 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1418 .mipLevel = 0,
1419 .baseArrayLayer = 0,
1420 .layerCount = info->srcSubresource.layerCount,
1421 };
1422
1423 VkOffset3D staging_offset = { 0 };
1424
1425 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1426 staging_image.layout[0].ubwc = false;
1427
1428 fdl6_layout(&staging_image.layout[0],
1429 vk_format_to_pipe_format(staging_image.vk_format),
1430 staging_image.samples,
1431 staging_image.extent.width,
1432 staging_image.extent.height,
1433 staging_image.extent.depth,
1434 staging_image.level_count,
1435 staging_image.layer_count,
1436 staging_image.type == VK_IMAGE_TYPE_3D,
1437 NULL);
1438
1439 VkResult result = tu_get_scratch_bo(cmd->device,
1440 staging_image.layout[0].size,
1441 &staging_image.bo);
1442 if (result != VK_SUCCESS) {
1443 cmd->record_result = result;
1444 return;
1445 }
1446
1447 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1448 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1449
1450 struct tu_image_view staging;
1451 tu_image_view_copy(&staging, &staging_image, src_format,
1452 &staging_subresource, 0, false);
1453
1454 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1455 coords(ops, cs, &staging_offset, &src_offset, &extent);
1456
1457 for (uint32_t i = 0; i < info->extent.depth; i++) {
1458 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1459 ops->dst(cs, &staging, i);
1460 ops->run(cmd, cs);
1461 }
1462
1463 /* When executed by the user there has to be a pipeline barrier here,
1464 * but since we're doing it manually we'll have to flush ourselves.
1465 */
1466 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1467 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1468
1469 tu_image_view_copy(&staging, &staging_image, dst_format,
1470 &staging_subresource, 0, false);
1471
1472 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1473 ROTATE_0, false, dst_image->layout[0].ubwc);
1474 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1475
1476 for (uint32_t i = 0; i < info->extent.depth; i++) {
1477 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1478 ops->dst(cs, &dst, i);
1479 ops->run(cmd, cs);
1480 }
1481 } else {
1482 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1483 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1484
1485 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1486 ROTATE_0, false, dst_image->layout[0].ubwc);
1487 coords(ops, cs, &dst_offset, &src_offset, &extent);
1488
1489 for (uint32_t i = 0; i < info->extent.depth; i++) {
1490 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1491 ops->dst(cs, &dst, i);
1492 ops->run(cmd, cs);
1493 }
1494 }
1495
1496 ops->teardown(cmd, cs);
1497 }
1498
1499 void
1500 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1501 VkImage srcImage,
1502 VkImageLayout srcImageLayout,
1503 VkImage destImage,
1504 VkImageLayout destImageLayout,
1505 uint32_t regionCount,
1506 const VkImageCopy *pRegions)
1507 {
1508 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1509 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1510 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1511
1512 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1513 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1514
1515 for (uint32_t i = 0; i < regionCount; ++i)
1516 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1517 }
1518
1519 static void
1520 copy_buffer(struct tu_cmd_buffer *cmd,
1521 uint64_t dst_va,
1522 uint64_t src_va,
1523 uint64_t size,
1524 uint32_t block_size)
1525 {
1526 const struct blit_ops *ops = &r2d_ops;
1527 struct tu_cs *cs = &cmd->cs;
1528 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1529 uint64_t blocks = size / block_size;
1530
1531 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1532
1533 while (blocks) {
1534 uint32_t src_x = (src_va & 63) / block_size;
1535 uint32_t dst_x = (dst_va & 63) / block_size;
1536 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1537
1538 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1539 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1540 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1541 ops->run(cmd, cs);
1542
1543 src_va += width * block_size;
1544 dst_va += width * block_size;
1545 blocks -= width;
1546 }
1547
1548 ops->teardown(cmd, cs);
1549 }
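/* Note the copy above is expressed with 64-byte-aligned base addresses: a
 * misaligned address is aligned down and the difference folded into the x
 * coordinate, e.g. with R8_UNORM a source address with (src_va & 63) == 0x21
 * starts the copy at x = 33.
 */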
1550
1551 void
1552 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1553 VkBuffer srcBuffer,
1554 VkBuffer dstBuffer,
1555 uint32_t regionCount,
1556 const VkBufferCopy *pRegions)
1557 {
1558 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1559 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1560 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1561
1562 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1563 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1564
1565 for (unsigned i = 0; i < regionCount; ++i) {
1566 copy_buffer(cmd,
1567 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1568 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1569 pRegions[i].size, 1);
1570 }
1571 }
1572
1573 void
1574 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1575 VkBuffer dstBuffer,
1576 VkDeviceSize dstOffset,
1577 VkDeviceSize dataSize,
1578 const void *pData)
1579 {
1580 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1581 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1582
1583 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1584
1585 struct tu_cs_memory tmp;
1586 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1587 if (result != VK_SUCCESS) {
1588 cmd->record_result = result;
1589 return;
1590 }
1591
1592 memcpy(tmp.map, pData, dataSize);
1593 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1594 }
1595
1596 void
1597 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1598 VkBuffer dstBuffer,
1599 VkDeviceSize dstOffset,
1600 VkDeviceSize fillSize,
1601 uint32_t data)
1602 {
1603 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1604 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1605 const struct blit_ops *ops = &r2d_ops;
1606 struct tu_cs *cs = &cmd->cs;
1607
1608 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1609
1610 if (fillSize == VK_WHOLE_SIZE)
1611 fillSize = buffer->size - dstOffset;
1612
1613 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1614 uint32_t blocks = fillSize / 4;
1615
1616 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
1617 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1618
1619 while (blocks) {
1620 uint32_t dst_x = (dst_va & 63) / 4;
1621 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1622
1623 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1624 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1625 ops->run(cmd, cs);
1626
1627 dst_va += width * 4;
1628 blocks -= width;
1629 }
1630
1631 ops->teardown(cmd, cs);
1632 }
1633
1634 void
1635 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1636 VkImage srcImage,
1637 VkImageLayout srcImageLayout,
1638 VkImage dstImage,
1639 VkImageLayout dstImageLayout,
1640 uint32_t regionCount,
1641 const VkImageResolve *pRegions)
1642 {
1643 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1644 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1645 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1646 const struct blit_ops *ops = &r2d_ops;
1647 struct tu_cs *cs = &cmd->cs;
1648
1649 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1650 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1651
1652 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1653 ROTATE_0, false, dst_image->layout[0].ubwc);
1654
1655 for (uint32_t i = 0; i < regionCount; ++i) {
1656 const VkImageResolve *info = &pRegions[i];
1657 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1658
1659 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1660 /* TODO: aspect masks possible? */
1661
1662 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1663
1664 struct tu_image_view dst, src;
1665 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1666 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1667
1668 for (uint32_t i = 0; i < layers; i++) {
1669 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1670 ops->dst(cs, &dst, i);
1671 ops->run(cmd, cs);
1672 }
1673 }
1674
1675 ops->teardown(cmd, cs);
1676 }
1677
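/* Iterate over the views set in layer_mask (multiview), or over
 * [0, layers) when the mask is zero.
 */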
1678 #define for_each_layer(layer, layer_mask, layers) \
1679 for (uint32_t layer = 0; \
1680 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
1681 layer++) \
1682 if (!layer_mask || (layer_mask & BIT(layer)))
1683
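/* Resolve src into dst over the given rect for the sysmem rendering path,
 * once per enabled view (or per layer when multiview is not used).
 */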
1684 void
1685 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1686 struct tu_cs *cs,
1687 struct tu_image_view *src,
1688 struct tu_image_view *dst,
1689 uint32_t layer_mask,
1690 uint32_t layers,
1691 const VkRect2D *rect)
1692 {
1693 const struct blit_ops *ops = &r2d_ops;
1694
1695 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1696 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1697
1698 assert(src->image->vk_format == dst->image->vk_format);
1699
1700 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1701 ROTATE_0, false, dst->ubwc_enabled);
1702 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1703
1704 for_each_layer(i, layer_mask, layers) {
1705 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1706 ops->dst(cs, dst, i);
1707 ops->run(cmd, cs);
1708 }
1709
1710 ops->teardown(cmd, cs);
1711 }
1712
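/* Clear every requested mip level and layer of an image. The 3D path is used
 * for multisampled images, the 2D engine otherwise; D32S8 and E5B9G9R9 are
 * first remapped to a clearable format with copy_format().
 */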
1713 static void
1714 clear_image(struct tu_cmd_buffer *cmd,
1715 struct tu_image *image,
1716 const VkClearValue *clear_value,
1717 const VkImageSubresourceRange *range,
1718 VkImageAspectFlags aspect_mask)
1719 {
1720 uint32_t level_count = tu_get_levelCount(image, range);
1721 uint32_t layer_count = tu_get_layerCount(image, range);
1722 struct tu_cs *cs = &cmd->cs;
1723 VkFormat format = image->vk_format;
1724 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1725 format = copy_format(format, aspect_mask, false);
1726
1727 if (image->type == VK_IMAGE_TYPE_3D) {
1728 assert(layer_count == 1);
1729 assert(range->baseArrayLayer == 0);
1730 }
1731
1732 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1733
1734 ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
1735 if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1736 ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
1737 else
1738 ops->clear_value(cs, format, clear_value);
1739
1740 for (unsigned j = 0; j < level_count; j++) {
1741 if (image->type == VK_IMAGE_TYPE_3D)
1742 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1743
1744 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1745 u_minify(image->extent.width, range->baseMipLevel + j),
1746 u_minify(image->extent.height, range->baseMipLevel + j)
1747 });
1748
1749 struct tu_image_view dst;
1750 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1751 .aspectMask = aspect_mask,
1752 .mipLevel = range->baseMipLevel + j,
1753 .baseArrayLayer = range->baseArrayLayer,
1754 .layerCount = 1,
1755 }, 0, false);
1756
1757 for (uint32_t i = 0; i < layer_count; i++) {
1758 ops->dst(cs, &dst, i);
1759 ops->run(cmd, cs);
1760 }
1761 }
1762
1763 ops->teardown(cmd, cs);
1764 }
1765
1766 void
1767 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1768 VkImage image_h,
1769 VkImageLayout imageLayout,
1770 const VkClearColorValue *pColor,
1771 uint32_t rangeCount,
1772 const VkImageSubresourceRange *pRanges)
1773 {
1774 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1775 TU_FROM_HANDLE(tu_image, image, image_h);
1776
1777 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1778
1779 for (unsigned i = 0; i < rangeCount; i++)
1780 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
1781 }
1782
1783 void
1784 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1785 VkImage image_h,
1786 VkImageLayout imageLayout,
1787 const VkClearDepthStencilValue *pDepthStencil,
1788 uint32_t rangeCount,
1789 const VkImageSubresourceRange *pRanges)
1790 {
1791 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1792 TU_FROM_HANDLE(tu_image, image, image_h);
1793
1794 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1795
1796 for (unsigned i = 0; i < rangeCount; i++) {
1797 const VkImageSubresourceRange *range = &pRanges[i];
1798
1799 if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1800 /* can't clear both depth and stencil at once, split up the aspect mask */
1801 uint32_t b;
1802 for_each_bit(b, range->aspectMask)
1803 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
1804 continue;
1805 }
1806
1807 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
1808 }
1809 }
1810
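/* vkCmdClearAttachments, sysmem (draw) path: emit a draw that writes the clear
 * values straight to the attachments. Only the render targets being cleared
 * get an output register and a non-zero write mask, and the clear values are
 * loaded as fragment shader constants below.
 */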
1811 static void
1812 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1813 uint32_t attachment_count,
1814 const VkClearAttachment *attachments,
1815 uint32_t rect_count,
1816 const VkClearRect *rects)
1817 {
1818    /* the shader path here is special: it avoids changing MRT/etc. state */
1819 const struct tu_render_pass *pass = cmd->state.pass;
1820 const struct tu_subpass *subpass = cmd->state.subpass;
1821 const uint32_t mrt_count = subpass->color_count;
1822 struct tu_cs *cs = &cmd->draw_cs;
1823 uint32_t clear_value[MAX_RTS][4];
1824 float z_clear_val = 0.0f;
1825 uint8_t s_clear_val = 0;
1826 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1827 bool z_clear = false;
1828 bool s_clear = false;
1829 bool layered_clear = false;
1830 uint32_t max_samples = 1;
1831
1832 for (uint32_t i = 0; i < attachment_count; i++) {
1833 uint32_t a;
1834 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1835 uint32_t c = attachments[i].colorAttachment;
1836 a = subpass->color_attachments[c].attachment;
1837 if (a == VK_ATTACHMENT_UNUSED)
1838 continue;
1839
1840 clear_rts |= 1 << c;
1841 clear_components |= 0xf << (c * 4);
1842 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1843 } else {
1844 a = subpass->depth_stencil_attachment.attachment;
1845 if (a == VK_ATTACHMENT_UNUSED)
1846 continue;
1847
1848 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1849 z_clear = true;
1850 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1851 }
1852
1853 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1854 s_clear = true;
1855 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1856 }
1857 }
1858
1859 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1860 }
1861
1862    /* Disable all draw states so they don't interfere.
1863     * TODO: use and re-use draw states.
1864     * We have to disable the draw states individually to preserve the
1865     * input attachment states, because a secondary command buffer
1866     * won't be able to restore them.
1867     */
1868 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1869 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1870 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1871 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1872 continue;
1873 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1874 CP_SET_DRAW_STATE__0_DISABLE);
1875 tu_cs_emit_qw(cs, 0);
1876 }
1877 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1878
1879 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1880 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1881 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1882 0xfc000000);
1883 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1884
1885 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1886 for (uint32_t i = 0; i < mrt_count; i++) {
1887 if (clear_rts & (1 << i))
1888 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1889 else
1890 tu_cs_emit(cs, 0);
1891 }
1892
1893 for (uint32_t i = 0; i < rect_count; i++) {
1894 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1895 layered_clear = true;
1896 }
1897
1898 /* a630 doesn't support multiview masks, which means that we can't use the
1899 * normal multiview path without potentially recompiling a shader on-demand
1900 * or using a more complicated variant that takes the mask as a const. Just
1901 * use the layered path instead, since it shouldn't be much worse.
1902 */
1903 if (subpass->multiview_mask) {
1904 layered_clear = true;
1905 }
1906
1907 r3d_common(cmd, cs, false, num_rts, layered_clear);
1908
1909 tu_cs_emit_regs(cs,
1910 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1911 tu_cs_emit_regs(cs,
1912 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1913
1914 tu_cs_emit_regs(cs,
1915 A6XX_RB_FS_OUTPUT_CNTL0(),
1916 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1917
1918 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1919 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1920 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1921 for (uint32_t i = 0; i < mrt_count; i++) {
1922 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1923 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1924 }
1925
1926 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1927 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1928 .z_enable = z_clear,
1929 .z_write_enable = z_clear,
1930 .zfunc = FUNC_ALWAYS));
1931 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1932 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1933 .stencil_enable = s_clear,
1934 .func = FUNC_ALWAYS,
1935 .zpass = STENCIL_REPLACE));
1936 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1937 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1938 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1939
1940 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1941 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1942 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1943 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1944 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1945 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1946 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1947 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1948 for_each_bit(b, clear_rts)
1949 tu_cs_emit_array(cs, clear_value[b], 4);
1950
1951 for (uint32_t i = 0; i < rect_count; i++) {
1952 /* This should be true because of this valid usage for
1953 * vkCmdClearAttachments:
1954 *
1955 * "If the render pass instance this is recorded in uses multiview,
1956 * then baseArrayLayer must be zero and layerCount must be one"
1957 */
1958 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
1959
1960 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
1961 r3d_coords_raw(cs, (float[]) {
1962 rects[i].rect.offset.x, rects[i].rect.offset.y,
1963 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1964 rects[i].rect.offset.x + rects[i].rect.extent.width,
1965 rects[i].rect.offset.y + rects[i].rect.extent.height,
1966 z_clear_val, 1.0f,
1967 });
1968 r3d_run(cmd, cs);
1969 }
1970 }
1971 }
1972
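/* Pack a clear value into the raw texel layout expected by the GMEM blit
 * clear (RB_BLIT_CLEAR_COLOR), based on the channel sizes of the format.
 */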
1973 static void
1974 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1975 {
1976 switch (format) {
1977 case VK_FORMAT_X8_D24_UNORM_PACK32:
1978 case VK_FORMAT_D24_UNORM_S8_UINT:
1979 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1980 val->depthStencil.stencil << 24;
1981 return;
1982 case VK_FORMAT_D16_UNORM:
1983 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1984 return;
1985 case VK_FORMAT_D32_SFLOAT:
1986 clear_value[0] = fui(val->depthStencil.depth);
1987 return;
1988 case VK_FORMAT_S8_UINT:
1989 clear_value[0] = val->depthStencil.stencil;
1990 return;
1991 default:
1992 break;
1993 }
1994
1995 float tmp[4];
1996 memcpy(tmp, val->color.float32, 4 * sizeof(float));
1997 if (vk_format_is_srgb(format)) {
1998 for (int i = 0; i < 4; i++)
1999 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
2000 }
2001
2002 #define PACK_F(type) util_format_##type##_pack_rgba_float \
2003 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
2004 switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
2005 case 4:
2006 PACK_F(r4g4b4a4_unorm);
2007 break;
2008 case 5:
2009 if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
2010 PACK_F(r5g6b5_unorm);
2011 else
2012 PACK_F(r5g5b5a1_unorm);
2013 break;
2014 case 8:
2015 if (vk_format_is_snorm(format))
2016 PACK_F(r8g8b8a8_snorm);
2017 else if (vk_format_is_unorm(format))
2018 PACK_F(r8g8b8a8_unorm);
2019 else
2020 pack_int8(clear_value, val->color.uint32);
2021 break;
2022 case 10:
2023 if (vk_format_is_int(format))
2024 pack_int10_2(clear_value, val->color.uint32);
2025 else
2026 PACK_F(r10g10b10a2_unorm);
2027 break;
2028 case 11:
2029 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2030 break;
2031 case 16:
2032 if (vk_format_is_snorm(format))
2033 PACK_F(r16g16b16a16_snorm);
2034 else if (vk_format_is_unorm(format))
2035 PACK_F(r16g16b16a16_unorm);
2036 else if (vk_format_is_float(format))
2037 PACK_F(r16g16b16a16_float);
2038 else
2039 pack_int16(clear_value, val->color.uint32);
2040 break;
2041 case 32:
2042 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2043 break;
2044 default:
2045 unreachable("unexpected channel size");
2046 }
2047 #undef PACK_F
2048 }
2049
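/* Emit a GMEM clear of a single attachment (or plane) using the BLIT event. */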
2050 static void
2051 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2052 struct tu_cs *cs,
2053 VkFormat format,
2054 uint8_t clear_mask,
2055 uint32_t gmem_offset,
2056 const VkClearValue *value)
2057 {
2058 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2059 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2060
2061 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2062
2063 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2064 tu_cs_emit(cs, gmem_offset);
2065
2066 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2067 tu_cs_emit(cs, 0);
2068
2069 uint32_t clear_vals[4] = {};
2070 pack_gmem_clear_value(value, format, clear_vals);
2071
2072 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2073 tu_cs_emit_array(cs, clear_vals, 4);
2074
2075 tu6_emit_event_write(cmd, cs, BLIT);
2076 }
2077
2078 static void
2079 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2080 struct tu_cs *cs,
2081 uint32_t attachment,
2082 VkImageAspectFlags mask,
2083 const VkClearValue *value)
2084 {
2085 const struct tu_render_pass_attachment *att =
2086 &cmd->state.pass->attachments[attachment];
2087
2088 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2089 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2090 clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
2091 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2092 clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
2093 return;
2094 }
2095
2096 clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
2097 }
2098
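/* vkCmdClearAttachments, GMEM path: set the blit scissor to each clear rect
 * and emit a GMEM blit clear for every requested attachment.
 */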
2099 static void
2100 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2101 uint32_t attachment_count,
2102 const VkClearAttachment *attachments,
2103 uint32_t rect_count,
2104 const VkClearRect *rects)
2105 {
2106 const struct tu_subpass *subpass = cmd->state.subpass;
2107 struct tu_cs *cs = &cmd->draw_cs;
2108
2109 /* TODO: swap the loops for smaller cmdstream */
2110 for (unsigned i = 0; i < rect_count; i++) {
2111 unsigned x1 = rects[i].rect.offset.x;
2112 unsigned y1 = rects[i].rect.offset.y;
2113 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2114 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2115
2116 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2117 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2118 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2119
2120 for (unsigned j = 0; j < attachment_count; j++) {
2121 uint32_t a;
2122 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2123 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2124 else
2125 a = subpass->depth_stencil_attachment.attachment;
2126
2127 if (a == VK_ATTACHMENT_UNUSED)
2128 continue;
2129
2130 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2131 &attachments[j].clearValue);
2132 }
2133 }
2134 }
2135
2136 void
2137 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2138 uint32_t attachmentCount,
2139 const VkClearAttachment *pAttachments,
2140 uint32_t rectCount,
2141 const VkClearRect *pRects)
2142 {
2143 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2144 struct tu_cs *cs = &cmd->draw_cs;
2145
2146    /* The sysmem path behaves like a draw. Note that we don't have a way of using
2147     * different flushes for sysmem/gmem, so this needs to stay outside of the cond_exec.
2148     */
2149 tu_emit_cache_flush_renderpass(cmd, cs);
2150
2151 /* vkCmdClearAttachments is supposed to respect the predicate if active.
2152 * The easiest way to do this is to always use the 3d path, which always
2153 * works even with GMEM because it's just a simple draw using the existing
2154 * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2155 * skipped in the binning pass, since otherwise they produce binning data
2156 * which isn't consumed and leads to the wrong binning data being read, so
2157 * condition on GMEM | SYSMEM.
2158 */
2159 if (cmd->state.predication_active) {
2160 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2161 CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2162 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2163 tu_cond_exec_end(cs);
2164 return;
2165 }
2166
2167 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2168 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2169 tu_cond_exec_end(cs);
2170
2171 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2172 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2173 tu_cond_exec_end(cs);
2174 }
2175
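/* Clear one attachment (or one plane of D32S8) over the render area for the
 * sysmem path, once per enabled view (or framebuffer layer).
 */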
2176 static void
2177 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2178 struct tu_cs *cs,
2179 VkFormat format,
2180 VkImageAspectFlags clear_mask,
2181 const VkRenderPassBeginInfo *info,
2182 uint32_t a,
2183 bool separate_stencil)
2184 {
2185 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2186 const struct tu_image_view *iview = fb->attachments[a].attachment;
2187 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2188 const struct blit_ops *ops = &r2d_ops;
2189 if (cmd->state.pass->attachments[a].samples > 1)
2190 ops = &r3d_ops;
2191
2192 ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
2193 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2194 ops->clear_value(cs, format, &info->pClearValues[a]);
2195
2196 for_each_layer(i, clear_views, fb->layers) {
2197 if (separate_stencil) {
2198 if (ops == &r3d_ops)
2199 r3d_dst_stencil(cs, iview, i);
2200 else
2201 r2d_dst_stencil(cs, iview, i);
2202 } else {
2203 ops->dst(cs, iview, i);
2204 }
2205 ops->run(cmd, cs);
2206 }
2207
2208 ops->teardown(cmd, cs);
2209 }
2210
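/* Renderpass load-op clear of an attachment in sysmem mode. D32S8 is split
 * into separate depth and stencil clears, and the CCU is flushed afterwards
 * so that later accesses in the renderpass see the cleared values.
 */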
2211 void
2212 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2213 struct tu_cs *cs,
2214 uint32_t a,
2215 const VkRenderPassBeginInfo *info)
2216 {
2217 const struct tu_render_pass_attachment *attachment =
2218 &cmd->state.pass->attachments[a];
2219
2220 if (!attachment->clear_mask)
2221 return;
2222
2223 /* Wait for any flushes at the beginning of the renderpass to complete */
2224 tu_cs_emit_wfi(cs);
2225
2226 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2227 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2228 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2229 info, a, false);
2230 }
2231 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2232 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2233 info, a, true);
2234 }
2235 } else {
2236 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2237 info, a, false);
2238 }
2239
2240 /* The spec doesn't explicitly say, but presumably the initial renderpass
2241 * clear is considered part of the renderpass, and therefore barriers
2242 * aren't required inside the subpass/renderpass. Therefore we need to
2243 * flush CCU color into CCU depth here, just like with
2244 * vkCmdClearAttachments(). Note that because this only happens at the
2245 * beginning of a renderpass, and renderpass writes are considered
2246 * "incoherent", we shouldn't have to worry about syncing depth into color
2247 * beforehand as depth should already be flushed.
2248 */
2249 if (vk_format_is_depth_or_stencil(attachment->format)) {
2250 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2251 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2252 } else {
2253 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2254 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2255 }
2256 }
2257
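/* Renderpass load-op clear of an attachment in GMEM mode. */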
2258 void
2259 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2260 struct tu_cs *cs,
2261 uint32_t a,
2262 const VkRenderPassBeginInfo *info)
2263 {
2264 const struct tu_render_pass_attachment *attachment =
2265 &cmd->state.pass->attachments[a];
2266
2267 if (!attachment->clear_mask)
2268 return;
2269
2270 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2271
2272 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2273 &info->pClearValues[a]);
2274 }
2275
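/* Emit a CP_EVENT_WRITE::BLIT transferring an attachment between GMEM and
 * sysmem: a load into GMEM when resolve is false, a store/resolve to sysmem
 * when it is true. separate_stencil selects the stencil plane of D32S8.
 */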
2276 static void
2277 tu_emit_blit(struct tu_cmd_buffer *cmd,
2278 struct tu_cs *cs,
2279 const struct tu_image_view *iview,
2280 const struct tu_render_pass_attachment *attachment,
2281 bool resolve,
2282 bool separate_stencil)
2283 {
2284 tu_cs_emit_regs(cs,
2285 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2286
2287 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2288 .unk0 = !resolve,
2289 .gmem = !resolve,
2290 /* "integer" bit disables msaa resolve averaging */
2291 .integer = vk_format_is_int(attachment->format)));
2292
2293 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2294 if (separate_stencil) {
2295 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2296 tu_cs_emit_qw(cs, iview->stencil_base_addr);
2297 tu_cs_emit(cs, iview->stencil_PITCH);
2298
2299 tu_cs_emit_regs(cs,
2300 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
2301 } else {
2302 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2303 tu_cs_image_ref_2d(cs, iview, 0, false);
2304
2305 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2306 tu_cs_image_flag_ref(cs, iview, 0);
2307
2308 tu_cs_emit_regs(cs,
2309 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2310 }
2311
2312 tu6_emit_event_write(cmd, cs, BLIT);
2313 }
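/* Whether the BLIT event can perform the msaa resolve for this format; if it
 * can't, the store path falls back to CP_BLIT via store_cp_blit().
 */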
2314
2315 static bool
2316 blit_can_resolve(VkFormat format)
2317 {
2318 const struct util_format_description *desc = vk_format_description(format);
2319
2320 /* blit event can only do resolve for simple cases:
2321 * averaging samples as unsigned integers or choosing only one sample
2322 */
2323 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2324 return false;
2325
2326    /* can't do formats with channel sizes larger than 10 bits
2327     * note: this includes all float formats
2328     * note2: single-channel integer formats seem OK
2329     */
2330 if (desc->channel[0].size > 10)
2331 return false;
2332
2333 switch (format) {
2334    /* for unknown reasons the blit event can't msaa resolve these formats when tiled,
2335     * likely because these formats have a different layout from other cpp=2 formats
2336     */
2337 case VK_FORMAT_R8G8_UNORM:
2338 case VK_FORMAT_R8G8_UINT:
2339 case VK_FORMAT_R8G8_SINT:
2340 /* TODO: this one should be able to work? */
2341 case VK_FORMAT_D24_UNORM_S8_UINT:
2342 return false;
2343 default:
2344 break;
2345 }
2346
2347 return true;
2348 }
2349
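/* Load an attachment (and its separate stencil plane, if any) from sysmem
 * into GMEM when the renderpass needs its previous contents or force_load
 * is set.
 */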
2350 void
2351 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2352 struct tu_cs *cs,
2353 uint32_t a,
2354 bool force_load)
2355 {
2356 const struct tu_image_view *iview =
2357 cmd->state.framebuffer->attachments[a].attachment;
2358 const struct tu_render_pass_attachment *attachment =
2359 &cmd->state.pass->attachments[a];
2360
2361 if (attachment->load || force_load)
2362 tu_emit_blit(cmd, cs, iview, attachment, false, false);
2363
2364 if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2365 tu_emit_blit(cmd, cs, iview, attachment, false, true);
2366 }
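/* Store GMEM contents to sysmem with CP_BLIT, reading GMEM as a TILE6_2-tiled
 * source through the gmem_base address. This is the fallback for stores the
 * BLIT event can't handle (unaligned render area or unsupported resolves).
 */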
2367
2368 static void
2369 store_cp_blit(struct tu_cmd_buffer *cmd,
2370 struct tu_cs *cs,
2371 struct tu_image_view *iview,
2372 uint32_t samples,
2373 bool separate_stencil,
2374 VkFormat format,
2375 uint32_t gmem_offset,
2376 uint32_t cpp)
2377 {
2378 r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
2379 iview->ubwc_enabled, true);
2380 if (separate_stencil)
2381 r2d_dst_stencil(cs, iview, 0);
2382 else
2383 r2d_dst(cs, iview, 0);
2384
2385 tu_cs_emit_regs(cs,
2386 A6XX_SP_PS_2D_SRC_INFO(
2387 .color_format = tu6_format_texture(format, TILE6_2).fmt,
2388 .tile_mode = TILE6_2,
2389 .srgb = vk_format_is_srgb(format),
2390 .samples = tu_msaa_samples(samples),
2391 .samples_average = !vk_format_is_int(format),
2392 .unk20 = 1,
2393 .unk22 = 1),
2394 /* note: src size does not matter when not scaling */
2395 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2396 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
2397 A6XX_SP_PS_2D_SRC_HI(),
2398 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
2399
2400    /* sync GMEM writes with CACHE, so the CP_BLIT source reads below see them. */
2401 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2402
2403 /* Wait for CACHE_INVALIDATE to land */
2404 tu_cs_emit_wfi(cs);
2405
2406 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2407 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2408
2409 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2410 * sysmem, and we generally assume that GMEM renderpasses leave their
2411 * results in sysmem, so we need to flush manually here.
2412 */
2413 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2414 }
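/* Store (and/or resolve) a GMEM attachment to sysmem at the end of the
 * renderpass. The fast BLIT event path is used when the render area is
 * tile-aligned and the resolve format is supported; otherwise fall back
 * to store_cp_blit().
 */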
2415
2416 void
2417 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2418 struct tu_cs *cs,
2419 uint32_t a,
2420 uint32_t gmem_a)
2421 {
2422 const VkRect2D *render_area = &cmd->state.render_area;
2423 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2424 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2425 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2426
2427 if (!dst->store && !dst->store_stencil)
2428 return;
2429
2430 uint32_t x1 = render_area->offset.x;
2431 uint32_t y1 = render_area->offset.y;
2432 uint32_t x2 = x1 + render_area->extent.width;
2433 uint32_t y2 = y1 + render_area->extent.height;
2434    /* x2/y2 can be unaligned if equal to the size of the image,
2435     * since the store will then write into padding space.
2436     * The one exception is linear levels, which don't have the
2437     * required y padding in the layout (except for the last level).
2438     */
2439 bool need_y2_align =
2440 y2 != iview->extent.height || iview->need_y2_align;
2441
2442 bool unaligned =
2443 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2444 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2445
2446 /* use fast path when render area is aligned, except for unsupported resolve cases */
2447 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2448 if (dst->store)
2449 tu_emit_blit(cmd, cs, iview, src, true, false);
2450 if (dst->store_stencil)
2451 tu_emit_blit(cmd, cs, iview, src, true, true);
2452 return;
2453 }
2454
2455 if (dst->samples > 1) {
2456       /* I guess we need to use the shader path in this case?
2457        * We need a testcase which fails because of this.
2458        */
2459 tu_finishme("unaligned store of msaa attachment\n");
2460 return;
2461 }
2462
2463 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2464
2465 VkFormat format = src->format;
2466 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2467 format = VK_FORMAT_D32_SFLOAT;
2468
2469 if (dst->store) {
2470 store_cp_blit(cmd, cs, iview, src->samples, false, format,
2471 src->gmem_offset, src->cpp);
2472 }
2473 if (dst->store_stencil) {
2474 store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2475 src->gmem_offset_stencil, src->samples);
2476 }
2477 }