freedreno/a6xx: FETCHSIZE is PITCHALIGN
[mesa.git] / src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
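/* A worked example of the rounding above, for illustration: packing 0.5f into
 * 8 bits gives _mesa_lroundevenf(0.5 * 255.0) = _mesa_lroundevenf(127.5) = 128
 * (round half to even), and into 24 bits gives
 * _mesa_lroundevenf(0.5 * 16777215.0) = 8388608 = 0x800000. Out-of-range
 * inputs are clamped first, so -0.25f packs to 0 and 2.0f packs to all ones.
 */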
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
111 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
114 }
115
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
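/* For illustration: a VK_FORMAT_D24_UNORM_S8_UINT clear with depth = 0.5f and
 * stencil = 0xff makes the code above emit
 *
 *    clear_value[0] = 0x800000   (24-bit unorm depth)
 *    clear_value[1] = 0x008000   (depth >> 8)
 *    clear_value[2] = 0x000080   (depth >> 16)
 *    clear_value[3] = 0x0000ff   (stencil)
 *
 * so the low byte of each of the first three words holds depth bits [7:0],
 * [15:8] and [23:16], lining up with the r8g8b8a8 view mentioned in the
 * comment above.
 */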
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
244 }
245
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 enum a6xx_rotation rotation,
251 bool clear,
252 uint8_t mask,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
260 /* preserve depth channels */
261 if (mask == 0x8)
262 unknown_8c01 = 0x00084001;
263 /* preserve stencil channel */
264 if (mask == 0x7)
265 unknown_8c01 = 0x08000041;
266 }
267
268 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
269 tu_cs_emit(cs, unknown_8c01);
270
271 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
272 .scissor = scissor,
273 .rotate = rotation,
274 .solid_color = clear,
275 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
276 .color_format = format,
277 .mask = 0xf,
278 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
279 ).value;
280
281 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
282 tu_cs_emit(cs, blit_cntl);
283
284 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
285 tu_cs_emit(cs, blit_cntl);
286
287 if (format == FMT6_10_10_10_2_UNORM_DEST)
288 format = FMT6_16_16_16_16_FLOAT;
289
290 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
291 .sint = vk_format_is_sint(vk_format),
292 .uint = vk_format_is_uint(vk_format),
293 .color_format = format,
294 .srgb = vk_format_is_srgb(vk_format),
295 .mask = 0xf));
296 }
297
298 static void
299 r2d_setup(struct tu_cmd_buffer *cmd,
300 struct tu_cs *cs,
301 VkFormat vk_format,
302 enum a6xx_rotation rotation,
303 bool clear,
304 uint8_t mask)
305 {
306 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
307
308 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
309 }
310
311 static void
312 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
313 {
314 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
315 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
316 }
317
318 /* r3d_ = shader path operations */
319
320 static void
321 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
322 bool layered_clear)
323 {
324 struct ir3_shader dummy_shader = {};
325
326 struct ir3_shader_variant vs = {
327 .type = MESA_SHADER_VERTEX,
328 .instrlen = 1,
329 .constlen = 2,
330 .info.max_reg = 1,
331 .inputs_count = 1,
332 .inputs[0] = {
333 .slot = SYSTEM_VALUE_VERTEX_ID,
334 .regid = regid(0, 3),
335 .sysval = true,
336 },
337 .outputs_count = blit ? 2 : 1,
338 .outputs[0] = {
339 .slot = VARYING_SLOT_POS,
340 .regid = regid(0, 0),
341 },
342 .outputs[1] = {
343 .slot = VARYING_SLOT_VAR0,
344 .regid = regid(1, 0),
345 },
346 .shader = &dummy_shader,
347 };
348 if (layered_clear) {
349 vs = (struct ir3_shader_variant) {
350 .type = MESA_SHADER_VERTEX,
351 .instrlen = 1,
352 .info.max_reg = 0,
353 .shader = &dummy_shader,
354 };
355 }
356
357 struct ir3_shader_variant fs = {
358 .type = MESA_SHADER_FRAGMENT,
359 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
360 .constlen = num_rts,
361 .info.max_reg = MAX2(num_rts, 1) - 1,
362 .total_in = blit ? 2 : 0,
363 .num_samp = blit ? 1 : 0,
364 .inputs_count = blit ? 2 : 0,
365 .inputs[0] = {
366 .slot = VARYING_SLOT_VAR0,
367 .inloc = 0,
368 .compmask = 3,
369 .bary = true,
370 },
371 .inputs[1] = {
372 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
373 .regid = regid(0, 0),
374 .sysval = 1,
375 },
376 .num_sampler_prefetch = blit ? 1 : 0,
377 .sampler_prefetch[0] = {
378 .src = 0,
379 .wrmask = 0xf,
380 .cmd = 4,
381 },
382 .shader = &dummy_shader,
383 };
384
385 struct ir3_shader_variant gs_shader = {
386 .type = MESA_SHADER_GEOMETRY,
387 .instrlen = 1,
388 .constlen = 2,
389 .info.max_reg = 1,
390 .inputs_count = 1,
391 .inputs[0] = {
392 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
393 .regid = regid(0, 0),
394 .sysval = true,
395 },
396 .outputs_count = 3,
397 .outputs[0] = {
398 .slot = VARYING_SLOT_POS,
399 .regid = regid(0, 0),
400 },
401 .outputs[1] = {
402 .slot = VARYING_SLOT_LAYER,
403 .regid = regid(1, 1),
404 },
405 .outputs[2] = {
406 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
407 .regid = regid(1, 0),
408 },
409 .shader = &dummy_shader,
410 }, *gs = layered_clear ? &gs_shader : NULL;
411
412
413 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
414 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
415 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
416
417 static const instr_t vs_code[] = {
418 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
419 * r1.xy = r0.w ? c1.zw : c0.zw
420 * r0.w = 1.0f
421 */
422 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
423 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
424 .src2 = 3,
425 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
426 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
427 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
428 .src2 = 3,
429 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
430 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
431 { .cat0 = { .opc = OPC_END } },
432 };
433
434 static const instr_t vs_layered[] = {
435 { .cat0 = { .opc = OPC_CHMASK } },
436 { .cat0 = { .opc = OPC_CHSH } },
437 };
438
439 static const instr_t gs_code[16] = {
440 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
441 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
442 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
443 /* x = (local_id & 1) ? c1.x : c0.x */
444 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
445 /* y = (local_id & 2) ? c1.y : c0.y */
446 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
447 /* pred = (local_id >= 4), used by OPC_KILL */
448 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
449 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
450 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
451
452 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
453 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
454 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
455
456 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
457 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
458 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
459 .src2 = 0,
460 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
461
462 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
463
464 { .cat0 = { .opc = OPC_KILL } },
465 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
466 };
467 #define FS_OFFSET (16 * sizeof(instr_t))
468 #define GS_OFFSET (32 * sizeof(instr_t))
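/* A layout sketch of the allocation below, assuming instr_t is the 64-bit
 * encoded ir3 instruction (8 bytes): the shader memory is used as three
 * 16-instruction (128-byte) regions,
 *
 *    shaders.iova + 0x000   vertex shader (vs_code or vs_layered)
 *    shaders.iova + 0x080   fragment shader, generated below (FS_OFFSET)
 *    shaders.iova + 0x100   geometry shader, layered clears only (GS_OFFSET)
 *
 * which is why the generated fragment shader must stay within 16 instructions.
 */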
469
470 /* shaders */
471 struct ts_cs_memory shaders = { };
472 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
473 16 * sizeof(instr_t), &shaders);
474 assert(result == VK_SUCCESS);
475
476 if (layered_clear) {
477 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
478 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
479 } else {
480 memcpy(shaders.map, vs_code, sizeof(vs_code));
481 }
482
483 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
484 for (uint32_t i = 0; i < num_rts; i++) {
485 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
486 *fs_code++ = (instr_t) { .cat1 = {
487 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
488 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
489 } };
490 }
491
492 /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
493 * blit path (it's not clear what allows it to omit it)
494 */
495 if (blit) {
496 *fs_code++ = (instr_t) { .cat2 = {
497 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
498 .dst = regid(63, 0), .src1_im = 1
499 } };
500 }
501 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
502 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
503
504 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
505
506 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
507 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
508 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
509 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
510 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
511
512 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
513 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
514
515 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
516
517 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
518 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
519 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
520
521 tu6_emit_fs_inputs(cs, &fs);
522
523 tu_cs_emit_regs(cs,
524 A6XX_GRAS_CL_CNTL(
525 .persp_division_disable = 1,
526 .vp_xform_disable = 1,
527 .vp_clip_code_ignore = 1,
528 .clip_disable = 1),
529 A6XX_GRAS_UNKNOWN_8001(0));
530 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
531
532 tu_cs_emit_regs(cs,
533 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
534 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
535 tu_cs_emit_regs(cs,
536 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
537 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
538
539 tu_cs_emit_regs(cs,
540 A6XX_VFD_INDEX_OFFSET(),
541 A6XX_VFD_INSTANCE_START_OFFSET());
542 }
543
544 static void
545 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
546 {
547 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
548 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
549 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
550 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
551 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
552 CP_LOAD_STATE6_0_NUM_UNIT(2));
553 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
554 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
555 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
556 }
557
558 static void
559 r3d_coords(struct tu_cs *cs,
560 const VkOffset2D *dst,
561 const VkOffset2D *src,
562 const VkExtent2D *extent)
563 {
564 int32_t src_x1 = src ? src->x : 0;
565 int32_t src_y1 = src ? src->y : 0;
566 r3d_coords_raw(cs, false, (float[]) {
567 dst->x, dst->y,
568 src_x1, src_y1,
569 dst->x + extent->width, dst->y + extent->height,
570 src_x1 + extent->width, src_y1 + extent->height,
571 });
572 }
573
574 static void
575 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
576 {
577 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
578 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
579 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
580 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
581 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
582 CP_LOAD_STATE6_0_NUM_UNIT(1));
583 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
584 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
585 switch (format) {
586 case VK_FORMAT_X8_D24_UNORM_PACK32:
587 case VK_FORMAT_D24_UNORM_S8_UINT: {
588 /* cleared as r8g8b8a8_unorm using special format */
589 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
590 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
591 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
592 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
593 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
594 } break;
595 case VK_FORMAT_D16_UNORM:
596 case VK_FORMAT_D32_SFLOAT:
597 tu_cs_emit(cs, fui(val->depthStencil.depth));
598 tu_cs_emit(cs, 0);
599 tu_cs_emit(cs, 0);
600 tu_cs_emit(cs, 0);
601 break;
602 case VK_FORMAT_S8_UINT:
603 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
604 tu_cs_emit(cs, 0);
605 tu_cs_emit(cs, 0);
606 tu_cs_emit(cs, 0);
607 break;
608 default:
609 /* as color formats use clear value as-is */
610 assert(!vk_format_is_depth_or_stencil(format));
611 tu_cs_emit_array(cs, val->color.uint32, 4);
612 break;
613 }
614 }
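/* For illustration, the same kind of clear through the shader path: with
 * VK_FORMAT_D24_UNORM_S8_UINT, depth = 0.5f and stencil = 0xff, tmp above is
 * 0x800000, so the four FS constants emitted are fui(0 / 255.0f),
 * fui(0 / 255.0f), fui(128 / 255.0f) and fui(255 / 255.0f).
 */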
615
616 static void
617 r3d_src_common(struct tu_cmd_buffer *cmd,
618 struct tu_cs *cs,
619 const uint32_t *tex_const,
620 uint32_t offset_base,
621 uint32_t offset_ubwc,
622 VkFilter filter)
623 {
624 struct ts_cs_memory texture = { };
625 VkResult result = tu_cs_alloc(&cmd->sub_cs,
626 2, /* allocate space for a sampler too */
627 A6XX_TEX_CONST_DWORDS, &texture);
628 assert(result == VK_SUCCESS);
629
630 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
631
632 /* patch addresses for layer offset */
633 *(uint64_t*) (texture.map + 4) += offset_base;
634 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
635 texture.map[7] = ubwc_addr;
636 texture.map[8] = ubwc_addr >> 32;
637
638 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
639 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
640 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
641 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
642 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
643 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
644 0x60000; /* XXX used by blob, doesn't seem necessary */
645 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
646 0x1 | /* XXX used by blob, doesn't seem necessary */
647 A6XX_TEX_SAMP_1_UNNORM_COORDS |
648 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
649 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
650 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
651
652 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
653 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
654 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
655 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
656 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
657 CP_LOAD_STATE6_0_NUM_UNIT(1));
658 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
659
660 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
661 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
662
663 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
664 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
665 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
666 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
667 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
668 CP_LOAD_STATE6_0_NUM_UNIT(1));
669 tu_cs_emit_qw(cs, texture.iova);
670
671 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
672 tu_cs_emit_qw(cs, texture.iova);
673
674 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
675 }
676
677 static void
678 r3d_src(struct tu_cmd_buffer *cmd,
679 struct tu_cs *cs,
680 const struct tu_image_view *iview,
681 uint32_t layer,
682 VkFilter filter)
683 {
684 r3d_src_common(cmd, cs, iview->descriptor,
685 iview->layer_size * layer,
686 iview->ubwc_layer_size * layer,
687 filter);
688 }
689
690 static void
691 r3d_src_buffer(struct tu_cmd_buffer *cmd,
692 struct tu_cs *cs,
693 VkFormat vk_format,
694 uint64_t va, uint32_t pitch,
695 uint32_t width, uint32_t height)
696 {
697 uint32_t desc[A6XX_TEX_CONST_DWORDS];
698
699 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
700
701 desc[0] =
702 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
703 A6XX_TEX_CONST_0_FMT(format.fmt) |
704 A6XX_TEX_CONST_0_SWAP(format.swap) |
705 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
706 // XXX replicate R into .yzw so the value lands in .w for stencil buffer_to_image
707 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
708 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
709 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
710 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
711 desc[2] =
712 A6XX_TEX_CONST_2_PITCH(pitch) |
713 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
714 desc[3] = 0;
715 desc[4] = va;
716 desc[5] = va >> 32;
717 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
718 desc[i] = 0;
719
720 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
721 }
722
723 static void
724 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
725 {
726 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
727
728 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
729 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
730 tu_cs_image_ref(cs, iview, layer);
731 tu_cs_emit(cs, 0);
732
733 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
734 tu_cs_image_flag_ref(cs, iview, layer);
735
736 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
737 }
738
739 static void
740 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
741 {
742 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
743
744 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
745
746 tu_cs_emit_regs(cs,
747 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
748 A6XX_RB_MRT_PITCH(0, pitch),
749 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
750 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
751 A6XX_RB_MRT_BASE_HI(0, va >> 32),
752 A6XX_RB_MRT_BASE_GMEM(0, 0));
753
754 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
755 }
756
757 static void
758 r3d_setup(struct tu_cmd_buffer *cmd,
759 struct tu_cs *cs,
760 VkFormat vk_format,
761 enum a6xx_rotation rotation,
762 bool clear,
763 uint8_t mask)
764 {
765 if (!cmd->state.pass) {
766 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
767 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
768 }
769
770 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
771 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
772
773 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
774
775 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
776 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
777 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
778 0xfc000000);
779 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
780
781 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
782 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
783
784 tu_cs_emit_regs(cs,
785 A6XX_RB_FS_OUTPUT_CNTL0(),
786 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
787
788 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
789 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
790 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
791
792 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
793 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
794 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
795 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
796 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
797 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
798 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
799
800 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
801 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
802
803 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
804 .color_format = tu6_base_format(vk_format),
805 .color_sint = vk_format_is_sint(vk_format),
806 .color_uint = vk_format_is_uint(vk_format)));
807
808 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
809 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
810 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
811 }
812
813 static void
814 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
815 {
816 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
817 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
818 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
819 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
820 tu_cs_emit(cs, 1); /* instance count */
821 tu_cs_emit(cs, 2); /* vertex count */
822 }
823
824 /* blit ops - common interface for 2d/shader paths */
825
826 struct blit_ops {
827 void (*coords)(struct tu_cs *cs,
828 const VkOffset2D *dst,
829 const VkOffset2D *src,
830 const VkExtent2D *extent);
831 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
832 void (*src)(
833 struct tu_cmd_buffer *cmd,
834 struct tu_cs *cs,
835 const struct tu_image_view *iview,
836 uint32_t layer,
837 VkFilter filter);
838 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
839 VkFormat vk_format,
840 uint64_t va, uint32_t pitch,
841 uint32_t width, uint32_t height);
842 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
843 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
844 void (*setup)(struct tu_cmd_buffer *cmd,
845 struct tu_cs *cs,
846 VkFormat vk_format,
847 enum a6xx_rotation rotation,
848 bool clear,
849 uint8_t mask);
850 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
851 };
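/* A minimal usage sketch of this interface (the views, offsets and layer count
 * are hypothetical), mirroring how the copy/clear helpers below drive it:
 *
 *    ops->setup(cmd, cs, VK_FORMAT_R8G8B8A8_UNORM, ROTATE_0, false, 0xf);
 *    ops->coords(cs, &dst_offset, &src_offset, &extent);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src_view, i, VK_FILTER_NEAREST);
 *       ops->dst(cs, &dst_view, i);
 *       ops->run(cmd, cs);
 *    }
 */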
852
853 static const struct blit_ops r2d_ops = {
854 .coords = r2d_coords,
855 .clear_value = r2d_clear_value,
856 .src = r2d_src,
857 .src_buffer = r2d_src_buffer,
858 .dst = r2d_dst,
859 .dst_buffer = r2d_dst_buffer,
860 .setup = r2d_setup,
861 .run = r2d_run,
862 };
863
864 static const struct blit_ops r3d_ops = {
865 .coords = r3d_coords,
866 .clear_value = r3d_clear_value,
867 .src = r3d_src,
868 .src_buffer = r3d_src_buffer,
869 .dst = r3d_dst,
870 .dst_buffer = r3d_dst_buffer,
871 .setup = r3d_setup,
872 .run = r3d_run,
873 };
874
875 /* passthrough set coords from 3D extents */
876 static void
877 coords(const struct blit_ops *ops,
878 struct tu_cs *cs,
879 const VkOffset3D *dst,
880 const VkOffset3D *src,
881 const VkExtent3D *extent)
882 {
883 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
884 }
885
886 static void
887 tu_image_view_blit2(struct tu_image_view *iview,
888 struct tu_image *image,
889 VkFormat format,
890 const VkImageSubresourceLayers *subres,
891 uint32_t layer,
892 bool stencil_read)
893 {
894 VkImageAspectFlags aspect_mask = subres->aspectMask;
895
896 /* always use the AS_R8G8B8A8 format for these */
897 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
898 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
899 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
900 }
901
902 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
903 .image = tu_image_to_handle(image),
904 .viewType = VK_IMAGE_VIEW_TYPE_2D,
905 .format = format,
906 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
907 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
908 .subresourceRange = {
909 .aspectMask = aspect_mask,
910 .baseMipLevel = subres->mipLevel,
911 .levelCount = 1,
912 .baseArrayLayer = subres->baseArrayLayer + layer,
913 .layerCount = 1,
914 },
915 });
916 }
917
918 static void
919 tu_image_view_blit(struct tu_image_view *iview,
920 struct tu_image *image,
921 const VkImageSubresourceLayers *subres,
922 uint32_t layer)
923 {
924 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
925 }
926
927 static void
928 tu6_blit_image(struct tu_cmd_buffer *cmd,
929 struct tu_image *src_image,
930 struct tu_image *dst_image,
931 const VkImageBlit *info,
932 VkFilter filter)
933 {
934 const struct blit_ops *ops = &r2d_ops;
935 struct tu_cs *cs = &cmd->cs;
936 uint32_t layers;
937
938 /* 2D blit can't do mirroring from just coordinates; it needs the rotate/flip field */
939 static const enum a6xx_rotation rotate[2][2] = {
940 {ROTATE_0, ROTATE_HFLIP},
941 {ROTATE_VFLIP, ROTATE_180},
942 };
943
944 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
945 (info->dstOffsets[1].x < info->dstOffsets[0].x);
946 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
947 (info->dstOffsets[1].y < info->dstOffsets[0].y);
948 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
949 (info->dstOffsets[1].z < info->dstOffsets[0].z);
950
951 if (mirror_z) {
952 tu_finishme("blit z mirror\n");
953 return;
954 }
955
956 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
957 info->dstOffsets[1].z - info->dstOffsets[0].z) {
958 tu_finishme("blit z filter\n");
959 return;
960 }
961
962 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
963 if (info->dstSubresource.layerCount > 1) {
964 assert(layers <= 1);
965 layers = info->dstSubresource.layerCount;
966 }
967
968 uint8_t mask = 0xf;
969 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
970 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
971 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
972 mask = 0x7;
973 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
974 mask = 0x8;
975 }
976
977 /* BC1_RGB_* formats need to have their last component overridden with 1
978 * when sampling, which is normally handled with the texture descriptor
979 * swizzle. The 2d path can't handle that, so use the 3d path.
980 *
981 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
982 * the 2d path.
983 */
984
985 if (dst_image->samples > 1 ||
986 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
987 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
988 filter == VK_FILTER_CUBIC_EXT)
989 ops = &r3d_ops;
990
991 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
992 * figure out why (should be able to pass all tests with only shader path)
993 */
994
995 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
996
997 if (ops == &r3d_ops) {
998 r3d_coords_raw(cs, false, (float[]) {
999 info->dstOffsets[0].x, info->dstOffsets[0].y,
1000 info->srcOffsets[0].x, info->srcOffsets[0].y,
1001 info->dstOffsets[1].x, info->dstOffsets[1].y,
1002 info->srcOffsets[1].x, info->srcOffsets[1].y
1003 });
1004 } else {
1005 tu_cs_emit_regs(cs,
1006 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1007 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1008 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1009 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1010 tu_cs_emit_regs(cs,
1011 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1012 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1013 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1014 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1015 }
1016
1017 struct tu_image_view dst, src;
1018 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1019 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1020
1021 for (uint32_t i = 0; i < layers; i++) {
1022 ops->dst(cs, &dst, i);
1023 ops->src(cmd, cs, &src, i, filter);
1024 ops->run(cmd, cs);
1025 }
1026 }
1027
1028 void
1029 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1030 VkImage srcImage,
1031 VkImageLayout srcImageLayout,
1032 VkImage dstImage,
1033 VkImageLayout dstImageLayout,
1034 uint32_t regionCount,
1035 const VkImageBlit *pRegions,
1036 VkFilter filter)
1037
1038 {
1039 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1040 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1041 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1042
1043 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1044 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1045
1046 for (uint32_t i = 0; i < regionCount; ++i)
1047 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1048 }
1049
1050 static VkFormat
1051 copy_format(VkFormat format)
1052 {
1053 switch (vk_format_get_blocksize(format)) {
1054 case 1: return VK_FORMAT_R8_UINT;
1055 case 2: return VK_FORMAT_R16_UINT;
1056 case 4: return VK_FORMAT_R32_UINT;
1057 case 8: return VK_FORMAT_R32G32_UINT;
1058 case 12:return VK_FORMAT_R32G32B32_UINT;
1059 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1060 default:
1061 unreachable("unhandled format size");
1062 }
1063 }
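/* An example of the mapping above: BC1 blocks are 8 bytes, so a BC1 image is
 * copied as VK_FORMAT_R32G32_UINT, and 16-byte blocks (BC7, ASTC) are copied
 * as VK_FORMAT_R32G32B32A32_UINT; the copy then moves one block per "texel".
 */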
1064
1065 static void
1066 copy_compressed(VkFormat format,
1067 VkOffset3D *offset,
1068 VkExtent3D *extent,
1069 uint32_t *width,
1070 uint32_t *height)
1071 {
1072 if (!vk_format_is_compressed(format))
1073 return;
1074
1075 uint32_t block_width = vk_format_get_blockwidth(format);
1076 uint32_t block_height = vk_format_get_blockheight(format);
1077
1078 offset->x /= block_width;
1079 offset->y /= block_height;
1080
1081 if (extent) {
1082 extent->width = DIV_ROUND_UP(extent->width, block_width);
1083 extent->height = DIV_ROUND_UP(extent->height, block_height);
1084 }
1085 if (width)
1086 *width = DIV_ROUND_UP(*width, block_width);
1087 if (height)
1088 *height = DIV_ROUND_UP(*height, block_height);
1089 }
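/* Worked example: for a BC1 image (4x4 texel blocks), an imageOffset of (8, 4)
 * becomes block coordinates (2, 1) and a 10x6 texel extent rounds up to 3x2
 * blocks; combined with copy_format() above, the copy becomes a plain
 * R32G32_UINT copy in block units.
 */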
1090
1091 static void
1092 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1093 struct tu_buffer *src_buffer,
1094 struct tu_image *dst_image,
1095 const VkBufferImageCopy *info)
1096 {
1097 struct tu_cs *cs = &cmd->cs;
1098 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1099 VkFormat dst_format = dst_image->vk_format;
1100 VkFormat src_format = dst_image->vk_format;
1101 const struct blit_ops *ops = &r2d_ops;
1102
1103 uint8_t mask = 0xf;
1104
1105 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1106 switch (info->imageSubresource.aspectMask) {
1107 case VK_IMAGE_ASPECT_STENCIL_BIT:
1108 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1109 mask = 0x8;
1110 ops = &r3d_ops;
1111 break;
1112 case VK_IMAGE_ASPECT_DEPTH_BIT:
1113 mask = 0x7;
1114 break;
1115 }
1116 }
1117
1118 VkOffset3D offset = info->imageOffset;
1119 VkExtent3D extent = info->imageExtent;
1120 uint32_t src_width = info->bufferRowLength ?: extent.width;
1121 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1122
1123 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1124 assert(src_format == dst_format);
1125 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1126 src_format = dst_format = copy_format(dst_format);
1127 }
1128
1129 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1130 uint32_t layer_size = src_height * pitch;
1131
1132 /* note: the 64-byte src_va/pitch alignment requirement is for the 2D engine;
1133 * it also holds for 1cpp formats with the shader path (stencil aspect path)
1134 */
1135
1136 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1137
1138 struct tu_image_view dst;
1139 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1140
1141 for (uint32_t i = 0; i < layers; i++) {
1142 ops->dst(cs, &dst, i);
1143
1144 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1145 if ((src_va & 63) || (pitch & 63)) {
1146 for (uint32_t y = 0; y < extent.height; y++) {
1147 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1148 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1149 x + extent.width, 1);
1150 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1151 &(VkExtent2D) {extent.width, 1});
1152 ops->run(cmd, cs);
1153 src_va += pitch;
1154 }
1155 } else {
1156 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1157 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1158 ops->run(cmd, cs);
1159 }
1160 }
1161 }
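/* A worked example of the unaligned fallback above (values are hypothetical):
 * with a 4-byte format and a source address of base + 0x20, src_va & 63 = 0x20
 * and x = 8, so each destination row is blitted from the 64-byte-aligned
 * address (src_va & ~63) with the source rectangle starting at texel x = 8,
 * one row at a time, advancing src_va by the pitch between rows.
 */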
1162
1163 void
1164 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1165 VkBuffer srcBuffer,
1166 VkImage dstImage,
1167 VkImageLayout dstImageLayout,
1168 uint32_t regionCount,
1169 const VkBufferImageCopy *pRegions)
1170 {
1171 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1172 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1173 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1174
1175 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1176 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1177
1178 for (unsigned i = 0; i < regionCount; ++i)
1179 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1180 }
1181
1182 static void
1183 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1184 struct tu_image *src_image,
1185 struct tu_buffer *dst_buffer,
1186 const VkBufferImageCopy *info)
1187 {
1188 struct tu_cs *cs = &cmd->cs;
1189 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1190 VkFormat src_format = src_image->vk_format;
1191 VkFormat dst_format = src_image->vk_format;
1192 bool stencil_read = false;
1193
1194 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1195 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1196 dst_format = VK_FORMAT_R8_UNORM;
1197 stencil_read = true;
1198 }
1199
1200 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1201 VkOffset3D offset = info->imageOffset;
1202 VkExtent3D extent = info->imageExtent;
1203 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1204 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1205
1206 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1207 assert(src_format == dst_format);
1208 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1209 src_format = dst_format = copy_format(dst_format);
1210 }
1211
1212 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1213 uint32_t layer_size = pitch * dst_height;
1214
1215 /* note: the 64-byte dst_va/pitch alignment requirement is for the 2D engine;
1216 * it also holds for 1cpp formats with the shader path (stencil aspect)
1217 */
1218
1219 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1220
1221 struct tu_image_view src;
1222 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1223
1224 for (uint32_t i = 0; i < layers; i++) {
1225 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1226
1227 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1228 if ((dst_va & 63) || (pitch & 63)) {
1229 for (uint32_t y = 0; y < extent.height; y++) {
1230 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1231 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1232 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1233 &(VkExtent2D) {extent.width, 1});
1234 ops->run(cmd, cs);
1235 dst_va += pitch;
1236 }
1237 } else {
1238 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1239 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1240 ops->run(cmd, cs);
1241 }
1242 }
1243 }
1244
1245 void
1246 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1247 VkImage srcImage,
1248 VkImageLayout srcImageLayout,
1249 VkBuffer dstBuffer,
1250 uint32_t regionCount,
1251 const VkBufferImageCopy *pRegions)
1252 {
1253 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1254 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1255 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1256
1257 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1258 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1259
1260 for (unsigned i = 0; i < regionCount; ++i)
1261 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1262 }
1263
1264 /* Tiled formats don't support swapping, which means that we can't support
1265 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1266 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1267 * Currently we fake support for tiled swapped formats and use the unswapped
1268 * format instead, but this means that reinterpreting copies to and from
1269 * swapped formats can't be performed correctly unless we can swizzle the
1270 * components by reinterpreting the other image as the "correct" swapped
1271 * format, i.e. only when the other image is linear.
1272 */
1273
1274 static bool
1275 is_swapped_format(VkFormat format)
1276 {
1277 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1278 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1279 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1280 }
1281
1282 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1283 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1284 * versa). This should mirror the logic in fdl6_layout.
1285 */
1286 static bool
1287 image_is_r8g8(struct tu_image *image)
1288 {
1289 return image->layout.cpp == 2 &&
1290 vk_format_get_nr_components(image->vk_format) == 2;
1291 }
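/* An example of when the staging path below is needed: copying between a tiled
 * VK_FORMAT_R8G8_UNORM image and a tiled VK_FORMAT_R16_UNORM image would
 * reinterpret between cpp=2 layouts with different tiling, so the data is
 * bounced through a linear staging image; the same applies when either format
 * needs a non-WZYX swap or when both images are UBWC-compressed.
 */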
1292
1293 static void
1294 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1295 struct tu_image *src_image,
1296 struct tu_image *dst_image,
1297 const VkImageCopy *info)
1298 {
1299 const struct blit_ops *ops = &r2d_ops;
1300 struct tu_cs *cs = &cmd->cs;
1301
1302 uint8_t mask = 0xf;
1303 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1304 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1305 mask = 0x7;
1306 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1307 mask = 0x8;
1308 }
1309
1310 if (dst_image->samples > 1)
1311 ops = &r3d_ops;
1312
1313 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1314
1315 VkFormat format = VK_FORMAT_UNDEFINED;
1316 VkOffset3D src_offset = info->srcOffset;
1317 VkOffset3D dst_offset = info->dstOffset;
1318 VkExtent3D extent = info->extent;
1319
1320 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1321 * Images":
1322 *
1323 * When copying between compressed and uncompressed formats the extent
1324 * members represent the texel dimensions of the source image and not
1325 * the destination. When copying from a compressed image to an
1326 * uncompressed image the image texel dimensions written to the
1327 * uncompressed image will be source extent divided by the compressed
1328 * texel block dimensions. When copying from an uncompressed image to a
1329 * compressed image the image texel dimensions written to the compressed
1330 * image will be the source extent multiplied by the compressed texel
1331 * block dimensions.
1332 *
1333 * This means we only have to adjust the extent if the source image is
1334 * compressed.
1335 */
1336 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1337 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1338
1339 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1340 copy_format(dst_image->vk_format) : dst_image->vk_format;
1341 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1342 copy_format(src_image->vk_format) : src_image->vk_format;
1343
1344 bool use_staging_blit = false;
1345
1346 if (src_format == dst_format) {
1347 /* Images that share a format can always be copied directly because it's
1348 * the same as a blit.
1349 */
1350 format = src_format;
1351 } else if (!src_image->layout.tile_mode) {
1352 /* If an image is linear, we can always safely reinterpret it with the
1353 * other image's format and then do a regular blit.
1354 */
1355 format = dst_format;
1356 } else if (!dst_image->layout.tile_mode) {
1357 format = src_format;
1358 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1359 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1360 * due to the different tile layout.
1361 */
1362 use_staging_blit = true;
1363 } else if (is_swapped_format(src_format) ||
1364 is_swapped_format(dst_format)) {
1365 /* If either format has a non-identity swap, then we can't copy
1366 * to/from it.
1367 */
1368 use_staging_blit = true;
1369 } else if (!src_image->layout.ubwc) {
1370 format = dst_format;
1371 } else if (!dst_image->layout.ubwc) {
1372 format = src_format;
1373 } else {
1374 /* Both formats use UBWC and so neither can be reinterpreted.
1375 * TODO: We could do an in-place decompression of the dst instead.
1376 */
1377 use_staging_blit = true;
1378 }
1379
1380 struct tu_image_view dst, src;
1381
1382 if (use_staging_blit) {
1383 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1384 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1385
1386 struct tu_image staging_image = {
1387 .vk_format = src_format,
1388 .type = src_image->type,
1389 .tiling = VK_IMAGE_TILING_LINEAR,
1390 .extent = extent,
1391 .level_count = 1,
1392 .layer_count = info->srcSubresource.layerCount,
1393 .samples = src_image->samples,
1394 .bo_offset = 0,
1395 };
1396
1397 VkImageSubresourceLayers staging_subresource = {
1398 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1399 .mipLevel = 0,
1400 .baseArrayLayer = 0,
1401 .layerCount = info->srcSubresource.layerCount,
1402 };
1403
1404 VkOffset3D staging_offset = { 0 };
1405
1406 staging_image.layout.tile_mode = TILE6_LINEAR;
1407 staging_image.layout.ubwc = false;
1408
1409 fdl6_layout(&staging_image.layout,
1410 vk_format_to_pipe_format(staging_image.vk_format),
1411 staging_image.samples,
1412 staging_image.extent.width,
1413 staging_image.extent.height,
1414 staging_image.extent.depth,
1415 staging_image.level_count,
1416 staging_image.layer_count,
1417 staging_image.type == VK_IMAGE_TYPE_3D,
1418 NULL);
1419
1420 VkResult result = tu_get_scratch_bo(cmd->device,
1421 staging_image.layout.size,
1422 &staging_image.bo);
1423 if (result != VK_SUCCESS) {
1424 cmd->record_result = result;
1425 return;
1426 }
1427
1428 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1429 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1430
1431 struct tu_image_view staging;
1432 tu_image_view_blit2(&staging, &staging_image, src_format,
1433 &staging_subresource, 0, false);
1434
1435 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1436 coords(ops, cs, &staging_offset, &src_offset, &extent);
1437
1438 for (uint32_t i = 0; i < info->extent.depth; i++) {
1439 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1440 ops->dst(cs, &staging, i);
1441 ops->run(cmd, cs);
1442 }
1443
1444 /* When executed by the user there has to be a pipeline barrier here,
1445 * but since we're doing it manually we'll have to flush ourselves.
1446 */
1447 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1448 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1449
1450 tu_image_view_blit2(&staging, &staging_image, dst_format,
1451 &staging_subresource, 0, false);
1452
1453 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1454 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1455
1456 for (uint32_t i = 0; i < info->extent.depth; i++) {
1457 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1458 ops->dst(cs, &dst, i);
1459 ops->run(cmd, cs);
1460 }
1461 } else {
1462 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1463 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1464
1465 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1466 coords(ops, cs, &dst_offset, &src_offset, &extent);
1467
1468 for (uint32_t i = 0; i < info->extent.depth; i++) {
1469 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1470 ops->dst(cs, &dst, i);
1471 ops->run(cmd, cs);
1472 }
1473 }
1474 }
1475
1476 void
1477 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1478 VkImage srcImage,
1479 VkImageLayout srcImageLayout,
1480 VkImage destImage,
1481 VkImageLayout destImageLayout,
1482 uint32_t regionCount,
1483 const VkImageCopy *pRegions)
1484 {
1485 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1486 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1487 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1488
1489 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1490 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1491
1492 for (uint32_t i = 0; i < regionCount; ++i)
1493 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1494 }
1495
1496 static void
1497 copy_buffer(struct tu_cmd_buffer *cmd,
1498 uint64_t dst_va,
1499 uint64_t src_va,
1500 uint64_t size,
1501 uint32_t block_size)
1502 {
1503 const struct blit_ops *ops = &r2d_ops;
1504 struct tu_cs *cs = &cmd->cs;
1505 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1506 uint64_t blocks = size / block_size;
1507
1508 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1509
1510 while (blocks) {
1511 uint32_t src_x = (src_va & 63) / block_size;
1512 uint32_t dst_x = (dst_va & 63) / block_size;
1513 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1514
1515 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1516 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1517 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1518 ops->run(cmd, cs);
1519
1520 src_va += width * block_size;
1521 dst_va += width * block_size;
1522 blocks -= width;
1523 }
1524 }
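/* On the loop above: src_x/dst_x re-express the low six address bits as an x
 * offset in blocks so that both addresses handed to the blitter stay 64-byte
 * aligned, and each pass is capped at 0x4000 blocks (assumed here to be the
 * widest single BLIT_OP_SCALE). For example, with block_size = 1 and
 * src_va = base + 0x21, src_x = 33 and the first pass copies at most
 * 0x4000 - 33 blocks before the loop advances.
 */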
1525
1526 void
1527 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1528 VkBuffer srcBuffer,
1529 VkBuffer dstBuffer,
1530 uint32_t regionCount,
1531 const VkBufferCopy *pRegions)
1532 {
1533 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1534 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1535 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1536
1537 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1538 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1539
1540 for (unsigned i = 0; i < regionCount; ++i) {
1541 copy_buffer(cmd,
1542 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1543 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1544 pRegions[i].size, 1);
1545 }
1546 }
1547
1548 void
1549 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1550 VkBuffer dstBuffer,
1551 VkDeviceSize dstOffset,
1552 VkDeviceSize dataSize,
1553 const void *pData)
1554 {
1555 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1556 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1557
1558 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1559
1560 struct ts_cs_memory tmp;
1561 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1562 if (result != VK_SUCCESS) {
1563 cmd->record_result = result;
1564 return;
1565 }
1566
1567 memcpy(tmp.map, pData, dataSize);
1568 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1569 }
1570
1571 void
1572 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1573 VkBuffer dstBuffer,
1574 VkDeviceSize dstOffset,
1575 VkDeviceSize fillSize,
1576 uint32_t data)
1577 {
1578 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1579 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1580 const struct blit_ops *ops = &r2d_ops;
1581 struct tu_cs *cs = &cmd->cs;
1582
1583 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1584
1585 if (fillSize == VK_WHOLE_SIZE)
1586 fillSize = buffer->size - dstOffset;
1587
1588 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1589 uint32_t blocks = fillSize / 4;
1590
1591 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1592 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1593
1594 while (blocks) {
1595 uint32_t dst_x = (dst_va & 63) / 4;
1596 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1597
1598 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1599 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1600 ops->run(cmd, cs);
1601
1602 dst_va += width * 4;
1603 blocks -= width;
1604 }
1605 }
1606
1607 void
1608 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1609 VkImage srcImage,
1610 VkImageLayout srcImageLayout,
1611 VkImage dstImage,
1612 VkImageLayout dstImageLayout,
1613 uint32_t regionCount,
1614 const VkImageResolve *pRegions)
1615 {
1616 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1617 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1618 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1619 const struct blit_ops *ops = &r2d_ops;
1620 struct tu_cs *cs = &cmd->cs;
1621
1622 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1623 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1624
1625 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1626
1627 for (uint32_t i = 0; i < regionCount; ++i) {
1628 const VkImageResolve *info = &pRegions[i];
1629 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1630
1631 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1632 /* TODO: aspect masks possible? */
1633
1634 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1635
1636 struct tu_image_view dst, src;
1637 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1638 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1639
1640 for (uint32_t i = 0; i < layers; i++) {
1641 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1642 ops->dst(cs, &dst, i);
1643 ops->run(cmd, cs);
1644 }
1645 }
1646 }
1647
1648 void
1649 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1650 struct tu_cs *cs,
1651 struct tu_image_view *src,
1652 struct tu_image_view *dst,
1653 uint32_t layers,
1654 const VkRect2D *rect)
1655 {
1656 const struct blit_ops *ops = &r2d_ops;
1657
1658 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1659 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1660
1661 assert(src->image->vk_format == dst->image->vk_format);
1662
1663 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1664 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1665
1666 for (uint32_t i = 0; i < layers; i++) {
1667 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1668 ops->dst(cs, dst, i);
1669 ops->run(cmd, cs);
1670 }
1671 }
1672
1673 static void
1674 clear_image(struct tu_cmd_buffer *cmd,
1675 struct tu_image *image,
1676 const VkClearValue *clear_value,
1677 const VkImageSubresourceRange *range)
1678 {
1679 uint32_t level_count = tu_get_levelCount(image, range);
1680 uint32_t layer_count = tu_get_layerCount(image, range);
1681 struct tu_cs *cs = &cmd->cs;
1682 VkFormat format = image->vk_format;
1683 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1684 format = VK_FORMAT_R32_UINT;
1685
1686 if (image->type == VK_IMAGE_TYPE_3D) {
1687 assert(layer_count == 1);
1688 assert(range->baseArrayLayer == 0);
1689 }
1690
1691 uint8_t mask = 0xf;
1692 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1693 mask = 0;
1694 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1695 mask |= 0x7;
1696 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1697 mask |= 0x8;
1698 }
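   /* D24S8 is cleared through the depth-as-color (AS_R8G8B8A8) format, where
    * the 24 depth bits occupy the RGB components and stencil occupies A, so
    * 0x7 selects depth and 0x8 selects stencil.
    */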
1699
1700 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1701
1702 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1703 ops->clear_value(cs, image->vk_format, clear_value);
1704
1705 for (unsigned j = 0; j < level_count; j++) {
1706 if (image->type == VK_IMAGE_TYPE_3D)
1707 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1708
1709 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1710 u_minify(image->extent.width, range->baseMipLevel + j),
1711 u_minify(image->extent.height, range->baseMipLevel + j)
1712 });
1713
1714 struct tu_image_view dst;
1715 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1716 .aspectMask = range->aspectMask,
1717 .mipLevel = range->baseMipLevel + j,
1718 .baseArrayLayer = range->baseArrayLayer,
1719 .layerCount = 1,
1720 }, 0, false);
1721
1722 for (uint32_t i = 0; i < layer_count; i++) {
1723 ops->dst(cs, &dst, i);
1724 ops->run(cmd, cs);
1725 }
1726 }
1727 }
1728
1729 void
1730 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1731 VkImage image_h,
1732 VkImageLayout imageLayout,
1733 const VkClearColorValue *pColor,
1734 uint32_t rangeCount,
1735 const VkImageSubresourceRange *pRanges)
1736 {
1737 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1738 TU_FROM_HANDLE(tu_image, image, image_h);
1739
1740 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1741
1742 for (unsigned i = 0; i < rangeCount; i++)
1743 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1744 }
1745
1746 void
1747 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1748 VkImage image_h,
1749 VkImageLayout imageLayout,
1750 const VkClearDepthStencilValue *pDepthStencil,
1751 uint32_t rangeCount,
1752 const VkImageSubresourceRange *pRanges)
1753 {
1754 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1755 TU_FROM_HANDLE(tu_image, image, image_h);
1756
1757 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1758
1759 for (unsigned i = 0; i < rangeCount; i++)
1760 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1761 }
1762
1763 static void
1764 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1765 uint32_t attachment_count,
1766 const VkClearAttachment *attachments,
1767 uint32_t rect_count,
1768 const VkClearRect *rects)
1769 {
1770 const struct tu_subpass *subpass = cmd->state.subpass;
1771    /* note: cannot use the shader path here; the special shader path
1772     * lives in tu_clear_sysmem_attachments()
1773     */
1774 const struct blit_ops *ops = &r2d_ops;
1775 struct tu_cs *cs = &cmd->draw_cs;
1776
1777 for (uint32_t j = 0; j < attachment_count; j++) {
1778 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1779 * Pass Instance" says that:
1780 *
1781 * Unlike other clear commands, vkCmdClearAttachments executes as
1782 * a drawing command, rather than a transfer command, with writes
1783 * performed by it executing in rasterization order. Clears to
1784 * color attachments are executed as color attachment writes, by
1785 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1786 * Clears to depth/stencil attachments are executed as depth
1787 * writes and writes by the
1788 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1789 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1790 *
1791 * However, the 2d path here is executed the same way as a
1792 * transfer command, using the CCU color cache exclusively with
1793 * a special depth-as-color format for depth clears. This means that
1794 * we can't rely on the normal pipeline barrier mechanism here, and
1795 * have to manually flush whenever using a different cache domain
1796 * from what the 3d path would've used. This happens when we clear
1797 * depth/stencil, since normally depth attachments use CCU depth, but
1798 * we clear it using a special depth-as-color format. Since the clear
1799 * potentially uses a different attachment state we also need to
1800 * invalidate color beforehand and flush it afterwards.
1801 */
1802
1803 uint32_t a;
1804 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1805 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1806 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1807 } else {
1808 a = subpass->depth_stencil_attachment.attachment;
1809 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1810 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1811 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1812 }
1813
1814 if (a == VK_ATTACHMENT_UNUSED)
1815 continue;
1816
1817 uint8_t mask = 0xf;
1818 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1819 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1820 mask &= ~0x7;
1821 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1822 mask &= ~0x8;
1823 }
1824
1825 const struct tu_image_view *iview =
1826 cmd->state.framebuffer->attachments[a].attachment;
1827
1828 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1829 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1830
1831 /* Wait for the flushes we triggered manually to complete */
1832 tu_cs_emit_wfi(cs);
1833
1834 for (uint32_t i = 0; i < rect_count; i++) {
1835 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1836 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1837 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1838 ops->run(cmd, cs);
1839 }
1840 }
1841
1842 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1843 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1844 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1845 } else {
1846 /* sync color into depth */
1847 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1848 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1849 }
1850 }
1851 }
1852
1853 static void
1854 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1855 uint32_t attachment_count,
1856 const VkClearAttachment *attachments,
1857 uint32_t rect_count,
1858 const VkClearRect *rects)
1859 {
1860    /* the shader path here is special; it avoids changing MRT/etc state */
1861 const struct tu_render_pass *pass = cmd->state.pass;
1862 const struct tu_subpass *subpass = cmd->state.subpass;
1863 const uint32_t mrt_count = subpass->color_count;
1864 struct tu_cs *cs = &cmd->draw_cs;
1865 uint32_t clear_value[MAX_RTS][4];
1866 float z_clear_val = 0.0f;
1867 uint8_t s_clear_val = 0;
1868 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1869 bool z_clear = false;
1870 bool s_clear = false;
1871 bool layered_clear = false;
1872 uint32_t max_samples = 1;
1873
1874 for (uint32_t i = 0; i < attachment_count; i++) {
1875 uint32_t a;
1876 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1877 uint32_t c = attachments[i].colorAttachment;
1878 a = subpass->color_attachments[c].attachment;
1879 if (a == VK_ATTACHMENT_UNUSED)
1880 continue;
1881
1882 clear_rts |= 1 << c;
1883 clear_components |= 0xf << (c * 4);
1884 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1885 } else {
1886 a = subpass->depth_stencil_attachment.attachment;
1887 if (a == VK_ATTACHMENT_UNUSED)
1888 continue;
1889
1890 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1891 z_clear = true;
1892 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1893 }
1894
1895 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1896 s_clear = true;
1897 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1898 }
1899 }
1900
1901 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1902 }
1903
1904    /* Prefer to use the 2D path for clears.
1905     * The 2D path can't clear separate depth/stencil or MSAA, and it needs a known framebuffer.
1906     */
1907 if (max_samples == 1 && cmd->state.framebuffer) {
1908 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1909 return;
1910 }
1911
1912 /* This clear path behaves like a draw, needs the same flush as tu_draw */
1913 tu_emit_cache_flush_renderpass(cmd, cs);
1914
1915    /* Disable all draw states so they don't interfere.
1916     * TODO: use and re-use draw states for this path.
1917     * We have to disable the draw states individually to preserve the
1918     * input attachment states, because a secondary command buffer
1919     * won't be able to restore them.
1920     */
1921 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1922 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1923 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1924 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1925 continue;
1926 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1927 CP_SET_DRAW_STATE__0_DISABLE);
1928 tu_cs_emit_qw(cs, 0);
1929 }
1930 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1931
1932 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1933 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1934 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1935 0xfc000000);
1936 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1937
1938 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1939 for (uint32_t i = 0; i < mrt_count; i++) {
1940 if (clear_rts & (1 << i))
1941 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1942 else
1943 tu_cs_emit(cs, 0);
1944 }
1945
1946 for (uint32_t i = 0; i < rect_count; i++) {
1947 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1948 layered_clear = true;
1949 }
1950
1951 r3d_common(cmd, cs, false, num_rts, layered_clear);
1952
1953 tu_cs_emit_regs(cs,
1954 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1955 tu_cs_emit_regs(cs,
1956 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1957
1958 tu_cs_emit_regs(cs,
1959 A6XX_RB_FS_OUTPUT_CNTL0(),
1960 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1961
1962 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1963 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1964 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1965 for (uint32_t i = 0; i < mrt_count; i++) {
1966 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1967 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1968 }
1969
1970 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1971 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1972 .z_enable = z_clear,
1973 .z_write_enable = z_clear,
1974 .zfunc = FUNC_ALWAYS));
1975 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1976 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1977 .stencil_enable = s_clear,
1978 .func = FUNC_ALWAYS,
1979 .zpass = STENCIL_REPLACE));
1980 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1981 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1982 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1983
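   /* Upload one vec4 constant per cleared RT, in bit order of clear_rts,
    * matching the compacted output registers assigned via SP_FS_OUTPUT_REG
    * above; the clear fragment shader presumably just copies each constant
    * to its render target output.
    */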
1984 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1985 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1986 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1987 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1988 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1989 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1990 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1991 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1992 for_each_bit(b, clear_rts)
1993 tu_cs_emit_array(cs, clear_value[b], 4);
1994
1995 for (uint32_t i = 0; i < rect_count; i++) {
1996 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1997 r3d_coords_raw(cs, layered_clear, (float[]) {
1998 rects[i].rect.offset.x, rects[i].rect.offset.y,
1999 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2000 rects[i].rect.offset.x + rects[i].rect.extent.width,
2001 rects[i].rect.offset.y + rects[i].rect.extent.height,
2002 z_clear_val, 1.0f,
2003 });
2004
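         /* Layered clears draw a single point with the GS enabled; the GS set
          * up by r3d_common() is expected to expand it into a full-rect
          * primitive on the layer passed through the raw coords above
          * (uif(rects[i].baseArrayLayer + layer)).
          */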
2005 if (layered_clear) {
2006 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2007 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2008 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2009 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2010 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2011 tu_cs_emit(cs, 1); /* instance count */
2012 tu_cs_emit(cs, 1); /* vertex count */
2013 } else {
2014 r3d_run(cmd, cs);
2015 }
2016 }
2017 }
2018 }
2019
2020 static void
2021 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2022 {
2023 enum pipe_format pformat = vk_format_to_pipe_format(format);
2024
2025 switch (format) {
2026 case VK_FORMAT_X8_D24_UNORM_PACK32:
2027 case VK_FORMAT_D24_UNORM_S8_UINT:
2028 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2029 val->depthStencil.stencil << 24;
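      /* For example, depth = 1.0 and stencil = 0x80 pack to
       * 0xffffff | (0x80 << 24) = 0x80ffffff.
       */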
2030 return;
2031 case VK_FORMAT_D16_UNORM:
2032 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2033 return;
2034 case VK_FORMAT_D32_SFLOAT:
2035 clear_value[0] = fui(val->depthStencil.depth);
2036 return;
2037 case VK_FORMAT_S8_UINT:
2038 clear_value[0] = val->depthStencil.stencil;
2039 return;
2040    /* These formats use a different base format when tiled;
2041     * the same format can be used for both because GMEM is always in WZYX order.
2042     */
2043 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2044 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2045 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
2046 default:
2047 break;
2048 }
2049
2050 VkClearColorValue color;
2051
2052    /*
2053     * GMEM is tiled and wants the components in WZYX order; apply the
2054     * swizzle to the color before packing, to counteract the deswizzling
2055     * applied by the packing functions.
2056     */
2057 pipe_swizzle_4f(color.float32, val->color.float32,
2058 util_format_description(pformat)->swizzle);
2059
2060 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2061 }
2062
2063 static void
2064 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2065 struct tu_cs *cs,
2066 uint32_t attachment,
2067 uint8_t component_mask,
2068 const VkClearValue *value)
2069 {
2070 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2071 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2072 * because D24S8 is cleared with AS_R8G8B8A8 format
2073 */
2074
2075 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2076 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2077
2078 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2079 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2080
2081 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2082 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2083
2084 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2085 tu_cs_emit(cs, 0);
2086
2087 uint32_t clear_vals[4] = {};
2088 pack_gmem_clear_value(value, vk_format, clear_vals);
2089
2090 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2091 tu_cs_emit_array(cs, clear_vals, 4);
2092
2093 tu6_emit_event_write(cmd, cs, BLIT);
2094 }
2095
2096 static void
2097 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2098 uint32_t attachment_count,
2099 const VkClearAttachment *attachments,
2100 uint32_t rect_count,
2101 const VkClearRect *rects)
2102 {
2103 const struct tu_subpass *subpass = cmd->state.subpass;
2104 struct tu_cs *cs = &cmd->draw_cs;
2105
2106 /* TODO: swap the loops for smaller cmdstream */
2107 for (unsigned i = 0; i < rect_count; i++) {
2108 unsigned x1 = rects[i].rect.offset.x;
2109 unsigned y1 = rects[i].rect.offset.y;
2110 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2111 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2112
2113 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2114 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2115 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2116
2117 for (unsigned j = 0; j < attachment_count; j++) {
2118 uint32_t a;
2119 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2120 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2121 else
2122 a = subpass->depth_stencil_attachment.attachment;
2123
2124 if (a == VK_ATTACHMENT_UNUSED)
2125 continue;
2126
2127 unsigned clear_mask = 0xf;
2128 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2129 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2130 clear_mask &= ~0x7;
2131 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2132 clear_mask &= ~0x8;
2133 }
2134
2135 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2136 &attachments[j].clearValue);
2137 }
2138 }
2139 }
2140
2141 void
2142 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2143 uint32_t attachmentCount,
2144 const VkClearAttachment *pAttachments,
2145 uint32_t rectCount,
2146 const VkClearRect *pRects)
2147 {
2148 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149 struct tu_cs *cs = &cmd->draw_cs;
2150
2151 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2152 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2153 tu_cond_exec_end(cs);
2154
2155 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2156 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2157 tu_cond_exec_end(cs);
2158 }
2159
2160 void
2161 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2162 struct tu_cs *cs,
2163 uint32_t a,
2164 const VkRenderPassBeginInfo *info)
2165 {
2166 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2167 const struct tu_image_view *iview = fb->attachments[a].attachment;
2168 const struct tu_render_pass_attachment *attachment =
2169 &cmd->state.pass->attachments[a];
2170 uint8_t mask = 0;
2171
2172 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2173 mask = 0xf;
2174 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2175 mask |= 0x7;
2176 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2177 mask |= 0x8;
2178
2179 if (!mask)
2180 return;
2181
2182 const struct blit_ops *ops = &r2d_ops;
2183 if (attachment->samples > 1)
2184 ops = &r3d_ops;
2185
2186 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2187 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2188 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2189
2190 /* Wait for any flushes at the beginning of the renderpass to complete */
2191 tu_cs_emit_wfi(cs);
2192
2193 for (uint32_t i = 0; i < fb->layers; i++) {
2194 ops->dst(cs, iview, i);
2195 ops->run(cmd, cs);
2196 }
2197
2198 /* The spec doesn't explicitly say, but presumably the initial renderpass
2199 * clear is considered part of the renderpass, and therefore barriers
2200 * aren't required inside the subpass/renderpass. Therefore we need to
2201 * flush CCU color into CCU depth here, just like with
2202 * vkCmdClearAttachments(). Note that because this only happens at the
2203 * beginning of a renderpass, and renderpass writes are considered
2204 * "incoherent", we shouldn't have to worry about syncing depth into color
2205 * beforehand as depth should already be flushed.
2206 */
2207 if (vk_format_is_depth_or_stencil(attachment->format)) {
2208 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2209 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2210 } else {
2211 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2212 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2213 }
2214 }
2215
2216 void
2217 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2218 struct tu_cs *cs,
2219 uint32_t a,
2220 const VkRenderPassBeginInfo *info)
2221 {
2222 const struct tu_render_pass_attachment *attachment =
2223 &cmd->state.pass->attachments[a];
2224 unsigned clear_mask = 0;
2225
2226 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2227 clear_mask = 0xf;
2228 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2229 clear_mask |= 0x7;
2230 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2231 clear_mask |= 0x8;
2232
2233 if (!clear_mask)
2234 return;
2235
2236 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2237
2238 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2239 &info->pClearValues[a]);
2240 }
2241
2242 static void
2243 tu_emit_blit(struct tu_cmd_buffer *cmd,
2244 struct tu_cs *cs,
2245 const struct tu_image_view *iview,
2246 const struct tu_render_pass_attachment *attachment,
2247 bool resolve)
2248 {
2249 tu_cs_emit_regs(cs,
2250 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2251
2252 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2253 .unk0 = !resolve,
2254 .gmem = !resolve,
2255 /* "integer" bit disables msaa resolve averaging */
2256 .integer = vk_format_is_int(attachment->format)));
2257
2258 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2259 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2260 tu_cs_image_ref_2d(cs, iview, 0, false);
2261
2262 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2263 tu_cs_image_flag_ref(cs, iview, 0);
2264
2265 tu_cs_emit_regs(cs,
2266 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2267
2268 tu6_emit_event_write(cmd, cs, BLIT);
2269 }
2270
2271 static bool
2272 blit_can_resolve(VkFormat format)
2273 {
2274 const struct util_format_description *desc = vk_format_description(format);
2275
2276 /* blit event can only do resolve for simple cases:
2277 * averaging samples as unsigned integers or choosing only one sample
2278 */
2279 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2280 return false;
2281
2282    /* can't do formats with channel sizes larger than 10 bits
2283     * note: this includes all float formats
2284     * note2: single-channel integer formats seem OK
2285     */
2286 if (desc->channel[0].size > 10)
2287 return false;
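   /* e.g. 8-bit and 10-bit normalized/integer formats can generally use the
    * blit event, while 16/32-bit and float formats fall back to the 2D path.
    */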
2288
2289 switch (format) {
2290    /* For unknown reasons the blit event can't MSAA-resolve these formats when
2291     * tiled, likely because they have a different layout from other cpp=2 formats.
2292     */
2293 case VK_FORMAT_R8G8_UNORM:
2294 case VK_FORMAT_R8G8_UINT:
2295 case VK_FORMAT_R8G8_SINT:
2296 /* TODO: this one should be able to work? */
2297 case VK_FORMAT_D24_UNORM_S8_UINT:
2298 return false;
2299 default:
2300 break;
2301 }
2302
2303 return true;
2304 }
2305
2306 void
2307 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2308 struct tu_cs *cs,
2309 uint32_t a,
2310 bool force_load)
2311 {
2312 const struct tu_image_view *iview =
2313 cmd->state.framebuffer->attachments[a].attachment;
2314 const struct tu_render_pass_attachment *attachment =
2315 &cmd->state.pass->attachments[a];
2316
2317 if (attachment->load || force_load)
2318 tu_emit_blit(cmd, cs, iview, attachment, false);
2319 }
2320
2321 void
2322 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2323 struct tu_cs *cs,
2324 uint32_t a,
2325 uint32_t gmem_a)
2326 {
2327 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2328 const VkRect2D *render_area = &tiling->render_area;
2329 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2330 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2331 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2332
2333 if (!dst->store)
2334 return;
2335
2336 uint32_t x1 = render_area->offset.x;
2337 uint32_t y1 = render_area->offset.y;
2338 uint32_t x2 = x1 + render_area->extent.width;
2339 uint32_t y2 = y1 + render_area->extent.height;
2340    /* x2/y2 can be unaligned if equal to the size of the image, since the
2341     * blit will write into padding space.
2342     * The one exception is linear levels, which don't have the required
2343     * y padding in the layout (except for the last level).
2344     */
2345 bool need_y2_align =
2346 y2 != iview->extent.height || iview->need_y2_align;
2347
2348 bool unaligned =
2349 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2350 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2351
2352 /* use fast path when render area is aligned, except for unsupported resolve cases */
2353 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2354 tu_emit_blit(cmd, cs, iview, src, true);
2355 return;
2356 }
2357
2358 if (dst->samples > 1) {
2359 /* I guess we need to use shader path in this case?
2360 * need a testcase which fails because of this
2361 */
2362 tu_finishme("unaligned store of msaa attachment\n");
2363 return;
2364 }
2365
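   /* Fallback: store with the 2D engine (CP_BLIT), binding GMEM directly as a
    * tiled (TILE6_2) source through the SP_PS_2D_SRC registers below, with the
    * source pitch set to one GMEM tile row (tile0 width * cpp).
    */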
2366 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2367 r2d_dst(cs, iview, 0);
2368 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2369
2370 tu_cs_emit_regs(cs,
2371 A6XX_SP_PS_2D_SRC_INFO(
2372 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2373 .tile_mode = TILE6_2,
2374 .srgb = vk_format_is_srgb(src->format),
2375 .samples = tu_msaa_samples(src->samples),
2376 .samples_average = !vk_format_is_int(src->format),
2377 .unk20 = 1,
2378 .unk22 = 1),
2379 /* note: src size does not matter when not scaling */
2380 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2381 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2382 A6XX_SP_PS_2D_SRC_HI(),
2383 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2384
2385 /* sync GMEM writes with CACHE. */
2386 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2387
2388 /* Wait for CACHE_INVALIDATE to land */
2389 tu_cs_emit_wfi(cs);
2390
2391 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2392 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2393
2394 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2395 * sysmem, and we generally assume that GMEM renderpasses leave their
2396 * results in sysmem, so we need to flush manually here.
2397 */
2398 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2399 }