turnip: enable 420_UNORM formats
mesa.git: src/freedreno/vulkan/tu_clear_blit.c
/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/u_half.h"

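/* Convert a float in [0, 1] to an n-bit UNORM value. Rounding is
 * round-half-to-even, so e.g. tu_pack_float32_for_unorm(0.5f, 8)
 * gives 128 (0.5 * 255 = 127.5, which rounds to the even value 128).
 */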
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

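/* Map a hardware color format to the 2D engine's intermediate format
 * class. The ifmt controls the precision the blitter converts through,
 * and in particular how the RB_2D_SRC_SOLID clear values emitted below
 * are encoded (UNORM8 bytes, half floats, raw integers, etc.).
 */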
static enum a6xx_2d_ifmt
format_to_ifmt(enum a6xx_format fmt)
{
   switch (fmt) {
   case FMT6_A8_UNORM:
   case FMT6_8_UNORM:
   case FMT6_8_SNORM:
   case FMT6_8_8_UNORM:
   case FMT6_8_8_SNORM:
   case FMT6_8_8_8_8_UNORM:
   case FMT6_8_8_8_X8_UNORM:
   case FMT6_8_8_8_8_SNORM:
   case FMT6_4_4_4_4_UNORM:
   case FMT6_5_5_5_1_UNORM:
   case FMT6_5_6_5_UNORM:
   case FMT6_Z24_UNORM_S8_UINT:
   case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
      return R2D_UNORM8;

   case FMT6_32_UINT:
   case FMT6_32_SINT:
   case FMT6_32_32_UINT:
   case FMT6_32_32_SINT:
   case FMT6_32_32_32_32_UINT:
   case FMT6_32_32_32_32_SINT:
      return R2D_INT32;

   case FMT6_16_UINT:
   case FMT6_16_SINT:
   case FMT6_16_16_UINT:
   case FMT6_16_16_SINT:
   case FMT6_16_16_16_16_UINT:
   case FMT6_16_16_16_16_SINT:
   case FMT6_10_10_10_2_UINT:
      return R2D_INT16;

   case FMT6_8_UINT:
   case FMT6_8_SINT:
   case FMT6_8_8_UINT:
   case FMT6_8_8_SINT:
   case FMT6_8_8_8_8_UINT:
   case FMT6_8_8_8_8_SINT:
      return R2D_INT8;

   case FMT6_16_UNORM:
   case FMT6_16_SNORM:
   case FMT6_16_16_UNORM:
   case FMT6_16_16_SNORM:
   case FMT6_16_16_16_16_UNORM:
   case FMT6_16_16_16_16_SNORM:
   case FMT6_32_FLOAT:
   case FMT6_32_32_FLOAT:
   case FMT6_32_32_32_32_FLOAT:
      return R2D_FLOAT32;

   case FMT6_16_FLOAT:
   case FMT6_16_16_FLOAT:
   case FMT6_16_16_16_16_FLOAT:
   case FMT6_11_11_10_FLOAT:
   case FMT6_10_10_10_2_UNORM:
   case FMT6_10_10_10_2_UNORM_DEST:
      return R2D_FLOAT16;

   default:
      unreachable("bad format");
      return 0;
   }
}

static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
                   A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
}

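/* Pack a VkClearValue into the RB_2D_SRC_SOLID_C0..C3 registers based on
 * the format's ifmt class. For example, clearing D24S8 depth to 1.0 packs
 * tu_pack_float32_for_unorm(1.0, 24) == 0xffffff bytewise into the r/g/b
 * channels of the Z24_UNORM_S8_UINT_AS_R8G8B8A8 view, with the stencil
 * value in the fourth channel.
 */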
static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = util_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
                   A6XX_SP_PS_2D_SRC_HI(va >> 32),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   assert(iview->image->samples == 1);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, iview->RB_2D_DST_INFO);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format)),
                   A6XX_RB_2D_DST_LO((uint32_t) va),
                   A6XX_RB_2D_DST_HI(va >> 32),
                   A6XX_RB_2D_DST_SIZE(.pitch = pitch));
}

static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear,
                 bool scissor)
{
   enum a6xx_format format = tu6_base_format(vk_format);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .scissor = scissor,
         .rotate = rotation,
         .solid_color = clear,
         .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .color_format = format,
         .mask = 0xf,
         .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (format == FMT6_10_10_10_2_UNORM_DEST)
      format = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
         .sint = vk_format_is_sint(vk_format),
         .uint = vk_format_is_uint(vk_format),
         .color_format = format,
         .srgb = vk_format_is_srgb(vk_format),
         .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear)
{
   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, false);
}

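/* Issue one blit/clear with the state set up above. A typical sequence is
 * r2d_setup() once per format, then per layer: r2d_dst() (or
 * r2d_dst_buffer()), r2d_src()/r2d_src_buffer() or r2d_clear_value(),
 * r2d_coords(), r2d_run().
 */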
static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

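/* Emit the hand-assembled ir3 shaders used by the shader path: a vertex
 * shader that selects between the two coordinate pairs in c0/c1 based on
 * vertex id, a fragment shader that moves up to num_rts clear colors from
 * consts to its outputs (or samples the source when blitting), and, for
 * layered clears, a geometry shader that builds each rect's vertices and
 * writes the target layer from a const.
 */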
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
           bool layered_clear)
{
   struct ir3_const_state dummy_const_state = {};
   struct ir3_shader dummy_shader = {};

   struct ir3_shader_variant vs = {
      .type = MESA_SHADER_VERTEX,
      .instrlen = 1,
      .constlen = 4,
      .info.max_reg = 1,
      .inputs_count = 1,
      .inputs[0] = {
         .slot = SYSTEM_VALUE_VERTEX_ID,
         .regid = regid(0, 3),
         .sysval = true,
      },
      .outputs_count = blit ? 2 : 1,
      .outputs[0] = {
         .slot = VARYING_SLOT_POS,
         .regid = regid(0, 0),
      },
      .outputs[1] = {
         .slot = VARYING_SLOT_VAR0,
         .regid = regid(1, 0),
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };
   if (layered_clear) {
      vs = (struct ir3_shader_variant) {
         .type = MESA_SHADER_VERTEX,
         .instrlen = 1,
         .info.max_reg = 0,
         .shader = &dummy_shader,
         .const_state = &dummy_const_state,
      };
   }

   struct ir3_shader_variant fs = {
      .type = MESA_SHADER_FRAGMENT,
      .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
      .constlen = align(num_rts, 4),
      .info.max_reg = MAX2(num_rts, 1) - 1,
      .total_in = blit ? 2 : 0,
      .num_samp = blit ? 1 : 0,
      .inputs_count = blit ? 2 : 0,
      .inputs[0] = {
         .slot = VARYING_SLOT_VAR0,
         .inloc = 0,
         .compmask = 3,
         .bary = true,
      },
      .inputs[1] = {
         .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
         .regid = regid(0, 0),
         .sysval = 1,
      },
      .num_sampler_prefetch = blit ? 1 : 0,
      .sampler_prefetch[0] = {
         .src = 0,
         .wrmask = 0xf,
         .cmd = 4,
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };

   struct ir3_shader_variant gs_shader = {
      .type = MESA_SHADER_GEOMETRY,
      .instrlen = 1,
      .constlen = 4,
      .info.max_reg = 1,
      .inputs_count = 1,
      .inputs[0] = {
         .slot = SYSTEM_VALUE_GS_HEADER_IR3,
         .regid = regid(0, 0),
         .sysval = true,
      },
      .outputs_count = 3,
      .outputs[0] = {
         .slot = VARYING_SLOT_POS,
         .regid = regid(0, 0),
      },
      .outputs[1] = {
         .slot = VARYING_SLOT_LAYER,
         .regid = regid(1, 1),
      },
      .outputs[2] = {
         .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
         .regid = regid(1, 0),
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   }, *gs = layered_clear ? &gs_shader : NULL;


#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }

   static const instr_t vs_code[] = {
      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
       * r1.xy = r0.w ? c1.zw : c0.zw
       * r0.w = 1.0f
       */
      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
           .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
           .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
      { .cat0 = { .opc = OPC_END } },
   };

   static const instr_t vs_layered[] = {
      { .cat0 = { .opc = OPC_CHMASK } },
      { .cat0 = { .opc = OPC_CHSH } },
   };

   static const instr_t gs_code[16] = {
      /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
      CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
           .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
      /* x = (local_id & 1) ? c1.x : c0.x */
      CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
      /* y = (local_id & 2) ? c1.y : c0.y */
      CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
      /* pred = (local_id >= 4), used by OPC_KILL */
      CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
      /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
      CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),

      MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
      MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */

      /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
           .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
           .src2 = 0,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),

      CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),

      { .cat0 = { .opc = OPC_KILL } },
      { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
   };
#define FS_OFFSET (16 * sizeof(instr_t))
#define GS_OFFSET (32 * sizeof(instr_t))

   /* shaders */
   struct tu_cs_memory shaders = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
                                 16 * sizeof(instr_t), &shaders);
   assert(result == VK_SUCCESS);

   if (layered_clear) {
      memcpy(shaders.map, vs_layered, sizeof(vs_layered));
      memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
   } else {
      memcpy(shaders.map, vs_code, sizeof(vs_code));
   }

   instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
   for (uint32_t i = 0; i < num_rts; i++) {
      /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
      *fs_code++ = (instr_t) { .cat1 = {
         .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
         .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
      } };
   }

   /* "bary.f (ei)r63.x, 0, r0.x" - note that the blob doesn't have this in
    * its blit path (it's not clear what allows it to omit it)
    */
   if (blit) {
      *fs_code++ = (instr_t) { .cat2 = {
         .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
         .dst = regid(63, 0), .src1_im = 1
      } };
   }
   *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
   /* note: assumed <= 16 instructions (MAX_RTS is 8) */

   tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   tu6_emit_vpc(cs, &vs, NULL, NULL, gs, &fs);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, &fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1),
                   A6XX_GRAS_UNKNOWN_8001(0));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());
}

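/* Upload the blit rect as 8 floats of VS (or GS, for layered clears)
 * consts: c0 = {dst.x1, dst.y1, src.x1, src.y1} and
 * c1 = {dst.x2, dst.y2, src.x2, src.y2}, matching what the sel.b32
 * instructions in the hand-assembled shaders above expect.
 */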
static void
r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, false, (float[]) {
      dst->x, dst->y,
      src_x1, src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}

static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case VK_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!vk_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   assert(result == VK_SUCCESS);

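   /* tu_cs_alloc gave us two descriptor-sized units: the texture descriptor
    * lives at texture.iova and the sampler directly after it, at
    * texture.iova + A6XX_TEX_CONST_DWORDS * 4 (see the emits below).
    */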
   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      0x1 | /* XXX used by blob, doesn't seem necessary */
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   r3d_src_common(cmd, cs, iview->descriptor,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   desc[0] =
      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(format.fmt) |
      A6XX_TEX_CONST_0_SWAP(format.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      // XXX to swizzle into .w for stencil buffer_to_image
      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}

static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu6_emit_msaa(cs, 1); /* TODO: move to setup */

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
                   A6XX_RB_MRT_BASE_HI(0, va >> 32),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}

static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear)
{
   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                      .color_format = tu6_base_format(vk_format),
                      .color_sint = vk_format_is_sint(vk_format),
                      .color_uint = vk_format_is_uint(vk_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
                      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
}

static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cs *cs,
                  const VkOffset2D *dst,
                  const VkOffset2D *src,
                  const VkExtent2D *extent);
   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
   void (*src)(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const struct tu_image_view *iview,
               uint32_t layer,
               VkFilter filter);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      VkFormat vk_format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height);
   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
};

static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src,
   .src_buffer = r2d_src_buffer,
   .dst = r2d_dst,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup,
   .run = r2d_run,
};

static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup,
   .run = r3d_run,
};

/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
       struct tu_cs *cs,
       const VkOffset3D *dst,
       const VkOffset3D *src,
       const VkExtent3D *extent)
{
   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}

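/* Pick a format that can represent the raw texel data for copies:
 * compressed formats are copied block-for-block as same-size uint formats
 * (e.g. BC1's 8-byte blocks copy as R32G32_UINT texels), and the planes of
 * multi-planar YUV formats like G8_B8R8_2PLANE_420_UNORM are copied as
 * plain R8/R8G8.
 */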
static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
   if (vk_format_is_compressed(format)) {
      switch (vk_format_get_blocksize(format)) {
      case 1: return VK_FORMAT_R8_UINT;
      case 2: return VK_FORMAT_R16_UINT;
      case 4: return VK_FORMAT_R32_UINT;
      case 8: return VK_FORMAT_R32G32_UINT;
      case 16: return VK_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   switch (format) {
   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return VK_FORMAT_R8G8_UNORM;
      /* fallthrough */
   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
      return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_D24_UNORM_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
         return VK_FORMAT_R8_UNORM;
      /* fallthrough */
   default:
      return format;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_UINT;
   }
}

static void
tu_image_view_copy_blit(struct tu_image_view *iview,
                        struct tu_image *image,
                        VkFormat format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool stencil_read)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
      .image = tu_image_to_handle(image),
      .viewType = VK_IMAGE_VIEW_TYPE_2D,
      .format = format,
      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
      .subresourceRange = {
         .aspectMask = aspect_mask,
         .baseMipLevel = subres->mipLevel,
         .levelCount = 1,
         .baseArrayLayer = subres->baseArrayLayer + layer,
         .layerCount = 1,
      },
   });
}

static void
tu_image_view_copy(struct tu_image_view *iview,
                   struct tu_image *image,
                   VkFormat format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer,
                   bool stencil_read)
{
   format = copy_format(format, subres->aspectMask, false);
   tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
}

static void
tu_image_view_blit(struct tu_image_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
}

static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
               struct tu_image *src_image,
               struct tu_image *dst_image,
               const VkImageBlit *info,
               VkFilter filter)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers;

   /* 2D blit can't do rotation mirroring from just coordinates */
   static const enum a6xx_rotation rotate[2][2] = {
      {ROTATE_0, ROTATE_HFLIP},
      {ROTATE_VFLIP, ROTATE_180},
   };

   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
                   (info->dstOffsets[1].y < info->dstOffsets[0].y);
   bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
                   (info->dstOffsets[1].z < info->dstOffsets[0].z);

   if (mirror_z) {
      tu_finishme("blit z mirror\n");
      return;
   }

   if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
       info->dstOffsets[1].z - info->dstOffsets[0].z) {
      tu_finishme("blit z filter\n");
      return;
   }

   layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
   if (info->dstSubresource.layerCount > 1) {
      assert(layers <= 1);
      layers = info->dstSubresource.layerCount;
   }

   /* BC1_RGB_* formats need to have their last component overridden with 1
    * when sampling, which is normally handled with the texture descriptor
    * swizzle. The 2d path can't handle that, so use the 3d path.
    *
    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
    * the 2d path.
    */

   if (dst_image->samples > 1 ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT)
      ops = &r3d_ops;

   /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
    * figure out why (should be able to pass all tests with only shader path)
    */

   ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
              rotate[mirror_y][mirror_x], false);

   if (ops == &r3d_ops) {
      r3d_coords_raw(cs, false, (float[]) {
         info->dstOffsets[0].x, info->dstOffsets[0].y,
         info->srcOffsets[0].x, info->srcOffsets[0].y,
         info->dstOffsets[1].x, info->dstOffsets[1].y,
         info->srcOffsets[1].x, info->srcOffsets[1].y
      });
   } else {
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
         A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
         A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
         A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct tu_image_view dst, src;
   tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
   tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);
      ops->src(cmd, cs, &src, i, filter);
      ops->run(cmd, cs);
   }
}

void
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage dstImage,
                VkImageLayout dstImageLayout,
                uint32_t regionCount,
                const VkImageBlit *pRegions,
                VkFilter filter)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (uint32_t i = 0; i < regionCount; ++i)
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
}

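/* Convert texel offsets/extents to block units for compressed formats,
 * e.g. a 10x10 texel region of a BC1 image (4x4 blocks) becomes a 3x3
 * block region via DIV_ROUND_UP.
 */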
static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}

static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat src_format =
      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
   const struct blit_ops *ops = &r2d_ops;

   /* special case for buffer to stencil */
   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      ops = &r3d_ops;
   }

   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
    * which matters for UBWC. buffer_to_image/etc can fail because of this
    */

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;

   ops->setup(cmd, cs,
              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
              info->imageSubresource.aspectMask, ROTATE_0, false);

   struct tu_image_view dst;
   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);

      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
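      /* The blitter needs 64-byte-aligned addresses and pitches; for
       * unaligned cases, copy one row at a time from the aligned-down
       * address, folding the residual byte offset into the source x.
       */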
      if ((src_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1);
            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
         ops->run(cmd, cs);
      }
   }
}

void
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
                        VkBuffer srcBuffer,
                        VkImage dstImage,
                        VkImageLayout dstImageLayout,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);

   tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
}

static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat dst_format =
      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
   bool stencil_read = false;

   if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      stencil_read = true;
   }

   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);

   struct tu_image_view src;
   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);

      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
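      /* same 64-byte alignment workaround as in tu_copy_buffer_to_image,
       * with the residual offset folded into the destination x instead
       */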
      if ((dst_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         ops->dst_buffer(cs, dst_format, dst_va, pitch);
         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
         ops->run(cmd, cs);
      }
   }
}

void
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
                        VkImage srcImage,
                        VkImageLayout srcImageLayout,
                        VkBuffer dstBuffer,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
}

/* Tiled formats don't support swapping, which means that we can't support
 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
 * formats like B5G5R5A1 have a separate linear-only format when sampling.
 * Currently we fake support for tiled swapped formats and use the unswapped
 * format instead, but this means that reinterpreting copies to and from
 * swapped formats can't be performed correctly unless we can swizzle the
 * components by reinterpreting the other image as the "correct" swapped
 * format, i.e. only when the other image is linear.
 */

static bool
is_swapped_format(VkFormat format)
{
   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}

/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
 * versa). This should mirror the logic in fdl6_layout.
 */
static bool
image_is_r8g8(struct tu_image *image)
{
   return image->layout[0].cpp == 2 &&
          vk_format_get_nr_components(image->vk_format) == 2;
}

static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy *info)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (dst_image->samples > 1)
      ops = &r3d_ops;

   VkFormat format = VK_FORMAT_UNDEFINED;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);

   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      use_staging_blit = true;
   }

   struct tu_image_view dst, src;

   if (use_staging_blit) {
      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);

      struct tu_image staging_image = {
         .vk_format = src_format,
         .type = src_image->type,
         .tiling = VK_IMAGE_TILING_LINEAR,
         .extent = extent,
         .level_count = 1,
         .layer_count = info->srcSubresource.layerCount,
         .samples = src_image->samples,
         .bo_offset = 0,
      };

      VkImageSubresourceLayers staging_subresource = {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = info->srcSubresource.layerCount,
      };

      VkOffset3D staging_offset = { 0 };

      staging_image.layout[0].tile_mode = TILE6_LINEAR;
      staging_image.layout[0].ubwc = false;

      fdl6_layout(&staging_image.layout[0],
                  vk_format_to_pipe_format(staging_image.vk_format),
                  staging_image.samples,
                  staging_image.extent.width,
                  staging_image.extent.height,
                  staging_image.extent.depth,
                  staging_image.level_count,
                  staging_image.layer_count,
                  staging_image.type == VK_IMAGE_TYPE_3D,
                  NULL);

      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_image.layout[0].size,
                                          &staging_image.bo);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      tu_bo_list_add(&cmd->bo_list, staging_image.bo,
                     MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);

      struct tu_image_view staging;
      tu_image_view_copy(&staging, &staging_image, src_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
      coords(ops, cs, &staging_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &staging, i);
         ops->run(cmd, cs);
      }

      /* When executed by the user there has to be a pipeline barrier here,
       * but since we're doing it manually we'll have to flush ourselves.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      tu_image_view_copy(&staging, &staging_image, dst_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, ROTATE_0, false);
      coords(ops, cs, &dst_offset, &staging_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);

      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, ROTATE_0, false);
      coords(ops, cs, &dst_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }
}

void
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage destImage,
                VkImageLayout destImageLayout,
                uint32_t regionCount,
                const VkImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, destImage);

   tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);

   for (uint32_t i = 0; i < regionCount; ++i)
      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
}

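/* Copy between raw buffer addresses by treating the data as a 1-texel-tall
 * image: each pass 64-byte-aligns both addresses, folds the residual
 * offsets into x coordinates, and is clamped to the blitter's maximum
 * width of 0x4000 texels.
 */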
static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);

   while (blocks) {
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
      ops->dst_buffer(cs, format, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }
}

void
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer srcBuffer,
                 VkBuffer dstBuffer,
                 uint32_t regionCount,
                 const VkBufferCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
   tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);

   for (unsigned i = 0; i < regionCount; ++i) {
      copy_buffer(cmd,
                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
                  pRegions[i].size, 1);
   }
}

void
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);

   tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);

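   /* Stage the data in GPU-visible sub_cs memory and reuse copy_buffer()
    * with 4-byte blocks (Vulkan requires dstOffset and dataSize to be
    * multiples of 4 for vkCmdUpdateBuffer).
    */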
1574 struct tu_cs_memory tmp;
1575 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1576 if (result != VK_SUCCESS) {
1577 cmd->record_result = result;
1578 return;
1579 }
1580
1581 memcpy(tmp.map, pData, dataSize);
1582 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1583 }
1584
1585 void
1586 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1587 VkBuffer dstBuffer,
1588 VkDeviceSize dstOffset,
1589 VkDeviceSize fillSize,
1590 uint32_t data)
1591 {
1592 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1593 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1594 const struct blit_ops *ops = &r2d_ops;
1595 struct tu_cs *cs = &cmd->cs;
1596
1597 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1598
1599 if (fillSize == VK_WHOLE_SIZE)
1600 fillSize = buffer->size - dstOffset;
1601
1602 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1603 uint32_t blocks = fillSize / 4;
1604
1605 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true);
1606 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1607
1608 while (blocks) {
1609 uint32_t dst_x = (dst_va & 63) / 4;
1610 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1611
1612 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1613 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1614 ops->run(cmd, cs);
1615
1616 dst_va += width * 4;
1617 blocks -= width;
1618 }
1619 }
1620
1621 void
1622 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1623 VkImage srcImage,
1624 VkImageLayout srcImageLayout,
1625 VkImage dstImage,
1626 VkImageLayout dstImageLayout,
1627 uint32_t regionCount,
1628 const VkImageResolve *pRegions)
1629 {
1630 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1631 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1632 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1633 const struct blit_ops *ops = &r2d_ops;
1634 struct tu_cs *cs = &cmd->cs;
1635
1636 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1637 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1638
1639 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1640
1641 for (uint32_t i = 0; i < regionCount; ++i) {
1642 const VkImageResolve *info = &pRegions[i];
1643 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1644
1645 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1646 /* TODO: aspect masks possible ? */
1647
1648 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1649
1650 struct tu_image_view dst, src;
1651 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1652 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1653
1654 for (uint32_t i = 0; i < layers; i++) {
1655 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1656 ops->dst(cs, &dst, i);
1657 ops->run(cmd, cs);
1658 }
1659 }
1660 }
1661
1662 void
1663 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1664 struct tu_cs *cs,
1665 struct tu_image_view *src,
1666 struct tu_image_view *dst,
1667 uint32_t layers,
1668 const VkRect2D *rect)
1669 {
1670 const struct blit_ops *ops = &r2d_ops;
1671
1672 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1673 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1674
1675 assert(src->image->vk_format == dst->image->vk_format);
1676
1677 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1678 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1679
1680 for (uint32_t i = 0; i < layers; i++) {
1681 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1682 ops->dst(cs, dst, i);
1683 ops->run(cmd, cs);
1684 }
1685 }
1686
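/* Clears every requested level/layer of an image, using the 3D path for
 * MSAA images and the 2D path otherwise.  E5B9G9R9 is reinterpreted as
 * R32_UINT, presumably because the blit engine can't render it directly;
 * note that the clear value below is still packed from the original
 * format.  For 3D images the per-level layer count is the minified depth.
 */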
1687 static void
1688 clear_image(struct tu_cmd_buffer *cmd,
1689 struct tu_image *image,
1690 const VkClearValue *clear_value,
1691 const VkImageSubresourceRange *range)
1692 {
1693 uint32_t level_count = tu_get_levelCount(image, range);
1694 uint32_t layer_count = tu_get_layerCount(image, range);
1695 struct tu_cs *cs = &cmd->cs;
1696 VkFormat format = image->vk_format;
1697 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1698 format = VK_FORMAT_R32_UINT;
1699
1700 if (image->type == VK_IMAGE_TYPE_3D) {
1701 assert(layer_count == 1);
1702 assert(range->baseArrayLayer == 0);
1703 }
1704
1705 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1706
1707 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true);
1708 ops->clear_value(cs, image->vk_format, clear_value);
1709
1710 for (unsigned j = 0; j < level_count; j++) {
1711 if (image->type == VK_IMAGE_TYPE_3D)
1712 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1713
1714 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1715 u_minify(image->extent.width, range->baseMipLevel + j),
1716 u_minify(image->extent.height, range->baseMipLevel + j)
1717 });
1718
1719 struct tu_image_view dst;
1720 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1721 .aspectMask = range->aspectMask,
1722 .mipLevel = range->baseMipLevel + j,
1723 .baseArrayLayer = range->baseArrayLayer,
1724 .layerCount = 1,
1725 }, 0, false);
1726
1727 for (uint32_t i = 0; i < layer_count; i++) {
1728 ops->dst(cs, &dst, i);
1729 ops->run(cmd, cs);
1730 }
1731 }
1732 }
1733
1734 void
1735 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1736 VkImage image_h,
1737 VkImageLayout imageLayout,
1738 const VkClearColorValue *pColor,
1739 uint32_t rangeCount,
1740 const VkImageSubresourceRange *pRanges)
1741 {
1742 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1743 TU_FROM_HANDLE(tu_image, image, image_h);
1744
1745 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1746
1747 for (unsigned i = 0; i < rangeCount; i++)
1748 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1749 }
1750
1751 void
1752 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1753 VkImage image_h,
1754 VkImageLayout imageLayout,
1755 const VkClearDepthStencilValue *pDepthStencil,
1756 uint32_t rangeCount,
1757 const VkImageSubresourceRange *pRanges)
1758 {
1759 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1760 TU_FROM_HANDLE(tu_image, image, image_h);
1761
1762 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1763
1764 for (unsigned i = 0; i < rangeCount; i++)
1765 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1766 }
1767
1768 static void
1769 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1770 uint32_t attachment_count,
1771 const VkClearAttachment *attachments,
1772 uint32_t rect_count,
1773 const VkClearRect *rects)
1774 {
1775 const struct tu_subpass *subpass = cmd->state.subpass;
1776   /* note: the generic shader path cannot be used here; the special
1777    * shader path lives in tu_clear_sysmem_attachments()
1778    */
1779 const struct blit_ops *ops = &r2d_ops;
1780 struct tu_cs *cs = &cmd->draw_cs;
1781
1782 for (uint32_t j = 0; j < attachment_count; j++) {
1783      /* The Vulkan spec, section 17.2 "Clearing Images Inside a Render
1784 * Pass Instance" says that:
1785 *
1786 * Unlike other clear commands, vkCmdClearAttachments executes as
1787 * a drawing command, rather than a transfer command, with writes
1788 * performed by it executing in rasterization order. Clears to
1789 * color attachments are executed as color attachment writes, by
1790 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1791 * Clears to depth/stencil attachments are executed as depth
1792 * writes and writes by the
1793 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1794 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1795 *
1796 * However, the 2d path here is executed the same way as a
1797 * transfer command, using the CCU color cache exclusively with
1798 * a special depth-as-color format for depth clears. This means that
1799 * we can't rely on the normal pipeline barrier mechanism here, and
1800 * have to manually flush whenever using a different cache domain
1801 * from what the 3d path would've used. This happens when we clear
1802 * depth/stencil, since normally depth attachments use CCU depth, but
1803 * we clear it using a special depth-as-color format. Since the clear
1804 * potentially uses a different attachment state we also need to
1805 * invalidate color beforehand and flush it afterwards.
1806 */
1807
1808 uint32_t a;
1809 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1810 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1811 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1812 } else {
1813 a = subpass->depth_stencil_attachment.attachment;
1814 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1815 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1816 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1817 }
1818
1819 if (a == VK_ATTACHMENT_UNUSED)
1820 continue;
1821
1822 const struct tu_image_view *iview =
1823 cmd->state.framebuffer->attachments[a].attachment;
1824
1825 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask, ROTATE_0, true);
1826 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1827
1828 /* Wait for the flushes we triggered manually to complete */
1829 tu_cs_emit_wfi(cs);
1830
1831 for (uint32_t i = 0; i < rect_count; i++) {
1832 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1833 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1834 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1835 ops->run(cmd, cs);
1836 }
1837 }
1838
1839 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1840 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1841 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1842 } else {
1843 /* sync color into depth */
1844 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1845 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1846 }
1847 }
1848 }
1849
1850 static void
1851 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1852 uint32_t attachment_count,
1853 const VkClearAttachment *attachments,
1854 uint32_t rect_count,
1855 const VkClearRect *rects)
1856 {
1857   /* The shader path here is special: it avoids changing MRT/etc state */
1858 const struct tu_render_pass *pass = cmd->state.pass;
1859 const struct tu_subpass *subpass = cmd->state.subpass;
1860 const uint32_t mrt_count = subpass->color_count;
1861 struct tu_cs *cs = &cmd->draw_cs;
1862 uint32_t clear_value[MAX_RTS][4];
1863 float z_clear_val = 0.0f;
1864 uint8_t s_clear_val = 0;
1865 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1866 bool z_clear = false;
1867 bool s_clear = false;
1868 bool layered_clear = false;
1869 uint32_t max_samples = 1;
1870
1871 for (uint32_t i = 0; i < attachment_count; i++) {
1872 uint32_t a;
1873 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1874 uint32_t c = attachments[i].colorAttachment;
1875 a = subpass->color_attachments[c].attachment;
1876 if (a == VK_ATTACHMENT_UNUSED)
1877 continue;
1878
1879 clear_rts |= 1 << c;
1880 clear_components |= 0xf << (c * 4);
1881 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1882 } else {
1883 a = subpass->depth_stencil_attachment.attachment;
1884 if (a == VK_ATTACHMENT_UNUSED)
1885 continue;
1886
1887 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1888 z_clear = true;
1889 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1890 }
1891
1892 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1893 s_clear = true;
1894 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1895 }
1896 }
1897
1898 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1899 }
1900
1901   /* Prefer the 2D path for clears; it can't clear separate depth/stencil
1902    * or MSAA attachments and needs a known framebuffer, hence the check below.
1903    */
1904 if (max_samples == 1 && cmd->state.framebuffer) {
1905 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1906 return;
1907 }
1908
1909 /* This clear path behaves like a draw, needs the same flush as tu_draw */
1910 tu_emit_cache_flush_renderpass(cmd, cs);
1911
1912   /* Disable all draw states so they don't interfere.
1913    * TODO: use and re-use draw states for this path.
1914    * The draw states are disabled individually to preserve the input
1915    * attachment states, because a secondary command buffer
1916    * won't be able to restore them.
1917    */
1918 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1919 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1920 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1921 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1922 continue;
1923 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1924 CP_SET_DRAW_STATE__0_DISABLE);
1925 tu_cs_emit_qw(cs, 0);
1926 }
1927 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1928
1929 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1930 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1931 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1932 0xfc000000);
1933 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1934
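   /* Map each cleared MRT to the next consecutive FS output register
    * (0xfc above being the "unused register" value, so the clear shader
    * exports neither depth nor sample mask).  MRTs that aren't cleared
    * get no output register and are masked off via RB_MRT_CONTROL below,
    * so their contents are preserved.
    */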
1935 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1936 for (uint32_t i = 0; i < mrt_count; i++) {
1937 if (clear_rts & (1 << i))
1938 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1939 else
1940 tu_cs_emit(cs, 0);
1941 }
1942
1943 for (uint32_t i = 0; i < rect_count; i++) {
1944 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1945 layered_clear = true;
1946 }
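   /* Layered clears take a geometry shader variant of the blit program
    * (note the GS_ENABLE in the draw below): each rect/layer pair is drawn
    * as a single point, with the target layer passed through the vertex
    * coordinates via uif(baseArrayLayer + layer).
    */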
1947
1948 r3d_common(cmd, cs, false, num_rts, layered_clear);
1949
1950 tu_cs_emit_regs(cs,
1951 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1952 tu_cs_emit_regs(cs,
1953 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1954
1955 tu_cs_emit_regs(cs,
1956 A6XX_RB_FS_OUTPUT_CNTL0(),
1957 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1958
1959 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1960 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1961 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1962 for (uint32_t i = 0; i < mrt_count; i++) {
1963 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1964 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1965 }
1966
1967 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1968 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1969 .z_enable = z_clear,
1970 .z_write_enable = z_clear,
1971 .zfunc = FUNC_ALWAYS));
1972 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1973 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1974 .stencil_enable = s_clear,
1975 .func = FUNC_ALWAYS,
1976 .zpass = STENCIL_REPLACE));
1977 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1978 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1979 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1980
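   /* Upload the clear colors as FS constants, 4 dwords per cleared RT, in
    * the same consecutive order as the output registers assigned above.
    */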
1981 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1982 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1983 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1984 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1985 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1986 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1987 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1988 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1989 for_each_bit(b, clear_rts)
1990 tu_cs_emit_array(cs, clear_value[b], 4);
1991
1992 for (uint32_t i = 0; i < rect_count; i++) {
1993 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1994 r3d_coords_raw(cs, layered_clear, (float[]) {
1995 rects[i].rect.offset.x, rects[i].rect.offset.y,
1996 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1997 rects[i].rect.offset.x + rects[i].rect.extent.width,
1998 rects[i].rect.offset.y + rects[i].rect.extent.height,
1999 z_clear_val, 1.0f,
2000 });
2001
2002 if (layered_clear) {
2003 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2004 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2005 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2006 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2007 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2008 tu_cs_emit(cs, 1); /* instance count */
2009 tu_cs_emit(cs, 1); /* vertex count */
2010 } else {
2011 r3d_run(cmd, cs);
2012 }
2013 }
2014 }
2015 }
2016
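/* Pack a clear value into the raw bit pattern of the attachment's GMEM
 * layout.  Worked example (illustrative): VK_FORMAT_D24_UNORM_S8_UINT
 * with depth = 0.5 and stencil = 0x80 packs to
 * tu_pack_float32_for_unorm(0.5, 24) | 0x80 << 24 = 0x80800000.
 */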
2017 static void
2018 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2019 {
2020 enum pipe_format pformat = vk_format_to_pipe_format(format);
2021
2022 switch (format) {
2023 case VK_FORMAT_X8_D24_UNORM_PACK32:
2024 case VK_FORMAT_D24_UNORM_S8_UINT:
2025 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2026 val->depthStencil.stencil << 24;
2027 return;
2028 case VK_FORMAT_D16_UNORM:
2029 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2030 return;
2031 case VK_FORMAT_D32_SFLOAT:
2032 clear_value[0] = fui(val->depthStencil.depth);
2033 return;
2034 case VK_FORMAT_S8_UINT:
2035 clear_value[0] = val->depthStencil.stencil;
2036 return;
2037   /* These formats use a different base format when tiled; the same packed
2038    * value can be used for both because GMEM is always in WZYX order.
2039    */
2040 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2041 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2042 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
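      /* fallthrough */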
2043 default:
2044 break;
2045 }
2046
2047 VkClearColorValue color;
2048
2049   /*
2050    * GMEM is tiled and wants the components in WZYX order; apply the
2051    * swizzle to the color before packing to counteract the deswizzling
2052    * applied by the packing functions.
2053    */
2054 pipe_swizzle_4f(color.float32, val->color.float32,
2055 util_format_description(pformat)->swizzle);
2056
2057 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2058 }
2059
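/* Clear a GMEM attachment in place using the BLIT event: RB_BLIT_INFO.gmem
 * selects GMEM as the destination, clear_mask limits which aspects are
 * written, and the value comes pre-packed from pack_gmem_clear_value().
 */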
2060 static void
2061 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2062 struct tu_cs *cs,
2063 uint32_t attachment,
2064 VkImageAspectFlags mask,
2065 const VkClearValue *value)
2066 {
2067 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2068
2070 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2071 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2072
2073 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
2074 .clear_mask = aspect_write_mask(vk_format, mask)));
2075
2076 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2077 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2078
2079 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2080 tu_cs_emit(cs, 0);
2081
2082 uint32_t clear_vals[4] = {};
2083 pack_gmem_clear_value(value, vk_format, clear_vals);
2084
2085 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2086 tu_cs_emit_array(cs, clear_vals, 4);
2087
2088 tu6_emit_event_write(cmd, cs, BLIT);
2089 }
2090
2091 static void
2092 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2093 uint32_t attachment_count,
2094 const VkClearAttachment *attachments,
2095 uint32_t rect_count,
2096 const VkClearRect *rects)
2097 {
2098 const struct tu_subpass *subpass = cmd->state.subpass;
2099 struct tu_cs *cs = &cmd->draw_cs;
2100
2101   /* TODO: swap the loops for a smaller cmdstream */
2102 for (unsigned i = 0; i < rect_count; i++) {
2103 unsigned x1 = rects[i].rect.offset.x;
2104 unsigned y1 = rects[i].rect.offset.y;
2105 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2106 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2107
2108 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2109 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2110 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2111
2112 for (unsigned j = 0; j < attachment_count; j++) {
2113 uint32_t a;
2114 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2115 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2116 else
2117 a = subpass->depth_stencil_attachment.attachment;
2118
2119 if (a == VK_ATTACHMENT_UNUSED)
2120 continue;
2121
2122 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2123 &attachments[j].clearValue);
2124 }
2125 }
2126 }
2127
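/* Whether the render pass will execute in GMEM or sysmem mode isn't known
 * at record time, so both clear variants are emitted under CP_COND_EXEC
 * and the GPU runs only the one matching the actual render mode.
 */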
2128 void
2129 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2130 uint32_t attachmentCount,
2131 const VkClearAttachment *pAttachments,
2132 uint32_t rectCount,
2133 const VkClearRect *pRects)
2134 {
2135 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2136 struct tu_cs *cs = &cmd->draw_cs;
2137
2138 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2139 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2140 tu_cond_exec_end(cs);
2141
2142 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2143 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2144 tu_cond_exec_end(cs);
2145 }
2146
2147 void
2148 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2149 struct tu_cs *cs,
2150 uint32_t a,
2151 const VkRenderPassBeginInfo *info)
2152 {
2153 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2154 const struct tu_image_view *iview = fb->attachments[a].attachment;
2155 const struct tu_render_pass_attachment *attachment =
2156 &cmd->state.pass->attachments[a];
2157
2158 if (!attachment->clear_mask)
2159 return;
2160
2161 const struct blit_ops *ops = &r2d_ops;
2162 if (attachment->samples > 1)
2163 ops = &r3d_ops;
2164
2165 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, true);
2166 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2167 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2168
2169 /* Wait for any flushes at the beginning of the renderpass to complete */
2170 tu_cs_emit_wfi(cs);
2171
2172 for (uint32_t i = 0; i < fb->layers; i++) {
2173 ops->dst(cs, iview, i);
2174 ops->run(cmd, cs);
2175 }
2176
2177 /* The spec doesn't explicitly say, but presumably the initial renderpass
2178 * clear is considered part of the renderpass, and therefore barriers
2179 * aren't required inside the subpass/renderpass. Therefore we need to
2180 * flush CCU color into CCU depth here, just like with
2181 * vkCmdClearAttachments(). Note that because this only happens at the
2182 * beginning of a renderpass, and renderpass writes are considered
2183 * "incoherent", we shouldn't have to worry about syncing depth into color
2184 * beforehand as depth should already be flushed.
2185 */
2186 if (vk_format_is_depth_or_stencil(attachment->format)) {
2187 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2188 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2189 } else {
2190 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2191 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2192 }
2193 }
2194
2195 void
2196 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2197 struct tu_cs *cs,
2198 uint32_t a,
2199 const VkRenderPassBeginInfo *info)
2200 {
2201 const struct tu_render_pass_attachment *attachment =
2202 &cmd->state.pass->attachments[a];
2203
2204 if (!attachment->clear_mask)
2205 return;
2206
2207 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2208
2209 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2210 &info->pClearValues[a]);
2211 }
2212
2213 static void
2214 tu_emit_blit(struct tu_cmd_buffer *cmd,
2215 struct tu_cs *cs,
2216 const struct tu_image_view *iview,
2217 const struct tu_render_pass_attachment *attachment,
2218 bool resolve)
2219 {
2220 tu_cs_emit_regs(cs,
2221 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2222
2223 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2224 .unk0 = !resolve,
2225 .gmem = !resolve,
2226 /* "integer" bit disables msaa resolve averaging */
2227 .integer = vk_format_is_int(attachment->format)));
2228
2229 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2230 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2231 tu_cs_image_ref_2d(cs, iview, 0, false);
2232
2233 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2234 tu_cs_image_flag_ref(cs, iview, 0);
2235
2236 tu_cs_emit_regs(cs,
2237 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2238
2239 tu6_emit_event_write(cmd, cs, BLIT);
2240 }
2241
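/* E.g. VK_FORMAT_R8G8B8A8_UNORM can be resolved by the blit event, while
 * VK_FORMAT_R16G16B16A16_SFLOAT (16-bit channels) or any sRGB format
 * cannot, and falls back to the 2D-engine path in tu_store_gmem_attachment().
 */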
2242 static bool
2243 blit_can_resolve(VkFormat format)
2244 {
2245 const struct util_format_description *desc = vk_format_description(format);
2246
2247   /* The blit event can only do resolves in simple cases:
2248    * averaging samples as unsigned integers or picking a single sample.
2249    */
2250 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2251 return false;
2252
2253   /* Can't handle formats with channel sizes larger than 10 bits.
2254    * note: this includes all float formats
2255    * note2: single-channel integer formats seem OK
2256    */
2257 if (desc->channel[0].size > 10)
2258 return false;
2259
2260 switch (format) {
2261   /* For unknown reasons the blit event can't MSAA-resolve these formats when
2262    * tiled, likely because they have a different layout from other cpp=2 formats.
2263    */
2264 case VK_FORMAT_R8G8_UNORM:
2265 case VK_FORMAT_R8G8_UINT:
2266 case VK_FORMAT_R8G8_SINT:
2267 /* TODO: this one should be able to work? */
2268 case VK_FORMAT_D24_UNORM_S8_UINT:
2269 return false;
2270 default:
2271 break;
2272 }
2273
2274 return true;
2275 }
2276
2277 void
2278 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2279 struct tu_cs *cs,
2280 uint32_t a,
2281 bool force_load)
2282 {
2283 const struct tu_image_view *iview =
2284 cmd->state.framebuffer->attachments[a].attachment;
2285 const struct tu_render_pass_attachment *attachment =
2286 &cmd->state.pass->attachments[a];
2287
2288 if (attachment->load || force_load)
2289 tu_emit_blit(cmd, cs, iview, attachment, false);
2290 }
2291
2292 void
2293 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2294 struct tu_cs *cs,
2295 uint32_t a,
2296 uint32_t gmem_a)
2297 {
2298 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2299 const VkRect2D *render_area = &cmd->state.render_area;
2300 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2301 struct tu_image_view *iview = fb->attachments[a].attachment;
2302 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2303
2304 if (!dst->store)
2305 return;
2306
2307 uint32_t x1 = render_area->offset.x;
2308 uint32_t y1 = render_area->offset.y;
2309 uint32_t x2 = x1 + render_area->extent.width;
2310 uint32_t y2 = y1 + render_area->extent.height;
2311   /* x2/y2 can be unaligned when equal to the image size, since the store
2312    * then writes into padding space.  The one exception is linear levels,
2313    * which don't have the required y padding in the layout (except for the
2314    * last level).
2315    */
2316 bool need_y2_align =
2317 y2 != iview->extent.height || iview->need_y2_align;
2318
2319 bool unaligned =
2320 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2321 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
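   /* E.g. a render area that ends exactly at the image edge still counts
    * as aligned even when x2/y2 aren't multiples of the GMEM alignment,
    * because the (x2 != width) and need_y2_align terms forgive writes
    * that land in the padding.
    */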
2322
2323 /* use fast path when render area is aligned, except for unsupported resolve cases */
2324 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2325 tu_emit_blit(cmd, cs, iview, src, true);
2326 return;
2327 }
2328
2329 if (dst->samples > 1) {
2330      /* Presumably we need to use the shader path in this case?
2331       * Need a testcase which fails because of this.
2332       */
2333 tu_finishme("unaligned store of msaa attachment\n");
2334 return;
2335 }
2336
2337 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, true);
2338 r2d_dst(cs, iview, 0);
2339 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2340
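   /* Source the blit directly from GMEM: the attachment lives at
    * gmem_base + gmem_offset in the tiled TILE6_2 layout, with a pitch of
    * fb->tile0.width * src->cpp bytes.
    */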
2341 tu_cs_emit_regs(cs,
2342 A6XX_SP_PS_2D_SRC_INFO(
2343 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2344 .tile_mode = TILE6_2,
2345 .srgb = vk_format_is_srgb(src->format),
2346 .samples = tu_msaa_samples(src->samples),
2347 .samples_average = !vk_format_is_int(src->format),
2348 .unk20 = 1,
2349 .unk22 = 1),
2350 /* note: src size does not matter when not scaling */
2351 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2352 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2353 A6XX_SP_PS_2D_SRC_HI(),
2354 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2355
2356   /* sync GMEM writes with CACHE, so the blit below reads up-to-date data */
2357 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2358
2359 /* Wait for CACHE_INVALIDATE to land */
2360 tu_cs_emit_wfi(cs);
2361
2362 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2363 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2364
2365 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2366 * sysmem, and we generally assume that GMEM renderpasses leave their
2367 * results in sysmem, so we need to flush manually here.
2368 */
2369 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2370 }