mesa.git: src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
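/* Map a hardware color format to the 2D engine's internal sample format
 * (ifmt), which decides how clear values are packed below and which ifmt is
 * programmed into RB_2D_BLIT_CNTL.
 */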
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
111 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
114 }
115
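/* Pack the clear value into RB_2D_SRC_SOLID_C0..C3 according to the format's
 * ifmt (unorm8 / float16 / float32 / intN), with special cases for
 * depth/stencil formats and E5B9G9R9.
 */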
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
244 }
245
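/* Common 2D-path setup: pick the D24S8 write mask for depth-only or
 * stencil-only access, and program RB_2D_BLIT_CNTL/GRAS_2D_BLIT_CNTL and
 * SP_2D_SRC_FORMAT for the destination format.
 */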
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 VkImageAspectFlags aspect_mask,
251 enum a6xx_rotation rotation,
252 bool clear,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 /* note: the only format with partial clearing is D24S8 */
260 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
261 /* preserve stencil channel */
262 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
263 unknown_8c01 = 0x08000041;
264 /* preserve depth channels */
265 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
266 unknown_8c01 = 0x00084001;
267 }
268
269 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
270 tu_cs_emit(cs, unknown_8c01);
271
272 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
273 .scissor = scissor,
274 .rotate = rotation,
275 .solid_color = clear,
276 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
277 .color_format = format,
278 .mask = 0xf,
279 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
280 ).value;
281
282 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
283 tu_cs_emit(cs, blit_cntl);
284
285 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
286 tu_cs_emit(cs, blit_cntl);
287
288 if (format == FMT6_10_10_10_2_UNORM_DEST)
289 format = FMT6_16_16_16_16_FLOAT;
290
291 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
292 .sint = vk_format_is_sint(vk_format),
293 .uint = vk_format_is_uint(vk_format),
294 .color_format = format,
295 .srgb = vk_format_is_srgb(vk_format),
296 .mask = 0xf));
297 }
298
299 static void
300 r2d_setup(struct tu_cmd_buffer *cmd,
301 struct tu_cs *cs,
302 VkFormat vk_format,
303 VkImageAspectFlags aspect_mask,
304 enum a6xx_rotation rotation,
305 bool clear)
306 {
307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
308
309 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, false);
310 }
311
312 static void
313 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
314 {
315 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
316 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
317 }
318
319 /* r3d_ = shader path operations */
320
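/* Hand-assembled ir3 shaders used by the 3D (shader) path: a blit VS, a
 * layered-clear VS+GS pair, a blit FS, and one clear FS per MRT count.
 * They are copied into the global BO so they can later be referenced by iova.
 */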
321 void
322 tu_init_clear_blit_shaders(struct tu6_global *global)
323 {
324 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
325 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
326 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
327
328 static const instr_t vs_code[] = {
329 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
330 * r1.xy = r0.w ? c1.zw : c0.zw
331 * r0.w = 1.0f
332 */
333 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
334 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
335 .src2 = 3,
336 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
337 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
338 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
339 .src2 = 3,
340 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
341 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
342 { .cat0 = { .opc = OPC_END } },
343 };
344
345 static const instr_t vs_layered[] = {
346 { .cat0 = { .opc = OPC_CHMASK } },
347 { .cat0 = { .opc = OPC_CHSH } },
348 };
349
350 static const instr_t gs_code[] = {
351 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
352 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
353 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
354 /* x = (local_id & 1) ? c1.x : c0.x */
355 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
356 /* y = (local_id & 2) ? c1.y : c0.y */
357 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
358 /* pred = (local_id >= 4), used by OPC_KILL */
359 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
360 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
361 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
362
363 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
364 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
365 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
366
367 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
368 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
369 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
370 .src2 = 0,
371 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
372
373 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
374
375 { .cat0 = { .opc = OPC_KILL } },
376 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
377 };
378
379 static const instr_t fs_blit[] = {
380          /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
381           * blit path (it's not clear what allows it to omit it)
382 */
383 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
384 { .cat0 = { .opc = OPC_END } },
385 };
386
387 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
388 memcpy(&global->shaders[GLOBAL_SH_VS_LAYER], vs_layered, sizeof(vs_layered));
389 memcpy(&global->shaders[GLOBAL_SH_GS_LAYER], gs_code, sizeof(gs_code));
390 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
391
392 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
393 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
394 for (uint32_t i = 0; i < num_rts; i++) {
395 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
396 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
397 }
398 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
399 }
400 }
401
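/* Emit the shader and fixed-function state shared by all 3D-path blits and
 * clears: the hand-assembled VS/FS (plus GS for layered clears) from the
 * global BO, with viewport transform and clipping disabled.
 */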
402 static void
403 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
404 bool layered_clear)
405 {
406 struct ir3_const_state dummy_const_state = {};
407 struct ir3_shader dummy_shader = {};
408
409 struct ir3_shader_variant vs = {
410 .type = MESA_SHADER_VERTEX,
411 .instrlen = 1,
412 .constlen = 4,
413 .info.max_reg = 1,
414 .inputs_count = 1,
415 .inputs[0] = {
416 .slot = SYSTEM_VALUE_VERTEX_ID,
417 .regid = regid(0, 3),
418 .sysval = true,
419 },
420 .outputs_count = blit ? 2 : 1,
421 .outputs[0] = {
422 .slot = VARYING_SLOT_POS,
423 .regid = regid(0, 0),
424 },
425 .outputs[1] = {
426 .slot = VARYING_SLOT_VAR0,
427 .regid = regid(1, 0),
428 },
429 .shader = &dummy_shader,
430 .const_state = &dummy_const_state,
431 };
432 if (layered_clear) {
433 vs = (struct ir3_shader_variant) {
434 .type = MESA_SHADER_VERTEX,
435 .instrlen = 1,
436 .info.max_reg = 0,
437 .shader = &dummy_shader,
438 .const_state = &dummy_const_state,
439 };
440 }
441
442 struct ir3_shader_variant fs = {
443 .type = MESA_SHADER_FRAGMENT,
444 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
445 .constlen = align(num_rts, 4),
446 .info.max_reg = MAX2(num_rts, 1) - 1,
447 .total_in = blit ? 2 : 0,
448 .num_samp = blit ? 1 : 0,
449 .inputs_count = blit ? 2 : 0,
450 .inputs[0] = {
451 .slot = VARYING_SLOT_VAR0,
452 .inloc = 0,
453 .compmask = 3,
454 .bary = true,
455 },
456 .inputs[1] = {
457 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
458 .regid = regid(0, 0),
459 .sysval = 1,
460 },
461 .num_sampler_prefetch = blit ? 1 : 0,
462 .sampler_prefetch[0] = {
463 .src = 0,
464 .wrmask = 0xf,
465 .cmd = 4,
466 },
467 .shader = &dummy_shader,
468 .const_state = &dummy_const_state,
469 };
470
471 struct ir3_shader_variant gs_shader = {
472 .type = MESA_SHADER_GEOMETRY,
473 .instrlen = 1,
474 .constlen = 4,
475 .info.max_reg = 1,
476 .inputs_count = 1,
477 .inputs[0] = {
478 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
479 .regid = regid(0, 0),
480 .sysval = true,
481 },
482 .outputs_count = 3,
483 .outputs[0] = {
484 .slot = VARYING_SLOT_POS,
485 .regid = regid(0, 0),
486 },
487 .outputs[1] = {
488 .slot = VARYING_SLOT_LAYER,
489 .regid = regid(1, 1),
490 },
491 .outputs[2] = {
492 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
493 .regid = regid(1, 0),
494 },
495 .shader = &dummy_shader,
496 .const_state = &dummy_const_state,
497 }, *gs = layered_clear ? &gs_shader : NULL;
498
499 /* shaders */
500 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
501
502 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs,
503 global_iova(cmd, shaders[gs ? GLOBAL_SH_VS_LAYER : GLOBAL_SH_VS]));
504 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
505 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
506 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs,
507 global_iova(cmd, shaders[GLOBAL_SH_GS_LAYER]));
508 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
509 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
510
511 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
512 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
513
514 tu6_emit_vpc(cs, &vs, NULL, NULL, gs, &fs);
515
516 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
517 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
518 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
519
520 tu6_emit_fs_inputs(cs, &fs);
521
522 tu_cs_emit_regs(cs,
523 A6XX_GRAS_CL_CNTL(
524 .persp_division_disable = 1,
525 .vp_xform_disable = 1,
526 .vp_clip_code_ignore = 1,
527 .clip_disable = 1),
528 A6XX_GRAS_UNKNOWN_8001(0));
529 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
530
531 tu_cs_emit_regs(cs,
532 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
533 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
534 tu_cs_emit_regs(cs,
535 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
536 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
537
538 tu_cs_emit_regs(cs,
539 A6XX_VFD_INDEX_OFFSET(),
540 A6XX_VFD_INSTANCE_START_OFFSET());
541 }
542
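/* Upload 8 floats of rectangle coordinates as two vec4 constants (c0/c1)
 * for the VS (or the GS when doing layered clears).
 */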
543 static void
544 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
545 {
546 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
547 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
548 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
549 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
550 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
551 CP_LOAD_STATE6_0_NUM_UNIT(2));
552 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
553 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
554 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
555 }
556
557 static void
558 r3d_coords(struct tu_cs *cs,
559 const VkOffset2D *dst,
560 const VkOffset2D *src,
561 const VkExtent2D *extent)
562 {
563 int32_t src_x1 = src ? src->x : 0;
564 int32_t src_y1 = src ? src->y : 0;
565 r3d_coords_raw(cs, false, (float[]) {
566 dst->x, dst->y,
567 src_x1, src_y1,
568 dst->x + extent->width, dst->y + extent->height,
569 src_x1 + extent->width, src_y1 + extent->height,
570 });
571 }
572
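/* Upload the clear value as one vec4 FS constant; depth/stencil values are
 * expanded to the unorm components expected by the r8g8b8a8 clear of D24S8.
 */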
573 static void
574 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
575 {
576 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
577 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
578 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
579 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
580 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
581 CP_LOAD_STATE6_0_NUM_UNIT(1));
582 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
583 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
584 switch (format) {
585 case VK_FORMAT_X8_D24_UNORM_PACK32:
586 case VK_FORMAT_D24_UNORM_S8_UINT: {
587 /* cleared as r8g8b8a8_unorm using special format */
588 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
589 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
590 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
591 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
592 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
593 } break;
594 case VK_FORMAT_D16_UNORM:
595 case VK_FORMAT_D32_SFLOAT:
596 tu_cs_emit(cs, fui(val->depthStencil.depth));
597 tu_cs_emit(cs, 0);
598 tu_cs_emit(cs, 0);
599 tu_cs_emit(cs, 0);
600 break;
601 case VK_FORMAT_S8_UINT:
602 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
603 tu_cs_emit(cs, 0);
604 tu_cs_emit(cs, 0);
605 tu_cs_emit(cs, 0);
606 break;
607 default:
608 /* as color formats use clear value as-is */
609 assert(!vk_format_is_depth_or_stencil(format));
610 tu_cs_emit_array(cs, val->color.uint32, 4);
611 break;
612 }
613 }
614
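/* Allocate a texture descriptor plus sampler in sub_cs memory, patch the
 * base and UBWC addresses for the selected layer, and bind it as FS
 * texture/sampler 0.
 */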
615 static void
616 r3d_src_common(struct tu_cmd_buffer *cmd,
617 struct tu_cs *cs,
618 const uint32_t *tex_const,
619 uint32_t offset_base,
620 uint32_t offset_ubwc,
621 VkFilter filter)
622 {
623 struct tu_cs_memory texture = { };
624 VkResult result = tu_cs_alloc(&cmd->sub_cs,
625 2, /* allocate space for a sampler too */
626 A6XX_TEX_CONST_DWORDS, &texture);
627 assert(result == VK_SUCCESS);
628
629 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
630
631 /* patch addresses for layer offset */
632 *(uint64_t*) (texture.map + 4) += offset_base;
633 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
634 texture.map[7] = ubwc_addr;
635 texture.map[8] = ubwc_addr >> 32;
636
637 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
638 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
639 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
640 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
641 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
642 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
643 0x60000; /* XXX used by blob, doesn't seem necessary */
644 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
645 0x1 | /* XXX used by blob, doesn't seem necessary */
646 A6XX_TEX_SAMP_1_UNNORM_COORDS |
647 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
648 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
649 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
650
651 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
652 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
653 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
654 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
655 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
656 CP_LOAD_STATE6_0_NUM_UNIT(1));
657 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
658
659 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
660 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
661
662 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
663 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
664 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
665 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
666 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
667 CP_LOAD_STATE6_0_NUM_UNIT(1));
668 tu_cs_emit_qw(cs, texture.iova);
669
670 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
671 tu_cs_emit_qw(cs, texture.iova);
672
673 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
674 }
675
676 static void
677 r3d_src(struct tu_cmd_buffer *cmd,
678 struct tu_cs *cs,
679 const struct tu_image_view *iview,
680 uint32_t layer,
681 VkFilter filter)
682 {
683 r3d_src_common(cmd, cs, iview->descriptor,
684 iview->layer_size * layer,
685 iview->ubwc_layer_size * layer,
686 filter);
687 }
688
689 static void
690 r3d_src_buffer(struct tu_cmd_buffer *cmd,
691 struct tu_cs *cs,
692 VkFormat vk_format,
693 uint64_t va, uint32_t pitch,
694 uint32_t width, uint32_t height)
695 {
696 uint32_t desc[A6XX_TEX_CONST_DWORDS];
697
698 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
699
700 desc[0] =
701 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
702 A6XX_TEX_CONST_0_FMT(format.fmt) |
703 A6XX_TEX_CONST_0_SWAP(format.swap) |
704 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
705       // XXX broadcast X to all channels so the value lands in .w for stencil buffer_to_image
706 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
707 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
708 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
709 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
710 desc[2] =
711 A6XX_TEX_CONST_2_PITCH(pitch) |
712 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
713 desc[3] = 0;
714 desc[4] = va;
715 desc[5] = va >> 32;
716 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
717 desc[i] = 0;
718
719 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
720 }
721
722 static void
723 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
724 {
725 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
726
727 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
728 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
729 tu_cs_image_ref(cs, iview, layer);
730 tu_cs_emit(cs, 0);
731
732 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
733 tu_cs_image_flag_ref(cs, iview, layer);
734
735 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
736 }
737
738 static void
739 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
740 {
741 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
742
743 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
744
745 tu_cs_emit_regs(cs,
746 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
747 A6XX_RB_MRT_PITCH(0, pitch),
748 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
749 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
750 A6XX_RB_MRT_BASE_HI(0, va >> 32),
751 A6XX_RB_MRT_BASE_GMEM(0, 0));
752
753 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
754 }
755
756 static uint8_t
757 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
758 {
759 uint8_t mask = 0xf;
760 assert(aspect_mask);
761 /* note: the only format with partial writing is D24S8,
762 * clear/blit uses the _AS_R8G8B8A8 format to access it
763 */
764 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
765 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
766 mask = 0x7;
767 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
768 mask = 0x8;
769 }
770 return mask;
771 }
772
773 static void
774 r3d_setup(struct tu_cmd_buffer *cmd,
775 struct tu_cs *cs,
776 VkFormat vk_format,
777 VkImageAspectFlags aspect_mask,
778 enum a6xx_rotation rotation,
779 bool clear)
780 {
781 if (!cmd->state.pass) {
782 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
783 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
784 }
785
786 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
787 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
788
789 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
790
791 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
792 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
793 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
794 0xfc000000);
795 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
796
797 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
798 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
799
800 tu_cs_emit_regs(cs,
801 A6XX_RB_FS_OUTPUT_CNTL0(),
802 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
803
804 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
805 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
806 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
807
808 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
809 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
810 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
811 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
812 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
813 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
814 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
815
816 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
817 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
818
819 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
820 .color_format = tu6_base_format(vk_format),
821 .color_sint = vk_format_is_sint(vk_format),
822 .color_uint = vk_format_is_uint(vk_format)));
823
824 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
825 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
826 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
827 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
828 }
829
830 static void
831 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
832 {
833 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
834 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
835 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
836 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
837 tu_cs_emit(cs, 1); /* instance count */
838 tu_cs_emit(cs, 2); /* vertex count */
839 }
840
841 /* blit ops - common interface for 2d/shader paths */
842
843 struct blit_ops {
844 void (*coords)(struct tu_cs *cs,
845 const VkOffset2D *dst,
846 const VkOffset2D *src,
847 const VkExtent2D *extent);
848 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
849 void (*src)(
850 struct tu_cmd_buffer *cmd,
851 struct tu_cs *cs,
852 const struct tu_image_view *iview,
853 uint32_t layer,
854 VkFilter filter);
855 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
856 VkFormat vk_format,
857 uint64_t va, uint32_t pitch,
858 uint32_t width, uint32_t height);
859 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
860 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
861 void (*setup)(struct tu_cmd_buffer *cmd,
862 struct tu_cs *cs,
863 VkFormat vk_format,
864 VkImageAspectFlags aspect_mask,
865 enum a6xx_rotation rotation,
866 bool clear);
867 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
868 };
869
870 static const struct blit_ops r2d_ops = {
871 .coords = r2d_coords,
872 .clear_value = r2d_clear_value,
873 .src = r2d_src,
874 .src_buffer = r2d_src_buffer,
875 .dst = r2d_dst,
876 .dst_buffer = r2d_dst_buffer,
877 .setup = r2d_setup,
878 .run = r2d_run,
879 };
880
881 static const struct blit_ops r3d_ops = {
882 .coords = r3d_coords,
883 .clear_value = r3d_clear_value,
884 .src = r3d_src,
885 .src_buffer = r3d_src_buffer,
886 .dst = r3d_dst,
887 .dst_buffer = r3d_dst_buffer,
888 .setup = r3d_setup,
889 .run = r3d_run,
890 };
891
892 /* passthrough helper: forward 3D offsets/extent to the 2D coords callback (x/y only) */
893 static void
894 coords(const struct blit_ops *ops,
895 struct tu_cs *cs,
896 const VkOffset3D *dst,
897 const VkOffset3D *src,
898 const VkExtent3D *extent)
899 {
900 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
901 }
902
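/* Choose the format used for the actual copy: compressed formats are copied
 * as same-size uint blocks, and planar, D24S8 stencil-to-buffer and E5B9G9R9
 * cases are remapped to formats the blit paths can access directly.
 */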
903 static VkFormat
904 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
905 {
906 if (vk_format_is_compressed(format)) {
907 switch (vk_format_get_blocksize(format)) {
908 case 1: return VK_FORMAT_R8_UINT;
909 case 2: return VK_FORMAT_R16_UINT;
910 case 4: return VK_FORMAT_R32_UINT;
911 case 8: return VK_FORMAT_R32G32_UINT;
912 case 16:return VK_FORMAT_R32G32B32A32_UINT;
913 default:
914 unreachable("unhandled format size");
915 }
916 }
917
918 switch (format) {
919 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
920 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
921 return VK_FORMAT_R8G8_UNORM;
922 /* fallthrough */
923 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
924 return VK_FORMAT_R8_UNORM;
925 case VK_FORMAT_D24_UNORM_S8_UINT:
926 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
927 return VK_FORMAT_R8_UNORM;
928 /* fallthrough */
929 default:
930 return format;
931 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
932 return VK_FORMAT_R32_UINT;
933 }
934 }
935
936 static void
937 tu_image_view_copy_blit(struct tu_image_view *iview,
938 struct tu_image *image,
939 VkFormat format,
940 const VkImageSubresourceLayers *subres,
941 uint32_t layer,
942 bool stencil_read)
943 {
944 VkImageAspectFlags aspect_mask = subres->aspectMask;
945
946 /* always use the AS_R8G8B8A8 format for these */
947 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
948 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
949 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
950 }
951
952 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
953 .image = tu_image_to_handle(image),
954 .viewType = VK_IMAGE_VIEW_TYPE_2D,
955 .format = format,
956 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
957 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
958 .subresourceRange = {
959 .aspectMask = aspect_mask,
960 .baseMipLevel = subres->mipLevel,
961 .levelCount = 1,
962 .baseArrayLayer = subres->baseArrayLayer + layer,
963 .layerCount = 1,
964 },
965 });
966 }
967
968 static void
969 tu_image_view_copy(struct tu_image_view *iview,
970 struct tu_image *image,
971 VkFormat format,
972 const VkImageSubresourceLayers *subres,
973 uint32_t layer,
974 bool stencil_read)
975 {
976 format = copy_format(format, subres->aspectMask, false);
977 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
978 }
979
980 static void
981 tu_image_view_blit(struct tu_image_view *iview,
982 struct tu_image *image,
983 const VkImageSubresourceLayers *subres,
984 uint32_t layer)
985 {
986 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
987 }
988
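/* Blit a single VkImageBlit region: choose the 2D or 3D path, express x/y
 * mirroring through the rotate/flip modes (2D) or raw coordinates (3D), and
 * run one blit per layer.
 */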
989 static void
990 tu6_blit_image(struct tu_cmd_buffer *cmd,
991 struct tu_image *src_image,
992 struct tu_image *dst_image,
993 const VkImageBlit *info,
994 VkFilter filter)
995 {
996 const struct blit_ops *ops = &r2d_ops;
997 struct tu_cs *cs = &cmd->cs;
998 uint32_t layers;
999
1000    /* 2D blit can't mirror from coordinates alone, so use the rotate/flip modes */
1001 static const enum a6xx_rotation rotate[2][2] = {
1002 {ROTATE_0, ROTATE_HFLIP},
1003 {ROTATE_VFLIP, ROTATE_180},
1004 };
1005
1006 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1007 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1008 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1009 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1010 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1011 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1012
1013 if (mirror_z) {
1014 tu_finishme("blit z mirror\n");
1015 return;
1016 }
1017
1018 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1019 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1020 tu_finishme("blit z filter\n");
1021 return;
1022 }
1023
1024 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1025 if (info->dstSubresource.layerCount > 1) {
1026 assert(layers <= 1);
1027 layers = info->dstSubresource.layerCount;
1028 }
1029
1030    /* BC1_RGB_* formats need to have their last component overridden with 1
1031 * when sampling, which is normally handled with the texture descriptor
1032 * swizzle. The 2d path can't handle that, so use the 3d path.
1033 *
1034 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1035 * the 2d path.
1036 */
1037
1038 if (dst_image->samples > 1 ||
1039 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1040 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1041 filter == VK_FILTER_CUBIC_EXT)
1042 ops = &r3d_ops;
1043
1044    /* TODO: the shader path fails some of the blit_image.all_formats.generate_mipmaps.*
1045     * tests; figure out why (it should be possible to pass all tests with only the shader path)
1046 */
1047
1048 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
1049 rotate[mirror_y][mirror_x], false);
1050
1051 if (ops == &r3d_ops) {
1052 r3d_coords_raw(cs, false, (float[]) {
1053 info->dstOffsets[0].x, info->dstOffsets[0].y,
1054 info->srcOffsets[0].x, info->srcOffsets[0].y,
1055 info->dstOffsets[1].x, info->dstOffsets[1].y,
1056 info->srcOffsets[1].x, info->srcOffsets[1].y
1057 });
1058 } else {
1059 tu_cs_emit_regs(cs,
1060 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1061 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1062 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1063 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1064 tu_cs_emit_regs(cs,
1065 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1066 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1067 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1068 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1069 }
1070
1071 struct tu_image_view dst, src;
1072 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1073 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1074
1075 for (uint32_t i = 0; i < layers; i++) {
1076 ops->dst(cs, &dst, i);
1077 ops->src(cmd, cs, &src, i, filter);
1078 ops->run(cmd, cs);
1079 }
1080 }
1081
1082 void
1083 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1084 VkImage srcImage,
1085 VkImageLayout srcImageLayout,
1086 VkImage dstImage,
1087 VkImageLayout dstImageLayout,
1088 uint32_t regionCount,
1089 const VkImageBlit *pRegions,
1090 VkFilter filter)
1091
1092 {
1093 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1094 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1095 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1096
1097 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1098 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1099
1100 for (uint32_t i = 0; i < regionCount; ++i)
1101 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1102 }
1103
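/* Convert texel offsets and sizes to block units for compressed formats. */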
1104 static void
1105 copy_compressed(VkFormat format,
1106 VkOffset3D *offset,
1107 VkExtent3D *extent,
1108 uint32_t *width,
1109 uint32_t *height)
1110 {
1111 if (!vk_format_is_compressed(format))
1112 return;
1113
1114 uint32_t block_width = vk_format_get_blockwidth(format);
1115 uint32_t block_height = vk_format_get_blockheight(format);
1116
1117 offset->x /= block_width;
1118 offset->y /= block_height;
1119
1120 if (extent) {
1121 extent->width = DIV_ROUND_UP(extent->width, block_width);
1122 extent->height = DIV_ROUND_UP(extent->height, block_height);
1123 }
1124 if (width)
1125 *width = DIV_ROUND_UP(*width, block_width);
1126 if (height)
1127 *height = DIV_ROUND_UP(*height, block_height);
1128 }
1129
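/* Copy one VkBufferImageCopy region from a buffer into an image. When the
 * source address or pitch isn't 64-byte aligned, fall back to copying one
 * row at a time from a 64-byte-aligned base.
 */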
1130 static void
1131 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1132 struct tu_buffer *src_buffer,
1133 struct tu_image *dst_image,
1134 const VkBufferImageCopy *info)
1135 {
1136 struct tu_cs *cs = &cmd->cs;
1137 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1138 VkFormat src_format =
1139 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1140 const struct blit_ops *ops = &r2d_ops;
1141
1142 /* special case for buffer to stencil */
1143 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1144 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1145 ops = &r3d_ops;
1146 }
1147
1148 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1149 * which matters for UBWC. buffer_to_image/etc can fail because of this
1150 */
1151
1152 VkOffset3D offset = info->imageOffset;
1153 VkExtent3D extent = info->imageExtent;
1154 uint32_t src_width = info->bufferRowLength ?: extent.width;
1155 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1156
1157 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1158
1159 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1160 uint32_t layer_size = src_height * pitch;
1161
1162 ops->setup(cmd, cs,
1163 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1164 info->imageSubresource.aspectMask, ROTATE_0, false);
1165
1166 struct tu_image_view dst;
1167 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1168
1169 for (uint32_t i = 0; i < layers; i++) {
1170 ops->dst(cs, &dst, i);
1171
1172 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1173 if ((src_va & 63) || (pitch & 63)) {
1174 for (uint32_t y = 0; y < extent.height; y++) {
1175 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1176 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1177 x + extent.width, 1);
1178 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1179 &(VkExtent2D) {extent.width, 1});
1180 ops->run(cmd, cs);
1181 src_va += pitch;
1182 }
1183 } else {
1184 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1185 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1186 ops->run(cmd, cs);
1187 }
1188 }
1189 }
1190
1191 void
1192 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1193 VkBuffer srcBuffer,
1194 VkImage dstImage,
1195 VkImageLayout dstImageLayout,
1196 uint32_t regionCount,
1197 const VkBufferImageCopy *pRegions)
1198 {
1199 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1200 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1201 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1202
1203 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1204 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1205
1206 for (unsigned i = 0; i < regionCount; ++i)
1207 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1208 }
1209
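/* Copy one VkBufferImageCopy region from an image to a buffer, falling back
 * to one row at a time when the destination address or pitch isn't 64-byte
 * aligned.
 */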
1210 static void
1211 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1212 struct tu_image *src_image,
1213 struct tu_buffer *dst_buffer,
1214 const VkBufferImageCopy *info)
1215 {
1216 struct tu_cs *cs = &cmd->cs;
1217 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1218 VkFormat dst_format =
1219 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1220 bool stencil_read = false;
1221
1222 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1223 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1224 stencil_read = true;
1225 }
1226
1227 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1228 VkOffset3D offset = info->imageOffset;
1229 VkExtent3D extent = info->imageExtent;
1230 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1231 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1232
1233 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1234
1235 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1236 uint32_t layer_size = pitch * dst_height;
1237
1238 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1239
1240 struct tu_image_view src;
1241 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1242
1243 for (uint32_t i = 0; i < layers; i++) {
1244 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1245
1246 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1247 if ((dst_va & 63) || (pitch & 63)) {
1248 for (uint32_t y = 0; y < extent.height; y++) {
1249 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1250 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1251 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1252 &(VkExtent2D) {extent.width, 1});
1253 ops->run(cmd, cs);
1254 dst_va += pitch;
1255 }
1256 } else {
1257 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1258 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1259 ops->run(cmd, cs);
1260 }
1261 }
1262 }
1263
1264 void
1265 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1266 VkImage srcImage,
1267 VkImageLayout srcImageLayout,
1268 VkBuffer dstBuffer,
1269 uint32_t regionCount,
1270 const VkBufferImageCopy *pRegions)
1271 {
1272 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1273 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1274 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1275
1276 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1277 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1278
1279 for (unsigned i = 0; i < regionCount; ++i)
1280 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1281 }
1282
1283 /* Tiled formats don't support swapping, which means that we can't support
1284 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1285 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1286 * Currently we fake support for tiled swapped formats and use the unswapped
1287 * format instead, but this means that reinterpreting copies to and from
1288 * swapped formats can't be performed correctly unless we can swizzle the
1289 * components by reinterpreting the other image as the "correct" swapped
1290 * format, i.e. only when the other image is linear.
1291 */
1292
1293 static bool
1294 is_swapped_format(VkFormat format)
1295 {
1296 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1297 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1298 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1299 }
1300
1301 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1302 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1303 * versa). This should mirror the logic in fdl6_layout.
1304 */
1305 static bool
1306 image_is_r8g8(struct tu_image *image)
1307 {
1308 return image->layout[0].cpp == 2 &&
1309 vk_format_get_nr_components(image->vk_format) == 2;
1310 }
1311
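/* Copy one VkImageCopy region. If the two images can't safely be
 * reinterpreted in a common format (swapped tiled formats, R8G8 vs. other
 * cpp=2 layouts, or two UBWC images), go through a linear staging image
 * with a flush/invalidate in between.
 */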
1312 static void
1313 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1314 struct tu_image *src_image,
1315 struct tu_image *dst_image,
1316 const VkImageCopy *info)
1317 {
1318 const struct blit_ops *ops = &r2d_ops;
1319 struct tu_cs *cs = &cmd->cs;
1320
1321 if (dst_image->samples > 1)
1322 ops = &r3d_ops;
1323
1324 VkFormat format = VK_FORMAT_UNDEFINED;
1325 VkOffset3D src_offset = info->srcOffset;
1326 VkOffset3D dst_offset = info->dstOffset;
1327 VkExtent3D extent = info->extent;
1328
1329 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1330 * Images":
1331 *
1332 * When copying between compressed and uncompressed formats the extent
1333 * members represent the texel dimensions of the source image and not
1334 * the destination. When copying from a compressed image to an
1335 * uncompressed image the image texel dimensions written to the
1336 * uncompressed image will be source extent divided by the compressed
1337 * texel block dimensions. When copying from an uncompressed image to a
1338 * compressed image the image texel dimensions written to the compressed
1339 * image will be the source extent multiplied by the compressed texel
1340 * block dimensions.
1341 *
1342 * This means we only have to adjust the extent if the source image is
1343 * compressed.
1344 */
1345 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1346 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1347
1348 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1349 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1350
1351 bool use_staging_blit = false;
1352
1353 if (src_format == dst_format) {
1354 /* Images that share a format can always be copied directly because it's
1355 * the same as a blit.
1356 */
1357 format = src_format;
1358 } else if (!src_image->layout[0].tile_mode) {
1359 /* If an image is linear, we can always safely reinterpret it with the
1360 * other image's format and then do a regular blit.
1361 */
1362 format = dst_format;
1363 } else if (!dst_image->layout[0].tile_mode) {
1364 format = src_format;
1365 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1366 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1367 * due to the different tile layout.
1368 */
1369 use_staging_blit = true;
1370 } else if (is_swapped_format(src_format) ||
1371 is_swapped_format(dst_format)) {
1372 /* If either format has a non-identity swap, then we can't copy
1373 * to/from it.
1374 */
1375 use_staging_blit = true;
1376 } else if (!src_image->layout[0].ubwc) {
1377 format = dst_format;
1378 } else if (!dst_image->layout[0].ubwc) {
1379 format = src_format;
1380 } else {
1381 /* Both formats use UBWC and so neither can be reinterpreted.
1382 * TODO: We could do an in-place decompression of the dst instead.
1383 */
1384 use_staging_blit = true;
1385 }
1386
1387 struct tu_image_view dst, src;
1388
1389 if (use_staging_blit) {
1390 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1391 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1392
1393 struct tu_image staging_image = {
1394 .vk_format = src_format,
1395 .type = src_image->type,
1396 .tiling = VK_IMAGE_TILING_LINEAR,
1397 .extent = extent,
1398 .level_count = 1,
1399 .layer_count = info->srcSubresource.layerCount,
1400 .samples = src_image->samples,
1401 .bo_offset = 0,
1402 };
1403
1404 VkImageSubresourceLayers staging_subresource = {
1405 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1406 .mipLevel = 0,
1407 .baseArrayLayer = 0,
1408 .layerCount = info->srcSubresource.layerCount,
1409 };
1410
1411 VkOffset3D staging_offset = { 0 };
1412
1413 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1414 staging_image.layout[0].ubwc = false;
1415
1416 fdl6_layout(&staging_image.layout[0],
1417 vk_format_to_pipe_format(staging_image.vk_format),
1418 staging_image.samples,
1419 staging_image.extent.width,
1420 staging_image.extent.height,
1421 staging_image.extent.depth,
1422 staging_image.level_count,
1423 staging_image.layer_count,
1424 staging_image.type == VK_IMAGE_TYPE_3D,
1425 NULL);
1426
1427 VkResult result = tu_get_scratch_bo(cmd->device,
1428 staging_image.layout[0].size,
1429 &staging_image.bo);
1430 if (result != VK_SUCCESS) {
1431 cmd->record_result = result;
1432 return;
1433 }
1434
1435 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1436 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1437
1438 struct tu_image_view staging;
1439 tu_image_view_copy(&staging, &staging_image, src_format,
1440 &staging_subresource, 0, false);
1441
1442 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1443 coords(ops, cs, &staging_offset, &src_offset, &extent);
1444
1445 for (uint32_t i = 0; i < info->extent.depth; i++) {
1446 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1447 ops->dst(cs, &staging, i);
1448 ops->run(cmd, cs);
1449 }
1450
1451 /* When executed by the user there has to be a pipeline barrier here,
1452 * but since we're doing it manually we'll have to flush ourselves.
1453 */
1454 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1455 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1456
1457 tu_image_view_copy(&staging, &staging_image, dst_format,
1458 &staging_subresource, 0, false);
1459
1460 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, ROTATE_0, false);
1461 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1462
1463 for (uint32_t i = 0; i < info->extent.depth; i++) {
1464 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1465 ops->dst(cs, &dst, i);
1466 ops->run(cmd, cs);
1467 }
1468 } else {
1469 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1470 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1471
1472 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, ROTATE_0, false);
1473 coords(ops, cs, &dst_offset, &src_offset, &extent);
1474
1475 for (uint32_t i = 0; i < info->extent.depth; i++) {
1476 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1477 ops->dst(cs, &dst, i);
1478 ops->run(cmd, cs);
1479 }
1480 }
1481 }
1482
1483 void
1484 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1485 VkImage srcImage,
1486 VkImageLayout srcImageLayout,
1487 VkImage destImage,
1488 VkImageLayout destImageLayout,
1489 uint32_t regionCount,
1490 const VkImageCopy *pRegions)
1491 {
1492 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1493 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1494 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1495
1496 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1497 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1498
1499 for (uint32_t i = 0; i < regionCount; ++i)
1500 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1501 }
1502
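/* Copy between buffer addresses by treating the data as a one-texel-high
 * image. Each pass starts from a 64-byte-aligned base and copies at most
 * 0x4000 blocks, the maximum width handed to the blitter here.
 */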
1503 static void
1504 copy_buffer(struct tu_cmd_buffer *cmd,
1505 uint64_t dst_va,
1506 uint64_t src_va,
1507 uint64_t size,
1508 uint32_t block_size)
1509 {
1510 const struct blit_ops *ops = &r2d_ops;
1511 struct tu_cs *cs = &cmd->cs;
1512 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1513 uint64_t blocks = size / block_size;
1514
1515 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1516
1517 while (blocks) {
1518 uint32_t src_x = (src_va & 63) / block_size;
1519 uint32_t dst_x = (dst_va & 63) / block_size;
1520 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1521
1522 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1523 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1524 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1525 ops->run(cmd, cs);
1526
1527 src_va += width * block_size;
1528 dst_va += width * block_size;
1529 blocks -= width;
1530 }
1531 }
1532
1533 void
1534 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1535 VkBuffer srcBuffer,
1536 VkBuffer dstBuffer,
1537 uint32_t regionCount,
1538 const VkBufferCopy *pRegions)
1539 {
1540 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1541 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1542 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1543
1544 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1545 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1546
1547 for (unsigned i = 0; i < regionCount; ++i) {
1548 copy_buffer(cmd,
1549 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1550 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1551 pRegions[i].size, 1);
1552 }
1553 }
1554
1555 void
1556 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1557 VkBuffer dstBuffer,
1558 VkDeviceSize dstOffset,
1559 VkDeviceSize dataSize,
1560 const void *pData)
1561 {
1562 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1563 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1564
1565 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1566
1567 struct tu_cs_memory tmp;
1568 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1569 if (result != VK_SUCCESS) {
1570 cmd->record_result = result;
1571 return;
1572 }
1573
1574 memcpy(tmp.map, pData, dataSize);
1575 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1576 }
1577
1578 void
1579 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1580 VkBuffer dstBuffer,
1581 VkDeviceSize dstOffset,
1582 VkDeviceSize fillSize,
1583 uint32_t data)
1584 {
1585 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1586 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1587 const struct blit_ops *ops = &r2d_ops;
1588 struct tu_cs *cs = &cmd->cs;
1589
1590 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1591
1592 if (fillSize == VK_WHOLE_SIZE)
1593 fillSize = buffer->size - dstOffset;
1594
1595 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1596 uint32_t blocks = fillSize / 4;
1597
1598 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true);
1599 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1600
1601 while (blocks) {
1602 uint32_t dst_x = (dst_va & 63) / 4;
1603 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1604
1605 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1606 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1607 ops->run(cmd, cs);
1608
1609 dst_va += width * 4;
1610 blocks -= width;
1611 }
1612 }
1613
1614 void
1615 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1616 VkImage srcImage,
1617 VkImageLayout srcImageLayout,
1618 VkImage dstImage,
1619 VkImageLayout dstImageLayout,
1620 uint32_t regionCount,
1621 const VkImageResolve *pRegions)
1622 {
1623 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1624 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1625 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1626 const struct blit_ops *ops = &r2d_ops;
1627 struct tu_cs *cs = &cmd->cs;
1628
1629 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1630 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1631
1632 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1633
1634 for (uint32_t i = 0; i < regionCount; ++i) {
1635 const VkImageResolve *info = &pRegions[i];
1636 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1637
1638 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1639 /* TODO: aspect masks possible ? */
1640
1641 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1642
1643 struct tu_image_view dst, src;
1644 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1645 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1646
1647 for (uint32_t i = 0; i < layers; i++) {
1648 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1649 ops->dst(cs, &dst, i);
1650 ops->run(cmd, cs);
1651 }
1652 }
1653 }
1654
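/* Resolve src into dst over the given rect, one layer at a time, using the
 * 2D path; both images must share the same format.
 */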
1655 void
1656 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1657 struct tu_cs *cs,
1658 struct tu_image_view *src,
1659 struct tu_image_view *dst,
1660 uint32_t layers,
1661 const VkRect2D *rect)
1662 {
1663 const struct blit_ops *ops = &r2d_ops;
1664
1665 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1666 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1667
1668 assert(src->image->vk_format == dst->image->vk_format);
1669
1670 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1671 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1672
1673 for (uint32_t i = 0; i < layers; i++) {
1674 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1675 ops->dst(cs, dst, i);
1676 ops->run(cmd, cs);
1677 }
1678 }
1679
1680 static void
1681 clear_image(struct tu_cmd_buffer *cmd,
1682 struct tu_image *image,
1683 const VkClearValue *clear_value,
1684 const VkImageSubresourceRange *range)
1685 {
1686 uint32_t level_count = tu_get_levelCount(image, range);
1687 uint32_t layer_count = tu_get_layerCount(image, range);
1688 struct tu_cs *cs = &cmd->cs;
1689 VkFormat format = image->vk_format;
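/* E5B9G9R9 can't be used as the blit destination format, so clear it as
 * raw R32_UINT; the clear value itself is still packed from the original
 * rgb9e5 format below.
 */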
1690 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1691 format = VK_FORMAT_R32_UINT;
1692
1693 if (image->type == VK_IMAGE_TYPE_3D) {
1694 assert(layer_count == 1);
1695 assert(range->baseArrayLayer == 0);
1696 }
1697
1698 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1699
1700 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true);
1701 ops->clear_value(cs, image->vk_format, clear_value);
1702
1703 for (unsigned j = 0; j < level_count; j++) {
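/* For 3D images each mip level has its own (minified) depth, so clear
 * every depth slice of this level as a layer.
 */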
1704 if (image->type == VK_IMAGE_TYPE_3D)
1705 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1706
1707 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1708 u_minify(image->extent.width, range->baseMipLevel + j),
1709 u_minify(image->extent.height, range->baseMipLevel + j)
1710 });
1711
1712 struct tu_image_view dst;
1713 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1714 .aspectMask = range->aspectMask,
1715 .mipLevel = range->baseMipLevel + j,
1716 .baseArrayLayer = range->baseArrayLayer,
1717 .layerCount = 1,
1718 }, 0, false);
1719
1720 for (uint32_t i = 0; i < layer_count; i++) {
1721 ops->dst(cs, &dst, i);
1722 ops->run(cmd, cs);
1723 }
1724 }
1725 }
1726
1727 void
1728 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1729 VkImage image_h,
1730 VkImageLayout imageLayout,
1731 const VkClearColorValue *pColor,
1732 uint32_t rangeCount,
1733 const VkImageSubresourceRange *pRanges)
1734 {
1735 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1736 TU_FROM_HANDLE(tu_image, image, image_h);
1737
1738 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1739
1740 for (unsigned i = 0; i < rangeCount; i++)
1741 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1742 }
1743
1744 void
1745 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1746 VkImage image_h,
1747 VkImageLayout imageLayout,
1748 const VkClearDepthStencilValue *pDepthStencil,
1749 uint32_t rangeCount,
1750 const VkImageSubresourceRange *pRanges)
1751 {
1752 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1753 TU_FROM_HANDLE(tu_image, image, image_h);
1754
1755 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1756
1757 for (unsigned i = 0; i < rangeCount; i++)
1758 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1759 }
1760
1761 static void
1762 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1763 uint32_t attachment_count,
1764 const VkClearAttachment *attachments,
1765 uint32_t rect_count,
1766 const VkClearRect *rects)
1767 {
1768 const struct tu_subpass *subpass = cmd->state.subpass;
1769 /* note: the shader path cannot be used here; there is a special shader
1770 * path in tu_clear_sysmem_attachments()
1771 */
1772 const struct blit_ops *ops = &r2d_ops;
1773 struct tu_cs *cs = &cmd->draw_cs;
1774
1775 for (uint32_t j = 0; j < attachment_count; j++) {
1776 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1777 * Pass Instance" says that:
1778 *
1779 * Unlike other clear commands, vkCmdClearAttachments executes as
1780 * a drawing command, rather than a transfer command, with writes
1781 * performed by it executing in rasterization order. Clears to
1782 * color attachments are executed as color attachment writes, by
1783 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1784 * Clears to depth/stencil attachments are executed as depth
1785 * writes and writes by the
1786 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1787 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1788 *
1789 * However, the 2d path here is executed the same way as a
1790 * transfer command, using the CCU color cache exclusively with
1791 * a special depth-as-color format for depth clears. This means that
1792 * we can't rely on the normal pipeline barrier mechanism here, and
1793 * have to manually flush whenever using a different cache domain
1794 * from what the 3d path would've used. This happens when we clear
1795 * depth/stencil, since normally depth attachments use CCU depth, but
1796 * we clear it using a special depth-as-color format. Since the clear
1797 * potentially uses a different attachment state we also need to
1798 * invalidate color beforehand and flush it afterwards.
1799 */
1800
1801 uint32_t a;
1802 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1803 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1804 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1805 } else {
1806 a = subpass->depth_stencil_attachment.attachment;
1807 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1808 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1809 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1810 }
1811
1812 if (a == VK_ATTACHMENT_UNUSED)
1813 continue;
1814
1815 const struct tu_image_view *iview =
1816 cmd->state.framebuffer->attachments[a].attachment;
1817
1818 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask, ROTATE_0, true);
1819 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1820
1821 /* Wait for the flushes we triggered manually to complete */
1822 tu_cs_emit_wfi(cs);
1823
1824 for (uint32_t i = 0; i < rect_count; i++) {
1825 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1826 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1827 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1828 ops->run(cmd, cs);
1829 }
1830 }
1831
1832 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1833 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1834 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1835 } else {
1836 /* sync color into depth */
1837 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1838 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1839 }
1840 }
1841 }
1842
1843 static void
1844 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1845 uint32_t attachment_count,
1846 const VkClearAttachment *attachments,
1847 uint32_t rect_count,
1848 const VkClearRect *rects)
1849 {
1850 /* the shader path here is special, it avoids changing MRT/etc state */
1851 const struct tu_render_pass *pass = cmd->state.pass;
1852 const struct tu_subpass *subpass = cmd->state.subpass;
1853 const uint32_t mrt_count = subpass->color_count;
1854 struct tu_cs *cs = &cmd->draw_cs;
1855 uint32_t clear_value[MAX_RTS][4];
1856 float z_clear_val = 0.0f;
1857 uint8_t s_clear_val = 0;
1858 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1859 bool z_clear = false;
1860 bool s_clear = false;
1861 bool layered_clear = false;
1862 uint32_t max_samples = 1;
1863
1864 for (uint32_t i = 0; i < attachment_count; i++) {
1865 uint32_t a;
1866 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1867 uint32_t c = attachments[i].colorAttachment;
1868 a = subpass->color_attachments[c].attachment;
1869 if (a == VK_ATTACHMENT_UNUSED)
1870 continue;
1871
1872 clear_rts |= 1 << c;
1873 clear_components |= 0xf << (c * 4);
1874 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1875 } else {
1876 a = subpass->depth_stencil_attachment.attachment;
1877 if (a == VK_ATTACHMENT_UNUSED)
1878 continue;
1879
1880 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1881 z_clear = true;
1882 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1883 }
1884
1885 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1886 s_clear = true;
1887 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1888 }
1889 }
1890
1891 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1892 }
1893
1894 /* Prefer the 2D path for clears. The 2D path can't clear separate
1895 * depth/stencil aspects or MSAA attachments, and it needs a known framebuffer.
1896 */
1897 if (max_samples == 1 && cmd->state.framebuffer) {
1898 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1899 return;
1900 }
1901
1902 /* This clear path behaves like a draw and needs the same flushes as tu_draw */
1903 tu_emit_cache_flush_renderpass(cmd, cs);
1904
1905 /* Disable all draw states so they don't interfere.
1906 * TODO: use and re-use draw states for this path.
1907 * The draw states have to be disabled individually to preserve the
1908 * input attachment states, because a secondary command buffer
1909 * won't be able to restore them.
1910 */
1911 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1912 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1913 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1914 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1915 continue;
1916 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1917 CP_SET_DRAW_STATE__0_DISABLE);
1918 tu_cs_emit_qw(cs, 0);
1919 }
1920 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1921
1922 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1923 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1924 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1925 0xfc000000);
1926 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1927
1928 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1929 for (uint32_t i = 0; i < mrt_count; i++) {
1930 if (clear_rts & (1 << i))
1931 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1932 else
1933 tu_cs_emit(cs, 0);
1934 }
1935
1936 for (uint32_t i = 0; i < rect_count; i++) {
1937 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1938 layered_clear = true;
1939 }
1940
1941 r3d_common(cmd, cs, false, num_rts, layered_clear);
1942
1943 tu_cs_emit_regs(cs,
1944 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1945 tu_cs_emit_regs(cs,
1946 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1947
1948 tu_cs_emit_regs(cs,
1949 A6XX_RB_FS_OUTPUT_CNTL0(),
1950 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1951
1952 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1953 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1954 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1955 for (uint32_t i = 0; i < mrt_count; i++) {
1956 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1957 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1958 }
1959
1960 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1961 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1962 .z_enable = z_clear,
1963 .z_write_enable = z_clear,
1964 .zfunc = FUNC_ALWAYS));
1965 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1966 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1967 .stencil_enable = s_clear,
1968 .func = FUNC_ALWAYS,
1969 .zpass = STENCIL_REPLACE));
1970 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1971 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1972 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1973
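/* Load the packed clear colors as FS constants, one vec4 per cleared MRT,
 * for the clear shader set up by r3d_common() to write to its outputs.
 */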
1974 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1975 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1976 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1977 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1978 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1979 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1980 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1981 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1982 for_each_bit(b, clear_rts)
1983 tu_cs_emit_array(cs, clear_value[b], 4);
1984
1985 for (uint32_t i = 0; i < rect_count; i++) {
1986 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1987 r3d_coords_raw(cs, layered_clear, (float[]) {
1988 rects[i].rect.offset.x, rects[i].rect.offset.y,
1989 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1990 rects[i].rect.offset.x + rects[i].rect.extent.width,
1991 rects[i].rect.offset.y + rects[i].rect.extent.height,
1992 z_clear_val, 1.0f,
1993 });
1994
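/* Layered clears use a one-vertex point-list draw with the geometry
 * shader enabled (set up by r3d_common()); other clears go through the
 * normal r3d_run() draw.
 */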
1995 if (layered_clear) {
1996 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1997 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
1998 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1999 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2000 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2001 tu_cs_emit(cs, 1); /* instance count */
2002 tu_cs_emit(cs, 1); /* vertex count */
2003 } else {
2004 r3d_run(cmd, cs);
2005 }
2006 }
2007 }
2008 }
2009
2010 static void
2011 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2012 {
2013 enum pipe_format pformat = vk_format_to_pipe_format(format);
2014
2015 switch (format) {
2016 case VK_FORMAT_X8_D24_UNORM_PACK32:
2017 case VK_FORMAT_D24_UNORM_S8_UINT:
2018 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2019 val->depthStencil.stencil << 24;
2020 return;
2021 case VK_FORMAT_D16_UNORM:
2022 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2023 return;
2024 case VK_FORMAT_D32_SFLOAT:
2025 clear_value[0] = fui(val->depthStencil.depth);
2026 return;
2027 case VK_FORMAT_S8_UINT:
2028 clear_value[0] = val->depthStencil.stencil;
2029 return;
2030 /* These formats use a different base format when tiled; the same pipe
2031 * format can be used for both because GMEM is always in WZYX order.
2032 */
2033 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2034 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2035 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
2036 default:
2037 break;
2038 }
2039
2040 VkClearColorValue color;
2041
2042 /*
2043 * GMEM is tiled and wants the components in WZYX order; apply the
2044 * swizzle to the color before packing, to counteract the deswizzling
2045 * applied by the packing functions.
2046 */
2047 pipe_swizzle_4f(color.float32, val->color.float32,
2048 util_format_description(pformat)->swizzle);
2049
2050 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2051 }
2052
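/* Clear a single attachment in GMEM using the CP_EVENT_WRITE::BLIT path,
 * with the clear value packed to match the GMEM layout.
 */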
2053 static void
2054 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2055 struct tu_cs *cs,
2056 uint32_t attachment,
2057 VkImageAspectFlags mask,
2058 const VkClearValue *value)
2059 {
2060 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2061
2062
2063 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2064 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2065
2066 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
2067 .clear_mask = aspect_write_mask(vk_format, mask)));
2068
2069 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2070 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2071
2072 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2073 tu_cs_emit(cs, 0);
2074
2075 uint32_t clear_vals[4] = {};
2076 pack_gmem_clear_value(value, vk_format, clear_vals);
2077
2078 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2079 tu_cs_emit_array(cs, clear_vals, 4);
2080
2081 tu6_emit_event_write(cmd, cs, BLIT);
2082 }
2083
2084 static void
2085 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2086 uint32_t attachment_count,
2087 const VkClearAttachment *attachments,
2088 uint32_t rect_count,
2089 const VkClearRect *rects)
2090 {
2091 const struct tu_subpass *subpass = cmd->state.subpass;
2092 struct tu_cs *cs = &cmd->draw_cs;
2093
2094 /* TODO: swap the loops for smaller cmdstream */
2095 for (unsigned i = 0; i < rect_count; i++) {
2096 unsigned x1 = rects[i].rect.offset.x;
2097 unsigned y1 = rects[i].rect.offset.y;
2098 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2099 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2100
2101 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2102 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2103 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2104
2105 for (unsigned j = 0; j < attachment_count; j++) {
2106 uint32_t a;
2107 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2108 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2109 else
2110 a = subpass->depth_stencil_attachment.attachment;
2111
2112 if (a == VK_ATTACHMENT_UNUSED)
2113 continue;
2114
2115 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2116 &attachments[j].clearValue);
2117 }
2118 }
2119 }
2120
2121 void
2122 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2123 uint32_t attachmentCount,
2124 const VkClearAttachment *pAttachments,
2125 uint32_t rectCount,
2126 const VkClearRect *pRects)
2127 {
2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129 struct tu_cs *cs = &cmd->draw_cs;
2130
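/* Whether this render pass ends up using GMEM or sysmem rendering is only
 * decided at submit time, so emit both clear paths under conditional
 * execution.
 */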
2131 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2132 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2133 tu_cond_exec_end(cs);
2134
2135 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2136 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2137 tu_cond_exec_end(cs);
2138 }
2139
2140 void
2141 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2142 struct tu_cs *cs,
2143 uint32_t a,
2144 const VkRenderPassBeginInfo *info)
2145 {
2146 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2147 const struct tu_image_view *iview = fb->attachments[a].attachment;
2148 const struct tu_render_pass_attachment *attachment =
2149 &cmd->state.pass->attachments[a];
2150
2151 if (!attachment->clear_mask)
2152 return;
2153
2154 const struct blit_ops *ops = &r2d_ops;
2155 if (attachment->samples > 1)
2156 ops = &r3d_ops;
2157
2158 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, true);
2159 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2160 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2161
2162 /* Wait for any flushes at the beginning of the renderpass to complete */
2163 tu_cs_emit_wfi(cs);
2164
2165 for (uint32_t i = 0; i < fb->layers; i++) {
2166 ops->dst(cs, iview, i);
2167 ops->run(cmd, cs);
2168 }
2169
2170 /* The spec doesn't explicitly say, but presumably the initial renderpass
2171 * clear is considered part of the renderpass, and therefore barriers
2172 * aren't required inside the subpass/renderpass. Therefore we need to
2173 * flush CCU color into CCU depth here, just like with
2174 * vkCmdClearAttachments(). Note that because this only happens at the
2175 * beginning of a renderpass, and renderpass writes are considered
2176 * "incoherent", we shouldn't have to worry about syncing depth into color
2177 * beforehand as depth should already be flushed.
2178 */
2179 if (vk_format_is_depth_or_stencil(attachment->format)) {
2180 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2181 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2182 } else {
2183 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2184 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2185 }
2186 }
2187
2188 void
2189 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2190 struct tu_cs *cs,
2191 uint32_t a,
2192 const VkRenderPassBeginInfo *info)
2193 {
2194 const struct tu_render_pass_attachment *attachment =
2195 &cmd->state.pass->attachments[a];
2196
2197 if (!attachment->clear_mask)
2198 return;
2199
2200 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2201
2202 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2203 &info->pClearValues[a]);
2204 }
2205
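/* Emit a GMEM load (resolve == false) or GMEM store/resolve
 * (resolve == true) of a single attachment via the CP_EVENT_WRITE::BLIT
 * path.
 */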
2206 static void
2207 tu_emit_blit(struct tu_cmd_buffer *cmd,
2208 struct tu_cs *cs,
2209 const struct tu_image_view *iview,
2210 const struct tu_render_pass_attachment *attachment,
2211 bool resolve)
2212 {
2213 tu_cs_emit_regs(cs,
2214 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2215
2216 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2217 .unk0 = !resolve,
2218 .gmem = !resolve,
2219 /* "integer" bit disables msaa resolve averaging */
2220 .integer = vk_format_is_int(attachment->format)));
2221
2222 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2223 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2224 tu_cs_image_ref_2d(cs, iview, 0, false);
2225
2226 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2227 tu_cs_image_flag_ref(cs, iview, 0);
2228
2229 tu_cs_emit_regs(cs,
2230 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2231
2232 tu6_emit_event_write(cmd, cs, BLIT);
2233 }
2234
2235 static bool
2236 blit_can_resolve(VkFormat format)
2237 {
2238 const struct util_format_description *desc = vk_format_description(format);
2239
2240 /* blit event can only do resolve for simple cases:
2241 * averaging samples as unsigned integers or choosing only one sample
2242 */
2243 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2244 return false;
2245
2246 /* can't do formats with larger channel sizes
2247 * note: this includes all float formats
2248 * note2: single channel integer formats seem OK
2249 */
2250 if (desc->channel[0].size > 10)
2251 return false;
2252
2253 switch (format) {
2254 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2255 * likely related to these formats having different layout from other cpp=2 formats
2256 */
2257 case VK_FORMAT_R8G8_UNORM:
2258 case VK_FORMAT_R8G8_UINT:
2259 case VK_FORMAT_R8G8_SINT:
2260 /* TODO: this one should be able to work? */
2261 case VK_FORMAT_D24_UNORM_S8_UINT:
2262 return false;
2263 default:
2264 break;
2265 }
2266
2267 return true;
2268 }
2269
2270 void
2271 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2272 struct tu_cs *cs,
2273 uint32_t a,
2274 bool force_load)
2275 {
2276 const struct tu_image_view *iview =
2277 cmd->state.framebuffer->attachments[a].attachment;
2278 const struct tu_render_pass_attachment *attachment =
2279 &cmd->state.pass->attachments[a];
2280
2281 if (attachment->load || force_load)
2282 tu_emit_blit(cmd, cs, iview, attachment, false);
2283 }
2284
2285 void
2286 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2287 struct tu_cs *cs,
2288 uint32_t a,
2289 uint32_t gmem_a)
2290 {
2291 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2292 const VkRect2D *render_area = &cmd->state.render_area;
2293 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2294 struct tu_image_view *iview = fb->attachments[a].attachment;
2295 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2296
2297 if (!dst->store)
2298 return;
2299
2300 uint32_t x1 = render_area->offset.x;
2301 uint32_t y1 = render_area->offset.y;
2302 uint32_t x2 = x1 + render_area->extent.width;
2303 uint32_t y2 = y1 + render_area->extent.height;
2304 /* x2/y2 can be unaligned if they equal the size of the image, since the
2305 * store will then just write into padding space. The one exception is
2306 * linear levels, which don't have the required y padding in the layout
2307 * (except for the last level).
2308 */
2309 bool need_y2_align =
2310 y2 != iview->extent.height || iview->need_y2_align;
2311
2312 bool unaligned =
2313 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2314 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2315
2316 /* use fast path when render area is aligned, except for unsupported resolve cases */
2317 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2318 tu_emit_blit(cmd, cs, iview, src, true);
2319 return;
2320 }
2321
2322 if (dst->samples > 1) {
2323 /* The shader (r3d) path would presumably be needed in this case;
2324 * we still need a testcase which fails because of this.
2325 */
2326 tu_finishme("unaligned store of msaa attachment\n");
2327 return;
2328 }
2329
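/* Slow path: source the attachment data directly from GMEM with the 2D
 * blitter, so unaligned render areas and resolves that blit_can_resolve()
 * rejects can still be stored.
 */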
2330 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, true);
2331 r2d_dst(cs, iview, 0);
2332 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2333
2334 tu_cs_emit_regs(cs,
2335 A6XX_SP_PS_2D_SRC_INFO(
2336 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2337 .tile_mode = TILE6_2,
2338 .srgb = vk_format_is_srgb(src->format),
2339 .samples = tu_msaa_samples(src->samples),
2340 .samples_average = !vk_format_is_int(src->format),
2341 .unk20 = 1,
2342 .unk22 = 1),
2343 /* note: src size does not matter when not scaling */
2344 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2345 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2346 A6XX_SP_PS_2D_SRC_HI(),
2347 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2348
2349 /* sync GMEM writes with CACHE. */
2350 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2351
2352 /* Wait for CACHE_INVALIDATE to land */
2353 tu_cs_emit_wfi(cs);
2354
2355 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2356 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2357
2358 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2359 * sysmem, and we generally assume that GMEM renderpasses leave their
2360 * results in sysmem, so we need to flush manually here.
2361 */
2362 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2363 }