diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index 5635375cc23..4082d3e21dd 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -16,178 +16,10 @@
 #include "util/format_srgb.h"
 #include "util/u_half.h"

-/* helper functions previously in tu_formats.c */
-
-static uint32_t
-tu_pack_mask(int bits)
-{
-   assert(bits <= 32);
-   return (1ull << bits) - 1;
-}
-
 static uint32_t
 tu_pack_float32_for_unorm(float val, int bits)
 {
-   const uint32_t max = tu_pack_mask(bits);
-   if (val < 0.0f)
-      return 0;
-   else if (val > 1.0f)
-      return max;
-   else
-      return _mesa_lroundevenf(val * (float) max);
-}
-
-static uint32_t
-tu_pack_float32_for_snorm(float val, int bits)
-{
-   const int32_t max = tu_pack_mask(bits - 1);
-   int32_t tmp;
-   if (val < -1.0f)
-      tmp = -max;
-   else if (val > 1.0f)
-      tmp = max;
-   else
-      tmp = _mesa_lroundevenf(val * (float) max);
-
-   return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_uscaled(float val, int bits)
-{
-   const uint32_t max = tu_pack_mask(bits);
-   if (val < 0.0f)
-      return 0;
-   else if (val > (float) max)
-      return max;
-   else
-      return (uint32_t) val;
-}
-
-static uint32_t
-tu_pack_float32_for_sscaled(float val, int bits)
-{
-   const int32_t max = tu_pack_mask(bits - 1);
-   const int32_t min = -max - 1;
-   int32_t tmp;
-   if (val < (float) min)
-      tmp = min;
-   else if (val > (float) max)
-      tmp = max;
-   else
-      tmp = (int32_t) val;
-
-   return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_uint32_for_uint(uint32_t val, int bits)
-{
-   return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_int32_for_sint(int32_t val, int bits)
-{
-   return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_sfloat(float val, int bits)
-{
-   assert(bits == 16 || bits == 32);
-   return bits == 16 ?
util_float_to_half(val) : fui(val); -} - -union tu_clear_component_value { - float float32; - int32_t int32; - uint32_t uint32; -}; - -static uint32_t -tu_pack_clear_component_value(union tu_clear_component_value val, - const struct util_format_channel_description *ch) -{ - uint32_t packed; - - switch (ch->type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_unorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_uint32_for_uint(val.uint32, ch->size); - else - packed = tu_pack_float32_for_uscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_SIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_snorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_int32_for_sint(val.int32, ch->size); - else - packed = tu_pack_float32_for_sscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_FLOAT: - packed = tu_pack_float32_for_sfloat(val.float32, ch->size); - break; - default: - unreachable("unexpected channel type"); - packed = 0; - break; - } - - assert((packed & tu_pack_mask(ch->size)) == packed); - return packed; -} - -static const struct util_format_channel_description * -tu_get_format_channel_description(const struct util_format_description *desc, - int comp) -{ - switch (desc->swizzle[comp]) { - case PIPE_SWIZZLE_X: - return &desc->channel[0]; - case PIPE_SWIZZLE_Y: - return &desc->channel[1]; - case PIPE_SWIZZLE_Z: - return &desc->channel[2]; - case PIPE_SWIZZLE_W: - return &desc->channel[3]; - default: - return NULL; - } -} - -static union tu_clear_component_value -tu_get_clear_component_value(const VkClearValue *val, int comp, - enum util_format_colorspace colorspace) -{ - assert(comp < 4); - - union tu_clear_component_value tmp; - switch (colorspace) { - case UTIL_FORMAT_COLORSPACE_ZS: - assert(comp < 2); - if (comp == 0) - tmp.float32 = val->depthStencil.depth; - else - tmp.uint32 = val->depthStencil.stencil; - break; - case UTIL_FORMAT_COLORSPACE_SRGB: - if (comp < 3) { - tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]); - break; - } - default: - assert(comp < 4); - tmp.uint32 = val->color.uint32[comp]; - break; - } - - return tmp; + return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1)); } /* r2d_ = BLIT_OP_SCALE operations */ @@ -275,10 +107,10 @@ r2d_coords(struct tu_cs *cs, return; tu_cs_emit_regs(cs, - A6XX_GRAS_2D_SRC_TL_X(.x = src->x), - A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1), - A6XX_GRAS_2D_SRC_TL_Y(.y = src->y), - A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1)); + A6XX_GRAS_2D_SRC_TL_X(src->x), + A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1), + A6XX_GRAS_2D_SRC_TL_Y(src->y), + A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1)); } static void @@ -323,7 +155,7 @@ r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) linear = util_format_linear_to_srgb_float(val->color.float32[i]); if (ch->type == UTIL_FORMAT_TYPE_SIGNED) - clear_value[i] = tu_pack_float32_for_snorm(linear, 8); + clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f); else clear_value[i] = tu_pack_float32_for_unorm(linear, 8); } else if (ifmt == R2D_FLOAT16) { @@ -346,11 +178,14 @@ r2d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter) + VkFilter filter) { + uint32_t src_info = iview->SP_PS_2D_SRC_INFO; + if (filter != 
VK_FILTER_NEAREST) + src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); - tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO | - COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER)); + tu_cs_emit(cs, src_info); tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); tu_cs_image_ref_2d(cs, iview, layer, true); @@ -393,6 +228,17 @@ r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_image_flag_ref(cs, iview, layer); } +static void +r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + assert(iview->image->samples == 1); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS); + tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); + tu_cs_emit(cs, iview->stencil_PITCH); +} + static void r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -405,32 +251,39 @@ r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch .srgb = vk_format_is_srgb(vk_format)), A6XX_RB_2D_DST_LO((uint32_t) va), A6XX_RB_2D_DST_HI(va >> 32), - A6XX_RB_2D_DST_SIZE(.pitch = pitch)); + A6XX_RB_2D_DST_PITCH(pitch)); } static void r2d_setup_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask, + bool ubwc, bool scissor) { enum a6xx_format format = tu6_base_format(vk_format); enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); uint32_t unknown_8c01 = 0; - if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) { - /* preserve depth channels */ - if (mask == 0x8) - unknown_8c01 = 0x00084001; + if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || + vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { + format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + } + + /* note: the only format with partial clearing is D24S8 */ + if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { /* preserve stencil channel */ - if (mask == 0x7) + if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) unknown_8c01 = 0x08000041; + /* preserve depth channels */ + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + unknown_8c01 = 0x00084001; } - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); tu_cs_emit(cs, unknown_8c01); uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL( @@ -452,7 +305,7 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, if (format == FMT6_10_10_10_2_UNORM_DEST) format = FMT6_16_16_16_16_FLOAT; - tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT( + tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( .sint = vk_format_is_sint(vk_format), .uint = vk_format_is_uint(vk_format), .color_format = format, @@ -464,13 +317,21 @@ static void r2d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask) + bool ubwc) { tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false); + r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false); +} + +static void +r2d_teardown(struct tu_cmd_buffer *cmd, + struct tu_cs *cs) +{ + /* nothing to do here */ } static void @@ -482,16 +343,62 @@ r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) /* r3d_ = shader path operations */ +void +tu_init_clear_blit_shaders(struct tu6_global *global) +{ +#define MOV(args...) 
{ .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
+#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
+#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
+
+   static const instr_t vs_code[] = {
+      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
+       * r1.xy = r0.w ? c1.zw : c0.zw
+       * r0.w = 1.0f
+       */
+      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
+           .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
+           .src2 = 3,
+           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
+      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
+           .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
+           .src2 = 3,
+           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
+      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
+      { .cat0 = { .opc = OPC_END } },
+   };
+
+   static const instr_t fs_blit[] = {
+      /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
+       * blit path (it's not clear what allows it to not have it)
+       */
+      CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
+      { .cat0 = { .opc = OPC_END } },
+   };
+
+   memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
+   memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
+
+   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
+      instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
+      for (uint32_t i = 0; i < num_rts; i++) {
+         /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
+         *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
+      }
+      *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
+   }
+}
+
 static void
-r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
-             bool layered_clear)
+r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
+           bool layered_clear)
 {
+   struct ir3_const_state dummy_const_state = {};
    struct ir3_shader dummy_shader = {};
    struct ir3_shader_variant vs = {
       .type = MESA_SHADER_VERTEX,
       .instrlen = 1,
-      .constlen = 2,
+      .constlen = 4,
       .info.max_reg = 1,
       .inputs_count = 1,
       .inputs[0] = {
@@ -509,20 +416,18 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu
          .regid = regid(1, 0),
       },
       .shader = &dummy_shader,
+      .const_state = &dummy_const_state,
    };

    if (layered_clear) {
-      vs = (struct ir3_shader_variant) {
-         .type = MESA_SHADER_VERTEX,
-         .instrlen = 1,
-         .info.max_reg = 0,
-         .shader = &dummy_shader,
-      };
+      vs.outputs[1].slot = VARYING_SLOT_LAYER;
+      vs.outputs[1].regid = regid(1, 1);
+      vs.outputs_count = 2;
    }

    struct ir3_shader_variant fs = {
       .type = MESA_SHADER_FRAGMENT,
       .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
-      .constlen = num_rts,
+      .constlen = align(num_rts, 4),
       .info.max_reg = MAX2(num_rts, 1) - 1,
       .total_in = blit ? 2 : 0,
       .num_samp = blit ? 1 : 0,
@@ -545,139 +450,33 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu
          .cmd = 4,
       },
       .shader = &dummy_shader,
+      .const_state = &dummy_const_state,
    };

-   struct ir3_shader_variant gs_shader = {
-      .type = MESA_SHADER_GEOMETRY,
-      .instrlen = 1,
-      .constlen = 2,
-      .info.max_reg = 1,
-      .inputs_count = 1,
-      .inputs[0] = {
-         .slot = SYSTEM_VALUE_GS_HEADER_IR3,
-         .regid = regid(0, 0),
-         .sysval = true,
-      },
-      .outputs_count = 3,
-      .outputs[0] = {
-         .slot = VARYING_SLOT_POS,
-         .regid = regid(0, 0),
-      },
-      .outputs[1] = {
-         .slot = VARYING_SLOT_LAYER,
-         .regid = regid(1, 1),
-      },
-      .outputs[2] = {
-         .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
-         .regid = regid(1, 0),
-      },
-      .shader = &dummy_shader,
-   }, *gs = layered_clear ?
&gs_shader : NULL; - - -#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } } -#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } } -#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } } - - static const instr_t vs_code[] = { - /* r0.xyz = r0.w ? c1.xyz : c0.xyz - * r1.xy = r0.w ? c1.zw : c0.zw - * r0.w = 1.0f - */ - CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0, - .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1, - .src2 = 3, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}), - CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4, - .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1, - .src2 = 3, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}), - MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ), - { .cat0 = { .opc = OPC_END } }, - }; - - static const instr_t vs_layered[] = { - { .cat0 = { .opc = OPC_CHMASK } }, - { .cat0 = { .opc = OPC_CHSH } }, - }; - - static const instr_t gs_code[16] = { - /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */ - CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16, - .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1), - /* x = (local_id & 1) ? c1.x : c0.x */ - CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1), - /* y = (local_id & 2) ? c1.y : c0.y */ - CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2), - /* pred = (local_id >= 4), used by OPC_KILL */ - CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4), - /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */ - CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0), - - MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */ - MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f), - MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */ - - /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */ - CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0, - .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1, - .src2 = 0, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}), - - CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2), - - { .cat0 = { .opc = OPC_KILL } }, - { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } }, - }; -#define FS_OFFSET (16 * sizeof(instr_t)) -#define GS_OFFSET (32 * sizeof(instr_t)) - - /* shaders */ - struct ts_cs_memory shaders = { }; - VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear, - 16 * sizeof(instr_t), &shaders); - assert(result == VK_SUCCESS); - - if (layered_clear) { - memcpy(shaders.map, vs_layered, sizeof(vs_layered)); - memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code)); - } else { - memcpy(shaders.map, vs_code, sizeof(vs_code)); - } - - instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET); - for (uint32_t i = 0; i < num_rts; i++) { - /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */ - *fs_code++ = (instr_t) { .cat1 = { - .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, - .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 - } }; - } - - /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its - * blit path (its not clear what allows it to not have it) - */ - if (blit) { - *fs_code++ = (instr_t) { .cat2 = { - .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1, - .dst = regid(63, 0), .src1_im = 1 - } }; - } - *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } }; - /* note: assumed <= 16 instructions (MAX_RTS is 8) */ - - tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff)); - - tu6_emit_xs_config(cs, 
MESA_SHADER_VERTEX, &vs, shaders.iova); + tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + .vs_state = true, + .hs_state = true, + .ds_state = true, + .gs_state = true, + .fs_state = true, + .cs_state = true, + .gfx_ibo = true, + .cs_ibo = true, + .gfx_shared_const = true, + .gfx_bindless = 0x1f, + .cs_bindless = 0x1f)); + + tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS])); tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0); tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0); - tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET); - tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET); + tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0); + tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, + global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)])); tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0()); tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0()); - tu6_emit_vpc(cs, &vs, gs, &fs, NULL); + tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false); /* REPL_MODE for varying with RECTLIST (2 vertices only) */ tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0)); @@ -690,26 +489,29 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu .persp_division_disable = 1, .vp_xform_disable = 1, .vp_clip_code_ignore = 1, - .clip_disable = 1), - A6XX_GRAS_UNKNOWN_8001(0)); + .clip_disable = 1)); tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable? tu_cs_emit_regs(cs, - A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0), - A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0), + A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); tu_cs_emit_regs(cs, - A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0), - A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); + A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0), + A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); + + tu_cs_emit_regs(cs, + A6XX_VFD_INDEX_OFFSET(), + A6XX_VFD_INSTANCE_START_OFFSET()); } static void -r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords) +r3d_coords_raw(struct tu_cs *cs, const float *coords) { tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | CP_LOAD_STATE6_0_NUM_UNIT(2)); tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); @@ -724,7 +526,7 @@ r3d_coords(struct tu_cs *cs, { int32_t src_x1 = src ? src->x : 0; int32_t src_y1 = src ? src->y : 0; - r3d_coords_raw(cs, false, (float[]) { + r3d_coords_raw(cs, (float[]) { dst->x, dst->y, src_x1, src_y1, dst->x + extent->width, dst->y + extent->height, @@ -780,9 +582,9 @@ r3d_src_common(struct tu_cmd_buffer *cmd, const uint32_t *tex_const, uint32_t offset_base, uint32_t offset_ubwc, - bool linear_filter) + VkFilter filter) { - struct ts_cs_memory texture = { }; + struct tu_cs_memory texture = { }; VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, /* allocate space for a sampler too */ A6XX_TEX_CONST_DWORDS, &texture); @@ -797,8 +599,8 @@ r3d_src_common(struct tu_cmd_buffer *cmd, texture.map[8] = ubwc_addr >> 32; texture.map[A6XX_TEX_CONST_DWORDS + 0] = - A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? 
A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | - A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | + A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) | + A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) | A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) | A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) | A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) | @@ -840,12 +642,12 @@ r3d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter) + VkFilter filter) { r3d_src_common(cmd, cs, iview->descriptor, iview->layer_size * layer, iview->ubwc_layer_size * layer, - linear_filter); + filter); } static void @@ -870,7 +672,6 @@ r3d_src_buffer(struct tu_cmd_buffer *cmd, A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W); desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); desc[2] = - A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) | A6XX_TEX_CONST_2_PITCH(pitch) | A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); desc[3] = 0; @@ -879,7 +680,7 @@ r3d_src_buffer(struct tu_cmd_buffer *cmd, for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++) desc[i] = 0; - r3d_src_common(cmd, cs, desc, 0, 0, false); + r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); } static void @@ -898,6 +699,19 @@ r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); } +static void +r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */ + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO)); + tu_cs_image_stencil_ref(cs, iview, layer); + tu_cs_emit(cs, 0); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); +} + static void r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -916,23 +730,48 @@ r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); } +static uint8_t +aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask) +{ + uint8_t mask = 0xf; + assert(aspect_mask); + /* note: the only format with partial writing is D24S8, + * clear/blit uses the _AS_R8G8B8A8 format to access it + */ + if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) + mask = 0x7; + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + mask = 0x8; + } + return mask; +} + static void r3d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask) + bool ubwc) { + enum a6xx_format format = tu6_base_format(vk_format); + + if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || + vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { + format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + } + if (!cmd->state.pass) { tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff); + tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff); } tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); - r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0, false); + r3d_common(cmd, cs, !clear, clear ? 
1 : 0, false); tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | @@ -963,13 +802,19 @@ r3d_setup(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, - .color_format = tu6_base_format(vk_format), + .color_format = format, .color_sint = vk_format_is_sint(vk_format), .color_uint = vk_format_is_uint(vk_format))); - tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask)); + tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, + .component_enable = aspect_write_mask(vk_format, aspect_mask))); tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format))); tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format))); + + if (cmd->state.predication_active) { + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); + tu_cs_emit(cs, 0); + } } static void @@ -983,6 +828,15 @@ r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, 2); /* vertex count */ } +static void +r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (cmd->state.predication_active) { + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); + tu_cs_emit(cs, 1); + } +} + /* blit ops - common interface for 2d/shader paths */ struct blit_ops { @@ -996,7 +850,7 @@ struct blit_ops { struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter); + VkFilter filter); void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch, @@ -1006,10 +860,13 @@ struct blit_ops { void (*setup)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask); + bool ubwc); void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + void (*teardown)(struct tu_cmd_buffer *cmd, + struct tu_cs *cs); }; static const struct blit_ops r2d_ops = { @@ -1021,6 +878,7 @@ static const struct blit_ops r2d_ops = { .dst_buffer = r2d_dst_buffer, .setup = r2d_setup, .run = r2d_run, + .teardown = r2d_teardown, }; static const struct blit_ops r3d_ops = { @@ -1032,6 +890,7 @@ static const struct blit_ops r3d_ops = { .dst_buffer = r3d_dst_buffer, .setup = r3d_setup, .run = r3d_run, + .teardown = r3d_teardown, }; /* passthrough set coords from 3D extents */ @@ -1045,13 +904,51 @@ coords(const struct blit_ops *ops, ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent); } +static VkFormat +copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer) +{ + if (vk_format_is_compressed(format)) { + switch (vk_format_get_blocksize(format)) { + case 1: return VK_FORMAT_R8_UINT; + case 2: return VK_FORMAT_R16_UINT; + case 4: return VK_FORMAT_R32_UINT; + case 8: return VK_FORMAT_R32G32_UINT; + case 16:return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("unhandled format size"); + } + } + + switch (format) { + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT) + return VK_FORMAT_R8G8_UNORM; + /* fallthrough */ + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + return VK_FORMAT_R8_UNORM; + case VK_FORMAT_D24_UNORM_S8_UINT: + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer) + return VK_FORMAT_R8_UNORM; + /* fallthrough */ + default: + return format; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + return VK_FORMAT_R32_UINT; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + return VK_FORMAT_S8_UINT; 
+ assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); + return VK_FORMAT_D32_SFLOAT; + } +} + static void -tu_image_view_blit2(struct tu_image_view *iview, - struct tu_image *image, - VkFormat format, - const VkImageSubresourceLayers *subres, - uint32_t layer, - bool stencil_read) +tu_image_view_copy_blit(struct tu_image_view *iview, + struct tu_image *image, + VkFormat format, + const VkImageSubresourceLayers *subres, + uint32_t layer, + bool stencil_read) { VkImageAspectFlags aspect_mask = subres->aspectMask; @@ -1074,7 +971,19 @@ tu_image_view_blit2(struct tu_image_view *iview, .baseArrayLayer = subres->baseArrayLayer + layer, .layerCount = 1, }, - }); + }, false); +} + +static void +tu_image_view_copy(struct tu_image_view *iview, + struct tu_image *image, + VkFormat format, + const VkImageSubresourceLayers *subres, + uint32_t layer, + bool stencil_read) +{ + format = copy_format(format, subres->aspectMask, false); + tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read); } static void @@ -1083,7 +992,7 @@ tu_image_view_blit(struct tu_image_view *iview, const VkImageSubresourceLayers *subres, uint32_t layer) { - tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false); + tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false); } static void @@ -1093,7 +1002,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, const VkImageBlit *info, VkFilter filter) { - const struct blit_ops *ops = &r3d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; uint32_t layers; @@ -1127,15 +1036,6 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, layers = info->dstSubresource.layerCount; } - uint8_t mask = 0xf; - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask); - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) - mask = 0x7; - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) - mask = 0x8; - } - /* BC1_RGB_* formats need to have their last components overriden with 1 * when sampling, which is normally handled with the texture descriptor * swizzle. The 2d path can't handle that, so use the 3d path. 
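
[editor's note: the intended behavior of the copy_format() helper added in the hunk above can be pinned down with a few concrete cases. This is an illustrative sketch, not part of the patch; it assumes copy_format() is visible in the same translation unit and merely restates the switch above.]

/* illustrative only: expected results of the copy_format() switch above,
 * relying on this file's existing includes for assert() and the VK enums
 */
static void
copy_format_examples(void)
{
   /* compressed formats are copied via uint formats of the same block size;
    * BC1 has 8-byte blocks, so it maps to R32G32_UINT
    */
   assert(copy_format(VK_FORMAT_BC1_RGB_UNORM_BLOCK,
                      VK_IMAGE_ASPECT_COLOR_BIT, false) == VK_FORMAT_R32G32_UINT);

   /* the stencil aspect of D24S8 is copied to/from buffers one byte per texel */
   assert(copy_format(VK_FORMAT_D24_UNORM_S8_UINT,
                      VK_IMAGE_ASPECT_STENCIL_BIT, true) == VK_FORMAT_R8_UNORM);

   /* E5B9G9R9 is not renderable, so copies reinterpret it as raw 32-bit uints */
   assert(copy_format(VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
                      VK_IMAGE_ASPECT_COLOR_BIT, false) == VK_FORMAT_R32_UINT);
}
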
@@ -1146,17 +1046,15 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, if (dst_image->samples > 1 || src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK || - src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK) + src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK || + filter == VK_FILTER_CUBIC_EXT) ops = &r3d_ops; - /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests, - * figure out why (should be able to pass all tests with only shader path) - */ - - ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask); + ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask, + rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc); if (ops == &r3d_ops) { - r3d_coords_raw(cs, false, (float[]) { + r3d_coords_raw(cs, (float[]) { info->dstOffsets[0].x, info->dstOffsets[0].y, info->srcOffsets[0].x, info->srcOffsets[0].y, info->dstOffsets[1].x, info->dstOffsets[1].y, @@ -1169,10 +1067,10 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1, .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1)); tu_cs_emit_regs(cs, - A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), - A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), - A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), - A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); + A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), + A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), + A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), + A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); } struct tu_image_view dst, src; @@ -1181,9 +1079,11 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, for (uint32_t i = 0; i < layers; i++) { ops->dst(cs, &dst, i); - ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR); + ops->src(cmd, cs, &src, i, filter); ops->run(cmd, cs); } + + ops->teardown(cmd, cs); } void @@ -1208,21 +1108,6 @@ tu_CmdBlitImage(VkCommandBuffer commandBuffer, tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter); } -static VkFormat -copy_format(VkFormat format) -{ - switch (vk_format_get_blocksizebits(format)) { - case 8: return VK_FORMAT_R8_UINT; - case 16: return VK_FORMAT_R16_UINT; - case 32: return VK_FORMAT_R32_UINT; - case 64: return VK_FORMAT_R32G32_UINT; - case 96: return VK_FORMAT_R32G32B32_UINT; - case 128:return VK_FORMAT_R32G32B32A32_UINT; - default: - unreachable("unhandled format size"); - } -} - static void copy_compressed(VkFormat format, VkOffset3D *offset, @@ -1257,47 +1142,36 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat dst_format = dst_image->vk_format; - VkFormat src_format = dst_image->vk_format; + VkFormat src_format = + copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true); const struct blit_ops *ops = &r2d_ops; - uint8_t mask = 0xf; - - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - switch (info->imageSubresource.aspectMask) { - case VK_IMAGE_ASPECT_STENCIL_BIT: - src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */ - mask = 0x8; - ops = &r3d_ops; - break; - case VK_IMAGE_ASPECT_DEPTH_BIT: - mask = 0x7; - break; - } + /* special case for buffer to 
stencil */ + if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && + info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + ops = &r3d_ops; } + /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format, + * which matters for UBWC. buffer_to_image/etc can fail because of this + */ + VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; uint32_t src_width = info->bufferRowLength ?: extent.width; uint32_t src_height = info->bufferImageHeight ?: extent.height; - if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) { - assert(src_format == dst_format); - copy_compressed(dst_format, &offset, &extent, &src_width, &src_height); - src_format = dst_format = copy_format(dst_format); - } + copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height); uint32_t pitch = src_width * vk_format_get_blocksize(src_format); uint32_t layer_size = src_height * pitch; - /* note: the src_va/pitch alignment of 64 is for 2D engine, - * it is also valid for 1cpp format with shader path (stencil aspect path) - */ - - ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, + copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false), + info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc); struct tu_image_view dst; - tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false); + tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false); for (uint32_t i = 0; i < layers; i++) { ops->dst(cs, &dst, i); @@ -1319,6 +1193,8 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1348,13 +1224,12 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat src_format = src_image->vk_format; - VkFormat dst_format = src_image->vk_format; + VkFormat dst_format = + copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true); bool stencil_read = false; if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { - dst_format = VK_FORMAT_R8_UNORM; stencil_read = true; } @@ -1364,26 +1239,18 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, uint32_t dst_width = info->bufferRowLength ?: extent.width; uint32_t dst_height = info->bufferImageHeight ?: extent.height; - if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) { - assert(src_format == dst_format); - copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height); - src_format = dst_format = copy_format(dst_format); - } + copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height); uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format); uint32_t layer_size = pitch * dst_height; - /* note: the dst_va/pitch alignment of 64 is for 2D engine, - * it is also valid for 1cpp format with shader path (stencil aspect) - */ - - ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); struct tu_image_view src; - tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read); + tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read); for 
(uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i; if ((dst_va & 63) || (pitch & 63)) { @@ -1401,6 +1268,8 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1447,7 +1316,7 @@ is_swapped_format(VkFormat format) static bool image_is_r8g8(struct tu_image *image) { - return image->layout.cpp == 2 && + return image->layout[0].cpp == 2 && vk_format_get_nr_components(image->vk_format) == 2; } @@ -1460,19 +1329,9 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; - uint8_t mask = 0xf; - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) - mask = 0x7; - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) - mask = 0x8; - } - if (dst_image->samples > 1) ops = &r3d_ops; - assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask); - VkFormat format = VK_FORMAT_UNDEFINED; VkOffset3D src_offset = info->srcOffset; VkOffset3D dst_offset = info->dstOffset; @@ -1497,10 +1356,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL); copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL); - VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ? - copy_format(dst_image->vk_format) : dst_image->vk_format; - VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ? - copy_format(src_image->vk_format) : src_image->vk_format; + VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false); + VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false); bool use_staging_blit = false; @@ -1509,12 +1366,12 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, * the same as a blit. */ format = src_format; - } else if (!src_image->layout.tile_mode) { + } else if (!src_image->layout[0].tile_mode) { /* If an image is linear, we can always safely reinterpret it with the * other image's format and then do a regular blit. */ format = dst_format; - } else if (!dst_image->layout.tile_mode) { + } else if (!dst_image->layout[0].tile_mode) { format = src_format; } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) { /* We can't currently copy r8g8 images to/from other cpp=2 images, @@ -1527,9 +1384,9 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, * to/from it. */ use_staging_blit = true; - } else if (!src_image->layout.ubwc) { + } else if (!src_image->layout[0].ubwc) { format = dst_format; - } else if (!dst_image->layout.ubwc) { + } else if (!dst_image->layout[0].ubwc) { format = src_format; } else { /* Both formats use UBWC and so neither can be reinterpreted. 
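
[editor's note: the reinterpretation rules in the hunk above reduce to a short decision ladder. The sketch below is a condensed restatement, not part of the patch; reconcile_copy_format is a hypothetical name, and it mirrors the if/else chain of tu_copy_image_to_image() using the tu_image fields and image_is_r8g8() helper from this file.]

/* hypothetical restatement of the format-reconciliation ladder above:
 * pick one format both images can safely be viewed with, or fall back
 * to a linear staging copy when no such format exists
 */
static VkFormat
reconcile_copy_format(struct tu_image *src, VkFormat src_format,
                      struct tu_image *dst, VkFormat dst_format,
                      bool *use_staging_blit)
{
   *use_staging_blit = false;

   if (src_format == dst_format)
      return src_format;                /* identical formats: plain blit */
   if (!src->layout[0].tile_mode)
      return dst_format;                /* linear src: reinterpret as dst */
   if (!dst->layout[0].tile_mode)
      return src_format;                /* linear dst: reinterpret as src */
   if (image_is_r8g8(src) != image_is_r8g8(dst)) {
      *use_staging_blit = true;         /* r8g8 tiling differs from other cpp=2 */
      return VK_FORMAT_UNDEFINED;
   }
   if (!src->layout[0].ubwc)
      return dst_format;                /* only dst is UBWC: its format wins */
   if (!dst->layout[0].ubwc)
      return src_format;

   *use_staging_blit = true;            /* both UBWC: no safe reinterpretation */
   return VK_FORMAT_UNDEFINED;
}
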
@@ -1541,8 +1398,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, struct tu_image_view dst, src; if (use_staging_blit) { - tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false); - tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false); + tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false); + tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false); struct tu_image staging_image = { .vk_format = src_format, @@ -1564,10 +1421,10 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, VkOffset3D staging_offset = { 0 }; - staging_image.layout.tile_mode = TILE6_LINEAR; - staging_image.layout.ubwc = false; + staging_image.layout[0].tile_mode = TILE6_LINEAR; + staging_image.layout[0].ubwc = false; - fdl6_layout(&staging_image.layout, + fdl6_layout(&staging_image.layout[0], vk_format_to_pipe_format(staging_image.vk_format), staging_image.samples, staging_image.extent.width, @@ -1579,7 +1436,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, NULL); VkResult result = tu_get_scratch_bo(cmd->device, - staging_image.layout.size, + staging_image.layout[0].size, &staging_image.bo); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1590,14 +1447,14 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); struct tu_image_view staging; - tu_image_view_blit2(&staging, &staging_image, src_format, - &staging_subresource, 0, false); + tu_image_view_copy(&staging, &staging_image, src_format, + &staging_subresource, 0, false); - ops->setup(cmd, cs, src_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); coords(ops, cs, &staging_offset, &src_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &staging, i); ops->run(cmd, cs); } @@ -1608,30 +1465,34 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - tu_image_view_blit2(&staging, &staging_image, dst_format, - &staging_subresource, 0, false); + tu_image_view_copy(&staging, &staging_image, dst_format, + &staging_subresource, 0, false); - ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, + ROTATE_0, false, dst_image->layout[0].ubwc); coords(ops, cs, &dst_offset, &staging_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &staging, i, false); + ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, i); ops->run(cmd, cs); } } else { - tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false); - tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false); + tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false); + tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false); - ops->setup(cmd, cs, format, ROTATE_0, false, mask); + ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, + ROTATE_0, false, dst_image->layout[0].ubwc); coords(ops, cs, &dst_offset, &src_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, 
i); ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1666,7 +1527,7 @@ copy_buffer(struct tu_cmd_buffer *cmd, VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM; uint64_t blocks = size / block_size; - ops->setup(cmd, cs, format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); while (blocks) { uint32_t src_x = (src_va & 63) / block_size; @@ -1682,6 +1543,8 @@ copy_buffer(struct tu_cmd_buffer *cmd, dst_va += width * block_size; blocks -= width; } + + ops->teardown(cmd, cs); } void @@ -1718,7 +1581,7 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); - struct ts_cs_memory tmp; + struct tu_cs_memory tmp; VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1749,7 +1612,7 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset; uint32_t blocks = fillSize / 4; - ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf); + ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false); ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); while (blocks) { @@ -1763,6 +1626,8 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, dst_va += width * 4; blocks -= width; } + + ops->teardown(cmd, cs); } void @@ -1783,7 +1648,8 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, + ROTATE_0, false, dst_image->layout[0].ubwc); for (uint32_t i = 0; i < regionCount; ++i) { const VkImageResolve *info = &pRegions[i]; @@ -1799,11 +1665,13 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z); for (uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, i); ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1821,47 +1689,45 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd, assert(src->image->vk_format == dst->image->vk_format); - ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, + ROTATE_0, false, dst->ubwc_enabled); ops->coords(cs, &rect->offset, &rect->offset, &rect->extent); for (uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, src, i, false); + ops->src(cmd, cs, src, i, VK_FILTER_NEAREST); ops->dst(cs, dst, i); ops->run(cmd, cs); } + + ops->teardown(cmd, cs); } static void clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, const VkClearValue *clear_value, - const VkImageSubresourceRange *range) + const VkImageSubresourceRange *range, + VkImageAspectFlags aspect_mask) { uint32_t level_count = tu_get_levelCount(image, range); uint32_t layer_count = tu_get_layerCount(image, range); struct tu_cs *cs = &cmd->cs; VkFormat format = image->vk_format; - if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - format = VK_FORMAT_R32_UINT; + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + format = copy_format(format, aspect_mask, false); if (image->type == VK_IMAGE_TYPE_3D) { 
assert(layer_count == 1); assert(range->baseArrayLayer == 0); } - uint8_t mask = 0xf; - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - mask = 0; - if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) - mask |= 0x7; - if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) - mask |= 0x8; - } - const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops; - ops->setup(cmd, cs, format, ROTATE_0, true, mask); - ops->clear_value(cs, image->vk_format, clear_value); + ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc); + if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value); + else + ops->clear_value(cs, format, clear_value); for (unsigned j = 0; j < level_count; j++) { if (image->type == VK_IMAGE_TYPE_3D) @@ -1873,8 +1739,8 @@ clear_image(struct tu_cmd_buffer *cmd, }); struct tu_image_view dst; - tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) { - .aspectMask = range->aspectMask, + tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) { + .aspectMask = aspect_mask, .mipLevel = range->baseMipLevel + j, .baseArrayLayer = range->baseArrayLayer, .layerCount = 1, @@ -1885,6 +1751,8 @@ clear_image(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1901,7 +1769,7 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i); + clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); } void @@ -1917,97 +1785,18 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); - for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i); -} - -static void -tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t rect_count, - const VkClearRect *rects) -{ - const struct tu_subpass *subpass = cmd->state.subpass; - /* note: cannot use shader path here.. there is a special shader path - * in tu_clear_sysmem_attachments() - */ - const struct blit_ops *ops = &r2d_ops; - struct tu_cs *cs = &cmd->draw_cs; - - for (uint32_t j = 0; j < attachment_count; j++) { - /* The vulkan spec, section 17.2 "Clearing Images Inside a Render - * Pass Instance" says that: - * - * Unlike other clear commands, vkCmdClearAttachments executes as - * a drawing command, rather than a transfer command, with writes - * performed by it executing in rasterization order. Clears to - * color attachments are executed as color attachment writes, by - * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage. - * Clears to depth/stencil attachments are executed as depth - * writes and writes by the - * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and - * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages. - * - * However, the 2d path here is executed the same way as a - * transfer command, using the CCU color cache exclusively with - * a special depth-as-color format for depth clears. This means that - * we can't rely on the normal pipeline barrier mechanism here, and - * have to manually flush whenever using a different cache domain - * from what the 3d path would've used. 
This happens when we clear - * depth/stencil, since normally depth attachments use CCU depth, but - * we clear it using a special depth-as-color format. Since the clear - * potentially uses a different attachment state we also need to - * invalidate color beforehand and flush it afterwards. - */ + for (unsigned i = 0; i < rangeCount; i++) { + const VkImageSubresourceRange *range = &pRanges[i]; - uint32_t a; - if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - a = subpass->color_attachments[attachments[j].colorAttachment].attachment; - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - } else { - a = subpass->depth_stencil_attachment.attachment; - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - } - - if (a == VK_ATTACHMENT_UNUSED) - continue; - - uint8_t mask = 0xf; - if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) - mask &= ~0x7; - if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)) - mask &= ~0x8; - } - - const struct tu_image_view *iview = - cmd->state.framebuffer->attachments[a].attachment; - - ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask); - ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue); - - /* Wait for the flushes we triggered manually to complete */ - tu_cs_emit_wfi(cs); - - for (uint32_t i = 0; i < rect_count; i++) { - ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent); - for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) { - ops->dst(cs, iview, rects[i].baseArrayLayer + layer); - ops->run(cmd, cs); - } - } + if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + /* can't clear both depth and stencil at once, split up the aspect mask */ + uint32_t b; + for_each_bit(b, range->aspectMask) + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); + continue; + } - if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - } else { - /* sync color into depth */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - } + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); } } @@ -2062,16 +1851,22 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, max_samples = MAX2(max_samples, pass->attachments[a].samples); } - /* prefer to use 2D path for clears - * 2D can't clear separate depth/stencil and msaa, needs known framebuffer + /* disable all draw states so they don't interfere + * TODO: use and re-use draw states + * we have to disable draw states individually to preserve + * input attachment states, because a secondary command buffer + * won't be able to restore them */ - if (max_samples == 1 && cmd->state.framebuffer) { - tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects); - return; + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2)); + for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) { + if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM || + i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM) + continue; + tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) | + CP_SET_DRAW_STATE__0_DISABLE); + tu_cs_emit_qw(cs, 0); } - - /* This clear path behaves like a draw, needs the same flush as 
tu_draw */ - tu_emit_cache_flush_renderpass(cmd, cs); + cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE; tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | @@ -2092,7 +1887,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, layered_clear = true; } - r3d_pipeline(cmd, cs, false, num_rts, layered_clear); + r3d_common(cmd, cs, false, num_rts, layered_clear); tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components)); @@ -2138,118 +1933,82 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, for (uint32_t i = 0; i < rect_count; i++) { for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) { - r3d_coords_raw(cs, layered_clear, (float[]) { + r3d_coords_raw(cs, (float[]) { rects[i].rect.offset.x, rects[i].rect.offset.y, z_clear_val, uif(rects[i].baseArrayLayer + layer), rects[i].rect.offset.x + rects[i].rect.extent.width, rects[i].rect.offset.y + rects[i].rect.extent.height, z_clear_val, 1.0f, }); - - if (layered_clear) { - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); - tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) | - CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | - CP_DRAW_INDX_OFFSET_0_GS_ENABLE); - tu_cs_emit(cs, 1); /* instance count */ - tu_cs_emit(cs, 1); /* vertex count */ - } else { - r3d_run(cmd, cs); - } + r3d_run(cmd, cs); } } - - cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE | - TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | - TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | - TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | - TU_CMD_DIRTY_DYNAMIC_VIEWPORT | - TU_CMD_DIRTY_DYNAMIC_SCISSOR; } -/** - * Pack a VkClearValue into a 128-bit buffer. format is respected except - * for the component order. The components are always packed in WZYX order, - * because gmem is tiled and tiled formats always have WZYX swap - */ static void -pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) +pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4]) { - const struct util_format_description *desc = vk_format_description(format); + enum pipe_format pformat = vk_format_to_pipe_format(format); switch (format) { - case VK_FORMAT_B10G11R11_UFLOAT_PACK32: - buf[0] = float3_to_r11g11b10f(val->color.float32); + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) | + val->depthStencil.stencil << 24; return; - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: - buf[0] = float3_to_rgb9e5(val->color.float32); + case VK_FORMAT_D16_UNORM: + clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16); + return; + case VK_FORMAT_D32_SFLOAT: + clear_value[0] = fui(val->depthStencil.depth); return; + case VK_FORMAT_S8_UINT: + clear_value[0] = val->depthStencil.stencil; + return; + /* these formats use a different base format when tiled + * the same format can be used for both because GMEM is always in WZYX order + */ + case VK_FORMAT_R5G5B5A1_UNORM_PACK16: + case VK_FORMAT_B5G5R5A1_UNORM_PACK16: + pformat = PIPE_FORMAT_B5G5R5A1_UNORM; default: break; } - assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); - - /* S8_UINT is special and has no depth */ - const int max_components = - format == VK_FORMAT_S8_UINT ? 
2 : desc->nr_channels; - - int buf_offset = 0; - int bit_shift = 0; - for (int comp = 0; comp < max_components; comp++) { - const struct util_format_channel_description *ch = - tu_get_format_channel_description(desc, comp); - if (!ch) { - assert((format == VK_FORMAT_S8_UINT && comp == 0) || - (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1)); - continue; - } - - union tu_clear_component_value v = tu_get_clear_component_value( - val, comp, desc->colorspace); - - /* move to the next uint32_t when there is not enough space */ - assert(ch->size <= 32); - if (bit_shift + ch->size > 32) { - buf_offset++; - bit_shift = 0; - } + VkClearColorValue color; - if (bit_shift == 0) - buf[buf_offset] = 0; + /** + * GMEM is tiled and wants the components in WZYX order, + * apply swizzle to the color before packing, to counteract + * deswizzling applied by packing functions + */ + pipe_swizzle_4f(color.float32, val->color.float32, + util_format_description(pformat)->swizzle); - buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; - bit_shift += ch->size; - } + util_format_pack_rgba(pformat, clear_value, color.uint32, 1); } static void -tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - uint8_t component_mask, - const VkClearValue *value) +clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat format, + uint8_t clear_mask, + uint32_t gmem_offset, + const VkClearValue *value) { - VkFormat vk_format = cmd->state.pass->attachments[attachment].format; - /* note: component_mask is 0x7 for depth and 0x8 for stencil - * because D24S8 is cleared with AS_R8G8B8A8 format - */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format))); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format))); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask)); + tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset); + tu_cs_emit(cs, gmem_offset); tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); tu_cs_emit(cs, 0); uint32_t clear_vals[4] = {}; - pack_gmem_clear_value(value, vk_format, clear_vals); + pack_gmem_clear_value(value, format, clear_vals); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); tu_cs_emit_array(cs, clear_vals, 4); @@ -2257,6 +2016,27 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, BLIT); } +static void +tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t attachment, + VkImageAspectFlags mask, + const VkClearValue *value) +{ + const struct tu_render_pass_attachment *att = + &cmd->state.pass->attachments[attachment]; + + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value); + if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); + return; + } + + clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value); +} + static void tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, @@ -2288,15 +2068,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer 
 static void
 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                           uint32_t attachment_count,
@@ -2288,15 +2068,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
       if (a == VK_ATTACHMENT_UNUSED)
          continue;
 
-      unsigned clear_mask = 0xf;
-      if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
-         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
-            clear_mask &= ~0x7;
-         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
-            clear_mask &= ~0x8;
-      }
-
-      tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
+      tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                     &attachments[j].clearValue);
    }
 }
@@ -2312,6 +2084,27 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
+   /* The sysmem path behaves like a draw. Note that we don't have a way of
+    * using different flushes for sysmem/gmem, so this needs to be outside
+    * of the cond_exec.
+    */
+   tu_emit_cache_flush_renderpass(cmd, cs);
+
+   /* vkCmdClearAttachments is supposed to respect the predicate if active.
+    * The easiest way to do this is to always use the 3d path, which always
+    * works even with GMEM because it's just a simple draw using the existing
+    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
+    * skipped in the binning pass, since otherwise they produce binning data
+    * which isn't consumed and leads to the wrong binning data being read, so
+    * condition on GMEM | SYSMEM.
+    */
+   if (cmd->state.predication_active) {
+      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
+                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
+      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
+      tu_cond_exec_end(cs);
+      return;
+   }
+
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
    tu_cond_exec_end(cs);
@@ -2321,42 +2114,67 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
    tu_cond_exec_end(cs);
 }
 
+static void
+clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
+                        struct tu_cs *cs,
+                        VkFormat format,
+                        VkImageAspectFlags clear_mask,
+                        const VkRenderPassBeginInfo *info,
+                        uint32_t a,
+                        bool separate_stencil)
+{
+   const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_image_view *iview = fb->attachments[a].attachment;
+   const struct blit_ops *ops = &r2d_ops;
+   if (cmd->state.pass->attachments[a].samples > 1)
+      ops = &r3d_ops;
+
+   ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
+   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
+   ops->clear_value(cs, format, &info->pClearValues[a]);
+
+   for (uint32_t i = 0; i < fb->layers; i++) {
+      if (separate_stencil) {
+         if (ops == &r3d_ops)
+            r3d_dst_stencil(cs, iview, i);
+         else
+            r2d_dst_stencil(cs, iview, i);
+      } else {
+         ops->dst(cs, iview, i);
+      }
+      ops->run(cmd, cs);
+   }
+
+   ops->teardown(cmd, cs);
+}
+
 void
 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                            struct tu_cs *cs,
                            uint32_t a,
                            const VkRenderPassBeginInfo *info)
 {
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_image_view *iview = fb->attachments[a].attachment;
    const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
-   uint8_t mask = 0;
-
-   if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
-      mask = 0xf;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
-      mask |= 0x7;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
-      mask |= 0x8;
 
-   if (!mask)
+   if (!attachment->clear_mask)
       return;
 
-   const struct blit_ops *ops = &r2d_ops;
-   if (attachment->samples > 1)
-      ops = &r3d_ops;
-
-   ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
-   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
-   ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
-
    /* Wait for any flushes at the beginning of the renderpass to complete */
    tu_cs_emit_wfi(cs);
 
-   for (uint32_t i = 0; i < fb->layers; i++) {
-      ops->dst(cs, iview, i);
-      ops->run(cmd, cs);
+   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
+                                 info, a, false);
+      }
+      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
+                                 info, a, true);
+      }
+   } else {
+      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
+                              info, a, false);
    }
 
    /* The spec doesn't explicitly say, but presumably the initial renderpass
@@ -2385,21 +2203,13 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
 {
    const struct tu_render_pass_attachment *attachment =
       &cmd->state.pass->attachments[a];
-   unsigned clear_mask = 0;
-
-   if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
-      clear_mask = 0xf;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
-      clear_mask |= 0x7;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
-      clear_mask |= 0x8;
 
-   if (!clear_mask)
+   if (!attachment->clear_mask)
       return;
 
    tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
 
-   tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
+   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
                                  &info->pClearValues[a]);
 }
 
@@ -2408,7 +2218,8 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
-             bool resolve)
+             bool resolve,
+             bool separate_stencil)
 {
    tu_cs_emit_regs(cs,
                    A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
@@ -2420,14 +2231,23 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
                       .integer = vk_format_is_int(attachment->format)));
 
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
-   tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
-   tu_cs_image_ref_2d(cs, iview, 0, false);
+   if (separate_stencil) {
+      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
+      tu_cs_emit_qw(cs, iview->stencil_base_addr);
+      tu_cs_emit(cs, iview->stencil_PITCH);
 
-   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
-   tu_cs_image_flag_ref(cs, iview, 0);
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
+   } else {
+      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
+      tu_cs_image_ref_2d(cs, iview, 0, false);
 
-   tu_cs_emit_regs(cs,
-                   A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
+      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
+      tu_cs_image_flag_ref(cs, iview, 0);
+
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
+   }
 
    tu6_emit_event_write(cmd, cs, BLIT);
 }
@@ -2479,7 +2299,58 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
       &cmd->state.pass->attachments[a];
 
    if (attachment->load || force_load)
-      tu_emit_blit(cmd, cs, iview, attachment, false);
+      tu_emit_blit(cmd, cs, iview, attachment, false, false);
+
+   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
+      tu_emit_blit(cmd, cs, iview, attachment, false, true);
+}
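clear_sysmem_attachment() above dispatches through a small ops table, taking the 3D (draw-based) path for multisample attachments, presumably because the 2D blitter cannot write per-sample destinations. The same dispatch pattern in miniature; the types, names, and stated reason here are illustrative, not lifted from this file:

   /* minimal vtable-dispatch sketch mirroring the blit_ops pattern */
   struct ops_sketch { void (*run)(void); };
   static void run_2d(void) { /* would emit BLIT_OP_SCALE packets */ }
   static void run_3d(void) { /* would emit a draw-based clear */ }
   static const struct ops_sketch r2d_sketch = { run_2d };
   static const struct ops_sketch r3d_sketch = { run_3d };

   static const struct ops_sketch *
   pick_ops(uint32_t samples)
   {
      return samples > 1 ? &r3d_sketch : &r2d_sketch;
   }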
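store_cp_blit() below reads GMEM as a linear surface whose pitch is the tile width times bytes per pixel. Its stencil call site at the end of this diff passes src->samples as the cpp, which is consistent only if an attachment's cpp already folds in the sample count, leaving S8 at one byte per sample; that reading is an assumption. A worked example under it, with illustrative numbers:

   #include <stdint.h>
   #include <assert.h>

   int main(void)
   {
      /* hypothetical D32_SFLOAT_S8_UINT attachment, 4x MSAA, 256-wide tile */
      uint32_t tile_w = 256, samples = 4;
      uint32_t depth_cpp = 4 * samples;     /* D32: 4 bytes per sample */
      uint32_t stencil_cpp = 1 * samples;   /* S8: 1 byte per sample */
      assert(tile_w * depth_cpp == 16384);  /* depth-plane GMEM pitch */
      assert(tile_w * stencil_cpp == 1024); /* stencil-plane GMEM pitch */
      return 0;
   }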
+
+static void
+store_cp_blit(struct tu_cmd_buffer *cmd,
+              struct tu_cs *cs,
+              struct tu_image_view *iview,
+              uint32_t samples,
+              bool separate_stencil,
+              VkFormat format,
+              uint32_t gmem_offset,
+              uint32_t cpp)
+{
+   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
+                    iview->ubwc_enabled, true);
+   if (separate_stencil)
+      r2d_dst_stencil(cs, iview, 0);
+   else
+      r2d_dst(cs, iview, 0);
+
+   tu_cs_emit_regs(cs,
+                   A6XX_SP_PS_2D_SRC_INFO(
+                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
+                      .tile_mode = TILE6_2,
+                      .srgb = vk_format_is_srgb(format),
+                      .samples = tu_msaa_samples(samples),
+                      .samples_average = !vk_format_is_int(format),
+                      .unk20 = 1,
+                      .unk22 = 1),
+                   /* note: src size does not matter when not scaling */
+                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
+                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
+                   A6XX_SP_PS_2D_SRC_HI(),
+                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
+
+   /* sync GMEM writes with CACHE. */
+   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
+
+   /* Wait for CACHE_INVALIDATE to land */
+   tu_cs_emit_wfi(cs);
+
+   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
+   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
+
+   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
+    * sysmem, and we generally assume that GMEM renderpasses leave their
+    * results in sysmem, so we need to flush manually here.
+    */
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+}
 
 void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
@@ -2488,13 +2359,12 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         uint32_t a,
                         uint32_t gmem_a)
 {
-   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
-   const VkRect2D *render_area = &tiling->render_area;
+   const VkRect2D *render_area = &cmd->state.render_area;
    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
    struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
 
-   if (!dst->store)
+   if (!dst->store && !dst->store_stencil)
      return;
 
    uint32_t x1 = render_area->offset.x;
@@ -2515,7 +2385,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
 
    /* use fast path when render area is aligned, except for unsupported resolve cases */
    if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
-      tu_emit_blit(cmd, cs, iview, src, true);
+      if (dst->store)
+         tu_emit_blit(cmd, cs, iview, src, true, false);
+      if (dst->store_stencil)
+         tu_emit_blit(cmd, cs, iview, src, true, true);
       return;
    }
 
@@ -2527,37 +2400,18 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       return;
   }
 
-   r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
-   r2d_dst(cs, iview, 0);
    r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
 
-   tu_cs_emit_regs(cs,
-                   A6XX_SP_PS_2D_SRC_INFO(
-                      .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
-                      .tile_mode = TILE6_2,
-                      .srgb = vk_format_is_srgb(src->format),
-                      .samples = tu_msaa_samples(src->samples),
-                      .samples_average = !vk_format_is_int(src->format),
-                      .unk20 = 1,
-                      .unk22 = 1),
-                   /* note: src size does not matter when not scaling */
-                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
-                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
-                   A6XX_SP_PS_2D_SRC_HI(),
-                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
-
-   /* sync GMEM writes with CACHE. */
-   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
-
-   /* Wait for CACHE_INVALIDATE to land */
-   tu_cs_emit_wfi(cs);
-
-   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
-   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
+   VkFormat format = src->format;
+   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+      format = VK_FORMAT_D32_SFLOAT;
 
-   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
-    * sysmem, and we generally assume that GMEM renderpasses leave their
-    * results in sysmem, so we need to flush manually here.
-    */
-   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+   if (dst->store) {
+      store_cp_blit(cmd, cs, iview, src->samples, false, format,
+                    src->gmem_offset, src->cpp);
+   }
+   if (dst->store_stencil) {
+      store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
+                    src->gmem_offset_stencil, src->samples);
+   }
 }
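The fast path above is gated on an unaligned flag computed from the render area; the computation sits outside these hunks. The idea is that the event-blit resolve operates on whole aligned blocks, so an area that is not aligned to the resolve granularity must fall back to the CP_BLIT path. A minimal sketch of such a gate, with the granularity as an assumed parameter and without the real code's special-casing of edges that touch the image size:

   #include <stdbool.h>
   #include <stdint.h>

   struct rect { uint32_t x, y, w, h; };

   /* illustrative only; the real check in this file is more permissive */
   static bool
   render_area_aligned(const struct rect *ra, uint32_t align_w, uint32_t align_h)
   {
      return ra->x % align_w == 0 && ra->y % align_h == 0 &&
             (ra->x + ra->w) % align_w == 0 &&
             (ra->y + ra->h) % align_h == 0;
   }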