diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index 5635375cc23..4082d3e21dd 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -16,178 +16,10 @@
 #include "util/format_srgb.h"
 #include "util/u_half.h"

-/* helper functions previously in tu_formats.c */
-
-static uint32_t
-tu_pack_mask(int bits)
-{
-   assert(bits <= 32);
-   return (1ull << bits) - 1;
-}
-
 static uint32_t
 tu_pack_float32_for_unorm(float val, int bits)
 {
-   const uint32_t max = tu_pack_mask(bits);
-   if (val < 0.0f)
-      return 0;
-   else if (val > 1.0f)
-      return max;
-   else
-      return _mesa_lroundevenf(val * (float) max);
-}
-
-static uint32_t
-tu_pack_float32_for_snorm(float val, int bits)
-{
-   const int32_t max = tu_pack_mask(bits - 1);
-   int32_t tmp;
-   if (val < -1.0f)
-      tmp = -max;
-   else if (val > 1.0f)
-      tmp = max;
-   else
-      tmp = _mesa_lroundevenf(val * (float) max);
-
-   return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_uscaled(float val, int bits)
-{
-   const uint32_t max = tu_pack_mask(bits);
-   if (val < 0.0f)
-      return 0;
-   else if (val > (float) max)
-      return max;
-   else
-      return (uint32_t) val;
-}
-
-static uint32_t
-tu_pack_float32_for_sscaled(float val, int bits)
-{
-   const int32_t max = tu_pack_mask(bits - 1);
-   const int32_t min = -max - 1;
-   int32_t tmp;
-   if (val < (float) min)
-      tmp = min;
-   else if (val > (float) max)
-      tmp = max;
-   else
-      tmp = (int32_t) val;
-
-   return tmp & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_uint32_for_uint(uint32_t val, int bits)
-{
-   return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_int32_for_sint(int32_t val, int bits)
-{
-   return val & tu_pack_mask(bits);
-}
-
-static uint32_t
-tu_pack_float32_for_sfloat(float val, int bits)
-{
-   assert(bits == 16 || bits == 32);
-   return bits == 16 ?
util_float_to_half(val) : fui(val); -} - -union tu_clear_component_value { - float float32; - int32_t int32; - uint32_t uint32; -}; - -static uint32_t -tu_pack_clear_component_value(union tu_clear_component_value val, - const struct util_format_channel_description *ch) -{ - uint32_t packed; - - switch (ch->type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_unorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_uint32_for_uint(val.uint32, ch->size); - else - packed = tu_pack_float32_for_uscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_SIGNED: - /* normalized, scaled, or pure integer */ - if (ch->normalized) - packed = tu_pack_float32_for_snorm(val.float32, ch->size); - else if (ch->pure_integer) - packed = tu_pack_int32_for_sint(val.int32, ch->size); - else - packed = tu_pack_float32_for_sscaled(val.float32, ch->size); - break; - case UTIL_FORMAT_TYPE_FLOAT: - packed = tu_pack_float32_for_sfloat(val.float32, ch->size); - break; - default: - unreachable("unexpected channel type"); - packed = 0; - break; - } - - assert((packed & tu_pack_mask(ch->size)) == packed); - return packed; -} - -static const struct util_format_channel_description * -tu_get_format_channel_description(const struct util_format_description *desc, - int comp) -{ - switch (desc->swizzle[comp]) { - case PIPE_SWIZZLE_X: - return &desc->channel[0]; - case PIPE_SWIZZLE_Y: - return &desc->channel[1]; - case PIPE_SWIZZLE_Z: - return &desc->channel[2]; - case PIPE_SWIZZLE_W: - return &desc->channel[3]; - default: - return NULL; - } -} - -static union tu_clear_component_value -tu_get_clear_component_value(const VkClearValue *val, int comp, - enum util_format_colorspace colorspace) -{ - assert(comp < 4); - - union tu_clear_component_value tmp; - switch (colorspace) { - case UTIL_FORMAT_COLORSPACE_ZS: - assert(comp < 2); - if (comp == 0) - tmp.float32 = val->depthStencil.depth; - else - tmp.uint32 = val->depthStencil.stencil; - break; - case UTIL_FORMAT_COLORSPACE_SRGB: - if (comp < 3) { - tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]); - break; - } - default: - assert(comp < 4); - tmp.uint32 = val->color.uint32[comp]; - break; - } - - return tmp; + return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1)); } /* r2d_ = BLIT_OP_SCALE operations */ @@ -275,10 +107,10 @@ r2d_coords(struct tu_cs *cs, return; tu_cs_emit_regs(cs, - A6XX_GRAS_2D_SRC_TL_X(.x = src->x), - A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1), - A6XX_GRAS_2D_SRC_TL_Y(.y = src->y), - A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1)); + A6XX_GRAS_2D_SRC_TL_X(src->x), + A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1), + A6XX_GRAS_2D_SRC_TL_Y(src->y), + A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1)); } static void @@ -323,7 +155,7 @@ r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) linear = util_format_linear_to_srgb_float(val->color.float32[i]); if (ch->type == UTIL_FORMAT_TYPE_SIGNED) - clear_value[i] = tu_pack_float32_for_snorm(linear, 8); + clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f); else clear_value[i] = tu_pack_float32_for_unorm(linear, 8); } else if (ifmt == R2D_FLOAT16) { @@ -346,11 +178,14 @@ r2d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter) + VkFilter filter) { + uint32_t src_info = iview->SP_PS_2D_SRC_INFO; + if (filter != 
VK_FILTER_NEAREST) + src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); - tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO | - COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER)); + tu_cs_emit(cs, src_info); tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); tu_cs_image_ref_2d(cs, iview, layer, true); @@ -393,6 +228,17 @@ r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_image_flag_ref(cs, iview, layer); } +static void +r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + assert(iview->image->samples == 1); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS); + tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); + tu_cs_emit(cs, iview->stencil_PITCH); +} + static void r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -405,32 +251,39 @@ r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch .srgb = vk_format_is_srgb(vk_format)), A6XX_RB_2D_DST_LO((uint32_t) va), A6XX_RB_2D_DST_HI(va >> 32), - A6XX_RB_2D_DST_SIZE(.pitch = pitch)); + A6XX_RB_2D_DST_PITCH(pitch)); } static void r2d_setup_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask, + bool ubwc, bool scissor) { enum a6xx_format format = tu6_base_format(vk_format); enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); uint32_t unknown_8c01 = 0; - if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) { - /* preserve depth channels */ - if (mask == 0x8) - unknown_8c01 = 0x00084001; + if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || + vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { + format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + } + + /* note: the only format with partial clearing is D24S8 */ + if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { /* preserve stencil channel */ - if (mask == 0x7) + if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) unknown_8c01 = 0x08000041; + /* preserve depth channels */ + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + unknown_8c01 = 0x00084001; } - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); tu_cs_emit(cs, unknown_8c01); uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL( @@ -452,7 +305,7 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, if (format == FMT6_10_10_10_2_UNORM_DEST) format = FMT6_16_16_16_16_FLOAT; - tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT( + tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( .sint = vk_format_is_sint(vk_format), .uint = vk_format_is_uint(vk_format), .color_format = format, @@ -464,13 +317,21 @@ static void r2d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask) + bool ubwc) { tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false); + r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false); +} + +static void +r2d_teardown(struct tu_cmd_buffer *cmd, + struct tu_cs *cs) +{ + /* nothing to do here */ } static void @@ -482,16 +343,62 @@ r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) /* r3d_ = shader path operations */ +void +tu_init_clear_blit_shaders(struct tu6_global *global) +{ +#define MOV(args...) 
{ .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
+#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
+#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
+
+   static const instr_t vs_code[] = {
+      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
+       * r1.xy = r0.w ? c1.zw : c0.zw
+       * r0.w = 1.0f
+       */
+      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
+           .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
+           .src2 = 3,
+           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
+      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
+           .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
+           .src2 = 3,
+           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
+      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
+      { .cat0 = { .opc = OPC_END } },
+   };
+
+   static const instr_t fs_blit[] = {
+      /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
+       * blit path (it's not clear what allows it to not have it)
+       */
+      CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
+      { .cat0 = { .opc = OPC_END } },
+   };
+
+   memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
+   memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
+
+   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
+      instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
+      for (uint32_t i = 0; i < num_rts; i++) {
+         /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
+         *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
+      }
+      *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
+   }
+}
+
 static void
-r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
-             bool layered_clear)
+r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
+           bool layered_clear)
 {
+   struct ir3_const_state dummy_const_state = {};
    struct ir3_shader dummy_shader = {};
    struct ir3_shader_variant vs = {
       .type = MESA_SHADER_VERTEX,
       .instrlen = 1,
-      .constlen = 2,
+      .constlen = 4,
       .info.max_reg = 1,
       .inputs_count = 1,
       .inputs[0] = {
@@ -509,20 +416,18 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu
          .regid = regid(1, 0),
       },
       .shader = &dummy_shader,
+      .const_state = &dummy_const_state,
    };

    if (layered_clear) {
-      vs = (struct ir3_shader_variant) {
-         .type = MESA_SHADER_VERTEX,
-         .instrlen = 1,
-         .info.max_reg = 0,
-         .shader = &dummy_shader,
-      };
+      vs.outputs[1].slot = VARYING_SLOT_LAYER;
+      vs.outputs[1].regid = regid(1, 1);
+      vs.outputs_count = 2;
    }

    struct ir3_shader_variant fs = {
       .type = MESA_SHADER_FRAGMENT,
       .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
-      .constlen = num_rts,
+      .constlen = align(num_rts, 4),
       .info.max_reg = MAX2(num_rts, 1) - 1,
       .total_in = blit ? 2 : 0,
       .num_samp = blit ? 1 : 0,
@@ -545,139 +450,33 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu
          .cmd = 4,
       },
       .shader = &dummy_shader,
+      .const_state = &dummy_const_state,
    };

-   struct ir3_shader_variant gs_shader = {
-      .type = MESA_SHADER_GEOMETRY,
-      .instrlen = 1,
-      .constlen = 2,
-      .info.max_reg = 1,
-      .inputs_count = 1,
-      .inputs[0] = {
-         .slot = SYSTEM_VALUE_GS_HEADER_IR3,
-         .regid = regid(0, 0),
-         .sysval = true,
-      },
-      .outputs_count = 3,
-      .outputs[0] = {
-         .slot = VARYING_SLOT_POS,
-         .regid = regid(0, 0),
-      },
-      .outputs[1] = {
-         .slot = VARYING_SLOT_LAYER,
-         .regid = regid(1, 1),
-      },
-      .outputs[2] = {
-         .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
-         .regid = regid(1, 0),
-      },
-      .shader = &dummy_shader,
-   }, *gs = layered_clear ?
&gs_shader : NULL; - - -#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } } -#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } } -#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } } - - static const instr_t vs_code[] = { - /* r0.xyz = r0.w ? c1.xyz : c0.xyz - * r1.xy = r0.w ? c1.zw : c0.zw - * r0.w = 1.0f - */ - CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0, - .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1, - .src2 = 3, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}), - CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4, - .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1, - .src2 = 3, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}), - MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ), - { .cat0 = { .opc = OPC_END } }, - }; - - static const instr_t vs_layered[] = { - { .cat0 = { .opc = OPC_CHMASK } }, - { .cat0 = { .opc = OPC_CHSH } }, - }; - - static const instr_t gs_code[16] = { - /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */ - CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16, - .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1), - /* x = (local_id & 1) ? c1.x : c0.x */ - CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1), - /* y = (local_id & 2) ? c1.y : c0.y */ - CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2), - /* pred = (local_id >= 4), used by OPC_KILL */ - CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4), - /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */ - CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0), - - MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */ - MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f), - MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */ - - /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */ - CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0, - .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1, - .src2 = 0, - .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}), - - CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2), - - { .cat0 = { .opc = OPC_KILL } }, - { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } }, - }; -#define FS_OFFSET (16 * sizeof(instr_t)) -#define GS_OFFSET (32 * sizeof(instr_t)) - - /* shaders */ - struct ts_cs_memory shaders = { }; - VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear, - 16 * sizeof(instr_t), &shaders); - assert(result == VK_SUCCESS); - - if (layered_clear) { - memcpy(shaders.map, vs_layered, sizeof(vs_layered)); - memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code)); - } else { - memcpy(shaders.map, vs_code, sizeof(vs_code)); - } - - instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET); - for (uint32_t i = 0; i < num_rts; i++) { - /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */ - *fs_code++ = (instr_t) { .cat1 = { - .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, - .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 - } }; - } - - /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its - * blit path (its not clear what allows it to not have it) - */ - if (blit) { - *fs_code++ = (instr_t) { .cat2 = { - .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1, - .dst = regid(63, 0), .src1_im = 1 - } }; - } - *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } }; - /* note: assumed <= 16 instructions (MAX_RTS is 8) */ - - tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff)); - - tu6_emit_xs_config(cs, 
MESA_SHADER_VERTEX, &vs, shaders.iova); + tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + .vs_state = true, + .hs_state = true, + .ds_state = true, + .gs_state = true, + .fs_state = true, + .cs_state = true, + .gfx_ibo = true, + .cs_ibo = true, + .gfx_shared_const = true, + .gfx_bindless = 0x1f, + .cs_bindless = 0x1f)); + + tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS])); tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0); tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0); - tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET); - tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET); + tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0); + tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, + global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)])); tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0()); tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0()); - tu6_emit_vpc(cs, &vs, gs, &fs, NULL); + tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false); /* REPL_MODE for varying with RECTLIST (2 vertices only) */ tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0)); @@ -690,26 +489,29 @@ r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t nu .persp_division_disable = 1, .vp_xform_disable = 1, .vp_clip_code_ignore = 1, - .clip_disable = 1), - A6XX_GRAS_UNKNOWN_8001(0)); + .clip_disable = 1)); tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable? tu_cs_emit_regs(cs, - A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0), - A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0), + A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); tu_cs_emit_regs(cs, - A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0), - A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff)); + A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0), + A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); + + tu_cs_emit_regs(cs, + A6XX_VFD_INDEX_OFFSET(), + A6XX_VFD_INSTANCE_START_OFFSET()); } static void -r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords) +r3d_coords_raw(struct tu_cs *cs, const float *coords) { tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | CP_LOAD_STATE6_0_NUM_UNIT(2)); tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); @@ -724,7 +526,7 @@ r3d_coords(struct tu_cs *cs, { int32_t src_x1 = src ? src->x : 0; int32_t src_y1 = src ? src->y : 0; - r3d_coords_raw(cs, false, (float[]) { + r3d_coords_raw(cs, (float[]) { dst->x, dst->y, src_x1, src_y1, dst->x + extent->width, dst->y + extent->height, @@ -780,9 +582,9 @@ r3d_src_common(struct tu_cmd_buffer *cmd, const uint32_t *tex_const, uint32_t offset_base, uint32_t offset_ubwc, - bool linear_filter) + VkFilter filter) { - struct ts_cs_memory texture = { }; + struct tu_cs_memory texture = { }; VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, /* allocate space for a sampler too */ A6XX_TEX_CONST_DWORDS, &texture); @@ -797,8 +599,8 @@ r3d_src_common(struct tu_cmd_buffer *cmd, texture.map[8] = ubwc_addr >> 32; texture.map[A6XX_TEX_CONST_DWORDS + 0] = - A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? 
A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | - A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) | + A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) | + A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) | A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) | A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) | A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) | @@ -840,12 +642,12 @@ r3d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter) + VkFilter filter) { r3d_src_common(cmd, cs, iview->descriptor, iview->layer_size * layer, iview->ubwc_layer_size * layer, - linear_filter); + filter); } static void @@ -870,7 +672,6 @@ r3d_src_buffer(struct tu_cmd_buffer *cmd, A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W); desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); desc[2] = - A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) | A6XX_TEX_CONST_2_PITCH(pitch) | A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); desc[3] = 0; @@ -879,7 +680,7 @@ r3d_src_buffer(struct tu_cmd_buffer *cmd, for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++) desc[i] = 0; - r3d_src_common(cmd, cs, desc, 0, 0, false); + r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); } static void @@ -898,6 +699,19 @@ r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); } +static void +r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +{ + tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */ + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); + tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO)); + tu_cs_image_stencil_ref(cs, iview, layer); + tu_cs_emit(cs, 0); + + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); +} + static void r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) { @@ -916,23 +730,48 @@ r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); } +static uint8_t +aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask) +{ + uint8_t mask = 0xf; + assert(aspect_mask); + /* note: the only format with partial writing is D24S8, + * clear/blit uses the _AS_R8G8B8A8 format to access it + */ + if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) + mask = 0x7; + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + mask = 0x8; + } + return mask; +} + static void r3d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask) + bool ubwc) { + enum a6xx_format format = tu6_base_format(vk_format); + + if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || + vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { + format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + } + if (!cmd->state.pass) { tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff); + tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff); } tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); - r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0, false); + r3d_common(cmd, cs, !clear, clear ? 
1 : 0, false); tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | @@ -963,13 +802,19 @@ r3d_setup(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, - .color_format = tu6_base_format(vk_format), + .color_format = format, .color_sint = vk_format_is_sint(vk_format), .color_uint = vk_format_is_uint(vk_format))); - tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask)); + tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, + .component_enable = aspect_write_mask(vk_format, aspect_mask))); tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format))); tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format))); + + if (cmd->state.predication_active) { + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); + tu_cs_emit(cs, 0); + } } static void @@ -983,6 +828,15 @@ r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, 2); /* vertex count */ } +static void +r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (cmd->state.predication_active) { + tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); + tu_cs_emit(cs, 1); + } +} + /* blit ops - common interface for 2d/shader paths */ struct blit_ops { @@ -996,7 +850,7 @@ struct blit_ops { struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, - bool linear_filter); + VkFilter filter); void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch, @@ -1006,10 +860,13 @@ struct blit_ops { void (*setup)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, + VkImageAspectFlags aspect_mask, enum a6xx_rotation rotation, bool clear, - uint8_t mask); + bool ubwc); void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + void (*teardown)(struct tu_cmd_buffer *cmd, + struct tu_cs *cs); }; static const struct blit_ops r2d_ops = { @@ -1021,6 +878,7 @@ static const struct blit_ops r2d_ops = { .dst_buffer = r2d_dst_buffer, .setup = r2d_setup, .run = r2d_run, + .teardown = r2d_teardown, }; static const struct blit_ops r3d_ops = { @@ -1032,6 +890,7 @@ static const struct blit_ops r3d_ops = { .dst_buffer = r3d_dst_buffer, .setup = r3d_setup, .run = r3d_run, + .teardown = r3d_teardown, }; /* passthrough set coords from 3D extents */ @@ -1045,13 +904,51 @@ coords(const struct blit_ops *ops, ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent); } +static VkFormat +copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer) +{ + if (vk_format_is_compressed(format)) { + switch (vk_format_get_blocksize(format)) { + case 1: return VK_FORMAT_R8_UINT; + case 2: return VK_FORMAT_R16_UINT; + case 4: return VK_FORMAT_R32_UINT; + case 8: return VK_FORMAT_R32G32_UINT; + case 16:return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("unhandled format size"); + } + } + + switch (format) { + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT) + return VK_FORMAT_R8G8_UNORM; + /* fallthrough */ + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + return VK_FORMAT_R8_UNORM; + case VK_FORMAT_D24_UNORM_S8_UINT: + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer) + return VK_FORMAT_R8_UNORM; + /* fallthrough */ + default: + return format; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + return VK_FORMAT_R32_UINT; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + return VK_FORMAT_S8_UINT; 
+ assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); + return VK_FORMAT_D32_SFLOAT; + } +} + static void -tu_image_view_blit2(struct tu_image_view *iview, - struct tu_image *image, - VkFormat format, - const VkImageSubresourceLayers *subres, - uint32_t layer, - bool stencil_read) +tu_image_view_copy_blit(struct tu_image_view *iview, + struct tu_image *image, + VkFormat format, + const VkImageSubresourceLayers *subres, + uint32_t layer, + bool stencil_read) { VkImageAspectFlags aspect_mask = subres->aspectMask; @@ -1074,7 +971,19 @@ tu_image_view_blit2(struct tu_image_view *iview, .baseArrayLayer = subres->baseArrayLayer + layer, .layerCount = 1, }, - }); + }, false); +} + +static void +tu_image_view_copy(struct tu_image_view *iview, + struct tu_image *image, + VkFormat format, + const VkImageSubresourceLayers *subres, + uint32_t layer, + bool stencil_read) +{ + format = copy_format(format, subres->aspectMask, false); + tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read); } static void @@ -1083,7 +992,7 @@ tu_image_view_blit(struct tu_image_view *iview, const VkImageSubresourceLayers *subres, uint32_t layer) { - tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false); + tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false); } static void @@ -1093,7 +1002,7 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, const VkImageBlit *info, VkFilter filter) { - const struct blit_ops *ops = &r3d_ops; + const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; uint32_t layers; @@ -1127,15 +1036,6 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, layers = info->dstSubresource.layerCount; } - uint8_t mask = 0xf; - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask); - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) - mask = 0x7; - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) - mask = 0x8; - } - /* BC1_RGB_* formats need to have their last components overriden with 1 * when sampling, which is normally handled with the texture descriptor * swizzle. The 2d path can't handle that, so use the 3d path. 
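
[editor's note: the intended behavior of the copy_format() helper added in the hunk above can be pinned down with a few concrete cases. This is an illustrative sketch, not part of the patch; it assumes copy_format() is visible in the same translation unit and merely restates the switch above.]

/* illustrative only: expected results of the copy_format() switch above,
 * relying on this file's existing includes for assert() and the VK enums
 */
static void
copy_format_examples(void)
{
   /* compressed formats are copied via uint formats of the same block size;
    * BC1 has 8-byte blocks, so it maps to R32G32_UINT
    */
   assert(copy_format(VK_FORMAT_BC1_RGB_UNORM_BLOCK,
                      VK_IMAGE_ASPECT_COLOR_BIT, false) == VK_FORMAT_R32G32_UINT);

   /* the stencil aspect of D24S8 is copied to/from buffers one byte per texel */
   assert(copy_format(VK_FORMAT_D24_UNORM_S8_UINT,
                      VK_IMAGE_ASPECT_STENCIL_BIT, true) == VK_FORMAT_R8_UNORM);

   /* E5B9G9R9 is not renderable, so copies reinterpret it as raw 32-bit uints */
   assert(copy_format(VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
                      VK_IMAGE_ASPECT_COLOR_BIT, false) == VK_FORMAT_R32_UINT);
}
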
@@ -1146,17 +1046,15 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, if (dst_image->samples > 1 || src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK || - src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK) + src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK || + filter == VK_FILTER_CUBIC_EXT) ops = &r3d_ops; - /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests, - * figure out why (should be able to pass all tests with only shader path) - */ - - ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask); + ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask, + rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc); if (ops == &r3d_ops) { - r3d_coords_raw(cs, false, (float[]) { + r3d_coords_raw(cs, (float[]) { info->dstOffsets[0].x, info->dstOffsets[0].y, info->srcOffsets[0].x, info->srcOffsets[0].y, info->dstOffsets[1].x, info->dstOffsets[1].y, @@ -1169,10 +1067,10 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1, .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1)); tu_cs_emit_regs(cs, - A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), - A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), - A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), - A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); + A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), + A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), + A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), + A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); } struct tu_image_view dst, src; @@ -1181,9 +1079,11 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, for (uint32_t i = 0; i < layers; i++) { ops->dst(cs, &dst, i); - ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR); + ops->src(cmd, cs, &src, i, filter); ops->run(cmd, cs); } + + ops->teardown(cmd, cs); } void @@ -1208,21 +1108,6 @@ tu_CmdBlitImage(VkCommandBuffer commandBuffer, tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter); } -static VkFormat -copy_format(VkFormat format) -{ - switch (vk_format_get_blocksizebits(format)) { - case 8: return VK_FORMAT_R8_UINT; - case 16: return VK_FORMAT_R16_UINT; - case 32: return VK_FORMAT_R32_UINT; - case 64: return VK_FORMAT_R32G32_UINT; - case 96: return VK_FORMAT_R32G32B32_UINT; - case 128:return VK_FORMAT_R32G32B32A32_UINT; - default: - unreachable("unhandled format size"); - } -} - static void copy_compressed(VkFormat format, VkOffset3D *offset, @@ -1257,47 +1142,36 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat dst_format = dst_image->vk_format; - VkFormat src_format = dst_image->vk_format; + VkFormat src_format = + copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true); const struct blit_ops *ops = &r2d_ops; - uint8_t mask = 0xf; - - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - switch (info->imageSubresource.aspectMask) { - case VK_IMAGE_ASPECT_STENCIL_BIT: - src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */ - mask = 0x8; - ops = &r3d_ops; - break; - case VK_IMAGE_ASPECT_DEPTH_BIT: - mask = 0x7; - break; - } + /* special case for buffer to 
stencil */ + if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && + info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + ops = &r3d_ops; } + /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format, + * which matters for UBWC. buffer_to_image/etc can fail because of this + */ + VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; uint32_t src_width = info->bufferRowLength ?: extent.width; uint32_t src_height = info->bufferImageHeight ?: extent.height; - if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) { - assert(src_format == dst_format); - copy_compressed(dst_format, &offset, &extent, &src_width, &src_height); - src_format = dst_format = copy_format(dst_format); - } + copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height); uint32_t pitch = src_width * vk_format_get_blocksize(src_format); uint32_t layer_size = src_height * pitch; - /* note: the src_va/pitch alignment of 64 is for 2D engine, - * it is also valid for 1cpp format with shader path (stencil aspect path) - */ - - ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, + copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false), + info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc); struct tu_image_view dst; - tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false); + tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false); for (uint32_t i = 0; i < layers; i++) { ops->dst(cs, &dst, i); @@ -1319,6 +1193,8 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1348,13 +1224,12 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat src_format = src_image->vk_format; - VkFormat dst_format = src_image->vk_format; + VkFormat dst_format = + copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true); bool stencil_read = false; if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { - dst_format = VK_FORMAT_R8_UNORM; stencil_read = true; } @@ -1364,26 +1239,18 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, uint32_t dst_width = info->bufferRowLength ?: extent.width; uint32_t dst_height = info->bufferImageHeight ?: extent.height; - if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) { - assert(src_format == dst_format); - copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height); - src_format = dst_format = copy_format(dst_format); - } + copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height); uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format); uint32_t layer_size = pitch * dst_height; - /* note: the dst_va/pitch alignment of 64 is for 2D engine, - * it is also valid for 1cpp format with shader path (stencil aspect) - */ - - ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); struct tu_image_view src; - tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read); + tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read); for 
(uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i; if ((dst_va & 63) || (pitch & 63)) { @@ -1401,6 +1268,8 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1447,7 +1316,7 @@ is_swapped_format(VkFormat format) static bool image_is_r8g8(struct tu_image *image) { - return image->layout.cpp == 2 && + return image->layout[0].cpp == 2 && vk_format_get_nr_components(image->vk_format) == 2; } @@ -1460,19 +1329,9 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; - uint8_t mask = 0xf; - if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) - mask = 0x7; - if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) - mask = 0x8; - } - if (dst_image->samples > 1) ops = &r3d_ops; - assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask); - VkFormat format = VK_FORMAT_UNDEFINED; VkOffset3D src_offset = info->srcOffset; VkOffset3D dst_offset = info->dstOffset; @@ -1497,10 +1356,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL); copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL); - VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ? - copy_format(dst_image->vk_format) : dst_image->vk_format; - VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ? - copy_format(src_image->vk_format) : src_image->vk_format; + VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false); + VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false); bool use_staging_blit = false; @@ -1509,12 +1366,12 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, * the same as a blit. */ format = src_format; - } else if (!src_image->layout.tile_mode) { + } else if (!src_image->layout[0].tile_mode) { /* If an image is linear, we can always safely reinterpret it with the * other image's format and then do a regular blit. */ format = dst_format; - } else if (!dst_image->layout.tile_mode) { + } else if (!dst_image->layout[0].tile_mode) { format = src_format; } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) { /* We can't currently copy r8g8 images to/from other cpp=2 images, @@ -1527,9 +1384,9 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, * to/from it. */ use_staging_blit = true; - } else if (!src_image->layout.ubwc) { + } else if (!src_image->layout[0].ubwc) { format = dst_format; - } else if (!dst_image->layout.ubwc) { + } else if (!dst_image->layout[0].ubwc) { format = src_format; } else { /* Both formats use UBWC and so neither can be reinterpreted. 
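
[editor's note: the reinterpretation rules in the hunk above reduce to a short decision ladder. The sketch below is a condensed restatement, not part of the patch; reconcile_copy_format is a hypothetical name, and it mirrors the if/else chain of tu_copy_image_to_image() using the tu_image fields and image_is_r8g8() helper from this file.]

/* hypothetical restatement of the format-reconciliation ladder above:
 * pick one format both images can safely be viewed with, or fall back
 * to a linear staging copy when no such format exists
 */
static VkFormat
reconcile_copy_format(struct tu_image *src, VkFormat src_format,
                      struct tu_image *dst, VkFormat dst_format,
                      bool *use_staging_blit)
{
   *use_staging_blit = false;

   if (src_format == dst_format)
      return src_format;                /* identical formats: plain blit */
   if (!src->layout[0].tile_mode)
      return dst_format;                /* linear src: reinterpret as dst */
   if (!dst->layout[0].tile_mode)
      return src_format;                /* linear dst: reinterpret as src */
   if (image_is_r8g8(src) != image_is_r8g8(dst)) {
      *use_staging_blit = true;         /* r8g8 tiling differs from other cpp=2 */
      return VK_FORMAT_UNDEFINED;
   }
   if (!src->layout[0].ubwc)
      return dst_format;                /* only dst is UBWC: its format wins */
   if (!dst->layout[0].ubwc)
      return src_format;

   *use_staging_blit = true;            /* both UBWC: no safe reinterpretation */
   return VK_FORMAT_UNDEFINED;
}
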
@@ -1541,8 +1398,8 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, struct tu_image_view dst, src; if (use_staging_blit) { - tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false); - tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false); + tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false); + tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false); struct tu_image staging_image = { .vk_format = src_format, @@ -1564,10 +1421,10 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, VkOffset3D staging_offset = { 0 }; - staging_image.layout.tile_mode = TILE6_LINEAR; - staging_image.layout.ubwc = false; + staging_image.layout[0].tile_mode = TILE6_LINEAR; + staging_image.layout[0].ubwc = false; - fdl6_layout(&staging_image.layout, + fdl6_layout(&staging_image.layout[0], vk_format_to_pipe_format(staging_image.vk_format), staging_image.samples, staging_image.extent.width, @@ -1579,7 +1436,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, NULL); VkResult result = tu_get_scratch_bo(cmd->device, - staging_image.layout.size, + staging_image.layout[0].size, &staging_image.bo); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1590,14 +1447,14 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); struct tu_image_view staging; - tu_image_view_blit2(&staging, &staging_image, src_format, - &staging_subresource, 0, false); + tu_image_view_copy(&staging, &staging_image, src_format, + &staging_subresource, 0, false); - ops->setup(cmd, cs, src_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); coords(ops, cs, &staging_offset, &src_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &staging, i); ops->run(cmd, cs); } @@ -1608,30 +1465,34 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - tu_image_view_blit2(&staging, &staging_image, dst_format, - &staging_subresource, 0, false); + tu_image_view_copy(&staging, &staging_image, dst_format, + &staging_subresource, 0, false); - ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask); + ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, + ROTATE_0, false, dst_image->layout[0].ubwc); coords(ops, cs, &dst_offset, &staging_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &staging, i, false); + ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, i); ops->run(cmd, cs); } } else { - tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false); - tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false); + tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false); + tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false); - ops->setup(cmd, cs, format, ROTATE_0, false, mask); + ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, + ROTATE_0, false, dst_image->layout[0].ubwc); coords(ops, cs, &dst_offset, &src_offset, &extent); for (uint32_t i = 0; i < info->extent.depth; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, 
i); ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1666,7 +1527,7 @@ copy_buffer(struct tu_cmd_buffer *cmd, VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM; uint64_t blocks = size / block_size; - ops->setup(cmd, cs, format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false); while (blocks) { uint32_t src_x = (src_va & 63) / block_size; @@ -1682,6 +1543,8 @@ copy_buffer(struct tu_cmd_buffer *cmd, dst_va += width * block_size; blocks -= width; } + + ops->teardown(cmd, cs); } void @@ -1718,7 +1581,7 @@ tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); - struct ts_cs_memory tmp; + struct tu_cs_memory tmp; VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1749,7 +1612,7 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset; uint32_t blocks = fillSize / 4; - ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf); + ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false); ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); while (blocks) { @@ -1763,6 +1626,8 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, dst_va += width * 4; blocks -= width; } + + ops->teardown(cmd, cs); } void @@ -1783,7 +1648,8 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, + ROTATE_0, false, dst_image->layout[0].ubwc); for (uint32_t i = 0; i < regionCount; ++i) { const VkImageResolve *info = &pRegions[i]; @@ -1799,11 +1665,13 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z); for (uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, &src, i, false); + ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); ops->dst(cs, &dst, i); ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1821,47 +1689,45 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd, assert(src->image->vk_format == dst->image->vk_format); - ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf); + ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, + ROTATE_0, false, dst->ubwc_enabled); ops->coords(cs, &rect->offset, &rect->offset, &rect->extent); for (uint32_t i = 0; i < layers; i++) { - ops->src(cmd, cs, src, i, false); + ops->src(cmd, cs, src, i, VK_FILTER_NEAREST); ops->dst(cs, dst, i); ops->run(cmd, cs); } + + ops->teardown(cmd, cs); } static void clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, const VkClearValue *clear_value, - const VkImageSubresourceRange *range) + const VkImageSubresourceRange *range, + VkImageAspectFlags aspect_mask) { uint32_t level_count = tu_get_levelCount(image, range); uint32_t layer_count = tu_get_layerCount(image, range); struct tu_cs *cs = &cmd->cs; VkFormat format = image->vk_format; - if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - format = VK_FORMAT_R32_UINT; + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + format = copy_format(format, aspect_mask, false); if (image->type == VK_IMAGE_TYPE_3D) { 
assert(layer_count == 1); assert(range->baseArrayLayer == 0); } - uint8_t mask = 0xf; - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { - mask = 0; - if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) - mask |= 0x7; - if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) - mask |= 0x8; - } - const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops; - ops->setup(cmd, cs, format, ROTATE_0, true, mask); - ops->clear_value(cs, image->vk_format, clear_value); + ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc); + if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value); + else + ops->clear_value(cs, format, clear_value); for (unsigned j = 0; j < level_count; j++) { if (image->type == VK_IMAGE_TYPE_3D) @@ -1873,8 +1739,8 @@ clear_image(struct tu_cmd_buffer *cmd, }); struct tu_image_view dst; - tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) { - .aspectMask = range->aspectMask, + tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) { + .aspectMask = aspect_mask, .mipLevel = range->baseMipLevel + j, .baseArrayLayer = range->baseArrayLayer, .layerCount = 1, @@ -1885,6 +1751,8 @@ clear_image(struct tu_cmd_buffer *cmd, ops->run(cmd, cs); } } + + ops->teardown(cmd, cs); } void @@ -1901,7 +1769,7 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i); + clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); } void @@ -1917,97 +1785,18 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); - for (unsigned i = 0; i < rangeCount; i++) - clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i); -} - -static void -tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd, - uint32_t attachment_count, - const VkClearAttachment *attachments, - uint32_t rect_count, - const VkClearRect *rects) -{ - const struct tu_subpass *subpass = cmd->state.subpass; - /* note: cannot use shader path here.. there is a special shader path - * in tu_clear_sysmem_attachments() - */ - const struct blit_ops *ops = &r2d_ops; - struct tu_cs *cs = &cmd->draw_cs; - - for (uint32_t j = 0; j < attachment_count; j++) { - /* The vulkan spec, section 17.2 "Clearing Images Inside a Render - * Pass Instance" says that: - * - * Unlike other clear commands, vkCmdClearAttachments executes as - * a drawing command, rather than a transfer command, with writes - * performed by it executing in rasterization order. Clears to - * color attachments are executed as color attachment writes, by - * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage. - * Clears to depth/stencil attachments are executed as depth - * writes and writes by the - * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and - * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages. - * - * However, the 2d path here is executed the same way as a - * transfer command, using the CCU color cache exclusively with - * a special depth-as-color format for depth clears. This means that - * we can't rely on the normal pipeline barrier mechanism here, and - * have to manually flush whenever using a different cache domain - * from what the 3d path would've used. 
This happens when we clear - * depth/stencil, since normally depth attachments use CCU depth, but - * we clear it using a special depth-as-color format. Since the clear - * potentially uses a different attachment state we also need to - * invalidate color beforehand and flush it afterwards. - */ + for (unsigned i = 0; i < rangeCount; i++) { + const VkImageSubresourceRange *range = &pRanges[i]; - uint32_t a; - if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - a = subpass->color_attachments[attachments[j].colorAttachment].attachment; - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - } else { - a = subpass->depth_stencil_attachment.attachment; - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - } - - if (a == VK_ATTACHMENT_UNUSED) - continue; - - uint8_t mask = 0xf; - if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) { - if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) - mask &= ~0x7; - if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)) - mask &= ~0x8; - } - - const struct tu_image_view *iview = - cmd->state.framebuffer->attachments[a].attachment; - - ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask); - ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue); - - /* Wait for the flushes we triggered manually to complete */ - tu_cs_emit_wfi(cs); - - for (uint32_t i = 0; i < rect_count; i++) { - ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent); - for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) { - ops->dst(cs, iview, rects[i].baseArrayLayer + layer); - ops->run(cmd, cs); - } - } + if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + /* can't clear both depth and stencil at once, split up the aspect mask */ + uint32_t b; + for_each_bit(b, range->aspectMask) + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); + continue; + } - if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); - } else { - /* sync color into depth */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); - } + clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); } } @@ -2062,16 +1851,22 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, max_samples = MAX2(max_samples, pass->attachments[a].samples); } - /* prefer to use 2D path for clears - * 2D can't clear separate depth/stencil and msaa, needs known framebuffer + /* disable all draw states so they don't interfere + * TODO: use and re-use draw states + * we have to disable draw states individually to preserve + * input attachment states, because a secondary command buffer + * won't be able to restore them */ - if (max_samples == 1 && cmd->state.framebuffer) { - tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects); - return; + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2)); + for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) { + if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM || + i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM) + continue; + tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) | + CP_SET_DRAW_STATE__0_DISABLE); + tu_cs_emit_qw(cs, 0); } - - /* This clear path behaves like a draw, needs the same flush as 
tu_draw */ - tu_emit_cache_flush_renderpass(cmd, cs); + cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE; tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | @@ -2092,7 +1887,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, layered_clear = true; } - r3d_pipeline(cmd, cs, false, num_rts, layered_clear); + r3d_common(cmd, cs, false, num_rts, layered_clear); tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components)); @@ -2138,118 +1933,82 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, for (uint32_t i = 0; i < rect_count; i++) { for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) { - r3d_coords_raw(cs, layered_clear, (float[]) { + r3d_coords_raw(cs, (float[]) { rects[i].rect.offset.x, rects[i].rect.offset.y, z_clear_val, uif(rects[i].baseArrayLayer + layer), rects[i].rect.offset.x + rects[i].rect.extent.width, rects[i].rect.offset.y + rects[i].rect.extent.height, z_clear_val, 1.0f, }); - - if (layered_clear) { - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); - tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) | - CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | - CP_DRAW_INDX_OFFSET_0_GS_ENABLE); - tu_cs_emit(cs, 1); /* instance count */ - tu_cs_emit(cs, 1); /* vertex count */ - } else { - r3d_run(cmd, cs); - } + r3d_run(cmd, cs); } } - - cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE | - TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | - TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | - TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | - TU_CMD_DIRTY_DYNAMIC_VIEWPORT | - TU_CMD_DIRTY_DYNAMIC_SCISSOR; } -/** - * Pack a VkClearValue into a 128-bit buffer. format is respected except - * for the component order. The components are always packed in WZYX order, - * because gmem is tiled and tiled formats always have WZYX swap - */ static void -pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) +pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4]) { - const struct util_format_description *desc = vk_format_description(format); + enum pipe_format pformat = vk_format_to_pipe_format(format); switch (format) { - case VK_FORMAT_B10G11R11_UFLOAT_PACK32: - buf[0] = float3_to_r11g11b10f(val->color.float32); + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) | + val->depthStencil.stencil << 24; return; - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: - buf[0] = float3_to_rgb9e5(val->color.float32); + case VK_FORMAT_D16_UNORM: + clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16); + return; + case VK_FORMAT_D32_SFLOAT: + clear_value[0] = fui(val->depthStencil.depth); return; + case VK_FORMAT_S8_UINT: + clear_value[0] = val->depthStencil.stencil; + return; + /* these formats use a different base format when tiled + * the same format can be used for both because GMEM is always in WZYX order + */ + case VK_FORMAT_R5G5B5A1_UNORM_PACK16: + case VK_FORMAT_B5G5R5A1_UNORM_PACK16: + pformat = PIPE_FORMAT_B5G5R5A1_UNORM; default: break; } - assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); - - /* S8_UINT is special and has no depth */ - const int max_components = - format == VK_FORMAT_S8_UINT ? 
2 : desc->nr_channels; - - int buf_offset = 0; - int bit_shift = 0; - for (int comp = 0; comp < max_components; comp++) { - const struct util_format_channel_description *ch = - tu_get_format_channel_description(desc, comp); - if (!ch) { - assert((format == VK_FORMAT_S8_UINT && comp == 0) || - (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1)); - continue; - } - - union tu_clear_component_value v = tu_get_clear_component_value( - val, comp, desc->colorspace); - - /* move to the next uint32_t when there is not enough space */ - assert(ch->size <= 32); - if (bit_shift + ch->size > 32) { - buf_offset++; - bit_shift = 0; - } + VkClearColorValue color; - if (bit_shift == 0) - buf[buf_offset] = 0; + /** + * GMEM is tiled and wants the components in WZYX order, + * apply swizzle to the color before packing, to counteract + * deswizzling applied by packing functions + */ + pipe_swizzle_4f(color.float32, val->color.float32, + util_format_description(pformat)->swizzle); - buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; - bit_shift += ch->size; - } + util_format_pack_rgba(pformat, clear_value, color.uint32, 1); } static void -tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t attachment, - uint8_t component_mask, - const VkClearValue *value) +clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + VkFormat format, + uint8_t clear_mask, + uint32_t gmem_offset, + const VkClearValue *value) { - VkFormat vk_format = cmd->state.pass->attachments[attachment].format; - /* note: component_mask is 0x7 for depth and 0x8 for stencil - * because D24S8 is cleared with AS_R8G8B8A8 format - */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format))); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format))); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask)); + tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset); + tu_cs_emit(cs, gmem_offset); tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); tu_cs_emit(cs, 0); uint32_t clear_vals[4] = {}; - pack_gmem_clear_value(value, vk_format, clear_vals); + pack_gmem_clear_value(value, format, clear_vals); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); tu_cs_emit_array(cs, clear_vals, 4); @@ -2257,6 +2016,27 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, BLIT); } +static void +tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t attachment, + VkImageAspectFlags mask, + const VkClearValue *value) +{ + const struct tu_render_pass_attachment *att = + &cmd->state.pass->attachments[attachment]; + + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value); + if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) + clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); + return; + } + + clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value); +} + static void tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, @@ -2288,15 +2068,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer 
 static void
 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                           uint32_t attachment_count,
@@ -2288,15 +2068,7 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
       if (a == VK_ATTACHMENT_UNUSED)
          continue;
 
-      unsigned clear_mask = 0xf;
-      if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
-         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
-            clear_mask &= ~0x7;
-         if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
-            clear_mask &= ~0x8;
-      }
-
-      tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
+      tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                     &attachments[j].clearValue);
    }
 }
@@ -2312,6 +2084,27 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
+   /* The sysmem path behaves like a draw. Note that we don't have a way of
+    * using different flushes for sysmem/gmem, so this needs to be outside
+    * of the cond_exec.
+    */
+   tu_emit_cache_flush_renderpass(cmd, cs);
+
+   /* vkCmdClearAttachments is supposed to respect the predicate if active.
+    * The easiest way to do this is to always use the 3d path, which always
+    * works even with GMEM because it's just a simple draw using the existing
+    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
+    * skipped in the binning pass, since otherwise they produce binning data
+    * which isn't consumed and leads to the wrong binning data being read, so
+    * condition on GMEM | SYSMEM.
+    */
+   if (cmd->state.predication_active) {
+      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
+                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
+      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
+      tu_cond_exec_end(cs);
+      return;
+   }
+
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
    tu_cond_exec_end(cs);
@@ -2321,42 +2114,67 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
    tu_cond_exec_end(cs);
 }
 
+static void
+clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
+                        struct tu_cs *cs,
+                        VkFormat format,
+                        VkImageAspectFlags clear_mask,
+                        const VkRenderPassBeginInfo *info,
+                        uint32_t a,
+                        bool separate_stencil)
+{
+   const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_image_view *iview = fb->attachments[a].attachment;
+   const struct blit_ops *ops = &r2d_ops;
+   if (cmd->state.pass->attachments[a].samples > 1)
+      ops = &r3d_ops;
+
+   ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
+   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
+   ops->clear_value(cs, format, &info->pClearValues[a]);
+
+   for (uint32_t i = 0; i < fb->layers; i++) {
+      if (separate_stencil) {
+         if (ops == &r3d_ops)
+            r3d_dst_stencil(cs, iview, i);
+         else
+            r2d_dst_stencil(cs, iview, i);
+      } else {
+         ops->dst(cs, iview, i);
+      }
+      ops->run(cmd, cs);
+   }
+
+   ops->teardown(cmd, cs);
+}
+
 void
 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                            struct tu_cs *cs,
                            uint32_t a,
                            const VkRenderPassBeginInfo *info)
 {
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_image_view *iview = fb->attachments[a].attachment;
    const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
-   uint8_t mask = 0;
-
-   if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
-      mask = 0xf;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
-      mask |= 0x7;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
-      mask |= 0x8;
 
-   if (!mask)
+   if (!attachment->clear_mask)
       return;
 
-   const struct blit_ops *ops = &r2d_ops;
-   if (attachment->samples > 1)
-      ops = &r3d_ops;
-
-   ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
-   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
-   ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
-
    /* Wait for any flushes at the beginning of the renderpass to complete */
    tu_cs_emit_wfi(cs);
 
-   for (uint32_t i = 0; i < fb->layers; i++) {
-      ops->dst(cs, iview, i);
-      ops->run(cmd, cs);
+   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
+                                 info, a, false);
+      }
+      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
+                                 info, a, true);
+      }
+   } else {
+      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
+                              info, a, false);
    }
 
    /* The spec doesn't explicitly say, but presumably the initial renderpass
@@ -2385,21 +2203,13 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
 {
    const struct tu_render_pass_attachment *attachment =
       &cmd->state.pass->attachments[a];
-   unsigned clear_mask = 0;
-
-   if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
-      clear_mask = 0xf;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
-      clear_mask |= 0x7;
-   if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
-      clear_mask |= 0x8;
 
-   if (!clear_mask)
+   if (!attachment->clear_mask)
       return;
 
    tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
 
-   tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
+   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
                                  &info->pClearValues[a]);
 }
 
@@ -2408,7 +2218,8 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
-             bool resolve)
+             bool resolve,
+             bool separate_stencil)
 {
    tu_cs_emit_regs(cs,
                    A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
@@ -2420,14 +2231,23 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
                       .integer = vk_format_is_int(attachment->format)));
 
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
-   tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
-   tu_cs_image_ref_2d(cs, iview, 0, false);
+   if (separate_stencil) {
+      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
+      tu_cs_emit_qw(cs, iview->stencil_base_addr);
+      tu_cs_emit(cs, iview->stencil_PITCH);
 
-   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
-   tu_cs_image_flag_ref(cs, iview, 0);
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
+   } else {
+      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
+      tu_cs_image_ref_2d(cs, iview, 0, false);
 
-   tu_cs_emit_regs(cs,
-                   A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
+      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
+      tu_cs_image_flag_ref(cs, iview, 0);
+
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
+   }
 
    tu6_emit_event_write(cmd, cs, BLIT);
 }
@@ -2479,7 +2299,58 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
       &cmd->state.pass->attachments[a];
 
    if (attachment->load || force_load)
-      tu_emit_blit(cmd, cs, iview, attachment, false);
+      tu_emit_blit(cmd, cs, iview, attachment, false, false);
+
+   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
+      tu_emit_blit(cmd, cs, iview, attachment, false, true);
+}
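clear_sysmem_attachment() above dispatches through a small ops table, taking the 3D (draw-based) path for multisample attachments, presumably because the 2D blitter cannot write per-sample destinations. The same dispatch pattern in miniature; the types, names, and stated reason here are illustrative, not lifted from this file:

   /* minimal vtable-dispatch sketch mirroring the blit_ops pattern */
   struct ops_sketch { void (*run)(void); };
   static void run_2d(void) { /* would emit BLIT_OP_SCALE packets */ }
   static void run_3d(void) { /* would emit a draw-based clear */ }
   static const struct ops_sketch r2d_sketch = { run_2d };
   static const struct ops_sketch r3d_sketch = { run_3d };

   static const struct ops_sketch *
   pick_ops(uint32_t samples)
   {
      return samples > 1 ? &r3d_sketch : &r2d_sketch;
   }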
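store_cp_blit() below reads GMEM as a linear surface whose pitch is the tile width times bytes per pixel. Its stencil call site at the end of this diff passes src->samples as the cpp, which is consistent only if an attachment's cpp already folds in the sample count, leaving S8 at one byte per sample; that reading is an assumption. A worked example under it, with illustrative numbers:

   #include <stdint.h>
   #include <assert.h>

   int main(void)
   {
      /* hypothetical D32_SFLOAT_S8_UINT attachment, 4x MSAA, 256-wide tile */
      uint32_t tile_w = 256, samples = 4;
      uint32_t depth_cpp = 4 * samples;     /* D32: 4 bytes per sample */
      uint32_t stencil_cpp = 1 * samples;   /* S8: 1 byte per sample */
      assert(tile_w * depth_cpp == 16384);  /* depth-plane GMEM pitch */
      assert(tile_w * stencil_cpp == 1024); /* stencil-plane GMEM pitch */
      return 0;
   }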
+
+static void
+store_cp_blit(struct tu_cmd_buffer *cmd,
+              struct tu_cs *cs,
+              struct tu_image_view *iview,
+              uint32_t samples,
+              bool separate_stencil,
+              VkFormat format,
+              uint32_t gmem_offset,
+              uint32_t cpp)
+{
+   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
+                    iview->ubwc_enabled, true);
+   if (separate_stencil)
+      r2d_dst_stencil(cs, iview, 0);
+   else
+      r2d_dst(cs, iview, 0);
+
+   tu_cs_emit_regs(cs,
+                   A6XX_SP_PS_2D_SRC_INFO(
+                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
+                      .tile_mode = TILE6_2,
+                      .srgb = vk_format_is_srgb(format),
+                      .samples = tu_msaa_samples(samples),
+                      .samples_average = !vk_format_is_int(format),
+                      .unk20 = 1,
+                      .unk22 = 1),
+                   /* note: src size does not matter when not scaling */
+                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
+                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
+                   A6XX_SP_PS_2D_SRC_HI(),
+                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
+
+   /* sync GMEM writes with CACHE. */
+   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
+
+   /* Wait for CACHE_INVALIDATE to land */
+   tu_cs_emit_wfi(cs);
+
+   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
+   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
+
+   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
+    * sysmem, and we generally assume that GMEM renderpasses leave their
+    * results in sysmem, so we need to flush manually here.
+    */
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+}
 
 void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
@@ -2488,13 +2359,12 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         uint32_t a,
                         uint32_t gmem_a)
 {
-   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
-   const VkRect2D *render_area = &tiling->render_area;
+   const VkRect2D *render_area = &cmd->state.render_area;
    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
    struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
 
-   if (!dst->store)
+   if (!dst->store && !dst->store_stencil)
      return;
 
    uint32_t x1 = render_area->offset.x;
@@ -2515,7 +2385,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
 
    /* use fast path when render area is aligned, except for unsupported resolve cases */
    if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
-      tu_emit_blit(cmd, cs, iview, src, true);
+      if (dst->store)
+         tu_emit_blit(cmd, cs, iview, src, true, false);
+      if (dst->store_stencil)
+         tu_emit_blit(cmd, cs, iview, src, true, true);
       return;
    }
 
@@ -2527,37 +2400,18 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
       return;
   }
 
-   r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
-   r2d_dst(cs, iview, 0);
    r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
 
-   tu_cs_emit_regs(cs,
-                   A6XX_SP_PS_2D_SRC_INFO(
-                      .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
-                      .tile_mode = TILE6_2,
-                      .srgb = vk_format_is_srgb(src->format),
-                      .samples = tu_msaa_samples(src->samples),
-                      .samples_average = !vk_format_is_int(src->format),
-                      .unk20 = 1,
-                      .unk22 = 1),
-                   /* note: src size does not matter when not scaling */
-                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
-                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
-                   A6XX_SP_PS_2D_SRC_HI(),
-                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
-
-   /* sync GMEM writes with CACHE. */
-   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
-
-   /* Wait for CACHE_INVALIDATE to land */
-   tu_cs_emit_wfi(cs);
-
-   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
-   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
+   VkFormat format = src->format;
+   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+      format = VK_FORMAT_D32_SFLOAT;
 
-   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
-    * sysmem, and we generally assume that GMEM renderpasses leave their
-    * results in sysmem, so we need to flush manually here.
-    */
-   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
+   if (dst->store) {
+      store_cp_blit(cmd, cs, iview, src->samples, false, format,
+                    src->gmem_offset, src->cpp);
+   }
+   if (dst->store_stencil) {
+      store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
+                    src->gmem_offset_stencil, src->samples);
+   }
 }
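The fast path above is gated on an unaligned flag computed from the render area; the computation sits outside these hunks. The idea is that the event-blit resolve operates on whole aligned blocks, so an area that is not aligned to the resolve granularity must fall back to the CP_BLIT path. A minimal sketch of such a gate, with the granularity as an assumed parameter and without the real code's special-casing of edges that touch the image size:

   #include <stdbool.h>
   #include <stdint.h>

   struct rect { uint32_t x, y, w, h; };

   /* illustrative only; the real check in this file is more permissive */
   static bool
   render_area_aligned(const struct rect *ra, uint32_t align_w, uint32_t align_h)
   {
      return ra->x % align_w == 0 && ra->y % align_h == 0 &&
             (ra->x + ra->w) % align_w == 0 &&
             (ra->y + ra->h) % align_h == 0;
   }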