freedreno/a6xx: Program SP_2D_SRC_FORMAT outside blit loop
[mesa.git] / src / gallium / drivers / freedreno / a6xx / fd6_blitter.c
index 757e7117f7552865ea1e96755353592e21bc8d57..93e460acce2088305c1ab5fb64c4b5ed8fa2bf85 100644 (file)
@@ -30,6 +30,7 @@
 
 #include "freedreno_blitter.h"
 #include "freedreno_fence.h"
+#include "freedreno_log.h"
 #include "freedreno_resource.h"
 
 #include "fd6_blitter.h"
 #include "fd6_resource.h"
 #include "fd6_pack.h"
 
+static inline enum a6xx_2d_ifmt /*
+ * fd6_ifmt() - classify an a6xx color format into an a6xx_2d_ifmt (R2D_xxx)
+ * value.
+ *
+ * fmt: hardware color format to classify.
+ *
+ * Returns the R2D_xxx constant of the case group that lists fmt.  A format
+ * that is not listed in any group reaches unreachable("bad format").
+ *
+ * NOTE(review): "ifmt" presumably names the format class the 2D engine works
+ * in internally -- confirm against the register documentation.
+ */
+fd6_ifmt(enum a6xx_format fmt)
+{
+       switch (fmt) {
+       case FMT6_A8_UNORM:
+       case FMT6_8_UNORM:
+       case FMT6_8_SNORM:
+       case FMT6_8_8_UNORM:
+       case FMT6_8_8_SNORM:
+       case FMT6_8_8_8_8_UNORM:
+       case FMT6_8_8_8_X8_UNORM:
+       case FMT6_8_8_8_8_SNORM:
+       case FMT6_4_4_4_4_UNORM:
+       case FMT6_5_5_5_1_UNORM:
+       case FMT6_5_6_5_UNORM:
+               return R2D_UNORM8; /* UNORM/SNORM formats with at most 8 bits per channel */
+
+       case FMT6_32_UINT:
+       case FMT6_32_SINT:
+       case FMT6_32_32_UINT:
+       case FMT6_32_32_SINT:
+       case FMT6_32_32_32_32_UINT:
+       case FMT6_32_32_32_32_SINT:
+               return R2D_INT32; /* 32-bit-per-channel UINT/SINT */
+
+       case FMT6_16_UINT:
+       case FMT6_16_SINT:
+       case FMT6_16_16_UINT:
+       case FMT6_16_16_SINT:
+       case FMT6_16_16_16_16_UINT:
+       case FMT6_16_16_16_16_SINT:
+       case FMT6_10_10_10_2_UINT:
+               return R2D_INT16; /* 16-bit-per-channel UINT/SINT, plus 10_10_10_2_UINT */
+
+       case FMT6_8_UINT:
+       case FMT6_8_SINT:
+       case FMT6_8_8_UINT:
+       case FMT6_8_8_SINT:
+       case FMT6_8_8_8_8_UINT:
+       case FMT6_8_8_8_8_SINT:
+       case FMT6_Z24_UNORM_S8_UINT:
+       case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
+               return R2D_INT8; /* 8-bit-per-channel UINT/SINT, plus both Z24_UNORM_S8_UINT variants */
+
+       case FMT6_16_UNORM:
+       case FMT6_16_SNORM:
+       case FMT6_16_16_UNORM:
+       case FMT6_16_16_SNORM:
+       case FMT6_16_16_16_16_UNORM:
+       case FMT6_16_16_16_16_SNORM:
+       case FMT6_32_FLOAT:
+       case FMT6_32_32_FLOAT:
+       case FMT6_32_32_32_32_FLOAT:
+               return R2D_FLOAT32; /* 32-bit float, and also 16-bit UNORM/SNORM */
+
+       case FMT6_16_FLOAT:
+       case FMT6_16_16_FLOAT:
+       case FMT6_16_16_16_16_FLOAT:
+       case FMT6_11_11_10_FLOAT:
+       case FMT6_10_10_10_2_UNORM_DEST:
+               return R2D_FLOAT16; /* 16-bit float, 11_11_10_FLOAT and 10_10_10_2_UNORM_DEST */
+
+       default:
+               unreachable("bad format");
+               return 0; /* follows unreachable(); presumably only so every path has a return value */
+       }
+}
+
 /* Make sure none of the requested dimensions extend beyond the size of the
  * resource.  Not entirely sure why this happens, but sometimes it does, and
  * w/ 2d blt doesn't have wrap modes like a sampler, so force those cases
@@ -60,6 +129,9 @@ ok_format(enum pipe_format pfmt)
 {
        enum a6xx_format fmt = fd6_pipe2color(pfmt);
 
+       if (util_format_is_compressed(pfmt))
+               return true;
+
        switch (pfmt) {
        case PIPE_FORMAT_Z24_UNORM_S8_UINT:
        case PIPE_FORMAT_Z24X8_UNORM:
@@ -73,10 +145,10 @@ ok_format(enum pipe_format pfmt)
                break;
        }
 
-       if (fmt == ~0)
+       if (fmt == FMT6_NONE)
                return false;
 
-       if (fd6_ifmt(fmt) == 0)
+       if (fmt == FMT6_10_10_10_2_UNORM_DEST)
                return false;
 
        return true;
@@ -156,6 +228,11 @@ emit_setup(struct fd_batch *batch)
        fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);
        fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false);
        fd6_event_write(batch, ring, PC_CCU_INVALIDATE_DEPTH, false);
+
+       /* normal BLIT_OP_SCALE operation needs bypass RB_CCU_CNTL */
+       OUT_WFI5(ring);
+       OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1);
+       OUT_RING(ring, fd6_context(batch->ctx)->magic.RB_CCU_CNTL_bypass);
 }
 
 static uint32_t
@@ -278,7 +355,7 @@ emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring,
                OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_8_UNORM) |
                                 A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) |
                                 A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX));
-               OUT_RELOCW(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
+               OUT_RELOC(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
                OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(p));
                OUT_RING(ring, 0x00000000);
                OUT_RING(ring, 0x00000000);
@@ -322,18 +399,103 @@ emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring,
        }
 }
 
+static void /*
+ * emit_blit_dst() - write the 2D-blit destination registers for one layer.
+ *
+ * ring:  ringbuffer the packets are written to.
+ * info:  blit description; only info->dst (resource, format, level) is read.
+ * layer: layer of the destination resource, used for the surface offset and
+ *        for the UBWC flag reference.
+ *
+ * Always emits a REG_A6XX_RB_2D_DST_INFO packet (count 9).  When UBWC is
+ * enabled for the destination level it also emits a
+ * REG_A6XX_RB_2D_DST_FLAGS_LO packet (count 6).
+ */
+emit_blit_dst(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, unsigned layer)
+{
+       struct fd_resource *dst = fd_resource(info->dst.resource);
+       enum a6xx_format fmt = fd6_pipe2color(info->dst.format);
+       enum a6xx_tile_mode tile = fd_resource_tile_mode(info->dst.resource, info->dst.level);
+       enum a3xx_color_swap swap = fd6_resource_swap(dst, info->dst.format);
+       uint32_t pitch = fd_resource_pitch(dst, info->dst.level);
+       bool ubwc_enabled = fd_resource_ubwc_enabled(dst, info->dst.level);
+       unsigned off = fd_resource_offset(dst, info->dst.level, layer);
+
+       if (fmt == FMT6_Z24_UNORM_S8_UINT)
+               fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; /* Z24S8 destinations are programmed with the AS_R8G8B8A8 variant */
+
+       OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9);
+       OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(fmt) |
+                       A6XX_RB_2D_DST_INFO_TILE_MODE(tile) |
+                       A6XX_RB_2D_DST_INFO_COLOR_SWAP(swap) |
+                       COND(util_format_is_srgb(info->dst.format), A6XX_RB_2D_DST_INFO_SRGB) |
+                       COND(ubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS)); /* FLAGS bit only when UBWC is enabled for this level */
+       OUT_RELOC(ring, dst->bo, off, 0, 0);    /* RB_2D_DST_LO/HI */
+       OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(pitch));
+       OUT_RING(ring, 0x00000000); /* the five trailing dwords of the packet are written as zero; their meaning is not visible here */
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+
+       if (ubwc_enabled) {
+               OUT_PKT4(ring, REG_A6XX_RB_2D_DST_FLAGS_LO, 6);
+               fd6_emit_flag_reference(ring, dst, info->dst.level, layer); /* presumably supplies the first three of the six dwords -- TODO confirm */
+               OUT_RING(ring, 0x00000000); /* rest of the flags packet is zero-filled */
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x00000000);
+       }
+}
+
+static void /*
+ * emit_blit_src() - write the 2D-blit source registers for one layer.
+ *
+ * ring:       ringbuffer the packets are written to.
+ * info:       blit description; info->src (resource, format, level),
+ *             info->filter and info->mask are read.
+ * layer:      layer of the source resource, used for the surface offset and
+ *             for the UBWC flag reference.
+ * nr_samples: multiplier applied to the minified source width.
+ *
+ * Always emits a REG_A6XX_SP_PS_2D_SRC_INFO packet (count 10).  When UBWC is
+ * enabled for the source level it also emits a
+ * REG_A6XX_SP_PS_2D_SRC_FLAGS_LO packet (count 6).
+ */
+emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, unsigned layer, unsigned nr_samples)
+{
+       struct fd_resource *src = fd_resource(info->src.resource);
+       enum a6xx_format sfmt = fd6_pipe2color(info->src.format);
+       enum a6xx_tile_mode     stile = fd_resource_tile_mode(info->src.resource, info->src.level);
+       enum a3xx_color_swap sswap = fd6_resource_swap(src, info->src.format);
+       uint32_t pitch = fd_resource_pitch(src, info->src.level);
+       bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level);
+       unsigned soff = fd_resource_offset(src, info->src.level, layer);
+       uint32_t width = u_minify(src->base.width0, info->src.level) * nr_samples; /* mip-level width scaled by the caller's nr_samples */
+       uint32_t height = u_minify(src->base.height0, info->src.level);
+       uint32_t filter = 0;
+
+       if (info->filter == PIPE_TEX_FILTER_LINEAR)
+               filter = A6XX_SP_PS_2D_SRC_INFO_FILTER; /* FILTER bit is set only for linear filtering */
+
+       enum a3xx_msaa_samples samples = fd_msaa_samples(src->base.nr_samples); /* taken from the source resource, not from the nr_samples argument */
+
+       if (sfmt == FMT6_10_10_10_2_UNORM_DEST)
+               sfmt = FMT6_10_10_10_2_UNORM; /* the _DEST variant is replaced by the plain format on the source side */
+
+       OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10);
+       OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) |
+                       A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(stile) |
+                       A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(sswap) |
+                       A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) |
+                       COND(samples > MSAA_ONE && (info->mask & PIPE_MASK_RGBA),
+                                       A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | /* averaging only for multisampled sources when the mask has any RGBA bit */
+                       COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) |
+                       COND(util_format_is_srgb(info->src.format), A6XX_SP_PS_2D_SRC_INFO_SRGB) |
+                       0x500000 | filter); /* 0x500000: unnamed bits, meaning not visible here -- TODO document */
+       OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) |
+                       A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height)); /* SP_PS_2D_SRC_SIZE */
+       OUT_RELOC(ring, src->bo, soff, 0, 0);    /* SP_PS_2D_SRC_LO/HI */
+       OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(pitch));
+
+       OUT_RING(ring, 0x00000000); /* the five trailing dwords of the packet are written as zero; their meaning is not visible here */
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+
+       if (subwc_enabled) {
+               OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 6);
+               fd6_emit_flag_reference(ring, src, info->src.level, layer); /* presumably supplies the first three of the six dwords -- TODO confirm */
+               OUT_RING(ring, 0x00000000); /* rest of the flags packet is zero-filled */
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x00000000);
+       }
+}
+
 static void
 emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                const struct pipe_blit_info *info, union pipe_color_union *color)
 {
        const struct pipe_box *sbox = &info->src.box;
        const struct pipe_box *dbox = &info->dst.box;
-       struct fd_resource *src, *dst;
-       struct fdl_slice *sslice, *dslice;
+       struct fd_resource *dst;
        enum a6xx_format sfmt, dfmt;
-       enum a6xx_tile_mode stile, dtile;
-       enum a3xx_color_swap sswap, dswap;
-       unsigned spitch, dpitch;
        int sx1, sy1, sx2, sy2;
        int dx1, dy1, dx2, dy2;
 
@@ -347,29 +509,13 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                fprintf(stderr, "\n");
        }
 
-       src = fd_resource(info->src.resource);
        dst = fd_resource(info->dst.resource);
 
-       sslice = fd_resource_slice(src, info->src.level);
-       dslice = fd_resource_slice(dst, info->dst.level);
-
        sfmt = fd6_pipe2color(info->src.format);
        dfmt = fd6_pipe2color(info->dst.format);
 
-       stile = fd_resource_tile_mode(info->src.resource, info->src.level);
-       dtile = fd_resource_tile_mode(info->dst.resource, info->dst.level);
-
-       /* Linear levels of a tiled resource are always WZYX, so look at
-        * rsc->tile_mode to determine the swap.
-        */
-       sswap = fd6_resource_swap(src, info->src.format);
-       dswap = fd6_resource_swap(dst, info->dst.format);
-
-       /* Use the underlying resource format so that we get the right block width
-        * for compressed textures.
-        */
-       spitch = util_format_get_nblocksx(src->base.format, sslice->pitch) * src->layout.cpp;
-       dpitch = util_format_get_nblocksx(dst->base.format, dslice->pitch) * dst->layout.cpp;
+       OUT_PKT7(ring, CP_SET_MARKER, 1);
+       OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE));
 
        uint32_t nr_samples = fd_resource_nr_samples(&dst->base);
        sx1 = sbox->x * nr_samples;
@@ -382,11 +528,15 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
        dx2 = (dbox->x + dbox->width) * nr_samples - 1;
        dy2 = dbox->y + dbox->height - 1;
 
-       uint32_t width = u_minify(src->base.width0, info->src.level) * nr_samples;
-       uint32_t height = u_minify(src->base.height0, info->src.level);
+       OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4);
+       OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X_X(sx1));
+       OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X_X(sx2));
+       OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y_Y(sy1));
+       OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y_Y(sy2));
 
-       OUT_PKT7(ring, CP_SET_MARKER, 1);
-       OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE));
+       OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2);
+       OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dx1) | A6XX_GRAS_2D_DST_TL_Y(dy1));
+       OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dx2) | A6XX_GRAS_2D_DST_BR_Y(dy2));
 
        uint32_t blit_cntl = blit_control(dfmt, util_format_is_srgb(info->dst.format));
 
@@ -403,19 +553,8 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                        color->ui[1] = (depth_unorm24 >> 8) & 0xff;
                        color->ui[2] = (depth_unorm24 >> 16) & 0xff;
                        color->ui[3] = stencil;
-
-                       dfmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
                        break;
                }
-               case PIPE_FORMAT_B5G6R5_UNORM:
-               case PIPE_FORMAT_B5G5R5A1_UNORM:
-               case PIPE_FORMAT_B5G5R5X1_UNORM:
-               case PIPE_FORMAT_B4G4R4A4_UNORM:
-                       color->ui[0] = float_to_ubyte(color->f[0]);
-                       color->ui[1] = float_to_ubyte(color->f[1]);
-                       color->ui[2] = float_to_ubyte(color->f[2]);
-                       color->ui[3] = float_to_ubyte(color->f[3]);
-                       break;
                default:
                        break;
                }
@@ -437,12 +576,10 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                        OUT_RING(ring, _mesa_float_to_half(color->f[3]));
                        sfmt = FMT6_16_16_16_16_FLOAT;
                        break;
-
                case R2D_FLOAT32:
                case R2D_INT32:
                case R2D_INT16:
                case R2D_INT8:
-               case R2D_RAW:
                default:
                        OUT_RING(ring, color->ui[0]);
                        OUT_RING(ring, color->ui[1]);
@@ -452,9 +589,6 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                }
        }
 
-       if (dtile != stile)
-               blit_cntl |= 0x20000000;
-
        if (info->scissor_enable) {
                OUT_PKT4(ring, REG_A6XX_GRAS_RESOLVE_CNTL_1, 2);
                OUT_RING(ring, A6XX_GRAS_RESOLVE_CNTL_1_X(info->scissor.minx) |
@@ -470,91 +604,38 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
        OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
        OUT_RING(ring, blit_cntl);
 
-       for (unsigned i = 0; i < info->dst.box.depth; i++) {
-               unsigned soff = fd_resource_offset(src, info->src.level, sbox->z + i);
-               unsigned doff = fd_resource_offset(dst, info->dst.level, dbox->z + i);
-               bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level);
-               bool dubwc_enabled = fd_resource_ubwc_enabled(dst, info->dst.level);
-
-               /*
-                * Emit source:
-                */
-               uint32_t filter = 0;
-               if (info->filter == PIPE_TEX_FILTER_LINEAR)
-                       filter = A6XX_SP_PS_2D_SRC_INFO_FILTER;
-
-               enum a3xx_msaa_samples samples = fd_msaa_samples(src->base.nr_samples);
+       if (dfmt == FMT6_10_10_10_2_UNORM_DEST)
+               sfmt = FMT6_16_16_16_16_FLOAT;
 
-               if (sfmt == FMT6_10_10_10_2_UNORM_DEST)
-                       sfmt = FMT6_10_10_10_2_UNORM;
+       /* This register is probably badly named... it seems that it's
+        * controlling the internal/accumulator format or something like
+        * that. It's certainly not tied to only the src format.
+        */
+       OUT_PKT4(ring, REG_A6XX_SP_2D_SRC_FORMAT, 1);
+       OUT_RING(ring, A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(sfmt) |
+                       COND(util_format_is_pure_sint(info->src.format),
+                                       A6XX_SP_2D_SRC_FORMAT_SINT) |
+                       COND(util_format_is_pure_uint(info->src.format),
+                                       A6XX_SP_2D_SRC_FORMAT_UINT) |
+                       COND(util_format_is_snorm(info->src.format),
+                                       A6XX_SP_2D_SRC_FORMAT_SINT |
+                                               A6XX_SP_2D_SRC_FORMAT_NORM) |
+                       COND(util_format_is_unorm(info->src.format),
+// TODO sometimes blob uses UINT+NORM but dEQP seems unhappy about that
+//                                             A6XX_SP_2D_SRC_FORMAT_UINT |
+                                       A6XX_SP_2D_SRC_FORMAT_NORM) |
+                       COND(util_format_is_srgb(info->dst.format), A6XX_SP_2D_SRC_FORMAT_SRGB) |
+                       A6XX_SP_2D_SRC_FORMAT_MASK(0xf));
 
-               OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10);
-               OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) |
-                               A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(stile) |
-                               A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(sswap) |
-                               A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) |
-                               COND(samples > MSAA_ONE && (info->mask & PIPE_MASK_RGBA),
-                                               A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) |
-                               COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) |
-                               COND(util_format_is_srgb(info->src.format), A6XX_SP_PS_2D_SRC_INFO_SRGB) |
-                               0x500000 | filter);
-               OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) |
-                                A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height)); /* SP_PS_2D_SRC_SIZE */
-               OUT_RELOC(ring, src->bo, soff, 0, 0);    /* SP_PS_2D_SRC_LO/HI */
-               OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(spitch));
+       for (unsigned i = 0; i < info->dst.box.depth; i++) {
 
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
+               emit_blit_src(ring, info, sbox->z + i, nr_samples);
 
-               if (subwc_enabled) {
-                       OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 6);
-                       fd6_emit_flag_reference(ring, src, info->src.level, sbox->z + i);
-                       OUT_RING(ring, 0x00000000);
-                       OUT_RING(ring, 0x00000000);
-                       OUT_RING(ring, 0x00000000);
-               }
-
-               /*
-                * Emit destination:
-                */
-               OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9);
-               OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) |
-                                A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) |
-                                A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap) |
-                                COND(util_format_is_srgb(info->dst.format), A6XX_RB_2D_DST_INFO_SRGB) |
-                                COND(dubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS));
-               OUT_RELOCW(ring, dst->bo, doff, 0, 0);    /* RB_2D_DST_LO/HI */
-               OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch));
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-               OUT_RING(ring, 0x00000000);
-
-               if (dubwc_enabled) {
-                       OUT_PKT4(ring, REG_A6XX_RB_2D_DST_FLAGS_LO, 6);
-                       fd6_emit_flag_reference(ring, dst, info->dst.level, dbox->z + i);
-                       OUT_RING(ring, 0x00000000);
-                       OUT_RING(ring, 0x00000000);
-                       OUT_RING(ring, 0x00000000);
-               }
+               emit_blit_dst(ring, info, dbox->z + i);
 
                /*
                 * Blit command:
                 */
-               OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4);
-               OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X_X(sx1));
-               OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X_X(sx2));
-               OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y_Y(sy1));
-               OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y_Y(sy2));
-
-               OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2);
-               OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dx1) | A6XX_GRAS_2D_DST_TL_Y(dy1));
-               OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dx2) | A6XX_GRAS_2D_DST_BR_Y(dy2));
-
                OUT_PKT7(ring, CP_EVENT_WRITE, 1);
                OUT_RING(ring, 0x3f);
                OUT_WFI5(ring);
@@ -562,29 +643,6 @@ emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring,
                OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8C01, 1);
                OUT_RING(ring, 0);
 
-               if (dfmt == FMT6_10_10_10_2_UNORM_DEST)
-                       sfmt = FMT6_16_16_16_16_FLOAT;
-
-               /* This register is probably badly named... it seems that it's
-                * controlling the internal/accumulator format or something like
-                * that. It's certainly not tied to only the src format.
-                */
-               OUT_PKT4(ring, REG_A6XX_SP_2D_SRC_FORMAT, 1);
-               OUT_RING(ring, A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(sfmt) |
-                               COND(util_format_is_pure_sint(info->src.format),
-                                               A6XX_SP_2D_SRC_FORMAT_SINT) |
-                               COND(util_format_is_pure_uint(info->src.format),
-                                               A6XX_SP_2D_SRC_FORMAT_UINT) |
-                               COND(util_format_is_snorm(info->src.format),
-                                               A6XX_SP_2D_SRC_FORMAT_SINT |
-                                               A6XX_SP_2D_SRC_FORMAT_NORM) |
-                               COND(util_format_is_unorm(info->src.format),
-// TODO sometimes blob uses UINT+NORM but dEQP seems unhappy about that
-//                                             A6XX_SP_2D_SRC_FORMAT_UINT |
-                                               A6XX_SP_2D_SRC_FORMAT_NORM) |
-                               COND(util_format_is_srgb(info->dst.format), A6XX_SP_2D_SRC_FORMAT_SRGB) |
-                               A6XX_SP_2D_SRC_FORMAT_MASK(0xf));
-
                OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1);
                OUT_RING(ring, fd6_context(ctx)->magic.RB_UNKNOWN_8E04_blit);
 
@@ -632,34 +690,44 @@ handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info)
        if (!can_do_blit(info))
                return false;
 
-       fd_fence_ref(&ctx->last_fence, NULL);
-
        batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true);
 
        fd6_emit_restore(batch, batch->draw);
        fd6_emit_lrz_flush(batch->draw);
 
-       mtx_lock(&ctx->screen->lock);
+       fd_screen_lock(ctx->screen);
+
+       fd_batch_resource_read(batch, fd_resource(info->src.resource));
+       fd_batch_resource_write(batch, fd_resource(info->dst.resource));
 
-       fd_batch_resource_used(batch, fd_resource(info->src.resource), false);
-       fd_batch_resource_used(batch, fd_resource(info->dst.resource), true);
+       fd_screen_unlock(ctx->screen);
 
-       mtx_unlock(&ctx->screen->lock);
+       /* Clearing last_fence must come after the batch dependency tracking
+        * (resource_read()/resource_write()), as that can trigger a flush,
+        * re-populating last_fence
+        */
+       fd_fence_ref(&ctx->last_fence, NULL);
 
        fd_batch_set_stage(batch, FD_STAGE_BLIT);
 
+       fd_log_stream(batch, stream, util_dump_blit_info(stream, info));
+
        emit_setup(batch);
 
        if ((info->src.resource->target == PIPE_BUFFER) &&
                        (info->dst.resource->target == PIPE_BUFFER)) {
                assert(fd_resource(info->src.resource)->layout.tile_mode == TILE6_LINEAR);
                assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE6_LINEAR);
+               fd_log(batch, "START BLIT (BUFFER)");
                emit_blit_buffer(ctx, batch->draw, info);
+               fd_log(batch, "END BLIT (BUFFER)");
        } else {
                /* I don't *think* we need to handle blits between buffer <-> !buffer */
                debug_assert(info->src.resource->target != PIPE_BUFFER);
                debug_assert(info->dst.resource->target != PIPE_BUFFER);
+               fd_log(batch, "START BLIT (TEXTURE)");
                emit_blit_or_clear_texture(ctx, batch->draw, info, NULL);
+               fd_log(batch, "END BLIT (TEXTURE)");
        }
 
        fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_COLOR_TS, true);
@@ -796,15 +864,26 @@ handle_compressed_blit(struct fd_context *ctx, const struct pipe_blit_info *info
        int bw = util_format_get_blockwidth(info->src.format);
        int bh = util_format_get_blockheight(info->src.format);
 
+       /* NOTE: x/y *must* be aligned to block boundary (ie. in
+        * glCompressedTexSubImage2D()) but width/height may not
+        * be:
+        */
+
+       debug_assert((blit.src.box.x % bw) == 0);
+       debug_assert((blit.src.box.y % bh) == 0);
+
        blit.src.box.x /= bw;
        blit.src.box.y /= bh;
-       blit.src.box.width /= bw;
-       blit.src.box.height /= bh;
+       blit.src.box.width  = DIV_ROUND_UP(blit.src.box.width, bw);
+       blit.src.box.height = DIV_ROUND_UP(blit.src.box.height, bh);
+
+       debug_assert((blit.dst.box.x % bw) == 0);
+       debug_assert((blit.dst.box.y % bh) == 0);
 
        blit.dst.box.x /= bw;
        blit.dst.box.y /= bh;
-       blit.dst.box.width /= bw;
-       blit.dst.box.height /= bh;
+       blit.dst.box.width  = DIV_ROUND_UP(blit.dst.box.width, bw);
+       blit.dst.box.height = DIV_ROUND_UP(blit.dst.box.height, bh);
 
        return do_rewritten_blit(ctx, &blit);
 }