freedreno: reorder format check
[mesa.git] / src / gallium / drivers / freedreno / a5xx / fd5_gmem.c
index b80a04fd2d27fc4d2749f82df4cde2ddac2aac2c..3380f8f63812cae96edf69d1b55f382c6d9545e4 100644 (file)
@@ -28,7 +28,7 @@
 #include "util/u_string.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
 
 #include "freedreno_draw.h"
 #include "freedreno_state.h"
@@ -49,23 +49,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
        enum a5xx_tile_mode tile_mode;
        unsigned i;
 
-       if (gmem) {
-               tile_mode = TILE5_2;
-       } else {
-               tile_mode = TILE5_LINEAR;
-       }
-
        for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) {
                enum a5xx_color_fmt format = 0;
                enum a3xx_color_swap swap = WZYX;
-               bool srgb = false;
+               bool srgb = false, sint = false, uint = false;
                struct fd_resource *rsc = NULL;
-               struct fd_resource_slice *slice = NULL;
+               struct fdl_slice *slice = NULL;
                uint32_t stride = 0;
                uint32_t size = 0;
                uint32_t base = 0;
                uint32_t offset = 0;
 
+               if (gmem) {
+                       tile_mode = TILE5_2;
+               } else {
+                       tile_mode = TILE5_LINEAR;
+               }
+
                if ((i < nr_bufs) && bufs[i]) {
                        struct pipe_surface *psurf = bufs[i];
                        enum pipe_format pformat = psurf->format;
@@ -76,6 +76,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
                        format = fd5_pipe2color(pformat);
                        swap = fd5_pipe2swap(pformat);
                        srgb = util_format_is_srgb(pformat);
+                       sint = util_format_is_pure_sint(pformat);
+                       uint = util_format_is_pure_uint(pformat);
 
                        debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
@@ -83,12 +85,14 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
                                        psurf->u.tex.first_layer);
 
                        if (gmem) {
-                               stride = gmem->bin_w * rsc->cpp;
+                               stride = gmem->bin_w * gmem->cbuf_cpp[i];
                                size = stride * gmem->bin_h;
                                base = gmem->cbuf_base[i];
                        } else {
-                               stride = slice->pitch * rsc->cpp;
+                               stride = slice->pitch * rsc->layout.cpp;
                                size = slice->size0;
+
+                               tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level);
                        }
                }
 
@@ -110,6 +114,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
                OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1);
                OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) |
+                               COND(sint, A5XX_SP_FS_MRT_REG_COLOR_SINT) |
+                               COND(uint, A5XX_SP_FS_MRT_REG_COLOR_UINT) |
                                COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB));
 
                /* when we support UBWC, these would be the system memory
@@ -130,7 +136,7 @@ emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf,
        if (zsbuf) {
                struct fd_resource *rsc = fd_resource(zsbuf->texture);
                enum a5xx_depth_format fmt = fd5_pipe2depth(zsbuf->format);
-               uint32_t cpp = rsc->cpp;
+               uint32_t cpp = rsc->layout.cpp;
                uint32_t stride = 0;
                uint32_t size = 0;
 
@@ -138,8 +144,8 @@ emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf,
                        stride = cpp * gmem->bin_w;
                        size = stride * gmem->bin_h;
                } else {
-                       struct fd_resource_slice *slice = fd_resource_slice(rsc, 0);
-                       stride = slice->pitch * rsc->cpp;
+                       struct fdl_slice *slice = fd_resource_slice(rsc, 0);
+                       stride = slice->pitch * rsc->layout.cpp;
                        size = slice->size0;
                }
 
@@ -162,13 +168,31 @@ emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf,
                OUT_RING(ring, 0x00000000);    /* RB_DEPTH_FLAG_BUFFER_BASE_HI */
                OUT_RING(ring, 0x00000000);    /* RB_DEPTH_FLAG_BUFFER_PITCH */
 
+               if (rsc->lrz) {
+                       OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3);
+                       OUT_RELOCW(ring, rsc->lrz, 0x1000, 0, 0);
+                       OUT_RING(ring, A5XX_GRAS_LRZ_BUFFER_PITCH(rsc->lrz_pitch));
+
+                       OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2);
+                       OUT_RELOCW(ring, rsc->lrz, 0, 0, 0);
+               } else {
+                       OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3);
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x00000000);     /* GRAS_LRZ_BUFFER_PITCH */
+
+                       OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2);
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x00000000);
+               }
+
                if (rsc->stencil) {
                        if (gmem) {
                                stride = 1 * gmem->bin_w;
                                size = stride * gmem->bin_h;
                        } else {
-                               struct fd_resource_slice *slice = fd_resource_slice(rsc->stencil, 0);
-                               stride = slice->pitch * rsc->cpp;
+                               struct fdl_slice *slice = fd_resource_slice(rsc->stencil, 0);
+                               stride = slice->pitch * rsc->layout.cpp;
                                size = slice->size0;
                        }
 
@@ -207,6 +231,21 @@ emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf,
        }
 }
 
+static bool
+use_hw_binning(struct fd_batch *batch)
+{
+       struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+
+       if ((gmem->maxpw * gmem->maxph) > 32)
+               return false;
+
+       if ((gmem->maxpw > 15) || (gmem->maxph > 15))
+               return false;
+
+       return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2) &&
+                       (batch->num_draws > 0);
+}
+
 static void
 patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode)
 {
@@ -215,19 +254,133 @@ patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode)
                struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i);
                *patch->cs = patch->val | DRAW4(0, 0, 0, vismode);
        }
-       util_dynarray_resize(&batch->draw_patches, 0);
+       util_dynarray_clear(&batch->draw_patches);
+}
+
+static void
+update_vsc_pipe(struct fd_batch *batch)
+{
+       struct fd_context *ctx = batch->ctx;
+       struct fd5_context *fd5_ctx = fd5_context(ctx);
+       struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+       struct fd_ringbuffer *ring = batch->gmem;
+       int i;
+
+       OUT_PKT4(ring, REG_A5XX_VSC_BIN_SIZE, 3);
+       OUT_RING(ring, A5XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
+                       A5XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
+       OUT_RELOCW(ring, fd5_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */
+
+       OUT_PKT4(ring, REG_A5XX_UNKNOWN_0BC5, 2);
+       OUT_RING(ring, 0x00000000);   /* UNKNOWN_0BC5 */
+       OUT_RING(ring, 0x00000000);   /* UNKNOWN_0BC6 */
+
+       OUT_PKT4(ring, REG_A5XX_VSC_PIPE_CONFIG_REG(0), 16);
+       for (i = 0; i < 16; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+               OUT_RING(ring, A5XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
+                               A5XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
+                               A5XX_VSC_PIPE_CONFIG_REG_W(pipe->w) |
+                               A5XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
+       }
+
+       OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32);
+       for (i = 0; i < 16; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+               if (!pipe->bo) {
+                       pipe->bo = fd_bo_new(ctx->dev, 0x20000,
+                                       DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i);
+               }
+               OUT_RELOCW(ring, pipe->bo, 0, 0, 0);     /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */
+       }
+
+       OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(0), 16);
+       for (i = 0; i < 16; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+               OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */
+       }
+}
+
+static void
+emit_binning_pass(struct fd_batch *batch)
+{
+       struct fd_context *ctx = batch->ctx;
+       struct fd_ringbuffer *ring = batch->gmem;
+       struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+
+       uint32_t x1 = gmem->minx;
+       uint32_t y1 = gmem->miny;
+       uint32_t x2 = gmem->minx + gmem->width - 1;
+       uint32_t y2 = gmem->miny + gmem->height - 1;
+
+       fd5_set_render_mode(batch->ctx, ring, BINNING);
+
+       OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1);
+       OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) |
+                       A5XX_RB_CNTL_HEIGHT(gmem->bin_h));
+
+       OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
+       OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) |
+                       A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1));
+       OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) |
+                       A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2));
+
+       OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2);
+       OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) |
+                       A5XX_RB_RESOLVE_CNTL_1_Y(y1));
+       OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) |
+                       A5XX_RB_RESOLVE_CNTL_2_Y(y2));
+
+       update_vsc_pipe(batch);
+
+       OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1);
+       OUT_RING(ring, A5XX_VPC_MODE_CNTL_BINNING_PASS);
+
+       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, UNK_2C);
+
+       OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1);
+       OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) |
+                       A5XX_RB_WINDOW_OFFSET_Y(0));
+
+       /* emit IB to binning drawcmds: */
+       fd5_emit_ib(ring, batch->binning);
+
+       fd_reset_wfi(batch);
+
+       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, UNK_2D);
+
+       OUT_PKT7(ring, CP_EVENT_WRITE, 4);
+       OUT_RING(ring, CACHE_FLUSH_TS);
+       OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0, 0, 0);  /* ADDR_LO/HI */
+       OUT_RING(ring, 0x00000000);
+
+       // TODO CP_COND_WRITE's for all the vsc buffers (check for overflow??)
+
+       fd_wfi(batch, ring);
+
+       OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1);
+       OUT_RING(ring, 0x0);
 }
 
 /* before first tile */
 static void
 fd5_emit_tile_init(struct fd_batch *batch)
 {
+       struct fd_context *ctx = batch->ctx;
        struct fd_ringbuffer *ring = batch->gmem;
+       struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 
        fd5_emit_restore(batch, ring);
 
-       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, UNK_26);
+       if (batch->lrz_clear)
+               fd5_emit_ib(ring, batch->lrz_clear);
+
+       fd5_emit_lrz_flush(ring);
+
+       OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1);
+       OUT_RING(ring, 0x00000080);   /* GRAS_CL_CNTL */
 
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
@@ -243,9 +396,16 @@ fd5_emit_tile_init(struct fd_batch *batch)
        OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1);
        OUT_RING(ring, 0x7c13c080);   /* RB_CCU_CNTL */
 
-/*
-opcode: CP_PREEMPT_ENABLE_LOCAL (6a) (2 dwords)
- */
+       emit_zs(ring, pfb->zsbuf, &ctx->gmem);
+       emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, &ctx->gmem);
+
+       if (use_hw_binning(batch)) {
+               emit_binning_pass(batch);
+               fd5_emit_lrz_flush(ring);
+               patch_draws(batch, USE_VISIBILITY);
+       } else {
+               patch_draws(batch, IGNORE_VISIBILITY);
+       }
 
        fd5_set_render_mode(batch->ctx, ring, GMEM);
 }
@@ -254,6 +414,8 @@ opcode: CP_PREEMPT_ENABLE_LOCAL (6a) (2 dwords)
 static void
 fd5_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
 {
+       struct fd_context *ctx = batch->ctx;
+       struct fd5_context *fd5_ctx = fd5_context(ctx);
        struct fd_ringbuffer *ring = batch->gmem;
 
        uint32_t x1 = tile->xoff;
@@ -273,6 +435,25 @@ fd5_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
        OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) |
                        A5XX_RB_RESOLVE_CNTL_2_Y(y2));
 
+       if (use_hw_binning(batch)) {
+               struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
+
+               OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
+
+               OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
+               OUT_RING(ring, 0x0);
+
+               OUT_PKT7(ring, CP_SET_BIN_DATA5, 5);
+               OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
+                               CP_SET_BIN_DATA5_0_VSC_N(tile->n));
+               OUT_RELOC(ring, pipe->bo, 0, 0, 0);      /* VSC_PIPE[p].DATA_ADDRESS */
+               OUT_RELOC(ring, fd5_ctx->vsc_size_mem,   /* VSC_SIZE_ADDRESS + (p * 4) */
+                               (tile->p * 4), 0, 0);
+       } else {
+               OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
+               OUT_RING(ring, 0x1);
+       }
+
        OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1);
        OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(x1) |
                        A5XX_RB_WINDOW_OFFSET_Y(y1));
@@ -294,7 +475,31 @@ emit_mem2gmem_surf(struct fd_batch *batch, uint32_t base,
 
        debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
-       stride = gmem->bin_w * rsc->cpp;
+       if (buf == BLIT_S)
+               rsc = rsc->stencil;
+
+       if ((buf == BLIT_ZS) || (buf == BLIT_S)) {
+               // XXX hack import via BLIT_MRT0 instead of BLIT_ZS, since I don't
+               // know otherwise how to go from linear in sysmem to tiled in gmem.
+               // possibly we want to flip this around gmem2mem and keep depth
+               // tiled in sysmem (and fixup sampler state to assume tiled).. this
+               // might be required for doing depth/stencil in bypass mode?
+               struct fdl_slice *slice = fd_resource_slice(rsc, 0);
+               enum a5xx_color_fmt format =
+                       fd5_pipe2color(fd_gmem_restore_format(rsc->base.format));
+
+               OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5);
+               OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
+                               A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(rsc->layout.tile_mode) |
+                               A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX));
+               OUT_RING(ring, A5XX_RB_MRT_PITCH(slice->pitch * rsc->layout.cpp));
+               OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(slice->size0));
+               OUT_RELOC(ring, rsc->bo, 0, 0, 0);  /* BASE_LO/HI */
+
+               buf = BLIT_MRT0;
+       }
+
+       stride = gmem->bin_w * rsc->layout.cpp;
        size = stride * gmem->bin_h;
 
        OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4);
@@ -350,27 +555,11 @@ fd5_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile)
 
        if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
                struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
-               // XXX BLIT_ZS vs BLIT_Z32 .. need some more cmdstream traces
-               // with z32_x24s8..
 
-               // XXX hack import via BLIT_MRT0 instead of BLIT_ZS, since I don't
-               // know otherwise how to go from linear in sysmem to tiled in gmem.
-               // possibly we want to flip this around gmem2mem and keep depth
-               // tiled in sysmem (and fixup sampler state to assume tiled).. this
-               // might be required for doing depth/stencil in bypass mode?
-               struct fd_resource_slice *slice = fd_resource_slice(rsc, 0);
-               enum a5xx_color_fmt format =
-                       fd5_pipe2color(fd_gmem_restore_format(pfb->zsbuf->format));
-
-               OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5);
-               OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
-                               A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE5_LINEAR) |
-                               A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX));
-               OUT_RING(ring, A5XX_RB_MRT_PITCH(slice->pitch * rsc->cpp));
-               OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(slice->size0));
-               OUT_RELOCW(ring, rsc->bo, 0, 0, 0);  /* BASE_LO/HI */
-
-               emit_mem2gmem_surf(batch, ctx->gmem.zsbuf_base[0], pfb->zsbuf, BLIT_MRT0);
+               if (!rsc->stencil || fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH))
+                       emit_mem2gmem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS);
+               if (rsc->stencil && fd_gmem_needs_restore(batch, tile, FD_BUFFER_STENCIL))
+                       emit_mem2gmem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S);
        }
 }
 
@@ -383,33 +572,30 @@ fd5_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
        struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
        struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 
-       OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
-       OUT_RING(ring, 0x1);
-
        OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1);
        OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) |
                        A5XX_RB_CNTL_HEIGHT(gmem->bin_h));
 
-       patch_draws(batch, IGNORE_VISIBILITY);
-
        emit_zs(ring, pfb->zsbuf, gmem);
        emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem);
 
-       // TODO MSAA
+       enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples);
+
        OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2);
-       OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-       OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-                       A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE);
+       OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(samples));
+       OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(samples) |
+                       COND(samples == MSAA_ONE, A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE));
 
        OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2);
-       OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-       OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-                       A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE);
+       OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(samples));
+       OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) |
+                       COND(samples == MSAA_ONE, A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE));
+
 
        OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2);
-       OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-       OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-                       A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE);
+       OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(samples));
+       OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(samples) |
+                       COND(samples == MSAA_ONE, A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE));
 }
 
 
@@ -423,9 +609,16 @@ emit_gmem2mem_surf(struct fd_batch *batch, uint32_t base,
 {
        struct fd_ringbuffer *ring = batch->gmem;
        struct fd_resource *rsc = fd_resource(psurf->texture);
-       struct fd_resource_slice *slice;
+       struct fdl_slice *slice;
+       bool tiled;
        uint32_t offset;
 
+       if (!rsc->valid)
+               return;
+
+       if (buf == BLIT_S)
+               rsc = rsc->stencil;
+
        slice = fd_resource_slice(rsc, psurf->u.tex.level);
        offset = fd_resource_offset(rsc, psurf->u.tex.level,
                        psurf->u.tex.first_layer);
@@ -438,15 +631,23 @@ emit_gmem2mem_surf(struct fd_batch *batch, uint32_t base,
        OUT_RING(ring, 0x00000000);   /* RB_BLIT_FLAG_DST_PITCH */
        OUT_RING(ring, 0x00000000);   /* RB_BLIT_FLAG_DST_ARRAY_PITCH */
 
+       tiled = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level);
+
        OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5);
-       OUT_RING(ring, 0x00000004);   /* XXX RB_RESOLVE_CNTL_3 */
+       OUT_RING(ring, 0x00000004 |   /* XXX RB_RESOLVE_CNTL_3 */
+                       COND(tiled, A5XX_RB_RESOLVE_CNTL_3_TILED));
        OUT_RELOCW(ring, rsc->bo, offset, 0, 0);     /* RB_BLIT_DST_LO/HI */
-       OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(slice->pitch * rsc->cpp));
+       OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(slice->pitch * rsc->layout.cpp));
        OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(slice->size0));
 
        OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1);
        OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf));
 
+//     bool msaa_resolve = pfb->samples > 1;
+       bool msaa_resolve = false;
+       OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1);
+       OUT_RING(ring, COND(msaa_resolve, A5XX_RB_CLEAR_CNTL_MSAA_RESOLVE));
+
        fd5_emit_blit(batch->ctx, ring);
 }
 
@@ -459,12 +660,11 @@ fd5_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
 
        if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
                struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
-               // XXX BLIT_ZS vs BLIT_Z32 .. need some more cmdstream traces
-               // with z32_x24s8..
+
                if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH))
                        emit_gmem2mem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS);
                if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL))
-                       emit_gmem2mem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_ZS);
+                       emit_gmem2mem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S);
        }
 
        if (batch->resolve & FD_BUFFER_COLOR) {
@@ -488,8 +688,7 @@ fd5_emit_tile_fini(struct fd_batch *batch)
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
 
-       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, UNK_26);
+       fd5_emit_lrz_flush(ring);
 
        fd5_cache_flush(batch, ring);
        fd5_set_render_mode(batch->ctx, ring, BYPASS);
@@ -503,14 +702,13 @@ fd5_emit_sysmem_prep(struct fd_batch *batch)
 
        fd5_emit_restore(batch, ring);
 
-       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, UNK_26);
+       fd5_emit_lrz_flush(ring);
 
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
 
        OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, UNK_19);
+       OUT_RING(ring, PC_CCU_INVALIDATE_COLOR);
 
        OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1);
        OUT_RING(ring, 0x00000003);   /* PC_POWER_CNTL */
@@ -552,7 +750,6 @@ fd5_emit_sysmem_prep(struct fd_batch *batch)
        emit_zs(ring, pfb->zsbuf, NULL);
        emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL);
 
-       // TODO MSAA
        OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2);
        OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
        OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
@@ -578,8 +775,7 @@ fd5_emit_sysmem_fini(struct fd_batch *batch)
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
 
-       OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-       OUT_RING(ring, UNK_26);
+       fd5_emit_lrz_flush(ring);
 
        OUT_PKT7(ring, CP_EVENT_WRITE, 4);
        OUT_RING(ring, UNK_1D);