freedreno: gmem bypass
authorRob Clark <robclark@freedesktop.org>
Mon, 27 May 2013 00:36:35 +0000 (20:36 -0400)
committerRob Clark <robclark@freedesktop.org>
Sat, 8 Jun 2013 17:15:51 +0000 (13:15 -0400)
The GPU (at least a3xx, but I think also a2xx) can render directly to
memory, bypassing tiling, although it can't do this if blend, depth,
and a few other features of the pipeline are enabled.  This direct
memory mode can be faster for some sorts of operations, such as simple
blits.  In particular, this significantly speeds up XA by avoiding the
need to pull the entire dest pixmap into GMEM, render tiles, and write
it all back out again.  This should also speed up resource copy-region
and blit.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
src/gallium/drivers/freedreno/a3xx/fd3_zsa.c
src/gallium/drivers/freedreno/freedreno_context.c
src/gallium/drivers/freedreno/freedreno_context.h
src/gallium/drivers/freedreno/freedreno_draw.c
src/gallium/drivers/freedreno/freedreno_gmem.c
src/gallium/drivers/freedreno/freedreno_state.h

index 16ec95972a016b6a9ed4a818ac13db1cae476267..1cb170af261576730aa8f75b5123d4333fa9a1da 100644 (file)
@@ -47,8 +47,15 @@ static void
 emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
                struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
 {
+       enum a3xx_tile_mode tile_mode;
        unsigned i;
 
+       if (bin_w) {
+               tile_mode = TILE_32X32;
+       } else {
+               tile_mode = LINEAR;
+       }
+
        for (i = 0; i < 4; i++) {
                enum a3xx_color_fmt format = 0;
                enum a3xx_color_swap swap = WZYX;
@@ -58,23 +65,32 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
                if (i < nr_bufs) {
                        struct pipe_surface *psurf = bufs[i];
-                       struct fd_resource *res = fd_resource(psurf->texture);
 
+                       res = fd_resource(psurf->texture);
                        format = fd3_pipe2color(psurf->format);
                        swap = fd3_pipe2swap(psurf->format);
-                       stride = bin_w * res->cpp;
 
-                       if (bases) {
-                               base = bases[i] * res->cpp;
+                       if (bin_w) {
+                               stride = bin_w * res->cpp;
+
+                               if (bases) {
+                                       base = bases[i] * res->cpp;
+                               }
+                       } else {
+                               stride = res->pitch * res->cpp;
                        }
                }
 
                OUT_PKT0(ring, REG_A3XX_RB_MRT_BUF_INFO(i), 2);
                OUT_RING(ring, A3XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
+                               A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
                                A3XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
-                               A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE_32X32) |
                                A3XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
-               OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base));
+               if (bin_w || (i >= nr_bufs)) {
+                       OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base));
+               } else {
+                       OUT_RELOCS(ring, res->bo, 0, 0, -1);
+               }
 
                OUT_PKT0(ring, REG_A3XX_SP_FS_IMAGE_OUTPUT_REG(i), 1);
                OUT_RING(ring, A3XX_SP_FS_IMAGE_OUTPUT_REG_MRTFORMAT(format));
@@ -381,6 +397,42 @@ update_vsc_pipe(struct fd_context *ctx)
        }
 }
 
+/* state setup for rendering directly to system memory (GMEM bypass): */
+static void
+fd3_emit_sysmem_prep(struct fd_context *ctx)
+{
+       struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+       struct fd_resource *rsc = fd_resource(pfb->cbufs[0]->texture);
+       struct fd_ringbuffer *ring = ctx->ring;
+
+       fd3_emit_restore(ctx);
+
+       OUT_PKT0(ring, REG_A3XX_RB_WINDOW_SIZE, 1);
+       OUT_RING(ring, A3XX_RB_WINDOW_SIZE_WIDTH(pfb->width) |
+                       A3XX_RB_WINDOW_SIZE_HEIGHT(pfb->height));
+
+       emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);  /* bases = NULL, bin_w = 0 */
+
+       fd3_emit_rbrc_tile_state(ring,
+                       A3XX_RB_RENDER_CONTROL_BIN_WIDTH(rsc->pitch));
+
+       /* no tiling: zero window offset, scissor covers the whole framebuffer: */
+       OUT_PKT0(ring, REG_A3XX_PA_SC_WINDOW_OFFSET, 1);
+       OUT_RING(ring, A3XX_PA_SC_WINDOW_OFFSET_X(0) |
+                       A3XX_PA_SC_WINDOW_OFFSET_Y(0));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));
+
+       OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+                       A3XX_RB_MODE_CONTROL_GMEM_BYPASS |
+                       A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+}
+
 /* before first tile */
 static void
 fd3_emit_tile_init(struct fd_context *ctx)
@@ -478,6 +530,7 @@ fd3_gmem_init(struct pipe_context *pctx)
 {
        struct fd_context *ctx = fd_context(pctx);
 
+       ctx->emit_sysmem_prep = fd3_emit_sysmem_prep;
        ctx->emit_tile_init = fd3_emit_tile_init;
        ctx->emit_tile_prep = fd3_emit_tile_prep;
        ctx->emit_tile_mem2gmem = fd3_emit_tile_mem2gmem;
index 857ab8f106a47ce64a35e4771f1bff6b0c36cf32..7603465c8304486d6424478abc615cd377b64b34 100644 (file)
@@ -94,7 +94,5 @@ fd3_zsa_state_create(struct pipe_context *pctx,
                // TODO alpha_ref and alpha_test_enable??
        }
 
-       so->rb_render_control |= 0x2000;  /* ??? */
-
        return so;
 }
index 0f16568ffdd0b8c32de0f6ba4c3250c52903970c..44d525b25dda2684a70787271b389baa76783ce3 100644 (file)
@@ -83,6 +83,8 @@ fd_context_render(struct pipe_context *pctx)
 
        ctx->needs_flush = false;
        ctx->cleared = ctx->restore = ctx->resolve = 0;
+       ctx->gmem_reason = 0;
+       ctx->num_draws = 0;
 
        fd_resource(pfb->cbufs[0]->texture)->dirty = false;
        if (pfb->zsbuf)
index a6133c0d8c3a872b7cfd24395bb22f83a43e7ecb..54759314e26bb072e62055d5f0b4175ab2cbd922 100644 (file)
@@ -117,6 +117,23 @@ struct fd_context {
 
        bool needs_flush;
 
+       /* To decide whether to render to system memory, keep track of the
+        * number of draws, and whether any of them require multisample,
+        * depth_test (or depth write), stencil_test, blending, and
+        * color_logic_Op (since those functions are disabled when
+        * bypassing GMEM).
+        */
+       enum {
+               FD_GMEM_CLEARS_DEPTH_STENCIL = 0x01,
+               FD_GMEM_DEPTH_ENABLED        = 0x02,
+               FD_GMEM_STENCIL_ENABLED      = 0x04,
+
+               FD_GMEM_MSAA_ENABLED         = 0x08,
+               FD_GMEM_BLEND_ENABLED        = 0x10,
+               FD_GMEM_LOGICOP_ENABLED      = 0x20,
+       } gmem_reason;
+       unsigned num_draws;
+
        struct fd_ringbuffer *ring;
        struct fd_ringmarker *draw_start, *draw_end;
 
@@ -186,6 +203,9 @@ struct fd_context {
        void (*emit_tile_gmem2mem)(struct fd_context *ctx, uint32_t xoff, uint32_t yoff,
                        uint32_t bin_w, uint32_t bin_h);
 
+       /* optional, for GMEM bypass: */
+       void (*emit_sysmem_prep)(struct fd_context *ctx);
+
        /* draw: */
        void (*draw)(struct fd_context *pctx, const struct pipe_draw_info *info);
        void (*clear)(struct fd_context *ctx, unsigned buffers,
index 2b7c16847dc0254fddab0192bcb41506d9c83bc2..dbdf5732658fbeb4456c840a4043b67bfd91ff77 100644 (file)
@@ -114,7 +114,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 {
        struct fd_context *ctx = fd_context(pctx);
        struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
-       unsigned buffers;
+       unsigned i, buffers = 0;
 
        /* if we supported transform feedback, we'd have to disable this: */
        if (((ctx->scissor.maxx - ctx->scissor.minx) *
@@ -124,19 +124,40 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
        ctx->needs_flush = true;
 
-       fd_resource(pfb->cbufs[0]->texture)->dirty = true;
+       /*
+        * Figure out the buffers/features we need:
+        */
 
-       /* figure out the buffers we need: */
-       buffers = FD_BUFFER_COLOR;
        if (fd_depth_enabled(ctx)) {
                buffers |= FD_BUFFER_DEPTH;
                fd_resource(pfb->zsbuf->texture)->dirty = true;
+               ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED;
        }
+
        if (fd_stencil_enabled(ctx)) {
                buffers |= FD_BUFFER_STENCIL;
                fd_resource(pfb->zsbuf->texture)->dirty = true;
+               ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED;
        }
 
+       if (fd_logicop_enabled(ctx))
+               ctx->gmem_reason |= FD_GMEM_LOGICOP_ENABLED;
+
+       for (i = 0; i < pfb->nr_cbufs; i++) {
+               struct pipe_resource *surf = pfb->cbufs[i]->texture;
+
+               fd_resource(surf)->dirty = true;
+               buffers |= FD_BUFFER_COLOR;
+
+               if (surf->nr_samples > 1)
+                       ctx->gmem_reason |= FD_GMEM_MSAA_ENABLED;
+
+               if (fd_blend_enabled(ctx, i))
+                       ctx->gmem_reason |= FD_GMEM_BLEND_ENABLED;
+       }
+
+       ctx->num_draws++;
+
        /* any buffers that haven't been cleared, we need to restore: */
        ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
        /* and any buffers used, need to be resolved: */
@@ -165,8 +186,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
        if (buffers & PIPE_CLEAR_COLOR)
                fd_resource(pfb->cbufs[0]->texture)->dirty = true;
 
-       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))
+       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
                fd_resource(pfb->zsbuf->texture)->dirty = true;
+               ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL;
+       }
 
        DBG("%x depth=%f, stencil=%u (%s/%s)", buffers, depth, stencil,
                        util_format_name(pfb->cbufs[0]->format),
index 856e441337c27f0140599bf64f76e93d84f6ed0a..12633bd5f389c2b40bde9af474c823eece7309b4 100644 (file)
@@ -128,23 +128,11 @@ calculate_tiles(struct fd_context *ctx)
        gmem->height = height;
 }
 
-
-void
-fd_gmem_render_tiles(struct pipe_context *pctx)
+static void
+render_tiles(struct fd_context *ctx)
 {
-       struct fd_context *ctx = fd_context(pctx);
-       struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
        struct fd_gmem_stateobj *gmem = &ctx->gmem;
-       uint32_t i, timestamp, yoff = 0;
-
-       calculate_tiles(ctx);
-
-       DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
-                       util_format_name(pfb->cbufs[0]->format),
-                       pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none");
-
-       /* mark the end of the clear/draw cmds before emitting per-tile cmds: */
-       fd_ringmarker_mark(ctx->draw_end);
+       uint32_t i, yoff = 0;
 
        yoff= gmem->miny;
 
@@ -184,6 +172,50 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
 
                yoff += bh;
        }
+}
+
+static void
+render_sysmem(struct fd_context *ctx)
+{
+       ctx->emit_sysmem_prep(ctx);  /* generation-specific state setup */
+
+       /* emit a single IB to the recorded drawcmds (draw_start..draw_end): */
+       OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+}
+
+void
+fd_gmem_render_tiles(struct pipe_context *pctx)
+{
+       struct fd_context *ctx = fd_context(pctx);
+       struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+       uint32_t timestamp = 0;
+       bool sysmem = false;
+
+       if (ctx->emit_sysmem_prep) {
+               if (ctx->cleared || ctx->gmem_reason || (ctx->num_draws > 5)) {
+                       DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u",
+                               ctx->cleared, ctx->gmem_reason, ctx->num_draws);
+               } else {
+                       sysmem = true;
+               }
+       }
+
+       /* mark the end of the clear/draw cmds before emitting per-tile cmds: */
+       fd_ringmarker_mark(ctx->draw_end);
+
+       if (sysmem) {
+               DBG("rendering sysmem (%s/%s)",
+                       util_format_name(pfb->cbufs[0]->format),
+                       pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none");
+               render_sysmem(ctx);
+       } else {
+               struct fd_gmem_stateobj *gmem = &ctx->gmem;
+               DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
+                       util_format_name(pfb->cbufs[0]->format),
+                       pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none");
+               calculate_tiles(ctx);
+               render_tiles(ctx);
+       }
 
        /* GPU executes starting from tile cmds, which IB back to draw cmds: */
        fd_ringmarker_flush(ctx->draw_end);
index c966bdcc51d213a9553e96724e44ce45c3dac8b1..859299b3ad3ee3bc14338053f48c65d7bf847c04 100644 (file)
 
 static inline bool fd_depth_enabled(struct fd_context *ctx)
 {
-       return ctx->zsa->depth.enabled;
+       return ctx->zsa && ctx->zsa->depth.enabled;
 }
 
 static inline bool fd_stencil_enabled(struct fd_context *ctx)
 {
-       return ctx->zsa->stencil[0].enabled;
+       return ctx->zsa && ctx->zsa->stencil[0].enabled;
+}
+
+static inline bool fd_logicop_enabled(struct fd_context *ctx)
+{
+       return ctx->blend && ctx->blend->logicop_enable;  /* false if ctx->blend is NULL */
+}
+
+static inline bool fd_blend_enabled(struct fd_context *ctx, unsigned n)
+{
+       return ctx->blend && ctx->blend->rt[n].blend_enable;  /* false if ctx->blend is NULL; n is not range-checked here */
 }
 
 void fd_state_init(struct pipe_context *pctx);