From 4af1dcbb7d5431ae75cc39568c99d7a20231f081 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 26 May 2013 20:36:35 -0400 Subject: [PATCH] freedreno: gmem bypass The GPU (at least a3xx, but I think also a2xx) can render directly to memory, bypassing tiling, although it can't do this if blend, depth, or a few other features of the pipeline are enabled. This direct memory mode can be faster for some sorts of operations, such as simple blits. In particular, this significantly speeds up XA by avoiding the need to pull the entire dest pixmap into GMEM, render tiles, and write it all back out again. This should also speed up resource copy-region and blit. Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 65 +++++++++++++++++-- src/gallium/drivers/freedreno/a3xx/fd3_zsa.c | 2 - .../drivers/freedreno/freedreno_context.c | 2 + .../drivers/freedreno/freedreno_context.h | 20 ++++++ .../drivers/freedreno/freedreno_draw.c | 33 ++++++++-- .../drivers/freedreno/freedreno_gmem.c | 62 +++++++++++++----- .../drivers/freedreno/freedreno_state.h | 14 +++- 7 files changed, 168 insertions(+), 30 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 16ec95972a0..1cb170af261 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -47,8 +47,15 @@ static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w) { + enum a3xx_tile_mode tile_mode; unsigned i; + if (bin_w) { + tile_mode = TILE_32X32; + } else { + tile_mode = LINEAR; + } + for (i = 0; i < 4; i++) { enum a3xx_color_fmt format = 0; enum a3xx_color_swap swap = WZYX; @@ -58,23 +65,32 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, if (i < nr_bufs) { struct pipe_surface *psurf = bufs[i]; - struct fd_resource *res = fd_resource(psurf->texture); + res = fd_resource(psurf->texture); format = 
fd3_pipe2color(psurf->format); swap = fd3_pipe2swap(psurf->format); - stride = bin_w * res->cpp; - if (bases) { - base = bases[i] * res->cpp; + if (bin_w) { + stride = bin_w * res->cpp; + + if (bases) { + base = bases[i] * res->cpp; + } + } else { + stride = res->pitch * res->cpp; } } OUT_PKT0(ring, REG_A3XX_RB_MRT_BUF_INFO(i), 2); OUT_RING(ring, A3XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | + A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | A3XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | - A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE_32X32) | A3XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap)); - OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base)); + if (bin_w || (i >= nr_bufs)) { + OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base)); + } else { + OUT_RELOCS(ring, res->bo, 0, 0, -1); + } OUT_PKT0(ring, REG_A3XX_SP_FS_IMAGE_OUTPUT_REG(i), 1); OUT_RING(ring, A3XX_SP_FS_IMAGE_OUTPUT_REG_MRTFORMAT(format)); @@ -381,6 +397,42 @@ update_vsc_pipe(struct fd_context *ctx) } } +/* for rendering directly to system memory: */ +static void +fd3_emit_sysmem_prep(struct fd_context *ctx) +{ + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_resource *rsc = fd_resource(pfb->cbufs[0]->texture); + struct fd_ringbuffer *ring = ctx->ring; + + fd3_emit_restore(ctx); + + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_SIZE, 1); + OUT_RING(ring, A3XX_RB_WINDOW_SIZE_WIDTH(pfb->width) | + A3XX_RB_WINDOW_SIZE_HEIGHT(pfb->height)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); + + fd3_emit_rbrc_tile_state(ring, + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(rsc->pitch)); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_PA_SC_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_PA_SC_WINDOW_OFFSET_X(0) | + A3XX_PA_SC_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + 
A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_GMEM_BYPASS | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); +} + /* before first tile */ static void fd3_emit_tile_init(struct fd_context *ctx) @@ -478,6 +530,7 @@ fd3_gmem_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); + ctx->emit_sysmem_prep = fd3_emit_sysmem_prep; ctx->emit_tile_init = fd3_emit_tile_init; ctx->emit_tile_prep = fd3_emit_tile_prep; ctx->emit_tile_mem2gmem = fd3_emit_tile_mem2gmem; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c index 857ab8f106a..7603465c830 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c @@ -94,7 +94,5 @@ fd3_zsa_state_create(struct pipe_context *pctx, // TODO alpha_ref and alpha_test_enable?? } - so->rb_render_control |= 0x2000; /* ??? 
*/ - return so; } diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 0f16568ffdd..44d525b25dd 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -83,6 +83,8 @@ fd_context_render(struct pipe_context *pctx) ctx->needs_flush = false; ctx->cleared = ctx->restore = ctx->resolve = 0; + ctx->gmem_reason = 0; + ctx->num_draws = 0; fd_resource(pfb->cbufs[0]->texture)->dirty = false; if (pfb->zsbuf) diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index a6133c0d8c3..54759314e26 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -117,6 +117,23 @@ struct fd_context { bool needs_flush; + /* To decide whether to render to system memory, keep track of the + * number of draws, and whether any of them require multisample, + * depth_test (or depth write), stencil_test, blending, and + * color_logic_Op (since those functions are disabled when by- + * passing GMEM). 
+ */ + enum { + FD_GMEM_CLEARS_DEPTH_STENCIL = 0x01, + FD_GMEM_DEPTH_ENABLED = 0x02, + FD_GMEM_STENCIL_ENABLED = 0x04, + + FD_GMEM_MSAA_ENABLED = 0x08, + FD_GMEM_BLEND_ENABLED = 0x10, + FD_GMEM_LOGICOP_ENABLED = 0x20, + } gmem_reason; + unsigned num_draws; + struct fd_ringbuffer *ring; struct fd_ringmarker *draw_start, *draw_end; @@ -186,6 +203,9 @@ struct fd_context { void (*emit_tile_gmem2mem)(struct fd_context *ctx, uint32_t xoff, uint32_t yoff, uint32_t bin_w, uint32_t bin_h); + /* optional, for GMEM bypass: */ + void (*emit_sysmem_prep)(struct fd_context *ctx); + /* draw: */ void (*draw)(struct fd_context *pctx, const struct pipe_draw_info *info); void (*clear)(struct fd_context *ctx, unsigned buffers, diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 2b7c16847dc..dbdf5732658 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -114,7 +114,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) { struct fd_context *ctx = fd_context(pctx); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - unsigned buffers; + unsigned i, buffers = 0; /* if we supported transform feedback, we'd have to disable this: */ if (((ctx->scissor.maxx - ctx->scissor.minx) * @@ -124,19 +124,40 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) ctx->needs_flush = true; - fd_resource(pfb->cbufs[0]->texture)->dirty = true; + /* + * Figure out the buffers/features we need: + */ - /* figure out the buffers we need: */ - buffers = FD_BUFFER_COLOR; if (fd_depth_enabled(ctx)) { buffers |= FD_BUFFER_DEPTH; fd_resource(pfb->zsbuf->texture)->dirty = true; + ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED; } + if (fd_stencil_enabled(ctx)) { buffers |= FD_BUFFER_STENCIL; fd_resource(pfb->zsbuf->texture)->dirty = true; + ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED; } + if (fd_logicop_enabled(ctx)) + ctx->gmem_reason |= 
FD_GMEM_LOGICOP_ENABLED; + + for (i = 0; i < pfb->nr_cbufs; i++) { + struct pipe_resource *surf = pfb->cbufs[i]->texture; + + fd_resource(surf)->dirty = true; + buffers |= FD_BUFFER_COLOR; + + if (surf->nr_samples > 1) + ctx->gmem_reason |= FD_GMEM_MSAA_ENABLED; + + if (fd_blend_enabled(ctx, i)) + ctx->gmem_reason |= FD_GMEM_BLEND_ENABLED; + } + + ctx->num_draws++; + /* any buffers that haven't been cleared, we need to restore: */ ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared); /* and any buffers used, need to be resolved: */ @@ -165,8 +186,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, if (buffers & PIPE_CLEAR_COLOR) fd_resource(pfb->cbufs[0]->texture)->dirty = true; - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { fd_resource(pfb->zsbuf->texture)->dirty = true; + ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL; + } DBG("%x depth=%f, stencil=%u (%s/%s)", buffers, depth, stencil, util_format_name(pfb->cbufs[0]->format), diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 856e441337c..12633bd5f38 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -128,23 +128,11 @@ calculate_tiles(struct fd_context *ctx) gmem->height = height; } - -void -fd_gmem_render_tiles(struct pipe_context *pctx) +static void +render_tiles(struct fd_context *ctx) { - struct fd_context *ctx = fd_context(pctx); - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_gmem_stateobj *gmem = &ctx->gmem; - uint32_t i, timestamp, yoff = 0; - - calculate_tiles(ctx); - - DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y, - util_format_name(pfb->cbufs[0]->format), - pfb->zsbuf ? 
util_format_name(pfb->zsbuf->format) : "none"); - - /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ - fd_ringmarker_mark(ctx->draw_end); + uint32_t i, yoff = 0; yoff= gmem->miny; @@ -184,6 +172,50 @@ fd_gmem_render_tiles(struct pipe_context *pctx) yoff += bh; } +} + +static void +render_sysmem(struct fd_context *ctx) +{ + ctx->emit_sysmem_prep(ctx); + + /* emit IB to drawcmds: */ + OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); +} + +void +fd_gmem_render_tiles(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + uint32_t timestamp = 0; + bool sysmem = false; + + if (ctx->emit_sysmem_prep) { + if (ctx->cleared || ctx->gmem_reason || (ctx->num_draws > 5)) { + DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u", + ctx->cleared, ctx->gmem_reason, ctx->num_draws); + } else { + sysmem = true; + } + } + + /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ + fd_ringmarker_mark(ctx->draw_end); + + if (sysmem) { + DBG("rendering sysmem (%s/%s)", + util_format_name(pfb->cbufs[0]->format), + pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none"); + render_sysmem(ctx); + } else { + struct fd_gmem_stateobj *gmem = &ctx->gmem; + DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y, + util_format_name(pfb->cbufs[0]->format), + pfb->zsbuf ? 
util_format_name(pfb->zsbuf->format) : "none"); + calculate_tiles(ctx); + render_tiles(ctx); + } /* GPU executes starting from tile cmds, which IB back to draw cmds: */ fd_ringmarker_flush(ctx->draw_end); diff --git a/src/gallium/drivers/freedreno/freedreno_state.h b/src/gallium/drivers/freedreno/freedreno_state.h index c966bdcc51d..859299b3ad3 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.h +++ b/src/gallium/drivers/freedreno/freedreno_state.h @@ -34,12 +34,22 @@ static inline bool fd_depth_enabled(struct fd_context *ctx) { - return ctx->zsa->depth.enabled; + return ctx->zsa && ctx->zsa->depth.enabled; } static inline bool fd_stencil_enabled(struct fd_context *ctx) { - return ctx->zsa->stencil[0].enabled; + return ctx->zsa && ctx->zsa->stencil[0].enabled; +} + +static inline bool fd_logicop_enabled(struct fd_context *ctx) +{ + return ctx->blend && ctx->blend->logicop_enable; +} + +static inline bool fd_blend_enabled(struct fd_context *ctx, unsigned n) +{ + return ctx->blend && ctx->blend->rt[n].blend_enable; } void fd_state_init(struct pipe_context *pctx); -- 2.30.2