From c0766528baaef48902c87bbdaa4f5926c472269b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 7 Jan 2014 10:55:07 -0500 Subject: [PATCH] freedreno/a3xx: support for hw binning pass The binning pass sorts vertices into which bins/tiles they apply to. The visibility information generated during the binning pass can be used to speed up the rendering pass by filtering out vertices which do not apply to the current tile. See: https://github.com/freedreno/freedreno/wiki/Adreno-tiling#optimized-approach This brings a significant fps boost. A rough assortment of tests (supertuxkart, etracer, tremulous, glmark2 'build' test, etc) seems to yield a ~35-45% fps improvement. For now, to be conservative, the binning pass is not enabled yet by default. To enable it use: FD_MESA_DEBUG=binning So far I haven't found anything that breaks with binning enabled, but I'd like a bit more testing before I enable it as default. Signed-off-by: Rob Clark --- configure.ac | 2 +- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 6 +- src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 8 +- src/gallium/drivers/freedreno/a3xx/fd3_draw.c | 81 +++- src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 26 +- src/gallium/drivers/freedreno/a3xx/fd3_emit.h | 3 +- src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 360 +++++++++++++++++- .../drivers/freedreno/a3xx/fd3_program.c | 136 ++++--- .../drivers/freedreno/a3xx/fd3_program.h | 2 +- .../drivers/freedreno/freedreno_context.c | 45 ++- .../drivers/freedreno/freedreno_context.h | 12 +- .../drivers/freedreno/freedreno_draw.c | 9 +- .../drivers/freedreno/freedreno_draw.h | 23 +- .../drivers/freedreno/freedreno_gmem.c | 100 +++-- .../drivers/freedreno/freedreno_screen.c | 9 + .../drivers/freedreno/freedreno_util.h | 44 ++- 16 files changed, 707 insertions(+), 159 deletions(-) diff --git a/configure.ac b/configure.ac index f75325d33da..4b55140d299 100644 --- a/configure.ac +++ b/configure.ac @@ -32,7 +32,7 @@ LIBDRM_RADEON_REQUIRED=2.4.50 
LIBDRM_INTEL_REQUIRED=2.4.49 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" -LIBDRM_FREEDRENO_REQUIRED=2.4.39 +LIBDRM_FREEDRENO_REQUIRED=2.4.51 DRI2PROTO_REQUIRED=2.6 DRI3PROTO_REQUIRED=1.0 PRESENTPROTO_REQUIRED=1.0 diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 300ce2e51c1..d6e42b668a8 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -108,7 +108,7 @@ fd2_draw(struct fd_context *ctx, const struct pipe_draw_info *info) OUT_RING(ring, info->max_index); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */ - fd_draw_emit(ctx, info); + fd_draw_emit(ctx, ring, IGNORE_VISIBILITY, info); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); @@ -269,8 +269,8 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index c494bf153e0..274b6145fde 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -90,8 +90,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, uint32_t base, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -212,8 +212,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, 
uint32_t base, OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL); } static void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index c5d8b774552..4c90d984955 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -43,7 +43,7 @@ static void -emit_vertexbufs(struct fd_context *ctx) +emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring) { struct fd_vertex_stateobj *vtx = ctx->vtx; struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf; @@ -63,19 +63,17 @@ emit_vertexbufs(struct fd_context *ctx) bufs[i].format = elem->src_format; } - fd3_emit_vertex_bufs(ctx->ring, &ctx->prog, bufs, vtx->num_elements); + fd3_emit_vertex_bufs(ring, &ctx->prog, bufs, vtx->num_elements); } static void -fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, + struct fd_ringbuffer *ring, unsigned dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - unsigned dirty = ctx->dirty; - - fd3_emit_state(ctx, dirty); + fd3_emit_state(ctx, ring, dirty, binning); if (dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); + emit_vertexbufs(ctx, ring); OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ @@ -90,7 +88,59 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ info->restart_index : 0xffffffff); - fd_draw_emit(ctx, info); + fd_draw_emit(ctx, ring, binning ? 
IGNORE_VISIBILITY : USE_VISIBILITY, info); +} + +static void +fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) +{ + unsigned dirty = ctx->dirty; + draw_impl(ctx, info, ctx->binning_ring, + dirty & ~(FD_DIRTY_BLEND), true); + draw_impl(ctx, info, ctx->ring, dirty, false); +} + +/* binning pass cmds for a clear: + * NOTE: newer blob drivers don't use binning for clear, which is probably + * preferable since it is low vtx count. However that doesn't seem to + * actually work for me. Not sure if it is depending on support for + * clear pass (rather than using solid-fill shader), or something else + * that newer blob is doing differently. Once that is figured out, we + * can remove fd3_clear_binning(). + */ +static void +fd3_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + + fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), true); + + fd3_program_emit(ring, &ctx->solid_prog, true); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */ + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, PERFCOUNTER_STOP); + + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + 
DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -99,11 +149,14 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, { struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; + unsigned dirty = ctx->dirty; unsigned ce, i; + fd3_clear_binning(ctx, dirty); + /* emit generic state now: */ - fd3_emit_state(ctx, ctx->dirty & (FD_DIRTY_VIEWPORT | - FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR)); + fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT | + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), false); OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1); OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) | @@ -192,7 +245,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -216,8 +269,8 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, OUT_PKT3(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, PERFCOUNTER_STOP); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, USE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 91993725ea6..9cfe4ddb662 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -337,10 +337,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, } void -fd3_emit_state(struct fd_context *ctx, uint32_t dirty) +fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning) { - struct fd_ringbuffer *ring = ctx->ring; - emit_marker(ring, 5); if (dirty & 
FD_DIRTY_SAMPLE_MASK) { @@ -354,7 +353,8 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty) struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); struct pipe_stencil_ref *sr = &ctx->stencil_ref; - fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); + if (!binning) + fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control); OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); OUT_RING(ring, zsa->rb_alpha_ref); @@ -432,7 +432,10 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty) } if (dirty & FD_DIRTY_PROG) - fd3_program_emit(ring, &ctx->prog); + fd3_program_emit(ring, &ctx->prog, binning); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { struct fd_program_stateobj *prog = &ctx->prog; @@ -566,11 +569,11 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1); - OUT_RING(ring, 0x00000001); /* UCHE_CACHE_MODE_CONTROL_REG */ - - OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ @@ -604,6 +607,9 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */ } + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + emit_cache_flush(ring); fd_rmw_wfi(ctx, ring); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index bf7787ab6f7..50559d10d22 100644 --- 
a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -58,7 +58,8 @@ struct fd3_vertex_buf { void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd_program_stateobj *prog, struct fd3_vertex_buf *vbufs, uint32_t n); -void fd3_emit_state(struct fd_context *ctx, uint32_t dirty); +void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + uint32_t dirty, bool binning); void fd3_emit_restore(struct fd_context *ctx); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 3d0a607ed28..8720e087b7b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -106,6 +106,159 @@ depth_base(struct fd_gmem_stateobj *gmem) return align(gmem->bin_w * gmem->bin_h, 0x4000); } +static bool +use_hw_binning(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + +/* workaround for (hlsq?) 
lockup with hw binning on a3xx patchlevel 0 */ +static void update_vsc_pipe(struct fd_context *ctx); +static void +emit_binning_workaround(struct fd_context *ctx) +{ + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct fd_ringbuffer *ring = ctx->ring; + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) | + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A3XX_RB_COPY_CONTROL_MODE(0) | + A3XX_RB_COPY_CONTROL_GMEM_BASE(0)); + OUT_RELOC(ring, fd_resource(fd3_ctx->solid_vbuf)->bo, 0x20, 0, -1); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | + A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) | + A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) | + A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + fd3_program_emit(ring, &ctx->solid_prog, false); + + fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { + { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, + }, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_0_REG_RESERVED2 | + A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); 
+ OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); + OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) | + A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20)); + + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0)); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, 
A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE | + A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + + OUT_PKT3(ring, CP_DRAW_INDX_2, 5); + OUT_RING(ring, 0x00000000); /* viz query info. */ + OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE, + INDEX_SIZE_32_BIT, IGNORE_VISIBILITY)); + OUT_RING(ring, 2); /* NumIndices */ + OUT_RING(ring, 2); + OUT_RING(ring, 1); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS)); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); +} + /* transfer from gmem to system memory (ie. 
normal RAM) */ static void @@ -129,8 +282,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(psurf->format))); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -210,7 +363,7 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->solid_prog); + fd3_program_emit(ring, &ctx->solid_prog, false); fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT }, @@ -252,8 +405,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base, fd3_emit_gmem_restore_tex(ring, psurf); - fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2, - INDEX_SIZE_IGN, 0, 0, NULL); + fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL); } static void @@ -355,7 +508,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - fd3_program_emit(ring, &ctx->blit_prog); + fd3_program_emit(ring, &ctx->blit_prog, false); fd3_emit_vertex_bufs(ring, &ctx->blit_prog, (struct fd3_vertex_buf[]) { { .prsc = fd3_ctx->blit_texcoord_vbuf, .stride = 8, .format = PIPE_FORMAT_R32G32_FLOAT }, @@ -380,12 +533,69 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); } +static void +patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode) +{ + unsigned i; + for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i); + *patch->cs = 
patch->val | DRAW(0, 0, 0, vismode); + } + util_dynarray_resize(&ctx->draw_patches, 0); +} + +/* for rendering directly to system memory: */ +static void +fd3_emit_sysmem_prep(struct fd_context *ctx) +{ + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + uint32_t pitch = 0; + + if (pfb->cbufs[0]) + pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + + fd3_emit_restore(ctx); + + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | + A3XX_RB_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_GMEM_BYPASS | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + patch_draws(ctx, IGNORE_VISIBILITY); +} + static void update_vsc_pipe(struct fd_context *ctx) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; int i; + OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + for (i = 0; i < 8; i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; @@ -394,7 +604,7 @@ update_vsc_pipe(struct fd_context *ctx) DRM_FREEDRENO_GEM_TYPE_KMEM); } - OUT_PKT0(ring, 
REG_A3XX_VSC_PIPE(0), 3); + OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3); OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) | A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | A3XX_VSC_PIPE_CONFIG_W(pipe->w) | @@ -404,34 +614,45 @@ update_vsc_pipe(struct fd_context *ctx) } } -/* for rendering directly to system memory: */ static void -fd3_emit_sysmem_prep(struct fd_context *ctx) +emit_binning_pass(struct fd_context *ctx) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct fd_ringbuffer *ring = ctx->ring; - uint32_t pitch = 0; + int i; - if (pfb->cbufs[0]) - pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch; + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); - fd3_emit_restore(ctx); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + } + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); - /* setup scissor/offset for current tile: */ + /* setup scissor/offset for whole screen: */ OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | A3XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE); + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); OUT_RING(ring, 
A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); @@ -439,9 +660,72 @@ fd3_emit_sysmem_prep(struct fd_context *ctx) A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + + for (i = 0; i < 4; i++) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(0) | + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0)); + } + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) | + A3XX_PC_VSTREAM_CONTROL_N(0)); + + /* emit IB to binning drawcmds: */ + OUT_IB(ring, ctx->binning_start, ctx->binning_end); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE | + A3XX_SP_SP_CTRL_REG_CONSTMODE(1) | + A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | + A3XX_SP_SP_CTRL_REG_L0MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_GMEM_BYPASS | A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + if (ctx->screen->gpu_id == 320) { + /* dummy-draw workaround: */ + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 
0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, + INDEX_SIZE_IGN, IGNORE_VISIBILITY)); + OUT_RING(ring, 0); /* NumIndices */ + } + + OUT_PKT3(ring, CP_NOP, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(ctx); + } } /* before first tile */ @@ -461,6 +745,18 @@ fd3_emit_tile_init(struct fd_context *ctx) A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + /* mark the end of the binning cmds: */ + fd_ringmarker_mark(ctx->binning_end); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } } /* before mem2gmem */ @@ -472,7 +768,6 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) struct fd_gmem_stateobj *gmem = &ctx->gmem; uint32_t reg; - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(gmem)); if (pfb->zsbuf) { @@ -499,6 +794,7 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -508,6 +804,32 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A3XX_PC_VSTREAM_CONTROL_N(tile->n)); + + 
OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index c02b14cba39..2622006ff09 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -36,6 +36,7 @@ #include "fd3_program.h" #include "fd3_compiler.h" +#include "fd3_emit.h" #include "fd3_texture.h" #include "fd3_util.h" @@ -175,9 +176,9 @@ fd3_vp_state_bind(struct pipe_context *pctx, void *hwcso) } static void -emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) +emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_stateobj *so) { - struct ir3_shader_info *si = &so->info; + const struct ir3_shader_info *si = &so->info; enum adreno_state_block sb; enum adreno_state_src src; uint32_t i, sz, *bin; @@ -216,7 +217,7 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so) } static int -find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) +find_output(const struct fd3_shader_stateobj *so, fd3_semantic semantic) { int j; for (j = 0; j < so->outputs_count; j++) @@ -227,14 +228,21 @@ find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic) void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) + struct fd_program_stateobj *prog, bool binning) { - struct fd3_shader_stateobj *vp = prog->vp; - struct fd3_shader_stateobj *fp = prog->fp; - struct ir3_shader_info *vsi = &vp->info; - struct 
ir3_shader_info *fsi = &fp->info; + const struct fd3_shader_stateobj *vp = prog->vp; + const struct fd3_shader_stateobj *fp = prog->fp; + const struct ir3_shader_info *vsi = &vp->info; + const struct ir3_shader_info *fsi = &fp->info; int i; + if (binning) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct fd3_shader_stateobj binning_fp = {}; + fp = &binning_fp; + fsi = &fp->info; + } + /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. */ @@ -260,11 +268,9 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | + COND(binning, A3XX_SP_SP_CTRL_REG_BINNING) | A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | - // XXX "resolve" (?) bit set on gmem->mem pass.. -// COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) | - // XXX sometimes 0, sometimes 1: - A3XX_SP_SP_CTRL_REG_LOMODE(1)); + A3XX_SP_SP_CTRL_REG_L0MODE(0)); OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen)); @@ -272,6 +278,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_VS_CTRL_REG0_CACHEINVALID | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | @@ -323,28 +330,38 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - 
A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | - A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | - A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | - A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | - A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) | - A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); - OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + if (binning) { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | + A3XX_SP_FS_CTRL_REG0_CACHEINVALID | + A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | + A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | + A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | + A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | + A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) 
| + A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ @@ -360,24 +377,31 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); - OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); - OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | - A3XX_VPC_ATTR_THRDASSIGN(1) | - A3XX_VPC_ATTR_LMSIZE(1)); - OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | - A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + if (binning) { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | + A3XX_VPC_ATTR_THRDASSIGN(1) | + A3XX_VPC_ATTR_LMSIZE(1)); + OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | + A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); + 
OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); + OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + } OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | @@ -388,10 +412,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - emit_shader(ring, fp); + if (!binning) { + emit_shader(ring, fp); - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + } OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) | diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 85c22a54cf7..bd6483ff42c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -117,7 +117,7 @@ struct fd3_shader_stateobj { }; void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog); + struct fd_program_stateobj *prog, bool binning); void fd3_prog_init(struct pipe_context *pctx); void fd3_prog_fini(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 
28be508e329..23f6a67734d 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -34,16 +34,11 @@ #include "freedreno_gmem.h" #include "freedreno_util.h" -static void -fd_context_next_rb(struct pipe_context *pctx) +static struct fd_ringbuffer *next_rb(struct fd_context *ctx) { - struct fd_context *ctx = fd_context(pctx); struct fd_ringbuffer *ring; uint32_t ts; - fd_ringmarker_del(ctx->draw_start); - fd_ringmarker_del(ctx->draw_end); - /* grab next ringbuffer: */ ring = ctx->rings[(ctx->rings_idx++) % ARRAY_SIZE(ctx->rings)]; @@ -56,10 +51,36 @@ fd_context_next_rb(struct pipe_context *pctx) fd_ringbuffer_reset(ring); + return ring; +} + +static void +fd_context_next_rb(struct pipe_context *pctx) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_ringbuffer *ring; + + fd_ringmarker_del(ctx->draw_start); + fd_ringmarker_del(ctx->draw_end); + + ring = next_rb(ctx); + ctx->draw_start = fd_ringmarker_new(ring); ctx->draw_end = fd_ringmarker_new(ring); + fd_ringbuffer_set_parent(ring, NULL); ctx->ring = ring; + + fd_ringmarker_del(ctx->binning_start); + fd_ringmarker_del(ctx->binning_end); + + ring = next_rb(ctx); + + ctx->binning_start = fd_ringmarker_new(ring); + ctx->binning_end = fd_ringmarker_new(ring); + + fd_ringbuffer_set_parent(ring, ctx->ring); + ctx->binning_ring = ring; } /* emit accumulated render cmds, needed for example if render target has @@ -121,6 +142,10 @@ fd_context_destroy(struct pipe_context *pctx) DBG(""); + util_slab_destroy(&ctx->transfer_pool); + + util_dynarray_fini(&ctx->draw_patches); + if (ctx->blitter) util_blitter_destroy(ctx->blitter); @@ -129,7 +154,11 @@ fd_context_destroy(struct pipe_context *pctx) fd_ringmarker_del(ctx->draw_start); fd_ringmarker_del(ctx->draw_end); - fd_ringbuffer_del(ctx->ring); + fd_ringmarker_del(ctx->binning_start); + fd_ringmarker_del(ctx->binning_end); + + for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) + fd_ringbuffer_del(ctx->rings[i]); 
for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; @@ -176,6 +205,8 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, fd_context_next_rb(pctx); fd_reset_rmw_state(ctx); + util_dynarray_init(&ctx->draw_patches); + util_slab_create(&ctx->transfer_pool, sizeof(struct pipe_transfer), 16, UTIL_SLAB_SINGLETHREADED); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index a8abbca7a62..a0227e49c03 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -111,7 +111,7 @@ struct fd_context { */ enum { /* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */ - FD_BUFFER_COLOR = PIPE_CLEAR_COLOR, + FD_BUFFER_COLOR = PIPE_CLEAR_COLOR0, FD_BUFFER_DEPTH = PIPE_CLEAR_DEPTH, FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL, FD_BUFFER_ALL = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL, @@ -148,9 +148,14 @@ struct fd_context { struct fd_ringbuffer *rings[4]; unsigned rings_idx; + /* normal draw/clear cmds: */ struct fd_ringbuffer *ring; struct fd_ringmarker *draw_start, *draw_end; + /* binning pass draw/clear cmds: */ + struct fd_ringbuffer *binning_ring; + struct fd_ringmarker *binning_start, *binning_end; + /* Keep track if WAIT_FOR_IDLE is needed for registers we need * to update via RMW: */ @@ -165,6 +170,11 @@ struct fd_context { uint32_t rbrc_draw; } rmw; + /* Keep track of DRAW initiators that need to be patched up depending + * on whether we are using binning or not: + */ + struct util_dynarray draw_patches; + struct pipe_scissor_state scissor; /* we don't have a disable/enable bit for scissor, so instead we keep diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 0069438c87d..d80f3565614 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -54,7 +54,9 @@
size2indextype(unsigned index_size) /* this is same for a2xx/a3xx, so split into helper: */ void -fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info) +fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info) { struct pipe_index_buffer *idx = &ctx->indexbuf; struct fd_bo *idx_bo = NULL; @@ -78,8 +80,8 @@ fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info) src_sel = DI_SRC_SEL_AUTO_INDEX; } - fd_draw(ctx, ctx->primtypes[info->mode], src_sel, info->count, - idx_type, idx_size, idx_offset, idx_bo); + fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel, + info->count, idx_type, idx_size, idx_offset, idx_bo); } static void @@ -180,6 +182,7 @@ fd_clear(struct pipe_context *pctx, unsigned buffers, ctx->clear(ctx, buffers, color, depth, stencil); ctx->dirty |= FD_DIRTY_ZSA | + FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_SAMPLE_MASK | FD_DIRTY_PROG | diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h index 190c0e52d24..e8bb420889e 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.h +++ b/src/gallium/drivers/freedreno/freedreno_draw.h @@ -38,19 +38,21 @@ struct fd_ringbuffer; -void fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info); +void fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info); void fd_draw_init(struct pipe_context *pctx); static inline void -fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, +fd_draw(struct fd_context *ctx, struct fd_ringbuffer *ring, + enum pc_di_primtype primtype, + enum pc_di_vis_cull_mode vismode, enum pc_di_src_sel src_sel, uint32_t count, enum pc_di_index_size idx_type, uint32_t idx_size, uint32_t idx_offset, struct fd_bo *idx_bo) { - struct fd_ringbuffer *ring = ctx->ring; - /* for debug after a lock up, write a unique 
counter value * to scratch7 for each draw, to make it easier to match up * register dumps to cmdstream. The combination of IB @@ -64,7 +66,7 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, OUT_PKT3(ring, CP_DRAW_INDX, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY)); + INDEX_SIZE_IGN, USE_VISIBILITY)); OUT_RING(ring, 0); /* NumIndices */ /* ugg, hard-code register offset to avoid pulling in the @@ -76,8 +78,15 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype, OUT_PKT3(ring, CP_DRAW_INDX, idx_bo ? 5 : 3); OUT_RING(ring, 0x00000000); /* viz query info. */ - OUT_RING(ring, DRAW(primtype, src_sel, - idx_type, IGNORE_VISIBILITY)); + if (vismode == USE_VISIBILITY) { + /* leave vis mode blank for now, it will be patched up when + * we know if we are binning or not + */ + OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0), + &ctx->draw_patches); + } else { + OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode)); + } OUT_RING(ring, count); /* NumIndices */ if (idx_bo) { OUT_RELOC(ring, idx_bo, idx_offset, 0, 0); diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 47f7a310e8c..0270538a3d0 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -85,7 +85,8 @@ calculate_tiles(struct fd_context *ctx) uint32_t bin_w, bin_h; uint32_t max_width = bin_width(ctx); uint32_t cpp = 4; - uint32_t i, j, t, p, n, xoff, yoff; + uint32_t i, j, t, xoff, yoff; + uint32_t tpp_x, tpp_y; bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)); if (pfb->cbufs[0]) @@ -145,20 +146,65 @@ calculate_tiles(struct fd_context *ctx) gmem->width = width; gmem->height = height; - /* Assign tiles and pipes: - * NOTE we currently take a rather simplistic approach of - * mapping rows of tiles to a pipe. 
At some point it might - * be worth playing with different strategies and seeing if - * that makes much impact on performance. + /* + * Assign tiles and pipes: + * + * At some point it might be worth playing with different + * strategies and seeing if that makes much impact on + * performance. */ - t = p = n = 0; + +#define div_round_up(v, a) (((v) + (a) - 1) / (a)) + /* figure out number of tiles per pipe: */ + tpp_x = tpp_y = 1; + while (div_round_up(nbins_y, tpp_y) > 8) + tpp_y += 2; + while ((div_round_up(nbins_y, tpp_y) * + div_round_up(nbins_x, tpp_x)) > 8) + tpp_x += 1; + + /* configure pipes: */ + xoff = yoff = 0; + for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + + if (xoff >= nbins_x) { + xoff = 0; + yoff += tpp_y; + } + + if (yoff >= nbins_y) { + break; + } + + pipe->x = xoff; + pipe->y = yoff; + pipe->w = MIN2(tpp_x, nbins_x - xoff); + pipe->h = MIN2(tpp_y, nbins_y - yoff); + + xoff += tpp_x; + } + + for (; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + pipe->x = pipe->y = pipe->w = pipe->h = 0; + } + +#if 0 /* debug */ + printf("%dx%d ... 
tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y); + for (i = 0; i < 8; i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + printf("pipe[%d]: %ux%u @ %u,%u\n", i, + pipe->w, pipe->h, pipe->x, pipe->y); + } +#endif + + /* configure tiles: */ + t = 0; yoff = miny; for (i = 0; i < nbins_y; i++) { - struct fd_vsc_pipe *pipe = &ctx->pipe[p]; uint32_t bw, bh; - assert(p < ARRAY_SIZE(ctx->pipe)); - xoff = minx; /* clip bin height: */ @@ -166,13 +212,20 @@ calculate_tiles(struct fd_context *ctx) for (j = 0; j < nbins_x; j++) { struct fd_tile *tile = &ctx->tile[t]; + uint32_t n, p; assert(t < ARRAY_SIZE(ctx->tile)); + /* pipe number: */ + p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x); + + /* slot number: */ + n = ((i % tpp_y) * tpp_x) + (j % tpp_x); + /* clip bin width: */ bw = MIN2(bin_w, minx + width - xoff); - tile->n = n++; + tile->n = n; tile->p = p; tile->bin_w = bw; tile->bin_h = bh; @@ -184,22 +237,19 @@ calculate_tiles(struct fd_context *ctx) xoff += bw; } - /* one pipe per row: */ - pipe->x = 0; - pipe->y = i; - pipe->w = nbins_x; - pipe->h = 1; - - p++; - n = 0; - yoff += bh; } - for (; p < ARRAY_SIZE(ctx->pipe); p++) { - struct fd_vsc_pipe *pipe = &ctx->pipe[p]; - pipe->x = pipe->y = pipe->w = pipe->h = 0; +#if 0 /* debug */ + t = 0; + for (i = 0; i < nbins_y; i++) { + for (j = 0; j < nbins_x; j++) { + struct fd_tile *tile = &ctx->tile[t++]; + printf("|p:%u n:%u|", tile->p, tile->n); + } + printf("\n"); } +#endif } static void @@ -259,6 +309,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx) /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ fd_ringmarker_mark(ctx->draw_end); + fd_ringmarker_mark(ctx->binning_end); if (sysmem) { DBG("rendering sysmem (%s/%s)", @@ -277,8 +328,9 @@ fd_gmem_render_tiles(struct pipe_context *pctx) /* GPU executes starting from tile cmds, which IB back to draw cmds: */ fd_ringmarker_flush(ctx->draw_end); - /* mark start for next draw cmds: */ + /* mark start for next draw/binning cmds: */ 
fd_ringmarker_mark(ctx->draw_start); + fd_ringmarker_mark(ctx->binning_start); fd_reset_rmw_state(ctx); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 319e29f3ada..28a09166acd 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -64,12 +64,15 @@ static const struct debug_named_value debug_options[] = { {"direct", FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"}, {"dbypass", FD_DBG_DBYPASS,"Disable GMEM bypass"}, {"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"}, + {"binning", FD_DBG_BINNING, "Enable hw binning"}, + {"dbinning", FD_DBG_DBINNING, "Disable hw binning"}, DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(fd_mesa_debug, "FD_MESA_DEBUG", debug_options, 0) int fd_mesa_debug = 0; +bool fd_binning_enabled = false; /* default to off for now */ static const char * fd_screen_get_name(struct pipe_screen *pscreen) @@ -386,6 +389,12 @@ fd_screen_create(struct fd_device *dev) fd_mesa_debug = debug_get_option_fd_mesa_debug(); + if (fd_mesa_debug & FD_DBG_BINNING) + fd_binning_enabled = true; + + if (fd_mesa_debug & FD_DBG_DBINNING) + fd_binning_enabled = false; + if (!screen) return NULL; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 48d346eb35b..fae5ba06b1d 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -37,6 +37,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_half.h" +#include "util/u_dynarray.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" @@ -52,16 +53,19 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); /* TBD if it is same on a2xx, but for now: */ #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS -#define FD_DBG_MSGS 0x01 -#define FD_DBG_DISASM 0x02 -#define FD_DBG_DCLEAR 0x04 -#define FD_DBG_DGMEM 0x08 -#define 
FD_DBG_DSCIS 0x10 -#define FD_DBG_DIRECT 0x20 -#define FD_DBG_DBYPASS 0x40 -#define FD_DBG_FRAGHALF 0x80 +#define FD_DBG_MSGS 0x0001 +#define FD_DBG_DISASM 0x0002 +#define FD_DBG_DCLEAR 0x0004 +#define FD_DBG_DGMEM 0x0008 +#define FD_DBG_DSCIS 0x0010 +#define FD_DBG_DIRECT 0x0020 +#define FD_DBG_DBYPASS 0x0040 +#define FD_DBG_FRAGHALF 0x0080 +#define FD_DBG_BINNING 0x0100 +#define FD_DBG_DBINNING 0x0200 extern int fd_mesa_debug; +extern bool fd_binning_enabled; #define DBG(fmt, ...) \ do { if (fd_mesa_debug & FD_DBG_MSGS) \ @@ -87,6 +91,13 @@ static inline uint32_t DRAW(enum pc_di_primtype prim_type, (1 << 14); } +/* for tracking cmdstream positions that need to be patched: */ +struct fd_cs_patch { + uint32_t *cs; + uint32_t val; +}; +#define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch)) +#define fd_patch_element(buf, i) util_dynarray_element(buf, struct fd_cs_patch, i) static inline enum pipe_format pipe_surface_format(struct pipe_surface *psurf) @@ -110,6 +121,21 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data) *(ring->cur++) = data; } +/* like OUT_RING() but appends a cmdstream patch point to 'buf' */ +static inline void +OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data, + struct util_dynarray *buf) +{ + if (LOG_DWORDS) { + DBG("ring[%p]: OUT_RINGP %04x: %08x", ring, + (uint32_t)(ring->cur - ring->last_start), data); + } + util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){ + .cs = ring->cur++, + .val = data, + })); +} + static inline void OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset, uint32_t or, int32_t shift) @@ -132,7 +158,7 @@ OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset, uint32_t or, int32_t shift) { if (LOG_DWORDS) { - DBG("ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring, + DBG("ring[%p]: OUT_RELOCW %04x: %p+%u << %d", ring, (uint32_t)(ring->cur - ring->last_start), bo, offset, shift); } fd_ringbuffer_reloc(ring, &(struct fd_reloc){ -- 2.30.2