freedreno/a3xx: support for hw binning pass
author Rob Clark <robclark@freedesktop.org>
Tue, 7 Jan 2014 15:55:07 +0000 (10:55 -0500)
committer Rob Clark <robclark@freedesktop.org>
Wed, 8 Jan 2014 21:30:18 +0000 (16:30 -0500)
The binning pass sorts vertices into the bins/tiles they apply to.
The visibility information generated during the binning pass can be
used to speed up the rendering pass by filtering out vertices which
do not apply to the current tile.  See:

 https://github.com/freedreno/freedreno/wiki/Adreno-tiling#optimized-approach

This brings a significant fps boost.  A rough assortment of tests
(supertuxkart, etracer, tremulous, glmark2 'build' test, etc) seems
to yield a ~35-45% fps improvement.

For now, to be conservative, the binning pass is not yet enabled by
default.  To enable it, use:

  FD_MESA_DEBUG=binning

So far I haven't found anything that breaks with binning enabled,
but I'd like a bit more testing before I enable it as default.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
16 files changed:
configure.ac
src/gallium/drivers/freedreno/a2xx/fd2_draw.c
src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
src/gallium/drivers/freedreno/a3xx/fd3_draw.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.h
src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/a3xx/fd3_program.h
src/gallium/drivers/freedreno/freedreno_context.c
src/gallium/drivers/freedreno/freedreno_context.h
src/gallium/drivers/freedreno/freedreno_draw.c
src/gallium/drivers/freedreno/freedreno_draw.h
src/gallium/drivers/freedreno/freedreno_gmem.c
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/freedreno_util.h

index f75325d33da93204e8ffd99788c0593855fa8c47..4b55140d2994f50c98d7df34cb05cfff8f0c978b 100644 (file)
@@ -32,7 +32,7 @@ LIBDRM_RADEON_REQUIRED=2.4.50
 LIBDRM_INTEL_REQUIRED=2.4.49
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
-LIBDRM_FREEDRENO_REQUIRED=2.4.39
+LIBDRM_FREEDRENO_REQUIRED=2.4.51
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
 PRESENTPROTO_REQUIRED=1.0
index 300ce2e51c1cf05c502c084acb5499735ffa7e38..d6e42b668a8734eace0ae893bc30e685a55e7ab3 100644 (file)
@@ -108,7 +108,7 @@ fd2_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
        OUT_RING(ring, info->max_index);        /* VGT_MAX_VTX_INDX */
        OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
 
-       fd_draw_emit(ctx, info);
+       fd_draw_emit(ctx, ring, IGNORE_VISIBILITY, info);
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010));
@@ -269,8 +269,8 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
        OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
        OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
index c494bf153e0b36cd44d451a23136cdd1e8ba8d34..274b6145fdeeaece3b3dc27c2f80aa16d9d5e1da 100644 (file)
@@ -90,8 +90,8 @@ emit_gmem2mem_surf(struct fd_context *ctx, uint32_t base,
        OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
        OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 static void
@@ -212,8 +212,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base,
        OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
        OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 static void
index c5d8b7745521d2ff9769a7f11b8044d918ef6531..4c90d98495570e3cf856faab8d8c4ad9fdcbbbe2 100644 (file)
@@ -43,7 +43,7 @@
 
 
 static void
-emit_vertexbufs(struct fd_context *ctx)
+emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring)
 {
        struct fd_vertex_stateobj *vtx = ctx->vtx;
        struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf;
@@ -63,19 +63,17 @@ emit_vertexbufs(struct fd_context *ctx)
                bufs[i].format = elem->src_format;
        }
 
-       fd3_emit_vertex_bufs(ctx->ring, &ctx->prog, bufs, vtx->num_elements);
+       fd3_emit_vertex_bufs(ring, &ctx->prog, bufs, vtx->num_elements);
 }
 
 static void
-fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
+draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
+               struct fd_ringbuffer *ring, unsigned dirty, bool binning)
 {
-       struct fd_ringbuffer *ring = ctx->ring;
-       unsigned dirty = ctx->dirty;
-
-       fd3_emit_state(ctx, dirty);
+       fd3_emit_state(ctx, ring, dirty, binning);
 
        if (dirty & FD_DIRTY_VTXBUF)
-               emit_vertexbufs(ctx);
+               emit_vertexbufs(ctx, ring);
 
        OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1);
        OUT_RING(ring, 0x0000000b);                  /* PC_VERTEX_REUSE_BLOCK_CNTL */
@@ -90,7 +88,59 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
        OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
                        info->restart_index : 0xffffffff);
 
-       fd_draw_emit(ctx, info);
+       fd_draw_emit(ctx, ring, binning ? IGNORE_VISIBILITY : USE_VISIBILITY, info);
+}
+
+static void
+fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
+{
+       unsigned dirty = ctx->dirty;
+       draw_impl(ctx, info, ctx->binning_ring,
+                       dirty & ~(FD_DIRTY_BLEND), true);
+       draw_impl(ctx, info, ctx->ring, dirty, false);
+}
+
+/* binning pass cmds for a clear:
+ * NOTE: newer blob drivers don't use binning for clear, which is probably
+ * preferable since it is low vtx count.  However that doesn't seem to
+ * actually work for me.  Not sure if it is depending on support for
+ * clear pass (rather than using solid-fill shader), or something else
+ * that newer blob is doing differently.  Once that is figured out, we
+ * can remove fd3_clear_binning().
+ */
+static void
+fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
+{
+       struct fd3_context *fd3_ctx = fd3_context(ctx);
+       struct fd_ringbuffer *ring = ctx->binning_ring;
+
+       fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT |
+                       FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), true);
+
+       fd3_program_emit(ring, &ctx->solid_prog, true);
+
+       fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
+                       { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
+               }, 1);
+
+       OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
+       OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
+                       A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
+                       A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+                       A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST);
+       OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4);
+       OUT_RING(ring, 0);            /* VFD_INDEX_MIN */
+       OUT_RING(ring, 2);            /* VFD_INDEX_MAX */
+       OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
+       OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
+       OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1);
+       OUT_RING(ring, 0xffffffff);   /* PC_RESTART_INDEX */
+
+       OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, PERFCOUNTER_STOP);
+
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 static void
@@ -99,11 +149,14 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 {
        struct fd3_context *fd3_ctx = fd3_context(ctx);
        struct fd_ringbuffer *ring = ctx->ring;
+       unsigned dirty = ctx->dirty;
        unsigned ce, i;
 
+       fd3_clear_binning(ctx, dirty);
+
        /* emit generic state now: */
-       fd3_emit_state(ctx, ctx->dirty & (FD_DIRTY_VIEWPORT |
-                       FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR));
+       fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT |
+                       FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), false);
 
        OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1);
        OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) |
@@ -192,7 +245,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
        OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1);
        OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0));
 
-       fd3_program_emit(ring, &ctx->solid_prog);
+       fd3_program_emit(ring, &ctx->solid_prog, false);
 
        fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
                        { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
@@ -216,8 +269,8 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
        OUT_PKT3(ring, CP_EVENT_WRITE, 1);
        OUT_RING(ring, PERFCOUNTER_STOP);
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, USE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 void
index 91993725ea69c3227335b5430d6ac160725db78d..9cfe4ddb66246f09bbacce59cb6fb82c3e00f8fd 100644 (file)
@@ -337,10 +337,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
 }
 
 void
-fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
+fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               uint32_t dirty, bool binning)
 {
-       struct fd_ringbuffer *ring = ctx->ring;
-
        emit_marker(ring, 5);
 
        if (dirty & FD_DIRTY_SAMPLE_MASK) {
@@ -354,7 +353,8 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
                struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa);
                struct pipe_stencil_ref *sr = &ctx->stencil_ref;
 
-               fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control);
+               if (!binning)
+                       fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control);
 
                OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1);
                OUT_RING(ring, zsa->rb_alpha_ref);
@@ -432,7 +432,10 @@ fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
        }
 
        if (dirty & FD_DIRTY_PROG)
-               fd3_program_emit(ring, &ctx->prog);
+               fd3_program_emit(ring, &ctx->prog, binning);
+
+       OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, HLSQ_FLUSH);
 
        if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
                struct fd_program_stateobj *prog = &ctx->prog;
@@ -566,11 +569,11 @@ fd3_emit_restore(struct fd_context *ctx)
        OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) |
                        A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0));
 
-       OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1);
-       OUT_RING(ring, 0x00000001);        /* UCHE_CACHE_MODE_CONTROL_REG */
-
-       OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1);
-       OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */
+       OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
+       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
+       OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
+                       A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
+                       A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
 
        OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
        OUT_RING(ring, 0x00000000);                  /* GRAS_CL_CLIP_CNTL */
@@ -604,6 +607,9 @@ fd3_emit_restore(struct fd_context *ctx)
                OUT_RING(ring, 0x00000000);    /* GRAS_CL_USER_PLANE[i].W */
        }
 
+       OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+       OUT_RING(ring, 0x00000000);
+
        emit_cache_flush(ring);
        fd_rmw_wfi(ctx, ring);
 }
index bf7787ab6f7c8b2ceb3311a392f5e09ce7e9eec6..50559d10d2237e6edf835a168a6b0baef9f9545d 100644 (file)
@@ -58,7 +58,8 @@ struct fd3_vertex_buf {
 void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
                struct fd_program_stateobj *prog,
                struct fd3_vertex_buf *vbufs, uint32_t n);
-void fd3_emit_state(struct fd_context *ctx, uint32_t dirty);
+void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               uint32_t dirty, bool binning);
 void fd3_emit_restore(struct fd_context *ctx);
 
 
index 3d0a607ed286ce61a31e59b86e1b04a641b9517b..8720e087b7b893aa81d48e9b5f5354d258da2b8a 100644 (file)
@@ -106,6 +106,159 @@ depth_base(struct fd_gmem_stateobj *gmem)
        return align(gmem->bin_w * gmem->bin_h, 0x4000);
 }
 
+static bool
+use_hw_binning(struct fd_context *ctx)
+{
+       struct fd_gmem_stateobj *gmem = &ctx->gmem;
+       return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2);
+}
+
+/* workaround for (hlsq?) lockup with hw binning on a3xx patchlevel 0 */
+static void update_vsc_pipe(struct fd_context *ctx);
+static void
+emit_binning_workaround(struct fd_context *ctx)
+{
+       struct fd3_context *fd3_ctx = fd3_context(ctx);
+       struct fd_gmem_stateobj *gmem = &ctx->gmem;
+       struct fd_ringbuffer *ring = ctx->ring;
+
+       OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2);
+       OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
+                       A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+       OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) |
+                       A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE |
+                       A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
+
+       OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4);
+       OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) |
+                       A3XX_RB_COPY_CONTROL_MODE(0) |
+                       A3XX_RB_COPY_CONTROL_GMEM_BASE(0));
+       OUT_RELOC(ring, fd_resource(fd3_ctx->solid_vbuf)->bo, 0x20, 0, -1);  /* RB_COPY_DEST_BASE */
+       OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128));
+       OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) |
+                       A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) |
+                       A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) |
+                       A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) |
+                       A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+       OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
+                       A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+                       A3XX_GRAS_SC_CONTROL_RASTER_MODE(1));
+
+       fd3_program_emit(ring, &ctx->solid_prog, false);
+
+       fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
+                       { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
+               }, 1);
+
+       OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4);
+       OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+                       A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
+                       A3XX_HLSQ_CONTROL_0_REG_RESERVED2 |
+                       A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
+       OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
+                       A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE);
+       OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
+       OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */
+
+       OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1);
+       OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) |
+                       A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20));
+
+       OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE |
+                       A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) |
+                       A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff));
+
+       OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER));
+
+       OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) |
+                       A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) |
+                       A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) |
+                       A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) |
+                       A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) |
+                       A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) |
+                       A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) |
+                       A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1);
+       OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0));
+
+       OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4);
+       OUT_RING(ring, 0);            /* VFD_INDEX_MIN */
+       OUT_RING(ring, 2);            /* VFD_INDEX_MAX */
+       OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
+       OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
+
+       OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
+       OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
+                       A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
+                       A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+                       A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST);
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
+       OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) |
+                       A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1));
+       OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) |
+                       A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6);
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0));
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0));
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0));
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0));
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0));
+       OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
+       OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE |
+                       A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE |
+                       A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE |
+                       A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE |
+                       A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE);
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1);
+       OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) |
+                       A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0));
+
+       OUT_PKT3(ring, CP_DRAW_INDX_2, 5);
+       OUT_RING(ring, 0x00000000);   /* viz query info. */
+       OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE,
+                       INDEX_SIZE_32_BIT, IGNORE_VISIBILITY));
+       OUT_RING(ring, 2);            /* NumIndices */
+       OUT_RING(ring, 2);
+       OUT_RING(ring, 1);
+
+       OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1);
+       OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS));
+
+       OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
+       OUT_RING(ring, 0x00000000);
+
+       OUT_WFI(ring);
+
+       OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1);
+       OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
+                       A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+       OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+                       A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+                       A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
+       OUT_RING(ring, 0x00000000);
+}
+
 /* transfer from gmem to system memory (ie. normal RAM) */
 
 static void
@@ -129,8 +282,8 @@ emit_gmem2mem_surf(struct fd_context *ctx,
                        A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) |
                        A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(psurf->format)));
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 static void
@@ -210,7 +363,7 @@ fd3_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
        OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
        OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 
-       fd3_program_emit(ring, &ctx->solid_prog);
+       fd3_program_emit(ring, &ctx->solid_prog, false);
 
        fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
                        { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
@@ -252,8 +405,8 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base,
 
        fd3_emit_gmem_restore_tex(ring, psurf);
 
-       fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
-                       INDEX_SIZE_IGN, 0, 0, NULL);
+       fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
 }
 
 static void
@@ -355,7 +508,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
        OUT_RING(ring, 0);            /* VFD_INSTANCEID_OFFSET */
        OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 
-       fd3_program_emit(ring, &ctx->blit_prog);
+       fd3_program_emit(ring, &ctx->blit_prog, false);
 
        fd3_emit_vertex_bufs(ring, &ctx->blit_prog, (struct fd3_vertex_buf[]) {
                        { .prsc = fd3_ctx->blit_texcoord_vbuf, .stride = 8, .format = PIPE_FORMAT_R32G32_FLOAT },
@@ -380,12 +533,69 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
                        A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
 }
 
+static void
+patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode)
+{
+       unsigned i;
+       for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) {
+               struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i);
+               *patch->cs = patch->val | DRAW(0, 0, 0, vismode);
+       }
+       util_dynarray_resize(&ctx->draw_patches, 0);
+}
+
+/* for rendering directly to system memory: */
+static void
+fd3_emit_sysmem_prep(struct fd_context *ctx)
+{
+       struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+       struct fd_ringbuffer *ring = ctx->ring;
+       uint32_t pitch = 0;
+
+       if (pfb->cbufs[0])
+               pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch;
+
+       fd3_emit_restore(ctx);
+
+       OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
+       OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
+                       A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
+
+       emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
+
+       OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
+                       A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch));
+
+       /* setup scissor/offset for current tile: */
+       OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1);
+       OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) |
+                       A3XX_RB_WINDOW_OFFSET_Y(0));
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
+       OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) |
+                       A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));
+
+       OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+                       A3XX_RB_MODE_CONTROL_GMEM_BYPASS |
+                       A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+
+       patch_draws(ctx, IGNORE_VISIBILITY);
+}
+
 static void
 update_vsc_pipe(struct fd_context *ctx)
 {
+       struct fd3_context *fd3_ctx = fd3_context(ctx);
        struct fd_ringbuffer *ring = ctx->ring;
        int i;
 
+       OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1);
+       OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */
+
        for (i = 0; i < 8; i++) {
                struct fd_vsc_pipe *pipe = &ctx->pipe[i];
 
@@ -394,7 +604,7 @@ update_vsc_pipe(struct fd_context *ctx)
                                        DRM_FREEDRENO_GEM_TYPE_KMEM);
                }
 
-               OUT_PKT0(ring, REG_A3XX_VSC_PIPE(0), 3);
+               OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3);
                OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) |
                                A3XX_VSC_PIPE_CONFIG_Y(pipe->y) |
                                A3XX_VSC_PIPE_CONFIG_W(pipe->w) |
@@ -404,34 +614,45 @@ update_vsc_pipe(struct fd_context *ctx)
        }
 }
 
-/* for rendering directly to system memory: */
 static void
-fd3_emit_sysmem_prep(struct fd_context *ctx)
+emit_binning_pass(struct fd_context *ctx)
 {
        struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
        struct fd_ringbuffer *ring = ctx->ring;
-       uint32_t pitch = 0;
+       int i;
 
-       if (pfb->cbufs[0])
-               pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch;
+       if (ctx->screen->gpu_id == 320) {
+               emit_binning_workaround(ctx);
 
-       fd3_emit_restore(ctx);
+               OUT_PKT3(ring, CP_INVALIDATE_STATE, 1);
+               OUT_RING(ring, 0x00007fff);
+       }
+
+       OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1);
+       OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE);
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+       OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) |
+                       A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+                       A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
 
        OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
        OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
                        A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
 
-       emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
-
        OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1);
        OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
-                       A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch));
+                       A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE |
+                       A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w));
 
-       /* setup scissor/offset for current tile: */
+       /* setup scissor/offset for whole screen: */
        OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1);
        OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) |
                        A3XX_RB_WINDOW_OFFSET_Y(0));
 
+       OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE);
+
        OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
        OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
                        A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
@@ -439,9 +660,72 @@ fd3_emit_sysmem_prep(struct fd_context *ctx)
                        A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));
 
        OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1);
+       OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) |
+                       A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+
+       for (i = 0; i < 4; i++) {
+               OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
+               OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(0) |
+                               A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) |
+                               A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0));
+       }
+
+       OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+       OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) |
+                       A3XX_PC_VSTREAM_CONTROL_N(0));
+
+       /* emit IB to binning drawcmds: */
+       OUT_IB(ring, ctx->binning_start, ctx->binning_end);
+
+       /* and then put stuff back the way it was: */
+
+       OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1);
+       OUT_RING(ring, 0x00000000);
+
+       OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
+       OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE |
+                       A3XX_SP_SP_CTRL_REG_CONSTMODE(1) |
+                       A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
+                       A3XX_SP_SP_CTRL_REG_L0MODE(0));
+
+       OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1);
+       OUT_RING(ring, 0x00000000);
+
+       OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+       OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+                       A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+                       A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
+
+       OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2);
        OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
-                       A3XX_RB_MODE_CONTROL_GMEM_BYPASS |
                        A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+       OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM |
+                       A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
+                       A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w));
+
+       OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+       OUT_RING(ring, CACHE_FLUSH);
+
+       if (ctx->screen->gpu_id == 320) {
+               /* dummy-draw workaround: */
+               OUT_PKT3(ring, CP_DRAW_INDX, 3);
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
+                               INDEX_SIZE_IGN, IGNORE_VISIBILITY));
+               OUT_RING(ring, 0);             /* NumIndices */
+       }
+
+       OUT_PKT3(ring, CP_NOP, 4);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000000);
+
+       OUT_WFI(ring);
+
+       if (ctx->screen->gpu_id == 320) {
+               emit_binning_workaround(ctx);
+       }
 }
 
 /* before first tile */
@@ -461,6 +745,18 @@ fd3_emit_tile_init(struct fd_context *ctx)
                        A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
 
        update_vsc_pipe(ctx);
+
+       if (use_hw_binning(ctx)) {
+               /* mark the end of the binning cmds: */
+               fd_ringmarker_mark(ctx->binning_end);
+
+               /* emit hw binning pass: */
+               emit_binning_pass(ctx);
+
+               patch_draws(ctx, USE_VISIBILITY);
+       } else {
+               patch_draws(ctx, IGNORE_VISIBILITY);
+       }
 }
 
 /* before mem2gmem */
@@ -472,7 +768,6 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
        struct fd_gmem_stateobj *gmem = &ctx->gmem;
        uint32_t reg;
 
-
        OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2);
        reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(gmem));
        if (pfb->zsbuf) {
@@ -499,6 +794,7 @@ fd3_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
 static void
 fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
 {
+       struct fd3_context *fd3_ctx = fd3_context(ctx);
        struct fd_ringbuffer *ring = ctx->ring;
        struct fd_gmem_stateobj *gmem = &ctx->gmem;
        struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
@@ -508,6 +804,32 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
        uint32_t x2 = tile->xoff + tile->bin_w - 1;
        uint32_t y2 = tile->yoff + tile->bin_h - 1;
 
+       if (use_hw_binning(ctx)) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p];
+
+               assert(pipe->w * pipe->h);
+
+               OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+               OUT_RING(ring, HLSQ_FLUSH);
+
+               OUT_WFI(ring);
+
+               OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+               OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) |
+                               A3XX_PC_VSTREAM_CONTROL_N(tile->n));
+
+               OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+               OUT_RING(ring, CACHE_FLUSH);
+
+               OUT_PKT3(ring, CP_SET_BIN_DATA, 2);
+               OUT_RELOC(ring, pipe->bo, 0, 0, 0);    /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */
+               OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */
+                               (tile->p * 4), 0, 0);
+       } else {
+               OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+               OUT_RING(ring, 0x00000000);
+       }
+
        OUT_PKT3(ring, CP_SET_BIN, 3);
        OUT_RING(ring, 0x00000000);
        OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
index c02b14cba391b201a9ba8307de5d1b69e841d55c..2622006ff0914dc015c080e281905c67525f7922 100644 (file)
@@ -36,6 +36,7 @@
 
 #include "fd3_program.h"
 #include "fd3_compiler.h"
+#include "fd3_emit.h"
 #include "fd3_texture.h"
 #include "fd3_util.h"
 
@@ -175,9 +176,9 @@ fd3_vp_state_bind(struct pipe_context *pctx, void *hwcso)
 }
 
 static void
-emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so)
+emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_stateobj *so)
 {
-       struct ir3_shader_info *si = &so->info;
+       const struct ir3_shader_info *si = &so->info;
        enum adreno_state_block sb;
        enum adreno_state_src src;
        uint32_t i, sz, *bin;
@@ -216,7 +217,7 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so)
 }
 
 static int
-find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic)
+find_output(const struct fd3_shader_stateobj *so, fd3_semantic semantic)
 {
        int j;
        for (j = 0; j < so->outputs_count; j++)
@@ -227,14 +228,21 @@ find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic)
 
 void
 fd3_program_emit(struct fd_ringbuffer *ring,
-               struct fd_program_stateobj *prog)
+               struct fd_program_stateobj *prog, bool binning)
 {
-       struct fd3_shader_stateobj *vp = prog->vp;
-       struct fd3_shader_stateobj *fp = prog->fp;
-       struct ir3_shader_info *vsi = &vp->info;
-       struct ir3_shader_info *fsi = &fp->info;
+       const struct fd3_shader_stateobj *vp = prog->vp;
+       const struct fd3_shader_stateobj *fp = prog->fp;
+       const struct ir3_shader_info *vsi = &vp->info;
+       const struct ir3_shader_info *fsi = &fp->info;
        int i;
 
+       if (binning) {
+               /* use dummy stateobj to simplify binning vs non-binning: */
+               static const struct fd3_shader_stateobj binning_fp = {};
+               fp = &binning_fp;
+               fsi = &fp->info;
+       }
+
        /* we could probably divide this up into things that need to be
         * emitted if frag-prog is dirty vs if vert-prog is dirty..
         */
@@ -260,11 +268,9 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 
        OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
        OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
+                       COND(binning, A3XX_SP_SP_CTRL_REG_BINNING) |
                        A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
-                       // XXX "resolve" (?) bit set on gmem->mem pass..
-//                     COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) |
-                       // XXX sometimes 0, sometimes 1:
-                       A3XX_SP_SP_CTRL_REG_LOMODE(1));
+                       A3XX_SP_SP_CTRL_REG_L0MODE(0));
 
        OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
        OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen));
@@ -272,6 +278,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,
        OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
        OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
                        A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
+                       A3XX_SP_VS_CTRL_REG0_CACHEINVALID |
                        A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
                        A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
                        A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
@@ -323,28 +330,38 @@ fd3_program_emit(struct fd_ringbuffer *ring,
                        A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
        OUT_RELOC(ring, vp->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */
 
-       OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
-       OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));
-
-       OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
-       OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
-                       A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
-                       A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
-                       A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
-                       A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
-                       A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
-                       A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
-                       COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
-                       A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
-       OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
-                       A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
-                       A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) |
-                       A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
-
-       OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
-       OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
-                       A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
-       OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
+       if (binning) {
+               OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
+               OUT_RING(ring, 0x00000000);
+
+               OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
+               OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
+                               A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
+               OUT_RING(ring, 0x00000000);
+       } else {
+               OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
+               OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));
+
+               OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
+               OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
+                               A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
+                               A3XX_SP_FS_CTRL_REG0_CACHEINVALID |
+                               A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
+                               A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
+                               A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
+                               A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+                               A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
+                               COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
+                               A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
+               OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
+                               A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
+                               A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) |
+                               A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
+               OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
+               OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
+                               A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
+               OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
+       }
 
        OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
        OUT_RING(ring, 0x00000000);        /* SP_FS_FLAT_SHAD_MODE_REG_0 */
@@ -360,24 +377,31 @@ fd3_program_emit(struct fd_ringbuffer *ring,
        OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
        OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
 
-       OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
-       OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
-                       A3XX_VPC_ATTR_THRDASSIGN(1) |
-                       A3XX_VPC_ATTR_LMSIZE(1));
-       OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
-                       A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));
-
-       OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
-       OUT_RING(ring, fp->vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
-       OUT_RING(ring, fp->vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
-       OUT_RING(ring, fp->vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
-       OUT_RING(ring, fp->vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */
-
-       OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
-       OUT_RING(ring, fp->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
-       OUT_RING(ring, fp->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
-       OUT_RING(ring, fp->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
-       OUT_RING(ring, fp->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */
+       if (binning) {
+               OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
+               OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
+                               A3XX_VPC_ATTR_LMSIZE(1));
+               OUT_RING(ring, 0x00000000);
+       } else {
+               OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
+               OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
+                               A3XX_VPC_ATTR_THRDASSIGN(1) |
+                               A3XX_VPC_ATTR_LMSIZE(1));
+               OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
+                               A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));
+
+               OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
+               OUT_RING(ring, fp->vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
+               OUT_RING(ring, fp->vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
+               OUT_RING(ring, fp->vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
+               OUT_RING(ring, fp->vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */
+
+               OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
+               OUT_RING(ring, fp->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
+               OUT_RING(ring, fp->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
+               OUT_RING(ring, fp->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
+               OUT_RING(ring, fp->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */
+       }
 
        OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
        OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
@@ -388,10 +412,12 @@ fd3_program_emit(struct fd_ringbuffer *ring,
        OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
        OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
 
-       emit_shader(ring, fp);
+       if (!binning) {
+               emit_shader(ring, fp);
 
-       OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
-       OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
+               OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
+               OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
+       }
 
        OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
        OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
index 85c22a54cf7b72d5b83674d3d545f6e15576b811..bd6483ff42cb7497fb2d30af2767785291fca5f5 100644 (file)
@@ -117,7 +117,7 @@ struct fd3_shader_stateobj {
 };
 
 void fd3_program_emit(struct fd_ringbuffer *ring,
-               struct fd_program_stateobj *prog);
+               struct fd_program_stateobj *prog, bool binning);
 
 void fd3_prog_init(struct pipe_context *pctx);
 void fd3_prog_fini(struct pipe_context *pctx);
index 28be508e329ec6195e215b8e5e52da9e75d2fee6..23f6a67734d1d5949d6cf61c86df87566e0b9078 100644 (file)
 #include "freedreno_gmem.h"
 #include "freedreno_util.h"
 
-static void
-fd_context_next_rb(struct pipe_context *pctx)
+static struct fd_ringbuffer *next_rb(struct fd_context *ctx)
 {
-       struct fd_context *ctx = fd_context(pctx);
        struct fd_ringbuffer *ring;
        uint32_t ts;
 
-       fd_ringmarker_del(ctx->draw_start);
-       fd_ringmarker_del(ctx->draw_end);
-
        /* grab next ringbuffer: */
        ring = ctx->rings[(ctx->rings_idx++) % ARRAY_SIZE(ctx->rings)];
 
@@ -56,10 +51,36 @@ fd_context_next_rb(struct pipe_context *pctx)
 
        fd_ringbuffer_reset(ring);
 
+       return ring;
+}
+
+static void
+fd_context_next_rb(struct pipe_context *pctx)
+{
+       struct fd_context *ctx = fd_context(pctx);
+       struct fd_ringbuffer *ring;
+
+       fd_ringmarker_del(ctx->draw_start);
+       fd_ringmarker_del(ctx->draw_end);
+
+       ring = next_rb(ctx);
+
        ctx->draw_start = fd_ringmarker_new(ring);
        ctx->draw_end = fd_ringmarker_new(ring);
 
+       fd_ringbuffer_set_parent(ring, NULL);
        ctx->ring = ring;
+
+       fd_ringmarker_del(ctx->binning_start);
+       fd_ringmarker_del(ctx->binning_end);
+
+       ring = next_rb(ctx);
+
+       ctx->binning_start = fd_ringmarker_new(ring);
+       ctx->binning_end = fd_ringmarker_new(ring);
+
+       fd_ringbuffer_set_parent(ring, ctx->ring);
+       ctx->binning_ring = ring;
 }
 
 /* emit accumulated render cmds, needed for example if render target has
@@ -121,6 +142,10 @@ fd_context_destroy(struct pipe_context *pctx)
 
        DBG("");
 
+       util_slab_destroy(&ctx->transfer_pool);
+
+       util_dynarray_fini(&ctx->draw_patches);
+
        if (ctx->blitter)
                util_blitter_destroy(ctx->blitter);
 
@@ -129,7 +154,11 @@ fd_context_destroy(struct pipe_context *pctx)
 
        fd_ringmarker_del(ctx->draw_start);
        fd_ringmarker_del(ctx->draw_end);
-       fd_ringbuffer_del(ctx->ring);
+       fd_ringmarker_del(ctx->binning_start);
+       fd_ringmarker_del(ctx->binning_end);
+
+       for (i = 0; i < ARRAY_SIZE(ctx->rings); i++)
+               fd_ringbuffer_del(ctx->rings[i]);
 
        for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
                struct fd_vsc_pipe *pipe = &ctx->pipe[i];
@@ -176,6 +205,8 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
        fd_context_next_rb(pctx);
        fd_reset_rmw_state(ctx);
 
+       util_dynarray_init(&ctx->draw_patches);
+
        util_slab_create(&ctx->transfer_pool, sizeof(struct pipe_transfer),
                        16, UTIL_SLAB_SINGLETHREADED);
 
index a8abbca7a62e031246b917db145e5826c76a8af0..a0227e49c032d50c396d2b460916cd3ec5cd12be 100644 (file)
@@ -111,7 +111,7 @@ struct fd_context {
         */
        enum {
                /* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */
-               FD_BUFFER_COLOR   = PIPE_CLEAR_COLOR,
+               FD_BUFFER_COLOR   = PIPE_CLEAR_COLOR0,
                FD_BUFFER_DEPTH   = PIPE_CLEAR_DEPTH,
                FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL,
                FD_BUFFER_ALL     = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL,
@@ -148,9 +148,14 @@ struct fd_context {
        struct fd_ringbuffer *rings[4];
        unsigned rings_idx;
 
+       /* normal draw/clear cmds: */
        struct fd_ringbuffer *ring;
        struct fd_ringmarker *draw_start, *draw_end;
 
+       /* binning pass draw/clear cmds: */
+       struct fd_ringbuffer *binning_ring;
+       struct fd_ringmarker *binning_start, *binning_end;
+
        /* Keep track if WAIT_FOR_IDLE is needed for registers we need
         * to update via RMW:
         */
@@ -165,6 +170,11 @@ struct fd_context {
                uint32_t rbrc_draw;
        } rmw;
 
+       /* Keep track of DRAW initiators that need to be patched up depending
+        * on whether we are using binning or not:
+        */
+       struct util_dynarray draw_patches;
+
        struct pipe_scissor_state scissor;
 
        /* we don't have a disable/enable bit for scissor, so instead we keep
index 0069438c87da5c8bd5433661cd5aa18958136e5d..d80f35656143eeb5d0853633a23b249d8ec85838 100644 (file)
@@ -54,7 +54,9 @@ size2indextype(unsigned index_size)
 
 /* this is same for a2xx/a3xx, so split into helper: */
 void
-fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               enum pc_di_vis_cull_mode vismode,
+               const struct pipe_draw_info *info)
 {
        struct pipe_index_buffer *idx = &ctx->indexbuf;
        struct fd_bo *idx_bo = NULL;
@@ -78,8 +80,8 @@ fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info)
                src_sel = DI_SRC_SEL_AUTO_INDEX;
        }
 
-       fd_draw(ctx, ctx->primtypes[info->mode], src_sel, info->count,
-                       idx_type, idx_size, idx_offset, idx_bo);
+       fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+                       info->count, idx_type, idx_size, idx_offset, idx_bo);
 }
 
 static void
@@ -180,6 +182,7 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
        ctx->clear(ctx, buffers, color, depth, stencil);
 
        ctx->dirty |= FD_DIRTY_ZSA |
+                       FD_DIRTY_VIEWPORT |
                        FD_DIRTY_RASTERIZER |
                        FD_DIRTY_SAMPLE_MASK |
                        FD_DIRTY_PROG |
index 190c0e52d24e8b153611b41711051f5b7b0beb1b..e8bb420889e6e3447a010d0a9e3820c5395e9eae 100644 (file)
 
 struct fd_ringbuffer;
 
-void fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info);
+void fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               enum pc_di_vis_cull_mode vismode,
+               const struct pipe_draw_info *info);
 
 void fd_draw_init(struct pipe_context *pctx);
 
 static inline void
-fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype,
+fd_draw(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               enum pc_di_primtype primtype,
+               enum pc_di_vis_cull_mode vismode,
                enum pc_di_src_sel src_sel, uint32_t count,
                enum pc_di_index_size idx_type,
                uint32_t idx_size, uint32_t idx_offset,
                struct fd_bo *idx_bo)
 {
-       struct fd_ringbuffer *ring = ctx->ring;
-
        /* for debug after a lock up, write a unique counter value
         * to scratch7 for each draw, to make it easier to match up
         * register dumps to cmdstream.  The combination of IB
@@ -64,7 +66,7 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype,
                OUT_PKT3(ring, CP_DRAW_INDX, 3);
                OUT_RING(ring, 0x00000000);
                OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
-                               INDEX_SIZE_IGN, IGNORE_VISIBILITY));
+                               INDEX_SIZE_IGN, USE_VISIBILITY));
                OUT_RING(ring, 0);             /* NumIndices */
 
                /* ugg, hard-code register offset to avoid pulling in the
@@ -76,8 +78,15 @@ fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype,
 
        OUT_PKT3(ring, CP_DRAW_INDX, idx_bo ? 5 : 3);
        OUT_RING(ring, 0x00000000);        /* viz query info. */
-       OUT_RING(ring, DRAW(primtype, src_sel,
-                       idx_type, IGNORE_VISIBILITY));
+       if (vismode == USE_VISIBILITY) {
+               /* leave vis mode blank for now, it will be patched up when
+                * we know if we are binning or not
+                */
+               OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0),
+                               &ctx->draw_patches);
+       } else {
+               OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode));
+       }
        OUT_RING(ring, count);             /* NumIndices */
        if (idx_bo) {
                OUT_RELOC(ring, idx_bo, idx_offset, 0, 0);
index 47f7a310e8c747d6a7b07de5bb18726ac5159733..0270538a3d0e080f321ffb591f4642c3c499c4d3 100644 (file)
@@ -85,7 +85,8 @@ calculate_tiles(struct fd_context *ctx)
        uint32_t bin_w, bin_h;
        uint32_t max_width = bin_width(ctx);
        uint32_t cpp = 4;
-       uint32_t i, j, t, p, n, xoff, yoff;
+       uint32_t i, j, t, xoff, yoff;
+       uint32_t tpp_x, tpp_y;
        bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
 
        if (pfb->cbufs[0])
@@ -145,20 +146,65 @@ calculate_tiles(struct fd_context *ctx)
        gmem->width = width;
        gmem->height = height;
 
-       /* Assign tiles and pipes:
-        * NOTE we currently take a rather simplistic approach of
-        * mapping rows of tiles to a pipe.  At some point it might
-        * be worth playing with different strategies and seeing if
-        * that makes much impact on performance.
+       /*
+        * Assign tiles and pipes:
+        *
+        * At some point it might be worth playing with different
+        * strategies and seeing if that makes much impact on
+        * performance.
         */
-       t = p = n = 0;
+
+#define div_round_up(v, a)  (((v) + (a) - 1) / (a))
+       /* figure out number of tiles per pipe: */
+       tpp_x = tpp_y = 1;
+       while (div_round_up(nbins_y, tpp_y) > 8)
+               tpp_y += 2;
+       while ((div_round_up(nbins_y, tpp_y) *
+                       div_round_up(nbins_x, tpp_x)) > 8)
+               tpp_x += 1;
+
+       /* configure pipes: */
+       xoff = yoff = 0;
+       for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+
+               if (xoff >= nbins_x) {
+                       xoff = 0;
+                       yoff += tpp_y;
+               }
+
+               if (yoff >= nbins_y) {
+                       break;
+               }
+
+               pipe->x = xoff;
+               pipe->y = yoff;
+               pipe->w = MIN2(tpp_x, nbins_x - xoff);
+               pipe->h = MIN2(tpp_y, nbins_y - yoff);
+
+               xoff += tpp_x;
+       }
+
+       for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               pipe->x = pipe->y = pipe->w = pipe->h = 0;
+       }
+
+#if 0 /* debug */
+       printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
+       for (i = 0; i < 8; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               printf("pipe[%d]: %ux%u @ %u,%u\n", i,
+                               pipe->w, pipe->h, pipe->x, pipe->y);
+       }
+#endif
+
+       /* configure tiles: */
+       t = 0;
        yoff = miny;
        for (i = 0; i < nbins_y; i++) {
-               struct fd_vsc_pipe *pipe = &ctx->pipe[p];
                uint32_t bw, bh;
 
-               assert(p < ARRAY_SIZE(ctx->pipe));
-
                xoff = minx;
 
                /* clip bin height: */
@@ -166,13 +212,20 @@ calculate_tiles(struct fd_context *ctx)
 
                for (j = 0; j < nbins_x; j++) {
                        struct fd_tile *tile = &ctx->tile[t];
+                       uint32_t n, p;
 
                        assert(t < ARRAY_SIZE(ctx->tile));
 
+                       /* pipe number: */
+                       p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
+
+                       /* slot number: */
+                       n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
+
                        /* clip bin width: */
                        bw = MIN2(bin_w, minx + width - xoff);
 
-                       tile->n = n++;
+                       tile->n = n;
                        tile->p = p;
                        tile->bin_w = bw;
                        tile->bin_h = bh;
@@ -184,22 +237,19 @@ calculate_tiles(struct fd_context *ctx)
                        xoff += bw;
                }
 
-               /* one pipe per row: */
-               pipe->x = 0;
-               pipe->y = i;
-               pipe->w = nbins_x;
-               pipe->h = 1;
-
-               p++;
-               n = 0;
-
                yoff += bh;
        }
 
-       for (; p < ARRAY_SIZE(ctx->pipe); p++) {
-               struct fd_vsc_pipe *pipe = &ctx->pipe[p];
-               pipe->x = pipe->y = pipe->w = pipe->h = 0;
+#if 0 /* debug */
+       t = 0;
+       for (i = 0; i < nbins_y; i++) {
+               for (j = 0; j < nbins_x; j++) {
+                       struct fd_tile *tile = &ctx->tile[t++];
+                       printf("|p:%u n:%u|", tile->p, tile->n);
+               }
+               printf("\n");
        }
+#endif
 }
 
 static void
@@ -259,6 +309,7 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
 
        /* mark the end of the clear/draw cmds before emitting per-tile cmds: */
        fd_ringmarker_mark(ctx->draw_end);
+       fd_ringmarker_mark(ctx->binning_end);
 
        if (sysmem) {
                DBG("rendering sysmem (%s/%s)",
@@ -277,8 +328,9 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
        /* GPU executes starting from tile cmds, which IB back to draw cmds: */
        fd_ringmarker_flush(ctx->draw_end);
 
-       /* mark start for next draw cmds: */
+       /* mark start for next draw/binning cmds: */
        fd_ringmarker_mark(ctx->draw_start);
+       fd_ringmarker_mark(ctx->binning_start);
 
        fd_reset_rmw_state(ctx);
 
index 319e29f3adae82db9343a699d934470b946a3a7b..28a09166acdaca2ce087e30fb498d30dada40d1f 100644 (file)
@@ -64,12 +64,15 @@ static const struct debug_named_value debug_options[] = {
                {"direct",    FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"},
                {"dbypass",   FD_DBG_DBYPASS,"Disable GMEM bypass"},
                {"fraghalf",  FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
+               {"binning",   FD_DBG_BINNING,  "Enable hw binning"},
+               {"dbinning",  FD_DBG_DBINNING, "Disable hw binning"},
                DEBUG_NAMED_VALUE_END
 };
 
 DEBUG_GET_ONCE_FLAGS_OPTION(fd_mesa_debug, "FD_MESA_DEBUG", debug_options, 0)
 
 int fd_mesa_debug = 0;
+bool fd_binning_enabled = false; /* default to off for now */
 
 static const char *
 fd_screen_get_name(struct pipe_screen *pscreen)
@@ -386,6 +389,12 @@ fd_screen_create(struct fd_device *dev)
 
        fd_mesa_debug = debug_get_option_fd_mesa_debug();
 
+       if (fd_mesa_debug & FD_DBG_BINNING)
+               fd_binning_enabled = true;
+
+       if (fd_mesa_debug & FD_DBG_DBINNING)
+               fd_binning_enabled = false;
+
        if (!screen)
                return NULL;
 
index 48d346eb35b3f838e2522c42c3e342d2e72186e2..fae5ba06b1d07311aac460082e82bc7c7e49f99b 100644 (file)
@@ -37,6 +37,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_half.h"
+#include "util/u_dynarray.h"
 
 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
@@ -52,16 +53,19 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 /* TBD if it is same on a2xx, but for now: */
 #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS
 
-#define FD_DBG_MSGS     0x01
-#define FD_DBG_DISASM   0x02
-#define FD_DBG_DCLEAR   0x04
-#define FD_DBG_DGMEM    0x08
-#define FD_DBG_DSCIS    0x10
-#define FD_DBG_DIRECT   0x20
-#define FD_DBG_DBYPASS  0x40
-#define FD_DBG_FRAGHALF 0x80
+#define FD_DBG_MSGS     0x0001
+#define FD_DBG_DISASM   0x0002
+#define FD_DBG_DCLEAR   0x0004
+#define FD_DBG_DGMEM    0x0008
+#define FD_DBG_DSCIS    0x0010
+#define FD_DBG_DIRECT   0x0020
+#define FD_DBG_DBYPASS  0x0040
+#define FD_DBG_FRAGHALF 0x0080
+#define FD_DBG_BINNING  0x0100
+#define FD_DBG_DBINNING 0x0200
 
 extern int fd_mesa_debug;
+extern bool fd_binning_enabled;
 
 #define DBG(fmt, ...) \
                do { if (fd_mesa_debug & FD_DBG_MSGS) \
@@ -87,6 +91,13 @@ static inline uint32_t DRAW(enum pc_di_primtype prim_type,
                        (1                 << 14);
 }
 
+/* for tracking cmdstream positions that need to be patched: */
+struct fd_cs_patch {
+       uint32_t *cs;
+       uint32_t val;
+};
+#define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch))
+#define fd_patch_element(buf, i)   util_dynarray_element(buf, struct fd_cs_patch, i)
 
 static inline enum pipe_format
 pipe_surface_format(struct pipe_surface *psurf)
@@ -110,6 +121,21 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
        *(ring->cur++) = data;
 }
 
+/* like OUT_RING(), but instead of writing the dword, records it as a patch point in 'buf' */
+static inline void
+OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data,
+               struct util_dynarray *buf)
+{
+       if (LOG_DWORDS) {
+               DBG("ring[%p]: OUT_RINGP  %04x:  %08x", ring,
+                               (uint32_t)(ring->cur - ring->last_start), data);
+       }
+       util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){
+               .cs  = ring->cur++,
+               .val = data,
+       }));
+}
+
 static inline void
 OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
                uint32_t offset, uint32_t or, int32_t shift)
@@ -132,7 +158,7 @@ OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo,
                uint32_t offset, uint32_t or, int32_t shift)
 {
        if (LOG_DWORDS) {
-               DBG("ring[%p]: OUT_RELOC   %04x:  %p+%u << %d", ring,
+               DBG("ring[%p]: OUT_RELOCW  %04x:  %p+%u << %d", ring,
                                (uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
        }
        fd_ringbuffer_reloc(ring, &(struct fd_reloc){