freedreno/a3xx: binning-pass vertex shader variant
author Rob Clark <robclark@freedesktop.org>
Sat, 22 Feb 2014 15:47:27 +0000 (10:47 -0500)
committer Rob Clark <robclark@freedesktop.org>
Sun, 2 Mar 2014 16:26:35 +0000 (11:26 -0500)
Now that we have the infrastructure for shader variants, add support to
generate an optimized shader for hw binning pass (with varyings/outputs
other than position/pointsize removed).  This exposes the possibility
that the shader uses fewer constants than what is bound, so we have to
take care to not emit consts beyond what the shader uses, lest we
provoke the wrath of the HLSQ lockup!

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/a3xx/fd3_program.c

index 54b36265ddf620c89272c18f8eecf3ae767660df..905af54e48dc02de4a5432f13068fa6f8a29a9e2 100644 (file)
@@ -2214,7 +2214,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 {
        struct fd3_compile_context ctx;
        struct ir3_block *block;
-       unsigned i, actual_in;
+       unsigned i, j, actual_in;
        int ret = 0;
 
        assert(!so->ir);
@@ -2232,6 +2232,29 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 
        block = ctx.block;
 
+       /* binning pass: compact outputs[] in place, dropping unneeded outputs: */
+       if (key.binning_pass) {
+               for (i = 0, j = 0; i < so->outputs_count; i++) {
+                       unsigned name = sem2name(so->outputs[i].semantic);
+                       unsigned idx = sem2idx(so->outputs[i].semantic);
+
+                       /* keep only position/psize with semantic index 0 */
+                       if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
+                                       (name == TGSI_SEMANTIC_PSIZE))) {
+                               if (i != j) {
+                                       so->outputs[j] = so->outputs[i];
+                                       block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
+                                       block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
+                                       block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
+                                       block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+                               }
+                               j++;
+                       }
+               }
+               so->outputs_count = j;
+               block->noutputs = j * 4;
+       }
+
        /* at this point, we want the kill's in the outputs array too,
         * so that they get scheduled (since they have no dst).. we've
         * already ensured that the array is big enough in push_block():
index 5bfd976170cd5974b16c98d197e52c98bd14d6ef..50271fa137c5320df9dfe903eee4be71803003c0 100644 (file)
@@ -90,6 +90,7 @@ emit_constants(struct fd_ringbuffer *ring,
                struct fd3_shader_variant *shader)
 {
        uint32_t enabled_mask = constbuf->enabled_mask;
+       uint32_t first_immediate;
        uint32_t base = 0;
        unsigned i;
 
@@ -97,6 +98,13 @@ emit_constants(struct fd_ringbuffer *ring,
        // they are clobbered by a clear, gmem2mem, or mem2gmem..
        constbuf->dirty_mask = enabled_mask;
 
+       /* in particular, with a binning shader and unneeded consts no
+        * longer referenced, we could end up w/ constlen that is smaller
+        * than first_immediate.  In that case truncate the user consts
+        * early to avoid HLSQ lockup caused by writing too many consts
+        */
+       first_immediate = MIN2(shader->first_immediate, shader->constlen);
+
        /* emit user constants: */
        while (enabled_mask) {
                unsigned index = ffs(enabled_mask) - 1;
@@ -109,10 +117,14 @@ emit_constants(struct fd_ringbuffer *ring,
                /* gallium could leave const buffers bound above what the
                 * current shader uses.. don't let that confuse us.
                 */
-               if (base >= (4 * shader->first_immediate))
+               if (base >= (4 * first_immediate))
                        break;
 
                if (constbuf->dirty_mask & (1 << index)) {
+                       /* and even if the start of the const buffer is before
+                        * first_immediate, the end may not be:
+                        */
+                       size = MIN2(size, (4 * first_immediate) - base);
                        fd3_emit_constant(ring, sb, base,
                                        cb->buffer_offset, size,
                                        cb->user_buffer, cb->buffer);
@@ -332,6 +344,15 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
                        j++;
                }
        }
+
+       OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
+       OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
+                       A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
+                       A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) |
+                       A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j));
+       OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
+                       A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
+                       A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
 }
 
 void
@@ -429,11 +450,13 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
        if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
                struct fd3_rasterizer_stateobj *rasterizer =
                                fd3_rasterizer_stateobj(ctx->rasterizer);
-               uint32_t stride_in_vpc;
+               uint32_t stride_in_vpc = 0;
 
-               stride_in_vpc = align(fp->total_in, 4) / 4;
-               if (stride_in_vpc > 0)
-                       stride_in_vpc = MAX2(stride_in_vpc, 2);
+               if (!key.binning_pass) {
+                       stride_in_vpc = align(fp->total_in, 4) / 4;
+                       if (stride_in_vpc > 0)
+                               stride_in_vpc = MAX2(stride_in_vpc, 2);
+               }
 
                OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
                OUT_RING(ring, rasterizer->pc_prim_vtx_cntl |
@@ -480,9 +503,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                emit_constants(ring,  SB_VERT_SHADER,
                                &ctx->constbuf[PIPE_SHADER_VERTEX],
                                (prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
-               emit_constants(ring, SB_FRAG_SHADER,
-                               &ctx->constbuf[PIPE_SHADER_FRAGMENT],
-                               (prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+               if (!key.binning_pass) {
+                       emit_constants(ring, SB_FRAG_SHADER,
+                                       &ctx->constbuf[PIPE_SHADER_FRAGMENT],
+                                       (prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+               }
        }
 
        if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
index 01502ce955ec4e2100bf351b17dc14c973c8c518..6fc39a96380df2d1854bf066de47afc4c63b5c29 100644 (file)
@@ -532,15 +532,6 @@ fd3_program_emit(struct fd_ringbuffer *ring,
                OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
                OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
        }
-
-       OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
-       OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
-                       A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
-                       A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vp->inputs_count) |
-                       A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vp->inputs_count));
-       OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
-                       A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
-                       A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
 }
 
 /* hack.. until we figure out how to deal w/ vpsrepl properly.. */