From cb540c21f24720436356ab34f15e440a58e3e55d Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 22 Feb 2014 10:47:27 -0500 Subject: [PATCH] freedreno/a3xx: binning-pass vertex shader variant Now that we have the infrastructure for shader variants, add support to generate an optimized shader for hw binning pass (with varyings/outputs other than position/pointsize removed). This exposes the possibility that the shader uses fewer constants than what is bound, so we have to take care to not emit consts beyond what the shader uses, lest we provoke the wrath of the HLSQ lockup! Signed-off-by: Rob Clark --- .../drivers/freedreno/a3xx/fd3_compiler.c | 25 ++++++++++- src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 41 +++++++++++++++---- .../drivers/freedreno/a3xx/fd3_program.c | 9 ---- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index 54b36265ddf..905af54e48d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -2214,7 +2214,7 @@ fd3_compile_shader(struct fd3_shader_variant *so, { struct fd3_compile_context ctx; struct ir3_block *block; - unsigned i, actual_in; + unsigned i, j, actual_in; int ret = 0; assert(!so->ir); @@ -2232,6 +2232,29 @@ fd3_compile_shader(struct fd3_shader_variant *so, block = ctx.block; + /* at this point, for binning pass, throw away unneeded outputs: */ + if (key.binning_pass) { + for (i = 0, j = 0; i < so->outputs_count; i++) { + unsigned name = sem2name(so->outputs[i].semantic); + unsigned idx = sem2name(so->outputs[i].semantic); + + /* throw away everything but first position/psize */ + if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || + (name == TGSI_SEMANTIC_PSIZE))) { + if (i != j) { + so->outputs[j] = so->outputs[i]; + block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; + block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; + block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; + block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; + } + j++; + } + } + so->outputs_count = j; + block->noutputs = j * 4; + } + /* at this point, we want the kill's in the outputs array too, * so that they get scheduled (since they have no dst).. we've * already ensured that the array is big enough in push_block(): diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 5bfd976170c..50271fa137c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -90,6 +90,7 @@ emit_constants(struct fd_ringbuffer *ring, struct fd3_shader_variant *shader) { uint32_t enabled_mask = constbuf->enabled_mask; + uint32_t first_immediate; uint32_t base = 0; unsigned i; @@ -97,6 +98,13 @@ emit_constants(struct fd_ringbuffer *ring, // they are clobbered by a clear, gmem2mem, or mem2gmem.. constbuf->dirty_mask = enabled_mask; + /* in particular, with binning shader and a unneeded consts no + * longer referenced, we could end up w/ constlen that is smaller + * than first_immediate. In that case truncate the user consts + * early to avoid HLSQ lockup caused by writing too many consts + */ + first_immediate = MIN2(shader->first_immediate, shader->constlen); + /* emit user constants: */ while (enabled_mask) { unsigned index = ffs(enabled_mask) - 1; @@ -109,10 +117,14 @@ emit_constants(struct fd_ringbuffer *ring, /* gallium could leave const buffers bound above what the * current shader uses.. don't let that confuse us. */ - if (base >= (4 * shader->first_immediate)) + if (base >= (4 * first_immediate)) break; if (constbuf->dirty_mask & (1 << index)) { + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (4 * first_immediate) - base); fd3_emit_constant(ring, sb, base, cb->buffer_offset, size, cb->user_buffer, cb->buffer); @@ -332,6 +344,15 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, j++; } } + + OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); + OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) | + A3XX_VFD_CONTROL_0_PACKETSIZE(2) | + A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) | + A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j)); + OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX + A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) | + A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0))); } void @@ -429,11 +450,13 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { struct fd3_rasterizer_stateobj *rasterizer = fd3_rasterizer_stateobj(ctx->rasterizer); - uint32_t stride_in_vpc; + uint32_t stride_in_vpc = 0; - stride_in_vpc = align(fp->total_in, 4) / 4; - if (stride_in_vpc > 0) - stride_in_vpc = MAX2(stride_in_vpc, 2); + if (!key.binning_pass) { + stride_in_vpc = align(fp->total_in, 4) / 4; + if (stride_in_vpc > 0) + stride_in_vpc = MAX2(stride_in_vpc, 2); + } OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); OUT_RING(ring, rasterizer->pc_prim_vtx_cntl | @@ -480,9 +503,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, emit_constants(ring, SB_VERT_SHADER, &ctx->constbuf[PIPE_SHADER_VERTEX], (prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL); - emit_constants(ring, SB_FRAG_SHADER, - &ctx->constbuf[PIPE_SHADER_FRAGMENT], - (prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL); + if (!key.binning_pass) { + emit_constants(ring, SB_FRAG_SHADER, + &ctx->constbuf[PIPE_SHADER_FRAGMENT], + (prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL); + } } if ((dirty & FD_DIRTY_BLEND) && ctx->blend) { diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 01502ce955e..6fc39a96380 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -532,15 +532,6 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ } - - OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); - OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) | - A3XX_VFD_CONTROL_0_PACKETSIZE(2) | - A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vp->inputs_count) | - A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vp->inputs_count)); - OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX - A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) | - A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0))); } /* hack.. until we figure out how to deal w/ vpsrepl properly.. */ -- 2.30.2