From: Rob Clark Date: Tue, 8 Apr 2014 18:14:43 +0000 (-0400) Subject: freedreno/a3xx: deal with optimized tex instructions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ee839cc6ef92d37ec6a44e6036e7a2c46172a16a;p=mesa.git freedreno/a3xx: deal with optimized tex instructions Keep track of whether we actually have any sam instructions in the resulting shader, rather than using TGSI SAMP declarations. If the sam instruction is optimized out, because the result is not used, we don't want to emit texture state, etc. In fact emitting sampler state and/or setting PIXLODENABLE bit when there are no texture fetches seems to cause lockup. In theory this should never happen for a "normal" shader, unless the state tracker is wonky. But it is a very real possibility for binning pass shaders. Signed-off-by: Rob Clark --- diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index 1d99e5caa99..911330cde2a 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -2054,12 +2054,6 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) } } -static void -decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - ctx->so->samplers_count++; -} - /* from TGSI perspective, we actually have inputs. But most of the "inputs" * for a fragment shader are just bary.f instructions. The *actual* inputs * from the hw perspective are the frag_pos and optionally frag_coord and @@ -2160,8 +2154,6 @@ compile_instructions(struct fd3_compile_context *ctx) decl_out(ctx, decl); } else if (decl->Declaration.File == TGSI_FILE_INPUT) { decl_in(ctx, decl); - } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { - decl_samp(ctx, decl); } break; } @@ -2320,7 +2312,7 @@ fd3_compile_shader(struct fd3_shader_variant *so, } ret = ir3_block_ra(block, so->type, key.half_precision, - so->frag_coord, so->frag_face); + so->frag_coord, so->frag_face, &so->has_samp); if (ret) goto out; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c index 76de287b163..ee58591fffc 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c @@ -1417,7 +1417,7 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) static void decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) { - ctx->so->samplers_count++; + ctx->so->has_samp = true; } static void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 00f1014444b..b1cf3fd131a 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -177,7 +177,7 @@ emit_textures(struct fd_ringbuffer *ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); for (i = 0; i < tex->num_samplers; i++) { static const struct fd3_sampler_stateobj dummy_sampler = {}; - struct fd3_sampler_stateobj *sampler = tex->samplers[i] ? + const struct fd3_sampler_stateobj *sampler = tex->samplers[i] ? fd3_sampler_stateobj(tex->samplers[i]) : &dummy_sampler; OUT_RING(ring, sampler->texsamp0); @@ -542,11 +542,19 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_VERTTEX | FD_DIRTY_FRAGTEX)) fd_wfi(ctx, ring); - if (dirty & FD_DIRTY_VERTTEX) - emit_textures(ring, SB_VERT_TEX, &ctx->verttex); + if (dirty & FD_DIRTY_VERTTEX) { + if (vp->has_samp) + emit_textures(ring, SB_VERT_TEX, &ctx->verttex); + else + dirty &= ~FD_DIRTY_VERTTEX; + } - if (dirty & FD_DIRTY_FRAGTEX) - emit_textures(ring, SB_FRAG_TEX, &ctx->fragtex); + if (dirty & FD_DIRTY_FRAGTEX) { + if (fp->has_samp) + emit_textures(ring, SB_FRAG_TEX, &ctx->fragtex); + else + dirty &= ~FD_DIRTY_FRAGTEX; + } ctx->dirty &= ~dirty; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 09cadf81cbf..b5544e8c358 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -120,7 +120,7 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) v->inputs_count = 0; v->outputs_count = 0; v->total_in = 0; - v->samplers_count = 0; + v->has_samp = false; v->immediates_count = 0; } } else { @@ -397,7 +397,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | - COND(vp->samplers_count > 0, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) | + COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) | A3XX_SP_VS_CTRL_REG0_LENGTH(vp->instrlen)); OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | @@ -475,7 +475,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | + COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 8d4fd57ae75..e0866c1d008 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -107,8 +107,8 @@ struct fd3_shader_variant { unsigned total_in; /* sum of inputs (scalar) */ - /* samplers: */ - unsigned samplers_count; + /* do we have one or more texture sample instructions: */ + bool has_samp; /* const reg # of first immediate, ie. 1 == c1 * (not regid, because TGSI thinks in terms of vec4 registers, diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 09052346992..872f47883bb 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -385,8 +385,8 @@ void ir3_block_sched(struct ir3_block *block); /* register assignment: */ int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision, bool frag_coord, bool frag_face); - + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp); #ifndef ARRAY_SIZE # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 4e48eded2bb..57c68c729c5 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -56,6 +56,7 @@ struct ir3_ra_ctx { bool half_precision; bool frag_coord; bool frag_face; + bool has_samp; int cnt; bool error; }; @@ -654,8 +655,17 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (is_sfu(n)) regmask_set(&needs_ss, n->regs[0]); - if (is_tex(n)) + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. + */ + ctx->has_samp = true; regmask_set(&needs_sy, n->regs[0]); + } /* both tex/sfu appear to not always immediately consume * their src register(s): @@ -730,7 +740,8 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) } int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision, bool frag_coord, bool frag_face) + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp) { struct ir3_ra_ctx ctx = { .block = block, @@ -739,6 +750,11 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type, .frag_coord = frag_coord, .frag_face = frag_face, }; + int ret; + ir3_shader_clear_mark(block->shader); - return block_ra(&ctx, block); + ret = block_ra(&ctx, block); + *has_samp = ctx.has_samp; + + return ret; }