From 7309c6126f2987d703b2f30b3cb56edf97d437d3 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 1 Oct 2014 15:26:26 -0400 Subject: [PATCH] freedreno/a3xx: handle large shader program sizes Above a certain limit use CACHE mode instead of BUFFER mode. This should solve gpu hangs with large shader programs. Signed-off-by: Rob Clark --- .../drivers/freedreno/a3xx/fd3_program.c | 74 ++++++++++++++++--- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index d7fe42ec4df..d2a32486781 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -179,6 +179,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, { const struct ir3_shader_variant *vp, *fp; const struct ir3_info *vsi, *fsi; + enum a3xx_instrbuffermode fpbuffer, vpbuffer; + uint32_t fpbuffersz, vpbuffersz, fsoff; uint32_t pos_regid, posz_regid, psize_regid, color_regid; int i, j, k; @@ -195,6 +197,46 @@ fd3_program_emit(struct fd_ringbuffer *ring, vsi = &vp->info; fsi = &fp->info; + fpbuffer = BUFFER; + vpbuffer = BUFFER; + fpbuffersz = fp->instrlen; + vpbuffersz = vp->instrlen; + + /* + * Decide whether to use BUFFER or CACHE mode for VS and FS. It + * appears like 256 is the hard limit, but when the combined size + * exceeds 128 then blob will try to keep FS in BUFFER mode and + * switch to CACHE for VS until VS is too large. The blob seems + * to switch FS out of BUFFER mode at slightly under 128. But + * a bit fuzzy on the decision tree, so use slightly conservative + * limits. + * + * TODO check if these thresholds for BUFFER vs CACHE mode are the + * same for all a3xx or whether we need to consider the gpuid + */ + + if ((fpbuffersz + vpbuffersz) > 128) { + if (fpbuffersz < 112) { + /* FP:BUFFER VP:CACHE */ + vpbuffer = CACHE; + vpbuffersz = 256 - fpbuffersz; + } else if (vpbuffersz < 112) { + /* FP:CACHE VP:BUFFER */ + fpbuffer = CACHE; + fpbuffersz = 256 - vpbuffersz; + } else { + /* FP:CACHE VP:CACHE */ + vpbuffer = fpbuffer = CACHE; + vpbuffersz = fpbuffersz = 192; + } + } + + if (fpbuffer == BUFFER) { + fsoff = 128 - fpbuffersz; + } else { + fsoff = 256 - fpbuffersz; + } + pos_regid = find_output_regid(vp, ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); posz_regid = find_output_regid(fp, @@ -223,10 +265,10 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid)); OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | - A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp->instrlen)); + A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz)); OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) | A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) | - A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fp->instrlen)); + A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz)); OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | @@ -239,15 +281,15 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_VS_CTRL_REG0_CACHEINVALID | + A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) | + COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_VS_CTRL_REG0_LENGTH(vp->instrlen)); + A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz)); OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0))); @@ -311,28 +353,36 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); } else { OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | - A3XX_SP_FS_CTRL_REG0_CACHEINVALID | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) | + COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) | A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen)); + A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz)); OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) | A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) | A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + + /* NOTE: I believe VS.CONSTLEN should be <= FS.CONSTOBJOFFSET*/ + debug_assert(vp->constlen <= 128); + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff)); OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ } @@ -411,13 +461,15 @@ fd3_program_emit(struct fd_ringbuffer *ring, OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252)); - emit_shader(ring, vp); + if (vpbuffer == BUFFER) + emit_shader(ring, vp); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ if (!key.binning_pass) { - emit_shader(ring, fp); + if (fpbuffer == BUFFER) + emit_shader(ring, fp); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ -- 2.30.2