From 1f9839907a8eee15f634ff95577fbe498f1b70c2 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 22 Apr 2020 17:54:41 +0200 Subject: [PATCH] ir3: Skip missing VS outputs in VS out map when linking The hardware is capable of automatically filling in certain values in the VPC without writing them from the last geometry stage, like gl_PointCoord or gl_PrimitiveID when there is no GS. However, we *do* have to enable these outputs (i.e. set the VPC_VAR_DISABLE bit to 0) as VPC_VAR_DISABLE is really about FS inputs rather than VS outputs. To do this, we move the computation of the enable bits to ir3_link_add(), which is also a nice refactor anyway. In addition we detect the PrimID case specifically so that the driver can program the location. Part-of: --- src/freedreno/ir3/ir3_shader.h | 60 +++++++++++++++---- src/freedreno/vulkan/tu_pipeline.c | 17 ++---- .../drivers/freedreno/a3xx/fd3_program.c | 2 +- .../drivers/freedreno/a4xx/fd4_program.c | 2 +- .../drivers/freedreno/a5xx/fd5_program.c | 17 ++---- .../drivers/freedreno/a6xx/fd6_program.c | 17 ++---- 6 files changed, 67 insertions(+), 48 deletions(-) diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 63bec0d25dc..a3b8f917af6 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -699,7 +699,7 @@ ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) } else if (slot == VARYING_SLOT_COL1) { slot = VARYING_SLOT_BFC1; } else { - return 0; + return -1; } for (j = 0; j < so->outputs_count; j++) @@ -708,7 +708,7 @@ ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) debug_assert(0); - return 0; + return -1; } static inline int @@ -721,35 +721,71 @@ ir3_next_varying(const struct ir3_shader_variant *so, int i) } struct ir3_shader_linkage { + /* Maximum location either consumed by the fragment shader or produced by + * the last geometry stage, i.e. the size required for each vertex in the + * VPC in DWORD's. + */ uint8_t max_loc; + + /* Number of entries in var. */ uint8_t cnt; + + /* Bitset of locations used, including ones which are only used by the FS. + */ + uint32_t varmask[4]; + + /* Map from VS output to location. */ struct { uint8_t regid; uint8_t compmask; uint8_t loc; } var[32]; + + /* location for fixed-function gl_PrimitiveID passthrough */ + uint8_t primid_loc; }; static inline void -ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc) +ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask, uint8_t loc) { - int i = l->cnt++; - debug_assert(i < ARRAY_SIZE(l->var)); - l->var[i].regid = regid; - l->var[i].compmask = compmask; - l->var[i].loc = loc; + for (int j = 0; j < util_last_bit(compmask); j++) { + uint8_t comploc = loc + j; + l->varmask[comploc / 32] |= 1 << (comploc % 32); + } + l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask)); + + if (regid_ != regid(63, 0)) { + int i = l->cnt++; + debug_assert(i < ARRAY_SIZE(l->var)); + + l->var[i].regid = regid_; + l->var[i].compmask = compmask; + l->var[i].loc = loc; + } } static inline void ir3_link_shaders(struct ir3_shader_linkage *l, const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *fs) + const struct ir3_shader_variant *fs, + bool pack_vs_out) { + /* On older platforms, varmask isn't programmed at all, and it appears + * that the hardware generates a mask of used VPC locations using the VS + * output map, and hangs if a FS bary instruction references a location + * not in the list. This means that we need to have a dummy entry in the + * VS out map for things like gl_PointCoord which aren't written by the + * VS. Furthermore we can't use r63.x, so just pick a random register to + * use if there is no VS output. + */ + const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0); int j = -1, k; + l->primid_loc = 0xff; + while (l->cnt < ARRAY_SIZE(l->var)) { j = ir3_next_varying(fs, j); @@ -761,7 +797,11 @@ ir3_link_shaders(struct ir3_shader_linkage *l, k = ir3_find_output(vs, fs->inputs[j].slot); - ir3_link_add(l, vs->outputs[k].regid, + if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) { + l->primid_loc = fs->inputs[j].inloc; + } + + ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid, fs->inputs[j].compmask, fs->inputs[j].inloc); } } diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index e3ac144603a..a86c6e1c384 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -929,23 +929,16 @@ tu6_emit_vpc(struct tu_cs *cs, bool has_gs = gs->type != MESA_SHADER_NONE; const struct ir3_shader_variant *last_shader = has_gs ? gs : vs; struct ir3_shader_linkage linkage = { 0 }; - ir3_link_shaders(&linkage, last_shader, fs); + ir3_link_shaders(&linkage, last_shader, fs, true); if (last_shader->shader->stream_output.num_outputs) tu6_link_streamout(&linkage, last_shader); - BITSET_DECLARE(vpc_var_enables, 128) = { 0 }; - for (uint32_t i = 0; i < linkage.cnt; i++) { - const uint32_t comp_count = util_last_bit(linkage.var[i].compmask); - for (uint32_t j = 0; j < comp_count; j++) - BITSET_SET(vpc_var_enables, linkage.var[i].loc + j); - } - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); - tu_cs_emit(cs, ~vpc_var_enables[0]); - tu_cs_emit(cs, ~vpc_var_enables[1]); - tu_cs_emit(cs, ~vpc_var_enables[2]); - tu_cs_emit(cs, ~vpc_var_enables[3]); + tu_cs_emit(cs, ~linkage.varmask[0]); + tu_cs_emit(cs, ~linkage.varmask[1]); + tu_cs_emit(cs, ~linkage.varmask[2]); + tu_cs_emit(cs, ~linkage.varmask[3]); /* a6xx finds position/pointsize at the end */ const uint32_t position_regid = diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index c6a07b19389..cbcdfe57ea7 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -237,7 +237,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in)); struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, vp, fp); + ir3_link_shaders(&l, vp, fp, false); for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { uint32_t reg = 0; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 4a0e7568250..d9ccecb06ad 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -289,7 +289,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in)); struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, s[VS].v, s[FS].v); + ir3_link_shaders(&l, s[VS].v, s[FS].v, false); for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { uint32_t reg = 0; diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c index b427f989470..a2fe505b4dd 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -410,24 +410,17 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, COND(s[VS].v->num_samp > 0, A5XX_SP_VS_CTRL_REG0_PIXLODENABLE)); struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, s[VS].v, s[FS].v); + ir3_link_shaders(&l, s[VS].v, s[FS].v, true); if ((s[VS].v->shader->stream_output.num_outputs > 0) && !emit->binning_pass) link_stream_out(&l, s[VS].v); - BITSET_DECLARE(varbs, 128) = {0}; - uint32_t *varmask = (uint32_t *)varbs; - - for (i = 0; i < l.cnt; i++) - for (j = 0; j < util_last_bit(l.var[i].compmask); j++) - BITSET_SET(varbs, l.var[i].loc + j); - OUT_PKT4(ring, REG_A5XX_VPC_VAR_DISABLE(0), 4); - OUT_RING(ring, ~varmask[0]); /* VPC_VAR[0].DISABLE */ - OUT_RING(ring, ~varmask[1]); /* VPC_VAR[1].DISABLE */ - OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */ - OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */ + OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ + OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ + OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ + OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ /* a5xx appends pos/psize to end of the linkage map: */ if (pos_regid != regid(63,0)) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 96bed761579..9e12cb246c9 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -429,20 +429,13 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_screen *screen, struct ir3_shader_linkage l = {0}; const struct ir3_shader_variant *last_shader = fd6_last_shader(state); - ir3_link_shaders(&l, last_shader, fs); - - BITSET_DECLARE(varbs, 128) = {0}; - uint32_t *varmask = (uint32_t *)varbs; - - for (i = 0; i < l.cnt; i++) - for (j = 0; j < util_last_bit(l.var[i].compmask); j++) - BITSET_SET(varbs, l.var[i].loc + j); + ir3_link_shaders(&l, last_shader, fs, true); OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); - OUT_RING(ring, ~varmask[0]); /* VPC_VAR[0].DISABLE */ - OUT_RING(ring, ~varmask[1]); /* VPC_VAR[1].DISABLE */ - OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */ - OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */ + OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ + OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ + OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ + OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ if (last_shader->shader->stream_output.num_outputs > 0) -- 2.30.2