freedreno/ir3: don't offset inloc by 8
author Rob Clark <robdclark@gmail.com>
Wed, 23 Nov 2016 17:21:38 +0000 (12:21 -0500)
committer Rob Clark <robdclark@gmail.com>
Wed, 30 Nov 2016 17:25:48 +0000 (12:25 -0500)
On a3xx/a4xx, the SP_VS_VPC_DST_REG.OUTLOCn value is offset by 8, so we
used to add this offset into fs->inputs[n].inloc.  But a5xx drops this extra
offset-by-8.  So instead make inloc zero-based and add the offset when
we emit OUTLOCn values (for the generations that need the offset).

Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/a4xx/fd4_program.c
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
src/gallium/drivers/freedreno/ir3/ir3_shader.h

index e72d432c6ca4135a9d834202828e246db1d476cf..f43d5c47ce0b20790d61a972a97b20f2e1d24b30 100644 (file)
@@ -299,10 +299,10 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 
                OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1);
 
-               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc);
-               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc);
-               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc);
-               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc);
+               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
+               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
+               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
+               reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);
 
                OUT_RING(ring, reg);
        }
@@ -391,10 +391,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
                         */
                        unsigned compmask = fp->inputs[j].compmask;
 
-                       /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
-                        * instead.. rather than -8 everywhere else..
-                        */
-                       uint32_t inloc = fp->inputs[j].inloc - 8;
+                       uint32_t inloc = fp->inputs[j].inloc;
 
                        if ((fp->inputs[j].interpolate == INTERP_MODE_FLAT) ||
                                        (fp->inputs[j].rasterflat && emit->rasterflat)) {
index 4db846a6690fa71c8bd45ba527e5f148ce95703d..3e7512533791080411a41ae266c6bd1f2c685a84 100644 (file)
@@ -366,10 +366,10 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 
                OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);
 
-               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc);
-               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc);
-               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc);
-               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc);
+               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
+               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
+               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
+               reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);
 
                OUT_RING(ring, reg);
        }
@@ -504,10 +504,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
                         */
                        unsigned compmask = s[FS].v->inputs[j].compmask;
 
-                       /* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
-                        * instead.. rather than -8 everywhere else..
-                        */
-                       uint32_t inloc = s[FS].v->inputs[j].inloc - 8;
+                       uint32_t inloc = s[FS].v->inputs[j].inloc;
 
                        if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) ||
                                        (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
index f4d92650595207745a479a35ec63c5c7a8fe761e..9cf6717c17db9defd5e333c9e936067428988d43 100644 (file)
@@ -2460,7 +2460,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
        for (i = 0; i < so->inputs_count; i++) {
                unsigned j, regid = ~0, compmask = 0;
                so->inputs[i].ncomp = 0;
-               so->inputs[i].inloc = inloc + 8;
+               so->inputs[i].inloc = inloc;
                for (j = 0; j < 4; j++) {
                        struct ir3_instruction *in = inputs[(i*4) + j];
                        if (in && !(in->flags & IR3_INSTR_UNUSED)) {
index c46b4522e3ce0fb987b2a02a8da1a053c6301fa2..c603168a04b3ef45cb6d77458e0f0a1449fbf43a 100644 (file)
@@ -181,16 +181,10 @@ struct ir3_shader_variant {
                uint8_t regid;
                uint8_t compmask;
                uint8_t ncomp;
-               /* In theory inloc of fs should match outloc of vs.  Or
-                * rather the outloc of the vs is 8 plus the offset passed
-                * to bary.f.  Presumably that +8 is to account for
-                * gl_Position/gl_PointSize?
-                *
-                * NOTE inloc is currently aligned to 4 (we don't try
-                * to pack varyings).  Changing this would likely break
-                * assumptions in few places (like setting up of flat
-                * shading in fd3_program) so be sure to check all the
-                * spots where inloc is used.
+               /* location of input (ie. offset passed to bary.f, etc).  This
+                * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
+                * have the OUTLOCn value offset by 8, presumably to account
+                * for gl_Position/gl_PointSize)
                 */
                uint8_t inloc;
                /* vertex shader specific: */