v3d: Stop scalarizing our uniform loads.
authorEric Anholt <eric@anholt.net>
Thu, 3 Jan 2019 19:38:57 +0000 (11:38 -0800)
committerEric Anholt <eric@anholt.net>
Fri, 4 Jan 2019 23:41:23 +0000 (15:41 -0800)
We can pull a whole vector in a single indirect load.  This saves a bunch
of round-trips to the TMU, instructions for setting up multiple loads, and
references to the UBO base in the uniform stream, and apparently manages
to reduce register pressure as well.
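
At the VIR level, the win is that a single TMU transaction now feeds all
of the components.  As a rough sketch (not literally the patch; the real
code is ntq_emit_tmu_general() below), an indirect vector uniform load
whose byte offset has already been computed into "offset" boils down to:

    /* Kick off one TMU lookup at the UBO base plus the offset. */
    vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
                 vir_uniform(c, QUNIFORM_UBO_ADDR, 0), offset);
    vir_emit_thrsw(c);

    /* Read back one TMU return-FIFO entry per component. */
    for (int i = 0; i < instr->num_components; i++)
            ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));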

instructions in affected programs: 3086665 -> 2454967 (-20.47%)
uniforms in affected programs: 919581 -> 721039 (-21.59%)
threads in affected programs: 1710 -> 3420 (100.00%)
spills in affected programs: 596 -> 522 (-12.42%)
fills in affected programs: 680 -> 562 (-17.35%)

Improves 3dmmes performance by 2.29312% +/- 0.139825% (n=5)

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_nir_lower_io.c

index 91d4ab0020e7ea4ce400851af93b9ce393d18b95..defddecc84730067e76143560baff0fce59d62ac 100644 (file)
@@ -118,11 +118,44 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
         bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
         int offset_src = 0 + has_index;
 
-        /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
-         * 1 (0 is gallium's constant buffer 0).
-         */
-        struct qreg offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                         nir_src_as_uint(instr->src[0]) + 1);
+        struct qreg offset;
+        if (instr->intrinsic == nir_intrinsic_load_uniform) {
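+                /* UBO index 0 is gallium's constant buffer 0, which is
+                 * where the contents of the default uniform block live.
+                 */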
+                offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
+
+                /* Find which variable in the default uniform block this
+                 * uniform load comes from.
+                 */
+                uint32_t base = nir_intrinsic_base(instr);
+                int i;
+                struct v3d_ubo_range *range = NULL;
+                for (i = 0; i < c->num_ubo_ranges; i++) {
+                        range = &c->ubo_ranges[i];
+                        if (base >= range->src_offset &&
+                            base < range->src_offset + range->size) {
+                                break;
+                        }
+                }
+                /* The driver-location-based offset always has to be within a
+                 * declared uniform range.
+                 */
+                assert(i != c->num_ubo_ranges);
+                if (!c->ubo_range_used[i]) {
+                        c->ubo_range_used[i] = true;
+                        range->dst_offset = c->next_ubo_dst_offset;
+                        c->next_ubo_dst_offset += range->size;
+                }
+
+                base = base - range->src_offset + range->dst_offset;
+
+                if (base != 0)
+                        offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
+        } else {
+                /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
+                 * 1 (0 is gallium's constant buffer 0).
+                 */
+                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                     nir_src_as_uint(instr->src[0]) + 1);
+        }
 
         uint32_t config = (0xffffff00 |
                            tmu_op |
@@ -161,49 +194,6 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
 }
 
-static struct qreg
-indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
-{
-        struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
-        uint32_t offset = nir_intrinsic_base(intr);
-        struct v3d_ubo_range *range = NULL;
-        unsigned i;
-
-        for (i = 0; i < c->num_ubo_ranges; i++) {
-                range = &c->ubo_ranges[i];
-                if (offset >= range->src_offset &&
-                    offset < range->src_offset + range->size) {
-                        break;
-                }
-        }
-        /* The driver-location-based offset always has to be within a declared
-         * uniform range.
-         */
-        assert(i != c->num_ubo_ranges);
-        if (!c->ubo_range_used[i]) {
-                c->ubo_range_used[i] = true;
-                range->dst_offset = c->next_ubo_dst_offset;
-                c->next_ubo_dst_offset += range->size;
-        }
-
-        offset -= range->src_offset;
-
-        if (range->dst_offset + offset != 0) {
-                indirect_offset = vir_ADD(c, indirect_offset,
-                                          vir_uniform_ui(c, range->dst_offset +
-                                                         offset));
-        }
-
-        /* Adjust for where we stored the TGSI register base. */
-        vir_ADD_dest(c,
-                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
-                     vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
-                     indirect_offset);
-
-        vir_emit_thrsw(c);
-        return vir_LDTMU(c);
-}
-
 static struct qreg *
 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
 {
@@ -1618,19 +1608,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
-                assert(instr->num_components == 1);
                 if (nir_src_is_const(instr->src[0])) {
-                        offset = (nir_intrinsic_base(instr) +
-                                  nir_src_as_uint(instr->src[0]));
+                        int offset = (nir_intrinsic_base(instr) +
+                                      nir_src_as_uint(instr->src[0]));
                         assert(offset % 4 == 0);
                         /* We need dwords */
                         offset = offset / 4;
-                        ntq_store_dest(c, &instr->dest, 0,
-                                       vir_uniform(c, QUNIFORM_UNIFORM,
-                                                   offset));
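+                        /* Direct uniforms are fetched through the shader's
+                         * uniform stream, one entry per component.
+                         */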
+                        for (int i = 0; i < instr->num_components; i++) {
+                                ntq_store_dest(c, &instr->dest, i,
+                                               vir_uniform(c, QUNIFORM_UNIFORM,
+                                                           offset + i));
+                        }
                 } else {
-                        ntq_store_dest(c, &instr->dest, 0,
-                                       indirect_uniform_load(c, instr));
+                        ntq_emit_tmu_general(c, instr);
                 }
                 break;
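
As a worked example with made-up numbers: a load_uniform with base 48
falling in a declared range with src_offset 32, packed at dst_offset 0,
gets rebased to 48 - 32 + 0 = 16, so the path above emits the equivalent
of:

    offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
    offset = vir_ADD(c, offset, vir_uniform_ui(c, 16));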
 
index db339f87a53b78afe76dbb136d554bccf9eead7f..b65a82b7f7a4beeb2021839d8b41b721579499d1 100644 (file)
  * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
  * intrinsics into something amenable to the V3D architecture.
  *
- * Currently, it just splits uniforms into scalars, and fixes up the
- * addressing on indirect uniform loads.  FS input and VS output scalarization
- * is handled by nir_lower_io_to_scalar().
+ * After moving more and more logic to NIR, all that's left here is fixing up
+ * addressing on uniform loads.  FS input and VS output scalarization is
+ * handled by nir_lower_io_to_scalar().
  */
 
-static void
-replace_intrinsic_with_vec(nir_builder *b, nir_intrinsic_instr *intr,
-                           nir_ssa_def **comps)
-{
-
-        /* Batch things back together into a vector.  This will get split by
-         * the later ALU scalarization pass.
-         */
-        nir_ssa_def *vec = nir_vec(b, comps, intr->num_components);
-
-        /* Replace the old intrinsic with a reference to our reconstructed
-         * vector.
-         */
-        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec));
-        nir_instr_remove(&intr->instr);
-}
-
+/* Convert the uniform offset to bytes.  If it happens to be a constant,
+ * constant-folding will clean up the shift for us.
+ */
 static void
 v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
                       nir_intrinsic_instr *intr)
 {
         b->cursor = nir_before_instr(&intr->instr);
 
-        /* Generate scalar loads equivalent to the original vector. */
-        nir_ssa_def *dests[4];
-        for (unsigned i = 0; i < intr->num_components; i++) {
-                nir_intrinsic_instr *intr_comp =
-                        nir_intrinsic_instr_create(c->s, intr->intrinsic);
-                intr_comp->num_components = 1;
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1,
-                                  intr->dest.ssa.bit_size, NULL);
-
-                /* Convert the uniform offset to bytes.  If it happens
-                 * to be a constant, constant-folding will clean up
-                 * the shift for us.
-                 */
-                nir_intrinsic_set_base(intr_comp,
-                                       nir_intrinsic_base(intr) * 16 +
-                                       i * 4);
-
-                intr_comp->src[0] =
-                        nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
-                                                 nir_imm_int(b, 4)));
-
-                dests[i] = &intr_comp->dest.ssa;
-
-                nir_builder_instr_insert(b, &intr_comp->instr);
-        }
+        nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);
 
-        replace_intrinsic_with_vec(b, intr, dests);
+        nir_instr_rewrite_src(&intr->instr,
+                              &intr->src[0],
+                              nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
+                                                       nir_imm_int(b, 4))));
 }
 
 static void
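
For a concrete (again made-up) example of this lowering: a uniform at
driver location 3 with indirect index ssa_n becomes a load_uniform with
base 3 * 16 = 48 and src[0] = ssa_n << 4, both in bytes, so the TMU path
in nir_to_vir.c fetches component i of the vector from byte
48 + (ssa_n << 4) + 4 * i.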