v3d: Implement a small immediates optimization, based on VC4's.
author Eric Anholt <eric@anholt.net>
Fri, 20 Jul 2018 21:27:09 +0000 (14:27 -0700)
committer Eric Anholt <eric@anholt.net>
Mon, 23 Jul 2018 17:21:43 +0000 (10:21 -0700)
We can only encode one small immediate per instruction, and we have to be
careful not to overwrite raddr_b, but this greatly reduces the pressure on
uniform loads (particularly around ldvpm/stvpm instructions).

total instructions in shared programs: 90768 -> 88220 (-2.81%)
instructions in affected programs:     82711 -> 80163 (-3.08%)

src/broadcom/Makefile.sources
src/broadcom/compiler/meson.build
src/broadcom/compiler/qpu_schedule.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c
src/broadcom/compiler/vir_dump.c
src/broadcom/compiler/vir_opt_small_immediates.c [new file with mode: 0644]
src/broadcom/compiler/vir_to_qpu.c

index 05e4588446a232244765685928c6e827e247f99c..5955acdefd5929b1ff5be6b595efaf5ab24d26fa 100644 (file)
@@ -28,6 +28,7 @@ BROADCOM_FILES = \
        compiler/vir_lower_uniforms.c \
        compiler/vir_opt_copy_propagate.c \
        compiler/vir_opt_dead_code.c \
+       compiler/vir_opt_small_immediates.c \
        compiler/vir_register_allocate.c \
        compiler/vir_to_qpu.c \
        compiler/qpu_schedule.c \
index 0cdf8a5f36491962fbab1b35ca7e7d1706637292..1bd43bf2c2da496070742056b06ec016cbed769c 100644 (file)
@@ -26,6 +26,7 @@ libbroadcom_compiler_files = files(
   'vir_lower_uniforms.c',
   'vir_opt_copy_propagate.c',
   'vir_opt_dead_code.c',
+  'vir_opt_small_immediates.c',
   'vir_register_allocate.c',
   'vir_to_qpu.c',
   'qpu_schedule.c',
index af0b9b86b1c34e34a70cf46a320bfc914c031c5a..441b6327825da8bea372bf8bc5dd63072e03edda 100644 (file)
@@ -670,7 +670,8 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
 
         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
-                    a->raddr_b != b->raddr_b) {
+                    (a->raddr_b != b->raddr_b ||
+                     a->sig.small_imm != b->sig.small_imm)) {
                         return false;
                 }
                 merge.raddr_b = b->raddr_b;
index 9dc19248aa6247d4de22e85f3a307b035fc348f2..a02b5a6404a74bc6486ecf3dfce83bfb4bde3525 100644 (file)
@@ -115,6 +115,7 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index)
  */
 struct qpu_reg {
         bool magic;
+        bool smimm; /* Source is the instruction's small immediate. */
         int index;
 };
 
index 5a4bf80f93d497d93c899efeb74cb2be735d2a9a..b5539b6ef50b58d02de4dc7b33393e054aba0a7c 100644 (file)
@@ -1001,6 +1001,7 @@ vir_optimize(struct v3d_compile *c)
 
                 OPTPASS(vir_opt_copy_propagate);
                 OPTPASS(vir_opt_dead_code);
+                OPTPASS(vir_opt_small_immediates);
 
                 if (!progress)
                         break;
index 88b5dc90ac7d614b5dcc598a04d16506e67ce81a..c43578302b58cd8d42098d510cdfe63dcec715eb 100644 (file)
@@ -25,7 +25,8 @@
 #include "v3d_compiler.h"
 
 static void
-vir_print_reg(struct v3d_compile *c, struct qreg reg)
+vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
+              struct qreg reg)
 {
         static const char *files[] = {
                 [QFILE_TEMP] = "t",
@@ -58,12 +59,20 @@ vir_print_reg(struct v3d_compile *c, struct qreg reg)
                 fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
                 break;
 
-        case QFILE_SMALL_IMM:
-                if ((int)reg.index >= -16 && (int)reg.index <= 15)
-                        fprintf(stderr, "%d", reg.index);
+        case QFILE_SMALL_IMM: {
+                uint32_t unpacked;
+                bool ok = v3d_qpu_small_imm_unpack(c->devinfo,
+                                                   inst->qpu.raddr_b,
+                                                   &unpacked);
+                assert(ok); (void) ok;
+
+                if ((int)inst->qpu.raddr_b >= -16 &&
+                    (int)inst->qpu.raddr_b <= 15)
+                        fprintf(stderr, "%d", unpacked);
                 else
-                        fprintf(stderr, "%f", uif(reg.index));
+                        fprintf(stderr, "%f", uif(unpacked));
                 break;
+        }
 
         case QFILE_VPM:
                 fprintf(stderr, "vpm%d.%d",
@@ -220,7 +229,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
                 fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
                 fprintf(stderr, " ");
 
-                vir_print_reg(c, inst->dst);
+                vir_print_reg(c, inst, inst->dst);
                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
 
                 unpack[0] = instr->alu.add.a_unpack;
@@ -232,7 +241,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
                 fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
                 fprintf(stderr, " ");
 
-                vir_print_reg(c, inst->dst);
+                vir_print_reg(c, inst, inst->dst);
                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
 
                 unpack[0] = instr->alu.mul.a_unpack;
@@ -241,7 +250,7 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
 
         for (int i = 0; i < sideband_nsrc; i++) {
                 fprintf(stderr, ", ");
-                vir_print_reg(c, inst->src[i]);
+                vir_print_reg(c, inst, inst->src[i]);
                 if (i < nsrc)
                         fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
         }
@@ -307,7 +316,7 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
 
                 if (vir_has_implicit_uniform(inst)) {
                         fprintf(stderr, " ");
-                        vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]);
+                        vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]);
                 }
 
                 break;
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
new file mode 100644 (file)
index 0000000..5491f9c
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vir_opt_small_immediates.c
+ *
+ * Turns ALU instruction sources that read small constant uniform values into
+ * small immediates packed into the instruction's raddr_b field.
+ */
+
+#include "v3d_compiler.h"
+
+static bool debug; /* When set, dumps each rewritten instruction to stderr. */
+
+bool
+vir_opt_small_immediates(struct v3d_compile *c)
+{
+        bool progress = false; /* Set once any source has been rewritten. */
+
+        vir_for_each_inst_inorder(inst, c) {
+                if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                        continue;
+
+                /* The small immediate value sits in the raddr B field, so we
+                 * can't have 2 small immediates in one instruction (unless
+                 * they're the same value, but that should be optimized away
+                 * elsewhere).  Skip instructions that already use one.
+                 */
+                bool uses_small_imm = false;
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        if (inst->src[i].file == QFILE_SMALL_IMM)
+                                uses_small_imm = true;
+                }
+                if (uses_small_imm)
+                        continue;
+
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        struct qreg src = vir_follow_movs(c, inst->src[i]);
+
+                        if (src.file != QFILE_UNIF ||
+                            c->uniform_contents[src.index] !=
+                            QUNIFORM_CONSTANT) {
+                                continue; /* Only constant uniforms qualify. */
+                        }
+
+                        if (vir_has_implicit_uniform(inst) &&
+                            i == vir_get_implicit_uniform_src(inst)) {
+                                /* No turning the implicit uniform read into
+                                 * an immediate.
+                                 */
+                                continue;
+                        }
+
+                        /* Check if the uniform's constant value can be
+                         * packed as a small immediate.
+                         */
+                        uint32_t imm = c->uniform_data[src.index];
+                        uint32_t packed;
+                        if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed))
+                                continue;
+
+                        /* Check that we don't have any other signals already
+                         * that would be incompatible with small_imm, by
+                         * trial-packing a copy of the signals with it set.
+                         */
+                        struct v3d_qpu_sig new_sig = inst->qpu.sig;
+                        uint32_t sig_packed;
+                        new_sig.small_imm = true;
+                        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
+                                continue;
+
+                        if (debug) {
+                                fprintf(stderr, "opt_small_immediate() from: ");
+                                vir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        inst->qpu.sig.small_imm = true;
+                        inst->qpu.raddr_b = packed;
+
+                        inst->src[i].file = QFILE_SMALL_IMM;
+                        inst->src[i].index = imm; /* Unpacked value. */
+                        if (debug) {
+                                fprintf(stderr, "to: ");
+                                vir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        progress = true;
+                        break; /* Only one immediate per instruction. */
+                }
+        }
+
+        return progress;
+}
index 2c0349bb3b77704344d67f9cb508363d22263d8e..b5a7b841ef631a02079206ac6dc2aa71a4f98834 100644 (file)
@@ -109,6 +109,12 @@ new_ldunif_instr(struct qinst *inst, int i)
 static void
 set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
 {
+        if (src.smimm) {
+                assert(instr->sig.small_imm);
+                *mux = V3D_QPU_MUX_B;
+                return;
+        }
+
         if (src.magic) {
                 assert(src.index >= V3D_QPU_WADDR_R0 &&
                        src.index <= V3D_QPU_WADDR_R5);
@@ -244,15 +250,7 @@ v3d_generate_code_block(struct v3d_compile *c,
                                 src[i] = qpu_acc(5);
                                 break;
                         case QFILE_SMALL_IMM:
-                                abort(); /* XXX */
-#if 0
-                                src[i].mux = QPU_MUX_SMALL_IMM;
-                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
-                                /* This should only have returned a valid
-                                 * small immediate field, not ~0 for failure.
-                                 */
-                                assert(src[i].addr <= 47);
-#endif
+                                src[i].smimm = true;
                                 break;
 
                         case QFILE_VPM: