v3d: Switch to using the new SFU instructions on V3D 4.x.
author Eric Anholt <eric@anholt.net>
Fri, 20 Jul 2018 20:06:50 +0000 (13:06 -0700)
committer Eric Anholt <eric@anholt.net>
Mon, 23 Jul 2018 17:21:43 +0000 (10:21 -0700)
These instructions let us write directly to the phys regfile, instead of
just R4.  That removes the need for the extra MOV out of R4, which we
emitted to keep the result from conflicting with other SFU results and
with thread switches.

There is still an extra instruction of latency, which is not represented
in the scheduler at the moment.  If you use the result before it's ready,
the QPU will just stall, unlike the magic R4 mode where you'd read the
previous value.  That means that the following shader-db results aren't
quite representative (since we now cause some stalls instead of emitting
nops), but they're impressive enough that I'm happy with the change.

total instructions in shared programs: 95669 -> 91275 (-4.59%)
instructions in affected programs:     82590 -> 78196 (-5.32%)

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/qpu_schedule.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c
src/broadcom/compiler/vir_opt_dead_code.c
src/broadcom/compiler/vir_register_allocate.c
src/broadcom/qpu/qpu_instr.c
src/broadcom/qpu/qpu_instr.h

index 5c7acdf72ab3d7bd0762639b40d61618c517307e..51cb8845cdbb9c30dba455415754b54895510821 100644 (file)
@@ -73,13 +73,6 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 }
 
-static struct qreg
-vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
-{
-        vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
-        return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
-}
-
 static struct qreg
 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
 {
@@ -330,8 +323,7 @@ ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
 
         struct qreg periods = vir_FROUND(c, input);
-        struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
-                                         vir_FSUB(c, input, periods));
+        struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
         return vir_XOR(c, sin_output, vir_SHL(c,
                                               vir_FTOIN(c, periods),
                                               vir_uniform_ui(c, -1)));
@@ -369,8 +361,7 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
         c->inputs[attr * 4 + 0] = vir_FXCD(c);
         c->inputs[attr * 4 + 1] = vir_FYCD(c);
         c->inputs[attr * 4 + 2] = c->payload_z;
-        c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
-                                          c->payload_w);
+        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
 }
 
 static struct qreg
@@ -782,16 +773,16 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
                 break;
 
         case nir_op_frcp:
-                result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
+                result = vir_RECIP(c, src[0]);
                 break;
         case nir_op_frsq:
-                result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
+                result = vir_RSQRT(c, src[0]);
                 break;
         case nir_op_fexp2:
-                result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
+                result = vir_EXP(c, src[0]);
                 break;
         case nir_op_flog2:
-                result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
+                result = vir_LOG(c, src[0]);
                 break;
 
         case nir_op_fceil:
@@ -1151,8 +1142,8 @@ emit_vert_end(struct v3d_compile *c)
         setup_default_position(c);
 
         uint32_t vpm_index = 0;
-        struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
-                                    c->outputs[c->output_position_index + 3]);
+        struct qreg rcp_w = vir_RECIP(c,
+                                      c->outputs[c->output_position_index + 3]);
 
         emit_vpm_write_setup(c);
 
index 2a035c5521e2ccc543eb5a477caea8db60aff5a9..af0b9b86b1c34e34a70cf46a320bfc914c031c5a 100644 (file)
@@ -459,7 +459,7 @@ calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
 
 struct choose_scoreboard {
         int tick;
-        int last_sfu_write_tick;
+        int last_magic_sfu_write_tick;
         int last_ldvary_tick;
         int last_uniforms_reset_tick;
         bool tlb_locked;
@@ -471,7 +471,7 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
 {
         switch (mux) {
         case V3D_QPU_MUX_R4:
-                if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
+                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                         return true;
                 break;
 
@@ -536,7 +536,7 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
          * This would normally be prevented by dependency tracking, but might
          * occur if a dead SFU computation makes it to scheduling.
          */
-        if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
+        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
             v3d_qpu_writes_r4(devinfo, inst))
                 return true;
 
@@ -595,6 +595,8 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
 {
         if (v3d_qpu_uses_vpm(inst))
                 return true;
+        if (v3d_qpu_uses_sfu(inst))
+                return true;
 
         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -825,7 +827,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                   enum v3d_qpu_waddr waddr)
 {
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
-                scoreboard->last_sfu_write_tick = scoreboard->tick;
+                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
 }
 
 static void
@@ -1467,7 +1469,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
         struct choose_scoreboard scoreboard;
         memset(&scoreboard, 0, sizeof(scoreboard));
         scoreboard.last_ldvary_tick = -10;
-        scoreboard.last_sfu_write_tick = -10;
+        scoreboard.last_magic_sfu_write_tick = -10;
         scoreboard.last_uniforms_reset_tick = -10;
 
         if (debug) {
index 133c2e0b7d9beeb9854d8926da91a12a62cd56e5..9dc19248aa6247d4de22e85f3a307b035fc348f2 100644 (file)
@@ -867,6 +867,33 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)         \
                                            a, b));                      \
 }
 
+#define VIR_SFU(name)                                                      \
+static inline struct qreg                                                \
+vir_##name(struct v3d_compile *c, struct qreg a)                         \
+{                                                                        \
+        if (c->devinfo->ver >= 41) {                                     \
+                return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,    \
+                                                    c->undef,            \
+                                                    a, c->undef));       \
+        } else {                                                         \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                                \
+}                                                                        \
+static inline struct qinst *                                             \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
+                  struct qreg a)                                         \
+{                                                                        \
+        if (c->devinfo->ver >= 41) {                                     \
+                return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+                                                       dest,             \
+                                                       a, c->undef));    \
+        } else {                                                         \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                                \
+}
+
 #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
 #define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
 #define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
@@ -948,6 +975,13 @@ VIR_M_NODST_2(MULTOP)
 VIR_M_ALU1(MOV)
 VIR_M_ALU1(FMOV)
 
+VIR_SFU(RECIP)
+VIR_SFU(RSQRT)
+VIR_SFU(EXP)
+VIR_SFU(LOG)
+VIR_SFU(SIN)
+VIR_SFU(RSQRT2)
+
 static inline struct qinst *
 vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
              struct qreg dest, struct qreg src)
index ee0f329040eb629c192b94b49aaef7bbfe65748f..d804fe6089d8aaae87804284564b44840556c85c 100644 (file)
@@ -935,6 +935,17 @@ vir_uniform(struct v3d_compile *c,
         return vir_reg(QFILE_UNIF, uniform);
 }
 
+static bool
+vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
+{
+        if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
+                                      v3d_qpu_uses_sfu(&inst->qpu))) {
+                return false;
+        }
+
+        return true;
+}
+
 void
 vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
 {
@@ -954,7 +965,8 @@ vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
 
         if (src.file != QFILE_TEMP ||
             !c->defs[src.index] ||
-            last_inst != c->defs[src.index]) {
+            last_inst != c->defs[src.index] ||
+            !vir_can_set_flags(c, last_inst)) {
                 /* XXX: Make the MOV be the appropriate type */
                 last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
         }
index 7ce05fb5f5102824530bc66fa886d80ce13a2cdf..362fc9e52a33f31565b6bfebaba0ca73ad2622f7 100644 (file)
@@ -85,6 +85,16 @@ has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
         return false;
 }
 
+static bool
+can_write_to_null(struct v3d_compile *c, struct qinst *inst)
+{
+        /* The SFU instructions must write to a physical register. */
+        if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu))
+                return false;
+
+        return true;
+}
+
 bool
 vir_opt_dead_code(struct v3d_compile *c)
 {
@@ -122,7 +132,8 @@ vir_opt_dead_code(struct v3d_compile *c)
                                  * it's nicer to read the VIR code without
                                  * unused destination regs.
                                  */
-                                if (inst->dst.file == QFILE_TEMP) {
+                                if (inst->dst.file == QFILE_TEMP &&
+                                    can_write_to_null(c, inst)) {
                                         if (debug) {
                                                 fprintf(stderr,
                                                         "Removing dst from: ");
index aa5e2139c1b3180f97a3586b0c540c6f098581ef..5a856acd7ed3052dc6441c5e75ffb63d6621ded2 100644 (file)
@@ -445,6 +445,19 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                                 class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                 break;
 
+                        case V3D_QPU_A_RECIP:
+                        case V3D_QPU_A_RSQRT:
+                        case V3D_QPU_A_EXP:
+                        case V3D_QPU_A_LOG:
+                        case V3D_QPU_A_SIN:
+                        case V3D_QPU_A_RSQRT2:
+                                /* The SFU instructions write directly to the
+                                 * phys regfile.
+                                 */
+                                assert(inst->dst.file == QFILE_TEMP);
+                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
+                                break;
+
                         default:
                                 break;
                         }
index deaa533c8aed722baa38a9ef26df7fe9ea9d3820..a7fb4186e1a9997bef6252325951c34656dc7e70 100644 (file)
@@ -602,6 +602,36 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
         return false;
 }
 
+bool
+v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
+{
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                switch (inst->alu.add.op) {
+                case V3D_QPU_A_RECIP:
+                case V3D_QPU_A_RSQRT:
+                case V3D_QPU_A_EXP:
+                case V3D_QPU_A_LOG:
+                case V3D_QPU_A_SIN:
+                case V3D_QPU_A_RSQRT2:
+                        return true;
+                default:
+                        break;
+                }
+
+                if (inst->alu.add.magic_write &&
+                    v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
+                        return true;
+                }
+
+                if (inst->alu.mul.magic_write &&
+                    v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) {
+                        return true;
+                }
+        }
+
+        return false;
+}
+
 bool
 v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
 {
index 09dbf3eb4fa557cbefa5021d1f6e9a48da7b642f..c37abac3cf8bd8749714d2c13ff3e1a44fe4ee24 100644 (file)
@@ -444,6 +444,7 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;