From 8ff418287689832deb623711deda9c56900e3338 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Tue, 18 Oct 2016 13:08:02 -0700
Subject: [PATCH] vc4: Avoid making temporaries for assignments to NIR registers.

Getting stores to NIR regs to not generate new MOVs is tricky, since
the result we're trying to store into the NIR reg may have been from a
conditional update of a temp, or a series of packed writes.  The
easiest solution seems to be to require that nir_store_dest()'s arg
comes from an SSA temp.

This causes us to put in a few more temporary MOVs in the NIR SSA dest
case, but copy propagation successfully cleans those up.

The shader-db change is modest:

total instructions in shared programs: 93774 -> 93598 (-0.19%)
instructions in affected programs: 14760 -> 14584 (-1.19%)
total estimated cycles in shared programs: 212135 -> 211946 (-0.09%)
estimated cycles in affected programs: 27005 -> 26816 (-0.70%)

but I was seeing patterns in some register-allocation failures in DEQP
tests that looked like the extra MOVs would increase maximum register
pressure in loops.  Some debug code indicates that that's not the case,
though I'm still a bit confused by that result.
---
 src/gallium/drivers/vc4/vc4_program.c | 114 ++++++++++++++++++--------
 1 file changed, 79 insertions(+), 35 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index c0d956c276b..35729b524b9 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -137,10 +137,33 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
         return qregs;
 }
 
+/**
+ * This function is responsible for getting QIR results into the associated
+ * storage for a NIR instruction.
+ *
+ * If it's a NIR SSA def, then we just set the associated hash table entry to
+ * the new result.
+ *
+ * If it's a NIR reg, then we need to update the existing qreg assigned to the
+ * NIR destination with the incoming value.  To do that without introducing
+ * new MOVs, we require that the incoming qreg either be a uniform, or be
+ * SSA-defined by the previous QIR instruction in the block and rewritable by
+ * this function.  That lets us sneak ahead and insert the SF flag beforehand
+ * (knowing that the previous instruction doesn't depend on flags) and rewrite
+ * its destination to be the NIR reg's destination
+ */
 static void
 ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
                struct qreg result)
 {
+        struct qinst *last_inst = NULL;
+        if (!list_empty(&c->cur_block->instructions))
+                last_inst = (struct qinst *)c->cur_block->instructions.prev;
+
+        assert(result.file == QFILE_UNIF ||
+               (result.file == QFILE_TEMP &&
+                last_inst && last_inst == c->defs[result.index]));
+
         if (dest->is_ssa) {
                 assert(chan < dest->ssa.num_components);
 
@@ -162,17 +185,34 @@ ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
                         _mesa_hash_table_search(c->def_ht, reg);
                 struct qreg *qregs = entry->data;
 
-                /* Conditionally move the result to the destination if the
-                 * channel is active.
+                /* Insert a MOV if the source wasn't an SSA def in the
+                 * previous instruction.
+                 */
+                if (result.file == QFILE_UNIF) {
+                        result = qir_MOV(c, result);
+                        last_inst = c->defs[result.index];
+                }
+
+                /* We know they're both temps, so just rewrite index. */
+                c->defs[last_inst->dst.index] = NULL;
+                last_inst->dst.index = qregs[chan].index;
+
+                /* If we're in control flow, then make this update of the reg
+                 * conditional on the execution mask.
                  */
                 if (c->execute.file != QFILE_NULL) {
-                        struct qinst *mov;
+                        last_inst->dst.index = qregs[chan].index;
 
+                        /* Set the flags to the current exec mask.  To insert
+                         * the SF, we temporarily remove our SSA instruction.
+                         */
+                        list_del(&last_inst->link);
                         qir_SF(c, c->execute);
-                        mov = qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result);
-                        mov->cond_is_exec_mask = true;
-                } else {
-                        qir_MOV_dest(c, qregs[chan], result);
+                        list_addtail(&last_inst->link,
+                                     &c->cur_block->instructions);
+
+                        last_inst->cond = QPU_COND_ZS;
+                        last_inst->cond_is_exec_mask = true;
                 }
         }
 }
@@ -326,19 +366,16 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         struct qreg tex = qir_TEX_RESULT(c);
         c->num_texture_samples++;
 
-        struct qreg dest[4];
         enum pipe_format format = c->key->tex[unit].format;
         if (util_format_is_depth_or_stencil(format)) {
                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
                 for (int i = 0; i < 4; i++)
-                        dest[i] = scaled;
+                        ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
         } else {
                 for (int i = 0; i < 4; i++)
-                        dest[i] = qir_UNPACK_8_F(c, tex, i);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       qir_UNPACK_8_F(c, tex, i));
         }
-
-        for (int i = 0; i < 4; i++)
-                ntq_store_dest(c, &instr->dest, i, dest[i]);
 }
 
 static void
@@ -502,8 +539,9 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
         struct qreg diff = qir_FSUB(c, src, trunc);
         qir_SF(c, diff);
-        return qir_SEL(c, QPU_COND_NS,
-                       qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
+        return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
+                                  qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
+                                  diff));
 }
 
 /**
@@ -520,8 +558,9 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, src, trunc));
 
-        return qir_SEL(c, QPU_COND_NS,
-                       qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
+        return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
+                                  qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
+                                  trunc));
 }
 
 /**
@@ -538,8 +577,9 @@ ntq_fceil(struct vc4_compile *c, struct qreg src)
          */
         qir_SF(c, qir_FSUB(c, trunc, src));
 
-        return qir_SEL(c, QPU_COND_NS,
-                       qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
+        return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
+                                  qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
+                                  trunc));
 }
 
 static struct qreg
@@ -620,7 +660,7 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
         qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
         qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
         qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
-        return t;
+        return qir_MOV(c, t);
 }
 
 static void
@@ -799,7 +839,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
                 qir_PACK_8_F(c, result, src, i);
         }
 
-        ntq_store_dest(c, &instr->dest.dest, 0, result);
+        ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
 }
 
 /** Handles sign-extended bitfield extracts for 16 bits. */
@@ -905,6 +945,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
                 break;
         }
 
+        /* Make the temporary for nir_store_dest(). */
+        *dest = qir_MOV(c, *dest);
+
         return true;
 }
 
@@ -931,7 +974,7 @@ static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
 
 out:
         qir_SF(c, src[0]);
-        return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+        return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
 }
 
 static struct qreg
@@ -950,9 +993,9 @@ ntq_fddx(struct vc4_compile *c, struct qreg src)
         qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
                           qir_uniform_ui(c, 1)));
 
-        return qir_SEL(c, QPU_COND_ZS,
-                       qir_FSUB(c, from_right, src),
-                       qir_FSUB(c, src, from_left));
+        return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
+                                  qir_FSUB(c, from_right, src),
+                                  qir_FSUB(c, src, from_left)));
 }
 
 static struct qreg
@@ -969,9 +1012,9 @@ ntq_fddy(struct vc4_compile *c, struct qreg src)
                           qir_reg(QFILE_QPU_ELEMENT, 0),
                           qir_uniform_ui(c, 2)));
 
-        return qir_SEL(c, QPU_COND_ZS,
-                       qir_FSUB(c, from_top, src),
-                       qir_FSUB(c, src, from_bottom));
+        return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
+                                  qir_FSUB(c, from_top, src),
+                                  qir_FSUB(c, src, from_bottom)));
 }
 
 static void
@@ -992,7 +1035,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                         srcs[i] = ntq_get_src(c, instr->src[i].src,
                                               instr->src[i].swizzle[0]);
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-                        ntq_store_dest(c, &instr->dest.dest, i, srcs[i]);
+                        ntq_store_dest(c, &instr->dest.dest, i,
+                                       qir_MOV(c, srcs[i]));
                 return;
         }
 
@@ -1058,9 +1102,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_i2b:
         case nir_op_f2b:
                 qir_SF(c, src[0]);
-                result = qir_SEL(c, QPU_COND_ZC,
-                                 qir_uniform_ui(c, ~0),
-                                 qir_uniform_ui(c, 0));
+                result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
+                                            qir_uniform_ui(c, ~0),
+                                            qir_uniform_ui(c, 0)));
                 break;
 
         case nir_op_iadd:
@@ -1124,7 +1168,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 break;
         case nir_op_fcsel:
                 qir_SF(c, src[0]);
-                result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
+                result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
                 break;
 
         case nir_op_frcp:
@@ -1687,12 +1731,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                                 }
                         }
                         ntq_store_dest(c, &instr->dest, 0,
-                                       c->color_reads[sample_index]);
+                                       qir_MOV(c, c->color_reads[sample_index]));
                 } else {
                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
                         int comp = nir_intrinsic_component(instr);
                         ntq_store_dest(c, &instr->dest, 0,
-                                       qir_MOV(c, c->inputs[offset * 4 + comp]));
+                                       qir_MOV(c, c->inputs[offset * 4 + comp]));
                 }
                 break;
 
-- 
2.30.2
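The core trick above is easy to miss in the diff noise: instead of copying a
freshly computed SSA value into the NIR register's qreg with a MOV,
ntq_store_dest() now reaches back to the instruction that produced the value
and retargets its destination. Below the mail signature (so it is not part of
the patch) is a minimal standalone C sketch of that idea. Everything in it --
the toy_inst/toy_block types and the emit/store_reg_* helpers -- is invented
for illustration and is not vc4 or QIR API, and it omits the exec-mask
(conditional) path that the real code handles by unlinking the instruction
around qir_SF().

#include <assert.h>
#include <stdio.h>

/* Toy IR: a block is a flat array of instructions, each writing one
 * temporary register index.  Everything here is made up for this note.
 */
struct toy_inst {
        const char *op;
        int dst;          /* destination temp index */
        int src0, src1;   /* source temp indices, -1 if unused */
};

struct toy_block {
        struct toy_inst insts[16];
        int count;
};

static struct toy_inst *
emit(struct toy_block *b, const char *op, int dst, int src0, int src1)
{
        struct toy_inst *inst = &b->insts[b->count++];
        *inst = (struct toy_inst){ .op = op, .dst = dst,
                                   .src0 = src0, .src1 = src1 };
        return inst;
}

/* Old style: copy the result into the register's temp with an extra MOV. */
static void
store_reg_with_mov(struct toy_block *b, int reg_temp, int result_temp)
{
        emit(b, "mov", reg_temp, result_temp, -1);
}

/* The patch's style, sketched: require that the result was written by the
 * immediately preceding instruction, then retarget that instruction's
 * destination to the register's temp instead of emitting a MOV.
 */
static void
store_reg_by_rewriting(struct toy_block *b, int reg_temp, int result_temp)
{
        assert(b->count > 0);
        struct toy_inst *last = &b->insts[b->count - 1];
        assert(last->dst == result_temp);
        last->dst = reg_temp;
}

int
main(void)
{
        struct toy_block b = { .count = 0 };

        /* Old: compute into t5, then mov t5 into the reg's temp t9. */
        emit(&b, "add", 5, 1, 2);
        store_reg_with_mov(&b, 9, 5);

        /* New: compute into t6, then rewrite that add to write t9 directly. */
        emit(&b, "add", 6, 3, 4);
        store_reg_by_rewriting(&b, 9, 6);

        for (int i = 0; i < b.count; i++) {
                const struct toy_inst *in = &b.insts[i];
                if (in->src1 < 0)
                        printf("%s t%d, t%d\n", in->op, in->dst, in->src0);
                else
                        printf("%s t%d, t%d, t%d\n", in->op, in->dst,
                               in->src0, in->src1);
        }
        return 0;
}

Built with a C99 compiler, this prints "add t5, t1, t2", "mov t9, t5",
"add t9, t3, t4": the first store costs a MOV, the second does not, which is
the saving the patch is after.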