vc4: Add support for coalescing ALU ops into tex_[srtb] MOVs.
author: Eric Anholt <eric@anholt.net>
Tue, 15 Nov 2016 22:48:43 +0000 (14:48 -0800)
committer: Eric Anholt <eric@anholt.net>
Tue, 29 Nov 2016 16:52:50 +0000 (08:52 -0800)
This isn't as complete as I would like (it can't merge interpolation because
of the implicit r5 dependency, and it doesn't work with control flow), but it
was cheap and easy.

Improves 3DMMES Taiji performance by 1.15353% +/- 0.299896% (n=29, 16)

total instructions in shared programs: 99810 -> 99059 (-0.75%)
instructions in affected programs:     10705 -> 9954 (-7.02%)

src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c

index c08c02619f748547aa4c4e66228e7a00bad7ac50..b247c690d8288e903e09e7d58402c398b5e4414e 100644 (file)
@@ -24,8 +24,8 @@
 /**
  * @file vc4_opt_coalesce_ff_writes.c
  *
- * This modifies instructions that generate the value consumed by a VPM write
- * to write directly into the VPM.
+ * This modifies instructions that generate the value consumed by a VPM or TMU
+ * coordinate write to write directly into the VPM or TMU.
  */
 
 #include "vc4_qir.h"
@@ -33,9 +33,6 @@
 bool
 qir_opt_coalesce_ff_writes(struct vc4_compile *c)
 {
-        if (c->stage == QSTAGE_FRAG)
-                return false;
-
         /* For now, only do this pass when we don't have control flow. */
         struct qblock *block = qir_entry_block(c);
         if (block != qir_exit_block(c))
@@ -60,7 +57,7 @@ qir_opt_coalesce_ff_writes(struct vc4_compile *c)
                 if (mov_inst->src[0].file != QFILE_TEMP)
                         continue;
 
-                if (mov_inst->dst.file != QFILE_VPM)
+                if (!(mov_inst->dst.file == QFILE_VPM || qir_is_tex(mov_inst)))
                         continue;
 
                 uint32_t temp = mov_inst->src[0].index;
@@ -71,24 +68,37 @@ qir_opt_coalesce_ff_writes(struct vc4_compile *c)
                 if (!inst)
                         continue;
 
+                /* Don't bother trying to fold in an ALU op using a uniform to
+                 * a texture op, as we'll just have to lower the uniform back
+                 * out.
+                 */
+                if (qir_is_tex(mov_inst) && qir_has_uniform_read(inst))
+                        continue;
+
                 if (qir_depends_on_flags(inst) || inst->sf)
                         continue;
 
                 if (qir_has_side_effects(c, inst) ||
-                    qir_has_side_effect_reads(c, inst)) {
+                    qir_has_side_effect_reads(c, inst) ||
+                    inst->op == QOP_VARY_ADD_C) {
                         continue;
                 }
 
-                /* Move the generating instruction to the end of the program
-                 * to maintain the order of the VPM writes.
+                /* Move the generating instruction into the position of the FF
+                 * write.
                  */
+                c->defs[inst->dst.index] = NULL;
+                inst->dst.file = mov_inst->dst.file;
+                inst->dst.index = mov_inst->dst.index;
+                if (qir_has_implicit_tex_uniform(mov_inst)) {
+                        inst->src[qir_get_tex_uniform_src(inst)] =
+                                mov_inst->src[qir_get_tex_uniform_src(mov_inst)];
+                }
+
                 list_del(&inst->link);
                 list_addtail(&inst->link, &mov_inst->link);
-                qir_remove_instruction(c, mov_inst);
 
-                c->defs[inst->dst.index] = NULL;
-                inst->dst.file = QFILE_VPM;
-                inst->dst.index = 0;
+                qir_remove_instruction(c, mov_inst);
 
                 progress = true;
         }
index a082c41dfe0afef22fa2acfcb1ca98a60bb0aee8..d4f35d8f01a78b335c69c1af126ebebedb258e69 100644 (file)
@@ -179,6 +179,17 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst)
         return false;
 }
 
+bool
+qir_has_uniform_read(struct qinst *inst)
+{
+        for (int i = 0; i < qir_get_nsrc(inst); i++) {
+                if (inst->src[i].file == QFILE_UNIF)
+                        return true;
+        }
+
+        return false;
+}
+
 bool
 qir_is_mul(struct qinst *inst)
 {
index 28d33449391a05b056686efda04e6fa3380fc004..e189bc32d940f46a510842e9e4f23b27863c609e 100644 (file)
@@ -577,6 +577,7 @@ int qir_get_tex_uniform_src(struct qinst *inst);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
+bool qir_has_uniform_read(struct qinst *inst);
 bool qir_is_mul(struct qinst *inst);
 bool qir_is_raw_mov(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
index 23ae8ebfa6f49d12d5832447130f936658678c9e..443682a4670b82bc068594d48602a3652e9bf455 100644 (file)
 #include "util/hash_table.h"
 #include "util/u_math.h"
 
-static bool
-inst_reads_a_uniform(struct qinst *inst)
-{
-        if (qir_is_tex(inst))
-                return true;
-
-        for (int i = 0; i < qir_get_nsrc(inst); i++) {
-                if (inst->src[i].file == QFILE_UNIF)
-                        return true;
-        }
-
-        return false;
-}
-
 static bool
 block_reads_any_uniform(struct qblock *block)
 {
         qir_for_each_inst(inst, block) {
-                if (inst_reads_a_uniform(inst))
+                if (qir_has_uniform_read(inst))
                         return true;
         }
 
@@ -94,7 +80,7 @@ qir_emit_uniform_stream_resets(struct vc4_compile *c)
                 }
 
                 qir_for_each_inst(inst, block) {
-                        if (inst_reads_a_uniform(inst))
+                        if (qir_has_uniform_read(inst))
                                 uniform_count++;
                 }
         }