vc4: Coalesce instructions using VPM reads into the VPM read.
author Varad Gautam <varadgautam@gmail.com>
Mon, 7 Mar 2016 19:31:59 +0000 (01:01 +0530)
committer Eric Anholt <eric@anholt.net>
Tue, 15 Mar 2016 20:09:24 +0000 (13:09 -0700)
This is done instead of copy propagating the VPM reads into the
instructions using them, because VPM reads have to stay in order.

shader-db results:
total instructions in shared programs: 78509 -> 78114 (-0.50%)
instructions in affected programs:     5203 -> 4808 (-7.59%)
total estimated cycles in shared programs: 234670 -> 234318 (-0.15%)
estimated cycles in affected programs:     5345 -> 4993 (-6.59%)

Signed-off-by: Varad Gautam <varadgautam@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Tested-by: Rhys Kidd <rhyskidd@gmail.com>
src/gallium/drivers/vc4/vc4_opt_vpm.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h

index 0fcf1e5c6dd39ce33f8117afc6c3f7cd66085fe7..d15b0c1a39f4c07518f0a94b654674699ccd5c88 100644 (file)
 /**
  * @file vc4_opt_vpm.c
  *
- * This modifies instructions that generate the value consumed by a VPM write
- * to write directly into the VPM.
+ * This modifies instructions that:
+ * 1. exclusively consume a value read from the VPM to directly read the VPM if
+ *    other operands allow it.
+ * 2. generate the value consumed by a VPM write to write directly into the VPM.
  */
 
 #include "vc4_qir.h"
 
 bool
-qir_opt_vpm_writes(struct vc4_compile *c)
+qir_opt_vpm(struct vc4_compile *c)
 {
         if (c->stage == QSTAGE_FRAG)
                 return false;
@@ -52,8 +54,70 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                 }
 
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                        if (inst->src[i].file == QFILE_TEMP)
-                                use_count[inst->src[i].index]++;
+                        if (inst->src[i].file == QFILE_TEMP) {
+                                uint32_t temp = inst->src[i].index;
+                                use_count[temp]++;
+                        }
+                }
+        }
+
+        /* For instructions reading from a temporary that contains a VPM read
+         * result, try to move the instruction up in place of the VPM read.
+         */
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+                if (!inst || qir_is_multi_instruction(inst))
+                        continue;
+
+                if (qir_depends_on_flags(inst) || inst->sf)
+                        continue;
+
+                if (qir_has_side_effects(c, inst) ||
+                    qir_has_side_effect_reads(c, inst) ||
+                    qir_is_tex(inst))
+                        continue;
+
+                for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {
+                        if (inst->src[j].file != QFILE_TEMP ||
+                            inst->src[j].pack)
+                                continue;
+
+                        uint32_t temp = inst->src[j].index;
+
+                        /* Since VPM reads pull from a FIFO, we only get to
+                         * read each VPM entry once (unless we reset the read
+                         * pointer).  That means we can't copy-propagate a VPM
+                         * read to multiple locations.
+                         */
+                        if (use_count[temp] != 1)
+                                continue;
+
+                        struct qinst *mov = c->defs[temp];
+                        if (!mov ||
+                            (mov->op != QOP_MOV &&
+                             mov->op != QOP_FMOV &&
+                             mov->op != QOP_MMOV) ||
+                            mov->src[0].file != QFILE_VPM) {
+                                continue;
+                        }
+
+                        uint32_t temps = 0;
+                        for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) {
+                                if (inst->src[k].file == QFILE_TEMP)
+                                        temps++;
+                        }
+
+                        /* The instruction is safe to reorder if its other
+                         * sources are independent of previous instructions
+                         */
+                        if (temps == 1) {
+                                list_del(&inst->link);
+                                inst->src[j] = mov->src[0];
+                                list_replace(&mov->link, &inst->link);
+                                c->defs[temp] = NULL;
+                                free(mov);
+                                progress = true;
+                                break;
+                        }
                 }
         }
 
index f9eb0e151c5a46307266e876749dd11c4dfcdd8f..65f0067c61ed48e1b74857f7cee83aadaeb476db 100644 (file)
@@ -526,7 +526,7 @@ qir_optimize(struct vc4_compile *c)
                 OPTPASS(qir_opt_copy_propagation);
                 OPTPASS(qir_opt_dead_code);
                 OPTPASS(qir_opt_small_immediates);
-                OPTPASS(qir_opt_vpm_writes);
+                OPTPASS(qir_opt_vpm);
 
                 if (!progress)
                         break;
index bae31768bd808f81796848d773484cc238320691..4f39d72f552281dfef6506c9ff728edb27f7a6e7 100644 (file)
@@ -484,7 +484,7 @@ bool qir_opt_copy_propagation(struct vc4_compile *c);
 bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
-bool qir_opt_vpm_writes(struct vc4_compile *c);
+bool qir_opt_vpm(struct vc4_compile *c);
 void vc4_nir_lower_blend(struct vc4_compile *c);
 void vc4_nir_lower_io(struct vc4_compile *c);
 nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,