vc4: Enforce one-uniform-per-instruction after optimization.
author: Eric Anholt <eric@anholt.net>
Thu, 19 Feb 2015 20:58:53 +0000 (12:58 -0800)
committer: Eric Anholt <eric@anholt.net>
Fri, 20 Feb 2015 07:35:17 +0000 (23:35 -0800)
This lets us more intelligently decide which uniform values should be put
into temporaries, by choosing the most reused values to push to temps
first.

total uniforms in shared programs: 13457 -> 13433 (-0.18%)
uniforms in affected programs:     1524 -> 1500 (-1.57%)
total instructions in shared programs: 40198 -> 40019 (-0.45%)
instructions in affected programs:     6027 -> 5848 (-2.97%)

I noticed this opportunity because with the NIR work, some programs were
happening to make different uniform copy propagation choices that
significantly increased instruction counts.

src/gallium/drivers/vc4/Makefile.sources
src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c [new file with mode: 0644]

index 95f1a340ab3fc6bc3351c9ad405d19379c12fbd8..c7254ea1473383bd377b21eff5604e8655355ad2 100644 (file)
@@ -20,6 +20,7 @@ C_SOURCES := \
        vc4_packet.h \
        vc4_program.c \
        vc4_qir.c \
+       vc4_qir_lower_uniforms.c \
        vc4_qir.h \
        vc4_qpu.c \
        vc4_qpu_defines.h \
index 07e1cb14b0d8636359ae5323d324e7d402c83156..f8c49a44bd3b70d3fbf1e0aafcc4a801f3fa4115 100644 (file)
@@ -49,22 +49,11 @@ qir_opt_copy_propagation(struct vc4_compile *c)
                 if (inst->dst.file == QFILE_TEMP)
                         defs[inst->dst.index] = inst;
 
-                /* A single instruction can only read one uniform value.  (It
-                 * could maybe read the same uniform value in two operands,
-                 * but that doesn't seem important to do).
-                 */
-                bool reads_a_uniform = false;
-                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                        if (inst->src[i].file == QFILE_UNIF)
-                                reads_a_uniform = true;
-                }
-
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                         int index = inst->src[i].index;
                         if (inst->src[i].file == QFILE_TEMP &&
                             (movs[index].file == QFILE_TEMP ||
-                             (movs[index].file == QFILE_UNIF &&
-                              !reads_a_uniform))) {
+                             movs[index].file == QFILE_UNIF)) {
                                 if (debug) {
                                         fprintf(stderr, "Copy propagate: ");
                                         qir_dump_inst(c, inst);
@@ -72,8 +61,6 @@ qir_opt_copy_propagation(struct vc4_compile *c)
                                 }
 
                                 inst->src[i] = movs[index];
-                                if (movs[index].file == QFILE_UNIF)
-                                        reads_a_uniform = true;
 
                                 if (debug) {
                                         fprintf(stderr, "to: ");
index a1d9a7f064c9762923d58e3dfe5739df8e64bf95..b904679ef00a15665d2bfb82b5c136d782bcd554 100644 (file)
@@ -142,25 +142,16 @@ qir_uniform(struct vc4_compile *c,
         return u;
 }
 
-static struct qreg
-get_temp_for_uniform(struct vc4_compile *c, enum quniform_contents contents,
-                     uint32_t data)
-{
-        struct qreg u = qir_uniform(c, contents, data);
-        struct qreg t = qir_MOV(c, u);
-        return t;
-}
-
 static struct qreg
 qir_uniform_ui(struct vc4_compile *c, uint32_t ui)
 {
-        return get_temp_for_uniform(c, QUNIFORM_CONSTANT, ui);
+        return qir_uniform(c, QUNIFORM_CONSTANT, ui);
 }
 
 static struct qreg
 qir_uniform_f(struct vc4_compile *c, float f)
 {
-        return qir_uniform_ui(c, fui(f));
+        return qir_uniform(c, QUNIFORM_CONSTANT, fui(f));
 }
 
 static struct qreg
@@ -232,8 +223,7 @@ get_src(struct vc4_compile *c, unsigned tgsi_op,
                 if (src->Indirect) {
                         r = indirect_uniform_load(c, full_src, s);
                 } else {
-                        r = get_temp_for_uniform(c, QUNIFORM_UNIFORM,
-                                                 src->Index * 4 + s);
+                        r = qir_uniform(c, QUNIFORM_UNIFORM, src->Index * 4 + s);
                 }
                 break;
         case TGSI_FILE_INPUT:
@@ -660,13 +650,9 @@ tgsi_to_qir_tex(struct vc4_compile *c,
         if (tgsi_inst->Texture.Texture == TGSI_TEXTURE_RECT ||
             tgsi_inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
                 s = qir_FMUL(c, s,
-                             get_temp_for_uniform(c,
-                                                  QUNIFORM_TEXRECT_SCALE_X,
-                                                  unit));
+                             qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, unit));
                 t = qir_FMUL(c, t,
-                             get_temp_for_uniform(c,
-                                                  QUNIFORM_TEXRECT_SCALE_Y,
-                                                  unit));
+                             qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, unit));
         }
 
         if (tgsi_inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
@@ -689,7 +675,7 @@ tgsi_to_qir_tex(struct vc4_compile *c,
                    c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
-                qir_TEX_R(c, get_temp_for_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit),
+                qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit),
                           texture_u[next_texture_u++]);
         }
 
@@ -1504,14 +1490,11 @@ vc4_blend_channel(struct vc4_compile *c,
                 }
         case PIPE_BLENDFACTOR_CONST_COLOR:
                 return qir_FMUL(c, val,
-                                get_temp_for_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     channel));
+                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR,
+                                            channel));
         case PIPE_BLENDFACTOR_CONST_ALPHA:
                 return qir_FMUL(c, val,
-                                get_temp_for_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     3));
+                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3));
         case PIPE_BLENDFACTOR_ZERO:
                 return qir_uniform_f(c, 0.0);
         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
@@ -1529,15 +1512,15 @@ vc4_blend_channel(struct vc4_compile *c,
         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
                 return qir_FMUL(c, val,
                                 qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         get_temp_for_uniform(c,
-                                                              QUNIFORM_BLEND_CONST_COLOR,
-                                                              channel)));
+                                         qir_uniform(c,
+                                                     QUNIFORM_BLEND_CONST_COLOR,
+                                                     channel)));
         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
                 return qir_FMUL(c, val,
                                 qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         get_temp_for_uniform(c,
-                                                              QUNIFORM_BLEND_CONST_COLOR,
-                                                              3)));
+                                         qir_uniform(c,
+                                                     QUNIFORM_BLEND_CONST_COLOR,
+                                                     3)));
 
         default:
         case PIPE_BLENDFACTOR_SRC1_COLOR:
@@ -1661,7 +1644,7 @@ static void
 alpha_test_discard(struct vc4_compile *c)
 {
         struct qreg src_alpha;
-        struct qreg alpha_ref = get_temp_for_uniform(c, QUNIFORM_ALPHA_REF, 0);
+        struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0);
 
         if (!c->fs_key->alpha_test)
                 return;
@@ -2171,6 +2154,7 @@ vc4_shader_tgsi_to_qir(struct vc4_context *vc4, enum qstage stage,
         }
 
         qir_optimize(c);
+        qir_lower_uniforms(c);
 
         if (vc4_debug & VC4_DEBUG_QIR) {
                 fprintf(stderr, "%s prog %d/%d QIR:\n",
index 9e0ee1f0ae5965eb791b5e53852be7b435487952..5c1fdbddfb6369d93f95f79c3cdfded233461d95 100644 (file)
@@ -173,6 +173,12 @@ qir_is_multi_instruction(struct qinst *inst)
         return qir_op_info[inst->op].multi_instruction;
 }
 
+/**
+ * Returns true when the instruction's opcode falls between QOP_TEX_S and
+ * QOP_TEX_DIRECT, inclusive.
+ *
+ * NOTE(review): this relies on the texture opcodes being declared
+ * contiguously in that range of the opcode enum -- confirm in vc4_qir.h.
+ */
+bool
+qir_is_tex(struct qinst *inst)
+{
+        if (inst->op < QOP_TEX_S)
+                return false;
+
+        return inst->op <= QOP_TEX_DIRECT;
+}
+
 bool
 qir_depends_on_flags(struct qinst *inst)
 {
@@ -420,9 +426,12 @@ qir_get_stage_name(enum qstage stage)
 void
 qir_SF(struct vc4_compile *c, struct qreg src)
 {
-        assert(!is_empty_list(&c->instructions));
-        struct qinst *last_inst = (struct qinst *)c->instructions.prev;
-        if (last_inst->dst.file != src.file ||
+        struct qinst *last_inst = NULL;
+        if (!is_empty_list(&c->instructions))
+                last_inst = (struct qinst *)c->instructions.prev;
+
+        if (!last_inst ||
+            last_inst->dst.file != src.file ||
             last_inst->dst.index != src.index ||
             qir_is_multi_instruction(last_inst)) {
                 src = qir_MOV(c, src);
index 6da6ff6542e84b4407f7bd81fd31b564e8871e1e..a1b556055847fe3ec16c5b5dc38065fcb24dae09 100644 (file)
@@ -376,6 +376,7 @@ bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
 bool qir_is_multi_instruction(struct qinst *inst);
+bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
 bool qir_reads_r4(struct qinst *inst);
@@ -393,6 +394,7 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
+void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
 
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
new file mode 100644 (file)
index 0000000..d527889
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_lower_uniforms.c
+ *
+ * This is the pre-code-generation pass that fixes up instructions which try
+ * to read more than one uniform value.
+ *
+ * The uniforms used most often by those instructions are moved into
+ * temporaries first, so that each inserted MOV fixes up as many uses as
+ * possible.
+ */
+
+#include "vc4_qir.h"
+#include "util/hash_table.h"
+#include "util/u_math.h"
+
+/**
+ * Hash callback for the uniform-count table.
+ *
+ * The key is an integer value packed into a pointer (never a real address),
+ * so its bits are used directly as the hash.
+ */
+static inline uint32_t
+index_hash(const void *key)
+{
+        const uintptr_t bits = (uintptr_t)key;
+
+        return bits;
+}
+
+/**
+ * Key-equality callback for the uniform-count table.
+ *
+ * Keys are integers packed into pointers, so two keys match exactly when
+ * the pointer values are identical.
+ */
+static inline bool
+index_compare(const void *a, const void *b)
+{
+        if (a != b)
+                return false;
+
+        return true;
+}
+
+/**
+ * Converts a uniform index into its hash-table key.
+ *
+ * Keys are biased by one: Mesa's util/hash_table uses a NULL key to mark an
+ * unused slot, and uniform index 0 must still be representable.
+ */
+static inline void *
+uniform_to_key(uint32_t index)
+{
+        return (void *)((uintptr_t)index + 1);
+}
+
+/** Inverse of uniform_to_key(): recovers the uniform index from a key. */
+static inline uint32_t
+key_to_uniform(const void *key)
+{
+        return (uint32_t)((uintptr_t)key - 1);
+}
+
+/** Returns how many sources of \p inst read from the uniform file. */
+static uint32_t
+count_uniform_srcs(struct qinst *inst)
+{
+        uint32_t nsrc = qir_get_op_nsrc(inst->op);
+        uint32_t count = 0;
+
+        for (uint32_t i = 0; i < nsrc; i++) {
+                if (inst->src[i].file == QFILE_UNIF)
+                        count++;
+        }
+
+        return count;
+}
+
+/**
+ * Records one more use of the uniform \p reg in the table.
+ *
+ * The entry's data pointer is used as an integer use count; it starts at 1
+ * when the uniform is first seen.
+ */
+static void
+add_uniform(struct hash_table *ht, struct qreg reg)
+{
+        void *key = uniform_to_key(reg.index);
+        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+
+        if (entry) {
+                /* Arithmetic directly on a void * is a GNU extension, so go
+                 * through uintptr_t to bump the count.
+                 */
+                entry->data = (void *)((uintptr_t)entry->data + 1);
+        } else {
+                _mesa_hash_table_insert(ht, key, (void *)(uintptr_t)1);
+        }
+}
+
+/**
+ * Drops one use of the uniform \p reg from the table, removing the entry
+ * once its use count reaches zero.
+ *
+ * The uniform must currently be present in the table.
+ */
+static void
+remove_uniform(struct hash_table *ht, struct qreg reg)
+{
+        struct hash_entry *entry =
+                _mesa_hash_table_search(ht, uniform_to_key(reg.index));
+        uintptr_t count;
+
+        assert(entry);
+        count = (uintptr_t)entry->data - 1;
+        entry->data = (void *)count;
+        if (count == 0)
+                _mesa_hash_table_remove(ht, entry);
+}
+
+/**
+ * Returns whether source \p i of \p inst is a uniform read that this pass is
+ * allowed to replace with a temporary.
+ *
+ * Source 1 of a texture instruction is never lowered.
+ * NOTE(review): presumably that operand has to stay a direct uniform read
+ * (the texture_u[] values passed to qir_TEX_*) -- confirm.
+ */
+static bool
+is_lowerable_uniform(struct qinst *inst, int i)
+{
+        if (inst->src[i].file != QFILE_UNIF)
+                return false;
+        if (qir_is_tex(inst))
+                return i != 1;
+        return true;
+}
+
+/**
+ * Rewrites the program so that no instruction reads more than one uniform.
+ *
+ * Every lowerable uniform source of an instruction that reads more than one
+ * uniform is counted in a hash table.  The most-used uniform is then
+ * repeatedly picked, loaded into a fresh temporary by a MOV inserted at the
+ * head of the program, and substituted into the offending instructions,
+ * until no instruction needs fixing any more.
+ */
+void
+qir_lower_uniforms(struct vc4_compile *c)
+{
+        struct simple_node *node;
+        struct hash_table *ht =
+                _mesa_hash_table_create(c, index_hash, index_compare);
+
+        /* Walk the instruction list, finding which instructions have more
+         * than one uniform referenced, and add those uniform values to the
+         * ht.
+         */
+        foreach(node, &c->instructions) {
+                struct qinst *inst = (struct qinst *)node;
+                uint32_t nsrc = qir_get_op_nsrc(inst->op);
+
+                if (count_uniform_srcs(inst) <= 1)
+                        continue;
+
+                for (int i = 0; i < nsrc; i++) {
+                        if (is_lowerable_uniform(inst, i))
+                                add_uniform(ht, inst->src[i]);
+                }
+        }
+
+        while (ht->entries) {
+                /* Find the most commonly used uniform in instructions that
+                 * need a uniform lowered.
+                 */
+                uint32_t max_count = 0;
+                uint32_t max_index = 0;
+                struct hash_entry *entry;
+                hash_table_foreach(ht, entry) {
+                        uint32_t count = (uintptr_t)entry->data;
+                        uint32_t index = key_to_uniform(entry->key);
+                        if (count > max_count) {
+                                max_count = count;
+                                max_index = index;
+                        }
+                }
+
+                /* Now, find the instructions using this uniform and make them
+                 * reference a temp instead.  The MOV that loads the temp goes
+                 * at the head of the program so it precedes every use.
+                 */
+                struct qreg temp = qir_get_temp(c);
+                struct qreg unif = { QFILE_UNIF, max_index };
+                struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef);
+                insert_at_head(&c->instructions, &mov->link);
+                foreach(node, &c->instructions) {
+                        struct qinst *inst = (struct qinst *)node;
+                        uint32_t nsrc = qir_get_op_nsrc(inst->op);
+                        uint32_t count = count_uniform_srcs(inst);
+
+                        /* Instructions already down to a single uniform
+                         * (including the MOV just inserted) have nothing in
+                         * the table and are left alone.
+                         */
+                        if (count <= 1)
+                                continue;
+
+                        for (int i = 0; i < nsrc; i++) {
+                                if (is_lowerable_uniform(inst, i) &&
+                                    inst->src[i].index == max_index) {
+                                        inst->src[i] = temp;
+                                        remove_uniform(ht, unif);
+                                        count--;
+                                }
+                        }
+
+                        /* If the instruction doesn't need lowering any more,
+                         * then drop its remaining uniforms from the table so
+                         * they stop counting toward later choices.
+                         */
+                        if (count <= 1) {
+                                for (int i = 0; i < nsrc; i++) {
+                                        if (is_lowerable_uniform(inst, i))
+                                                remove_uniform(ht, inst->src[i]);
+                                }
+                        }
+                }
+        }
+
+        _mesa_hash_table_destroy(ht, NULL);
+}