vc4: Add support for turning constant uniforms into small immediates.
author Eric Anholt <eric@anholt.net>
Wed, 10 Dec 2014 22:56:46 +0000 (14:56 -0800)
committer Eric Anholt <eric@anholt.net>
Thu, 18 Dec 2014 03:35:13 +0000 (19:35 -0800)
Small immediates have the downside of taking over the raddr B field, so
you might have less chance to pack instructions together thanks to raddr B
conflicts.  However, it also reduces some register pressure since it lets
you load 2 "uniform" values in one instruction (avoiding a previous load
of the constant value to a register), and increases some pairing for the
same reason.

total uniforms in shared programs: 16231 -> 13374 (-17.60%)
uniforms in affected programs:     10280 -> 7423 (-27.79%)
total instructions in shared programs: 40795 -> 41168 (0.91%)
instructions in affected programs:     25551 -> 25924 (1.46%)

In a previous version of this patch I had a reduction in instruction count
by forcing the other args alongside a SMALL_IMM to be in the A file or
accumulators, but that increases register pressure and had a bug in
handling FRAG_Z.  In this patch I just use raddr conflict resolution,
which is more expensive.  I think I'd rather tweak allocation to have some
way to slightly prefer good choices for files in general, rather than risk
failing to register allocate by forcing things into register classes.

13 files changed:
src/gallium/drivers/vc4/Makefile.sources
src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
src/gallium/drivers/vc4/vc4_opt_algebraic.c
src/gallium/drivers/vc4/vc4_opt_small_immediates.c [new file with mode: 0644]
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qpu.c
src/gallium/drivers/vc4/vc4_qpu.h
src/gallium/drivers/vc4/vc4_qpu_defines.h
src/gallium/drivers/vc4/vc4_qpu_disasm.c
src/gallium/drivers/vc4/vc4_qpu_emit.c
src/gallium/drivers/vc4/vc4_qpu_schedule.c
src/gallium/drivers/vc4/vc4_qpu_validate.c

index 6bcb731d03497b95e29972fd64b89d401f9ff31e..1f8e8c41bf45e00d601a0954b31192454c50c6f1 100644 (file)
@@ -15,6 +15,7 @@ C_SOURCES := \
        vc4_opt_copy_propagation.c \
        vc4_opt_cse.c \
        vc4_opt_dead_code.c \
+       vc4_opt_small_immediates.c \
        vc4_packet.h \
        vc4_program.c \
        vc4_qir.c \
index f5e152bab55133c7d249a88c37b4a1c1ba95ac8e..48bc683da5c0051bebcf96b793b8391e95486cb5 100644 (file)
@@ -133,12 +133,18 @@ check_tmu_write(uint64_t inst,
        int tmu = waddr > QPU_W_TMU0_B;
        bool submit = is_tmu_submit(waddr);
        bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
+       uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
        if (is_direct) {
                uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
                uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
                uint32_t clamp_offset = ~0;
 
+               if (sig == QPU_SIG_SMALL_IMM) {
+                       DRM_ERROR("direct TMU read used small immediate\n");
+                       return false;
+               }
+
                /* Make sure that this texture load is an add of the base
                 * address of the UBO to a clamped offset within the UBO.
                 */
@@ -180,7 +186,8 @@ check_tmu_write(uint64_t inst,
 
                validation_state->tmu_setup[tmu].is_direct = true;
        } else {
-               if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) {
+               if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
+                                             raddr_b == QPU_R_UNIF)) {
                        DRM_ERROR("uniform read in the same instruction as "
                                  "texture setup.\n");
                        return false;
@@ -298,6 +305,7 @@ track_live_clamps(uint64_t inst,
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+       uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        bool is_b = inst & QPU_WS;
        uint32_t live_reg_index;
 
@@ -305,7 +313,8 @@ track_live_clamps(uint64_t inst,
                return;
 
        if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
-           !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+           !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+             sig != QPU_SIG_SMALL_IMM)) {
                return;
        }
 
@@ -344,9 +353,10 @@ check_instruction_reads(uint64_t inst,
 {
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+       uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
        if (raddr_a == QPU_R_UNIF ||
-           raddr_b == QPU_R_UNIF) {
+           (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
                /* This can't overflow the uint32_t, because we're reading 8
                 * bytes of instruction to increment by 4 here, so we'd
                 * already be OOM.
@@ -401,6 +411,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
                case QPU_SIG_LOAD_TMU0:
                case QPU_SIG_LOAD_TMU1:
                case QPU_SIG_PROG_END:
+               case QPU_SIG_SMALL_IMM:
                        if (!check_instruction_writes(inst, validated_shader,
                                                      &validation_state)) {
                                DRM_ERROR("Bad write at ip %d\n", ip);
index 4376c7ba08f38ee6a79cf07e509677cdc3194a03..d36bb2d6596e057294a362fc3b527656bc44108d 100644 (file)
@@ -59,24 +59,34 @@ dump_to(struct vc4_compile *c, struct qinst *inst)
         fprintf(stderr, "\n");
 }
 
+static bool
+is_constant_value(struct vc4_compile *c, struct qinst **defs, struct qreg reg,
+                  uint32_t val)
+{       /* Does reg hold exactly val?  (defs is not referenced in this body.) */
+        if (reg.file == QFILE_UNIF &&
+            c->uniform_contents[reg.index] == QUNIFORM_CONSTANT &&
+            c->uniform_data[reg.index] == val) {
+                return true;    /* constant uniform whose data equals val */
+        }
+
+        if (reg.file == QFILE_SMALL_IMM && reg.index == val)
+                return true;    /* small immediate whose index equals val */
+
+        return false;           /* neither form matched */
+}
+
 static bool
 is_zero(struct vc4_compile *c, struct qinst **defs, struct qreg reg)
 {
         reg = qir_follow_movs(defs, reg);
-
-        return (reg.file == QFILE_UNIF &&
-                c->uniform_contents[reg.index] == QUNIFORM_CONSTANT &&
-                c->uniform_data[reg.index] == 0);
+        return is_constant_value(c, defs, reg, 0);
 }
 
 static bool
 is_1f(struct vc4_compile *c, struct qinst **defs, struct qreg reg)
 {
         reg = qir_follow_movs(defs, reg);
-
-        return (reg.file == QFILE_UNIF &&
-                c->uniform_contents[reg.index] == QUNIFORM_CONSTANT &&
-                c->uniform_data[reg.index] == fui(1.0));
+        return is_constant_value(c, defs, reg, fui(1.0));
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
new file mode 100644 (file)
index 0000000..8b98ce3
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_opt_small_immediates.c
+ *
+ * Turns references to small constant uniform values into small immediates
+ * fields.
+ */
+
+#include "vc4_qir.h"
+#include "vc4_qpu.h"
+
+static bool debug;
+
+bool
+qir_opt_small_immediates(struct vc4_compile *c)
+{       /* Returns true if any source operand was rewritten. */
+        bool progress = false;
+        struct simple_node *node;
+        struct qinst *defs[c->num_temps];       /* temp index -> writing inst */
+
+        foreach(node, &c->instructions) {
+                struct qinst *inst = (struct qinst *)node;
+
+                if (inst->dst.file == QFILE_TEMP)
+                        defs[inst->dst.index] = inst;
+
+                /* The small immediate value sits in the raddr B field, so we
+                 * can't have 2 small immediates in one instruction (unless
+                 * they're the same value, but that should be optimized away
+                 * elsewhere).  Skip instructions that already use one.
+                 */
+                bool uses_small_imm = false;
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_SMALL_IMM)
+                                uses_small_imm = true;
+                }
+                if (uses_small_imm)
+                        continue;
+
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        struct qreg src = qir_follow_movs(defs, inst->src[i]);
+
+                        if (src.file != QFILE_UNIF ||
+                            c->uniform_contents[src.index] !=
+                            QUNIFORM_CONSTANT) {
+                                continue;       /* only constant uniforms qualify */
+                        }
+
+                        if (i == 1 &&
+                            (inst->op == QOP_TEX_S ||
+                             inst->op == QOP_TEX_T ||
+                             inst->op == QOP_TEX_R ||
+                             inst->op == QOP_TEX_B)) {
+                                /* No turning the implicit uniform read into
+                                 * an immediate.
+                                 */
+                                continue;
+                        }
+
+                        uint32_t imm = c->uniform_data[src.index];
+                        uint32_t small_imm = qpu_encode_small_immediate(imm);
+                        if (small_imm == ~0)
+                                continue;       /* value has no encoding */
+
+                        if (debug) {
+                                fprintf(stderr, "opt_small_immediate() from: ");
+                                qir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        inst->src[i].file = QFILE_SMALL_IMM;
+                        inst->src[i].index = imm;       /* raw value, not the encoded field */
+                        if (debug) {
+                                fprintf(stderr, "to: ");
+                                qir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        progress = true;
+                        break;  /* at most one rewrite per instruction per pass */
+                }
+        }
+
+        return progress;
+}
index d7251abda1c497c273acbb3230e5a6ce026abd73..8cb9826a21d35e8d02c1cf056d28c0c174378e71 100644 (file)
@@ -204,16 +204,22 @@ qir_reads_r4(struct qinst *inst)
 static void
 qir_print_reg(struct vc4_compile *c, struct qreg reg)
 {
-        const char *files[] = {
+        static const char *files[] = {
                 [QFILE_TEMP] = "t",
                 [QFILE_VARY] = "v",
                 [QFILE_UNIF] = "u",
         };
 
-        if (reg.file == QFILE_NULL)
+        if (reg.file == QFILE_NULL) {
                 fprintf(stderr, "null");
-        else
+        } else if (reg.file == QFILE_SMALL_IMM) {
+                if ((int)reg.index >= -16 && (int)reg.index <= 15)
+                        fprintf(stderr, "%d", reg.index);
+                else
+                        fprintf(stderr, "%f", uif(reg.index));
+        } else {
                 fprintf(stderr, "%s%d", files[reg.file], reg.index);
+        }
 
         if (reg.file == QFILE_UNIF &&
             c->uniform_contents[reg.index] == QUNIFORM_CONSTANT) {
@@ -386,6 +392,7 @@ qir_optimize(struct vc4_compile *c)
                 OPTPASS(qir_opt_cse);
                 OPTPASS(qir_opt_copy_propagation);
                 OPTPASS(qir_opt_dead_code);
+                OPTPASS(qir_opt_small_immediates);
 
                 if (!progress)
                         break;
index 40c0d3d04dd68e118727a7d0bd742980a47886b0..db0a436722284c7148bad8454a0ee00709403300 100644 (file)
@@ -38,6 +38,12 @@ enum qfile {
         QFILE_TEMP,
         QFILE_VARY,
         QFILE_UNIF,
+
+        /**
+         * Stores an immediate value in the index field that can be turned
+         * into a small immediate field by qpu_encode_small_immediate().
+         */
+        QFILE_SMALL_IMM,
 };
 
 struct qreg {
@@ -382,6 +388,7 @@ bool qir_opt_algebraic(struct vc4_compile *c);
 bool qir_opt_copy_propagation(struct vc4_compile *c);
 bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
+bool qir_opt_small_immediates(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
 
index 52c06ae551744a9b3aabba72a0601de94a49329b..7e38ede334278e67ca6afaf4b13470cc9ea52fc8 100644 (file)
@@ -26,6 +26,9 @@
 #include "vc4_qir.h"
 #include "vc4_qpu.h"
 
+#define QPU_MUX(mux, muxfield)                                  \
+        QPU_SET_FIELD(mux != QPU_MUX_SMALL_IMM ? mux : QPU_MUX_B, muxfield)
+
 static uint64_t
 set_src_raddr(uint64_t inst, struct qpu_reg src)
 {
@@ -36,11 +39,23 @@ set_src_raddr(uint64_t inst, struct qpu_reg src)
         }
 
         if (src.mux == QPU_MUX_B) {
-                assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP ||
-                       QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr);
+                assert((QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP ||
+                        QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr) &&
+                       QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM);
                 return QPU_UPDATE_FIELD(inst, src.addr, QPU_RADDR_B);
         }
 
+        if (src.mux == QPU_MUX_SMALL_IMM) {
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM) {
+                        assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr);
+                } else {
+                        inst = qpu_set_sig(inst, QPU_SIG_SMALL_IMM);
+                        assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP);
+                }
+                return ((inst & ~QPU_RADDR_B_MASK) |
+                        QPU_SET_FIELD(src.addr, QPU_RADDR_B));
+        }
+
         return inst;
 }
 
@@ -101,15 +116,15 @@ qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src)
 {
         uint64_t inst = 0;
 
+        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_A_OR, QPU_OP_ADD);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B);
         inst |= qpu_a_dst(dst);
         inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD);
-        inst |= QPU_SET_FIELD(src.mux, QPU_ADD_A);
-        inst |= QPU_SET_FIELD(src.mux, QPU_ADD_B);
+        inst |= QPU_MUX(src.mux, QPU_ADD_A);
+        inst |= QPU_MUX(src.mux, QPU_ADD_B);
         inst = set_src_raddr(inst, src);
-        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL);
 
         return inst;
@@ -120,15 +135,15 @@ qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src)
 {
         uint64_t inst = 0;
 
+        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_M_V8MIN, QPU_OP_MUL);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B);
         inst |= qpu_m_dst(dst);
         inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL);
-        inst |= QPU_SET_FIELD(src.mux, QPU_MUL_A);
-        inst |= QPU_SET_FIELD(src.mux, QPU_MUL_B);
+        inst |= QPU_MUX(src.mux, QPU_MUL_A);
+        inst |= QPU_MUX(src.mux, QPU_MUL_B);
         inst = set_src_raddr(inst, src);
-        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD);
 
         return inst;
@@ -155,16 +170,16 @@ qpu_a_alu2(enum qpu_op_add op,
 {
         uint64_t inst = 0;
 
+        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(op, QPU_OP_ADD);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B);
         inst |= qpu_a_dst(dst);
         inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD);
-        inst |= QPU_SET_FIELD(src0.mux, QPU_ADD_A);
+        inst |= QPU_MUX(src0.mux, QPU_ADD_A);
         inst = set_src_raddr(inst, src0);
-        inst |= QPU_SET_FIELD(src1.mux, QPU_ADD_B);
+        inst |= QPU_MUX(src1.mux, QPU_ADD_B);
         inst = set_src_raddr(inst, src1);
-        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL);
 
         return inst;
@@ -176,16 +191,16 @@ qpu_m_alu2(enum qpu_op_mul op,
 {
         uint64_t inst = 0;
 
+        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(op, QPU_OP_MUL);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A);
         inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B);
         inst |= qpu_m_dst(dst);
         inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL);
-        inst |= QPU_SET_FIELD(src0.mux, QPU_MUL_A);
+        inst |= QPU_MUX(src0.mux, QPU_MUL_A);
         inst = set_src_raddr(inst, src0);
-        inst |= QPU_SET_FIELD(src1.mux, QPU_MUL_B);
+        inst |= QPU_MUX(src1.mux, QPU_MUL_B);
         inst = set_src_raddr(inst, src1);
-        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);
         inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD);
 
         return inst;
@@ -243,7 +258,8 @@ qpu_num_sf_accesses(uint64_t inst)
 
         if (raddr_a == QPU_R_MUTEX_ACQUIRE)
                 accesses++;
-        if (raddr_b == QPU_R_MUTEX_ACQUIRE)
+        if (raddr_b == QPU_R_MUTEX_ACQUIRE &&
+            QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM)
                 accesses++;
 
         /* XXX: semaphore, combined color read/write? */
@@ -383,6 +399,8 @@ qpu_merge_inst(uint64_t a, uint64_t b)
 {
         uint64_t merge = a | b;
         bool ok = true;
+        uint32_t a_sig = QPU_GET_FIELD(a, QPU_SIG);
+        uint32_t b_sig = QPU_GET_FIELD(b, QPU_SIG);
 
         if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP &&
             QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP) {
@@ -402,8 +420,10 @@ qpu_merge_inst(uint64_t a, uint64_t b)
         if (qpu_num_sf_accesses(a) && qpu_num_sf_accesses(b))
                 return 0;
 
-        if (QPU_GET_FIELD(a, QPU_SIG) == QPU_SIG_LOAD_IMM ||
-            QPU_GET_FIELD(b, QPU_SIG) == QPU_SIG_LOAD_IMM) {
+        if (a_sig == QPU_SIG_LOAD_IMM ||
+            b_sig == QPU_SIG_LOAD_IMM ||
+            a_sig == QPU_SIG_SMALL_IMM ||
+            b_sig == QPU_SIG_SMALL_IMM) {
                 return 0;
         }
 
@@ -501,6 +521,56 @@ qpu_inst_is_tlb(uint64_t inst)
                 sig == QPU_SIG_WAIT_FOR_SCOREBOARD);
 }
 
+/**
+ * Returns the small immediate value to be encoded into the raddr b field if
+ * the argument can be represented as one, or ~0 otherwise.
+ */
+uint32_t
+qpu_encode_small_immediate(uint32_t i)
+{
+        if (i <= 15)            /* 0..15 encode as themselves */
+                return i;
+        if ((int)i < 0 && (int)i >= -16)        /* -16..-1 map to 16..31 */
+                return i + 32;
+
+        switch (i) {            /* float bit patterns for powers of two */
+        case 0x3f800000:        /* 1.0 */
+                return 32;
+        case 0x40000000:        /* 2.0 */
+                return 33;
+        case 0x40800000:        /* 4.0 */
+                return 34;
+        case 0x41000000:        /* 8.0 */
+                return 35;
+        case 0x41800000:        /* 16.0 */
+                return 36;
+        case 0x42000000:        /* 32.0 */
+                return 37;
+        case 0x42800000:        /* 64.0 */
+                return 38;
+        case 0x43000000:        /* 128.0 */
+                return 39;
+        case 0x3b800000:        /* 1.0 / 256 */
+                return 40;
+        case 0x3c000000:        /* 1.0 / 128 */
+                return 41;
+        case 0x3c800000:        /* 1.0 / 64 */
+                return 42;
+        case 0x3d000000:        /* 1.0 / 32 */
+                return 43;
+        case 0x3d800000:        /* 1.0 / 16 */
+                return 44;
+        case 0x3e000000:        /* 1.0 / 8 */
+                return 45;
+        case 0x3e800000:        /* 1.0 / 4 */
+                return 46;
+        case 0x3f000000:        /* 1.0 / 2 */
+                return 47;
+        }
+
+        return ~0;              /* not representable as a small immediate */
+}
+
 void
 qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst)
 {
index e1307ebb57b00f30c5775ac65695354f0e4054f0..c9ab6344589151e0bfd8c191da8572e510180b0c 100644 (file)
@@ -134,6 +134,7 @@ uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
 uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
 uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
 uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond);
+uint32_t qpu_encode_small_immediate(uint32_t i);
 
 bool qpu_waddr_is_tlb(uint32_t waddr);
 bool qpu_inst_is_tlb(uint64_t inst);
index a965b9635b1afdfa660303d65a45d6909a47fa83..eb3dfb3382752f4cc267a352422777b3dac7f6f2 100644 (file)
@@ -147,8 +147,11 @@ enum qpu_mux {
         QPU_MUX_A,
         QPU_MUX_B,
 
-        /* non-hardware mux values */
-        QPU_MUX_IMM,
+        /**
+         * Non-hardware mux value, stores a small immediate field to be
+         * programmed into raddr_b in the qpu_reg.addr.
+         */
+        QPU_MUX_SMALL_IMM,
 };
 
 enum qpu_cond {
index b87205a011eef676c96a9c85edf3b670e90f166a..55e0e6139b53b49cc6816c80b66ba8c80e43e5ae 100644 (file)
@@ -291,9 +291,9 @@ print_alu_src(uint64_t inst, uint32_t mux)
                 else if (si <= 39)
                         fprintf(stderr, "%.1f", (float)(1 << (si - 32)));
                 else if (si <= 47)
-                        fprintf(stderr, "%f", 1.0f / (256 / (si - 39)));
+                        fprintf(stderr, "%f", 1.0f / (1 << (48 - si)));
                 else
-                        fprintf(stderr, "???");
+                        fprintf(stderr, "<bad imm %d>", si);
         } else if (raddr <= 31)
                 fprintf(stderr, "r%s%d", file, raddr);
         else {
index 530ec8bf501dde840ea94bd3913e284d79c3a9c5..35300ff42e8de9e5ffa94a08fdfcca52cc6f3de6 100644 (file)
@@ -74,11 +74,15 @@ swap_file(struct qpu_reg *src)
         switch (src->addr) {
         case QPU_R_UNIF:
         case QPU_R_VARY:
-                if (src->mux == QPU_MUX_A)
-                        src->mux = QPU_MUX_B;
-                else
-                        src->mux = QPU_MUX_A;
-                return true;
+                if (src->mux == QPU_MUX_SMALL_IMM) {
+                        return false;
+                } else {
+                        if (src->mux == QPU_MUX_A)
+                                src->mux = QPU_MUX_B;
+                        else
+                                src->mux = QPU_MUX_A;
+                        return true;
+                }
 
         default:
                 return false;
@@ -100,16 +104,20 @@ fixup_raddr_conflict(struct vc4_compile *c,
                      struct qpu_reg *src0, struct qpu_reg *src1,
                      bool r3_live)
 {
-        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
-            src0->mux != src1->mux ||
-            src0->addr == src1->addr) {
+        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
+        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
+
+        if (mux0 <= QPU_MUX_R5 ||
+            mux0 != mux1 ||
+            (src0->addr == src1->addr &&
+             src0->mux == src1->mux)) {
                 return false;
         }
 
         if (swap_file(src0) || swap_file(src1))
                 return false;
 
-        if (src0->mux == QPU_MUX_A) {
+        if (mux0 == QPU_MUX_A) {
                 /* If we're conflicting over the A regfile, then we can just
                  * use the reserved rb31.
                  */
@@ -233,6 +241,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         case QFILE_VARY:
                                 src[i] = qpu_vary();
                                 break;
+                        case QFILE_SMALL_IMM:
+                                src[i].mux = QPU_MUX_SMALL_IMM;
+                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
+                                /* This should only have returned a valid
+                                 * small immediate field, not ~0 for failure.
+                                 */
+                                assert(src[i].addr <= 47);
+                                break;
                         }
                 }
 
@@ -246,6 +262,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
                 case QFILE_VARY:
                 case QFILE_UNIF:
+                case QFILE_SMALL_IMM:
                         assert(!"not reached");
                         break;
                 }
index 0700b0df63da898fa19f3ea55a52fba9354c65cc..f523b4c6fb05e465232bc218cbcc1f8455d8c8b7 100644 (file)
@@ -224,7 +224,8 @@ reads_uniform(uint64_t inst)
                 return false;
 
         return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
-                QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF ||
+                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
+                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
 }
@@ -343,7 +344,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
 
         if (sig != QPU_SIG_LOAD_IMM) {
                 process_raddr_deps(state, n, raddr_a, true);
-                process_raddr_deps(state, n, raddr_b, false);
+                if (sig != QPU_SIG_SMALL_IMM)
+                        process_raddr_deps(state, n, raddr_b, false);
         }
 
         if (add_op != QPU_A_NOP) {
@@ -435,6 +437,7 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
 {
         uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
         uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
         uint32_t src_muxes[] = {
                 QPU_GET_FIELD(inst, QPU_ADD_A),
                 QPU_GET_FIELD(inst, QPU_ADD_B),
@@ -446,6 +449,7 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
                      raddr_a < 32 &&
                      scoreboard->last_waddr_a == raddr_a) ||
                     (src_muxes[i] == QPU_MUX_B &&
+                     sig != QPU_SIG_SMALL_IMM &&
                      raddr_b < 32 &&
                      scoreboard->last_waddr_b == raddr_b)) {
                         return true;
index ffd1b4767b3a79ef7d470196eac3bd996474c38e..8471edbf62cd7bdbc9201cc8fa12220dc5c66df4 100644 (file)
@@ -49,6 +49,7 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
                         return true;
 
                 if (!ignore_b &&
+                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                     src_regs[i].mux == QPU_MUX_B &&
                     (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                         return true;