From cc8fb2904673588d31b660dbfaf692615b5202dd Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 31 Jul 2015 11:46:56 -0700
Subject: [PATCH] vc4: Make r4-writes implicitly move to a temp, and allocate
 temps to r4.

Previously, SFU values always moved to a temporary, and TLB color reads
and texture reads always lived in r4.  Instead, we can have these results
just be normal temporaries, and the register allocator can leave the
values in r4 when they don't interfere with anything else using r4.

shader-db results:
total instructions in shared programs: 100809 -> 100040 (-0.76%)
instructions in affected programs:     42383 -> 41614 (-1.81%)
---
 src/gallium/drivers/vc4/vc4_context.h         |  1 +
 .../drivers/vc4/vc4_opt_copy_propagation.c    |  5 +-
 src/gallium/drivers/vc4/vc4_opt_cse.c         | 16 +---
 src/gallium/drivers/vc4/vc4_program.c         | 19 ++---
 src/gallium/drivers/vc4/vc4_qir.c             | 18 ----
 src/gallium/drivers/vc4/vc4_qir.h             | 13 ---
 src/gallium/drivers/vc4/vc4_qpu_emit.c        | 58 ++++++-------
 .../drivers/vc4/vc4_register_allocate.c       | 83 ++++++++++++++-----
 8 files changed, 106 insertions(+), 107 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 30fb285eefe..654c46f3c0d 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
 
         struct ra_regs *regs;
         unsigned int reg_class_any;
+        unsigned int reg_class_r4_or_a;
         unsigned int reg_class_a;
 
         uint8_t prim_mode;
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index d6d2fbf257f..a755de9aa41 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c)
 
                 if (inst->op == QOP_MOV &&
                     inst->dst.file == QFILE_TEMP &&
-                    inst->src[0].file != QFILE_VPM &&
-                    !(inst->src[0].file == QFILE_TEMP &&
-                      (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT ||
-                       c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) {
+                    inst->src[0].file != QFILE_VPM) {
                         movs[inst->dst.index] = inst->src[0];
                 }
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 51a56504e5e..0e5480ea781 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -46,8 +46,7 @@ struct inst_key {
         struct qreg src[4];
         /**
          * If the instruction depends on the flags, how many SFs have been
-         * seen before this instruction, or if it depends on r4, how many r4
-         * writes have been seen.
+         * seen before this instruction.
          */
         uint32_t implicit_arg_update_count;
 };
@@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b)
 
 static struct qinst *
 vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
-             struct qinst *inst, uint32_t sf_count,
-             uint32_t r4_count)
+             struct qinst *inst, uint32_t sf_count)
 {
         if (inst->dst.file != QFILE_TEMP ||
             inst->op == QOP_MOV ||
@@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
                qir_get_op_nsrc(inst->op) * sizeof(key.src[0]));
         if (qir_depends_on_flags(inst))
                 key.implicit_arg_update_count = sf_count;
-        if (qir_reads_r4(inst))
-                key.implicit_arg_update_count = r4_count;
 
         uint32_t hash = _mesa_hash_data(&key, sizeof(key));
         struct hash_entry *entry =
@@ -121,7 +117,7 @@ bool
 qir_opt_cse(struct vc4_compile *c)
 {
         bool progress = false;
-        uint32_t sf_count = 0, r4_count = 0;
+        uint32_t sf_count = 0;
 
         struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
                                                         inst_key_equals);
@@ -138,8 +134,7 @@ qir_opt_cse(struct vc4_compile *c)
                 if (inst->sf) {
                         sf_count++;
                 } else {
-                        struct qinst *cse = vc4_find_cse(c, ht, inst,
-                                                         sf_count, r4_count);
+                        struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count);
                         if (cse) {
                                 inst->src[0] = cse->dst;
                                 for (int i = 1; i < qir_get_op_nsrc(inst->op);
@@ -155,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c)
                                 }
                         }
                 }
-
-                if (qir_writes_r4(inst))
-                        r4_count++;
         }
 
         ralloc_free(ht);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f2742986beb..5e2a3f448a0 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -105,9 +105,8 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                                                      range->size - 4)));
 
         qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
-        struct qreg r4 = qir_TEX_RESULT(c);
         c->num_texture_samples++;
-        return qir_MOV(c, r4);
+        return qir_TEX_RESULT(c);
 }
 
 static struct qreg *
@@ -360,13 +359,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
         qir_TEX_S(c, s, texture_u[next_texture_u++]);
 
         c->num_texture_samples++;
-        struct qreg r4 = qir_TEX_RESULT(c);
+        struct qreg tex = qir_TEX_RESULT(c);
 
         enum pipe_format format = c->key->tex[unit].format;
 
         struct qreg unpacked[4];
         if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4,
+                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
                                                          qir_uniform_ui(c, 8)));
                 struct qreg normalized = qir_FMUL(c, depthf,
                                                   qir_uniform_f(c, 1.0f/0xffffff));
@@ -418,7 +417,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                         unpacked[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_R4_UNPACK(c, r4, i);
+                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
         const uint8_t *format_swiz = vc4_get_format_swizzle(format);
@@ -1305,9 +1304,10 @@ blend_pipeline(struct vc4_compile *c)
         if (c->fs_key->blend.blend_enable ||
             c->fs_key->blend.colormask != 0xf ||
             c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                struct qreg r4 = qir_TLB_COLOR_READ(c);
+                packed_dst_color = qir_TLB_COLOR_READ(c);
                 for (int i = 0; i < 4; i++)
-                        tlb_read_color[i] = qir_R4_UNPACK(c, r4, i);
+                        tlb_read_color[i] = qir_UNPACK_8_F(c,
+                                                           packed_dst_color, i);
                 for (int i = 0; i < 4; i++) {
                         dst_color[i] = get_swizzled_channel(c,
                                                             tlb_read_color,
@@ -1319,11 +1319,6 @@ blend_pipeline(struct vc4_compile *c)
                                 linear_dst_color[i] = dst_color[i];
                         }
                 }
-
-                /* Save the packed value for logic ops.  Can't reuse r4
-                 * because other things might smash it (like sRGB)
-                 */
-                packed_dst_color = qir_MOV(c, r4);
         }
 
         struct qreg undef_array[4] = { c->undef, c->undef, c->undef, c->undef };
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 1c96ef4795f..254140a72f5 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TEX_B] = { "tex_b", 0, 2 },
         [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
-        [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
-        [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
-        [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 },
-        [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 },
         [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
         [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
         [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
@@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst)
         }
 }
 
-bool
-qir_reads_r4(struct qinst *inst)
-{
-        switch (inst->op) {
-        case QOP_R4_UNPACK_A:
-        case QOP_R4_UNPACK_B:
-        case QOP_R4_UNPACK_C:
-        case QOP_R4_UNPACK_D:
-                return true;
-        default:
-                return false;
-        }
-}
-
 static void
 qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index e2d2574f1b1..7a74018d9af 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -158,10 +158,6 @@ enum qop {
          * the destination
          */
         QOP_TEX_RESULT,
-        QOP_R4_UNPACK_A,
-        QOP_R4_UNPACK_B,
-        QOP_R4_UNPACK_C,
-        QOP_R4_UNPACK_D
 };
 
 struct queued_qpu_inst {
@@ -442,7 +438,6 @@ bool qir_is_multi_instruction(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
-bool qir_reads_r4(struct qinst *inst);
 bool qir_src_needs_a_file(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
 
@@ -578,14 +573,6 @@ QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
 
-static inline struct qreg
-qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef));
-        return t;
-}
-
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index e1b3f3ce99a..f324056258c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -320,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 abort();
                         }
 
-                        queue(c, qpu_a_MOV(dst, qpu_r4()));
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
 
                         break;
 
@@ -403,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_COLOR_LOAD);
 
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_TLB_COLOR_WRITE:
@@ -452,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_NOP());
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_LOAD_TMU0);
-
-                        break;
-
-                case QOP_R4_UNPACK_A:
-                case QOP_R4_UNPACK_B:
-                case QOP_R4_UNPACK_C:
-                case QOP_R4_UNPACK_D:
-                        assert(src[0].mux == QPU_MUX_R4);
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                       (qinst->op -
-                                                        QOP_R4_UNPACK_A),
-                                                       QPU_UNPACK);
-
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_UNPACK_8A_F:
@@ -475,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_UNPACK_8D_F:
                 case QOP_UNPACK_16A_F:
                 case QOP_UNPACK_16B_F: {
-                        assert(src[0].mux == QPU_MUX_A);
-
-                        /* Since we're setting the pack bits, if the
-                         * destination is in A it would get re-packed.
-                         */
-                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
-                                             qpu_rb(31) : dst),
-                                            src[0], src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                  QOP_UNPACK_8A_F],
-                                                       QPU_UNPACK);
+                        if (src[0].mux == QPU_MUX_R4) {
+                                queue(c, qpu_a_MOV(dst, src[0]));
+                                *last_inst(c) |= QPU_PM;
+                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
+                                                               (qinst->op -
+                                                                QOP_UNPACK_8A_F),
+                                                               QPU_UNPACK);
+                        } else {
+                                assert(src[0].mux == QPU_MUX_A);
 
-                        if (dst.mux == QPU_MUX_A) {
-                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                /* Since we're setting the pack bits, if the
+                                 * destination is in A it would get re-packed.
+                                 */
+                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
+                                                     qpu_rb(31) : dst),
+                                                    src[0], src[0]));
+                                *last_inst(c) |=
+                                        QPU_SET_FIELD(unpack_map[qinst->op -
+                                                                 QOP_UNPACK_8A_F],
+                                                      QPU_UNPACK);
+
+                                if (dst.mux == QPU_MUX_A) {
+                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                }
                         }
                 }
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 73964b48dca..a29db1f3abe 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
                 /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
                  * vc4_qpu_emit.c
@@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                 /* R4 can't be written as a general purpose register. (it's
                  * TMU_NOSWAP as a write address).
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R4)
+                if (vc4_regs[i].mux == QPU_MUX_R4) {
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
                         continue;
+                }
 
                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
         }
 
-        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
-        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
+        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
                 ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+        }
 
         ra_set_finalize(vc4->regs, NULL);
 }
@@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b)
         return a->priority - b->priority;
 }
 
+#define CLASS_BIT_A			(1 << 0)
+#define CLASS_BIT_B_OR_ACC		(1 << 1)
+#define CLASS_BIT_R4			(1 << 2)
+
 /**
  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
  *
@@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t temp_to_node[c->num_temps];
         uint32_t def[c->num_temps];
         uint32_t use[c->num_temps];
+        uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         memset(def, 0, sizeof(def));
@@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                          c->num_temps);
 
-        for (uint32_t i = 0; i < c->num_temps; i++) {
-                ra_set_node_class(g, i, vc4->reg_class_any);
-        }
-
         /* Compute the live ranges so we can figure out interference.
          */
         uint32_t ip = 0;
@@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 temp_to_node[map[i].temp] = i;
         }
 
-        /* Figure out our register classes and preallocated registers*/
+        /* Figure out our register classes and preallocated registers.  We
+         * start with any temp being able to be in any file, then instructions
+         * incrementally remove bits that the temp definitely can't be in.
+         */
+        memset(class_bits,
+               CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+               sizeof(class_bits));
+
+        ip = 0;
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+                if (qir_writes_r4(inst)) {
+                        /* This instruction writes r4 (and optionally moves
+                         * its result to a temp), so nothing else can be
+                         * stored in r4 across it.
+                         */
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (def[i] < ip && use[i] > ip)
+                                        class_bits[i] &= ~CLASS_BIT_R4;
+                        }
+                } else {
+                        /* R4 can't be written as a general purpose
+                         * register. (it's TMU_NOSWAP as a write address).
+                         */
+                        if (inst->dst.file == QFILE_TEMP)
+                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
+                }
+
                 switch (inst->op) {
                 case QOP_FRAG_Z:
                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
@@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                         break;
 
-                case QOP_TEX_RESULT:
-                case QOP_TLB_COLOR_READ:
-                        assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
-                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
-                                        ACC_INDEX + 4);
-                        break;
-
                 case QOP_PACK_SCALED:
                         /* The pack flags require an A-file dst register. */
-                        ra_set_node_class(g, temp_to_node[inst->dst.index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                         break;
 
                 default:
@@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 }
 
                 if (qir_src_needs_a_file(inst)) {
-                        ra_set_node_class(g, temp_to_node[inst->src[0].index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                }
+                ip++;
+        }
+
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                int node = temp_to_node[i];
+
+                switch (class_bits[i]) {
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+                        ra_set_node_class(g, node, vc4->reg_class_any);
+                        break;
+                case CLASS_BIT_A | CLASS_BIT_R4:
+                        ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+                        break;
+                case CLASS_BIT_A:
+                        ra_set_node_class(g, node, vc4->reg_class_a);
+                        break;
+                default:
+                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
+                                i, class_bits[i]);
+                        abort();
+                        break;
                 }
         }
 
-- 
2.30.2