v3d: Rotate through registers to improve post-RA scheduling options.
author: Eric Anholt <eric@anholt.net>
Fri, 20 Jul 2018 19:05:57 +0000 (12:05 -0700)
committer: Eric Anholt <eric@anholt.net>
Mon, 23 Jul 2018 17:21:42 +0000 (10:21 -0700)
Similarly to VC4's implementation, by not picking r0 immediately upon
freeing it, we give the scheduler more of a chance to fit later writes in
earlier.  I'm not clear on whether there's any real cost to picking phys
over accumulators, so keep that behavior for now.

shader-db:
total instructions in shared programs: 96831 -> 95669 (-1.20%)
instructions in affected programs:     77254 -> 76092 (-1.50%)

src/broadcom/compiler/vir_register_allocate.c

index 4ec5f232643ec1fb9b6ea65690a49e4ec05522af..aa5e2139c1b3180f97a3586b0c540c6f098581ef 100644 (file)
@@ -238,6 +238,43 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
                 BITSET_CLEAR(c->spillable, i);
 }
 
                 BITSET_CLEAR(c->spillable, i);
 }
 
+/* Round-robin cursors read and advanced by v3d_ra_select_callback(). */
+struct v3d_ra_select_callback_data {
+        /* Offset from ACC_INDEX where the next accumulator search begins. */
+        uint32_t next_acc;
+        /* Offset from PHYS_INDEX where the next phys-register search begins. */
+        uint32_t next_phys;
+};
+
+/**
+ * Register-selection callback: picks one register out of the candidate
+ * bitset \p regs, rotating the starting point between calls.
+ *
+ * \param g     Interference graph; not used by this implementation.
+ * \param regs  Bitset of candidate registers, tested by register index.
+ * \param data  Pointer to a struct v3d_ra_select_callback_data holding the
+ *              round-robin cursors; it is updated on every successful pick.
+ *
+ * \return The index of the chosen register: the first candidate accumulator
+ *         found starting from the accumulator cursor, otherwise the first
+ *         candidate phys register found starting from the phys cursor.
+ */
+static unsigned int
+v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+        struct v3d_ra_select_callback_data *v3d_ra = data;
+
+        /* Choose an accumulator if possible (I think it's lower power than
+         * phys regs), but round-robin through them to give post-RA
+         * instruction scheduling more options.
+         *
+         * The cursor stored below may equal ACC_COUNT; the modulo on the
+         * next call wraps it back to 0.
+         */
+        for (int i = 0; i < ACC_COUNT; i++) {
+                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
+                int acc = ACC_INDEX + acc_off;
+
+                if (BITSET_TEST(regs, acc)) {
+                        v3d_ra->next_acc = acc_off + 1;
+                        return acc;
+                }
+        }
+
+        /* No accumulator was available: fall back to the phys registers,
+         * rotating through them in the same way.  Only next_phys is
+         * advanced on this path; next_acc is left untouched.
+         */
+        for (int i = 0; i < PHYS_COUNT; i++) {
+                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                int phys = PHYS_INDEX + phys_off;
+
+                if (BITSET_TEST(regs, phys)) {
+                        v3d_ra->next_phys = phys_off + 1;
+                        return phys;
+                }
+        }
+
+        unreachable("RA must pass us at least one possible reg.");
+}
+
 bool
 vir_init_reg_sets(struct v3d_compiler *compiler)
 {
 bool
 vir_init_reg_sets(struct v3d_compiler *compiler)
 {
@@ -309,6 +346,13 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         int acc_nodes[ACC_COUNT];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         int acc_nodes[ACC_COUNT];
+        struct v3d_ra_select_callback_data callback_data = {
+                .next_acc = 0,
+                /* Start at RF3, to try to keep the TLB writes from using
+                 * RF0-2.
+                 */
+                .next_phys = 3,
+        };
 
         *spilled = false;
 
 
         *spilled = false;
 
@@ -328,6 +372,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
         struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                          c->num_temps +
                                                          ARRAY_SIZE(acc_nodes));
         struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                          c->num_temps +
                                                          ARRAY_SIZE(acc_nodes));
+        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
 
         /* Make some fixed nodes for the accumulators, which we will need to
          * interfere with when ops have implied r3/r4 writes or for the thread
 
         /* Make some fixed nodes for the accumulators, which we will need to
          * interfere with when ops have implied r3/r4 writes or for the thread