vc4: Reserve rb31 instead of r3 for raddr conflict spills.
author Eric Anholt <eric@anholt.net>
Tue, 9 Dec 2014 00:52:53 +0000 (16:52 -0800)
committer Eric Anholt <eric@anholt.net>
Tue, 9 Dec 2014 09:04:46 +0000 (01:04 -0800)
This increases the cost of a raddr B conflict spill: we now save r3 to rb31,
move src1 to r3, and move rb31 back to r3 when done, instead of just moving
src1 to r3.  On average, though, instruction pairing makes it more worthwhile
to have another accumulator available.

total instructions in shared programs: 46428 -> 46171 (-0.55%)
instructions in affected programs:     38030 -> 37773 (-0.68%)

src/gallium/drivers/vc4/vc4_qpu_emit.c
src/gallium/drivers/vc4/vc4_register_allocate.c

index 856f84444d539c121c5ee6f62006a4fd0e8fb619..f2620c0a75f8e6c839213b05f900fea911879955 100644 (file)
@@ -93,21 +93,41 @@ swap_file(struct qpu_reg *src)
  * In that case, we need to move one to a temporary that can be used in the
  * instruction, instead.
  */
-static void
+static bool
 fixup_raddr_conflict(struct vc4_compile *c,
-                     struct qpu_reg *src0, struct qpu_reg *src1)
+                     struct qpu_reg dst,
+                     struct qpu_reg *src0, struct qpu_reg *src1,
+                     bool r3_live)
 {
         if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
             src0->mux != src1->mux ||
             src0->addr == src1->addr) {
-                return;
+                return false;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return;
+                return false;
+
+        if (src0->mux == QPU_MUX_A) {
+                /* If we're conflicting over the A regfile, then we can just
+                 * use the reserved rb31.
+                 */
+                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
+                *src1 = qpu_rb(31);
+                return false;
+        } else {
+                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
+                 * rb31, then store our desired value in r3, and tell the
+                 * caller to put rb31 back into r3 when we're done.
+                 */
+                if (r3_live)
+                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
+                queue(c, qpu_a_MOV(qpu_r3(), *src1));
+
+                *src1 = qpu_r3();
 
-        queue(c, qpu_a_MOV(qpu_r3(), *src1));
-        *src1 = qpu_r3();
+                return r3_live && dst.mux != QPU_MUX_R3;
+        }
 }
 
 void
@@ -118,6 +138,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t inputs_remaining = c->num_inputs;
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
+        bool written_r3 = false;
+        bool needs_restore;
 
         make_empty_list(&c->qpu_inst_list);
 
@@ -416,8 +438,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -477,7 +503,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -488,8 +516,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
+
                         break;
                 }
+
+                if (dst.mux == QPU_MUX_R3)
+                        written_r3 = true;
         }
 
         qpu_schedule_instructions(c);
index 3001900c0744887355ce35e338dcb9ec227c0ad1..85f29e54f94f3e72542a672b7dafc2452b9495ca 100644 (file)
@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
-                /* Reserve r3 for now, since we're using it for spilling-like
-                 * operations in vc4_qpu_emit.c
+                /* Reserve rb31 as the spill register used by
+                 * fixup_raddr_conflict() in vc4_qpu_emit.c
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R3)
+                if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
                         continue;
 
                 /* R4 can't be written as a general purpose register. (it's