vc4: Avoid the save/restore of r3 for raddr conflicts, just use ra31.
authorEric Anholt <eric@anholt.net>
Sat, 10 Jan 2015 02:59:56 +0000 (15:59 +1300)
committerEric Anholt <eric@anholt.net>
Sat, 10 Jan 2015 19:57:24 +0000 (08:57 +1300)
Turns out this was harmful in code quality:

total instructions in shared programs: 39487 -> 38845 (-1.63%)
instructions in affected programs:     22522 -> 21880 (-2.85%)

This costs us yet another register, which is painful since it means more
programs might fail to compile).  However, the alternative was causing us
trouble where we'd save/restore r3 while it contained a MIN-ed direct
texture offset, causing the kernel to fail to validate our shaders (such
as in GLB2.7).

src/gallium/drivers/vc4/vc4_qpu_emit.c
src/gallium/drivers/vc4/vc4_register_allocate.c

index f3f50ec68f8feb7b98e135af7038bcc33d52f9b8..7531be5cf89840db61a8ac861315ac79984321de 100644 (file)
@@ -96,13 +96,12 @@ swap_file(struct qpu_reg *src)
  * address.
  *
  * In that case, we need to move one to a temporary that can be used in the
- * instruction, instead.
+ * instruction, instead.  We reserve ra31/rb31 for this purpose.
  */
-static bool
+static void
 fixup_raddr_conflict(struct vc4_compile *c,
                      struct qpu_reg dst,
-                     struct qpu_reg *src0, struct qpu_reg *src1,
-                     bool r3_live)
+                     struct qpu_reg *src0, struct qpu_reg *src1)
 {
         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
@@ -111,31 +110,18 @@ fixup_raddr_conflict(struct vc4_compile *c,
             mux0 != mux1 ||
             (src0->addr == src1->addr &&
              src0->mux == src1->mux)) {
-                return false;
+                return;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return false;
+                return;
 
         if (mux0 == QPU_MUX_A) {
-                /* If we're conflicting over the A regfile, then we can just
-                 * use the reserved rb31.
-                 */
                 queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                 *src1 = qpu_rb(31);
-                return false;
         } else {
-                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
-                 * rb31, then store our desired value in r3, and tell the
-                 * caller to put rb31 back into r3 when we're done.
-                 */
-                if (r3_live)
-                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
-                queue(c, qpu_a_MOV(qpu_r3(), *src1));
-
-                *src1 = qpu_r3();
-
-                return r3_live && dst.mux != QPU_MUX_R3;
+                queue(c, qpu_a_MOV(qpu_ra(31), *src1));
+                *src1 = qpu_ra(31);
         }
 }
 
@@ -148,8 +134,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
         int last_vpm_read_index = -1;
-        bool written_r3 = false;
-        bool needs_restore;
         /* Map from the QIR ops enum order to QPU unpack bits. */
         static const uint32_t unpack_map[] = {
                 QPU_UNPACK_8A,
@@ -467,12 +451,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -554,9 +534,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -567,14 +545,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
 
                         break;
                 }
-
-                if (dst.mux == QPU_MUX_R3)
-                        written_r3 = true;
         }
 
         qpu_schedule_instructions(c);
index 9eae7fca758f5bc7cd001e10ee36c7111e90eb15..efd9d51eaa0e235167b1e6594fac857214590e9b 100644 (file)
@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
-                /* Reserve rb31 for spilling fixup_raddr_conflict() in
+                /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
                  * vc4_qpu_emit.c
                  */
-                if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
+                if (vc4_regs[i].addr == 31)
                         continue;
 
                 /* R4 can't be written as a general purpose register. (it's