vc4: Drop dependency on r3 for color packing.
author: Eric Anholt <eric@anholt.net>
Sun, 7 Sep 2014 21:38:24 +0000 (14:38 -0700)
committer: Eric Anholt <eric@anholt.net>
Tue, 9 Dec 2014 00:08:13 +0000 (16:08 -0800)
We can avoid it by carefully ordering the packing.  This is important as a
step in giving r3 to the register allocator.

total instructions in shared programs: 56087 -> 55957 (-0.23%)
instructions in affected programs:     18368 -> 18238 (-0.71%)

src/gallium/drivers/vc4/vc4_qpu_emit.c

index 3cb709f11fe91377d464eea1e0f50211e96ab396..856f84444d539c121c5ee6f62006a4fd0e8fb619 100644 (file)
@@ -296,17 +296,40 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
 
                         break;
 
-                case QOP_PACK_COLORS:
+                case QOP_PACK_COLORS: {
+                        /* We have to be careful not to start writing over one
+                         * of our source values when incrementally writing the
+                         * destination.  So, if the dst is one of the srcs, we
+                         * pack that one first (and we pack 4 channels at once
+                         * for the first pack).
+                         */
+                        struct qpu_reg first_pack = src[0];
+                        for (int i = 0; i < 4; i++) {
+                                if (src[i].mux == dst.mux &&
+                                    src[i].addr == dst.addr) {
+                                        first_pack = dst;
+                                        break;
+                                }
+                        }
+                        queue(c, qpu_m_MOV(dst, first_pack));
+                        *last_inst(c) |= QPU_PM;
+                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
+                                                       QPU_PACK);
+
                         for (int i = 0; i < 4; i++) {
                         for (int i = 0; i < 4; i++) {
-                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
+                                if (src[i].mux == first_pack.mux &&
+                                    src[i].addr == first_pack.addr) {
+                                        continue;
+                                }
+
+                                queue(c, qpu_m_MOV(dst, src[i]));
                                 *last_inst(c) |= QPU_PM;
                                 *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                                QPU_PACK);
                         }
 
                                 *last_inst(c) |= QPU_PM;
                                 *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                                QPU_PACK);
                         }
 
-                        queue(c, qpu_a_MOV(dst, qpu_r3()));
-
                         break;
                         break;
+                }
 
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,
 
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,