vc4: Restructure color packing as a series of channel replacements.
author: Eric Anholt <eric@anholt.net>
Fri, 9 Jan 2015 02:32:29 +0000 (18:32 -0800)
committer: Eric Anholt <eric@anholt.net>
Sat, 10 Jan 2015 00:54:12 +0000 (13:54 +1300)
I'm using this in some WIP commits for doing blending in 8888 instead of
vec4.  But it also gives us these results immediately, thanks to allowing
more uniforms/immediates in the arguments:

total instructions in shared programs: 41027 -> 40960 (-0.16%)
instructions in affected programs:     4381 -> 4314 (-1.53%)

src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qpu_emit.c

index bba02ca93f2013629e95a2fc75f30052448a01b1..6bad1560b2f197a29b6a1238841b10dc172181f7 100644 (file)
@@ -1843,32 +1843,22 @@ emit_frag_end(struct vc4_compile *c)
                 qir_TLB_Z_WRITE(c, z);
         }
 
-        bool color_written = false;
+        struct qreg packed_color = c->undef;
         for (int i = 0; i < 4; i++) {
-                if (swizzled_outputs[i].file != QFILE_NULL)
-                        color_written = true;
-        }
-
-        struct qreg packed_color;
-        if (color_written) {
-                /* Fill in any undefined colors.  The simulator will assertion
-                 * fail if we read something that wasn't written, and I don't
-                 * know what hardware does.
-                 */
-                for (int i = 0; i < 4; i++) {
-                        if (swizzled_outputs[i].file == QFILE_NULL)
-                                swizzled_outputs[i] = qir_uniform_f(c, 0.0);
+                if (swizzled_outputs[i].file == QFILE_NULL)
+                        continue;
+                if (packed_color.file == QFILE_NULL) {
+                        packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]);
+                } else {
+                        packed_color = qir_PACK_8_F(c,
+                                                    packed_color,
+                                                    swizzled_outputs[i],
+                                                    i);
                 }
-                packed_color = qir_get_temp(c);
-                qir_emit(c, qir_inst4(QOP_PACK_COLORS, packed_color,
-                                      swizzled_outputs[0],
-                                      swizzled_outputs[1],
-                                      swizzled_outputs[2],
-                                      swizzled_outputs[3]));
-        } else {
-                packed_color = qir_uniform_ui(c, 0);
         }
 
+        if (packed_color.file == QFILE_NULL)
+                packed_color = qir_uniform_ui(c, 0);
 
         if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
                 packed_color = vc4_logicop(c, packed_color, packed_dst_color);
index 3fd394132224a0e60e9b5b4cb867fdd3c7cfeceb..5f3b8ddc4456279cc59f9d283919c5b8b9de091f 100644 (file)
@@ -73,7 +73,11 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
         [QOP_LOG2] = { "log2", 1, 2, false, true },
-        [QOP_PACK_COLORS] = { "pack_colors", 1, 4, false, true },
+        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true },
+        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true },
+        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true },
+        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true },
+        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true },
         [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true },
         [QOP_VPM_READ] = { "vpm_read", 0, 1, true },
         [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true },
index f7d59a80dac2c5bc59a1d6c6cb925056dca694c9..6dac00fbbd84ded0d8982e4ba73a8f4b3a76a518 100644 (file)
@@ -100,7 +100,11 @@ enum qop {
         QOP_VW_SETUP,
         QOP_VR_SETUP,
         QOP_PACK_SCALED,
-        QOP_PACK_COLORS,
+        QOP_PACK_8888_F,
+        QOP_PACK_8A_F,
+        QOP_PACK_8B_F,
+        QOP_PACK_8C_F,
+        QOP_PACK_8D_F,
         QOP_VPM_READ,
         QOP_TLB_DISCARD_SETUP,
         QOP_TLB_STENCIL_SETUP,
@@ -473,6 +477,11 @@ QIR_ALU1(RSQ)
 QIR_ALU1(EXP2)
 QIR_ALU1(LOG2)
 QIR_ALU2(PACK_SCALED)
+QIR_ALU1(PACK_8888_F)
+QIR_ALU2(PACK_8A_F)
+QIR_ALU2(PACK_8B_F)
+QIR_ALU2(PACK_8C_F)
+QIR_ALU2(PACK_8D_F)
 QIR_ALU1(VARY_ADD_C)
 QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
@@ -538,6 +547,14 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
         return t;
 }
 
+static inline struct qreg
+qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan)
+{ /* Emit the QOP_PACK_8{A,B,C,D}_F op selected by chan with operands (rest, val) and return the temp. */
+        struct qreg t = qir_get_temp(c); /* new temp, handed to qir_inst() right after the opcode and returned to the caller */
+        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val)); /* chan is added to QOP_PACK_8A_F; 8A..8D are declared consecutively in enum qop, so 0..3 picks A..D */
+        return t;
+}
+
 static inline struct qreg
 qir_POW(struct vc4_compile *c, struct qreg x, struct qreg y)
 {
index 503f32a4c05a897cad8312eda78e5036456d5210..857d56e0f442d3710355cf8e567fe48dd065ad41 100644 (file)
@@ -347,40 +347,40 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_PACK_COLORS: {
-                        /* We have to be careful not to start writing over one
-                         * of our source values when incrementally writing the
-                         * destination.  So, if the dst is one of the srcs, we
-                         * pack that one first (and we pack 4 channels at once
-                         * for the first pack).
-                         */
-                        struct qpu_reg first_pack = src[0];
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == dst.mux &&
-                                    src[i].addr == dst.addr) {
-                                        first_pack = dst;
-                                        break;
-                                }
-                        }
-                        queue(c, qpu_m_MOV(dst, first_pack));
+                case QOP_PACK_8888_F:
+                        queue(c, qpu_m_MOV(dst, src[0]));
                         *last_inst(c) |= QPU_PM;
                         *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                        QPU_PACK);
+                        break;
 
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == first_pack.mux &&
-                                    src[i].addr == first_pack.addr) {
-                                        continue;
+                case QOP_PACK_8A_F:
+                case QOP_PACK_8B_F:
+                case QOP_PACK_8C_F:
+                case QOP_PACK_8D_F:
+                        /* If dst doesn't happen to already contain src[0],
+                         * then we have to move it in.
+                         */
+                        if (qinst->src[0].file != QFILE_NULL &&
+                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
+                                /* Don't overwrite src1 while setting up
+                                 * the dst!
+                                 */
+                                if (dst.mux == src[1].mux &&
+                                    dst.addr == src[1].addr) {
+                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
+                                        src[1] = qpu_rb(31);
                                 }
 
-                                queue(c, qpu_m_MOV(dst, src[i]));
-                                *last_inst(c) |= QPU_PM;
-                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
-                                                               QPU_PACK);
+                                queue(c, qpu_m_MOV(dst, src[0]));
                         }
 
+                        queue(c, qpu_m_MOV(dst, src[1]));
+                        *last_inst(c) |= QPU_PM;
+                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
+                                                       qinst->op - QOP_PACK_8A_F,
+                                                       QPU_PACK);
                         break;
-                }
 
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,