vc4: Expose compares at a lower level in QIR.
author: Eric Anholt <eric@anholt.net>
Sun, 24 Aug 2014 23:51:32 +0000 (16:51 -0700)
committer: Eric Anholt <eric@anholt.net>
Thu, 4 Sep 2014 18:39:51 +0000 (11:39 -0700)
Before, we had some special opcodes like CMP and SNE that emitted multiple
instructions.  Now, we reduce those operations significantly, giving
optimization more to look at for reducing redundant operations.

The downside is that QOP_SF is pretty special -- we're going to have to
track it separately when we're doing instruction scheduling, and in most
cases we want to peephole it into the instruction that generates the
destination write (and probably not allocate the destination reg at all,
unless it's also used for some other purpose).

src/gallium/drivers/vc4/vc4_opt_algebraic.c
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qpu_emit.c

index 2bf474ccef9cfe23cda8d2c0390cd48224f0fab2..f8ed6218adc31729a23bc6fec9988ea85ad392a9 100644 (file)
@@ -45,9 +45,12 @@ qir_opt_algebraic(struct qcompile *c)
                 struct qinst *inst = (struct qinst *)node;
 
                 switch (inst->op) {
-                case QOP_CMP:
-                        /* Turn "dst = (a < 0) ? b : b)" into "dst = b" */
-                        if (qir_reg_equals(inst->src[1], inst->src[2])) {
+                case QOP_SEL_X_Y_ZS:
+                case QOP_SEL_X_Y_ZC:
+                case QOP_SEL_X_Y_NS:
+                case QOP_SEL_X_Y_NC:
+                        /* Turn "dst = (sf == x) ? a : a" into "dst = a" */
+                        if (qir_reg_equals(inst->src[0], inst->src[1])) {
                                 if (debug) {
                                         fprintf(stderr, "optimizing: ");
                                         qir_dump_inst(inst);
index d404047e4bb730355d2ebd2e07e6ff0110e070b2..aaa7eb346f328a3a7e5f64d7cd32f91406b7ae2a 100644 (file)
@@ -248,6 +248,58 @@ tgsi_to_qir_alu(struct tgsi_to_qir *trans,
         return dst;
 }
 
+/**
+ * Emits QIR for TGSI SEQ on channel i.
+ *
+ * Sets the flags from (src0 - src1) via QOP_SF, then emits a select that
+ * yields 1.0 when the ZS condition holds and 0 otherwise -- intended to
+ * produce 1.0 when the two operands compare equal.
+ *
+ * src is indexed as operand * 4 + channel.  tgsi_inst and op are unused;
+ * they are present so the function fits the per-opcode callback table in
+ * emit_tgsi_instruction().
+ */
+static struct qreg
+tgsi_to_qir_seq(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
+        /* The SEL below depends on the flags set by this SF. */
+        qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
+        return qir_SEL_X_0_ZS(c, qir_uniform_f(trans, 1.0));
+}
+
+/**
+ * Emits QIR for TGSI SNE on channel i.
+ *
+ * Sets the flags from (src0 - src1) via QOP_SF, then emits a select that
+ * yields 1.0 when the ZC condition holds and 0 otherwise -- intended to
+ * produce 1.0 when the two operands differ.
+ *
+ * src is indexed as operand * 4 + channel.  tgsi_inst and op are unused;
+ * they are present so the function fits the per-opcode callback table in
+ * emit_tgsi_instruction().
+ */
+static struct qreg
+tgsi_to_qir_sne(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
+        /* The SEL below depends on the flags set by this SF. */
+        qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
+        return qir_SEL_X_0_ZC(c, qir_uniform_f(trans, 1.0));
+}
+
+/**
+ * Emits QIR for TGSI SLT on channel i.
+ *
+ * Sets the flags from (src0 - src1) via QOP_SF, then emits a select that
+ * yields 1.0 when the NS condition holds and 0 otherwise -- intended to
+ * produce 1.0 when src0 < src1 (the difference is negative).
+ *
+ * src is indexed as operand * 4 + channel.  tgsi_inst and op are unused;
+ * they are present so the function fits the per-opcode callback table in
+ * emit_tgsi_instruction().
+ */
+static struct qreg
+tgsi_to_qir_slt(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
+        /* The SEL below depends on the flags set by this SF. */
+        qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
+        return qir_SEL_X_0_NS(c, qir_uniform_f(trans, 1.0));
+}
+
+/**
+ * Emits QIR for TGSI SGE on channel i.
+ *
+ * Sets the flags from (src0 - src1) via QOP_SF, then emits a select that
+ * yields 1.0 when the NC condition holds and 0 otherwise -- intended to
+ * produce 1.0 when src0 >= src1 (the difference is not negative).
+ *
+ * src is indexed as operand * 4 + channel.  tgsi_inst and op are unused;
+ * they are present so the function fits the per-opcode callback table in
+ * emit_tgsi_instruction().
+ */
+static struct qreg
+tgsi_to_qir_sge(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
+        /* The SEL below depends on the flags set by this SF. */
+        qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
+        return qir_SEL_X_0_NC(c, qir_uniform_f(trans, 1.0));
+}
+
+/**
+ * Emits QIR for TGSI CMP on channel i.
+ *
+ * Sets the flags directly from src0 via QOP_SF (no subtract), then emits a
+ * select that yields src1 when the NS condition holds and src2 otherwise
+ * -- presumably "src0 < 0 ? src1 : src2".
+ *
+ * src is indexed as operand * 4 + channel.  tgsi_inst and op are unused;
+ * they are present so the function fits the per-opcode callback table in
+ * emit_tgsi_instruction().
+ */
+static struct qreg
+tgsi_to_qir_cmp(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
+        /* The SEL below depends on the flags set by this SF. */
+        qir_SF(c, src[0 * 4 + i]);
+        return qir_SEL_X_Y_NS(c,
+                              src[1 * 4 + i],
+                              src[2 * 4 + i]);
+}
+
 static struct qreg
 tgsi_to_qir_mad(struct tgsi_to_qir *trans,
                 struct tgsi_full_instruction *tgsi_inst,
@@ -280,16 +332,15 @@ tgsi_to_qir_lit(struct tgsi_to_qir *trans,
         case 2: {
                 struct qreg zero = qir_uniform_f(trans, 0.0);
 
+                qir_SF(c, x);
                 /* XXX: Clamp w to -128..128 */
-                return qir_CMP(c,
-                               x,
-                               zero,
-                               qir_EXP2(c, qir_FMUL(c,
-                                                    w,
-                                                    qir_LOG2(c,
-                                                             qir_FMAX(c,
-                                                                      y,
-                                                                      zero)))));
+                return qir_SEL_X_0_NC(c,
+                                      qir_EXP2(c, qir_FMUL(c,
+                                                           w,
+                                                           qir_LOG2(c,
+                                                                    qir_FMAX(c,
+                                                                             y,
+                                                                             zero)))));
         }
         default:
                 assert(!"not reached");
@@ -415,10 +466,10 @@ tgsi_to_qir_frc(struct tgsi_to_qir *trans,
         struct qcompile *c = trans->c;
         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src[0 * 4 + i]));
         struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
-        return qir_CMP(c,
-                       diff,
-                       qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
-                       diff);
+        qir_SF(c, diff);
+        return qir_SEL_X_Y_NS(c,
+                              qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
+                              diff);
 }
 
 /**
@@ -436,12 +487,11 @@ tgsi_to_qir_flr(struct tgsi_to_qir *trans,
         /* This will be < 0 if we truncated and the truncation was of a value
          * that was < 0 in the first place.
          */
-        struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
+        qir_SF(c, qir_FSUB(c, src[0 * 4 + i], trunc));
 
-        return qir_CMP(c,
-                       diff,
-                       qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
-                       trunc);
+        return qir_SEL_X_Y_NS(c,
+                              qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
+                              trunc);
 }
 
 static struct qreg
@@ -613,10 +663,10 @@ tgsi_to_qir_kill_if(struct tgsi_to_qir *trans, struct qreg *src, int i)
 
         if (trans->discard.file == QFILE_NULL)
                 trans->discard = qir_uniform_f(trans, 0.0);
-        trans->discard = qir_CMP(c,
-                                 src[0 * 4 + i],
-                                 qir_uniform_f(trans, 1.0),
-                                 trans->discard);
+        qir_SF(c, src[0 * 4 + i]);
+        trans->discard = qir_SEL_X_Y_NS(c,
+                                        qir_uniform_f(trans, 1.0),
+                                        trans->discard);
 }
 
 static void
@@ -705,11 +755,11 @@ emit_tgsi_instruction(struct tgsi_to_qir *trans,
                 [TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu },
                 [TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu },
                 [TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu },
-                [TGSI_OPCODE_SEQ] = { QOP_SEQ, tgsi_to_qir_alu },
-                [TGSI_OPCODE_SNE] = { QOP_SNE, tgsi_to_qir_alu },
-                [TGSI_OPCODE_SGE] = { QOP_SGE, tgsi_to_qir_alu },
-                [TGSI_OPCODE_SLT] = { QOP_SLT, tgsi_to_qir_alu },
-                [TGSI_OPCODE_CMP] = { QOP_CMP, tgsi_to_qir_alu },
+                [TGSI_OPCODE_SEQ] = { 0, tgsi_to_qir_seq },
+                [TGSI_OPCODE_SNE] = { 0, tgsi_to_qir_sne },
+                [TGSI_OPCODE_SGE] = { 0, tgsi_to_qir_sge },
+                [TGSI_OPCODE_SLT] = { 0, tgsi_to_qir_slt },
+                [TGSI_OPCODE_CMP] = { 0, tgsi_to_qir_cmp },
                 [TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad },
                 [TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 },
                 [TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 },
index 0b0d2c11cf1dd2774fab1d3e86e1dee772da4ac5..72149908422cdab9c5a84a488ae5e43d470641f7 100644 (file)
@@ -43,11 +43,15 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_FMINABS] = { "fminabs", 1, 2 },
         [QOP_FMAXABS] = { "fmaxabs", 1, 2 },
 
-        [QOP_SEQ] = { "seq", 1, 2 },
-        [QOP_SNE] = { "sne", 1, 2 },
-        [QOP_SGE] = { "sge", 1, 2 },
-        [QOP_SLT] = { "slt", 1, 2 },
-        [QOP_CMP] = { "cmp", 1, 3 },
+        [QOP_SF] = { "sf", 0, 1 },
+        [QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1 },
+        [QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1 },
+        [QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1 },
+        [QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1 },
+        [QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2 },
+        [QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2 },
+        [QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2 },
+        [QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2 },
 
         [QOP_FTOI] = { "ftoi", 1, 1 },
         [QOP_ITOF] = { "itof", 1, 1 },
index 77b5f1af9035bd04e5f61d5056a66714ca13cf27..99df99c1a073104d4ddfb5b0ae23ecd517e44cfe 100644 (file)
@@ -54,11 +54,21 @@ enum qop {
         QOP_FMINABS,
         QOP_FMAXABS,
 
-        QOP_SEQ,
-        QOP_SNE,
-        QOP_SGE,
-        QOP_SLT,
-        QOP_CMP,
+        /* Sets the flag register according to src. */
+        QOP_SF,
+
+        /* Note: Orderings of these compares must be the same as in
+         * qpu_defines.h.
+         *
+         * Selects src[0] if the flag condition named by the suffix
+         * (ZS, ZC, NS or NC) holds, otherwise 0. */
+        QOP_SEL_X_0_ZS,
+        QOP_SEL_X_0_ZC,
+        QOP_SEL_X_0_NS,
+        QOP_SEL_X_0_NC,
+        /* Selects src[0] if the flag condition named by the suffix
+         * (ZS, ZC, NS or NC) holds, otherwise src[1]. */
+        QOP_SEL_X_Y_ZS,
+        QOP_SEL_X_Y_ZC,
+        QOP_SEL_X_Y_NS,
+        QOP_SEL_X_Y_NC,
 
         QOP_FTOI,
         QOP_ITOF,
@@ -260,6 +270,15 @@ QIR_ALU1(MOV)
 QIR_ALU2(FADD)
 QIR_ALU2(FSUB)
 QIR_ALU2(FMUL)
+QIR_NODST_1(SF)
+QIR_ALU1(SEL_X_0_ZS)
+QIR_ALU1(SEL_X_0_ZC)
+QIR_ALU1(SEL_X_0_NS)
+QIR_ALU1(SEL_X_0_NC)
+QIR_ALU2(SEL_X_Y_ZS)
+QIR_ALU2(SEL_X_Y_ZC)
+QIR_ALU2(SEL_X_Y_NS)
+QIR_ALU2(SEL_X_Y_NC)
 QIR_ALU2(FMIN)
 QIR_ALU2(FMAX)
 QIR_ALU2(FMINABS)
@@ -283,14 +302,6 @@ QIR_ALU0(FRAG_Z)
 QIR_ALU0(FRAG_RCP_W)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 
-static inline struct qreg
-qir_CMP(struct qcompile *c, struct qreg cmp, struct qreg a, struct qreg b)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst4(QOP_CMP, t, cmp, a, b, c->undef));
-        return t;
-}
-
 static inline struct qreg
 qir_R4_UNPACK(struct qcompile *c, int i)
 {
index 4e28ff7c3b809e63ea45ef329b911c8df2790b06..6d2c34f2d1fc5e7c00c0fc5ec90b8b5c60ce9b6d 100644 (file)
@@ -60,6 +60,12 @@ last_inst(struct qcompile *c)
         return &q->inst;
 }
 
+/**
+ * Replaces the instruction returned by last_inst(c) with the result of
+ * applying qpu_set_cond_add() to it with the given condition code.
+ *
+ * Callers in this file invoke it immediately after queue(), so it acts on
+ * the instruction that was just queued.
+ */
+static void
+set_last_cond_add(struct qcompile *c, uint32_t cond)
+{
+        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
+}
+
 /**
  * This is used to resolve the fact that we might register-allocate two
  * different operands of an instruction to the same physical register file
@@ -278,13 +284,6 @@ vc4_generate_code(struct qcompile *c)
                         M(FMUL),
                 };
 
-                static const uint32_t compareflags[] = {
-                        [QOP_SEQ - QOP_SEQ] = QPU_COND_ZS,
-                        [QOP_SNE - QOP_SEQ] = QPU_COND_ZC,
-                        [QOP_SLT - QOP_SEQ] = QPU_COND_NS,
-                        [QOP_SGE - QOP_SEQ] = QPU_COND_NC,
-                };
-
                 struct qpu_reg src[4];
                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                         int index = qinst->src[i].index;
@@ -365,32 +364,36 @@ vc4_generate_code(struct qcompile *c)
                         }
                         break;
 
-                case QOP_CMP:
+                case QOP_SF:
+                        fixup_raddr_conflict(c, src[0], &src[1]);
                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                         *last_inst(c) |= QPU_SF;
-
-                        queue(c, qpu_a_MOV(dst, src[1]));
-                        *last_inst(c) = qpu_set_cond_add(*last_inst(c),
-                                                         QPU_COND_NS);
-
-                        queue(c, qpu_a_MOV(dst, src[2]));
-                        *last_inst(c) = qpu_set_cond_add(*last_inst(c),
-                                                         QPU_COND_NC);
                         break;
 
-                case QOP_SEQ:
-                case QOP_SNE:
-                case QOP_SGE:
-                case QOP_SLT:
-                        fixup_raddr_conflict(c, src[0], &src[1]);
-                        queue(c, qpu_a_FSUB(qpu_ra(QPU_W_NOP), src[0], src[1]));
-                        *last_inst(c) |= QPU_SF;
+                case QOP_SEL_X_0_ZS:
+                case QOP_SEL_X_0_ZC:
+                case QOP_SEL_X_0_NS:
+                case QOP_SEL_X_0_NC:
+                        queue(c, qpu_a_MOV(dst, src[0]));
+                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
+                                          QPU_COND_ZS);
+
+                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
+                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
+                                              1) + QPU_COND_ZS);
+                        break;
 
-                        queue(c, qpu_load_imm_f(dst, 0.0));
-                        queue(c, qpu_load_imm_f(dst, 1.0));
-                        *last_inst(c) = qpu_set_cond_add(*last_inst(c),
-                                                         compareflags[qinst->op - QOP_SEQ]);
+                case QOP_SEL_X_Y_ZS:
+                case QOP_SEL_X_Y_ZC:
+                case QOP_SEL_X_Y_NS:
+                case QOP_SEL_X_Y_NC:
+                        queue(c, qpu_a_MOV(dst, src[0]));
+                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
+                                          QPU_COND_ZS);
 
+                        queue(c, qpu_a_MOV(dst, src[1]));
+                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
+                                              1) + QPU_COND_ZS);
 
                         break;
 
@@ -475,8 +478,7 @@ vc4_generate_code(struct qcompile *c)
                         queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                            qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
                         if (discard) {
-                                *last_inst(c) = qpu_set_cond_add(*last_inst(c),
-                                                                 QPU_COND_ZS);
+                                set_last_cond_add(c, QPU_COND_ZS);
                         }
                         break;
 
@@ -490,8 +492,7 @@ vc4_generate_code(struct qcompile *c)
                 case QOP_TLB_COLOR_WRITE:
                         queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                         if (discard) {
-                                *last_inst(c) = qpu_set_cond_add(*last_inst(c),
-                                                                 QPU_COND_ZS);
+                                set_last_cond_add(c, QPU_COND_ZS);
                         }
                         break;