vc4: Add support for MUL output rotation.
author: Eric Anholt <eric@anholt.net>
Thu, 25 Aug 2016 19:31:49 +0000 (12:31 -0700)
committer: Eric Anholt <eric@anholt.net>
Fri, 26 Aug 2016 00:24:11 +0000 (17:24 -0700)
Extracted from a patch by jonasarrow on github.

src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qpu.c
src/gallium/drivers/vc4/vc4_qpu.h
src/gallium/drivers/vc4/vc4_qpu_emit.c

index f8f1365f6589b5d26de24956670e9c075574083a..d20ee5e227d5f4195db3c0eba9079918e1eb57c5 100644 (file)
@@ -90,6 +90,14 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs)
                                 continue;
                 }
 
+                /* Mul rotation's source needs to be in an r0-r3 accumulator,
+                 * so no uniforms or regfile-a/r4 unpacking allowed.
+                 */
+                if (inst->op == QOP_ROT_MUL &&
+                    (mov->src[0].file != QFILE_TEMP ||
+                     mov->src[0].pack))
+                        continue;
+
                 uint8_t unpack;
                 if (mov->src[0].pack) {
                         /* Make sure that the meaning of the unpack
index 9b4a28ebab6b8ed0f8eaf10a915b46de091f717f..446af66affde8781df8243c907fa0b5ca7d4da1b 100644 (file)
@@ -86,6 +86,8 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_LOAD_IMM_U2] = { "load_imm_u2", 0, 1 },
         [QOP_LOAD_IMM_I2] = { "load_imm_i2", 0, 1 },
 
+        [QOP_ROT_MUL] = { "rot_mul", 0, 2 },
+
         [QOP_BRANCH] = { "branch", 0, 0, true },
         [QOP_UNIFORMS_RESET] = { "uniforms_reset", 0, 2, true },
 };
@@ -164,6 +166,7 @@ qir_is_mul(struct qinst *inst)
         case QOP_V8MAX:
         case QOP_V8ADDS:
         case QOP_V8SUBS:
+        case QOP_ROT_MUL:
                 return true;
         default:
                 return false;
index 90cc138504352df4cccd27a86d45cb49fec0411b..a82c47c03412d56f79d0b602fc0ff673d42b74fe 100644 (file)
@@ -168,6 +168,8 @@ enum qop {
          */
         QOP_LOAD_IMM_I2,
 
+        QOP_ROT_MUL,
+
         /* Jumps to block->successor[0] if the qinst->cond (as a
          * QPU_COND_BRANCH_*) passes, or block->successor[1] if not.  Note
          * that block->successor[1] may be unset if the condition is ALWAYS.
@@ -822,6 +824,16 @@ qir_LOAD_IMM_I2(struct vc4_compile *c, uint32_t val)
                                         c->undef));
 }
 
+/** Emits a QOP_ROT_MUL shifting the multiply output right by rot channels. */
+static inline struct qreg
+qir_ROT_MUL(struct vc4_compile *c, struct qreg val, uint32_t rot)
+{
+        struct qreg rot_imm = qir_reg(QFILE_LOAD_IMM,
+                                      QPU_SMALL_IMM_MUL_ROT + rot);
+
+        return qir_emit_def(c, qir_inst(QOP_ROT_MUL, c->undef, val, rot_imm));
+}
+
 static inline void
 qir_MOV_cond(struct vc4_compile *c, uint8_t cond,
              struct qreg dest, struct qreg src)
index d022d107eb3fa585788f6c21048e07194b56c0fc..67850a8114a2976a890bf09cac42dd5ed60958b3 100644 (file)
@@ -234,6 +234,19 @@ qpu_m_alu2(enum qpu_op_mul op,
         return inst;
 }
 
+/** Builds a MUL-unit V8MIN of src0 with itself, flagged as a small-imm rot. */
+uint64_t
+qpu_m_rot(struct qpu_reg dst, struct qpu_reg src0, int rot)
+{
+        uint64_t rotated = qpu_m_alu2(QPU_M_V8MIN, dst, src0, src0);
+
+        rotated = QPU_UPDATE_FIELD(rotated, QPU_SIG_SMALL_IMM, QPU_SIG);
+        rotated = QPU_UPDATE_FIELD(rotated, QPU_SMALL_IMM_MUL_ROT + rot,
+                                   QPU_SMALL_IMM);
+
+        return rotated;
+}
+
 static bool
 merge_fields(uint64_t *merge,
              uint64_t a, uint64_t b,
index 437e4f5e5a4c54e669f9ceeefcc8225767815d5e..5ec80f0537522c87a9ecc86ab01d00f8b320c1ee 100644 (file)
@@ -150,6 +150,7 @@ uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST;
 uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
 uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
 uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST;
+uint64_t qpu_m_rot(struct qpu_reg dst, struct qpu_reg src, int rot) ATTRIBUTE_CONST;
 
 bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST;
 bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
index f5a5b8a862ac8fc5db2212e0ba2228af4cd3a0ab..79588b3f51c7383b650c52cdde9a8e66f62ac019 100644 (file)
@@ -434,6 +434,21 @@ vc4_generate_code_block(struct vc4_compile *c,
 
                 case QOP_LOAD_IMM_I2:
                         queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
+                        break;
+
+                case QOP_ROT_MUL:
+                        /* Rotation at the hardware level occurs on the inputs
+                         * to the MUL unit, and they must be accumulators in
+                         * order to have the time necessary to move things.
+                         */
+                        assert(src[0].mux <= QPU_MUX_R3);
+
+                        queue(block,
+                              qpu_m_rot(dst, src[0], qinst->src[1].index -
+                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
+                        set_last_cond_mul(block, qinst->cond);
+                        handled_qinst_cond = true;
+                        set_last_dst_pack(block, qinst);
                         break;
 
                 case QOP_MS_MASK: