vc4: Add shader-db dumping of NIR instruction count.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
index 35300ff42e8de9e5ffa94a08fdfcca52cc6f3de6..eeb8d3a21ff6633811f923fbd62abb6628ae79fe 100644 (file)
@@ -96,13 +96,12 @@ swap_file(struct qpu_reg *src)
  * address.
  *
  * In that case, we need to move one to a temporary that can be used in the
- * instruction, instead.
+ * instruction, instead.  We reserve ra31/rb31 for this purpose.
  */
-static bool
+static void
 fixup_raddr_conflict(struct vc4_compile *c,
                      struct qpu_reg dst,
-                     struct qpu_reg *src0, struct qpu_reg *src1,
-                     bool r3_live)
+                     struct qpu_reg *src0, struct qpu_reg *src1)
 {
         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
@@ -111,31 +110,18 @@ fixup_raddr_conflict(struct vc4_compile *c,
             mux0 != mux1 ||
             (src0->addr == src1->addr &&
              src0->mux == src1->mux)) {
-                return false;
+                return;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return false;
+                return;
 
         if (mux0 == QPU_MUX_A) {
-                /* If we're conflicting over the A regfile, then we can just
-                 * use the reserved rb31.
-                 */
                 queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                 *src1 = qpu_rb(31);
-                return false;
         } else {
-                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
-                 * rb31, then store our desired value in r3, and tell the
-                 * caller to put rb31 back into r3 when we're done.
-                 */
-                if (r3_live)
-                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
-                queue(c, qpu_a_MOV(qpu_r3(), *src1));
-
-                *src1 = qpu_r3();
-
-                return r3_live && dst.mux != QPU_MUX_R3;
+                queue(c, qpu_a_MOV(qpu_ra(31), *src1));
+                *src1 = qpu_ra(31);
         }
 }
 
@@ -147,8 +133,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t inputs_remaining = c->num_inputs;
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
-        bool written_r3 = false;
-        bool needs_restore;
+        int last_vpm_read_index = -1;
         /* Map from the QIR ops enum order to QPU unpack bits. */
         static const uint32_t unpack_map[] = {
                 QPU_UNPACK_8A,
@@ -249,6 +234,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                  */
                                 assert(src[i].addr <= 47);
                                 break;
+                        case QFILE_VPM:
+                                assert((int)qinst->src[i].index >=
+                                       last_vpm_read_index);
+                                last_vpm_read_index = qinst->src[i].index;
+                                src[i] = qpu_ra(QPU_R_VPM);
+                                break;
                         }
                 }
 
@@ -260,6 +251,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QFILE_TEMP:
                         dst = temp_registers[qinst->dst.index];
                         break;
+                case QFILE_VPM:
+                        dst = qpu_ra(QPU_W_VPM);
+                        break;
                 case QFILE_VARY:
                 case QFILE_UNIF:
                 case QFILE_SMALL_IMM:
@@ -276,11 +270,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         }
                         break;
 
-                case QOP_SF:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
-                        *last_inst(c) |= QPU_SF;
-                        break;
-
                 case QOP_SEL_X_0_ZS:
                 case QOP_SEL_X_0_ZC:
                 case QOP_SEL_X_0_NS:
@@ -308,14 +297,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_VPM_WRITE:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
-                        break;
-
-                case QOP_VPM_READ:
-                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
-                        break;
-
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -345,40 +326,40 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_PACK_COLORS: {
-                        /* We have to be careful not to start writing over one
-                         * of our source values when incrementally writing the
-                         * destination.  So, if the dst is one of the srcs, we
-                         * pack that one first (and we pack 4 channels at once
-                         * for the first pack).
-                         */
-                        struct qpu_reg first_pack = src[0];
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == dst.mux &&
-                                    src[i].addr == dst.addr) {
-                                        first_pack = dst;
-                                        break;
-                                }
-                        }
-                        queue(c, qpu_m_MOV(dst, first_pack));
+                case QOP_PACK_8888_F:
+                        queue(c, qpu_m_MOV(dst, src[0]));
                         *last_inst(c) |= QPU_PM;
                         *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                        QPU_PACK);
+                        break;
 
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == first_pack.mux &&
-                                    src[i].addr == first_pack.addr) {
-                                        continue;
+                case QOP_PACK_8A_F:
+                case QOP_PACK_8B_F:
+                case QOP_PACK_8C_F:
+                case QOP_PACK_8D_F:
+                        /* If dst doesn't happen to already contain src[0],
+                         * then we have to move it in.
+                         */
+                        if (qinst->src[0].file != QFILE_NULL &&
+                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
+                                /* Don't overwrite src1 while setting up
+                                 * the dst!
+                                 */
+                                if (dst.mux == src[1].mux &&
+                                    dst.addr == src[1].addr) {
+                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
+                                        src[1] = qpu_rb(31);
                                 }
 
-                                queue(c, qpu_m_MOV(dst, src[i]));
-                                *last_inst(c) |= QPU_PM;
-                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
-                                                               QPU_PACK);
+                                queue(c, qpu_m_MOV(dst, src[0]));
                         }
 
+                        queue(c, qpu_m_MOV(dst, src[1]));
+                        *last_inst(c) |= QPU_PM;
+                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
+                                                       qinst->op - QOP_PACK_8A_F,
+                                                       QPU_PACK);
                         break;
-                }
 
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,
@@ -465,12 +446,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -552,9 +529,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -565,14 +540,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
 
                         break;
                 }
 
-                if (dst.mux == QPU_MUX_R3)
-                        written_r3 = true;
+                if (qinst->sf) {
+                        assert(!qir_is_multi_instruction(qinst));
+                        *last_inst(c) |= QPU_SF;
+                }
         }
 
         qpu_schedule_instructions(c);