vc4: Add shader-db dumping of NIR instruction count.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
index f2620c0a75f8e6c839213b05f900fea911879955..eeb8d3a21ff6633811f923fbd62abb6628ae79fe 100644 (file)
@@ -26,6 +26,7 @@
 #include "vc4_context.h"
 #include "vc4_qir.h"
 #include "vc4_qpu.h"
+#include "util/ralloc.h"
 
 static void
 vc4_dump_program(struct vc4_compile *c)
@@ -44,7 +45,7 @@ vc4_dump_program(struct vc4_compile *c)
 static void
 queue(struct vc4_compile *c, uint64_t inst)
 {
-        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
+        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
         q->inst = inst;
         insert_at_tail(&c->qpu_inst_list, &q->link);
 }
@@ -73,11 +74,15 @@ swap_file(struct qpu_reg *src)
         switch (src->addr) {
         case QPU_R_UNIF:
         case QPU_R_VARY:
-                if (src->mux == QPU_MUX_A)
-                        src->mux = QPU_MUX_B;
-                else
-                        src->mux = QPU_MUX_A;
-                return true;
+                if (src->mux == QPU_MUX_SMALL_IMM) {
+                        return false;
+                } else {
+                        if (src->mux == QPU_MUX_A)
+                                src->mux = QPU_MUX_B;
+                        else
+                                src->mux = QPU_MUX_A;
+                        return true;
+                }
 
         default:
                 return false;
@@ -91,42 +96,32 @@ swap_file(struct qpu_reg *src)
  * address.
  *
  * In that case, we need to move one to a temporary that can be used in the
- * instruction, instead.
+ * instruction, instead.  We reserve ra31/rb31 for this purpose.
  */
-static bool
+static void
 fixup_raddr_conflict(struct vc4_compile *c,
                      struct qpu_reg dst,
-                     struct qpu_reg *src0, struct qpu_reg *src1,
-                     bool r3_live)
+                     struct qpu_reg *src0, struct qpu_reg *src1)
 {
-        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
-            src0->mux != src1->mux ||
-            src0->addr == src1->addr) {
-                return false;
+        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
+        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
+
+        if (mux0 <= QPU_MUX_R5 ||
+            mux0 != mux1 ||
+            (src0->addr == src1->addr &&
+             src0->mux == src1->mux)) {
+                return;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return false;
+                return;
 
-        if (src0->mux == QPU_MUX_A) {
-                /* If we're conflicting over the A regfile, then we can just
-                 * use the reserved rb31.
-                 */
+        if (mux0 == QPU_MUX_A) {
                 queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                 *src1 = qpu_rb(31);
-                return false;
         } else {
-                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
-                 * rb31, then store our desired value in r3, and tell the
-                 * caller to put rb31 back into r3 when we're done.
-                 */
-                if (r3_live)
-                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
-                queue(c, qpu_a_MOV(qpu_r3(), *src1));
-
-                *src1 = qpu_r3();
-
-                return r3_live && dst.mux != QPU_MUX_R3;
+                queue(c, qpu_a_MOV(qpu_ra(31), *src1));
+                *src1 = qpu_ra(31);
         }
 }
 
@@ -138,8 +133,16 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t inputs_remaining = c->num_inputs;
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
-        bool written_r3 = false;
-        bool needs_restore;
+        int last_vpm_read_index = -1;
+        /* Map from the QIR ops enum order to QPU unpack bits. */
+        static const uint32_t unpack_map[] = {
+                QPU_UNPACK_8A,
+                QPU_UNPACK_8B,
+                QPU_UNPACK_8C,
+                QPU_UNPACK_8D,
+                QPU_UNPACK_16A_TO_F32,
+                QPU_UNPACK_16B_TO_F32,
+        };
 
         make_empty_list(&c->qpu_inst_list);
 
@@ -223,6 +226,20 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         case QFILE_VARY:
                                 src[i] = qpu_vary();
                                 break;
+                        case QFILE_SMALL_IMM:
+                                src[i].mux = QPU_MUX_SMALL_IMM;
+                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
+                                /* This should only have returned a valid
+                                 * small immediate field, not ~0 for failure.
+                                 */
+                                assert(src[i].addr <= 47);
+                                break;
+                        case QFILE_VPM:
+                                assert((int)qinst->src[i].index >=
+                                       last_vpm_read_index);
+                                last_vpm_read_index = qinst->src[i].index;
+                                src[i] = qpu_ra(QPU_R_VPM);
+                                break;
                         }
                 }
 
@@ -234,8 +251,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QFILE_TEMP:
                         dst = temp_registers[qinst->dst.index];
                         break;
+                case QFILE_VPM:
+                        dst = qpu_ra(QPU_W_VPM);
+                        break;
                 case QFILE_VARY:
                 case QFILE_UNIF:
+                case QFILE_SMALL_IMM:
                         assert(!"not reached");
                         break;
                 }
@@ -249,11 +270,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         }
                         break;
 
-                case QOP_SF:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
-                        *last_inst(c) |= QPU_SF;
-                        break;
-
                 case QOP_SEL_X_0_ZS:
                 case QOP_SEL_X_0_ZC:
                 case QOP_SEL_X_0_NS:
@@ -281,14 +297,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_VPM_WRITE:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
-                        break;
-
-                case QOP_VPM_READ:
-                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
-                        break;
-
                 case QOP_RCP:
                 case QOP_RSQ:
                 case QOP_EXP2:
@@ -318,40 +326,40 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_PACK_COLORS: {
-                        /* We have to be careful not to start writing over one
-                         * of our source values when incrementally writing the
-                         * destination.  So, if the dst is one of the srcs, we
-                         * pack that one first (and we pack 4 channels at once
-                         * for the first pack).
-                         */
-                        struct qpu_reg first_pack = src[0];
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == dst.mux &&
-                                    src[i].addr == dst.addr) {
-                                        first_pack = dst;
-                                        break;
-                                }
-                        }
-                        queue(c, qpu_m_MOV(dst, first_pack));
+                case QOP_PACK_8888_F:
+                        queue(c, qpu_m_MOV(dst, src[0]));
                         *last_inst(c) |= QPU_PM;
                         *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                        QPU_PACK);
+                        break;
 
-                        for (int i = 0; i < 4; i++) {
-                                if (src[i].mux == first_pack.mux &&
-                                    src[i].addr == first_pack.addr) {
-                                        continue;
+                case QOP_PACK_8A_F:
+                case QOP_PACK_8B_F:
+                case QOP_PACK_8C_F:
+                case QOP_PACK_8D_F:
+                        /* If dst doesn't happen to already contain src[0],
+                         * then we have to move it in.
+                         */
+                        if (qinst->src[0].file != QFILE_NULL &&
+                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
+                                /* Don't overwrite src1 while setting up
+                                 * the dst!
+                                 */
+                                if (dst.mux == src[1].mux &&
+                                    dst.addr == src[1].addr) {
+                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
+                                        src[1] = qpu_rb(31);
                                 }
 
-                                queue(c, qpu_m_MOV(dst, src[i]));
-                                *last_inst(c) |= QPU_PM;
-                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
-                                                               QPU_PACK);
+                                queue(c, qpu_m_MOV(dst, src[0]));
                         }
 
+                        queue(c, qpu_m_MOV(dst, src[1]));
+                        *last_inst(c) |= QPU_PM;
+                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
+                                                       qinst->op - QOP_PACK_8A_F,
+                                                       QPU_PACK);
                         break;
-                }
 
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,
@@ -438,12 +446,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -467,27 +471,49 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_UNPACK_8A:
-                case QOP_UNPACK_8B:
-                case QOP_UNPACK_8C:
-                case QOP_UNPACK_8D: {
+                case QOP_UNPACK_8A_F:
+                case QOP_UNPACK_8B_F:
+                case QOP_UNPACK_8C_F:
+                case QOP_UNPACK_8D_F:
+                case QOP_UNPACK_16A_F:
+                case QOP_UNPACK_16B_F: {
                         assert(src[0].mux == QPU_MUX_A);
 
-                        /* And, since we're setting the pack bits, if the
+                        /* Since we're setting the pack bits, if the
                          * destination is in A it would get re-packed.
                          */
-                        struct qpu_reg orig_dst = dst;
-                        if (orig_dst.mux == QPU_MUX_A)
-                                dst = qpu_rn(3);
+                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
+                                             qpu_rb(31) : dst),
+                                            src[0], src[0]));
+                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
+                                                                  QOP_UNPACK_8A_F],
+                                                       QPU_UNPACK);
 
-                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                       (qinst->op -
-                                                        QOP_UNPACK_8A),
+                        if (dst.mux == QPU_MUX_A) {
+                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                        }
+                }
+                        break;
+
+                case QOP_UNPACK_8A_I:
+                case QOP_UNPACK_8B_I:
+                case QOP_UNPACK_8C_I:
+                case QOP_UNPACK_8D_I:
+                case QOP_UNPACK_16A_I:
+                case QOP_UNPACK_16B_I: {
+                        assert(src[0].mux == QPU_MUX_A);
+
+                        /* Since we're setting the pack bits, if the
+                         * destination is in A it would get re-packed.
+                         */
+                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
+                                            qpu_rb(31) : dst), src[0]));
+                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
+                                                                  QOP_UNPACK_8A_I],
                                                        QPU_UNPACK);
 
-                        if (orig_dst.mux == QPU_MUX_A) {
-                                queue(c, qpu_a_MOV(orig_dst, dst));
+                        if (dst.mux == QPU_MUX_A) {
+                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                         }
                 }
                         break;
@@ -503,9 +529,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        needs_restore = fixup_raddr_conflict(c, dst,
-                                                             &src[0], &src[1],
-                                                             written_r3);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -516,14 +540,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
-                        if (needs_restore)
-                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
 
                         break;
                 }
 
-                if (dst.mux == QPU_MUX_R3)
-                        written_r3 = true;
+                if (qinst->sf) {
+                        assert(!qir_is_multi_instruction(qinst));
+                        *last_inst(c) |= QPU_SF;
+                }
         }
 
         qpu_schedule_instructions(c);