vc4: Refactor qir_get_op_nsrc(enum qop) to qir_get_nsrc(struct qinst *).
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
index cb936243c65251e35b7b1058f3bffefa4341c6c6..9d9e5d84ecd9178dbbe537a93508dabc5fddb83a 100644 (file)
@@ -288,8 +288,8 @@ vc4_generate_code_block(struct vc4_compile *c,
                 };
 
                 uint64_t unpack = 0;
-                struct qpu_reg src[4];
-                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
+                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                         int index = qinst->src[i].index;
                         switch (qinst->src[i].file) {
                         case QFILE_NULL:
@@ -504,6 +504,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                         queue(block, qpu_NOP());
                         *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                         QPU_SIG_THREAD_SWITCH);
+                        c->last_thrsw = last_inst(block);
                         break;
 
                 case QOP_BRANCH:
@@ -537,7 +538,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                          * argument slot as well so that we don't take up
                          * another raddr just to get unused data.
                          */
-                        if (qir_get_op_nsrc(qinst->op) == 1)
+                        if (qir_get_nsrc(qinst) == 1)
                                 src[1] = src[0];
 
                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
@@ -591,6 +592,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         qir_for_each_block(block, c)
                 vc4_generate_code_block(c, block, temp_registers);
 
+        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+         *
+         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+         * that ensures that a later thread doesn't try to lock the scoreboard
+         * and terminate before an earlier-spawned thread on the same QPU, by
+         * delaying switching back to the later shader until the earlier one
+         * has finished.  Otherwise, if the earlier thread was hitting the
+         * same quad, the scoreboard would deadlock.
+         */
+        if (c->last_thrsw) {
+                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+                       QPU_SIG_THREAD_SWITCH);
+                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+                                                QPU_SIG));
+        }
+
         uint32_t cycles = qpu_schedule_instructions(c);
         uint32_t inst_count_at_schedule_time = c->qpu_inst_count;