};
uint64_t unpack = 0;
- struct qpu_reg src[4];
- for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+ struct qpu_reg src[ARRAY_SIZE(qinst->src)];
+ for (int i = 0; i < qir_get_nsrc(qinst); i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
case QFILE_NULL:
queue(block, qpu_NOP());
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_THREAD_SWITCH);
+ c->last_thrsw = last_inst(block);
break;
case QOP_BRANCH:
* argument slot as well so that we don't take up
* another raddr just to get unused data.
*/
- if (qir_get_op_nsrc(qinst->op) == 1)
+ if (qir_get_nsrc(qinst) == 1)
src[1] = src[0];
fixup_raddr_conflict(block, dst, &src[0], &src[1],
qir_for_each_block(block, c)
vc4_generate_code_block(c, block, temp_registers);
+ /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+ *
+ * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+ * that ensures that a later thread doesn't try to lock the scoreboard
+ * and terminate before an earlier-spawned thread on the same QPU, by
+ * delaying switching back to the later shader until earlier has
+ * finished. Otherwise, if the earlier thread was hitting the same
+ * quad, the scoreboard would deadlock.
+ */
+ if (c->last_thrsw) {
+ assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+ QPU_SIG_THREAD_SWITCH);
+ *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+ QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+ QPU_SIG));
+ }
+
uint32_t cycles = qpu_schedule_instructions(c);
uint32_t inst_count_at_schedule_time = c->qpu_inst_count;