* address.
*
* In that case, we need to move one to a temporary that can be used in the
- * instruction, instead. We reserve ra31/rb31 for this purpose.
+ * instruction, instead. We reserve ra14/rb14 for this purpose.
*/
static void
fixup_raddr_conflict(struct qblock *block,
* in case of unpacks.
*/
if (qir_is_float_input(inst))
- queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
+ queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
else
- queue(block, qpu_a_MOV(qpu_rb(31), *src0));
+ queue(block, qpu_a_MOV(qpu_rb(14), *src0));
/* If we had an unpack on this A-file source, we need to put
* it into this MOV, not into the later move from regfile B.
*last_inst(block) |= *unpack;
*unpack = 0;
}
- *src0 = qpu_rb(31);
+ *src0 = qpu_rb(14);
} else {
- queue(block, qpu_a_MOV(qpu_ra(31), *src0));
- *src0 = qpu_ra(31);
+ queue(block, qpu_a_MOV(qpu_ra(14), *src0));
+ *src0 = qpu_ra(14);
}
}
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
- bool had_pm = *last_inst(block) & QPU_PM;
- bool had_ws = *last_inst(block) & QPU_WS;
- uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
+ ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
+ ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
+ ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
if (!inst->dst.pack)
return;
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
struct qpu_reg dst)
{
- if (dst.mux != QPU_MUX_R4)
+ if (dst.mux != QPU_MUX_R4) {
queue(block, qpu_a_MOV(dst, qpu_r4()));
- else if (qinst->sf)
- queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
+ set_last_cond_add(block, qinst->cond);
+ } else {
+ assert(qinst->cond == QPU_COND_ALWAYS);
+ if (qinst->sf)
+ queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
+ }
}
static void
[QOP_MOV] = { QPU_A_OR },
[QOP_FMOV] = { QPU_A_FMAX },
[QOP_MMOV] = { QPU_M_V8MIN },
+
+ [QOP_MIN_NOIMM] = { QPU_A_MIN },
};
uint64_t unpack = 0;
- struct qpu_reg src[4];
- for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+ struct qpu_reg src[ARRAY_SIZE(qinst->src)];
+ for (int i = 0; i < qir_get_nsrc(qinst); i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
case QFILE_NULL:
case QFILE_TLB_COLOR_WRITE_MS:
case QFILE_TLB_Z_WRITE:
case QFILE_TLB_STENCIL_SETUP:
+ case QFILE_TEX_S:
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
unreachable("bad qir src file");
}
}
dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
break;
+ case QFILE_TEX_S:
+ case QFILE_TEX_S_DIRECT:
+ dst = qpu_rb(QPU_W_TMU0_S);
+ break;
+
+ case QFILE_TEX_T:
+ dst = qpu_rb(QPU_W_TMU0_T);
+ break;
+
+ case QFILE_TEX_R:
+ dst = qpu_rb(QPU_W_TMU0_R);
+ break;
+
+ case QFILE_TEX_B:
+ dst = qpu_rb(QPU_W_TMU0_B);
+ break;
+
case QFILE_VARY:
case QFILE_UNIF:
case QFILE_SMALL_IMM:
break;
}
- bool handled_qinst_cond = false;
+ ASSERTED bool handled_qinst_cond = false;
switch (qinst->op) {
case QOP_RCP:
}
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
break;
queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
break;
+ case QOP_LOAD_IMM_U2:
+ queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
+ break;
+
+ case QOP_LOAD_IMM_I2:
+ queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
+ break;
+
+ case QOP_ROT_MUL:
+ /* Rotation at the hardware level occurs on the inputs
+ * to the MUL unit, and they must be accumulators in
+ * order to have the time necessary to move things.
+ */
+ assert(src[0].mux <= QPU_MUX_R3);
+
+ queue(block,
+ qpu_m_rot(dst, src[0], qinst->src[1].index -
+ QPU_SMALL_IMM_MUL_ROT) | unpack);
+ set_last_cond_mul(block, qinst->cond);
+ handled_qinst_cond = true;
+ set_last_dst_pack(block, qinst);
+ break;
+
case QOP_MS_MASK:
src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
fixup_raddr_conflict(block, dst, &src[0], &src[1],
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_COLOR_LOAD);
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
break;
case QOP_VARY_ADD_C:
queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
break;
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
- (qinst->op - QOP_TEX_S)),
- src[0]) | unpack);
- break;
-
- case QOP_TEX_DIRECT:
- fixup_raddr_conflict(block, dst, &src[0], &src[1],
- qinst, &unpack);
- queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
- src[0], src[1]) | unpack);
- break;
case QOP_TEX_RESULT:
queue(block, qpu_NOP());
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_LOAD_TMU0);
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
+ break;
+
+ case QOP_THRSW:
+ queue(block, qpu_NOP());
+ *last_inst(block) = qpu_set_sig(*last_inst(block),
+ QPU_SIG_THREAD_SWITCH);
+ c->last_thrsw = last_inst(block);
break;
case QOP_BRANCH:
* argument slot as well so that we don't take up
* another raddr just to get unused data.
*/
- if (qir_get_op_nsrc(qinst->op) == 1)
+ if (qir_get_non_sideband_nsrc(qinst) == 1)
src[1] = src[0];
fixup_raddr_conflict(block, dst, &src[0], &src[1],
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
- struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
struct qblock *start_block = list_first_entry(&c->blocks,
struct qblock, link);
+ struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
+ if (!temp_registers)
+ return;
+
switch (c->stage) {
case QSTAGE_VERT:
case QSTAGE_COORD:
qir_for_each_block(block, c)
vc4_generate_code_block(c, block, temp_registers);
+ /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+ *
+ * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+ * that ensures that a later thread doesn't try to lock the scoreboard
+ * and terminate before an earlier-spawned thread on the same QPU, by
+ * delaying switching back to the later shader until earlier has
+ * finished. Otherwise, if the earlier thread was hitting the same
+ * quad, the scoreboard would deadlock.
+ */
+ if (c->last_thrsw) {
+ assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+ QPU_SIG_THREAD_SWITCH);
+ *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+ QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+ QPU_SIG));
+ }
+
uint32_t cycles = qpu_schedule_instructions(c);
uint32_t inst_count_at_schedule_time = c->qpu_inst_count;