if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
struct ir3_register *dst = assigner->regs[0];
- struct ir3_register *src;
debug_assert(dst->flags & IR3_REG_ARRAY);
*/
int
ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n)
+ struct ir3_instruction *consumer, unsigned n, bool soft)
{
if (ignore_dep(assigner, consumer, n))
return 0;
if (is_meta(assigner) || is_meta(consumer))
return 0;
- if (writes_addr(assigner))
+ if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
+ /* On a6xx, the number of delay slots it takes to get an SFU result
+ * back (ie. using nop's instead of (ss)) is:
+ *
+ * 8 - single warp
+ * 9 - two warps
+ * 10 - four warps
+ *
+ * and so on. Not quite sure where it tapers out (ie. how many
+ * warps share an SFU unit). But 10 seems like a reasonable #
+ * to choose:
+ */
+ if (soft && is_sfu(assigner))
+ return 10;
+
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
return 0;
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
- return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
+ return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}
/**
unsigned delay = 0;
if (is_meta(assigner)) {
- struct ir3_register *src;
- foreach_src (src, assigner) {
+ foreach_src_n (src, n, assigner) {
unsigned d;
if (!src->instr)
continue;
d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
+
+ /* A (rptN) instruction executes in consecutive cycles so
+ * its outputs are written in successive cycles. And
+ * likewise for its (r)'d (incremented) inputs, they are
+ * read on successive cycles.
+ *
+ * So we need to adjust the delay for (rptN)'s assigners
+ * and consumers accordingly.
+ *
+ * Note that the dst of a (rptN) instruction is implicitly
+ * (r) (the assigner case), although that is not the case
+ * for src registers. There is exactly one case, bary.f,
+ * which has a vecN (collect) src that is not (r)'d.
+ */
+ if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
+ /* (rptN) assigner case: */
+ d -= MIN2(d, src->instr->repeat - assigner->split.off);
+ } else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
+ (consumer->regs[srcn]->flags & IR3_REG_R)) {
+ d -= MIN2(d, n);
+ }
+
delay = MAX2(delay, d);
}
} else {
- if (soft) {
- if (is_sfu(assigner)) {
- delay = 4;
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
+ delay = ir3_delayslots(assigner, consumer, srcn, soft);
delay -= distance(block, assigner, delay, pred);
}
bool soft, bool pred)
{
unsigned delay = 0;
- struct ir3_register *src;
foreach_src_n (src, i, instr) {
unsigned d = 0;