/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n);
+ struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
bool soft, bool pred);
void ir3_remove_nops(struct ir3 *ir);
*/
int
ir3_delayslots(struct ir3_instruction *assigner,
- struct ir3_instruction *consumer, unsigned n)
+ struct ir3_instruction *consumer, unsigned n, bool soft)
{
if (ignore_dep(assigner, consumer, n))
return 0;
if (writes_addr(assigner))
return 6;
+ /* On a6xx, the number of delay slots it takes to get an SFU result
+ * back (ie. using nop's instead of (ss)) is:
+ *
+ * 8 - single warp
+ * 9 - two warps
+ * 10 - four warps
+ *
+ * and so on. Not quite sure where it tapers out (ie. how many
+ * warps share an SFU unit). But 10 seems like a reasonable #
+ * to choose:
+ */
+ if (soft && is_sfu(assigner))
+ return 10;
+
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
return 0;
delay = MAX2(delay, d);
}
} else {
- if (soft) {
- if (is_sfu(assigner)) {
- delay = 4;
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
- } else {
- delay = ir3_delayslots(assigner, consumer, srcn);
- }
+ delay = ir3_delayslots(assigner, consumer, srcn, soft);
delay -= distance(block, assigner, delay, pred);
}
if (i == 0)
continue;
- sd = ir3_delayslots(src, instr, i) + src->depth;
+ sd = ir3_delayslots(src, instr, i, true) + src->depth;
sd += boost;
instr->depth = MAX2(instr->depth, sd);
struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
if (dep && (state->direction == F)) {
- unsigned d = ir3_delayslots(dep->instr, node->instr, i);
+ unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
node->delay = MAX2(node->delay, d);
}
}