return NULL;
for (; i < nsrcs; i++)
- if (srcs[i] && (srcs[i]->sun > d->sun))
+ if (srcs[i] && (srcs[i]->depth > d->depth))
d = srcs[id = i];
srcs[id] = NULL;
return d;
}
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+ int new_live = dest_regs(instr);
+ int old_live = 0;
+
+ foreach_ssa_src_n(src, n, instr) {
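+ /* a false dep is an ordering-only dependency, not a value that is consumed: */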
+ if (__is_false_dep(instr, n))
+ continue;
+
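+ /* srcs defined in other blocks don't factor into this block's live count: */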
+ if (instr->block != src->block)
+ continue;
+
+ /* for fanout/split, just pass things along to the real src: */
+ if (src->opc == OPC_META_FO)
+ src = ssa(src->regs[1]);
+
+ /* for fanin/collect, if this is the last use of *each* src,
+ * then it will decrease the live values, since RA treats
+ * them as a whole:
+ */
+ if (src->opc == OPC_META_FI) {
+ struct ir3_instruction *src2;
+ bool last_use = true;
+
+ foreach_ssa_src(src2, src) {
+ if (src2->use_count > 1) {
+ last_use = false;
+ break;
+ }
+ }
+
+ if (last_use)
+ old_live += dest_regs(src);
+
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (src->use_count == 1) {
+ old_live += dest_regs(src);
+ }
+ }
+ }
+
+ return new_live - old_live;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
bool soft)
{
struct ir3_instruction *best_instr = NULL;
- unsigned min_delay = ~0;
+ int best_rank = INT_MAX; /* lower is better */
+ unsigned deepest = 0;
/* TODO we'd really rather use the list/array of block outputs. But we
* don't have such a thing. Recursing *every* instruction in the list
* won't scale well.
*/
list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
struct ir3_instruction *candidate;
- unsigned delay;
candidate = find_instr_recursive(ctx, notes, instr);
if (!candidate)
continue;
- if (ctx->live_values > 16*4) {
- /* under register pressure, only care about reducing live values: */
- if (!best_instr || (candidate->sun > best_instr->sun))
- best_instr = candidate;
- } else {
- delay = delay_calc(ctx->block, candidate, soft, false);
- if ((delay < min_delay) ||
- ((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
- best_instr = candidate;
- min_delay = delay;
+ deepest = MAX2(deepest, candidate->depth);
+ }
+
+ /* traverse the list a second time.. but since we cache the result of
+ * find_instr_recursive() it isn't as bad as it looks.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ /* determine net change to # of live values: */
+ int le = live_effect(candidate);
+
+ /* if there is a net increase in # of live values, then apply some
+ * threshold to avoid instructions getting scheduled *too* early
+ * and increasing register pressure.
+ */
+ if (le >= 1) {
+ unsigned threshold;
+
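+ /* be more conservative once the live value count is already elevated: */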
+ if (ctx->live_values > 4*4) {
+ threshold = 4;
+ } else {
+ threshold = 6;
}
+
+ /* Filter out any "shallow" instructions which would otherwise
+ * tend to get scheduled too early to fill delay slots even
+ * when they are not needed for a while. There will probably
+ * be later delay slots that they could just as easily fill.
+ *
+ * A classic case where this comes up is frag shaders that
+ * write a constant value (like 1.0f) to one of the channels
+ * of the output color(s). Since the mov from immed has no
+ * dependencies, it would otherwise get scheduled early to
+ * fill delay slots, occupying a register until the end of
+ * the program.
+ */
+ if ((deepest - candidate->depth) > threshold)
+ continue;
+ }
+
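+ /* rank candidates primarily by how many delay cycles they would need to wait on their srcs: */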
+ int rank = delay_calc(ctx->block, candidate, soft, false);
+
+ /* if too many live values, prioritize instructions that reduce the
+ * number of live values:
+ */
+ if (ctx->live_values > 16*4) {
+ rank = le;
+ } else if (ctx->live_values > 4*4) {
+ rank += le;
+ }
+
+ if (rank < best_rank) {
+ best_instr = candidate;
+ best_rank = rank;
}
}