struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
+ int live_values; /* estimate of current live values */
bool error;
};
return is_sfu(instr) || is_mem(instr);
}
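+/* Account for a newly scheduled instruction consuming its SSA sources:
+ * decrement each source's use_count, recursing through fanin/fanout meta
+ * instructions (which are treated as pass-throughs for the liveness
+ * accounting).  When a source has no remaining uses, its register
+ * footprint is subtracted from the live_values estimate.  Sources in
+ * other blocks are skipped, since live values are tracked per block.
+ */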
+static void
+unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+ if (instr->block != src->block)
+ continue;
+ if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+ unuse_each_src(ctx, src);
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (--src->use_count == 0) {
+ ctx->live_values -= dest_regs(src);
+ debug_assert(ctx->live_values >= 0);
+ }
+ }
+ }
+}
+
+static void use_instr(struct ir3_instruction *instr);
+
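+/* use_each_src()/use_instr() are mutually recursive: they bump use_count
+ * on each (non-false-dep) SSA source of an instruction, looking through
+ * fanin/fanout meta instructions to the underlying sources.
+ */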
+static void
+use_each_src(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+ use_instr(src);
+ }
+}
+
+static void
+use_instr(struct ir3_instruction *instr)
+{
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
+ use_each_src(instr);
+ } else {
+ instr->use_count++;
+ }
+}
+
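+/* Called as an instruction is scheduled: account for its destination
+ * register footprint becoming live and release its now-consumed sources
+ * (see unuse_each_src()).  Fanin/fanout meta instructions are ignored.
+ */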
+static void
+update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ return;
+
+ ctx->live_values += dest_regs(instr);
+ unuse_each_src(ctx, instr);
+}
+
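+/* (Re)compute use_count for every instruction: each non-meta instruction
+ * contributes a use to each of its (non-false-dep) SSA sources, and each
+ * shader output counts as an additional use.
+ */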
+static void
+update_use_count(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ instr->use_count = 0;
+ }
+ }
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ continue;
+
+ use_each_src(instr);
+ }
+ }
+
+ /* Shader outputs are also used:
+ */
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ struct ir3_instruction *out = ir->outputs[i];
+
+ if (!out)
+ continue;
+
+ use_instr(out);
+ }
+}
+
#define NULL_INSTR ((void *)~0)
static void
list_addtail(&instr->node, &instr->block->instr_list);
ctx->scheduled = instr;
+ update_live_values(ctx, instr);
+
if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
clear_cache(ctx, NULL);
} else {
/* find unscheduled srcs: */
foreach_ssa_src(src, instr) {
- if (!is_scheduled(src)) {
+ if (!is_scheduled(src) && (src->block == instr->block)) {
debug_assert(nsrcs < ARRAY_SIZE(srcs));
srcs[nsrcs++] = src;
}
return NULL;
}
+/* find net change to live values if instruction were scheduled: */
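+/* (e.g. a scalar ALU instruction whose two scalar sources are both at
+ * their last use would give new_live = 1 and old_live = 2, ie. a net
+ * live_effect of -1)
+ */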
+static int
+live_effect(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+ int new_live = dest_regs(instr);
+ int old_live = 0;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+
+ if (instr->block != src->block)
+ continue;
+
+ /* for fanout/split, just pass things along to the real src: */
+ if (src->opc == OPC_META_FO)
+ src = ssa(src->regs[1]);
+
+ /* for fanin/collect, if this is the last use of *each* src,
+ * then it will decrease the live values, since RA treats
+ * them as a whole:
+ */
+ if (src->opc == OPC_META_FI) {
+ struct ir3_instruction *src2;
+ bool last_use = true;
+
+ foreach_ssa_src(src2, src) {
+ if (src2->use_count > 1) {
+ last_use = false;
+ break;
+ }
+ }
+
+ if (last_use)
+ old_live += dest_regs(src);
+
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (src->use_count == 1) {
+ old_live += dest_regs(src);
+ }
+ }
+ }
+
+ return new_live - old_live;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
bool soft)
{
struct ir3_instruction *best_instr = NULL;
- unsigned min_delay = ~0;
+ int best_rank = INT_MAX; /* lower is better */
+ unsigned deepest = 0;
/* TODO we'd really rather use the list/array of block outputs. But we
* don't have such a thing. Recursing *every* instruction in the list
*/
list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
struct ir3_instruction *candidate;
- unsigned delay;
candidate = find_instr_recursive(ctx, notes, instr);
if (!candidate)
continue;
- delay = delay_calc(ctx->block, candidate, soft, false);
- if (delay < min_delay) {
- best_instr = candidate;
- min_delay = delay;
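+ /* candidates that are meta instructions (which do not map to real hw
+ * instructions) are taken immediately; otherwise just note the deepest
+ * candidate for the second pass below:
+ */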
+ if (is_meta(candidate))
+ return candidate;
+
+ deepest = MAX2(deepest, candidate->depth);
+ }
+
+ /* traverse the list a second time.. but since we cache the result of
+ * find_instr_recursive() it isn't as bad as it looks.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ /* determine net change to # of live values: */
+ int le = live_effect(candidate);
+
+ /* if there is a net increase in # of live values, then apply some
+ * threshold to avoid instructions getting scheduled *too* early
+ * and increasing register pressure.
+ */
+ if (le >= 1) {
+ unsigned threshold;
+
+ if (ctx->live_values > 4*4) {
+ threshold = 4;
+ } else {
+ threshold = 6;
+ }
+
+ /* Filter out any "shallow" instructions which would otherwise
+ * tend to get scheduled too early to fill delay slots even
+ * when they are not needed for a while. There will probably
+ * be later delay slots that they could just as easily fill.
+ *
+ * A classic case where this comes up is frag shaders that
+ * write a constant value (like 1.0f) to one of the channels
+ * of the output color(s). Since the mov from immed has no
+ * dependencies, it would otherwise get scheduled early to
+ * fill delay slots, occupying a register until the end of
+ * the program.
+ */
+ if ((deepest - candidate->depth) > threshold)
+ continue;
}
- if (min_delay == 0)
- break;
+ int rank = delay_calc(ctx->block, candidate, soft, false);
+
+ /* if too many live values, prioritize instructions that reduce the
+ * number of live values:
+ */
+ if (ctx->live_values > 16*4) {
+ rank = le;
+ } else if (ctx->live_values > 4*4) {
+ rank += le;
+ }
+
+ if (rank < best_rank) {
+ best_instr = candidate;
+ best_rank = rank;
+ }
}
return best_instr;
struct ir3_sched_ctx ctx = {0};
ir3_clear_mark(ir);
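+ /* pre-compute SSA use counts, used to track live values while scheduling: */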
+ update_use_count(ir);
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ctx.live_values = 0;
sched_block(&ctx, block);
}
if (ctx.error)
return -1;
+
return 0;
}
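+/* Find the array id referenced by an instruction's array src or dst
+ * register, used below to limit false dependencies between accesses to
+ * different arrays:
+ */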
+static unsigned
+get_array_id(struct ir3_instruction *instr)
+{
+ /* The expectation is that there is only a single array
+ * src or dst; ir3_cp should enforce this.
+ */
+
+ for (unsigned i = 0; i < instr->regs_count; i++)
+ if (instr->regs[i]->flags & IR3_REG_ARRAY)
+ return instr->regs[i]->array.id;
+
+ unreachable("this was unexpected");
+}
+
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
return true;
- return !!(instr->barrier_class & prior->barrier_conflict);
+
+ if (instr->barrier_class & prior->barrier_conflict) {
+ if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
+ /* if only array barrier, then we can further limit false-deps
+ * by considering the array-id, ie reads/writes to different
+ * arrays do not depend on each other (no aliasing)
+ */
+ if (get_array_id(instr) != get_array_id(prior)) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ return false;
}
static void