bool error;
};
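+/* The scheduler uses the IR3_INSTR_MARK flag to track which
+ * instructions it has already scheduled:
+ */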
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+ return !!(instr->flags & IR3_INSTR_MARK);
+}
+
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
return is_sfu(instr) || is_mem(instr);
continue;
if (instr->block != src->block)
continue;
- if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+ if ((src->opc == OPC_META_COLLECT) || (src->opc == OPC_META_SPLIT)) {
unuse_each_src(ctx, src);
} else {
debug_assert(src->use_count > 0);
}
}
+static void clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
static void use_instr(struct ir3_instruction *instr);
+/* transfers a use-count to the new instruction, for cases where we
+ * "spill" address or predicate.  Note this might cause the
+ * previous instruction that loaded a0.x/p0.x to become live
+ * again, when we previously thought it was dead.
+ */
+static void
+transfer_use(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr,
+ struct ir3_instruction *new_instr)
+{
+ struct ir3_instruction *src;
+
+ debug_assert(is_scheduled(orig_instr));
+
+ foreach_ssa_src_n(src, n, new_instr) {
+ if (__is_false_dep(new_instr, n))
+ continue;
+ ctx->live_values += dest_regs(src);
+ use_instr(src);
+ }
+
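+ /* use counts have changed, so invalidate any cached scheduling
+ * decisions that may still refer to the original instruction:
+ */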
+ clear_cache(ctx, orig_instr);
+}
+
static void
use_each_src(struct ir3_instruction *instr)
{
static void
use_instr(struct ir3_instruction *instr)
{
- if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
+ if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT)) {
use_each_src(instr);
} else {
instr->use_count++;
static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
- if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
return;
ctx->live_values += dest_regs(instr);
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
- if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ if ((instr->opc == OPC_META_COLLECT) || (instr->opc == OPC_META_SPLIT))
continue;
use_each_src(instr);
/* Shader outputs are also used:
*/
- for (unsigned i = 0; i < ir->noutputs; i++) {
- struct ir3_instruction *out = ir->outputs[i];
-
- if (!out)
- continue;
-
+ struct ir3_instruction *out;
+ foreach_output(out, ir)
use_instr(out);
- }
}
#define NULL_INSTR ((void *)~0)
return NULL;
for (; i < nsrcs; i++)
- if (srcs[i] && (srcs[i]->sun > d->sun))
+ if (srcs[i] && (srcs[i]->depth > d->depth))
d = srcs[id = i];
srcs[id] = NULL;
/* (ab)use block->data to prevent recursion: */
block->data = block;
- for (unsigned i = 0; i < block->predecessors_count; i++) {
+ set_foreach(block->predecessors, entry) {
+ struct ir3_block *pred_block = (struct ir3_block *)entry->key;
unsigned n;
- n = distance(block->predecessors[i], instr, min, pred);
+ n = distance(pred_block, instr, min, pred);
min = MIN2(min, n);
}
bool addr_conflict, pred_conflict;
};
-static bool is_scheduled(struct ir3_instruction *instr)
-{
- return !!(instr->flags & IR3_INSTR_MARK);
-}
-
/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *instr)
{
+ debug_assert(!is_scheduled(instr));
+
/* For instructions that write address register we need to
* make sure there is at least one instruction that uses the
* addr value which is otherwise ready.
return NULL;
}
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+ int new_live = dest_regs(instr);
+ int old_live = 0;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+
+ if (instr->block != src->block)
+ continue;
+
+ /* for split, just pass things along to the real src: */
+ if (src->opc == OPC_META_SPLIT)
+ src = ssa(src->regs[1]);
+
+ /* for collect, if this is the last use of *each* src,
+ * then it will decrease the live values, since RA treats
+ * them as a whole:
+ */
+ if (src->opc == OPC_META_COLLECT) {
+ struct ir3_instruction *src2;
+ bool last_use = true;
+
+ foreach_ssa_src(src2, src) {
+ if (src2->use_count > 1) {
+ last_use = false;
+ break;
+ }
+ }
+
+ if (last_use)
+ old_live += dest_regs(src);
+
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (src->use_count == 1) {
+ old_live += dest_regs(src);
+ }
+ }
+ }
+
+ return new_live - old_live;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
bool soft)
{
struct ir3_instruction *best_instr = NULL;
- unsigned min_delay = ~0;
+ int best_rank = INT_MAX; /* lower is better */
+ unsigned deepest = 0;
/* TODO we'd really rather use the list/array of block outputs. But we
* don't have such a thing. Recursing *every* instruction in the list
*/
list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
struct ir3_instruction *candidate;
- unsigned delay;
candidate = find_instr_recursive(ctx, notes, instr);
if (!candidate)
continue;
- if (ctx->live_values > 16*4) {
- /* under register pressure, only care about reducing live values: */
- if (!best_instr || (candidate->sun > best_instr->sun))
- best_instr = candidate;
- } else {
- delay = delay_calc(ctx->block, candidate, soft, false);
- if ((delay < min_delay) ||
- ((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
- best_instr = candidate;
- min_delay = delay;
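+ /* meta instructions don't correspond to real hw instructions,
+ * so there is nothing to gain by delaying them; schedule them
+ * as soon as they are ready:
+ */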
+ if (is_meta(candidate))
+ return candidate;
+
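+ /* otherwise just remember the deepest ready candidate; the
+ * "shallowness" threshold below is measured against this:
+ */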
+ deepest = MAX2(deepest, candidate->depth);
+ }
+
+ /* traverse the list a second time; since we cache the result of
+ * find_instr_recursive() it isn't as bad as it looks.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ /* determine net change to # of live values: */
+ int le = live_effect(candidate);
+
+ /* if there is a net increase in # of live values, then apply some
+ * threshold to avoid instructions getting scheduled *too* early
+ * and increasing register pressure.
+ */
+ if (le >= 1) {
+ unsigned threshold;
+
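+ /* use a tighter threshold once register pressure starts to
+ * climb:
+ */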
+ if (ctx->live_values > 4*4) {
+ threshold = 4;
+ } else {
+ threshold = 6;
}
+
+ /* Filter out any "shallow" instructions which would otherwise
+ * tend to get scheduled too early to fill delay slots even
+ * when they are not needed for a while. There will probably
+ * be later delay slots that they could just as easily fill.
+ *
+ * A classic case where this comes up is frag shaders that
+ * write a constant value (like 1.0f) to one of the channels
+ * of the output color(s). Since the mov from immed has no
+ * dependencies, it would otherwise get scheduled early to
+ * fill delay slots, occupying a register until the end of
+ * the program.
+ */
+ if ((deepest - candidate->depth) > threshold)
+ continue;
+ }
+
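+ /* by default, rank candidates by how many delay slots we would
+ * have to wait before they could issue:
+ */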
+ int rank = delay_calc(ctx->block, candidate, soft, false);
+
+ /* if too many live values, prioritize instructions that reduce the
+ * number of live values:
+ */
+ if (ctx->live_values > 16*4) {
+ rank = le;
+ } else if (ctx->live_values > 4*4) {
+ rank += le;
+ }
+
+ if (rank < best_rank) {
+ best_instr = candidate;
+ best_rank = rank;
}
}
return best_instr;
}
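+/* "spill" helper: clone an already scheduled instruction (eg. the
+ * one that wrote the address or predicate register) and transfer the
+ * remaining unscheduled uses over to the clone:
+ */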
+static struct ir3_instruction *
+split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
+{
+ struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
+ ir3_insert_by_depth(new_instr, &ctx->depth_list);
+ transfer_use(ctx, orig_instr, new_instr);
+ return new_instr;
+}
+
/* "spill" the address register by remapping any unscheduled
* instructions which depend on the current address register
* to a clone of the instruction which wrote the address reg.
*/
if (indirect->address == ctx->addr) {
if (!new_addr) {
- new_addr = ir3_instr_clone(ctx->addr);
+ new_addr = split_instr(ctx, ctx->addr);
/* original addr is scheduled, but new one isn't: */
new_addr->flags &= ~IR3_INSTR_MARK;
}
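+ /* clear the stale link to the old address writer before
+ * re-pointing this instruction at the clone:
+ */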
+ indirect->address = NULL;
ir3_instr_set_address(indirect, new_addr);
}
}
*/
if (ssa(predicated->regs[1]) == ctx->pred) {
if (!new_pred) {
- new_pred = ir3_instr_clone(ctx->pred);
+ new_pred = split_instr(ctx, ctx->pred);
/* original pred is scheduled, but new one isn't: */
new_pred->flags &= ~IR3_INSTR_MARK;
}
list_inithead(&block->instr_list);
list_inithead(&ctx->depth_list);
- /* first a pre-pass to schedule all meta:input instructions
- * (which need to appear first so that RA knows the register is
- * occupied), and move remaining to depth sorted list:
+ /* First schedule all meta:input instructions, followed by
+ * tex-prefetch. We want all of the instructions that load
+ * values into registers before the shader starts to be
+ * scheduled ahead of any other instructions, and in
+ * particular we want inputs to come before prefetches.
+ * This is because a FS's bary_ij input may not actually
+ * be live in the shader, but it should not be scheduled
+ * on top of any other input (though it can be overwritten
+ * by a tex prefetch).
+ *
+ * Finally, move all the remaining instructions to the
+ * depth-sorted list.
*/
- list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
- if (instr->opc == OPC_META_INPUT) {
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
+ if (instr->opc == OPC_META_INPUT)
schedule(ctx, instr);
- } else {
- ir3_insert_by_depth(instr, &ctx->depth_list);
- }
- }
- while (!list_empty(&ctx->depth_list)) {
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
+ if (instr->opc == OPC_META_TEX_PREFETCH)
+ schedule(ctx, instr);
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node)
+ ir3_insert_by_depth(instr, &ctx->depth_list);
+
+ while (!list_is_empty(&ctx->depth_list)) {
struct ir3_sched_notes notes = {0};
struct ir3_instruction *instr;
list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
unsigned delay = 0;
- for (unsigned i = 0; i < block->predecessors_count; i++) {
- unsigned d = delay_calc(block->predecessors[i], instr, false, true);
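+ /* an instruction's producers may live in any predecessor
+ * block, so the required delay is the worst case across all
+ * of them:
+ */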
+ set_foreach(block->predecessors, entry) {
+ struct ir3_block *pred = (struct ir3_block *)entry->key;
+ unsigned d = delay_calc(pred, instr, false, true);
delay = MAX2(d, delay);
}