From 7273cb4e933f8be65fc73b9d8c69c76d1078cb14 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Thu, 30 Apr 2015 13:57:15 -0400
Subject: [PATCH] freedreno/ir3/sched: convert to priority queue

Use a more standard priority-queue based scheduling algo.  It is
simpler and will make things easier once we have multiple basic
blocks and flow control.

Signed-off-by: Rob Clark
---
 src/gallium/drivers/freedreno/ir3/ir3.c       |   1 +
 src/gallium/drivers/freedreno/ir3/ir3.h       |   3 +
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |   1 +
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 466 +++++++++---------
 4 files changed, 242 insertions(+), 229 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 84564a9eef7..aea1b967b07 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -82,6 +82,7 @@ void ir3_destroy(struct ir3 *shader)
 		free(chunk);
 	}
 	free(shader->indirects);
+	free(shader->predicates);
 	free(shader->baryfs);
 	free(shader);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index edb5b49e23c..030a74fe21a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -346,6 +346,9 @@ struct ir3 {
 	 */
 	unsigned indirects_count, indirects_sz;
 	struct ir3_instruction **indirects;
+	/* and same for instructions that consume predicate register: */
+	unsigned predicates_count, predicates_sz;
+	struct ir3_instruction **predicates;
 
 	struct ir3_block *block;
 	unsigned heap_idx;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8d382e5cf3e..caea34c7fd4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1250,6 +1250,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		cond->regs[0]->num = regid(REG_P0, 0);
 
 		kill = ir3_KILL(b, cond, 0);
+		array_insert(ctx->ir->predicates, kill);
 
 		ctx->kill[ctx->kill_count++] = kill;
 		ctx->so->has_kill = true;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index fc41f93b884..1d166d879df 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -31,23 +31,14 @@
 
 #include "ir3.h"
 
-enum {
-	SCHEDULED = -1,
-	DELAYED = -2,
-};
-
 /*
  * Instruction Scheduling:
  *
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path.  The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth.  After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again.  If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's.  Repeat until no more unscheduled
- * instructions.
+ * A priority-queue based scheduling algo.  Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list).  Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -60,67 +51,29 @@ enum {
  */
 
 struct ir3_sched_ctx {
-	struct ir3_instruction *scheduled; /* last scheduled instr */
+	struct ir3_block *block;           /* the current block */
+	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
-	unsigned cnt;
 	bool error;
 };
 
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-	struct ir3_instruction *d = NULL;
-	unsigned i = 0, id = 0;
-
-	while ((i < nsrcs) && !(d = srcs[id = i]))
-		i++;
-
-	if (!d)
-		return NULL;
-
-	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->depth > d->depth))
-			d = srcs[id = i];
-
-	srcs[id] = NULL;
-
-	return d;
-}
-
-static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-		unsigned maxd)
-{
-	struct list_head *instr_list = &instr->block->instr_list;
-	unsigned d = 0;
-
-	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
-		if ((n == instr) || (d >= maxd))
-			break;
-		if (is_alu(n) || is_flow(n))
-			d++;
-	}
-
-	return d;
-}
-
 static bool is_sfu_or_mem(struct ir3_instruction *instr)
 {
 	return is_sfu(instr) || is_mem(instr);
 }
 
-static void schedule(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr, bool remove)
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-	struct ir3_block *block = instr->block;
+	debug_assert(ctx->block == instr->block);
 
 	/* maybe there is a better way to handle this than just stuffing
 	 * a nop.. ideally we'd know about this constraint in the
 	 * scheduling and depth calculation..
 	 */
 	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		ir3_NOP(block);
+		ir3_NOP(ctx->block);
 
 	/* remove from depth list:
@@ -140,16 +93,28 @@ static void schedule(struct ir3_sched_ctx *ctx,
 	list_addtail(&instr->node, &instr->block->instr_list);
 
 	ctx->scheduled = instr;
-
-	ctx->cnt++;
 }
 
-/*
- * Delay-slot calculation.  Follows fanin/fanout.
- */
+static unsigned
+distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
+		unsigned maxd)
+{
+	struct list_head *instr_list = &ctx->block->instr_list;
+	unsigned d = 0;
+
+	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+		if ((n == instr) || (d >= maxd))
+			break;
+		if (is_alu(n) || is_flow(n))
+			d++;
+	}
+
+	return d;
+}
 
 /* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned srcn)
 {
@@ -158,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 	if (is_meta(assigner)) {
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
-			unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+			unsigned d;
+			if (src->block != assigner->block)
+				break;
+			d = delay_calc_srcn(ctx, src, consumer, srcn);
 			delay = MAX2(delay, d);
 		}
 	} else {
@@ -170,48 +138,77 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 }
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 
 	foreach_ssa_src_n(src, i, instr) {
-		unsigned d = delay_calc_srcn(ctx, src, instr, i);
+		unsigned d;
+		if (src->block != instr->block)
+			continue;
+		d = delay_calc_srcn(ctx, src, instr, i);
 		delay = MAX2(delay, d);
 	}
 
 	return delay;
}
 
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+	/* there is at least one kill which could be scheduled, except
+	 * for unscheduled bary.f's:
+	 */
+	bool blocked_kill;
+	/* there is at least one instruction that could be scheduled,
+	 * except for conflicting address/predicate register usage:
+	 */
+	bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+	return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+static bool
+check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	/* if this is a write to address/predicate register, and that
+	 * register is currently in use, we need to defer until it is
+	 * free:
+	 */
+	if (writes_addr(instr) && ctx->addr) {
+		assert(ctx->addr != instr);
+		notes->addr_conflict = true;
+		return true;
+	}
+
+	if (writes_pred(instr) && ctx->pred) {
+		assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return true;
+	}
+
+	return false;
+}
+
+/* is this instruction ready to be scheduled?  Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
  */
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *srcs[64];
 	struct ir3_instruction *src;
-	unsigned delay, nsrcs = 0;
-
-	/* if already scheduled: */
-	if (instr->flags & IR3_INSTR_MARK)
-		return 0;
+	unsigned delay = 0;
 
-	/* figure out our src's, copy 'em out into an array for sorting: */
 	foreach_ssa_src(src, instr) {
-		debug_assert(nsrcs < ARRAY_SIZE(srcs));
-		srcs[nsrcs++] = src;
-	}
-
-	/* for each src register in sorted order:
-	 */
-	delay = 0;
-	while ((src = deepest(srcs, nsrcs))) {
-		delay = trysched(ctx, src);
-		if (delay)
-			return delay;
+		/* if dependency not scheduled, we aren't ready yet: */
+		if (!is_scheduled(src))
+			return -1;
 	}
 
 	/* all our dependents are scheduled, figure out if
@@ -236,183 +233,194 @@
 	 */
 	if (is_kill(instr)) {
 		struct ir3 *ir = instr->block->shader;
-		unsigned i;
 
-		for (i = 0; i < ir->baryfs_count; i++) {
+		for (unsigned i = 0; i < ir->baryfs_count; i++) {
 			struct ir3_instruction *baryf = ir->baryfs[i];
 			if (baryf->depth == DEPTH_UNUSED)
 				continue;
-			delay = trysched(ctx, baryf);
-			if (delay)
-				return delay;
+			if (!is_scheduled(baryf)) {
+				notes->blocked_kill = true;
+				return -1;
+			}
 		}
 	}
 
-	/* if this is a write to address/predicate register, and that
-	 * register is currently in use, we need to defer until it is
-	 * free:
-	 */
-	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
-		return DELAYED;
-	}
-	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
-		return DELAYED;
-	}
+	if (check_conflict(ctx, notes, instr))
+		return -1;
 
-	schedule(ctx, instr, true);
-	return SCHEDULED;
+	return 0;
 }
 
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct list_head *prio_queue, struct list_head *unscheduled_list)
 {
-	return instr->address && (ctx->addr == instr->address);
-}
+	unsigned min_delay = ~0;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+		int e = instr_eligibility(ctx, notes, instr);
+		if (e < 0)
+			continue;
+		min_delay = MIN2(min_delay, e);
+		if (e == 0) {
+			/* remove from unscheduled list and into priority queue: */
+			list_delinit(&instr->node);
+			ir3_insert_by_depth(instr, prio_queue);
+		}
+	}
 
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-	foreach_ssa_src(src, instr)
-		if (ctx->pred == src)
-			return true;
-	return false;
+	return min_delay;
 }
 
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
  */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-		struct list_head *unscheduled_list)
+static void
+split_addr(struct ir3_sched_ctx *ctx)
 {
-	bool addr_in_use = false;
-	bool pred_in_use = false;
-	bool all_delayed = true;
-	unsigned cnt = ~0, attempted = 0;
-
-	list_for_each_entry_safe(struct ir3_instruction, instr, unscheduled_list, node) {
-		bool addr = uses_current_addr(ctx, instr);
-		bool pred = uses_current_pred(ctx, instr);
-
-		if (addr || pred) {
-			int ret = trysched(ctx, instr);
-
-			if (ret != DELAYED)
-				all_delayed = false;
-
-			if (ret == SCHEDULED)
-				cnt = 0;
-			else if (ret > 0)
-				cnt = MIN2(cnt, ret);
-			if (addr)
-				addr_in_use = true;
-			if (pred)
-				pred_in_use = true;
-
-			attempted++;
+	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3_instruction *new_addr = NULL;
+	unsigned i;
+
+	debug_assert(ctx->addr);
+
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *indirect = ir->indirects[i];
+
+		/* skip instructions already scheduled: */
+		if (indirect->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current addr
+		 * to new addr:
+		 */
+		if (indirect->address == ctx->addr) {
+			if (!new_addr) {
+				new_addr = ir3_instr_clone(ctx->addr);
+				/* original addr is scheduled, but new one isn't: */
+				new_addr->flags &= ~IR3_INSTR_MARK;
+			}
+			indirect->address = new_addr;
 		}
 	}
 
-	if (!addr_in_use)
-		ctx->addr = NULL;
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
+}
 
-	if (!pred_in_use)
-		ctx->pred = NULL;
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
+ */
+static void
+split_pred(struct ir3_sched_ctx *ctx)
+{
+	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3_instruction *new_pred = NULL;
+	unsigned i;
 
-	/* detect if we've gotten ourselves into an impossible situation
-	 * and bail if needed
-	 */
-	if (all_delayed && (attempted > 0)) {
-		if (pred_in_use) {
-			/* TODO we probably need to keep a list of instructions
-			 * that reference predicate, similar to indirects
-			 */
-			ctx->error = true;
-			return DELAYED;
-		}
-		if (addr_in_use) {
-			struct ir3 *ir = ctx->addr->block->shader;
-			struct ir3_instruction *new_addr =
-					ir3_instr_clone(ctx->addr);
-			unsigned i;
-
-			/* original addr is scheduled, but new one isn't: */
-			new_addr->flags &= ~IR3_INSTR_MARK;
-
-			for (i = 0; i < ir->indirects_count; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-
-				/* skip instructions already scheduled: */
-				if (indirect->flags & IR3_INSTR_MARK)
-					continue;
-
-				/* remap remaining instructions using current addr
-				 * to new addr:
-				 */
-				if (indirect->address == ctx->addr)
-					indirect->address = new_addr;
-			}
+	debug_assert(ctx->pred);
+
+	for (i = 0; i < ir->predicates_count; i++) {
+		struct ir3_instruction *predicated = ir->predicates[i];
 
-			/* all remaining indirects remapped to new addr: */
-			ctx->addr = NULL;
+		/* skip instructions already scheduled: */
+		if (predicated->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current pred
+		 * to new pred:
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(predicated->regs[1]) == ctx->pred) {
+			if (!new_pred) {
+				new_pred = ir3_instr_clone(ctx->pred);
+				/* original pred is scheduled, but new one isn't: */
+				new_pred->flags &= ~IR3_INSTR_MARK;
+			}
+			predicated->regs[1]->instr = new_pred;
		}
 	}
 
-	return cnt;
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
 }
 
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct list_head unscheduled_list;
+	struct list_head unscheduled_list, prio_queue;
+
+	ctx->block = block;
+
+	/* move all instructions to the unscheduled list, and
+	 * empty the block's instruction list (to which we will
+	 * be inserting).
+	 */
 	list_replace(&block->instr_list, &unscheduled_list);
 	list_inithead(&block->instr_list);
+	list_inithead(&prio_queue);
 
-	/* schedule all the shader input's (meta-instr) first so that
-	 * the RA step sees that the input registers contain a value
-	 * from the start of the shader:
+	/* first a pre-pass to schedule all meta:input/phi instructions
+	 * (which need to appear first so that RA knows the register is
+	 * occupied):
 	 */
-	if (!block->parent) {
-		unsigned i;
-		for (i = 0; i < block->ninputs; i++) {
-			struct ir3_instruction *in = block->inputs[i];
-			if (in)
-				schedule(ctx, in, true);
-		}
-	}
-
 	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-		int cnt = trysched(ctx, instr);
+		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+				(instr->opc == OPC_META_PHI)))
+			schedule(ctx, instr);
+	}
 
-		if (cnt == DELAYED)
-			cnt = block_sched_undelayed(ctx, &unscheduled_list);
+	while (!(list_empty(&unscheduled_list) &&
+			list_empty(&prio_queue))) {
+		struct ir3_sched_notes notes = {0};
+		unsigned delay;
 
-		/* -1 is signal to return up stack, but to us means same as 0: */
-		cnt = MAX2(0, cnt);
-		cnt += ctx->cnt;
+		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
 
-		/* if deepest remaining instruction cannot be scheduled, try
-		 * the increasingly more shallow instructions until needed
-		 * number of delay slots is filled:
-		 */
-		list_for_each_entry_safe (struct ir3_instruction, instr, &instr->node, node)
-			trysched(ctx, instr);
+		if (!list_empty(&prio_queue)) {
+			struct ir3_instruction *instr = list_last_entry(&prio_queue,
+					struct ir3_instruction, node);
+			/* ugg, this is a bit ugly, but between the time when
+			 * the instruction became eligible and now, a new
+			 * conflict may have arisen..
+			 */
+			if (check_conflict(ctx, &notes, instr)) {
+				list_del(&instr->node);
+				list_addtail(&instr->node, &unscheduled_list);
+				continue;
+			}
 
-		/* and if we run out of instructions that can be scheduled,
-		 * then it is time for nop's:
-		 */
-		while (cnt > ctx->cnt)
-			schedule(ctx, ir3_NOP(block), false);
+			schedule(ctx, instr);
+		} else if (delay == ~0) {
+			/* nothing available to schedule.. if we are blocked on
+			 * address/predicate register conflict, then break the
+			 * deadlock by cloning the instruction that wrote that
+			 * reg:
+			 */
+			if (notes.addr_conflict) {
+				split_addr(ctx);
+			} else if (notes.pred_conflict) {
+				split_pred(ctx);
+			} else {
+				debug_assert(0);
+				ctx->error = true;
+				return;
+			}
+		} else {
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
+			 */
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
+			}
+		}
 	}
 }
 
@@ -420,7 +428,7 @@ int ir3_block_sched(struct ir3_block *block)
 {
 	struct ir3_sched_ctx ctx = {0};
 	ir3_clear_mark(block->shader);
-	block_sched(&ctx, block);
+	sched_block(&ctx, block);
 	if (ctx.error)
 		return -1;
 	return 0;
-- 
2.30.2
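
A few notes on the techniques above, with standalone sketches (none of
this is ir3 code).

First, the new predicates bookkeeping: the predicates_count/predicates_sz
pair added to ir3.h and the array_insert(ctx->ir->predicates, kill) call
follow the same growable-array convention already used for indirects and
baryfs.  The macro itself is outside this diff; the sketch below
(ARRAY_PUSH, toy_ir and track_predicate are invented names, not ir3 API)
just illustrates the count/size doubling scheme that convention assumes.

#include <stdlib.h>

/* illustrative stand-in for ir3's array_insert(): grow capacity
 * (sz) by doubling when the used count catches up to it: */
#define ARRAY_PUSH(arr, count, sz, val)                            \
	do {                                                           \
		if ((count) == (sz)) {                                     \
			(sz) = (sz) ? 2 * (sz) : 16;  /* amortized doubling */ \
			(arr) = realloc((arr), (sz) * sizeof((arr)[0]));       \
		}                                                          \
		(arr)[(count)++] = (val);                                  \
	} while (0)

/* usage mirroring the new predicates fields: */
struct toy_ir {
	unsigned predicates_count, predicates_sz;
	int **predicates;   /* stands in for struct ir3_instruction ** */
};

void track_predicate(struct toy_ir *ir, int *kill)
{
	ARRAY_PUSH(ir->predicates, ir->predicates_count,
	           ir->predicates_sz, kill);
}

Keeping the allocated capacity (_sz) separate from the used count makes
the common-case insert O(1) amortized.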
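
To see the shape of the new algorithm without ir3's list machinery, here
is a self-contained toy in plain C (toy_instr, eligible, toy_sched and
the three-instruction DAG are all invented for illustration): an
instruction becomes eligible once every source is scheduled, and among
eligible instructions the deepest is emitted first, which mirrors the
pick order that ir3_insert_by_depth() plus list_last_entry() give the
real scheduler.

#include <stdbool.h>
#include <stdio.h>

#define MAX_SRCS 2

struct toy_instr {
	const char *name;
	int depth;          /* priority: depth from the depth pass */
	bool scheduled;
	struct toy_instr *src[MAX_SRCS];  /* dependencies (NULL = unused) */
};

/* eligible == every dependency is already scheduled: */
static bool eligible(const struct toy_instr *in)
{
	for (int i = 0; i < MAX_SRCS; i++)
		if (in->src[i] && !in->src[i]->scheduled)
			return false;
	return true;
}

static void toy_sched(struct toy_instr *instrs, int n)
{
	for (int done = 0; done < n; done++) {
		struct toy_instr *best = NULL;

		/* "pop" the deepest eligible instruction; the real code
		 * keeps a depth-sorted list instead of rescanning: */
		for (int i = 0; i < n; i++) {
			struct toy_instr *in = &instrs[i];
			if (in->scheduled || !eligible(in))
				continue;
			if (!best || in->depth > best->depth)
				best = in;
		}

		if (!best)
			break;  /* real scheduler emits nops or splits a0.x/p0.x here */

		best->scheduled = true;
		printf("sched %s\n", best->name);
	}
}

int main(void)
{
	struct toy_instr in[3] = {
		{ .name = "load",  .depth = 2 },
		{ .name = "add",   .depth = 1 },
		{ .name = "store", .depth = 0 },
	};
	in[1].src[0] = &in[0];  /* add   consumes load */
	in[2].src[0] = &in[1];  /* store consumes add  */

	toy_sched(in, 3);       /* prints: load, add, store */
	return 0;
}

The toy rescans the whole array each iteration; the patch instead keeps
a separate depth-sorted prio_queue so the pop is cheap and only newly
eligible instructions ever need inserting.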
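
The delay numbers that drive the nop-emitting arm of sched_block()'s
main loop come from comparing how many slots a result needs before it
may be consumed against how many instructions have already been emitted
behind the producer.  A minimal sketch over a flat array of instruction
ids (toy_distance and nops_needed are invented; the real distance()
walks block->instr_list backwards counting only alu/flow instructions,
and the per-source slot counts come from ir3's delay tables):

/* count instructions emitted after the producer, newest first,
 * capped at maxd the same way distance() caps its backwards walk: */
unsigned toy_distance(const int *emitted_ids, unsigned n,
		int producer_id, unsigned maxd)
{
	unsigned d = 0;
	for (unsigned i = n; i-- > 0; ) {
		if ((emitted_ids[i] == producer_id) || (d >= maxd))
			break;
		d++;
	}
	return d;
}

/* a consumer needing `needed` slots after its producer must be
 * preceded by this many nops: */
unsigned nops_needed(const int *emitted_ids, unsigned n,
		int producer_id, unsigned needed)
{
	unsigned d = toy_distance(emitted_ids, n, producer_id, needed);
	return needed - d;   /* d is capped at `needed`, so no underflow */
}

The debug_assert(delay <= 6) in the main loop reflects that the
worst-case producer-to-consumer latency in the delay tables is small
and bounded, so only a handful of nops ever need stuffing.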
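
Finally, split_addr()/split_pred() implement the one non-obvious trick:
when everything still schedulable needs the live a0.x/p0.x value but
some other instruction must write that register first, the original
writer is cloned and the unscheduled consumers are repointed at the
clone, letting the old value retire.  A reduced sketch of the remap
(spill_instr, clone_instr and split_addr_sketch are invented stand-ins;
the real code uses ir3_instr_clone() and the IR3_INSTR_MARK flag, as in
the patch above):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct spill_instr {
	bool scheduled;               /* stands in for IR3_INSTR_MARK */
	struct spill_instr *address;  /* the a0.x writer this instr consumes */
};

static struct spill_instr *clone_instr(const struct spill_instr *in)
{
	/* (allocation error handling elided in this sketch) */
	struct spill_instr *c = malloc(sizeof(*c));
	memcpy(c, in, sizeof(*c));
	c->scheduled = false;         /* clone still needs to be scheduled */
	return c;
}

/* repoint every unscheduled user of `addr` at a single fresh clone;
 * afterwards the original a0.x value has no remaining users, so the
 * conflicting writer becomes schedulable: */
void split_addr_sketch(struct spill_instr **indirects, unsigned count,
		struct spill_instr *addr)
{
	struct spill_instr *new_addr = NULL;

	for (unsigned i = 0; i < count; i++) {
		struct spill_instr *indirect = indirects[i];

		if (indirect->scheduled)
			continue;             /* already consumed the old value */

		if (indirect->address == addr) {
			if (!new_addr)
				new_addr = clone_instr(addr);
			indirect->address = new_addr;
		}
	}
}

Note that only one clone is made no matter how many users remain, and
the clone starts unmarked, so it re-enters the normal eligibility path
on a later iteration of the scheduling loop.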