From c57ed8e01cb40ef9a422346c3d304c1a3cc1f418 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Mon, 5 Feb 2018 08:45:29 -0500
Subject: [PATCH] freedreno/ir3: intra-block scheduling

Because of loops, we can't schedule all of a block's predecessors
first.  Instead, just assume that a result consumed in a block was
written far enough away on all paths into the block.  And do an
intra-block scheduling pass to figure out if there are any cases where
we need to insert extra nop's.  This works out better than always
assuming the worst case (ie. that a value live into a block was
written by the last instruction in the predecessor block).

Signed-off-by: Rob Clark
---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 126 +++++++++++++++---
 1 file changed, 104 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 72bd0b2ce07..9269f471c56 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -136,29 +136,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 	return d;
 }
 
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not yet, ie. it only contains already scheduled
+ *    instructions).  For intra-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
 static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-		unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+		unsigned maxd, bool pred)
 {
-	struct list_head *instr_list = &ctx->block->instr_list;
 	unsigned d = 0;
 
-	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
 		if ((n == instr) || (d >= maxd))
-			break;
-		if (is_alu(n) || is_flow(n))
+			return d;
+		/* NOTE: don't count branch/jump since we don't know yet if they will
+		 * be eliminated later in resolve_jumps().. really should do that
+		 * earlier so we don't have this constraint.
+		 */
+		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
 			d++;
 	}
 
+	/* if coming from a predecessor block, assume it is assigned far
+	 * enough away.. we'll fix up later.
+	 */
+	if (!pred)
+		return maxd;
+
+	if (pred && (block->data != block)) {
+		/* Search into predecessor blocks, finding the one with the
+		 * shortest distance, since that will be the worst case
+		 */
+		unsigned min = maxd - d;
+
+		/* (ab)use block->data to prevent recursion: */
+		block->data = block;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned n;
+
+			n = distance(block->predecessors[i], instr, min, pred);
+
+			min = MIN2(min, n);
+		}
+
+		block->data = NULL;
+		d += min;
+	}
+
+	return d;
 }
 
 /* calculate delay for specified src: */
 static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
-		unsigned srcn, bool soft)
+		unsigned srcn, bool soft, bool pred)
 {
 	unsigned delay = 0;
 
@@ -166,9 +210,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
 			unsigned d;
-			if (src->block != assigner->block)
-				break;
-			d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
+			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
 			delay = MAX2(delay, d);
 		}
 	} else {
@@ -181,7 +223,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		} else {
 			delay = ir3_delayslots(assigner, consumer, srcn);
 		}
-		delay -= distance(ctx, assigner, delay);
+		delay -= distance(block, assigner, delay, pred);
 	}
 
 	return delay;
@@ -189,19 +231,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
 static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool pred)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 
 	foreach_ssa_src_n(src, i, instr) {
 		unsigned d;
-		/* for array writes, no need to delay on previous write: */
-		if (i == 0)
-			continue;
-		if (src->block != instr->block)
-			continue;
-		d = delay_calc_srcn(ctx, src, instr, i, soft);
+		d = delay_calc_srcn(block, src, instr, i, soft, pred);
 		delay = MAX2(delay, d);
 	}
 
@@ -396,7 +434,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (!candidate)
 			continue;
 
-		delay = delay_calc(ctx, candidate, soft);
+		delay = delay_calc(ctx->block, candidate, soft, false);
 		if (delay < min_delay) {
 			best_instr = candidate;
 			min_delay = delay;
@@ -537,7 +575,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		instr = find_eligible_instr(ctx, &notes, false);
 
 		if (instr) {
-			unsigned delay = delay_calc(ctx, instr, false);
+			unsigned delay = delay_calc(ctx->block, instr, false, false);
 
 			/* and if we run out of instructions that can be scheduled,
 			 * then it is time for nop's:
@@ -594,7 +632,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		debug_assert(ctx->pred);
 		debug_assert(block->condition);
 
-		delay -= distance(ctx, ctx->pred, delay);
+		delay -= distance(ctx->block, ctx->pred, delay, false);
 
 		while (delay > 0) {
 			ir3_NOP(block);
@@ -633,14 +671,58 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 */
 }
 
+/* After scheduling individual blocks, we still could have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this in
+ * the first pass, because of loops (ie. we can't ensure all predecessor
+ * blocks are already scheduled in the first pass).  All we can really do
+ * at this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+	unsigned n = 0;
+
+	ctx->block = block;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+		unsigned delay = 0;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+			delay = MAX2(d, delay);
+		}
+
+		while (delay > n) {
+			struct ir3_instruction *nop = ir3_NOP(block);
+
+			/* move to before instr: */
+			list_delinit(&nop->node);
+			list_addtail(&nop->node, &instr->node);
+
+			n++;
+		}
+
+		/* we can bail once we hit worst case delay: */
+		if (++n > 6)
+			break;
+	}
+}
+
 int ir3_sched(struct ir3 *ir)
 {
 	struct ir3_sched_ctx ctx = {0};
 
 	ir3_clear_mark(ir);
+
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		sched_block(&ctx, block);
 	}
+
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_intra_block(&ctx, block);
+	}
+
 	if (ctx.error)
 		return -1;
 	return 0;
-- 
2.30.2
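
Some context for readers not steeped in the scheduler: with pred=true,
distance() walks backwards from the end of a block and, when the def is
not found locally, recurses into predecessors taking the minimum, since
the shortest path into the block is the worst case, while a visited
marker breaks the cycles introduced by loops.  Below is a minimal
standalone sketch of that search, not the ir3 code itself: struct block,
def_slot, the visited flag and min2() are illustrative stand-ins for
ir3_block, its scheduled instr_list, the (ab)used block->data pointer
and MIN2().

#include <stdbool.h>
#include <stdio.h>

#define MAXD 6	/* worst case # of delay slots */

struct block {
	unsigned ninsns;	/* scheduled slots in this block */
	unsigned def_slot;	/* slot of the searched-for def, or ~0u */
	struct block **preds;
	unsigned npreds;
	bool visited;		/* plays the role of block->data == block */
};

static unsigned min2(unsigned a, unsigned b) { return a < b ? a : b; }

/* distance (in issued slots) from the end of 'b' back to the def: */
static unsigned
dist(struct block *b, unsigned maxd, bool pred)
{
	/* def is in this block: count the slots issued after it */
	if (b->def_slot != ~0u)
		return min2(b->ninsns - 1 - b->def_slot, maxd);

	unsigned d = min2(b->ninsns, maxd);
	if (d >= maxd)
		return d;		/* already far enough away */

	/* first pass: assume defs in other blocks are far enough away: */
	if (!pred)
		return maxd;

	/* second pass: worst case is the *shortest* path into the block,
	 * so take the minimum over predecessors; the visited flag keeps
	 * loop back-edges from recursing forever:
	 */
	if (b->visited)
		return d;

	b->visited = true;
	unsigned min = maxd - d;
	for (unsigned i = 0; i < b->npreds; i++)
		min = min2(min, dist(b->preds[i], min, pred));
	b->visited = false;

	return d + min;
}

int main(void)
{
	struct block a = { .ninsns = 5, .def_slot = 4 };	/* def in last slot */
	struct block b = { .ninsns = 8, .def_slot = 0 };	/* def long ago */
	struct block *preds[] = { &a, &b };
	struct block succ = { .def_slot = ~0u, .preds = preds, .npreds = 2 };

	/* worst case over both paths is the path through 'a': */
	printf("%u\n", dist(&succ, MAXD, true));	/* prints 0 */
	return 0;
}

Marking the block before recursing and clearing the mark after the loop
is the same trick as the block->data (ab)use above; each call still
caps the search with maxd, so the recursion never does more work than
the worst case delay justifies.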
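
Likewise, the effect of the second pass can be modeled without the IR:
walk the scheduled instructions and pad with nops until each one is far
enough from block entry.  This is a toy sketch under the assumption
that 'need' plays the role of what delay_calc() returns against the
predecessor blocks; pad_block() and the flat arrays are made up for
illustration (the real pass edits the instr_list in place rather than
copying into a second array).

#include <stdio.h>

#define MAX_SLOTS 64
#define WORST_CASE_DELAY 6

struct insn {
	const char *name;
	unsigned need;	/* required distance from block entry */
};

static unsigned
pad_block(const struct insn *in, unsigned nin, struct insn *out)
{
	unsigned n = 0;		/* slots issued since block entry */
	unsigned nout = 0;

	for (unsigned i = 0; i < nin; i++) {
		/* insert nops *before* the instruction until it is legal,
		 * mirroring the list_delinit()/list_addtail() dance:
		 */
		while ((in[i].need > n) && (nout < MAX_SLOTS)) {
			out[nout++] = (struct insn){ "nop", 0 };
			n++;
		}
		out[nout++] = in[i];

		/* past the worst case delay, nothing later can be illegal;
		 * the real pass just bails, here we copy the remainder:
		 */
		if (++n > WORST_CASE_DELAY) {
			for (i++; i < nin; i++)
				out[nout++] = in[i];
			break;
		}
	}

	return nout;
}

int main(void)
{
	/* 'add' consumes a value written 3 slots too close on some path
	 * into the block:
	 */
	struct insn in[] = { { "add", 3 }, { "mul", 0 } };
	struct insn out[MAX_SLOTS];
	unsigned n = pad_block(in, 2, out);

	for (unsigned i = 0; i < n; i++)
		printf("%s\n", out[i].name);	/* nop nop nop add mul */
	return 0;
}

Running this prints nop nop nop add mul: three nops pushed in front of
the consumer, which is what sched_intra_block() does for a value
written too few slots away in a predecessor, and as in the real pass,
once n passes the worst case delay (6 slots) no later instruction can
need padding.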