From c57ed8e01cb40ef9a422346c3d304c1a3cc1f418 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Mon, 5 Feb 2018 08:45:29 -0500
Subject: [PATCH] freedreno/ir3: intra-block scheduling

Because of loops, we can't schedule all of a block's predecessors
first.  Instead, just assume that a result consumed in a block was
written far enough away on all paths into the block.  And do an
intra-block scheduling pass to figure out if there are any cases where
we need to insert extra nop's.  This works out better than always
assuming the worst case (ie. that a value live into a block was
written by the last instruction in the predecessor block).

Signed-off-by: Rob Clark
---
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 126 +++++++++++++++---
 1 file changed, 104 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 72bd0b2ce07..9269f471c56 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -136,29 +136,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 	return d;
 }
 
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not yet, ie. it only contains already scheduled
+ *    instructions).  For intra-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
 static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-		unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+		unsigned maxd, bool pred)
 {
-	struct list_head *instr_list = &ctx->block->instr_list;
 	unsigned d = 0;
 
-	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
 		if ((n == instr) || (d >= maxd))
-			break;
-		if (is_alu(n) || is_flow(n))
+			return d;
+		/* NOTE: don't count branch/jump since we don't know yet if they will
+		 * be eliminated later in resolve_jumps().. really should do that
+		 * earlier so we don't have this constraint.
+		 */
+		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
 			d++;
 	}
 
+	/* if coming from a predecessor block, assume it is assigned far
+	 * enough away.. we'll fix up later.
+	 */
+	if (!pred)
+		return maxd;
+
+	if (pred && (block->data != block)) {
+		/* Search into predecessor blocks, finding the one with the
+		 * shortest distance, since that will be the worst case
+		 */
+		unsigned min = maxd - d;
+
+		/* (ab)use block->data to prevent recursion: */
+		block->data = block;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned n;
+
+			n = distance(block->predecessors[i], instr, min, pred);
+
+			min = MIN2(min, n);
+		}
+
+		block->data = NULL;
+		d += min;
+	}
+
+	return d;
 }
 
 /* calculate delay for specified src: */
 static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
-		unsigned srcn, bool soft)
+		unsigned srcn, bool soft, bool pred)
 {
 	unsigned delay = 0;
 
@@ -166,9 +210,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
 			unsigned d;
-			if (src->block != assigner->block)
-				break;
-			d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
+			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
 			delay = MAX2(delay, d);
 		}
 	} else {
@@ -181,7 +223,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		} else {
 			delay = ir3_delayslots(assigner, consumer, srcn);
 		}
-		delay -= distance(ctx, assigner, delay);
+		delay -= distance(block, assigner, delay, pred);
 	}
 
 	return delay;
@@ -189,19 +231,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
 static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool pred)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 
 	foreach_ssa_src_n(src, i, instr) {
 		unsigned d;
-		/* for array writes, no need to delay on previous write: */
-		if (i == 0)
-			continue;
-		if (src->block != instr->block)
-			continue;
-		d = delay_calc_srcn(ctx, src, instr, i, soft);
+		d = delay_calc_srcn(block, src, instr, i, soft, pred);
 		delay = MAX2(delay, d);
 	}
 
@@ -396,7 +434,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (!candidate)
 			continue;
 
-		delay = delay_calc(ctx, candidate, soft);
+		delay = delay_calc(ctx->block, candidate, soft, false);
 		if (delay < min_delay) {
 			best_instr = candidate;
 			min_delay = delay;
@@ -537,7 +575,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		instr = find_eligible_instr(ctx, &notes, false);
 
 		if (instr) {
-			unsigned delay = delay_calc(ctx, instr, false);
+			unsigned delay = delay_calc(ctx->block, instr, false, false);
 
 			/* and if we run out of instructions that can be scheduled,
 			 * then it is time for nop's:
@@ -594,7 +632,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		debug_assert(ctx->pred);
 		debug_assert(block->condition);
 
-		delay -= distance(ctx, ctx->pred, delay);
+		delay -= distance(ctx->block, ctx->pred, delay, false);
 
 		while (delay > 0) {
 			ir3_NOP(block);
@@ -633,14 +671,58 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 	 */
 }
 
+/* After scheduling individual blocks, we still could have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this in
+ * the first pass, because of loops (ie. we can't ensure all predecessor
+ * blocks are already scheduled in the first pass).  All we can really do
+ * at this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+	unsigned n = 0;
+
+	ctx->block = block;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+		unsigned delay = 0;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+			delay = MAX2(d, delay);
+		}
+
+		while (delay > n) {
+			struct ir3_instruction *nop = ir3_NOP(block);
+
+			/* move to before instr: */
+			list_delinit(&nop->node);
+			list_addtail(&nop->node, &instr->node);
+
+			n++;
+		}
+
+		/* we can bail once we hit worst case delay: */
+		if (++n > 6)
+			break;
+	}
+}
+
 int ir3_sched(struct ir3 *ir)
 {
 	struct ir3_sched_ctx ctx = {0};
 
 	ir3_clear_mark(ir);
+
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		sched_block(&ctx, block);
 	}
+
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_intra_block(&ctx, block);
+	}
+
 	if (ctx.error)
 		return -1;
 	return 0;
-- 
2.30.2
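
Some context for readers not steeped in the scheduler: with pred=true,
distance() walks backwards from the end of a block and, when the def is
not found locally, recurses into predecessors taking the minimum, since
the shortest path into the block is the worst case, while a visited
marker breaks the cycles introduced by loops.  Below is a minimal
standalone sketch of that search, not the ir3 code itself: struct block,
def_slot, the visited flag and min2() are illustrative stand-ins for
ir3_block, its scheduled instr_list, the (ab)used block->data pointer
and MIN2().

#include <stdbool.h>
#include <stdio.h>

#define MAXD 6	/* worst case # of delay slots */

struct block {
	unsigned ninsns;	/* scheduled slots in this block */
	unsigned def_slot;	/* slot of the searched-for def, or ~0u */
	struct block **preds;
	unsigned npreds;
	bool visited;		/* plays the role of block->data == block */
};

static unsigned min2(unsigned a, unsigned b) { return a < b ? a : b; }

/* distance (in issued slots) from the end of 'b' back to the def: */
static unsigned
dist(struct block *b, unsigned maxd, bool pred)
{
	/* def is in this block: count the slots issued after it */
	if (b->def_slot != ~0u)
		return min2(b->ninsns - 1 - b->def_slot, maxd);

	unsigned d = min2(b->ninsns, maxd);
	if (d >= maxd)
		return d;		/* already far enough away */

	/* first pass: assume defs in other blocks are far enough away: */
	if (!pred)
		return maxd;

	/* second pass: worst case is the *shortest* path into the block,
	 * so take the minimum over predecessors; the visited flag keeps
	 * loop back-edges from recursing forever:
	 */
	if (b->visited)
		return d;

	b->visited = true;
	unsigned min = maxd - d;
	for (unsigned i = 0; i < b->npreds; i++)
		min = min2(min, dist(b->preds[i], min, pred));
	b->visited = false;

	return d + min;
}

int main(void)
{
	struct block a = { .ninsns = 5, .def_slot = 4 };	/* def in last slot */
	struct block b = { .ninsns = 8, .def_slot = 0 };	/* def long ago */
	struct block *preds[] = { &a, &b };
	struct block succ = { .def_slot = ~0u, .preds = preds, .npreds = 2 };

	/* worst case over both paths is the path through 'a': */
	printf("%u\n", dist(&succ, MAXD, true));	/* prints 0 */
	return 0;
}

Marking the block before recursing and clearing the mark after the loop
is the same trick as the block->data (ab)use above; each call still
caps the search with maxd, so the recursion never does more work than
the worst case delay justifies.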
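
Likewise, the effect of the second pass can be modeled without the IR:
walk the scheduled instructions and pad with nops until each one is far
enough from block entry.  This is a toy sketch under the assumption
that 'need' plays the role of what delay_calc() returns against the
predecessor blocks; pad_block() and the flat arrays are made up for
illustration (the real pass edits the instr_list in place rather than
copying into a second array).

#include <stdio.h>

#define MAX_SLOTS 64
#define WORST_CASE_DELAY 6

struct insn {
	const char *name;
	unsigned need;	/* required distance from block entry */
};

static unsigned
pad_block(const struct insn *in, unsigned nin, struct insn *out)
{
	unsigned n = 0;		/* slots issued since block entry */
	unsigned nout = 0;

	for (unsigned i = 0; i < nin; i++) {
		/* insert nops *before* the instruction until it is legal,
		 * mirroring the list_delinit()/list_addtail() dance:
		 */
		while ((in[i].need > n) && (nout < MAX_SLOTS)) {
			out[nout++] = (struct insn){ "nop", 0 };
			n++;
		}
		out[nout++] = in[i];

		/* past the worst case delay, nothing later can be illegal;
		 * the real pass just bails, here we copy the remainder:
		 */
		if (++n > WORST_CASE_DELAY) {
			for (i++; i < nin; i++)
				out[nout++] = in[i];
			break;
		}
	}

	return nout;
}

int main(void)
{
	/* 'add' consumes a value written 3 slots too close on some path
	 * into the block:
	 */
	struct insn in[] = { { "add", 3 }, { "mul", 0 } };
	struct insn out[MAX_SLOTS];
	unsigned n = pad_block(in, 2, out);

	for (unsigned i = 0; i < n; i++)
		printf("%s\n", out[i].name);	/* nop nop nop add mul */
	return 0;
}

Running this prints nop nop nop add mul: three nops pushed in front of
the consumer, which is what sched_intra_block() does for a value
written too few slots away in a predecessor, and as in the real pass,
once n passes the worst case delay (6 slots) no later instruction can
need padding.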