From 4b18d51756e9099710bfe421657b3b2034e1497f Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 30 Nov 2015 15:52:26 -0500
Subject: [PATCH] freedreno/ir3: convert scheduler back to recursive algo

I've played with a few different approaches to tweak instruction
priority according to how much they increase/decrease register pressure,
etc.  But nothing seems to change the fact that compared to original
(pre-multiple-block-support) scheduler, in some edge cases we are
generating shaders w/ 5-6x higher register usage.

The problem is that the priority queue approach completely loses the
dependency between instructions, and ends up scheduling all paths at the
same time.

Original reason for switching was that recursive approach relied on
starting from the shader outputs array.  But we can achieve more or less
the same thing by starting from the depth-sorted list.

shader-db results:

total instructions in shared programs:          113350 -> 105183 (-7.21%)
total dwords in shared programs:                219328 -> 211168 (-3.72%)
total full registers used in shared programs:   7911 -> 7383 (-6.67%)
total half registers used in shader programs:   109 -> 109 (0.00%)
total const registers used in shared programs:  21294 -> 21294 (0.00%)

                 half       full      const      instr     dwords
    helped           0         322           0         711         215
      hurt           0         163           0          38           4

The shaders hurt tend to gain a register or two.  While there are also a
lot of helped shaders that only lose a register or two, the more
complex ones tend to lose significantly more registers used.  In some
more extreme cases, like glsl-fs-convolution-1.shader_test it is more
like 7 vs 34 registers!

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3.h       |   4 +
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 327 +++++++++++-------
 2 files changed, 204 insertions(+), 127 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 89b93105cbc..62d14a0ae37 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -255,6 +255,10 @@ struct ir3_instruction {
 		};
 	};
 
+	/* used for per-pass extra instruction data.
+	 */
+	void *data;
+
 	/* Used during CP and RA stages.  For fanin and shader inputs/
 	 * outputs where we need a sequence of consecutive registers,
 	 * keep track of each src instructions left (ie 'n-1') and right
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 08f5cac0cf4..6aaa16edbfe 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -34,11 +34,12 @@
 /*
  * Instruction Scheduling:
  *
- * A priority-queue based scheduling algo.  Add eligible instructions,
- * ie. ones with all their dependencies scheduled, to the priority
- * (depth) sorted queue (list).  Pop highest priority instruction off
- * the queue and schedule it, add newly eligible instructions to the
- * priority queue, rinse, repeat.
+ * A recursive depth based scheduling algo.  Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * its unscheduled src instructions).  Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -52,6 +53,7 @@
 
 struct ir3_sched_ctx {
 	struct ir3_block *block;           /* the current block */
+	struct list_head depth_list;       /* depth sorted unscheduled instrs */
 	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
@@ -63,6 +65,17 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr)
 	return is_sfu(instr) || is_mem(instr);
 }
 
+#define NULL_INSTR ((void *)~0)
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+			instr2->data = NULL;
+	}
+}
+
 static void
 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
@@ -93,6 +106,34 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 
 	list_addtail(&instr->node, &instr->block->instr_list);
 	ctx->scheduled = instr;
+
+	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+		clear_cache(ctx, NULL);
+	} else {
+		/* invalidate only the necessary entries.. */
+		clear_cache(ctx, instr);
+	}
+}
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+	struct ir3_instruction *d = NULL;
+	unsigned i = 0, id = 0;
+
+	while ((i < nsrcs) && !(d = srcs[id = i]))
+		i++;
+
+	if (!d)
+		return NULL;
+
+	for (; i < nsrcs; i++)
+		if (srcs[i] && (srcs[i]->depth > d->depth))
+			d = srcs[id = i];
+
+	srcs[id] = NULL;
+
+	return d;
 }
 
 static unsigned
@@ -171,10 +212,51 @@ static bool is_scheduled(struct ir3_instruction *instr)
 	return !!(instr->flags & IR3_INSTR_MARK);
 }
 
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *other_src;
+	foreach_ssa_src(other_src, instr) {
+		/* if dependency not scheduled, we aren't ready yet: */
+		if ((src != other_src) && !is_scheduled(other_src)) {
+			return false;
+		}
+	}
+	return true;
+}
+
+/* Check if instruction is ok to schedule.  Make sure it is not blocked
+ * by use of addr/predicate register, etc.
+ */
 static bool
-check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
+	/* For instructions that write address register we need to
+	 * make sure there is at least one instruction that uses the
+	 * addr value which is otherwise ready.
+	 *
+	 * TODO if any instructions use pred register and have other
+	 * src args, we would need to do the same for writes_pred()..
+	 */
+	if (writes_addr(instr)) {
+		struct ir3 *ir = instr->block->shader;
+		bool ready = false;
+		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+			struct ir3_instruction *indirect = ir->indirects[i];
+			if (!indirect)
+				continue;
+			if (indirect->address != instr)
+				continue;
+			ready = could_sched(indirect, instr);
+		}
+
+		/* nothing could be scheduled, so keep looking: */
+		if (!ready)
+			return false;
+	}
+
 	/* if this is a write to address/predicate register, and that
 	 * register is currently in use, we need to defer until it is
 	 * free:
@@ -182,52 +264,15 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	if (writes_addr(instr) && ctx->addr) {
 		debug_assert(ctx->addr != instr);
 		notes->addr_conflict = true;
-		return true;
+		return false;
 	}
 
 	if (writes_pred(instr) && ctx->pred) {
 		debug_assert(ctx->pred != instr);
 		notes->pred_conflict = true;
-		return true;
+		return false;
 	}
 
-	return false;
-}
-
-/* is this instruction ready to be scheduled?  Return negative for not
- * ready (updating notes if needed), or >= 0 to indicate number of
- * delay slots needed.
- */
-static int
-instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-	unsigned delay = 0;
-
-	/* Phi instructions can have a dependency on something not
-	 * scheduled yet (for ex, loops).  But OTOH we don't really
-	 * care.  By definition phi's should appear at the top of
-	 * the block, and it's sources should be values from the
-	 * previously executing block, so they are always ready to
-	 * be scheduled:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
-		return 0;
-
-	foreach_ssa_src(src, instr) {
-		/* if dependency not scheduled, we aren't ready yet: */
-		if (!is_scheduled(src))
-			return -1;
-	}
-
-	/* all our dependents are scheduled, figure out if
-	 * we have enough delay slots to schedule ourself:
-	 */
-	delay = delay_calc(ctx, instr);
-	if (delay)
-		return delay;
-
 	/* if the instruction is a kill, we need to ensure *every*
 	 * bary.f is scheduled.  The hw seems unhappy if the thread
 	 * gets killed before the end-input (ei) flag is hit.
@@ -250,76 +295,105 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 				continue;
 			if (!is_scheduled(baryf)) {
 				notes->blocked_kill = true;
-				return -1;
+				return false;
 			}
 		}
 	}
 
-	if (check_conflict(ctx, notes, instr))
-		return -1;
-
-	return 0;
+	return true;
 }
 
-/* could an instruction be scheduled if specified ssa src was scheduled? */
-static bool
-could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+/* Find the best instruction to schedule from specified instruction or
+ * recursively its ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *other_src;
-	foreach_ssa_src(other_src, instr) {
-		/* if dependency not scheduled, we aren't ready yet: */
-		if ((src != other_src) && !is_scheduled(other_src)) {
-			return false;
+	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+	struct ir3_instruction *src;
+	unsigned nsrcs = 0;
+
+	if (is_scheduled(instr))
+		return NULL;
+
+	/* use instr->data to cache the results of recursing up the
+	 * instr src's.  Otherwise the recursive algo can scale quite
+	 * badly w/ shader size.  But this takes some care to clear
+	 * the cache appropriately when instructions are scheduled.
+	 */
+	if (instr->data) {
+		if (instr->data == NULL_INSTR)
+			return NULL;
+		return instr->data;
+	}
+
+	/* find unscheduled srcs: */
+	foreach_ssa_src(src, instr) {
+		if (!is_scheduled(src)) {
+			debug_assert(nsrcs < ARRAY_SIZE(srcs));
+			srcs[nsrcs++] = src;
 		}
 	}
-	return true;
+
+	/* if all our src's are already scheduled: */
+	if (nsrcs == 0) {
+		if (check_instr(ctx, notes, instr)) {
+			instr->data = instr;
+			return instr;
+		}
+		return NULL;
+	}
+
+	while ((src = deepest(srcs, nsrcs))) {
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, src);
+		if (!candidate)
+			continue;
+
+		if (check_instr(ctx, notes, candidate)) {
+			instr->data = candidate;
+			return candidate;
+		}
+	}
+
+	instr->data = NULL_INSTR;
+	return NULL;
 }
 
-/* move eligible instructions to the priority list: */
-static unsigned
-add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct list_head *prio_queue, struct list_head *unscheduled_list)
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
 {
+	struct ir3_instruction *best_instr = NULL;
 	unsigned min_delay = ~0;
 
-	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
-		int e = instr_eligibility(ctx, notes, instr);
-		if (e < 0)
-			continue;
+	/* TODO we'd really rather use the list/array of block outputs.  But we
+	 * don't have such a thing.  Recursing *every* instruction in the list
+	 * will result in a lot of repeated traversal, since instructions will
+	 * get traversed both when they appear as ssa src to a later instruction
+	 * as well as where they appear in the depth_list.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *candidate;
+		unsigned delay;
 
-		/* For instructions that write address register we need to
-		 * make sure there is at least one instruction that uses the
-		 * addr value which is otherwise ready.
-		 *
-		 * TODO if any instructions use pred register and have other
-		 * src args, we would need to do the same for writes_pred()..
-		 */
-		if (unlikely(writes_addr(instr))) {
-			struct ir3 *ir = instr->block->shader;
-			bool ready = false;
-			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-				if (!indirect)
-					continue;
-				if (indirect->address != instr)
-					continue;
-				ready = could_sched(indirect, instr);
-			}
+		candidate = find_instr_recursive(ctx, notes, instr);
+		if (!candidate)
+			continue;
 
-			/* nothing could be scheduled, so keep looking: */
-			if (!ready)
-				continue;
+		delay = delay_calc(ctx, candidate);
+		if (delay < min_delay) {
+			best_instr = candidate;
+			min_delay = delay;
 		}
 
-		min_delay = MIN2(min_delay, e);
-		if (e == 0) {
-			/* remove from unscheduled list and into priority queue: */
-			list_delinit(&instr->node);
-			ir3_insert_by_depth(instr, prio_queue);
-		}
+		if (min_delay == 0)
+			break;
 	}
 
-	return min_delay;
+	return best_instr;
 }
 
 /* "spill" the address register by remapping any unscheduled
@@ -413,50 +487,55 @@ split_pred(struct ir3_sched_ctx *ctx)
 static void
 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct list_head unscheduled_list, prio_queue;
+	struct list_head unscheduled_list;
 
 	ctx->block = block;
 
+	/* addr/pred writes are per-block: */
+	ctx->addr = NULL;
+	ctx->pred = NULL;
+
 	/* move all instructions to the unscheduled list, and
 	 * empty the block's instruction list (to which we will
-	 * be inserting.
+	 * be inserting).
 	 */
 	list_replace(&block->instr_list, &unscheduled_list);
 	list_inithead(&block->instr_list);
-	list_inithead(&prio_queue);
+	list_inithead(&ctx->depth_list);
 
 	/* first a pre-pass to schedule all meta:input/phi instructions
 	 * (which need to appear first so that RA knows the register is
-	 * occupied:
+	 * occupied), and move remaining to depth sorted list:
 	 */
 	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
 		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
-				(instr->opc == OPC_META_PHI)))
+				(instr->opc == OPC_META_PHI))) {
 			schedule(ctx, instr);
+		} else {
+			ir3_insert_by_depth(instr, &ctx->depth_list);
+		}
 	}
 
-	while (!(list_empty(&unscheduled_list) &&
-			list_empty(&prio_queue))) {
+	while (!list_empty(&ctx->depth_list)) {
 		struct ir3_sched_notes notes = {0};
-		unsigned delay;
+		struct ir3_instruction *instr;
+
+		instr = find_eligible_instr(ctx, &notes);
 
-		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+		if (instr) {
+			unsigned delay = delay_calc(ctx, instr);
 
-		if (!list_empty(&prio_queue)) {
-			struct ir3_instruction *instr = list_last_entry(&prio_queue,
-					struct ir3_instruction, node);
-			/* ugg, this is a bit ugly, but between the time when
-			 * the instruction became eligible and now, a new
-			 * conflict may have arose..
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
 			 */
-			if (check_conflict(ctx, &notes, instr)) {
-				list_del(&instr->node);
-				list_addtail(&instr->node, &unscheduled_list);
-				continue;
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
 			}
 
 			schedule(ctx, instr);
-		} else if (delay == ~0) {
+		} else {
 			struct ir3_instruction *new_instr = NULL;
 
 			/* nothing available to schedule.. if we are blocked on
@@ -475,23 +554,17 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 			}
 
 			if (new_instr) {
-				list_del(&new_instr->node);
-				list_addtail(&new_instr->node, &unscheduled_list);
+				/* clearing current addr/pred can change what is
+				 * available to schedule, so clear cache..
+				 */
+				clear_cache(ctx, NULL);
+
+				ir3_insert_by_depth(new_instr, &ctx->depth_list);
 				/* the original instr that wrote addr/pred may have
 				 * originated from a different block:
 				 */
 				new_instr->block = block;
 			}
-
-		} else {
-			/* and if we run out of instructions that can be scheduled,
-			 * then it is time for nop's:
-			 */
-			debug_assert(delay <= 6);
-			while (delay > 0) {
-				ir3_NOP(block);
-				delay--;
-			}
 		}
 	}
 
-- 
2.30.2