From 7273cb4e933f8be65fc73b9d8c69c76d1078cb14 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Thu, 30 Apr 2015 13:57:15 -0400
Subject: [PATCH] freedreno/ir3/sched: convert to priority queue

Use a more standard priority-queue based scheduling algo.  It is
simpler and will make things easier once we have multiple basic
blocks and flow control.

Signed-off-by: Rob Clark
---
 src/gallium/drivers/freedreno/ir3/ir3.c       |   1 +
 src/gallium/drivers/freedreno/ir3/ir3.h       |   3 +
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  |   1 +
 src/gallium/drivers/freedreno/ir3/ir3_sched.c | 466 +++++++++---------
 4 files changed, 242 insertions(+), 229 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 84564a9eef7..aea1b967b07 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -82,6 +82,7 @@ void ir3_destroy(struct ir3 *shader)
 		free(chunk);
 	}
 	free(shader->indirects);
+	free(shader->predicates);
 	free(shader->baryfs);
 	free(shader);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index edb5b49e23c..030a74fe21a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -346,6 +346,9 @@ struct ir3 {
 	 */
 	unsigned indirects_count, indirects_sz;
 	struct ir3_instruction **indirects;
+	/* and same for instructions that consume predicate register: */
+	unsigned predicates_count, predicates_sz;
+	struct ir3_instruction **predicates;
 
 	struct ir3_block *block;
 	unsigned heap_idx;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8d382e5cf3e..caea34c7fd4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1250,6 +1250,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		cond->regs[0]->num = regid(REG_P0, 0);
 
 		kill = ir3_KILL(b, cond, 0);
+		array_insert(ctx->ir->predicates, kill);
 
 		ctx->kill[ctx->kill_count++] = kill;
 		ctx->so->has_kill = true;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index fc41f93b884..1d166d879df 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -31,23 +31,14 @@
 
 #include "ir3.h"
 
-enum {
-	SCHEDULED = -1,
-	DELAYED = -2,
-};
-
 /*
  * Instruction Scheduling:
  *
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path.  The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth.  After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again.  If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's.  Repeat until no more unscheduled
- * instructions.
+ * A priority-queue based scheduling algo.  Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list).  Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -60,67 +51,29 @@ enum {
  */
 
 struct ir3_sched_ctx {
-	struct ir3_instruction *scheduled; /* last scheduled instr */
+	struct ir3_block *block;           /* the current block */
+	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
-	unsigned cnt;
 	bool error;
 };
 
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-	struct ir3_instruction *d = NULL;
-	unsigned i = 0, id = 0;
-
-	while ((i < nsrcs) && !(d = srcs[id = i]))
-		i++;
-
-	if (!d)
-		return NULL;
-
-	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->depth > d->depth))
-			d = srcs[id = i];
-
-	srcs[id] = NULL;
-
-	return d;
-}
-
-static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-		unsigned maxd)
-{
-	struct list_head *instr_list = &instr->block->instr_list;
-	unsigned d = 0;
-
-	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
-		if ((n == instr) || (d >= maxd))
-			break;
-		if (is_alu(n) || is_flow(n))
-			d++;
-	}
-
-	return d;
-}
-
 static bool is_sfu_or_mem(struct ir3_instruction *instr)
 {
 	return is_sfu(instr) || is_mem(instr);
 }
 
-static void schedule(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr, bool remove)
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-	struct ir3_block *block = instr->block;
+	debug_assert(ctx->block == instr->block);
 
 	/* maybe there is a better way to handle this than just stuffing
 	 * a nop.. ideally we'd know about this constraint in the
 	 * scheduling and depth calculation..
 	 */
 	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		ir3_NOP(block);
+		ir3_NOP(ctx->block);
 
 	/* remove from depth list:
@@ -140,16 +93,28 @@ static void schedule(struct ir3_sched_ctx *ctx,
 	list_addtail(&instr->node, &instr->block->instr_list);
 
 	ctx->scheduled = instr;
-
-	ctx->cnt++;
 }
 
-/*
- * Delay-slot calculation.  Follows fanin/fanout.
- */
+static unsigned
+distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
+		unsigned maxd)
+{
+	struct list_head *instr_list = &ctx->block->instr_list;
+	unsigned d = 0;
+
+	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+		if ((n == instr) || (d >= maxd))
+			break;
+		if (is_alu(n) || is_flow(n))
+			d++;
+	}
+
+	return d;
+}
 
 /* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned srcn)
 {
@@ -158,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 	if (is_meta(assigner)) {
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
-			unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+			unsigned d;
+			if (src->block != assigner->block)
+				break;
+			d = delay_calc_srcn(ctx, src, consumer, srcn);
 			delay = MAX2(delay, d);
 		}
 	} else {
@@ -170,48 +138,77 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 }
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 
 	foreach_ssa_src_n(src, i, instr) {
-		unsigned d = delay_calc_srcn(ctx, src, instr, i);
+		unsigned d;
+		if (src->block != instr->block)
+			continue;
+		d = delay_calc_srcn(ctx, src, instr, i);
 		delay = MAX2(delay, d);
 	}
 
 	return delay;
}
 
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+	/* there is at least one kill which could be scheduled, except
+	 * for unscheduled bary.f's:
+	 */
+	bool blocked_kill;
+	/* there is at least one instruction that could be scheduled,
+	 * except for conflicting address/predicate register usage:
+	 */
+	bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+	return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+static bool
+check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	/* if this is a write to address/predicate register, and that
+	 * register is currently in use, we need to defer until it is
+	 * free:
+	 */
+	if (writes_addr(instr) && ctx->addr) {
+		assert(ctx->addr != instr);
+		notes->addr_conflict = true;
+		return true;
+	}
+
+	if (writes_pred(instr) && ctx->pred) {
+		assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return true;
+	}
+
+	return false;
+}
+
+/* is this instruction ready to be scheduled?  Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
  */
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *srcs[64];
 	struct ir3_instruction *src;
-	unsigned delay, nsrcs = 0;
-
-	/* if already scheduled: */
-	if (instr->flags & IR3_INSTR_MARK)
-		return 0;
+	unsigned delay = 0;
 
-	/* figure out our src's, copy 'em out into an array for sorting: */
 	foreach_ssa_src(src, instr) {
-		debug_assert(nsrcs < ARRAY_SIZE(srcs));
-		srcs[nsrcs++] = src;
-	}
-
-	/* for each src register in sorted order:
-	 */
-	delay = 0;
-	while ((src = deepest(srcs, nsrcs))) {
-		delay = trysched(ctx, src);
-		if (delay)
-			return delay;
+		/* if dependency not scheduled, we aren't ready yet: */
+		if (!is_scheduled(src))
+			return -1;
 	}
 
 	/* all our dependents are scheduled, figure out if
@@ -236,183 +233,194 @@
 	 */
 	if (is_kill(instr)) {
 		struct ir3 *ir = instr->block->shader;
-		unsigned i;
 
-		for (i = 0; i < ir->baryfs_count; i++) {
+		for (unsigned i = 0; i < ir->baryfs_count; i++) {
 			struct ir3_instruction *baryf = ir->baryfs[i];
 			if (baryf->depth == DEPTH_UNUSED)
 				continue;
-			delay = trysched(ctx, baryf);
-			if (delay)
-				return delay;
+			if (!is_scheduled(baryf)) {
+				notes->blocked_kill = true;
+				return -1;
+			}
 		}
 	}
 
-	/* if this is a write to address/predicate register, and that
-	 * register is currently in use, we need to defer until it is
-	 * free:
-	 */
-	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
-		return DELAYED;
-	}
-	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
-		return DELAYED;
-	}
+	if (check_conflict(ctx, notes, instr))
+		return -1;
 
-	schedule(ctx, instr, true);
-	return SCHEDULED;
+	return 0;
 }
 
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct list_head *prio_queue, struct list_head *unscheduled_list)
 {
-	return instr->address && (ctx->addr == instr->address);
-}
+	unsigned min_delay = ~0;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+		int e = instr_eligibility(ctx, notes, instr);
+		if (e < 0)
+			continue;
+		min_delay = MIN2(min_delay, e);
+		if (e == 0) {
+			/* remove from unscheduled list and into priority queue: */
+			list_delinit(&instr->node);
+			ir3_insert_by_depth(instr, prio_queue);
+		}
+	}
 
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-	foreach_ssa_src(src, instr)
-		if (ctx->pred == src)
-			return true;
-	return false;
+	return min_delay;
 }
 
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
  */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-		struct list_head *unscheduled_list)
+static void
+split_addr(struct ir3_sched_ctx *ctx)
 {
-	bool addr_in_use = false;
-	bool pred_in_use = false;
-	bool all_delayed = true;
-	unsigned cnt = ~0, attempted = 0;
-
-	list_for_each_entry_safe(struct ir3_instruction, instr, unscheduled_list, node) {
-		bool addr = uses_current_addr(ctx, instr);
-		bool pred = uses_current_pred(ctx, instr);
-
-		if (addr || pred) {
-			int ret = trysched(ctx, instr);
-
-			if (ret != DELAYED)
-				all_delayed = false;
-
-			if (ret == SCHEDULED)
-				cnt = 0;
-			else if (ret > 0)
-				cnt = MIN2(cnt, ret);
-			if (addr)
-				addr_in_use = true;
-			if (pred)
-				pred_in_use = true;
-
-			attempted++;
+	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3_instruction *new_addr = NULL;
+	unsigned i;
+
+	debug_assert(ctx->addr);
+
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *indirect = ir->indirects[i];
+
+		/* skip instructions already scheduled: */
+		if (indirect->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current addr
+		 * to new addr:
+		 */
+		if (indirect->address == ctx->addr) {
+			if (!new_addr) {
+				new_addr = ir3_instr_clone(ctx->addr);
+				/* original addr is scheduled, but new one isn't: */
+				new_addr->flags &= ~IR3_INSTR_MARK;
+			}
+			indirect->address = new_addr;
 		}
 	}
 
-	if (!addr_in_use)
-		ctx->addr = NULL;
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
+}
 
-	if (!pred_in_use)
-		ctx->pred = NULL;
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
+ */
+static void
+split_pred(struct ir3_sched_ctx *ctx)
+{
+	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3_instruction *new_pred = NULL;
+	unsigned i;
 
-	/* detect if we've gotten ourselves into an impossible situation
-	 * and bail if needed
-	 */
-	if (all_delayed && (attempted > 0)) {
-		if (pred_in_use) {
-			/* TODO we probably need to keep a list of instructions
-			 * that reference predicate, similar to indirects
-			 */
-			ctx->error = true;
-			return DELAYED;
-		}
-		if (addr_in_use) {
-			struct ir3 *ir = ctx->addr->block->shader;
-			struct ir3_instruction *new_addr =
-					ir3_instr_clone(ctx->addr);
-			unsigned i;
-
-			/* original addr is scheduled, but new one isn't: */
-			new_addr->flags &= ~IR3_INSTR_MARK;
-
-			for (i = 0; i < ir->indirects_count; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-
-				/* skip instructions already scheduled: */
-				if (indirect->flags & IR3_INSTR_MARK)
-					continue;
-
-				/* remap remaining instructions using current addr
-				 * to new addr:
-				 */
-				if (indirect->address == ctx->addr)
-					indirect->address = new_addr;
-			}
+	debug_assert(ctx->pred);
+
+	for (i = 0; i < ir->predicates_count; i++) {
+		struct ir3_instruction *predicated = ir->predicates[i];
 
-			/* all remaining indirects remapped to new addr: */
-			ctx->addr = NULL;
+		/* skip instructions already scheduled: */
+		if (predicated->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current pred
+		 * to new pred:
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(predicated->regs[1]) == ctx->pred) {
+			if (!new_pred) {
+				new_pred = ir3_instr_clone(ctx->pred);
+				/* original pred is scheduled, but new one isn't: */
+				new_pred->flags &= ~IR3_INSTR_MARK;
+			}
+			predicated->regs[1]->instr = new_pred;
		}
 	}
 
-	return cnt;
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
 }
 
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct list_head unscheduled_list;
+	struct list_head unscheduled_list, prio_queue;
+
+	ctx->block = block;
+
+	/* move all instructions to the unscheduled list, and
+	 * empty the block's instruction list (to which we will
+	 * be inserting).
+	 */
 	list_replace(&block->instr_list, &unscheduled_list);
 	list_inithead(&block->instr_list);
+	list_inithead(&prio_queue);
 
-	/* schedule all the shader input's (meta-instr) first so that
-	 * the RA step sees that the input registers contain a value
-	 * from the start of the shader:
+	/* first a pre-pass to schedule all meta:input/phi instructions
+	 * (which need to appear first so that RA knows the register is
+	 * occupied):
 	 */
-	if (!block->parent) {
-		unsigned i;
-		for (i = 0; i < block->ninputs; i++) {
-			struct ir3_instruction *in = block->inputs[i];
-			if (in)
-				schedule(ctx, in, true);
-		}
-	}
-
 	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-		int cnt = trysched(ctx, instr);
+		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+				(instr->opc == OPC_META_PHI)))
+			schedule(ctx, instr);
+	}
 
-		if (cnt == DELAYED)
-			cnt = block_sched_undelayed(ctx, &unscheduled_list);
+	while (!(list_empty(&unscheduled_list) &&
+			list_empty(&prio_queue))) {
+		struct ir3_sched_notes notes = {0};
+		unsigned delay;
 
-		/* -1 is signal to return up stack, but to us means same as 0: */
-		cnt = MAX2(0, cnt);
-		cnt += ctx->cnt;
+		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
 
-		/* if deepest remaining instruction cannot be scheduled, try
-		 * the increasingly more shallow instructions until needed
-		 * number of delay slots is filled:
-		 */
-		list_for_each_entry_safe (struct ir3_instruction, instr, &instr->node, node)
-			trysched(ctx, instr);
+		if (!list_empty(&prio_queue)) {
+			struct ir3_instruction *instr = list_last_entry(&prio_queue,
+					struct ir3_instruction, node);
+			/* ugg, this is a bit ugly, but between the time when
+			 * the instruction became eligible and now, a new
+			 * conflict may have arisen..
+			 */
+			if (check_conflict(ctx, &notes, instr)) {
+				list_del(&instr->node);
+				list_addtail(&instr->node, &unscheduled_list);
+				continue;
+			}
 
-		/* and if we run out of instructions that can be scheduled,
-		 * then it is time for nop's:
-		 */
-		while (cnt > ctx->cnt)
-			schedule(ctx, ir3_NOP(block), false);
+			schedule(ctx, instr);
+		} else if (delay == ~0) {
+			/* nothing available to schedule.. if we are blocked on
+			 * address/predicate register conflict, then break the
+			 * deadlock by cloning the instruction that wrote that
+			 * reg:
+			 */
+			if (notes.addr_conflict) {
+				split_addr(ctx);
+			} else if (notes.pred_conflict) {
+				split_pred(ctx);
+			} else {
+				debug_assert(0);
+				ctx->error = true;
+				return;
+			}
+		} else {
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
+			 */
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
+			}
+		}
 	}
 }
 
@@ -420,7 +428,7 @@ int ir3_block_sched(struct ir3_block *block)
 {
 	struct ir3_sched_ctx ctx = {0};
 	ir3_clear_mark(block->shader);
-	block_sched(&ctx, block);
+	sched_block(&ctx, block);
 	if (ctx.error)
 		return -1;
 	return 0;
-- 
2.30.2
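
A few notes on the techniques above, with standalone sketches (none of
this is ir3 code).

First, the new predicates bookkeeping: the predicates_count/predicates_sz
pair added to ir3.h and the array_insert(ctx->ir->predicates, kill) call
follow the same growable-array convention already used for indirects and
baryfs.  The macro itself is outside this diff; the sketch below
(ARRAY_PUSH, toy_ir and track_predicate are invented names, not ir3 API)
just illustrates the count/size doubling scheme that convention assumes.

#include <stdlib.h>

/* illustrative stand-in for ir3's array_insert(): grow capacity
 * (sz) by doubling when the used count catches up to it: */
#define ARRAY_PUSH(arr, count, sz, val)                            \
	do {                                                           \
		if ((count) == (sz)) {                                     \
			(sz) = (sz) ? 2 * (sz) : 16;  /* amortized doubling */ \
			(arr) = realloc((arr), (sz) * sizeof((arr)[0]));       \
		}                                                          \
		(arr)[(count)++] = (val);                                  \
	} while (0)

/* usage mirroring the new predicates fields: */
struct toy_ir {
	unsigned predicates_count, predicates_sz;
	int **predicates;   /* stands in for struct ir3_instruction ** */
};

void track_predicate(struct toy_ir *ir, int *kill)
{
	ARRAY_PUSH(ir->predicates, ir->predicates_count,
	           ir->predicates_sz, kill);
}

Keeping the allocated capacity (_sz) separate from the used count makes
the common-case insert O(1) amortized.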
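
To see the shape of the new algorithm without ir3's list machinery, here
is a self-contained toy in plain C (toy_instr, eligible, toy_sched and
the three-instruction DAG are all invented for illustration): an
instruction becomes eligible once every source is scheduled, and among
eligible instructions the deepest is emitted first, which mirrors the
pick order that ir3_insert_by_depth() plus list_last_entry() give the
real scheduler.

#include <stdbool.h>
#include <stdio.h>

#define MAX_SRCS 2

struct toy_instr {
	const char *name;
	int depth;          /* priority: depth from the depth pass */
	bool scheduled;
	struct toy_instr *src[MAX_SRCS];  /* dependencies (NULL = unused) */
};

/* eligible == every dependency is already scheduled: */
static bool eligible(const struct toy_instr *in)
{
	for (int i = 0; i < MAX_SRCS; i++)
		if (in->src[i] && !in->src[i]->scheduled)
			return false;
	return true;
}

static void toy_sched(struct toy_instr *instrs, int n)
{
	for (int done = 0; done < n; done++) {
		struct toy_instr *best = NULL;

		/* "pop" the deepest eligible instruction; the real code
		 * keeps a depth-sorted list instead of rescanning: */
		for (int i = 0; i < n; i++) {
			struct toy_instr *in = &instrs[i];
			if (in->scheduled || !eligible(in))
				continue;
			if (!best || in->depth > best->depth)
				best = in;
		}

		if (!best)
			break;  /* real scheduler emits nops or splits a0.x/p0.x here */

		best->scheduled = true;
		printf("sched %s\n", best->name);
	}
}

int main(void)
{
	struct toy_instr in[3] = {
		{ .name = "load",  .depth = 2 },
		{ .name = "add",   .depth = 1 },
		{ .name = "store", .depth = 0 },
	};
	in[1].src[0] = &in[0];  /* add   consumes load */
	in[2].src[0] = &in[1];  /* store consumes add  */

	toy_sched(in, 3);       /* prints: load, add, store */
	return 0;
}

The toy rescans the whole array each iteration; the patch instead keeps
a separate depth-sorted prio_queue so the pop is cheap and only newly
eligible instructions ever need inserting.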
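
The delay numbers that drive the nop-emitting arm of sched_block()'s
main loop come from comparing how many slots a result needs before it
may be consumed against how many instructions have already been emitted
behind the producer.  A minimal sketch over a flat array of instruction
ids (toy_distance and nops_needed are invented; the real distance()
walks block->instr_list backwards counting only alu/flow instructions,
and the per-source slot counts come from ir3's delay tables):

/* count instructions emitted after the producer, newest first,
 * capped at maxd the same way distance() caps its backwards walk: */
unsigned toy_distance(const int *emitted_ids, unsigned n,
		int producer_id, unsigned maxd)
{
	unsigned d = 0;
	for (unsigned i = n; i-- > 0; ) {
		if ((emitted_ids[i] == producer_id) || (d >= maxd))
			break;
		d++;
	}
	return d;
}

/* a consumer needing `needed` slots after its producer must be
 * preceded by this many nops: */
unsigned nops_needed(const int *emitted_ids, unsigned n,
		int producer_id, unsigned needed)
{
	unsigned d = toy_distance(emitted_ids, n, producer_id, needed);
	return needed - d;   /* d is capped at `needed`, so no underflow */
}

The debug_assert(delay <= 6) in the main loop reflects that the
worst-case producer-to-consumer latency in the delay tables is small
and bounded, so only a handful of nops ever need stuffing.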
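
Finally, split_addr()/split_pred() implement the one non-obvious trick:
when everything still schedulable needs the live a0.x/p0.x value but
some other instruction must write that register first, the original
writer is cloned and the unscheduled consumers are repointed at the
clone, letting the old value retire.  A reduced sketch of the remap
(spill_instr, clone_instr and split_addr_sketch are invented stand-ins;
the real code uses ir3_instr_clone() and the IR3_INSTR_MARK flag, as in
the patch above):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct spill_instr {
	bool scheduled;               /* stands in for IR3_INSTR_MARK */
	struct spill_instr *address;  /* the a0.x writer this instr consumes */
};

static struct spill_instr *clone_instr(const struct spill_instr *in)
{
	/* (allocation error handling elided in this sketch) */
	struct spill_instr *c = malloc(sizeof(*c));
	memcpy(c, in, sizeof(*c));
	c->scheduled = false;         /* clone still needs to be scheduled */
	return c;
}

/* repoint every unscheduled user of `addr` at a single fresh clone;
 * afterwards the original a0.x value has no remaining users, so the
 * conflicting writer becomes schedulable: */
void split_addr_sketch(struct spill_instr **indirects, unsigned count,
		struct spill_instr *addr)
{
	struct spill_instr *new_addr = NULL;

	for (unsigned i = 0; i < count; i++) {
		struct spill_instr *indirect = indirects[i];

		if (indirect->scheduled)
			continue;             /* already consumed the old value */

		if (indirect->address == addr) {
			if (!new_addr)
				new_addr = clone_instr(addr);
			indirect->address = new_addr;
		}
	}
}

Note that only one clone is made no matter how many users remain, and
the clone starts unmarked, so it re-enters the normal eligibility path
on a later iteration of the scheduling loop.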