From f54d2b4f10481913528b4ef3d68a99b59104f053 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Thu, 1 Feb 2018 09:08:39 -0500
Subject: [PATCH] freedreno/ir3: more clever legalize algorithm

Previously we didn't handle flow control in legalize, and instead just
set (ss)(sy) on the first instruction in every block.  Which isn't very
clever.

Instead, consider output state of all predecessor blocks, so we only
set a sync bit if needed for any possible path leading into a block.
Because of loops, we can't require that all successor blocks are
legalized before a given block, so instead run in a loop until results
converge.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 .../drivers/freedreno/ir3/ir3_legalize.c      | 138 ++++++++++++------
 1 file changed, 96 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 239bfc28826..b29b097d7b0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -47,35 +47,62 @@ struct ir3_legalize_ctx {
 	int max_bary;
 };
 
+struct ir3_legalize_state {
+	regmask_t needs_ss;
+	regmask_t needs_ss_war;       /* write after read */
+	regmask_t needs_sy;
+};
+
+struct ir3_legalize_block_data {
+	bool valid;
+	struct ir3_legalize_state state;
+};
+
 /* We want to evaluate each block from the position of any other
- * predecessor block, in order that the flags set are the union
- * of all possible program paths.  For stopping condition, we
- * want to stop when the pair of <pred-block, current-block> has
- * been visited already.
+ * predecessor block, in order that the flags set are the union of
+ * all possible program paths.
  *
- * XXX is that completely true?  We could have different needs_xyz
- * flags set depending on path leading to pred-block.. we could
- * do *most* of this based on chasing src instructions ptrs (and
- * following all phi srcs).. except the write-after-read hazzard.
+ * To do this, we need to know the output state (needs_ss/ss_war/sy)
+ * of all predecessor blocks.  The tricky thing is loops, which mean
+ * that we can't simply recursively process each predecessor block
+ * before legalizing the current block.
  *
- * For now we just set ss/sy flag on first instruction on block,
- * and handle everything within the block as before.
+ * How we handle that is by looping over all the blocks until the
+ * results converge.  If the output state of a given block changes
+ * in a given pass, this means that all successor blocks are not
+ * yet fully legalized.
  */
 
-static void
+static bool
 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 {
+	struct ir3_legalize_block_data *bd = block->data;
+
+	if (bd->valid)
+		return false;
+
 	struct ir3_instruction *last_input = NULL;
 	struct ir3_instruction *last_rel = NULL;
 	struct ir3_instruction *last_n = NULL;
 	struct list_head instr_list;
-	regmask_t needs_ss_war;       /* write after read */
-	regmask_t needs_ss;
-	regmask_t needs_sy;
+	struct ir3_legalize_state prev_state = bd->state;
+	struct ir3_legalize_state *state = &bd->state;
 
-	regmask_init(&needs_ss_war);
-	regmask_init(&needs_ss);
-	regmask_init(&needs_sy);
+	/* our input state is the OR of all predecessor blocks' state: */
+	for (unsigned i = 0; i < block->predecessors_count; i++) {
+		struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
+		struct ir3_legalize_state *pstate = &pbd->state;
+
+		/* Our input (ss)/(sy) state is based on OR'ing the output
+		 * state of all our predecessor blocks
+		 */
+		regmask_or(&state->needs_ss,
+				&state->needs_ss, &pstate->needs_ss);
+		regmask_or(&state->needs_ss_war,
+				&state->needs_ss_war, &pstate->needs_ss_war);
+		regmask_or(&state->needs_sy,
+				&state->needs_sy, &pstate->needs_sy);
+	}
 
 	/* remove all the instructions from the list, we'll be adding
 	 * them back in as we go
@@ -87,6 +114,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		struct ir3_register *reg;
 		unsigned i;
 
+		n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
+
 		if (is_meta(n))
 			continue;
 
@@ -114,15 +143,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 				 * instr consuming sfu result.. need to make
 				 * some tests for both this and (sy)..
 				 */
-				if (regmask_get(&needs_ss, reg)) {
+				if (regmask_get(&state->needs_ss, reg)) {
 					n->flags |= IR3_INSTR_SS;
-					regmask_init(&needs_ss_war);
-					regmask_init(&needs_ss);
+					regmask_init(&state->needs_ss_war);
+					regmask_init(&state->needs_ss);
 				}
 
-				if (regmask_get(&needs_sy, reg)) {
+				if (regmask_get(&state->needs_sy, reg)) {
 					n->flags |= IR3_INSTR_SY;
-					regmask_init(&needs_sy);
+					regmask_init(&state->needs_sy);
 				}
 			}
 
@@ -136,10 +165,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 
 		if (n->regs_count > 0) {
 			reg = n->regs[0];
-			if (regmask_get(&needs_ss_war, reg)) {
+			if (regmask_get(&state->needs_ss_war, reg)) {
 				n->flags |= IR3_INSTR_SS;
-				regmask_init(&needs_ss_war);
-				regmask_init(&needs_ss);
+				regmask_init(&state->needs_ss_war);
+				regmask_init(&state->needs_ss);
 			}
 
 			if (last_rel && (reg->num == regid(REG_A0, 0))) {
@@ -177,7 +206,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		list_addtail(&n->node, &block->instr_list);
 
 		if (is_sfu(n))
-			regmask_set(&needs_ss, n->regs[0]);
+			regmask_set(&state->needs_ss, n->regs[0]);
 
 		if (is_tex(n)) {
 			/* this ends up being the # of samp instructions.. but that
@@ -188,23 +217,23 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			 * result is not used.
 			 */
 			ctx->has_samp = true;
-			regmask_set(&needs_sy, n->regs[0]);
+			regmask_set(&state->needs_sy, n->regs[0]);
 		} else if (n->opc == OPC_RESINFO) {
-			regmask_set(&needs_ss, n->regs[0]);
+			regmask_set(&state->needs_ss, n->regs[0]);
 			ir3_NOP(block)->flags |= IR3_INSTR_SS;
 		} else if (is_load(n)) {
 			/* seems like ldlv needs (ss) bit instead??  which is odd but
 			 * makes a bunch of flat-varying tests start working on a4xx.
 			 */
 			if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
-				regmask_set(&needs_ss, n->regs[0]);
+				regmask_set(&state->needs_ss, n->regs[0]);
 			else
-				regmask_set(&needs_sy, n->regs[0]);
+				regmask_set(&state->needs_sy, n->regs[0]);
 		} else if (is_atomic(n->opc)) {
 			if (n->flags & IR3_INSTR_G)
-				regmask_set(&needs_sy, n->regs[0]);
+				regmask_set(&state->needs_sy, n->regs[0]);
 			else
-				regmask_set(&needs_ss, n->regs[0]);
+				regmask_set(&state->needs_ss, n->regs[0]);
 		}
 
 		if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
@@ -216,7 +245,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
 			foreach_src(reg, n) {
 				if (reg_gpr(reg))
-					regmask_set(&needs_ss_war, reg);
+					regmask_set(&state->needs_ss_war, reg);
 			}
 		}
 
@@ -253,8 +282,21 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 	if (last_rel)
 		last_rel->flags |= IR3_INSTR_UL;
 
-	list_first_entry(&block->instr_list, struct ir3_instruction, node)
-		->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+	bd->valid = true;
+
+	if (memcmp(&prev_state, state, sizeof(*state))) {
+		/* our output state changed, this invalidates all of our
+		 * successors:
+		 */
+		for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
+			if (!block->successors[i])
+				break;
+			struct ir3_legalize_block_data *pbd = block->successors[i]->data;
+			pbd->valid = false;
+		}
+	}
+
+	return true;
 }
 
 /* NOTE: branch instructions are always the last instruction(s)
@@ -425,21 +467,33 @@ mark_convergence_points(struct ir3 *ir)
 void
 ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary)
 {
-	struct ir3_legalize_ctx ctx = {
-			.max_bary = -1,
-	};
+	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
+	bool progress;
 
+	ctx->max_bary = -1;
+
+	/* allocate per-block data: */
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		legalize_block(&ctx, block);
+		block->data = rzalloc(ctx, struct ir3_legalize_block_data);
 	}
 
-	*has_samp = ctx.has_samp;
-	*has_ssbo = ctx.has_ssbo;
-	*max_bary = ctx.max_bary;
+	/* process each block: */
+	do {
+		progress = false;
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+			progress |= legalize_block(ctx, block);
+		}
+	} while (progress);
+
+	*has_samp = ctx->has_samp;
+	*has_ssbo = ctx->has_ssbo;
+	*max_bary = ctx->max_bary;
 
 	do {
 		ir3_count_instructions(ir);
 	} while(resolve_jumps(ir));
 
 	mark_convergence_points(ir);
+
+	ralloc_free(ctx);
 }
-- 
2.30.2