From 6347c2ea89bde624dd16cff6741db57e89d88ad5 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Sat, 21 Mar 2020 14:44:44 -0700
Subject: [PATCH] freedreno/ir3/ra: add def/use iterators

Decouple the messy logic of figuring out vreg names defined/used by an
instruction from the logic of what to do about it by introducing
iterators.  There is still *some* array vs ssa special casing in
ra_block_compute_live_ranges(), but less than before.  And this will
avoid introducing a second copy of the def/use logic in a following
patch which uses the liveranges to calculate the maximum # of live
values (which is the optimal target for max physical register window
to round-robin within).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4272>
---
 src/freedreno/ir3/ir3_ra.c | 176 +++++++++----------------------------
 src/freedreno/ir3/ir3_ra.h | 159 +++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+), 133 deletions(-)

diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index d4663f6167d..fa379c3495b 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -585,159 +585,69 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	}
 
 	foreach_instr (instr, &block->instr_list) {
-		struct ir3_instruction *src;
-		struct ir3_register *reg;
-
-		if (writes_gpr(instr)) {
-			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-			struct ir3_register *dst = instr->regs[0];
-
-			if (dst->flags & IR3_REG_ARRAY) {
-				struct ir3_array *arr =
-					ir3_lookup_array(ctx->ir, dst->array.id);
-				unsigned i;
+		foreach_def (name, ctx, instr) {
+			if (name_is_array(ctx, name)) {
+				struct ir3_array *arr = name_to_array(ctx, name);
 
 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
 
-				/* set the node class now.. in case we don't encounter
-				 * this array dst again.  From register_alloc algo's
-				 * perspective, these are all single/scalar regs:
-				 */
-				for (i = 0; i < arr->length; i++) {
+				for (unsigned i = 0; i < arr->length; i++) {
 					unsigned name = arr->base + i;
 					if(arr->half)
 						ra_set_node_class(ctx->g, name, ctx->set->half_classes[0]);
 					else
 						ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
 				}
-
-				/* indirect write is treated like a write to all array
-				 * elements, since we don't know which one is actually
-				 * written:
-				 */
-				if (dst->flags & IR3_REG_RELATIV) {
-					for (i = 0; i < arr->length; i++) {
-						unsigned name = arr->base + i;
-						def(name, instr);
-					}
+			} else {
+				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+				if (is_high(instr)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+				} else if (is_half(instr)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->half_classes[id->cls - HALF_OFFSET]);
 				} else {
-					unsigned name = arr->base + dst->array.offset;
-					def(name, instr);
-				}
-			} else if (id->defn == instr) {
-				/* in scalar pass, we aren't considering virtual register
-				 * classes, ie. if an instruction writes a vec2, then it
-				 * defines two different scalar register names.
-				 */
-				unsigned n = ctx->scalar_pass ? dest_regs(instr) : 1;
-				for (unsigned i = 0; i < n; i++) {
-					unsigned name = scalar_name(ctx, instr, i);
-
-					/* split/collect instructions have duplicate names
-					 * as real instructions, so they skip the hashtable:
-					 */
-					if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) ||
-							(instr->opc == OPC_META_COLLECT))) {
-						/* this is slightly annoying, we can't just use an
-						 * integer on the stack
-						 */
-						unsigned *key = ralloc(ctx->name_to_instr, unsigned);
-						*key = name;
-						debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key));
-						_mesa_hash_table_insert(ctx->name_to_instr, key, instr);
-					}
-
-					/* tex instructions actually have a wrmask, and
-					 * don't touch masked out components.  We can't do
-					 * anything useful about that in the first pass,
-					 * but in the scalar pass we can realize these
-					 * registers are available:
-					 */
-					if (ctx->scalar_pass && is_tex_or_prefetch(instr) &&
-							!(instr->regs[0]->wrmask & (1 << i)))
-						continue;
-
-					def(name, instr);
-
-					if ((instr->opc == OPC_META_INPUT) && first_non_input)
-						use(name, first_non_input);
-
-					if (is_high(instr)) {
-						ra_set_node_class(ctx->g, name,
-								ctx->set->high_classes[id->cls - HIGH_OFFSET]);
-					} else if (is_half(instr)) {
-						ra_set_node_class(ctx->g, name,
-								ctx->set->half_classes[id->cls - HALF_OFFSET]);
-					} else {
-						ra_set_node_class(ctx->g, name,
-								ctx->set->classes[id->cls]);
-					}
+					ra_set_node_class(ctx->g, name,
+							ctx->set->classes[id->cls]);
 				}
 			}
+
+			def(name, instr);
+
+			if ((instr->opc == OPC_META_INPUT) && first_non_input)
+				use(name, first_non_input);
 		}
 
-		foreach_src (reg, instr) {
-			if (reg->flags & IR3_REG_ARRAY) {
-				struct ir3_array *arr =
-					ir3_lookup_array(ctx->ir, reg->array.id);
+		foreach_use (name, ctx, instr) {
+			if (name_is_array(ctx, name)) {
+				struct ir3_array *arr = name_to_array(ctx, name);
+
 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
 
-				/* indirect read is treated like a read from all array
-				 * elements, since we don't know which one is actually
-				 * read:
+				/* NOTE: arrays are not SSA so unconditionally
+				 * set use bit:
 				 */
-				if (reg->flags & IR3_REG_RELATIV) {
-					unsigned i;
-					for (i = 0; i < arr->length; i++) {
-						unsigned name = arr->base + i;
-						use(name, instr);
-						BITSET_SET(bd->use, name);
-					}
-				} else {
-					unsigned name = arr->base + reg->array.offset;
-					use(name, instr);
-					/* NOTE: arrays are not SSA so unconditionally
-					 * set use bit:
-					 */
-					BITSET_SET(bd->use, name);
-					debug_assert(reg->array.offset < arr->length);
-				}
-			} else if (ctx->scalar_pass) {
-				struct ir3_instruction *src = reg->instr;
-				/* skip things that aren't SSA: */
-				unsigned n = src ? dest_regs(src) : 0;
-
-				/* in scalar pass, we aren't considering virtual register
-				 * classes, ie. if an instruction writes a vec2, then it
-				 * defines two different scalar register names.
-				 *
-				 * We need to traverse up thru collect/split to find the
-				 * actual non-meta instruction names for each of the
-				 * components:
+				BITSET_SET(bd->use, name);
+			}
+
+			use(name, instr);
+		}
+
+		foreach_name (name, ctx, instr) {
+			/* split/collect instructions have duplicate names
+			 * as real instructions, so they skip the hashtable:
+			 */
+			if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) ||
+					(instr->opc == OPC_META_COLLECT))) {
+				/* this is slightly annoying, we can't just use an
+				 * integer on the stack
 				 */
-				for (unsigned i = 0; i < n; i++) {
-					/* Need to filter out a couple special cases, ie.
-					 * writes to a0.x or p0.x:
-					 */
-					if (!writes_gpr(src))
-						continue;
-
-					/* split takes a src w/ wrmask potentially greater
-					 * than 0x1, but it really only cares about a single
-					 * component.  This shows up in splits coming out of
-					 * a tex instruction w/ wrmask=.z, for example.
-					 */
-					if ((instr->opc == OPC_META_SPLIT) &&
-							!(i == instr->split.off))
-						continue;
-
-					use(scalar_name(ctx, src, i), instr);
-				}
-			} else if ((src = ssa(reg)) && writes_gpr(src)) {
-				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
-				use(name, instr);
+				unsigned *key = ralloc(ctx->name_to_instr, unsigned);
+				*key = name;
+				debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key));
+				_mesa_hash_table_insert(ctx->name_to_instr, key, instr);
 			}
 		}
 	}
diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h
index f9c2155b7df..db21eb9f220 100644
--- a/src/freedreno/ir3/ir3_ra.h
+++ b/src/freedreno/ir3/ir3_ra.h
@@ -134,6 +134,18 @@ struct ir3_ra_ctx {
 	/* Tracking for select_reg callback */
 	unsigned start_search_reg;
 	unsigned max_target;
+
+	/* Temporary buffer for def/use iterators
+	 *
+	 * The worst case should probably be an array w/ relative access (ie.
+	 * all elements are def'd or use'd), and that can't be larger than
+	 * the number of registers.
+	 *
+	 * NOTE we could declare this on the stack if needed, but I don't
+	 * think there is a need for nested iterators.
+	 */
+	unsigned namebuf[NUM_REGS];
+	unsigned namecnt, nameidx;
 };
 
 static inline int
@@ -182,6 +194,153 @@ writes_gpr(struct ir3_instruction *instr)
 	return true;
 }
 
+#define NO_NAME (~0u)  /* sentinel: iterator has no (more) names to return */
+
+/*
+ * Iterators over the vreg names of an instruction's defs and uses
+ */
+
+static inline unsigned
+__ra_name_cnt(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
+{
+	/* A NULL instr, or a dst that is not a GPR (ie. writes to a0.x or
+	 * p0.x), contributes no names:
+	 */
+	if (!instr || !writes_gpr(instr))
+		return 0;
+
+	/* Array dsts are not ssa, so they are filtered out here too: */
+	if (instr->regs[0]->flags & IR3_REG_ARRAY)
+		return 0;
+
+	/* Outside of the scalar pass there is a single name.  In the scalar
+	 * pass virtual register classes are not considered, so eg. a vec2
+	 * write defines two different scalar register names:
+	 */
+	return ctx->scalar_pass ? dest_regs(instr) : 1;
+}
+
+/* visit scalar_name(ctx, instr, n) for each n < __ra_name_cnt(ctx, instr): */
+#define foreach_name_n(__name, __n, __ctx, __instr) \
+	for (unsigned __cnt = __ra_name_cnt(__ctx, __instr), __n = 0, __name; \
+	     (__n < __cnt) && ({__name = scalar_name(__ctx, __instr, __n); 1;}); __n++)
+#define foreach_name(__name, __ctx, __instr) \
+	foreach_name_n(__name, __n, __ctx, __instr)
+
+static inline unsigned
+__ra_itr_pop(struct ir3_ra_ctx *ctx)
+{
+	/* next buffered name, or NO_NAME once the buffer is drained: */
+	bool drained = (ctx->nameidx >= ctx->namecnt);
+	return drained ? NO_NAME : ctx->namebuf[ctx->nameidx++];
+}
+
+static inline void
+__ra_itr_push(struct ir3_ra_ctx *ctx, unsigned name)
+{
+	assert(ctx->namecnt < ARRAY_SIZE(ctx->namebuf));  /* namebuf must not be full */
+	ctx->namebuf[ctx->namecnt++] = name;  /* append; read back in order by __ra_itr_pop() */
+}
+
+static inline unsigned
+__ra_init_def_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
+{
+	/* the name buffer is shared by the def/use iterators, so the previous
+	 * iteration must be fully drained (ie. nesting is not supported):
+	 */
+	assert(ctx->namecnt == ctx->nameidx);
+	ctx->nameidx = 0;
+	ctx->namecnt = 0;
+
+	if (!writes_gpr(instr))
+		return NO_NAME;
+
+	struct ir3_register *dst = instr->regs[0];
+
+	if (dst->flags & IR3_REG_ARRAY) {
+		struct ir3_array *arr = ir3_lookup_array(ctx->ir, dst->array.id);
+
+		if (!(dst->flags & IR3_REG_RELATIV)) {
+			/* direct write, only a single element is def'd: */
+			__ra_itr_push(ctx, arr->base + dst->array.offset);
+			debug_assert(dst->array.offset < arr->length);
+		} else {
+			/* we can't know which element an indirect write
+			 * hits, so treat it as a write to all of them:
+			 */
+			for (unsigned i = 0; i < arr->length; i++)
+				__ra_itr_push(ctx, arr->base + i);
+		}
+	} else if (ctx->instrd[instr->ip].defn == instr) {
+		foreach_name_n (name, i, ctx, instr) {
+			/* In the scalar pass, components masked out of a
+			 * tex/prefetch wrmask are not written, so they are
+			 * not def'd.  (The first pass can't do anything
+			 * useful about that.)
+			 */
+			bool masked = ctx->scalar_pass &&
+					is_tex_or_prefetch(instr) &&
+					!(instr->regs[0]->wrmask & (1 << i));
+			if (!masked)
+				__ra_itr_push(ctx, name);
+		}
+	}
+
+	return __ra_itr_pop(ctx);
+}
+
+static inline unsigned
+__ra_init_use_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr)
+{
+	struct ir3_register *reg;
+
+	/* the name buffer is shared, so nested use is not supported: */
+	assert(ctx->namecnt == ctx->nameidx);
+	ctx->nameidx = 0;
+	ctx->namecnt = 0;
+
+	foreach_src (reg, instr) {
+		if (!(reg->flags & IR3_REG_ARRAY)) {
+			foreach_name_n (name, i, ctx, reg->instr) {
+				/* In the scalar pass a split only uses the one
+				 * component it extracts, even if its src has a
+				 * wider wrmask (eg. tex w/ wrmask=.z):
+				 */
+				bool unused = ctx->scalar_pass &&
+						(instr->opc == OPC_META_SPLIT) &&
+						(i != instr->split.off);
+				if (!unused)
+					__ra_itr_push(ctx, name);
+			}
+		} else {
+			struct ir3_array *arr =
+				ir3_lookup_array(ctx->ir, reg->array.id);
+
+			if (!(reg->flags & IR3_REG_RELATIV)) {
+				/* direct read, only a single element is use'd: */
+				__ra_itr_push(ctx, arr->base + reg->array.offset);
+				debug_assert(reg->array.offset < arr->length);
+			} else {
+				/* we can't know which element an indirect read
+				 * hits, so treat it as a read of all of them:
+				 */
+				for (unsigned i = 0; i < arr->length; i++)
+					__ra_itr_push(ctx, arr->base + i);
+			}
+		}
+	}
+
+	return __ra_itr_pop(ctx);
+}
+
+/* iterate the names def'd/use'd by __instr; run to completion, don't nest: */
+#define foreach_def(__name, __ctx, __instr) \
+	for (unsigned __name = __ra_init_def_itr(__ctx, __instr); \
+	     __name != NO_NAME; __name = __ra_itr_pop(__ctx))
+#define foreach_use(__name, __ctx, __instr) \
+	for (unsigned __name = __ra_init_use_itr(__ctx, __instr); \
+	     __name != NO_NAME; __name = __ra_itr_pop(__ctx))
+
 int ra_size_to_class(unsigned sz, bool half, bool high);
 
 #endif  /* IR3_RA_H_ */
-- 
2.30.2