X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_postsched.c;h=ad2c9a6c529b33204574b4a192298523e0672ce6;hb=d973e50f699b08e0c2e1788af79bcff38d670e86;hp=4290e8822499544057408090a5d65183935a1cc2;hpb=0f78c32492ed096649b015a4967d6d56c18dd14a;p=mesa.git

diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index 4290e882249..ad2c9a6c529 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -51,17 +51,18 @@
  */
 
 struct ir3_postsched_ctx {
-	struct ir3_context *ctx;
+	struct ir3 *ir;
+
+	struct ir3_shader_variant *v;
 
 	void *mem_ctx;
 	struct ir3_block *block; /* the current block */
 	struct dag *dag;
 
 	struct list_head unscheduled_list; /* unscheduled instructions */
-	struct ir3_instruction *scheduled; /* last scheduled instr */
-	struct ir3_instruction *pred; /* current p0.x user, if any */
-	bool error;
+	int sfu_delay;
+	int tex_delay;
 };
 
 struct ir3_postsched_node {
@@ -79,39 +80,40 @@ struct ir3_postsched_node {
 #define foreach_bit(b, mask) \
 	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
 
-// TODO deduplicate
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-	return is_sfu(instr) || is_mem(instr);
-}
-
 static void
 schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 {
 	debug_assert(ctx->block == instr->block);
 
-	/* maybe there is a better way to handle this than just stuffing
-	 * a nop.. ideally we'd know about this constraint in the
-	 * scheduling and depth calculation..
-	 */
-	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		ir3_NOP(ctx->block);
-
 	/* remove from unscheduled_list: */
 	list_delinit(&instr->node);
 
-	if (writes_pred(instr)) {
-		ctx->pred = instr;
-	}
-
 	di(instr, "schedule");
 
 	list_addtail(&instr->node, &instr->block->instr_list);
-	ctx->scheduled = instr;
 
 	struct ir3_postsched_node *n = instr->data;
 	dag_prune_head(ctx->dag, &n->dag);
+
+	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
+		return;
+
+	if (is_sfu(instr)) {
+		ctx->sfu_delay = 8;
+	} else if (check_src_cond(instr, is_sfu)) {
+		ctx->sfu_delay = 0;
+	} else if (ctx->sfu_delay > 0) {
+		ctx->sfu_delay--;
+	}
+
+	if (is_tex_or_prefetch(instr)) {
+		ctx->tex_delay = 10;
+	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
+		ctx->tex_delay = 0;
+	} else if (ctx->tex_delay > 0) {
+		ctx->tex_delay--;
+	}
 }
 
 static void
@@ -132,6 +134,27 @@ dump_state(struct ir3_postsched_ctx *ctx)
 	}
 }
 
+/* Determine if this is an instruction that we'd prefer not to schedule
+ * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
+ * counter, ie. the more cycles that have passed since the last SFU, the
+ * less costly a sync would be.
+ */
+static bool
+would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+	if (ctx->sfu_delay) {
+		if (check_src_cond(instr, is_sfu))
+			return true;
+	}
+
+	if (ctx->tex_delay) {
+		if (check_src_cond(instr, is_tex_or_prefetch))
+			return true;
+	}
+
+	return false;
+}
+
 /* find instruction to schedule: */
 static struct ir3_instruction *
 choose_instr(struct ir3_postsched_ctx *ctx)
@@ -208,7 +231,35 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 		return chosen->instr;
 	}
 
-	/* First try to find a ready leader w/ soft delay (ie. including extra
+	/*
+	 * Sometimes it can be better to take a nop, rather than scheduling an
+	 * instruction that would require an (ss) shortly after another
+	 * SFU.. ie. if the last SFU was just one or two instructions ago,
+	 * and we could choose between taking a nop and then scheduling
+	 * something else, vs scheduling the immediately available instruction
+	 * that would require an (ss), we are better off with the nop.
+	 */
+	for (unsigned delay = 0; delay < 4; delay++) {
+		foreach_sched_node (n, &ctx->dag->heads) {
+			if (would_sync(ctx, n->instr))
+				continue;
+
+			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+
+			if (d > delay)
+				continue;
+
+			if (!chosen || (chosen->max_delay < n->max_delay))
+				chosen = n;
+		}
+
+		if (chosen) {
+			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
+			return chosen->instr;
+		}
+	}
+
+	/* Next try to find a ready leader w/ soft delay (ie. including extra
 	 * delay for things like tex fetch which can be synchronized w/ sync
 	 * bit (but we probably do want to schedule some other instructions
 	 * while we wait)
@@ -266,7 +317,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 }
 
 struct ir3_postsched_deps_state {
-	struct ir3_context *ctx;
+	struct ir3_postsched_ctx *ctx;
 
 	enum { F, R } direction;
 
@@ -351,20 +402,12 @@ static void
 calculate_deps(struct ir3_postsched_deps_state *state,
 		struct ir3_postsched_node *node)
 {
-	static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
-	struct ir3_register *reg;
 	int b;
 
 	/* Add dependencies on instructions that previously (or next,
 	 * in the reverse direction) wrote any of our src registers:
 	 */
	foreach_src_n (reg, i, node->instr) {
-		/* NOTE: relative access for a src can be either const or gpr: */
-		if (reg->flags & IR3_REG_RELATIV) {
-			/* also reads a0.x: */
-			add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
-		}
-
 		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 			continue;
 
@@ -380,29 +423,32 @@ calculate_deps(struct ir3_postsched_deps_state *state,
 				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
 				if (dep && (state->direction == F)) {
-					unsigned d = ir3_delayslots(dep->instr, node->instr, i);
+					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
 					node->delay = MAX2(node->delay, d);
 				}
 			}
 		}
 	}
 
+	if (node->instr->address) {
+		add_reg_dep(state, node, node->instr->address->regs[0],
+				node->instr->address->regs[0]->num,
+				false);
+	}
+
 	if (dest_regs(node->instr) == 0)
 		return;
 
 	/* And then after we update the state for what this instruction
 	 * wrote:
 	 */
-	reg = node->instr->regs[0];
+	struct ir3_register *reg = node->instr->regs[0];
 	if (reg->flags & IR3_REG_RELATIV) {
 		/* mark the entire array as written: */
 		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
 		for (unsigned i = 0; i < arr->length; i++) {
 			add_reg_dep(state, node, reg, arr->reg + i, true);
 		}
-
-		/* also reads a0.x: */
-		add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
 	} else {
 		foreach_bit (b, reg->wrmask) {
 			add_reg_dep(state, node, reg, reg->num + b, true);
@@ -414,9 +460,9 @@ static void
 calculate_forward_deps(struct ir3_postsched_ctx *ctx)
 {
 	struct ir3_postsched_deps_state state = {
-		.ctx = ctx->ctx,
+		.ctx = ctx,
 		.direction = F,
-		.merged = ctx->ctx->compiler->gpu_id >= 600,
+		.merged = ctx->v->mergedregs,
 	};
 
 	foreach_instr (instr, &ctx->unscheduled_list) {
@@ -428,9 +474,9 @@ static void
 calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
 {
 	struct ir3_postsched_deps_state state = {
-		.ctx = ctx->ctx,
+		.ctx = ctx,
 		.direction = R,
-		.merged = ctx->ctx->compiler->gpu_id >= 600,
+		.merged = ctx->v->mergedregs,
 	};
 
 	foreach_instr_rev (instr, &ctx->unscheduled_list) {
@@ -476,6 +522,14 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 	calculate_forward_deps(ctx);
 	calculate_reverse_deps(ctx);
 
+	/*
+	 * To prevent expensive texture fetches, etc., from being moved ahead
+	 * of kills, track the kills we've seen so far, so we can add an
+	 * extra dependency on them for tex/mem instructions.
+	 */
+	struct util_dynarray kills;
+	util_dynarray_init(&kills, ctx->mem_ctx);
+
 	/*
 	 * Normal srcs won't be in SSA at this point, those are dealt with in
 	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
@@ -484,7 +538,6 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 	 */
 	foreach_instr (instr, &ctx->unscheduled_list) {
 		struct ir3_postsched_node *n = instr->data;
-		struct ir3_instruction *src;
 
 		foreach_ssa_src_n (src, i, instr) {
 			if (src->block != instr->block)
@@ -502,6 +555,16 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 
 			dag_add_edge(&sn->dag, &n->dag, NULL);
 		}
+
+		if (is_kill(instr)) {
+			util_dynarray_append(&kills, struct ir3_instruction *, instr);
+		} else if (is_tex(instr) || is_mem(instr)) {
+			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
+				struct ir3_instruction *kill = *instrp;
+				struct ir3_postsched_node *kn = kill->data;
+				dag_add_edge(&kn->dag, &n->dag, NULL);
+			}
+		}
 	}
 
 	// TODO do we want to do this after reverse-dependencies?
@@ -520,8 +583,8 @@ static void
 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 {
 	ctx->block = block;
-	ctx->scheduled = NULL;
-	ctx->pred = NULL;
+	ctx->tex_delay = 0;
+	ctx->sfu_delay = 0;
 
 	/* move all instructions to the unscheduled list, and
 	 * empty the block's instruction list (to which we will
@@ -536,7 +599,7 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 	foreach_instr_safe (instr, &ctx->unscheduled_list) {
 		switch (instr->opc) {
 		case OPC_NOP:
-		case OPC_BR:
+		case OPC_B:
 		case OPC_JUMP:
 			list_delinit(&instr->node);
 			break;
@@ -565,15 +628,7 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 	schedule(ctx, instr);
 
 	while (!list_is_empty(&ctx->unscheduled_list)) {
-		struct ir3_instruction *instr;
-
-		instr = choose_instr(ctx);
-
-		/* this shouldn't happen: */
-		if (!instr) {
-			ctx->error = true;
-			break;
-		}
+		struct ir3_instruction *instr = choose_instr(ctx);
 
 		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 		d("delay=%u", delay);
@@ -626,7 +681,6 @@ cleanup_self_movs(struct ir3 *ir)
 {
 	foreach_block (block, &ir->block_list) {
 		foreach_instr_safe (instr, &block->instr_list) {
-			struct ir3_register *reg;
 
 			foreach_src (reg, instr) {
 				if (!reg->instr)
@@ -639,7 +693,7 @@ cleanup_self_movs(struct ir3 *ir)
 			}
 
 			for (unsigned i = 0; i < instr->deps_count; i++) {
-				if (is_self_mov(instr->deps[i])) {
+				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
 					list_delinit(&instr->deps[i]->node);
 					instr->deps[i] = instr->deps[i]->regs[1]->instr;
 				}
@@ -648,22 +702,20 @@ cleanup_self_movs(struct ir3 *ir)
 	}
 }
 
-int
-ir3_postsched(struct ir3_context *cctx)
+bool
+ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
 {
 	struct ir3_postsched_ctx ctx = {
-		.ctx = cctx,
+		.ir = ir,
+		.v = v,
 	};
 
-	ir3_remove_nops(cctx->ir);
-	cleanup_self_movs(cctx->ir);
+	ir3_remove_nops(ir);
+	cleanup_self_movs(ir);
 
-	foreach_block (block, &cctx->ir->block_list) {
+	foreach_block (block, &ir->block_list) {
 		sched_block(&ctx, block);
 	}
 
-	if (ctx.error)
-		return -1;
-
-	return 0;
+	return true;
 }
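
The scheduling change above boils down to the two counters, sfu_delay and tex_delay: schedule() arms them when an SFU or tex/prefetch instruction is emitted, decrements them on every other (non-meta) instruction, and zeroes them once a consumer forces the sync anyway; would_sync() then flags any instruction that would consume an SFU/tex result while the corresponding countdown is still running. The standalone sketch below models only that mechanism, with a made-up instruction-kind enum and toy stream standing in for real ir3 instructions; the 8 and 10 reset values are the ones the patch uses:

/* Minimal sketch of the sfu_delay/tex_delay mechanism, assuming an
 * invented instruction-kind enum in place of real ir3 instructions.
 */
#include <stdbool.h>
#include <stdio.h>

enum kind { ALU, SFU, TEX, USES_SFU_RESULT, USES_TEX_RESULT };

struct state {
	int sfu_delay;
	int tex_delay;
};

/* mirrors schedule(): update the countdowns after emitting an instr */
static void emit(struct state *s, enum kind k)
{
	if (k == SFU)
		s->sfu_delay = 8;        /* reset value used by the patch */
	else if (k == USES_SFU_RESULT)
		s->sfu_delay = 0;        /* consumer synced, countdown is moot */
	else if (s->sfu_delay > 0)
		s->sfu_delay--;

	if (k == TEX)
		s->tex_delay = 10;       /* reset value used by the patch */
	else if (k == USES_TEX_RESULT)
		s->tex_delay = 0;
	else if (s->tex_delay > 0)
		s->tex_delay--;
}

/* mirrors would_sync(): would scheduling this instr cost us a sync? */
static bool would_sync(const struct state *s, enum kind k)
{
	return (s->sfu_delay && k == USES_SFU_RESULT) ||
	       (s->tex_delay && k == USES_TEX_RESULT);
}

int main(void)
{
	enum kind stream[] = { SFU, ALU, USES_SFU_RESULT, ALU, TEX };
	struct state s = { 0, 0 };

	for (unsigned i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		printf("instr %u: would_sync=%d\n", i, would_sync(&s, stream[i]));
		emit(&s, stream[i]);
	}
	return 0;
}

Running this flags only instr 2, the consumer that arrives while the SFU countdown is still high, which is exactly the candidate the new choose_instr() loop tries to defer.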
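
The nop-vs-(ss) comment in choose_instr() is easiest to see with a little arithmetic. A toy cost model, with an assumed 8-slot SFU result latency (matching the sfu_delay reset) and an assumed handful of independent instructions, shows why filling the SFU's shadow first beats syncing immediately:

/* Toy cost model for the nop-vs-(ss) tradeoff; the numbers below are
 * illustrative assumptions, not hardware measurements.
 */
#include <stdio.h>

int main(void)
{
	int sfu_latency = 8;  /* slots until the SFU result is ready */
	int independent = 3;  /* independent instructions we could issue */

	/* Option A: issue the consumer now with (ss), stalling out the
	 * full latency, then run the independent instructions after. */
	int cost_sync_now = sfu_latency + 1 + independent;

	/* Option B: issue the independent instructions (plus nops if
	 * they run out) first; by then the result is ready and the
	 * consumer needs no stall. */
	int fill = independent > sfu_latency ? independent : sfu_latency;
	int cost_fill_first = fill + 1;

	printf("sync now: %d slots, fill first: %d slots\n",
	       cost_sync_now, cost_fill_first);
	return 0;
}

With these numbers, syncing immediately costs 12 slots against 9 for filling first; the gap shrinks as the countdown runs down, which is why would_sync() stops objecting once the delay counter reaches zero.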
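
Finally, the kill tracking added to sched_dag_init(): as the block is walked, every kill is remembered, and every later tex/mem instruction gets a DAG edge from each remembered kill, so the scheduler can never hoist an expensive fetch above a kill. The sketch below reproduces the shape of that loop with a fixed array standing in for util_dynarray and a printf standing in for dag_add_edge(); the instruction records are hypothetical:

/* Sketch of the kill-edge pass, with a fixed array in place of
 * util_dynarray and printf in place of dag_add_edge(); the instruction
 * records are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct instr {
	const char *name;
	bool is_kill;
	bool is_tex_or_mem;
};

/* stand-in for dag_add_edge(&kn->dag, &n->dag, NULL) */
static void add_edge(const struct instr *from, const struct instr *to)
{
	printf("edge: %s -> %s\n", from->name, to->name);
}

int main(void)
{
	struct instr block[] = {
		{ "kill.a", true,  false },
		{ "alu",    false, false },
		{ "sam",    false, true  },  /* texture fetch */
		{ "kill.b", true,  false },
		{ "ldg",    false, true  },  /* memory load */
	};
	const struct instr *kills[8];
	unsigned nkills = 0;

	for (unsigned i = 0; i < sizeof(block) / sizeof(block[0]); i++) {
		if (block[i].is_kill) {
			kills[nkills++] = &block[i];
		} else if (block[i].is_tex_or_mem) {
			/* every tex/mem instr depends on all kills seen so far */
			for (unsigned k = 0; k < nkills; k++)
				add_edge(kills[k], &block[i]);
		}
	}
	return 0;
}

Here "sam" gains an edge from kill.a, and "ldg" from both kills, matching the edges the real pass adds via the nodes' dag entries.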