*/
struct ir3_postsched_ctx {
- struct ir3_context *ctx;
+ struct ir3 *ir;
+
+ struct ir3_shader_variant *v;
void *mem_ctx;
struct ir3_block *block; /* the current block */
struct dag *dag;
struct list_head unscheduled_list; /* unscheduled instructions */
- struct ir3_instruction *scheduled; /* last scheduled instr */
- struct ir3_instruction *pred; /* current p0.x user, if any */
- bool error;
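+
+ /* Rough countdown of instructions since the last SFU / tex was
+ * scheduled, used to estimate how costly an (ss) or (sy) sync
+ * would be right now (see would_sync()):
+ */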
+ int sfu_delay;
+ int tex_delay;
};
struct ir3_postsched_node {
*/
list_delinit(&instr->node);
- if (writes_pred(instr)) {
- ctx->pred = instr;
- }
-
di(instr, "schedule");
list_addtail(&instr->node, &instr->block->instr_list);
- ctx->scheduled = instr;
struct ir3_postsched_node *n = instr->data;
dag_prune_head(ctx->dag, &n->dag);
+
+ if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
+ return;
+
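+ /* Update the sync-cost countdowns: scheduling an SFU (or tex)
+ * producer restarts its countdown, scheduling a consumer that
+ * forces the sync anyway clears it, and anything else brings us
+ * one instruction closer to a cheap sync. The 8/10 starting
+ * values are heuristic, not exact hw latencies.
+ */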
+ if (is_sfu(instr)) {
+ ctx->sfu_delay = 8;
+ } else if (check_src_cond(instr, is_sfu)) {
+ ctx->sfu_delay = 0;
+ } else if (ctx->sfu_delay > 0) {
+ ctx->sfu_delay--;
+ }
+
+ if (is_tex_or_prefetch(instr)) {
+ ctx->tex_delay = 10;
+ } else if (check_src_cond(instr, is_tex_or_prefetch)) {
+ ctx->tex_delay = 0;
+ } else if (ctx->tex_delay > 0) {
+ ctx->tex_delay--;
+ }
}
static void
}
}
+/* Determine if this is an instruction that we'd prefer not to schedule
+ * yet, in order to avoid an (ss) or (sy) sync. This is limited by the
+ * sfu_delay/tex_delay counters, ie. the more instructions it has been
+ * since the last SFU/tex, the less costly a sync would be.
+ */
+static bool
+would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+ if (ctx->sfu_delay) {
+ if (check_src_cond(instr, is_sfu))
+ return true;
+ }
+
+ if (ctx->tex_delay) {
+ if (check_src_cond(instr, is_tex_or_prefetch))
+ return true;
+ }
+
+ return false;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
return chosen->instr;
}
- /* First try to find a ready leader w/ soft delay (ie. including extra
+ /*
+ * Sometimes it is better to take a nop rather than scheduling an
+ * instruction that would require an (ss) shortly after another
+ * SFU: ie. if the last SFU was just one or two instructions ago,
+ * and we could choose between taking a nop and then scheduling
+ * something else, vs scheduling the immediately available
+ * instruction that would require an (ss), we are better off with
+ * the nop.
+ */
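+ /* Scan with an increasing nop budget: first something ready now
+ * that doesn't sync, then things that would be ready after 1..3
+ * nops, before falling back to the normal heuristics below:
+ */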
+ for (unsigned delay = 0; delay < 4; delay++) {
+ foreach_sched_node (n, &ctx->dag->heads) {
+ if (would_sync(ctx, n->instr))
+ continue;
+
+ unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+
+ if (d > delay)
+ continue;
+
+ if (!chosen || (chosen->max_delay < n->max_delay))
+ chosen = n;
+ }
+
+ if (chosen) {
+ di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
+ return chosen->instr;
+ }
+ }
+
+ /* Next try to find a ready leader w/ soft delay (ie. including extra
* delay for things like tex fetch which can be synchronized w/ sync
* bit (but we probably do want to schedule some other instructions
* while we wait)
}
struct ir3_postsched_deps_state {
- struct ir3_context *ctx;
+ struct ir3_postsched_ctx *ctx;
enum { F, R } direction;
calculate_deps(struct ir3_postsched_deps_state *state,
struct ir3_postsched_node *node)
{
- static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
- struct ir3_register *reg;
int b;
/* Add dependencies on instructions that previously (or next,
* in the reverse direction) wrote any of our src registers:
*/
foreach_src_n (reg, i, node->instr) {
- /* NOTE: relative access for a src can be either const or gpr: */
- if (reg->flags & IR3_REG_RELATIV) {
- /* also reads a0.x: */
- add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
- }
-
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
}
}
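+ /* Relative (address register indexed) srcs are handled via an
+ * explicit dependency on the address writer instruction, replacing
+ * the implicit a0.x reg dep removed above:
+ */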
+ if (node->instr->address) {
+ add_reg_dep(state, node, node->instr->address->regs[0],
+ node->instr->address->regs[0]->num,
+ false);
+ }
+
if (dest_regs(node->instr) == 0)
return;
/* And then after we update the state for what this instruction
* wrote:
*/
- reg = node->instr->regs[0];
+ struct ir3_register *reg = node->instr->regs[0];
if (reg->flags & IR3_REG_RELATIV) {
/* mark the entire array as written: */
struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
for (unsigned i = 0; i < arr->length; i++) {
add_reg_dep(state, node, reg, arr->reg + i, true);
}
-
- /* also reads a0.x: */
- add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
} else {
foreach_bit (b, reg->wrmask) {
add_reg_dep(state, node, reg, reg->num + b, true);
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
+ .ctx = ctx,
.direction = F,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
+ .merged = ctx->v->mergedregs,
};
foreach_instr (instr, &ctx->unscheduled_list) {
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
struct ir3_postsched_deps_state state = {
- .ctx = ctx->ctx,
+ .ctx = ctx,
.direction = R,
- .merged = ctx->ctx->compiler->gpu_id >= 600,
+ .merged = ctx->v->mergedregs,
};
foreach_instr_rev (instr, &ctx->unscheduled_list) {
calculate_forward_deps(ctx);
calculate_reverse_deps(ctx);
+ /*
+ * To prevent expensive texture fetches, etc, from being moved
+ * ahead of kills, track the kills we've seen so far, so we can
+ * add an extra dependency on them for tex/mem instructions.
+ */
+ struct util_dynarray kills;
+ util_dynarray_init(&kills, ctx->mem_ctx);
+
/*
* Normal srcs won't be in SSA at this point, those are dealt with in
* calculate_forward_deps() and calculate_reverse_deps(). But we still
*/
foreach_instr (instr, &ctx->unscheduled_list) {
struct ir3_postsched_node *n = instr->data;
- struct ir3_instruction *src;
foreach_ssa_src_n (src, i, instr) {
if (src->block != instr->block)
dag_add_edge(&sn->dag, &n->dag, NULL);
}
+
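+ /* Record kills, and serialize later tex/mem instructions behind
+ * all kills seen so far:
+ */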
+ if (is_kill(instr)) {
+ util_dynarray_append(&kills, struct ir3_instruction *, instr);
+ } else if (is_tex(instr) || is_mem(instr)) {
+ util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
+ struct ir3_instruction *kill = *instrp;
+ struct ir3_postsched_node *kn = kill->data;
+ dag_add_edge(&kn->dag, &n->dag, NULL);
+ }
+ }
}
// TODO do we want to do this after reverse-dependencies?
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
ctx->block = block;
- ctx->scheduled = NULL;
- ctx->pred = NULL;
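+ /* sync-cost counters don't carry across blocks: */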
+ ctx->tex_delay = 0;
+ ctx->sfu_delay = 0;
/* move all instructions to the unscheduled list, and
* empty the block's instruction list (to which we will
foreach_instr_safe (instr, &ctx->unscheduled_list) {
switch (instr->opc) {
case OPC_NOP:
- case OPC_BR:
+ case OPC_B:
case OPC_JUMP:
list_delinit(&instr->node);
break;
schedule(ctx, instr);
while (!list_is_empty(&ctx->unscheduled_list)) {
- struct ir3_instruction *instr;
-
- instr = choose_instr(ctx);
-
- /* this shouldn't happen: */
- if (!instr) {
- ctx->error = true;
- break;
- }
+ struct ir3_instruction *instr = choose_instr(ctx);
unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
d("delay=%u", delay);
{
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
- struct ir3_register *reg;
foreach_src (reg, instr) {
if (!reg->instr)
}
for (unsigned i = 0; i < instr->deps_count; i++) {
- if (is_self_mov(instr->deps[i])) {
+ if (instr->deps[i] && is_self_mov(instr->deps[i])) {
list_delinit(&instr->deps[i]->node);
instr->deps[i] = instr->deps[i]->regs[1]->instr;
}
}
}
-int
-ir3_postsched(struct ir3_context *cctx)
+bool
+ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
struct ir3_postsched_ctx ctx = {
- .ctx = cctx,
+ .ir = ir,
+ .v = v,
};
- ir3_remove_nops(cctx->ir);
- cleanup_self_movs(cctx->ir);
+ ir3_remove_nops(ir);
+ cleanup_self_movs(ir);
- foreach_block (block, &cctx->ir->block_list) {
+ foreach_block (block, &ir->block_list) {
sched_block(&ctx, block);
}
- if (ctx.error)
- return -1;
-
- return 0;
+ return true;
}