X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_postsched.c;h=ad2c9a6c529b33204574b4a192298523e0672ce6;hb=d973e50f699b08e0c2e1788af79bcff38d670e86;hp=4290e8822499544057408090a5d65183935a1cc2;hpb=0f78c32492ed096649b015a4967d6d56c18dd14a;p=mesa.git

diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index 4290e882249..ad2c9a6c529 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -51,17 +51,18 @@
  */
 
 struct ir3_postsched_ctx {
-	struct ir3_context *ctx;
+	struct ir3 *ir;
+
+	struct ir3_shader_variant *v;
 
 	void *mem_ctx;
 	struct ir3_block *block; /* the current block */
 	struct dag *dag;
 
 	struct list_head unscheduled_list; /* unscheduled instructions */
-	struct ir3_instruction *scheduled; /* last scheduled instr */
-	struct ir3_instruction *pred; /* current p0.x user, if any */
-	bool error;
+	int sfu_delay;
+	int tex_delay;
 };
 
 struct ir3_postsched_node {
@@ -79,39 +80,40 @@ struct ir3_postsched_node {
 #define foreach_bit(b, mask) \
 	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
 
-// TODO deduplicate
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-	return is_sfu(instr) || is_mem(instr);
-}
-
 static void
 schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 {
 	debug_assert(ctx->block == instr->block);
 
-	/* maybe there is a better way to handle this than just stuffing
-	 * a nop.. ideally we'd know about this constraint in the
-	 * scheduling and depth calculation..
-	 */
-	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		ir3_NOP(ctx->block);
-
 	/* remove from unscheduled_list: */
 	list_delinit(&instr->node);
 
-	if (writes_pred(instr)) {
-		ctx->pred = instr;
-	}
-
 	di(instr, "schedule");
 
 	list_addtail(&instr->node, &instr->block->instr_list);
-	ctx->scheduled = instr;
 
 	struct ir3_postsched_node *n = instr->data;
 	dag_prune_head(ctx->dag, &n->dag);
+
+	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
+		return;
+
+	if (is_sfu(instr)) {
+		ctx->sfu_delay = 8;
+	} else if (check_src_cond(instr, is_sfu)) {
+		ctx->sfu_delay = 0;
+	} else if (ctx->sfu_delay > 0) {
+		ctx->sfu_delay--;
+	}
+
+	if (is_tex_or_prefetch(instr)) {
+		ctx->tex_delay = 10;
+	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
+		ctx->tex_delay = 0;
+	} else if (ctx->tex_delay > 0) {
+		ctx->tex_delay--;
+	}
 }
 
 static void
@@ -132,6 +134,27 @@ dump_state(struct ir3_postsched_ctx *ctx)
 	}
 }
 
+/* Determine if this is an instruction that we'd prefer not to schedule
+ * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
+ * counter, ie. the more cycles that have passed since the last SFU, the
+ * less costly a sync would be.
+ */
+static bool
+would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
+{
+	if (ctx->sfu_delay) {
+		if (check_src_cond(instr, is_sfu))
+			return true;
+	}
+
+	if (ctx->tex_delay) {
+		if (check_src_cond(instr, is_tex_or_prefetch))
+			return true;
+	}
+
+	return false;
+}
+
 /* find instruction to schedule: */
 static struct ir3_instruction *
 choose_instr(struct ir3_postsched_ctx *ctx)
@@ -208,7 +231,35 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 		return chosen->instr;
 	}
 
-	/* First try to find a ready leader w/ soft delay (ie. including extra
+	/*
+	 * Sometimes it can be better to take a nop, rather than scheduling an
+	 * instruction that would require an (ss) shortly after another
+	 * SFU.. ie. if the last SFU was just one or two instructions ago,
+	 * and we could choose between taking a nop and then scheduling
+	 * something else, vs scheduling the immediately available instruction
+	 * that would require an (ss), we are better off with the nop.
+	 */
+	for (unsigned delay = 0; delay < 4; delay++) {
+		foreach_sched_node (n, &ctx->dag->heads) {
+			if (would_sync(ctx, n->instr))
+				continue;
+
+			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+
+			if (d > delay)
+				continue;
+
+			if (!chosen || (chosen->max_delay < n->max_delay))
+				chosen = n;
+		}
+
+		if (chosen) {
+			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
+			return chosen->instr;
+		}
+	}
+
+	/* Next try to find a ready leader w/ soft delay (ie. including extra
 	 * delay for things like tex fetch which can be synchronized w/ sync
 	 * bit (but we probably do want to schedule some other instructions
 	 * while we wait)
@@ -266,7 +317,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 }
 
 struct ir3_postsched_deps_state {
-	struct ir3_context *ctx;
+	struct ir3_postsched_ctx *ctx;
 
 	enum { F, R } direction;
 
@@ -351,20 +402,12 @@ static void
 calculate_deps(struct ir3_postsched_deps_state *state,
 		struct ir3_postsched_node *node)
 {
-	static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
-	struct ir3_register *reg;
 	int b;
 
 	/* Add dependencies on instructions that previously (or next,
 	 * in the reverse direction) wrote any of our src registers:
 	 */
	foreach_src_n (reg, i, node->instr) {
-		/* NOTE: relative access for a src can be either const or gpr: */
-		if (reg->flags & IR3_REG_RELATIV) {
-			/* also reads a0.x: */
-			add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
-		}
-
 		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 			continue;
 
@@ -380,29 +423,32 @@ calculate_deps(struct ir3_postsched_deps_state *state,
 				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
 				if (dep && (state->direction == F)) {
-					unsigned d = ir3_delayslots(dep->instr, node->instr, i);
+					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
 					node->delay = MAX2(node->delay, d);
 				}
 			}
 		}
 	}
 
+	if (node->instr->address) {
+		add_reg_dep(state, node, node->instr->address->regs[0],
+				node->instr->address->regs[0]->num,
+				false);
+	}
+
 	if (dest_regs(node->instr) == 0)
 		return;
 
 	/* And then after we update the state for what this instruction
 	 * wrote:
 	 */
-	reg = node->instr->regs[0];
+	struct ir3_register *reg = node->instr->regs[0];
 	if (reg->flags & IR3_REG_RELATIV) {
 		/* mark the entire array as written: */
 		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
 		for (unsigned i = 0; i < arr->length; i++) {
 			add_reg_dep(state, node, reg, arr->reg + i, true);
 		}
-
-		/* also reads a0.x: */
-		add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
 	} else {
 		foreach_bit (b, reg->wrmask) {
 			add_reg_dep(state, node, reg, reg->num + b, true);
@@ -414,9 +460,9 @@ static void
 calculate_forward_deps(struct ir3_postsched_ctx *ctx)
 {
 	struct ir3_postsched_deps_state state = {
-		.ctx = ctx->ctx,
+		.ctx = ctx,
 		.direction = F,
-		.merged = ctx->ctx->compiler->gpu_id >= 600,
+		.merged = ctx->v->mergedregs,
 	};
 
 	foreach_instr (instr, &ctx->unscheduled_list) {
@@ -428,9 +474,9 @@ static void
 calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
 {
 	struct ir3_postsched_deps_state state = {
-		.ctx = ctx->ctx,
+		.ctx = ctx,
 		.direction = R,
-		.merged = ctx->ctx->compiler->gpu_id >= 600,
+		.merged = ctx->v->mergedregs,
 	};
 
 	foreach_instr_rev (instr, &ctx->unscheduled_list) {
@@ -476,6 +522,14 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 	calculate_forward_deps(ctx);
 	calculate_reverse_deps(ctx);
 
+	/*
+	 * To prevent expensive texture fetches, etc., from being moved ahead
+	 * of kills, track the kills we've seen so far, so we can add an
+	 * extra dependency on them for tex/mem instructions.
+	 */
+	struct util_dynarray kills;
+	util_dynarray_init(&kills, ctx->mem_ctx);
+
 	/*
 	 * Normal srcs won't be in SSA at this point, those are dealt with in
 	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
@@ -484,7 +538,6 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 	 */
 	foreach_instr (instr, &ctx->unscheduled_list) {
 		struct ir3_postsched_node *n = instr->data;
-		struct ir3_instruction *src;
 
 		foreach_ssa_src_n (src, i, instr) {
 			if (src->block != instr->block)
@@ -502,6 +555,16 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
 
 			dag_add_edge(&sn->dag, &n->dag, NULL);
 		}
+
+		if (is_kill(instr)) {
+			util_dynarray_append(&kills, struct ir3_instruction *, instr);
+		} else if (is_tex(instr) || is_mem(instr)) {
+			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
+				struct ir3_instruction *kill = *instrp;
+				struct ir3_postsched_node *kn = kill->data;
+				dag_add_edge(&kn->dag, &n->dag, NULL);
+			}
+		}
 	}
 
 	// TODO do we want to do this after reverse-dependencies?
@@ -520,8 +583,8 @@ static void
 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 {
 	ctx->block = block;
-	ctx->scheduled = NULL;
-	ctx->pred = NULL;
+	ctx->tex_delay = 0;
+	ctx->sfu_delay = 0;
 
 	/* move all instructions to the unscheduled list, and
 	 * empty the block's instruction list (to which we will
@@ -536,7 +599,7 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 	foreach_instr_safe (instr, &ctx->unscheduled_list) {
 		switch (instr->opc) {
 		case OPC_NOP:
-		case OPC_BR:
+		case OPC_B:
 		case OPC_JUMP:
 			list_delinit(&instr->node);
 			break;
@@ -565,15 +628,7 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 	schedule(ctx, instr);
 
 	while (!list_is_empty(&ctx->unscheduled_list)) {
-		struct ir3_instruction *instr;
-
-		instr = choose_instr(ctx);
-
-		/* this shouldn't happen: */
-		if (!instr) {
-			ctx->error = true;
-			break;
-		}
+		struct ir3_instruction *instr = choose_instr(ctx);
 
 		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 		d("delay=%u", delay);
@@ -626,7 +681,6 @@ cleanup_self_movs(struct ir3 *ir)
 {
 	foreach_block (block, &ir->block_list) {
 		foreach_instr_safe (instr, &block->instr_list) {
-			struct ir3_register *reg;
 
 			foreach_src (reg, instr) {
 				if (!reg->instr)
@@ -639,7 +693,7 @@ cleanup_self_movs(struct ir3 *ir)
 			}
 
 			for (unsigned i = 0; i < instr->deps_count; i++) {
-				if (is_self_mov(instr->deps[i])) {
+				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
 					list_delinit(&instr->deps[i]->node);
 					instr->deps[i] = instr->deps[i]->regs[1]->instr;
 				}
@@ -648,22 +702,20 @@ cleanup_self_movs(struct ir3 *ir)
 	}
 }
 
-int
-ir3_postsched(struct ir3_context *cctx)
+bool
+ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
 {
 	struct ir3_postsched_ctx ctx = {
-		.ctx = cctx,
+		.ir = ir,
+		.v = v,
 	};
 
-	ir3_remove_nops(cctx->ir);
-	cleanup_self_movs(cctx->ir);
+	ir3_remove_nops(ir);
+	cleanup_self_movs(ir);
 
-	foreach_block (block, &cctx->ir->block_list) {
+	foreach_block (block, &ir->block_list) {
 		sched_block(&ctx, block);
 	}
 
-	if (ctx.error)
-		return -1;
-
-	return 0;
+	return true;
 }
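
The scheduling change above boils down to the two counters, sfu_delay and tex_delay: schedule() arms them when an SFU or tex/prefetch instruction is emitted, decrements them on every other (non-meta) instruction, and zeroes them once a consumer forces the sync anyway; would_sync() then flags any instruction that would consume an SFU/tex result while the corresponding countdown is still running. The standalone sketch below models only that mechanism, with a made-up instruction-kind enum and toy stream standing in for real ir3 instructions; the 8 and 10 reset values are the ones the patch uses:

/* Minimal sketch of the sfu_delay/tex_delay mechanism, assuming an
 * invented instruction-kind enum in place of real ir3 instructions.
 */
#include <stdbool.h>
#include <stdio.h>

enum kind { ALU, SFU, TEX, USES_SFU_RESULT, USES_TEX_RESULT };

struct state {
	int sfu_delay;
	int tex_delay;
};

/* mirrors schedule(): update the countdowns after emitting an instr */
static void emit(struct state *s, enum kind k)
{
	if (k == SFU)
		s->sfu_delay = 8;        /* reset value used by the patch */
	else if (k == USES_SFU_RESULT)
		s->sfu_delay = 0;        /* consumer synced, countdown is moot */
	else if (s->sfu_delay > 0)
		s->sfu_delay--;

	if (k == TEX)
		s->tex_delay = 10;       /* reset value used by the patch */
	else if (k == USES_TEX_RESULT)
		s->tex_delay = 0;
	else if (s->tex_delay > 0)
		s->tex_delay--;
}

/* mirrors would_sync(): would scheduling this instr cost us a sync? */
static bool would_sync(const struct state *s, enum kind k)
{
	return (s->sfu_delay && k == USES_SFU_RESULT) ||
	       (s->tex_delay && k == USES_TEX_RESULT);
}

int main(void)
{
	enum kind stream[] = { SFU, ALU, USES_SFU_RESULT, ALU, TEX };
	struct state s = { 0, 0 };

	for (unsigned i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		printf("instr %u: would_sync=%d\n", i, would_sync(&s, stream[i]));
		emit(&s, stream[i]);
	}
	return 0;
}

Running this flags only instr 2, the consumer that arrives while the SFU countdown is still high, which is exactly the candidate the new choose_instr() loop tries to defer.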
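
The nop-vs-(ss) comment in choose_instr() is easiest to see with a little arithmetic. A toy cost model, with an assumed 8-slot SFU result latency (matching the sfu_delay reset) and an assumed handful of independent instructions, shows why filling the SFU's shadow first beats syncing immediately:

/* Toy cost model for the nop-vs-(ss) tradeoff; the numbers below are
 * illustrative assumptions, not hardware measurements.
 */
#include <stdio.h>

int main(void)
{
	int sfu_latency = 8;  /* slots until the SFU result is ready */
	int independent = 3;  /* independent instructions we could issue */

	/* Option A: issue the consumer now with (ss), stalling out the
	 * full latency, then run the independent instructions after. */
	int cost_sync_now = sfu_latency + 1 + independent;

	/* Option B: issue the independent instructions (plus nops if
	 * they run out) first; by then the result is ready and the
	 * consumer needs no stall. */
	int fill = independent > sfu_latency ? independent : sfu_latency;
	int cost_fill_first = fill + 1;

	printf("sync now: %d slots, fill first: %d slots\n",
	       cost_sync_now, cost_fill_first);
	return 0;
}

With these numbers, syncing immediately costs 12 slots against 9 for filling first; the gap shrinks as the countdown runs down, which is why would_sync() stops objecting once the delay counter reaches zero.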
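
Finally, the kill tracking added to sched_dag_init(): as the block is walked, every kill is remembered, and every later tex/mem instruction gets a DAG edge from each remembered kill, so the scheduler can never hoist an expensive fetch above a kill. The sketch below reproduces the shape of that loop with a fixed array standing in for util_dynarray and a printf standing in for dag_add_edge(); the instruction records are hypothetical:

/* Sketch of the kill-edge pass, with a fixed array in place of
 * util_dynarray and printf in place of dag_add_edge(); the instruction
 * records are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct instr {
	const char *name;
	bool is_kill;
	bool is_tex_or_mem;
};

/* stand-in for dag_add_edge(&kn->dag, &n->dag, NULL) */
static void add_edge(const struct instr *from, const struct instr *to)
{
	printf("edge: %s -> %s\n", from->name, to->name);
}

int main(void)
{
	struct instr block[] = {
		{ "kill.a", true,  false },
		{ "alu",    false, false },
		{ "sam",    false, true  },  /* texture fetch */
		{ "kill.b", true,  false },
		{ "ldg",    false, true  },  /* memory load */
	};
	const struct instr *kills[8];
	unsigned nkills = 0;

	for (unsigned i = 0; i < sizeof(block) / sizeof(block[0]); i++) {
		if (block[i].is_kill) {
			kills[nkills++] = &block[i];
		} else if (block[i].is_tex_or_mem) {
			/* every tex/mem instr depends on all kills seen so far */
			for (unsigned k = 0; k < nkills; k++)
				add_edge(kills[k], &block[i]);
		}
	}
	return 0;
}

Here "sam" gains an edge from kill.a, and "ldg" from both kills, matching the edges the real pass adds via the nodes' dag entries.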