X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_legalize.c;h=69efdbabc3f1828854d0b7260c447a4c8b944675;hb=2e1df6a17ff82c4a456caa8be4bfae1fac009b6a;hp=e46d1214d605962f488f0ec16b0634c499b54358;hpb=06bc4875fff8ad144dbebebe53241da4655aecc6;p=mesa.git diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index e46d1214d60..69efdbabc3f 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -28,22 +28,27 @@ #include "util/u_math.h" #include "ir3.h" -#include "ir3_compiler.h" +#include "ir3_shader.h" /* * Legalize: * - * We currently require that scheduling ensures that we have enough nop's - * in all the right places. The legalize step mostly handles fixing up - * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's - * into fewer nop's w/ rpt flag. + * The legalize pass handles ensuring sufficient nop's and sync flags for + * correct execution. + * + * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed, + * based on state flowing out of predecessor blocks until there is + * no further change. In some cases this requires inserting nops. + * 2) Mark (ei) on last varying input, and (ul) on last use of a0.x + * 3) Final nop scheduling for instruction latency + * 4) Resolve jumps and schedule blocks, marking potential convergence + * points with (jp) */ struct ir3_legalize_ctx { struct ir3_compiler *compiler; + struct ir3_shader_variant *so; gl_shader_stage type; - bool has_ssbo; - bool need_pixlod; int max_bary; }; @@ -88,6 +93,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) struct ir3_legalize_state prev_state = bd->state; struct ir3_legalize_state *state = &bd->state; bool last_input_needs_ss = false; + bool has_tex_prefetch = false; + bool mergedregs = ctx->so->mergedregs; /* our input state is the OR of all predecessor blocks' state: */ set_foreach(block->predecessors, entry) { @@ -112,13 +119,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) list_replace(&block->instr_list, &instr_list); list_inithead(&block->instr_list); - list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) { - struct ir3_register *reg; + foreach_instr_safe (n, &instr_list) { unsigned i; n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); - if (is_meta(n)) + /* _meta::tex_prefetch instructions removed later in + * collect_tex_prefetches() + */ + if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH)) continue; if (is_input(n)) { @@ -130,6 +139,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (last_n && is_barrier(last_n)) { n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; last_input_needs_ss = false; + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); + regmask_init(&state->needs_sy, mergedregs); + } + + if (last_n && (last_n->opc == OPC_PREDT)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); } /* NOTE: consider dst register too.. 
it could happen that @@ -139,7 +157,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * resulting in undefined results: */ for (i = 0; i < n->regs_count; i++) { - reg = n->regs[i]; + struct ir3_register *reg = n->regs[i]; if (reg_gpr(reg)) { @@ -150,13 +168,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (regmask_get(&state->needs_ss, reg)) { n->flags |= IR3_INSTR_SS; last_input_needs_ss = false; - regmask_init(&state->needs_ss_war); - regmask_init(&state->needs_ss); + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); } if (regmask_get(&state->needs_sy, reg)) { n->flags |= IR3_INSTR_SY; - regmask_init(&state->needs_sy); + regmask_init(&state->needs_sy, mergedregs); } } @@ -169,12 +187,12 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } if (n->regs_count > 0) { - reg = n->regs[0]; + struct ir3_register *reg = n->regs[0]; if (regmask_get(&state->needs_ss_war, reg)) { n->flags |= IR3_INSTR_SS; last_input_needs_ss = false; - regmask_init(&state->needs_ss_war); - regmask_init(&state->needs_ss); + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); } if (last_rel && (reg->num == regid(REG_A0, 0))) { @@ -196,40 +214,21 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } /* need to be able to set (ss) on first instruction: */ - if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) + if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) ir3_NOP(block); - if (is_nop(n) && !list_empty(&block->instr_list)) { - struct ir3_instruction *last = list_last_entry(&block->instr_list, - struct ir3_instruction, node); - if (is_nop(last) && (last->repeat < 5)) { - last->repeat++; - last->flags |= n->flags; - continue; - } - - /* NOTE: I think the nopN encoding works for a5xx and - * probably a4xx, but not a3xx. So far only tested on - * a6xx. - */ - if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) && - ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) { - last->nop++; - continue; - } - } - if (ctx->compiler->samgq_workaround && ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) { struct ir3_instruction *samgp; + list_delinit(&n->node); + for (i = 0; i < 4; i++) { samgp = ir3_instr_clone(n); samgp->opc = OPC_SAMGP0 + i; if (i > 1) samgp->flags |= IR3_INSTR_SY; } - list_delinit(&n->node); } else { list_addtail(&n->node, &block->instr_list); } @@ -237,9 +236,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (is_sfu(n)) regmask_set(&state->needs_ss, n->regs[0]); - if (is_tex(n)) { + if (is_tex_or_prefetch(n)) { regmask_set(&state->needs_sy, n->regs[0]); - ctx->need_pixlod = true; + if (n->opc == OPC_META_TEX_PREFETCH) + has_tex_prefetch = true; } else if (n->opc == OPC_RESINFO) { regmask_set(&state->needs_ss, n->regs[0]); ir3_NOP(block)->flags |= IR3_INSTR_SS; @@ -248,7 +248,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) /* seems like ldlv needs (ss) bit instead?? which is odd but * makes a bunch of flat-varying tests start working on a4xx. 
*/ - if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) + if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || (n->opc == OPC_LDLW)) regmask_set(&state->needs_ss, n->regs[0]); else regmask_set(&state->needs_sy, n->regs[0]); @@ -266,13 +266,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) - ctx->has_ssbo = true; + ctx->so->has_ssbo = true; /* both tex/sfu appear to not always immediately consume * their src register(s): */ if (is_tex(n) || is_sfu(n) || is_mem(n)) { - foreach_src(reg, n) { + foreach_src (reg, n) { if (reg_gpr(reg)) regmask_set(&state->needs_ss_war, reg); } @@ -303,8 +303,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) ir3_reg_create(baryf, regid(0, 0), 0); /* insert the dummy bary.f after last_input: */ - list_delinit(&baryf->node); - list_add(&baryf->node, &last_input->node); + ir3_instr_move_after(baryf, last_input); last_input = baryf; @@ -316,6 +315,22 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) last_input->regs[0]->flags |= IR3_REG_EI; if (last_input_needs_ss) last_input->flags |= IR3_INSTR_SS; + } else if (has_tex_prefetch) { + /* texture prefetch, but *no* inputs.. we need to insert a + * dummy bary.f at the top of the shader to unblock varying + * storage: + */ + struct ir3_instruction *baryf; + + /* (ss)bary.f (ei)r63.x, 0, r0.x */ + baryf = ir3_instr_create(block, OPC_BARY_F); + ir3_reg_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI; + ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; + ir3_reg_create(baryf, regid(0, 0), 0); + + /* insert the dummy bary.f at head: */ + list_delinit(&baryf->node); + list_add(&baryf->node, &block->instr_list); } if (last_rel) @@ -338,6 +353,42 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) return true; } +/* Expands dsxpp and dsypp macros to: + * + * dsxpp.1 dst, src + * dsxpp.1.p dst, src + * + * We apply this after flags syncing, as we don't want to sync in between the + * two (which might happen if dst == src). We do it before nop scheduling + * because that needs to count actual instructions. + */ +static bool +apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) +{ + struct list_head instr_list; + + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); + + foreach_instr_safe (n, &instr_list) { + list_addtail(&n->node, &block->instr_list); + + if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) { + n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1; + + struct ir3_instruction *op_p = ir3_instr_clone(n); + op_p->flags = IR3_INSTR_P; + + ctx->so->need_fine_derivatives = true; + } + } + + return true; +} + /* NOTE: branch instructions are always the last instruction(s) * in the block. 
We take advantage of this as we resolve the * branches, since "if (foo) break;" constructs turn into @@ -388,7 +439,7 @@ resolve_dest_block(struct ir3_block *block) * (2) (block-is-empty || only-instr-is-jump) */ if (block->successors[1] == NULL) { - if (list_empty(&block->instr_list)) { + if (list_is_empty(&block->instr_list)) { return block->successors[0]; } else if (list_length(&block->instr_list) == 1) { struct ir3_instruction *instr = list_first_entry( @@ -400,6 +451,47 @@ resolve_dest_block(struct ir3_block *block) return block; } +static void +remove_unused_block(struct ir3_block *old_target) +{ + list_delinit(&old_target->node); + + /* cleanup dangling predecessors: */ + for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) { + if (old_target->successors[i]) { + struct ir3_block *succ = old_target->successors[i]; + _mesa_set_remove_key(succ->predecessors, old_target); + } + } +} + +static void +retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target) +{ + struct ir3_block *old_target = instr->cat0.target; + struct ir3_block *cur_block = instr->block; + + /* update current blocks successors to reflect the retargetting: */ + if (cur_block->successors[0] == old_target) { + cur_block->successors[0] = new_target; + } else { + debug_assert(cur_block->successors[1] == old_target); + cur_block->successors[1] = new_target; + } + + /* update new target's predecessors: */ + _mesa_set_add(new_target->predecessors, cur_block); + + /* and remove old_target's predecessor: */ + debug_assert(_mesa_set_search(old_target->predecessors, cur_block)); + _mesa_set_remove_key(old_target->predecessors, cur_block); + + if (old_target->predecessors->entries == 0) + remove_unused_block(old_target); + + instr->cat0.target = new_target; +} + static bool resolve_jump(struct ir3_instruction *instr) { @@ -408,8 +500,7 @@ resolve_jump(struct ir3_instruction *instr) struct ir3_instruction *target; if (tblock != instr->cat0.target) { - list_delinit(&instr->cat0.target->node); - instr->cat0.target = tblock; + retarget_jump(instr, tblock); return true; } @@ -450,8 +541,8 @@ resolve_jump(struct ir3_instruction *instr) static bool resolve_jumps(struct ir3 *ir) { - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + foreach_block (block, &ir->block_list) + foreach_instr (instr, &block->instr_list) if (is_flow(instr) && instr->cat0.target) if (resolve_jump(instr)) return true; @@ -459,82 +550,251 @@ resolve_jumps(struct ir3 *ir) return false; } -/* we want to mark points where divergent flow control re-converges - * with (jp) flags. For now, since we don't do any optimization for - * things that start out as a 'do {} while()', re-convergence points - * will always be a branch or jump target. Note that this is overly - * conservative, since unconditional jump targets are not convergence - * points, we are just assuming that the other path to reach the jump - * target was divergent. If we were clever enough to optimize the - * jump at end of a loop back to a conditional branch into a single - * conditional branch, ie. like: +static void mark_jp(struct ir3_block *block) +{ + struct ir3_instruction *target = list_first_entry(&block->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; +} + +/* Mark points where control flow converges or diverges. 
* - * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start - * mul.f r1.z, r1.z, r0.x - * mul.f r1.y, r1.y, r0.x - * mul.f r0.z, r1.x, r0.x - * mul.f r0.w, r0.y, r0.x - * cmps.f.ge r0.x, (r)c2.y, (r)r1.w - * add.s r0.x, (r)r0.x, (r)-1 - * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x - * cmps.f.eq p0.x, r0.x, c3.y - * mov.f32f32 r0.x, r1.w - * mov.f32f32 r0.y, r0.w - * mov.f32f32 r1.x, r0.z - * (rpt2)nop - * br !p0.x, #-13 - * (jp)mul.f r0.x, c263.y, r1.y + * Divergence points could actually be re-convergence points where + * "parked" threads are recoverged with threads that took the opposite + * path last time around. Possibly it is easier to think of (jp) as + * "the execution mask might have changed". + */ +static void +mark_xvergence_points(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + if (block->predecessors->entries > 1) { + /* if a block has more than one possible predecessor, then + * the first instruction is a convergence point. + */ + mark_jp(block); + } else if (block->predecessors->entries == 1) { + /* If a block has one predecessor, which has multiple possible + * successors, it is a divergence point. + */ + set_foreach(block->predecessors, entry) { + struct ir3_block *predecessor = (struct ir3_block *)entry->key; + if (predecessor->successors[1]) { + mark_jp(block); + } + } + } + } +} + +/* Insert the branch/jump instructions for flow control between blocks. + * Initially this is done naively, without considering if the successor + * block immediately follows the current block (ie. so no jump required), + * but that is cleaned up in resolve_jumps(). * - * Then we'd have to be more clever, as the convergence point is no - * longer a branch or jump target. + * TODO what ensures that the last write to p0.x in a block is the + * branch condition? Have we been getting lucky all this time? */ static void -mark_convergence_points(struct ir3 *ir) +block_sched(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + + debug_assert(block->condition); + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_B(block, block->condition, 0); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* "then" branch: */ + br = ir3_B(block, block->condition, 0); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + } +} + +/* Here we workaround the fact that kill doesn't actually kill the thread as + * GL expects. The last instruction always needs to be an end instruction, + * which means that if we're stuck in a loop where kill is the only way out, + * then we may have to jump out to the end. kill may also have the d3d + * semantics of converting the thread to a helper thread, rather than setting + * the exec mask to 0, in which case the helper thread could get stuck in an + * infinite loop. + * + * We do this late, both to give the scheduler the opportunity to reschedule + * kill instructions earlier and to avoid having to create a separate basic + * block. 
+ * + * TODO: Assuming that the wavefront doesn't stop as soon as all threads are + * killed, we might benefit by doing this more aggressively when the remaining + * part of the program after the kill is large, since that would let us + * skip over the instructions when there are no non-killed threads left. + */ +static void +kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) +{ + /* True if we know that this block will always eventually lead to the end + * block: + */ + bool always_ends = true; + bool added = false; + struct ir3_block *last_block = + list_last_entry(&ir->block_list, struct ir3_block, node); + + foreach_block_rev (block, &ir->block_list) { + for (unsigned i = 0; i < 2 && block->successors[i]; i++) { + if (block->successors[i]->start_ip <= block->end_ip) + always_ends = false; + } + + if (always_ends) + continue; + + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc != OPC_KILL) + continue; + + struct ir3_instruction *br = ir3_instr_create(block, OPC_B); + br->regs[1] = instr->regs[1]; + br->cat0.target = + list_last_entry(&ir->block_list, struct ir3_block, node); + + list_del(&br->node); + list_add(&br->node, &instr->node); + + added = true; + } + } + + if (added) { + /* I'm not entirely sure how the branchstack works, but we probably + * need to add at least one entry for the divergence which is resolved + * at the end: + */ + so->branchstack++; + + /* We don't update predecessors/successors, so we have to do this + * manually: + */ + mark_jp(last_block); + } +} + +/* Insert nop's required to make this a legal/valid shader program: */ +static void +nop_sched(struct ir3 *ir) { - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (is_flow(instr) && instr->cat0.target) { - struct ir3_instruction *target = - list_first_entry(&instr->cat0.target->instr_list, - struct ir3_instruction, node); - target->flags |= IR3_INSTR_JP; + foreach_block (block, &ir->block_list) { + struct ir3_instruction *last = NULL; + struct list_head instr_list; + + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); + + foreach_instr_safe (instr, &instr_list) { + unsigned delay = ir3_delay_calc(block, instr, false, true); + + /* NOTE: I think the nopN encoding works for a5xx and + * probably a4xx, but not a3xx. So far only tested on + * a6xx. 
+ */ + + if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last && + ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) && + (last->repeat == 0)) { + /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ + unsigned transfer = MIN2(delay, 3 - last->nop); + last->nop += transfer; + delay -= transfer; + } + + if ((delay > 0) && last && (last->opc == OPC_NOP)) { + /* the previous nop can encode at most 5 repeats: */ + unsigned transfer = MIN2(delay, 5 - last->repeat); + last->repeat += transfer; + delay -= transfer; } + + if (delay > 0) { + debug_assert(delay <= 6); + ir3_NOP(block)->repeat = delay - 1; + } + + list_addtail(&instr->node, &block->instr_list); + last = instr; } } } -void -ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary) +bool +ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) { struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); + bool mergedregs = so->mergedregs; bool progress; + ctx->so = so; ctx->max_bary = -1; ctx->compiler = ir->compiler; ctx->type = ir->type; /* allocate per-block data: */ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - block->data = rzalloc(ctx, struct ir3_legalize_block_data); + foreach_block (block, &ir->block_list) { + struct ir3_legalize_block_data *bd = + rzalloc(ctx, struct ir3_legalize_block_data); + + regmask_init(&bd->state.needs_ss_war, mergedregs); + regmask_init(&bd->state.needs_ss, mergedregs); + regmask_init(&bd->state.needs_sy, mergedregs); + + block->data = bd; } + ir3_remove_nops(ir); + /* process each block: */ do { progress = false; - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + foreach_block (block, &ir->block_list) { progress |= legalize_block(ctx, block); } } while (progress); - *has_ssbo = ctx->has_ssbo; - *need_pixlod = ctx->need_pixlod; *max_bary = ctx->max_bary; + block_sched(ir); + if (so->type == MESA_SHADER_FRAGMENT) + kill_sched(ir, so); + + foreach_block (block, &ir->block_list) { + progress |= apply_fine_deriv_macro(ctx, block); + } + + nop_sched(ir); + do { ir3_count_instructions(ir); } while(resolve_jumps(ir)); - mark_convergence_points(ir); + mark_xvergence_points(ir); ralloc_free(ctx); + + return true; }
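
The regmask bookkeeping that legalize_block() uses for (ss)/(sy) placement is easier to see outside of diff form. Below is a minimal standalone sketch of the same idea, assuming plain 64-bit masks and at most 64 scalar registers; the real pass uses regmask_t (which also understands half and merged registers), and every name below (sync_state, fake_instr, legalize_one) is invented for illustration only.

#include <stdbool.h>
#include <stdint.h>

/* pending-hazard state carried along a block, mirroring ir3_legalize_state */
struct sync_state {
   uint64_t needs_ss;     /* written by SFU/local-memory ops, result not yet waited on */
   uint64_t needs_ss_war; /* still read by in-flight tex/sfu/mem ops (WAR hazard) */
   uint64_t needs_sy;     /* written by tex/global-memory ops, result not yet waited on */
};

struct fake_instr {
   unsigned dst;
   unsigned srcs[4];
   unsigned num_srcs;
   bool is_sfu, is_tex, is_mem;
   bool flag_ss, flag_sy;  /* output: sync flags this instruction must carry */
};

static void
legalize_one(struct sync_state *st, struct fake_instr *n)
{
   /* reading a register with a pending SFU or tex result forces a sync
    * flag and clears the whole pending set, like the regmask_get() /
    * regmask_init() pairs in legalize_block():
    */
   for (unsigned i = 0; i < n->num_srcs; i++) {
      uint64_t bit = 1ull << n->srcs[i];
      if (st->needs_ss & bit) {
         n->flag_ss = true;
         st->needs_ss = st->needs_ss_war = 0;
      }
      if (st->needs_sy & bit) {
         n->flag_sy = true;
         st->needs_sy = 0;
      }
   }

   /* overwriting a register that an in-flight tex/sfu/mem op still reads
    * is a write-after-read hazard, also resolved with (ss):
    */
   if (st->needs_ss_war & (1ull << n->dst)) {
      n->flag_ss = true;
      st->needs_ss = st->needs_ss_war = 0;
   }

   /* finally, record what this instruction leaves pending for later ones: */
   if (n->is_sfu)
      st->needs_ss |= 1ull << n->dst;
   if (n->is_tex)
      st->needs_sy |= 1ull << n->dst;
   if (n->is_sfu || n->is_tex || n->is_mem)
      for (unsigned i = 0; i < n->num_srcs; i++)
         st->needs_ss_war |= 1ull << n->srcs[i];
}

At block boundaries the pass simply ORs each predecessor's outgoing state into the block's incoming state and re-runs until nothing changes, which is what the fixed-point loop in ir3_legalize() does.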
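
mark_xvergence_points() applies a purely structural rule. Here is a hedged sketch of just that rule over a simplified block type (sketch_block and its fields are stand-ins; the real pass walks ir3_block and sets IR3_INSTR_JP on the block's first instruction via mark_jp()):

#include <stdbool.h>
#include <stddef.h>

struct sketch_block {
   struct sketch_block *predecessors[8];
   unsigned num_predecessors;
   struct sketch_block *successors[2]; /* [1] non-NULL means a two-way branch */
   bool first_instr_gets_jp;
};

static void
mark_xvergence(struct sketch_block *block)
{
   if (block->num_predecessors > 1) {
      /* several paths can reach this block: convergence point */
      block->first_instr_gets_jp = true;
   } else if (block->num_predecessors == 1 &&
              block->predecessors[0]->successors[1] != NULL) {
      /* the single predecessor branches two ways: divergence point,
       * i.e. the execution mask may have changed on entry
       */
      block->first_instr_gets_jp = true;
   }
}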
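
nop_sched() tries to hide required delay cycles in existing encodings before emitting fresh nops. The sketch below shows only that folding order with stand-in types, keeping the limits used above: at most 3 trailing nop cycles in a cat2/cat3 instruction's (nopN) field (assumed safe only on a6xx, per the comment), at most 5 extra repeats on an existing nop, and a new (rptN)nop for whatever remains.

#include <stdbool.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

struct sketch_instr {
   bool is_nop;
   bool is_cat2_or_cat3;  /* ALU instruction that can carry a (nopN) suffix */
   unsigned nop;          /* trailing nop cycles folded into this instruction */
   unsigned repeat;       /* (rptN) count, also used on nop instructions */
};

/* returns how many delay cycles still need a fresh nop instruction */
static unsigned
fold_delay(struct sketch_instr *last, unsigned delay, bool is_a6xx)
{
   /* first try the previous cat2/cat3 instruction's (nopN) field: */
   if (delay > 0 && is_a6xx && last && last->is_cat2_or_cat3 && last->repeat == 0) {
      unsigned transfer = MIN2(delay, 3 - last->nop);
      last->nop += transfer;
      delay -= transfer;
   }

   /* then grow a previous nop's (rptN) count: */
   if (delay > 0 && last && last->is_nop) {
      unsigned transfer = MIN2(delay, 5 - last->repeat);
      last->repeat += transfer;
      delay -= transfer;
   }

   /* the caller emits a nop with repeat = delay - 1 for anything left over */
   return delay;
}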