X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_legalize.c;h=76362c4468cd3924489d2a2b20722ed85b7e1cbf;hb=2b93eb9c76f24785a3fbf4504f0157046041b1bc;hp=b14a789efb246c125fe94aae5d337228254a9128;hpb=cb884d8ab210b4793eb55852b4f07642c71a99a5;p=mesa.git diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index b14a789efb2..76362c4468c 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -41,8 +41,9 @@ struct ir3_legalize_ctx { struct ir3_compiler *compiler; - int num_samp; + gl_shader_stage type; bool has_ssbo; + bool need_pixlod; int max_bary; }; @@ -86,10 +87,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) struct list_head instr_list; struct ir3_legalize_state prev_state = bd->state; struct ir3_legalize_state *state = &bd->state; + bool last_input_needs_ss = false; + bool has_tex_prefetch = false; /* our input state is the OR of all predecessor blocks' state: */ - for (unsigned i = 0; i < block->predecessors_count; i++) { - struct ir3_legalize_block_data *pbd = block->predecessors[i]->data; + set_foreach(block->predecessors, entry) { + struct ir3_block *predecessor = (struct ir3_block *)entry->key; + struct ir3_legalize_block_data *pbd = predecessor->data; struct ir3_legalize_state *pstate = &pbd->state; /* Our input (ss)/(sy) state is based on OR'ing the output @@ -115,7 +119,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); - if (is_meta(n)) + /* _meta::tex_prefetch instructions removed later in + * collect_tex_prefetches() + */ + if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH)) continue; if (is_input(n)) { @@ -124,8 +131,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val); } - if (last_n && is_barrier(last_n)) + if (last_n && is_barrier(last_n)) { n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + last_input_needs_ss = false; + } /* NOTE: consider dst register too.. it could happen that * texture sample instruction (for example) writes some @@ -144,6 +153,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ if (regmask_get(&state->needs_ss, reg)) { n->flags |= IR3_INSTR_SS; + last_input_needs_ss = false; regmask_init(&state->needs_ss_war); regmask_init(&state->needs_ss); } @@ -166,6 +176,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) reg = n->regs[0]; if (regmask_get(&state->needs_ss_war, reg)) { n->flags |= IR3_INSTR_SS; + last_input_needs_ss = false; regmask_init(&state->needs_ss_war); regmask_init(&state->needs_ss); } @@ -212,29 +223,38 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } } - list_addtail(&n->node, &block->instr_list); + if (ctx->compiler->samgq_workaround && + ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) { + struct ir3_instruction *samgp; + + for (i = 0; i < 4; i++) { + samgp = ir3_instr_clone(n); + samgp->opc = OPC_SAMGP0 + i; + if (i > 1) + samgp->flags |= IR3_INSTR_SY; + } + list_delinit(&n->node); + } else { + list_addtail(&n->node, &block->instr_list); + } if (is_sfu(n)) regmask_set(&state->needs_ss, n->regs[0]); - if (is_tex(n)) { - /* this ends up being the # of samp instructions.. but that - * is ok, everything else only cares whether it is zero or - * not. 
We do this here, rather than when we encounter a - * SAMP decl, because (especially in binning pass shader) - * the samp instruction(s) could get eliminated if the - * result is not used. - */ - ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1); + if (is_tex(n) || (n->opc == OPC_META_TEX_PREFETCH)) { regmask_set(&state->needs_sy, n->regs[0]); + ctx->need_pixlod = true; + if (n->opc == OPC_META_TEX_PREFETCH) + has_tex_prefetch = true; } else if (n->opc == OPC_RESINFO) { regmask_set(&state->needs_ss, n->regs[0]); ir3_NOP(block)->flags |= IR3_INSTR_SS; + last_input_needs_ss = false; } else if (is_load(n)) { /* seems like ldlv needs (ss) bit instead?? which is odd but * makes a bunch of flat-varying tests start working on a4xx. */ - if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) + if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || (n->opc == OPC_LDLW)) regmask_set(&state->needs_ss, n->regs[0]); else regmask_set(&state->needs_sy, n->regs[0]); @@ -264,13 +284,17 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } } - if (is_input(n)) + if (is_input(n)) { last_input = n; + last_input_needs_ss |= (n->opc == OPC_LDLV); + } last_n = n; } if (last_input) { + assert(block == list_first_entry(&block->shader->block_list, + struct ir3_block, node)); /* special hack.. if using ldlv to bypass interpolation, * we need to insert a dummy bary.f on which we can set * the (ei) flag: @@ -280,7 +304,6 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) /* (ss)bary.f (ei)r63.x, 0, r0.x */ baryf = ir3_instr_create(block, OPC_BARY_F); - baryf->flags |= IR3_INSTR_SS; ir3_reg_create(baryf, regid(63, 0), 0); ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; ir3_reg_create(baryf, regid(0, 0), 0); @@ -290,8 +313,31 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) list_add(&baryf->node, &last_input->node); last_input = baryf; + + /* by definition, we need (ss) since we are inserting + * the dummy bary.f immediately after the ldlv: + */ + last_input_needs_ss = true; } last_input->regs[0]->flags |= IR3_REG_EI; + if (last_input_needs_ss) + last_input->flags |= IR3_INSTR_SS; + } else if (has_tex_prefetch) { + /* texture prefetch, but *no* inputs.. 
we need to insert a + * dummy bary.f at the top of the shader to unblock varying + * storage: + */ + struct ir3_instruction *baryf; + + /* (ss)bary.f (ei)r63.x, 0, r0.x */ + baryf = ir3_instr_create(block, OPC_BARY_F); + ir3_reg_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI; + ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; + ir3_reg_create(baryf, regid(0, 0), 0); + + /* insert the dummy bary.f at head: */ + list_delinit(&baryf->node); + list_add(&baryf->node, &block->instr_list); } if (last_rel) @@ -376,6 +422,47 @@ resolve_dest_block(struct ir3_block *block) return block; } +static void +remove_unused_block(struct ir3_block *old_target) +{ + list_delinit(&old_target->node); + + /* cleanup dangling predecessors: */ + for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) { + if (old_target->successors[i]) { + struct ir3_block *succ = old_target->successors[i]; + _mesa_set_remove_key(succ->predecessors, old_target); + } + } +} + +static void +retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target) +{ + struct ir3_block *old_target = instr->cat0.target; + struct ir3_block *cur_block = instr->block; + + /* update current blocks successors to reflect the retargetting: */ + if (cur_block->successors[0] == old_target) { + cur_block->successors[0] = new_target; + } else { + debug_assert(cur_block->successors[1] == old_target); + cur_block->successors[1] = new_target; + } + + /* update new target's predecessors: */ + _mesa_set_add(new_target->predecessors, cur_block); + + /* and remove old_target's predecessor: */ + debug_assert(_mesa_set_search(old_target->predecessors, cur_block)); + _mesa_set_remove_key(old_target->predecessors, cur_block); + + if (old_target->predecessors->entries == 0) + remove_unused_block(old_target); + + instr->cat0.target = new_target; +} + static bool resolve_jump(struct ir3_instruction *instr) { @@ -384,8 +471,7 @@ resolve_jump(struct ir3_instruction *instr) struct ir3_instruction *target; if (tblock != instr->cat0.target) { - list_delinit(&instr->cat0.target->node); - instr->cat0.target = tblock; + retarget_jump(instr, tblock); return true; } @@ -407,7 +493,7 @@ resolve_jump(struct ir3_instruction *instr) else next_block = 1; - if ((!target) || (target->ip == (instr->ip + next_block))) { + if (target->ip == (instr->ip + next_block)) { list_delinit(&instr->node); return true; } else { @@ -435,58 +521,52 @@ resolve_jumps(struct ir3 *ir) return false; } -/* we want to mark points where divergent flow control re-converges - * with (jp) flags. For now, since we don't do any optimization for - * things that start out as a 'do {} while()', re-convergence points - * will always be a branch or jump target. Note that this is overly - * conservative, since unconditional jump targets are not convergence - * points, we are just assuming that the other path to reach the jump - * target was divergent. If we were clever enough to optimize the - * jump at end of a loop back to a conditional branch into a single - * conditional branch, ie. 
like: - * - * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start - * mul.f r1.z, r1.z, r0.x - * mul.f r1.y, r1.y, r0.x - * mul.f r0.z, r1.x, r0.x - * mul.f r0.w, r0.y, r0.x - * cmps.f.ge r0.x, (r)c2.y, (r)r1.w - * add.s r0.x, (r)r0.x, (r)-1 - * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x - * cmps.f.eq p0.x, r0.x, c3.y - * mov.f32f32 r0.x, r1.w - * mov.f32f32 r0.y, r0.w - * mov.f32f32 r1.x, r0.z - * (rpt2)nop - * br !p0.x, #-13 - * (jp)mul.f r0.x, c263.y, r1.y +static void mark_jp(struct ir3_block *block) +{ + struct ir3_instruction *target = list_first_entry(&block->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; +} + +/* Mark points where control flow converges or diverges. * - * Then we'd have to be more clever, as the convergence point is no - * longer a branch or jump target. + * Divergence points could actually be re-convergence points where + * "parked" threads are recoverged with threads that took the opposite + * path last time around. Possibly it is easier to think of (jp) as + * "the execution mask might have changed". */ static void -mark_convergence_points(struct ir3 *ir) +mark_xvergence_points(struct ir3 *ir) { list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (is_flow(instr) && instr->cat0.target) { - struct ir3_instruction *target = - list_first_entry(&instr->cat0.target->instr_list, - struct ir3_instruction, node); - target->flags |= IR3_INSTR_JP; + if (block->predecessors->entries > 1) { + /* if a block has more than one possible predecessor, then + * the first instruction is a convergence point. + */ + mark_jp(block); + } else if (block->predecessors->entries == 1) { + /* If a block has one predecessor, which has multiple possible + * successors, it is a divergence point. + */ + set_foreach(block->predecessors, entry) { + struct ir3_block *predecessor = (struct ir3_block *)entry->key; + if (predecessor->successors[1]) { + mark_jp(block); + } } } } } void -ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary) +ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary) { struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); bool progress; ctx->max_bary = -1; ctx->compiler = ir->compiler; + ctx->type = ir->type; /* allocate per-block data: */ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { @@ -501,15 +581,15 @@ ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary) } } while (progress); - *num_samp = ctx->num_samp; *has_ssbo = ctx->has_ssbo; + *need_pixlod = ctx->need_pixlod; *max_bary = ctx->max_bary; do { ir3_count_instructions(ir); } while(resolve_jumps(ir)); - mark_convergence_points(ir); + mark_xvergence_points(ir); ralloc_free(ctx); }
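
The core mechanism in legalize_block() above is the needs_ss/needs_sy scoreboarding: instructions whose results land asynchronously mark their destination register in a regmask, and the first later instruction that touches a marked register gets the matching (ss)/(sy) sync bit and resets the mask. Below is a stripped-down sketch of the (sy) half of that pattern, assuming ir3.h's regmask helpers; track_sy(), producer and consumer are illustrative names, not part of the pass.

/* Minimal sketch of the regmask-based (sy) tracking used above.
 * Assumes ir3.h's regmask_t / regmask_init / regmask_set / regmask_get;
 * 'producer' and 'consumer' are placeholder instructions, and the real
 * pass keeps this state per-block rather than per-call.
 */
#include "ir3.h"

static void
track_sy(struct ir3_instruction *producer,
	 struct ir3_instruction *consumer,
	 regmask_t *needs_sy)
{
	/* texture fetches (and now tex-prefetch metas) complete out of
	 * band, so remember which register they write:
	 */
	if (is_tex(producer) || (producer->opc == OPC_META_TEX_PREFETCH))
		regmask_set(needs_sy, producer->regs[0]);

	/* the first instruction reading any marked register must carry
	 * (sy); since (sy) waits on all outstanding fetches, the whole
	 * mask can be cleared once the flag is set:
	 */
	for (unsigned i = 1; i < consumer->regs_count; i++) {
		if (regmask_get(needs_sy, consumer->regs[i])) {
			consumer->flags |= IR3_INSTR_SY;
			regmask_init(needs_sy);
			break;
		}
	}
}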
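
The exported entry point also changes shape: the old num_samp count is dropped and a need_pixlod flag is reported instead, set whenever a texture fetch or tex-prefetch meta is seen, alongside the existing has_ssbo and max_bary outputs. A minimal call-site sketch for the new signature follows; only the ir3_legalize() prototype comes from this patch, while the variant struct and its field names are assumptions for illustration.

/* Hypothetical caller adapting to the new ir3_legalize() signature. */
#include "ir3.h"

struct example_variant {
	bool has_ssbo;
	bool need_pixlod;   /* replaces the old num_samp count */
	int max_bary;
};

static void
legalize_variant(struct ir3 *ir, struct example_variant *v)
{
	/* max_bary comes back as -1 when no varying fetches were seen: */
	ir3_legalize(ir, &v->has_ssbo, &v->need_pixlod, &v->max_bary);

	/* downstream consumers only ever cared whether any sample
	 * instructions were present, which is exactly what need_pixlod
	 * now answers.
	 */
}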