X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fir3%2Fir3_ra.c;h=94386ed46942979525cf0a5eed4d0b8c970f2208;hb=8b3ac7084ab71807850416fc1324c5ca0a42e01e;hp=fa379c3495b9c95eaf7a43075c34ce1b8228b63b;hpb=6347c2ea89bde624dd16cff6741db57e89d88ad5;p=mesa.git diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index fa379c3495b..94386ed4694 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -264,7 +264,7 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (instr->regs_count == 0) continue; /* couple special cases: */ - if (writes_addr(instr) || writes_pred(instr)) { + if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) { id->cls = -1; } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { id->cls = total_class_count; @@ -358,10 +358,44 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) } } +/** + * Set a value for max register target. + * + * Currently this just rounds up to a multiple of full-vec4 (ie. the + * granularity that we configure the hw for.. there is no point to + * using r3.x if you aren't going to make r3.yzw available). But + * in reality there seems to be multiple thresholds that affect the + * number of waves.. and we should round up the target to the next + * threshold when we round-robin registers, to give postsched more + * options. When we understand that better, this is where we'd + * implement that. + */ +static void +ra_set_register_target(struct ir3_ra_ctx *ctx, unsigned max_target) +{ + const unsigned hvec4 = 4; + const unsigned vec4 = 2 * hvec4; + + ctx->max_target = align(max_target, vec4); + + d("New max_target=%u", ctx->max_target); +} + static int pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max) { - for (unsigned i = min; i < max; i++) { + for (unsigned i = min; i <= max; i++) { + if (BITSET_TEST(regs, i)) { + return i; + } + } + return -1; +} + +static int +pick_in_range_rev(BITSET_WORD *regs, int min, int max) +{ + for (int i = max; i >= min; i--) { if (BITSET_TEST(regs, i)) { return i; } @@ -375,6 +409,10 @@ ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) { struct ir3_ra_ctx *ctx = data; unsigned int class = ra_get_node_class(ctx->g, n); + bool half, high; + int sz = ra_class_to_size(class, &half, &high); + + assert (sz > 0); /* dimensions within the register class: */ unsigned max_target, start; @@ -387,23 +425,51 @@ ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) */ unsigned base; + /* TODO I think eventually we want to round-robin in vector pass + * as well, but needs some more work to calculate # of live vals + * for this. (Maybe with some work, we could just figure out + * the scalar target and use that, since that is what we care + * about in the end.. but that would mean setting up use-def/ + * liveranges for scalar pass before doing vector pass.) + * + * For now, in the vector class, just move assignments for scalar + * vals higher to hopefully prevent them from limiting where vecN + * values can be placed. Since the scalar values are re-assigned + * in the 2nd pass, we don't really care where they end up in the + * vector pass. 
+ */ + if (!ctx->scalar_pass) { + base = ctx->set->gpr_to_ra_reg[class][0]; + if (high) { + max_target = HIGH_CLASS_REGS(class - HIGH_OFFSET); + } else if (half) { + max_target = HALF_CLASS_REGS(class - HALF_OFFSET); + } else { + max_target = CLASS_REGS(class); + } + + if ((sz == 1) && !high) { + return pick_in_range_rev(regs, base, base + max_target); + } else { + return pick_in_range(regs, base, base + max_target); + } + } else { + assert(sz == 1); + } + /* NOTE: this is only used in scalar pass, so the register * class will be one of the scalar classes (ie. idx==0): */ - if (class == ctx->set->high_classes[0]) { + base = ctx->set->gpr_to_ra_reg[class][0]; + if (high) { max_target = HIGH_CLASS_REGS(0); start = 0; - base = ctx->set->gpr_to_ra_reg[HIGH_OFFSET][0]; - } else if (class == ctx->set->half_classes[0]) { + } else if (half) { max_target = ctx->max_target; start = ctx->start_search_reg; - base = ctx->set->gpr_to_ra_reg[HALF_OFFSET][0]; - } else if (class == ctx->set->classes[0]) { + } else { max_target = ctx->max_target / 2; start = ctx->start_search_reg; - base = ctx->set->gpr_to_ra_reg[0][0]; - } else { - unreachable("unexpected register class!"); } /* For cat4 instructions, if the src reg is already assigned, and @@ -412,9 +478,16 @@ ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) * for write after read hazards: */ struct ir3_instruction *instr = name_to_instr(ctx, n); - if (is_sfu(instr) && instr->regs[1]->instr) { - struct ir3_instruction *src = instr->regs[1]->instr; - unsigned src_n = scalar_name(ctx, src, 0); + if (is_sfu(instr)) { + struct ir3_register *src = instr->regs[1]; + int src_n; + + if ((src->flags & IR3_REG_ARRAY) && !(src->flags & IR3_REG_RELATIV)) { + struct ir3_array *arr = ir3_lookup_array(ctx->ir, src->array.id); + src_n = arr->base + src->array.offset; + } else { + src_n = scalar_name(ctx, src->instr, 0); + } unsigned reg = ra_get_node_reg(ctx->g, src_n); @@ -424,6 +497,19 @@ ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) return reg; } } + } else if (is_tex_or_prefetch(instr)) { + /* we could have a tex fetch w/ wrmask .z, for example.. these + * cannot land in r0.x since that would underflow when we + * subtract the offset. Ie. 
if we pick r0.z, and subtract + * the offset, the register encoded for dst will be r0.x + */ + unsigned n = ffs(instr->regs[0]->wrmask); + debug_assert(n > 0); + unsigned offset = n - 1; + if (!half) + offset *= 2; + base += offset; + max_target -= offset; } int r = pick_in_range(regs, base + start, base + max_target); @@ -434,7 +520,7 @@ ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) if (r < 0) { /* overflow, we need to increase max_target: */ - ctx->max_target++; + ra_set_register_target(ctx, ctx->max_target + 1); return ra_select_reg_merged(n, regs, data); } @@ -455,7 +541,7 @@ ra_init(struct ir3_ra_ctx *ctx) unsigned n, base; ir3_clear_mark(ctx->ir); - n = ir3_count_instructions(ctx->ir); + n = ir3_count_instructions_ra(ctx->ir); ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); @@ -491,11 +577,13 @@ ra_init(struct ir3_ra_ctx *ctx) ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); /* TODO add selector callback for split (pre-a6xx) register file: */ - if (ctx->scalar_pass && (ctx->ir->compiler->gpu_id >= 600)) { + if (ctx->ir->compiler->gpu_id >= 600) { ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx); - ctx->name_to_instr = _mesa_hash_table_create(ctx->g, - _mesa_hash_int, _mesa_key_int_equal); + if (ctx->scalar_pass) { + ctx->name_to_instr = _mesa_hash_table_create(ctx->g, + _mesa_hash_int, _mesa_key_int_equal); + } } } @@ -541,6 +629,11 @@ __def(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name, struct ir3_instruction *instr) { debug_assert(name < ctx->alloc_count); + + /* split/collect do not actually define any real value */ + if ((instr->opc == OPC_META_SPLIT) || (instr->opc == OPC_META_COLLECT)) + return; + /* defined on first write: */ if (!ctx->def[name]) ctx->def[name] = instr->ip; @@ -720,6 +813,187 @@ print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) debug_printf("\n"); } +/* size of one component of instruction result, ie. half vs full: */ +static unsigned +live_size(struct ir3_instruction *instr) +{ + if (is_half(instr)) { + return 1; + } else if (is_high(instr)) { + /* doesn't count towards footprint */ + return 0; + } else { + return 2; + } +} + +static unsigned +name_size(struct ir3_ra_ctx *ctx, unsigned name) +{ + if (name_is_array(ctx, name)) { + struct ir3_array *arr = name_to_array(ctx, name); + return arr->half ? 1 : 2; + } else { + struct ir3_instruction *instr = name_to_instr(ctx, name); + /* in scalar pass, each name represents on scalar value, + * half or full precision + */ + return live_size(instr); + } +} + +static unsigned +ra_calc_block_live_values(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_ra_block_data *bd = block->data; + unsigned name; + + assert(ctx->name_to_instr); + + /* TODO this gets a bit more complicated in non-scalar pass.. but + * possibly a lowball estimate is fine to start with if we do + * round-robin in non-scalar pass? Maybe we just want to handle + * that in a different fxn? 
+ */ + assert(ctx->scalar_pass); + + BITSET_WORD *live = + rzalloc_array(bd, BITSET_WORD, BITSET_WORDS(ctx->alloc_count)); + + /* Add the live input values: */ + unsigned livein = 0; + BITSET_FOREACH_SET (name, bd->livein, ctx->alloc_count) { + livein += name_size(ctx, name); + BITSET_SET(live, name); + } + + d("---------------------"); + d("block%u: LIVEIN: %u", block_id(block), livein); + + unsigned max = livein; + int cur_live = max; + + /* Now that we know the live inputs to the block, iterate the + * instructions adjusting the current # of live values as we + * see their last use: + */ + foreach_instr (instr, &block->instr_list) { + if (RA_DEBUG) + print_bitset("LIVE", live, ctx->alloc_count); + di(instr, "CALC"); + + unsigned new_live = 0; /* newly live values */ + unsigned new_dead = 0; /* newly no-longer live values */ + unsigned next_dead = 0; /* newly dead following this instr */ + + foreach_def (name, ctx, instr) { + /* NOTE: checking ctx->def filters out things like split/ + * collect which are just redefining existing live names + * or array writes to already live array elements: + */ + if (ctx->def[name] != instr->ip) + continue; + new_live += live_size(instr); + d("NEW_LIVE: %u (new_live=%u, use=%u)", name, new_live, ctx->use[name]); + BITSET_SET(live, name); + /* There can be cases where this is *also* the last use + * of a value, for example instructions that write multiple + * values, only some of which are used. These values are + * dead *after* (rather than during) this instruction. + */ + if (ctx->use[name] != instr->ip) + continue; + next_dead += live_size(instr); + d("NEXT_DEAD: %u (next_dead=%u)", name, next_dead); + BITSET_CLEAR(live, name); + } + + /* To be more resilient against special cases where liverange + * is extended (like first_non_input), rather than using the + * foreach_use() iterator, we iterate the current live values + * instead: + */ + BITSET_FOREACH_SET (name, live, ctx->alloc_count) { + /* Is this the last use? */ + if (ctx->use[name] != instr->ip) + continue; + new_dead += name_size(ctx, name); + d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); + BITSET_CLEAR(live, name); + } + + cur_live += new_live; + cur_live -= new_dead; + + assert(cur_live >= 0); + d("CUR_LIVE: %u", cur_live); + + max = MAX2(max, cur_live); + + /* account for written values which are not used later, + * but after updating max (since they are for one cycle + * live) + */ + cur_live -= next_dead; + assert(cur_live >= 0); + + if (RA_DEBUG) { + unsigned cnt = 0; + BITSET_FOREACH_SET (name, live, ctx->alloc_count) { + cnt += name_size(ctx, name); + } + assert(cur_live == cnt); + } + } + + d("block%u max=%u", block_id(block), max); + + /* the remaining live should match liveout (for extra sanity testing): */ + if (RA_DEBUG) { + unsigned new_dead = 0; + BITSET_FOREACH_SET (name, live, ctx->alloc_count) { + /* Is this the last use? */ + if (ctx->use[name] != block->end_ip) + continue; + new_dead += name_size(ctx, name); + d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); + BITSET_CLEAR(live, name); + } + unsigned liveout = 0; + BITSET_FOREACH_SET (name, bd->liveout, ctx->alloc_count) { + liveout += name_size(ctx, name); + BITSET_CLEAR(live, name); + } + + if (cur_live != liveout) { + print_bitset("LEAKED", live, ctx->alloc_count); + /* TODO there are a few edge cases where live-range extension + * tells us a value is livein. But not used by the block or + * liveout for the block. Possibly a bug in the liverange + * extension. 
But for now leave the assert disabled: + assert(cur_live == liveout); + */ + } + } + + ralloc_free(live); + + return max; +} + +static unsigned +ra_calc_max_live_values(struct ir3_ra_ctx *ctx) +{ + unsigned max = 0; + + foreach_block (block, &ctx->ir->block_list) { + unsigned block_live = ra_calc_block_live_values(ctx, block); + max = MAX2(max, block_live); + } + + return max; +} + static void ra_add_interference(struct ir3_ra_ctx *ctx) { @@ -795,13 +1069,18 @@ ra_add_interference(struct ir3_ra_ctx *ctx) if (BITSET_TEST(bd->livein, i + arr->base)) { arr->start_ip = MIN2(arr->start_ip, block->start_ip); } - if (BITSET_TEST(bd->livein, i + arr->base)) { + if (BITSET_TEST(bd->liveout, i + arr->base)) { arr->end_ip = MAX2(arr->end_ip, block->end_ip); } } } } + if (ctx->name_to_instr) { + unsigned max = ra_calc_max_live_values(ctx); + ra_set_register_target(ctx, max); + } + for (unsigned i = 0; i < ctx->alloc_count; i++) { for (unsigned j = 0; j < ctx->alloc_count; j++) { if (intersects(ctx->def[i], ctx->use[i], @@ -819,39 +1098,6 @@ static void fixup_half_instr_dst(struct ir3_instruction *instr) case 1: /* move instructions */ instr->cat1.dst_type = half_type(instr->cat1.dst_type); break; - case 3: - switch (instr->opc) { - case OPC_MAD_F32: - /* Available for that dest is half and srcs are full. - * eg. mad.f32 hr0, r0.x, r0.y, r0.z - */ - if (instr->regs[1]->flags & IR3_REG_HALF) - instr->opc = OPC_MAD_F16; - break; - case OPC_SEL_B32: - instr->opc = OPC_SEL_B16; - break; - case OPC_SEL_S32: - instr->opc = OPC_SEL_S16; - break; - case OPC_SEL_F32: - instr->opc = OPC_SEL_F16; - break; - case OPC_SAD_S32: - instr->opc = OPC_SAD_S16; - break; - /* instructions may already be fixed up: */ - case OPC_MAD_F16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - break; - default: - assert(0); - break; - } - break; case 4: switch (instr->opc) { case OPC_RSQ: @@ -879,6 +1125,21 @@ static void fixup_half_instr_src(struct ir3_instruction *instr) case OPC_MOV: instr->cat1.src_type = half_type(instr->cat1.src_type); break; + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; default: break; } @@ -945,41 +1206,15 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, } } -static void -account_assignment(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) -{ - struct ir3_ra_instr_data *id; - struct ir3_register *dst = instr->regs[0]; - unsigned max; - - if (is_high(instr)) - return; - - if (dst->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, dst->array.id); - max = arr->reg + arr->length; - } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { - unsigned name = scalar_name(ctx, id->defn, 0); - unsigned r = ra_get_node_reg(ctx->g, name); - max = ctx->set->ra_reg_to_gpr[r] + id->off + dest_regs(id->defn); - } else { - return; - } - - if (is_half(instr)) { - ctx->max_half_assigned = MAX2(ctx->max_half_assigned, max); - } else { - ctx->max_assigned = MAX2(ctx->max_assigned, max); - } -} - /* helper to determine which regs to assign in which pass: */ static bool should_assign(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) { - if ((instr->opc == OPC_META_SPLIT) || - (instr->opc == OPC_META_COLLECT)) + if ((instr->opc == OPC_META_SPLIT) && + (util_bitcount(instr->regs[1]->wrmask) > 1)) + return 
!ctx->scalar_pass; + if ((instr->opc == OPC_META_COLLECT) && + (util_bitcount(instr->regs[0]->wrmask) > 1)) return !ctx->scalar_pass; return ctx->scalar_pass; } @@ -991,7 +1226,6 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_register *reg; if (writes_gpr(instr)) { - account_assignment(ctx, instr); if (should_assign(ctx, instr)) { reg_assign(ctx, instr->regs[0], instr); if (instr->regs[0]->flags & IR3_REG_HALF) @@ -1040,7 +1274,6 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) static void ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned nprecolor) { - unsigned num_precolor = 0; for (unsigned i = 0; i < nprecolor; i++) { if (precolor[i] && !(precolor[i]->flags & IR3_INSTR_UNUSED)) { struct ir3_instruction *instr = precolor[i]; @@ -1080,7 +1313,6 @@ ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned unsigned reg = ctx->set->gpr_to_ra_reg[id->cls][regid]; unsigned name = ra_name(ctx, id); ra_set_node_reg(ctx->g, name, reg); - num_precolor = MAX2(regid, num_precolor); } } @@ -1215,8 +1447,10 @@ ra_precolor_assigned(struct ir3_ra_ctx *ctx) foreach_block (block, &ctx->ir->block_list) { foreach_instr (instr, &block->instr_list) { - if ((instr->opc != OPC_META_SPLIT) && - (instr->opc != OPC_META_COLLECT)) + if (!writes_gpr(instr)) + continue; + + if (should_assign(ctx, instr)) continue; precolor(ctx, instr); @@ -1269,32 +1503,9 @@ ra_sanity_check(struct ir3 *ir) } } -/* Target is calculated in terms of half-regs (with a full reg - * consisting of two half-regs). - */ -static void -ra_calc_merged_register_target(struct ir3_ra_ctx *ctx) -{ - const unsigned vec4 = 2 * 4; // 8 half-regs - unsigned t = MAX2(2 * ctx->max_assigned, ctx->max_half_assigned); - - /* second RA pass may have saved some regs, let's try to reclaim - * the benefit by adjusting the target downwards slightly: - */ - if (ir3_has_latency_to_hide(ctx->ir)) { - if (t > 8 * vec4) { - t -= 2 * vec4; - } else if (t > 6 * vec4) { - t -= vec4; - } - } - - ctx->max_target = t; -} - static int ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor, bool scalar_pass, unsigned *target) + unsigned nprecolor, bool scalar_pass) { struct ir3_ra_ctx ctx = { .v = v, @@ -1304,10 +1515,6 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, }; int ret; - if (scalar_pass) { - ctx.max_target = *target; - } - ra_init(&ctx); ra_add_interference(&ctx); ra_precolor(&ctx, precolor, nprecolor); @@ -1316,17 +1523,6 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, ret = ra_alloc(&ctx); ra_destroy(&ctx); - /* In the first pass, calculate the target register usage used in the - * second (scalar) pass: - */ - if (!scalar_pass) { - /* TODO: round-robin support for pre-a6xx: */ - if (ctx.ir->compiler->gpu_id >= 600) { - ra_calc_merged_register_target(&ctx); - } - *target = ctx.max_target; - } - return ret; } @@ -1334,11 +1530,10 @@ int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor) { - unsigned target = 0; int ret; /* First pass, assign the vecN (non-scalar) registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, false, &target); + ret = ir3_ra_pass(v, precolor, nprecolor, false); if (ret) return ret; @@ -1348,7 +1543,7 @@ ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, } /* Second pass, assign the scalar registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, true, &target); + ret = 
ir3_ra_pass(v, precolor, nprecolor, true); if (ret) return ret;
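
The patch above introduces ra_set_register_target(), which rounds the scalar-pass register target up to full-vec4 granularity before it is used to bound round-robin selection. The following is a minimal standalone sketch (not part of the patch) of that rounding behaviour: targets are counted in half-regs, so one full vec4 is 2 * 4 = 8 half-regs. The align_up() helper is a local stand-in for the align() macro the patch calls, assumed to round up to a power-of-two multiple; the numeric values are illustrative only.

/*
 * Illustrative sketch of the vec4 rounding done by ra_set_register_target().
 */
#include <assert.h>

static unsigned
align_up(unsigned value, unsigned alignment)
{
	/* power-of-two alignment only, as assumed for the real helper */
	return (value + alignment - 1) & ~(alignment - 1);
}

int
main(void)
{
	const unsigned hvec4 = 4;
	const unsigned vec4 = 2 * hvec4;	/* 8 half-regs per full vec4 */

	/* a live-value estimate of 13 half-regs still rounds up to two full vec4s: */
	assert(align_up(13, vec4) == 16);

	/* an already-aligned target is left unchanged: */
	assert(align_up(16, vec4) == 16);

	return 0;
}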