diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 74755eb3bc0..f70c779525b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -31,6 +31,8 @@
 #include "util/ralloc.h"
 #include "util/bitset.h"
 
+#include "freedreno_util.h"
+
 #include "ir3.h"
 #include "ir3_compiler.h"
 
@@ -68,25 +70,24 @@
  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
  * register assignment. But for us that is horrible from a scheduling
  * standpoint. Instead what we do is use idea of 'definer' instruction.
- * Ie. the first instruction (lowest ip) to write to the array is the
+ * Ie. the first instruction (lowest ip) to write to the variable is the
  * one we consider from use/def perspective when building interference
- * graph. (Other instructions which write other array elements just
- * define the variable some more.)
+ * graph. (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers. Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored. In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements. (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
  */
 
 static const unsigned class_sizes[] = {
 	1, 2, 3, 4,
 	4 + 4, /* txd + 1d/2d */
 	4 + 6, /* txd + 3d */
-	/* temporary: until we can assign arrays, create classes so we
-	 * can round up array to fit. NOTE with tgsi arrays should
-	 * really all be multiples of four:
-	 */
-	4 * 4,
-	4 * 8,
-	4 * 16,
-	4 * 32,
-
 };
 
 #define class_count ARRAY_SIZE(class_sizes)
@@ -97,7 +98,7 @@ static const unsigned half_class_sizes[] = {
 #define total_class_count (class_count + half_class_count)
 
 /* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS (4 * (REG_A0 - 1))
+#define NUM_REGS (4 * 48)
 /* Number of virtual regs in a given class: */
 #define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1))
 #define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
@@ -265,13 +266,21 @@ struct ir3_ra_ctx {
 	struct ir3_ra_reg_set *set;
 	struct ra_graph *g;
 	unsigned alloc_count;
-	unsigned class_alloc_count[total_class_count];
-	unsigned class_base[total_class_count];
+	/* one per class, plus one slot for arrays: */
+	unsigned class_alloc_count[total_class_count + 1];
+	unsigned class_base[total_class_count + 1];
 	unsigned instr_cnt;
 	unsigned *def, *use; /* def/use table */
 	struct ir3_ra_instr_data *instrd;
 };
 
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+	return !((a_start >= b_end) || (b_start >= a_end));
+}
+
 static bool
 is_half(struct ir3_instruction *instr)
 {
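
Note on the intersects() helper added above: it treats a live range as a half-open interval [start, end), so two ranges conflict only if each one begins strictly before the other ends; a range that ends exactly where another begins does not conflict. A minimal standalone sketch (not part of the patch; the ip values are invented for illustration):

#include <assert.h>
#include <stdbool.h>

/* same test as the intersects() helper added by this patch */
static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
	return !((a_start >= b_end) || (b_start >= a_end));
}

int main(void)
{
	assert(intersects(2, 5, 4, 9));   /* ip 4 lies in both ranges */
	assert(!intersects(2, 5, 5, 9));  /* touching endpoints: no conflict */
	assert(!intersects(7, 8, 2, 5));  /* disjoint ranges */
	return 0;
}

The same helper is reused twice later in the patch: once on instruction ips (live ranges) and once on register indices (array placement), which is why it takes plain unsigned bounds rather than a dedicated range type.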
@@ -329,16 +338,13 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 	struct ir3_instruction *d = NULL;
 
-	if (instr->fanin)
-		return get_definer(ctx, instr->fanin, sz, off);
-
 	if (id->defn) {
 		*sz = id->sz;
 		*off = id->off;
 		return id->defn;
 	}
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+	if (instr->opc == OPC_META_FI) {
 		/* What about the case where collect is subset of array, we
 		 * need to find the distance between where actual array starts
 		 * and fanin.. that probably doesn't happen currently.
@@ -432,7 +438,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 		}
 	}
 
-	if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+	if (d->opc == OPC_META_PHI) {
 		/* we have already inserted parallel-copies into
 		 * the phi, so we don't need to chase definers
 		 */
@@ -452,7 +458,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 		d = dd;
 	}
 
-	if (is_meta(d) && (d->opc == OPC_META_FO)) {
+	if (d->opc == OPC_META_FO) {
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
@@ -463,7 +469,8 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
 
 		*sz = MAX2(*sz, dsz);
 
-		/* Fanout's are grouped, so *off should already valid */
+		debug_assert(instr->opc == OPC_META_FO);
+		*off = MAX2(*off, instr->fo.off);
 		d = dd;
 	}
 
@@ -485,10 +492,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		/* couple special cases: */
 		if (writes_addr(instr) || writes_pred(instr)) {
 			id->cls = -1;
-			continue;
+		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+			id->cls = total_class_count;
+			id->defn = instr;
+		} else {
+			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+			id->cls = size_to_class(id->sz, is_half(id->defn));
 		}
-		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
-		id->cls = size_to_class(id->sz, is_half(id->defn));
 	}
 }
 
@@ -518,8 +528,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		/* arrays which don't fit in one of the pre-defined class
 		 * sizes are pre-colored:
-		 *
-		 * TODO but we still need to allocate names for them, don't we??
 		 */
 		if (id->cls >= 0) {
 			instr->name = ctx->class_alloc_count[id->cls]++;
@@ -531,7 +539,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static void
 ra_init(struct ir3_ra_ctx *ctx)
 {
-	unsigned n;
+	unsigned n, base;
 
 	ir3_clear_mark(ctx->ir);
 	n = ir3_count_instructions(ctx->ir);
@@ -550,11 +558,20 @@ ra_init(struct ir3_ra_ctx *ctx)
 	 * actual ra name is class_base[cls] + instr->name;
 	 */
 	ctx->class_base[0] = 0;
-	for (unsigned i = 1; i < total_class_count; i++) {
+	for (unsigned i = 1; i <= total_class_count; i++) {
 		ctx->class_base[i] = ctx->class_base[i-1] +
 				ctx->class_alloc_count[i-1];
 	}
 
+	/* and vreg names for array elements: */
+	base = ctx->class_base[total_class_count];
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		arr->base = base;
+		ctx->class_alloc_count[total_class_count] += arr->length;
+		base += arr->length;
+	}
+	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
 	ralloc_steal(ctx->g, ctx->instrd);
 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
@@ -562,15 +579,23 @@ ra_init(struct ir3_ra_ctx *ctx)
 }
 
 static unsigned
-ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 {
 	unsigned name;
 	debug_assert(cls >= 0);
+	debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. */
 	name = ctx->class_base[cls] + defn->name;
 	debug_assert(name < ctx->alloc_count);
 	return name;
 }
 
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+	/* TODO handle name mapping for arrays */
+	return __ra_name(ctx, id->cls, id->defn);
+}
+
 static void
 ra_destroy(struct ir3_ra_ctx *ctx)
 {
@@ -583,6 +608,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	struct ir3_ra_block_data *bd;
 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 
+#define def(name, instr) \
+	do { \
+		/* defined on first write: */ \
+		if (!ctx->def[name]) \
+			ctx->def[name] = instr->ip; \
+		ctx->use[name] = instr->ip; \
+		BITSET_SET(bd->def, name); \
+	} while(0);
+
+#define use(name, instr) \
+	do { \
+		ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+		if (!BITSET_TEST(bd->def, name)) \
+			BITSET_SET(bd->use, name); \
+	} while(0);
+
 	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
 
 	bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
@@ -594,6 +635,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 		struct ir3_instruction *src;
+		struct ir3_register *reg;
 
 		if (instr->regs_count == 0)
 			continue;
@@ -625,61 +667,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		if (writes_gpr(instr)) {
 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+			struct ir3_register *dst = instr->regs[0];
 
-			if (id->defn == instr) {
-				/* arrays which don't fit in one of the pre-defined class
-				 * sizes are pre-colored:
-				 */
-				if (id->cls >= 0) {
-					unsigned name = ra_name(ctx, id->cls, id->defn);
+			if (dst->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, dst->array.id);
+				unsigned i;
 
-					ctx->def[name] = id->defn->ip;
-					ctx->use[name] = id->defn->ip;
+				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
 
-					/* since we are in SSA at this point: */
-					debug_assert(!BITSET_TEST(bd->use, name));
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
 
-				BITSET_SET(bd->def, name);
+				/* set the node class now.. in case we don't encounter
+				 * this array dst again. From register_alloc algo's
+				 * perspective, these are all single/scalar regs:
+				 */
+				for (i = 0; i < arr->length; i++) {
+					unsigned name = arr->base + i;
+					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+				}
 
-				if (is_half(id->defn)) {
-					ra_set_node_class(ctx->g, name,
-							ctx->set->half_classes[id->cls - class_count]);
-				} else {
-					ra_set_node_class(ctx->g, name,
-							ctx->set->classes[id->cls]);
+				/* indirect write is treated like a write to all array
+				 * elements, since we don't know which one is actually
+				 * written:
+				 */
+				if (dst->flags & IR3_REG_RELATIV) {
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						def(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + dst->array.offset;
+					def(name, instr);
+				}
+
+			} else if (id->defn == instr) {
+				unsigned name = ra_name(ctx, id);
 
-				/* extend the live range for phi srcs, which may come
-				 * from the bottom of the loop
-				 */
-				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
-					struct ir3_instruction *phi = id->defn->regs[0]->instr;
-					foreach_ssa_src(src, phi) {
-						/* if src is after phi, then we need to extend
-						 * the liverange to the end of src's block:
-						 */
-						if (src->ip > phi->ip) {
-							struct ir3_instruction *last =
+				/* since we are in SSA at this point: */
+				debug_assert(!BITSET_TEST(bd->use, name));
+
+				def(name, id->defn);
+
+				if (is_half(id->defn)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->half_classes[id->cls - class_count]);
+				} else {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->classes[id->cls]);
+				}
+
+				/* extend the live range for phi srcs, which may come
+				 * from the bottom of the loop
+				 */
+				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+					struct ir3_instruction *phi = id->defn->regs[0]->instr;
+					foreach_ssa_src(src, phi) {
+						/* if src is after phi, then we need to extend
+						 * the liverange to the end of src's block:
+						 */
+						if (src->ip > phi->ip) {
+							struct ir3_instruction *last =
 								list_last_entry(&src->block->instr_list,
-									struct ir3_instruction, node);
-							ctx->use[name] = MAX2(ctx->use[name], last->ip);
-						}
+										struct ir3_instruction, node);
+							ctx->use[name] = MAX2(ctx->use[name], last->ip);
 						}
 					}
 				}
 			}
 		}
 
-		foreach_ssa_src(src, instr) {
-			if (writes_gpr(src)) {
-				struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
-
-				if (id->cls >= 0) {
-					unsigned name = ra_name(ctx, id->cls, id->defn);
-					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
-					if (!BITSET_TEST(bd->def, name))
-						BITSET_SET(bd->use, name);
+		foreach_src(reg, instr) {
+			if (reg->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, reg->array.id);
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
+				/* indirect read is treated like a read from all array
+				 * elements, since we don't know which one is actually
+				 * read:
+				 */
+				if (reg->flags & IR3_REG_RELATIV) {
+					unsigned i;
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						use(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + reg->array.offset;
+					use(name, instr);
+					debug_assert(reg->array.offset < arr->length);
+				}
+			} else if ((src = ssa(reg)) && writes_gpr(src)) {
+				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+				use(name, instr);
 			}
 		}
 	}
@@ -730,11 +812,33 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
 	return progress;
 }
 
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+	bool first = true;
+	debug_printf("  %s:", name);
+	for (unsigned i = 0; i < cnt; i++) {
+		if (BITSET_TEST(bs, i)) {
+			if (!first)
+				debug_printf(",");
+			debug_printf(" %04u", i);
+			first = false;
+		}
+	}
+	debug_printf("\n");
+}
+
 static void
 ra_add_interference(struct ir3_ra_ctx *ctx)
 {
 	struct ir3 *ir = ctx->ir;
 
+	/* initialize array live ranges: */
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+		arr->start_ip = ~0;
+		arr->end_ip = 0;
+	}
+
 	/* compute live ranges (use/def) on a block level, also updating
 	 * block's def/use bitmasks (used below to calculate per-block
 	 * livein/liveout):
@@ -746,12 +850,24 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* update per-block livein/liveout: */
 	while (ra_compute_livein_liveout(ctx)) {}
 
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		debug_printf("AFTER LIVEIN/OUT:\n");
+		ir3_print(ir);
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+			struct ir3_ra_block_data *bd = block->data;
+			debug_printf("block%u:\n", block_id(block));
+			print_bitset("def", bd->def, ctx->alloc_count);
+			print_bitset("use", bd->use, ctx->alloc_count);
+			print_bitset("l/i", bd->livein, ctx->alloc_count);
+			print_bitset("l/o", bd->liveout, ctx->alloc_count);
+		}
+	}
+
 	/* extend start/end ranges based on livein/liveout info from cfg: */
-	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		struct ir3_ra_block_data *bd = block->data;
 
-		for (unsigned i = 0; i < bitset_words; i++) {
+		for (unsigned i = 0; i < ctx->alloc_count; i++) {
 			if (BITSET_TEST(bd->livein, i)) {
 				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
 				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
@@ -767,18 +883,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* need to fix things up to keep outputs live: */
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		struct ir3_instruction *instr = ir->outputs[i];
-		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
-		if (id->cls >= 0) {
-			unsigned name = ra_name(ctx, id->cls, id->defn);
-			ctx->use[name] = ctx->instr_cnt;
-		}
+		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+		ctx->use[name] = ctx->instr_cnt;
 	}
 
 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
 		for (unsigned j = 0; j < ctx->alloc_count; j++) {
-			if (!((ctx->def[i] >= ctx->use[j]) ||
-					(ctx->def[j] >= ctx->use[i]))) {
+			if (intersects(ctx->def[i], ctx->use[i],
+					ctx->def[j], ctx->use[j])) {
 				ra_add_node_interference(ctx->g, i, j);
 			}
 		}
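
The body of ra_compute_livein_liveout() is untouched by this patch, so it does not appear above; the fixpoint the "while (ra_compute_livein_liveout(ctx)) {}" loop converges on is the standard backward liveness dataflow over the block def/use bitsets that the hunks above populate. A simplified sketch under stated assumptions (plain bool arrays instead of util/bitset.h words, a fixed successor count, placeholder struct instead of ir3_block; none of these names are from the patch):

#include <stdbool.h>

#define MAX_NAMES 64
#define MAX_SUCC  2

/* simplified stand-in for ir3_block + ir3_ra_block_data */
struct blk {
	bool def[MAX_NAMES], use[MAX_NAMES];
	bool livein[MAX_NAMES], liveout[MAX_NAMES];
	struct blk *succ[MAX_SUCC];
};

/* one sweep of the usual backward liveness equations:
 *   liveout(b) = union over successors s of livein(s)
 *   livein(b)  = use(b) | (liveout(b) & ~def(b))
 * returns true if any bit changed, so the caller keeps sweeping
 * until nothing changes, as in the while loop above
 */
static bool
liveness_sweep(struct blk *blocks, unsigned nblocks)
{
	bool progress = false;

	for (unsigned b = 0; b < nblocks; b++) {
		struct blk *blk = &blocks[b];
		for (unsigned i = 0; i < MAX_NAMES; i++) {
			bool out = false;
			for (unsigned s = 0; s < MAX_SUCC; s++)
				if (blk->succ[s])
					out |= blk->succ[s]->livein[i];
			bool in = blk->use[i] || (out && !blk->def[i]);
			if (out != blk->liveout[i] || in != blk->livein[i])
				progress = true;
			blk->liveout[i] = out;
			blk->livein[i] = in;
		}
	}
	return progress;
}

Once livein/liveout stabilize, the hunk above widens each name's [def, use] range to cover any block that carries it live across an edge, and the O(n^2) double loop then adds an interference edge for every pair of names whose ranges intersect().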
@@ -788,7 +900,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 /* some instructions need fix-up if dst register is half precision: */
 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 {
-	switch (instr->category) {
+	switch (opc_cat(instr->opc)) {
 	case 1: /* move instructions */
 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
 		break;
@@ -829,26 +941,45 @@ static void fixup_half_instr_dst(struct ir3_instruction *instr)
 /* some instructions need fix-up if src register is half precision: */
 static void fixup_half_instr_src(struct ir3_instruction *instr)
 {
-	switch (instr->category) {
-	case 1: /* move instructions */
+	switch (instr->opc) {
+	case OPC_MOV:
 		instr->cat1.src_type = half_type(instr->cat1.src_type);
 		break;
+	default:
+		break;
 	}
 }
 
+/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
+ * array access(es) which do not have any previous access to depend
+ * on from a scheduling point of view
+ */
 static void
 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
-	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+	struct ir3_ra_instr_data *id;
+
+	if (reg->flags & IR3_REG_ARRAY) {
+		struct ir3_array *arr =
+			ir3_lookup_array(ctx->ir, reg->array.id);
+		unsigned name = arr->base + reg->array.offset;
+		unsigned r = ra_get_node_reg(ctx->g, name);
+		unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+		if (reg->flags & IR3_REG_RELATIV) {
+			reg->array.offset = num;
+		} else {
+			reg->num = num;
+		}
 
-	if (id->cls >= 0) {
-		unsigned name = ra_name(ctx, id->cls, id->defn);
+		reg->flags &= ~IR3_REG_ARRAY;
+	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+		unsigned name = ra_name(ctx, id);
 		unsigned r = ra_get_node_reg(ctx->g, name);
 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
-		if (reg->flags & IR3_REG_RELATIV)
-			num += reg->offset;
+		debug_assert(!(reg->flags & IR3_REG_RELATIV));
 
 		reg->num = num;
 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
@@ -875,9 +1006,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		foreach_src_n(reg, n, instr) {
 			struct ir3_instruction *src = reg->instr;
 
-			if (!src)
+			/* Note: reg->instr could be null for IR3_REG_ARRAY */
+			if (!(src || (reg->flags & IR3_REG_ARRAY)))
 				continue;
-
 			reg_assign(ctx, instr->regs[n+1], src);
 			if (instr->regs[n+1]->flags & IR3_REG_HALF)
 				fixup_half_instr_src(instr);
@@ -888,6 +1019,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static int
 ra_alloc(struct ir3_ra_ctx *ctx)
 {
+	unsigned n = 0;
+
 	/* frag shader inputs get pre-assigned, since we have some
 	 * constraints/unknowns about setup for some of these regs:
 	 */
@@ -897,7 +1030,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			int cls = size_to_class(1, true);
-			unsigned name = ra_name(ctx, cls, instr);
+			unsigned name = __ra_name(ctx, cls, instr);
 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
 
 			/* if we have frag_face, it gets hr0.x */
@@ -905,7 +1038,8 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 			i += 4;
 		}
 
-		for (j = 0; i < ir->ninputs; i++) {
+		j = 0;
+		for (; i < ir->ninputs; i++) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			if (instr) {
 				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
 				if (id->defn == instr) {
 					unsigned name, reg;
 
-					name = ra_name(ctx, id->cls, id->defn);
+					name = ra_name(ctx, id);
 					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 
 					ra_set_node_reg(ctx->g, name, reg);
@@ -921,6 +1055,46 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 			}
 		}
 	}
+	n = j;
+	}
+
+	/* pre-assign array elements:
+	 */
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		unsigned base = n;
+
+		if (arr->end_ip == 0)
+			continue;
+
+		/* figure out what else we conflict with which has already
+		 * been assigned:
+		 */
+retry:
+		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+			if (arr2 == arr)
+				break;
+			if (arr2->end_ip == 0)
+				continue;
+			/* if it intersects with liverange AND register range.. */
+			if (intersects(arr->start_ip, arr->end_ip,
+					arr2->start_ip, arr2->end_ip) &&
+				intersects(base, base + arr->length,
+					arr2->reg, arr2->reg + arr2->length)) {
+				base = MAX2(base, arr2->reg + arr2->length);
+				goto retry;
+			}
+		}
+
+		arr->reg = base;
+
+		for (unsigned i = 0; i < arr->length; i++) {
+			unsigned name, reg;
+
+			name = arr->base + i;
+			reg = ctx->set->gpr_to_ra_reg[0][base++];
+
+			ra_set_node_reg(ctx->g, name, reg);
+		}
 	}
 
 	if (!ra_allocate(ctx->g))
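
The final hunk above (the diff is cut off just after it) pre-assigns each live array a contiguous run of scalar registers: the goto-retry scan bumps the candidate base past any earlier array whose live range and register range both overlap it, then restarts. A standalone paraphrase of that search, with simplified placeholder types (the real code walks ir->array_list and starts base at the n = j left over from the frag-shader inputs; arr_info and pick_base are illustrative names, not from the patch):

#include <stdbool.h>

/* placeholder for struct ir3_array; only the fields the search needs */
struct arr_info {
	unsigned start_ip, end_ip;   /* live range, in instruction ips */
	unsigned reg, length;        /* assigned base register + size */
};

static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
	return !((a_start >= b_end) || (b_start >= a_end));
}

/* choose a base register for arrs[idx], assuming arrs[0..idx-1] are
 * already placed; equivalent to the goto-retry loop in the hunk: any
 * conflict bumps the candidate past the offender and restarts the scan
 */
static unsigned
pick_base(struct arr_info *arrs, unsigned idx, unsigned base)
{
	bool retry = true;

	while (retry) {
		retry = false;
		for (unsigned i = 0; i < idx; i++) {
			struct arr_info *other = &arrs[i];
			/* conflict only if BOTH the live ranges and the
			 * candidate register ranges overlap:
			 */
			if (intersects(arrs[idx].start_ip, arrs[idx].end_ip,
					other->start_ip, other->end_ip) &&
					intersects(base, base + arrs[idx].length,
						other->reg, other->reg + other->length)) {
				base = other->reg + other->length;
				retry = true;
				break;
			}
		}
	}
	return base;
}

This is a simple first-fit placement: arrays whose live ranges never overlap may share the same registers, and arrays that are never accessed (end_ip still 0 after the live-range pass) are skipped entirely, exactly as the continue in the patch does.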