From: Rob Clark
Date: Fri, 1 May 2015 16:21:12 +0000 (-0400)
Subject: freedreno/ir3/ra: use register_allocate
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d52fb2f5ad828f879286b9068023b82b9897bc17;p=mesa.git

freedreno/ir3/ra: use register_allocate

Signed-off-by: Rob Clark
---

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 29a6e402056..93a6ab5da7c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -83,7 +83,6 @@ struct ir3_register {
 		 */
 		IR3_REG_SSA    = 0x2000,   /* 'instr' is ptr to assigning instr */
 		IR3_REG_IA     = 0x4000,   /* meta-input dst is "assigned" */
-		IR3_REG_ADDR   = 0x8000,   /* register is a0.x */
 	} flags;
 	union {
 		/* normal registers:
@@ -245,6 +244,13 @@ struct ir3_instruction {
 			 */
 #define DEPTH_UNUSED  ~0
 			unsigned depth;
+			/* When we get to the RA stage, we no longer need depth,
+			 * but we do need the instruction's position/name:
+			 */
+			struct {
+				uint16_t ip;
+				uint16_t name;
+			};
 		};
 
 	/* Used during CP and RA stages.  For fanin and shader inputs/
@@ -503,6 +509,28 @@ static inline bool is_mem(struct ir3_instruction *instr)
 	return (instr->category == 6);
 }
 
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+	if (is_mem(instr)) {
+		/* for these instructions, the "destination" register is
+		 * actually a source: the address to store to.
+		 */
+		switch (instr->opc) {
+		case OPC_STG:
+		case OPC_STP:
+		case OPC_STL:
+		case OPC_STLW:
+		case OPC_L2G:
+		case OPC_G2L:
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
 	/* in some cases, ldlv is used to fetch varying without
@@ -527,7 +555,7 @@ static inline bool writes_addr(struct ir3_instruction *instr)
 {
 	if (instr->regs_count > 0) {
 		struct ir3_register *dst = instr->regs[0];
-		return !!(dst->flags & IR3_REG_ADDR);
+		return reg_num(dst) == REG_A0;
 	}
 	return false;
 }
@@ -558,7 +586,7 @@ static inline bool conflicts(struct ir3_instruction *a,
 static inline bool reg_gpr(struct ir3_register *r)
 {
-	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_ADDR))
+	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 		return false;
 	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
 		return false;
@@ -771,6 +799,7 @@ void ir3_block_group(struct ir3_block *block);
 int ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(void *memctx);
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool frag_coord, bool frag_face);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index 0087374539a..7c8eccb54e1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -34,6 +34,7 @@ struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id)
 {
 	struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
 	compiler->gpu_id = gpu_id;
+	compiler->set = ir3_ra_alloc_reg_set(compiler);
 	return compiler;
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 313916f4288..86b1161d9cb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -31,8 +31,11 @@
 
 #include "ir3_shader.h"
 
+struct ir3_ra_reg_set;
+
 struct ir3_compiler {
 	uint32_t gpu_id;
+	struct ir3_ra_reg_set *set;
 };
 
 struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 9bc54c9b83b..39f4527c22b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -385,7 +385,8 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src)
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	instr = ir3_MOV(block, instr, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
+	instr->regs[0]->num = regid(REG_A0, 0);
+	instr->regs[0]->flags |= IR3_REG_HALF;
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	return instr;
@@ -589,6 +590,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 		compile_assert(ctx, !ctx->frag_face);
 
 		ctx->frag_face = create_input(block, NULL, 0);
+		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 
 		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
 		 * positive vs negative float.. and piglit further seems to
@@ -1981,9 +1983,18 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	 */
 	if (key.half_precision) {
 		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
+			struct ir3_instruction *out = block->outputs[i];
+			if (!out)
 				continue;
-			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
+			out->regs[0]->flags |= IR3_REG_HALF;
+			/* output could be a fanout (ie. texture fetch output)
+			 * in which case we need to propagate the half-reg flag
+			 * up to the definer so that RA sees it:
+			 */
+			if (is_meta(out) && (out->opc == OPC_META_FO)) {
+				out = out->regs[1]->instr;
+				out->regs[0]->flags |= IR3_REG_HALF;
+			}
 		}
 	}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 350f7dd5e6b..8c057166f32 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -41,7 +41,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
 		struct ir3_register *dst = instr->regs[0];
 		struct ir3_register *src = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(src);
-		if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
+		if (dst->flags & IR3_REG_RELATIV)
 			return false;
 		if (src->flags & IR3_REG_RELATIV)
 			return false;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 95f6a81861e..39ce9c5d4ce 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -26,267 +26,533 @@
  *    Rob Clark
  */
 
-#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
 
 #include "ir3.h"
+#include "ir3_compiler.h"
 
 /*
  * Register Assignment:
  *
- * NOTE: currently only works on a single basic block.. need to think
- * about how multiple basic blocks are going to get scheduled.  But
- * I think I want to re-arrange how blocks work, ie. get rid of the
- * block nesting thing..
+ * Uses the register_allocate util, which implements graph coloring
+ * with interference classes.  To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
 *
- * NOTE: we could do register coalescing (eliminate moves) as part of
- * the RA step.. OTOH I think we need to do scheduling before register
- * assignment.  And if we remove a mov that effects scheduling (unless
- * we leave a placeholder nop, which seems lame), so I'm not really
- * sure how practical this is to do both in a single stage.  But OTOH
- * I'm not really sure a sane way for the CP stage to realize when it
- * cannot remove a mov due to multi-register constraints..
+ * Additionally we create classes for half-regs, which do not
+ * conflict with the full-reg classes.  We do need at least sizes
+ * 1-4 (to deal w/ texture sample instructions that output to half-
+ * reg).  At the moment we don't create the higher order half-reg
+ * classes, as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
  *
- * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
- * some ideas to handle array allocation with a more conventional
- * graph coloring algorithm for register assignment, which might be
- * a good alternative to the current algo.  However afaict it cannot
- * handle overlapping arrays, which is a scenario that we have to
- * deal with
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ *   add r0.z, ...
+ *   sam (f32)(xy)r0.x, ...
+ *   ...
+ *   sam (f32)(xyzw)r0.w, r0.x, ...   ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction, and ignore
+ * the subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is
+ * the defining instruction, as it is the first to partially write
+ * r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment.  But for us that is horrible from a scheduling
+ * standpoint.  Instead what we do is use the idea of a 'definer'
+ * instruction, ie. the first instruction (lowest ip) to write to the
+ * array is the one we consider from a use/def perspective when
+ * building the interference graph.  (Other instructions which write
+ * other array elements just define the variable some more.)
+ */
+
+static const unsigned class_sizes[] = {
+	1, 2, 3, 4,
+	4 + 4, /* txd + 1d/2d */
+	4 + 6, /* txd + 3d */
+	/* temporary: until we can assign arrays, create classes so we
+	 * can round up the array to fit.  NOTE with tgsi, arrays should
+	 * really all be multiples of four:
+	 */
+	4 * 4,
+	4 * 8,
+	4 * 16,
+	4 * 32,
+
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+static const unsigned half_class_sizes[] = {
+	1, 2, 3, 4,
+};
+#define half_class_count ARRAY_SIZE(half_class_sizes)
+#define total_class_count (class_count + half_class_count)
+
+/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS (4 * (REG_A0 - 1))
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+	struct ra_regs *regs;
+	unsigned int classes[class_count];
+	unsigned int half_classes[half_class_count];
+	/* maps flat virtual register space to base gpr: */
+	uint16_t *ra_reg_to_gpr;
+	/* maps cls,gpr to flat virtual register space: */
+	uint16_t **gpr_to_ra_reg;
+};
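(A quick way to see what the CLASS_REGS() accounting above works out to is to run the arithmetic standalone. The snippet below is illustrative only: REG_A0 is hard-coded to 61 here purely for the arithmetic rather than pulled from the ISA headers, and it prints how many virtual registers each full-precision class contributes to the flat ra_reg space.)

    #include <stdio.h>

    /* Illustration of the virtual-register accounting above; REG_A0 is
     * an assumed value, NUM_REGS and the CLASS_REGS() expression mirror
     * the macros in the patch.
     */
    #define REG_A0 61
    #define NUM_REGS (4 * (REG_A0 - 1))

    static const unsigned class_sizes[] = {
        1, 2, 3, 4, 4 + 4, 4 + 6, 4 * 4, 4 * 8, 4 * 16, 4 * 32,
    };

    int main(void)
    {
        unsigned total = 0;
        for (unsigned i = 0; i < sizeof(class_sizes) / sizeof(class_sizes[0]); i++) {
            unsigned regs = NUM_REGS - (class_sizes[i] - 1);  /* CLASS_REGS(i) */
            printf("class %u (size %3u): %u ra regs\n", i, class_sizes[i], regs);
            total += regs;
        }
        printf("flat ra_reg space for full classes: %u\n", total);
        return 0;
    }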
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. a double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(void *memctx)
+{
+	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
+	unsigned ra_reg_count, reg, first_half_reg;
+	unsigned int **q_values;
+
+	/* calculate # of regs across all classes: */
+	ra_reg_count = 0;
+	for (unsigned i = 0; i < class_count; i++)
+		ra_reg_count += CLASS_REGS(i);
+	for (unsigned i = 0; i < half_class_count; i++)
+		ra_reg_count += HALF_CLASS_REGS(i);
+
+	/* allocate and populate q_values: */
+	q_values = ralloc_array(set, unsigned *, total_class_count);
+	for (unsigned i = 0; i < class_count; i++) {
+		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
+
+		/* From register_allocate.c:
+		 *
+		 * q(B,C) (indexed by C, B is this register class) in
+		 * Runeson/Nyström paper.  This is "how many registers of B could
+		 * the worst choice register from C conflict with".
+		 *
+		 * If we just let the register allocation algorithm compute these
+		 * values, it is extremely expensive.  However, since all of our
+		 * registers are laid out, we can very easily compute them
+		 * ourselves.  View the register from C as fixed starting at GRF n
+		 * somewhere in the middle, and the register from B as sliding back
+		 * and forth.  Then the first register to conflict from B is the
+		 * one starting at n - class_size[B] + 1 and the last register to
+		 * conflict will start at n + class_size[B] - 1.  Therefore, the
+		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
+		 *
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 * B | | | | | |n| --> | | | | | | |
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 *             +-+-+-+-+-+
+		 * C           |n| | | | |
+		 *             +-+-+-+-+-+
+		 *
+		 * (Idea copied from brw_fs_reg_allocate.cpp)
+		 */
+		for (unsigned j = 0; j < class_count; j++)
+			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+	}
+
+	for (unsigned i = class_count; i < total_class_count; i++) {
+		q_values[i] = ralloc_array(q_values, unsigned, total_class_count);
+
+		/* see comment above: */
+		for (unsigned j = class_count; j < total_class_count; j++) {
+			q_values[i][j] = half_class_sizes[i - class_count] +
+					half_class_sizes[j - class_count] - 1;
+		}
+	}
+
+	/* allocate the reg-set.. */
+	set->regs = ra_alloc_reg_set(set, ra_reg_count);
+	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+	/* .. and classes */
+	reg = 0;
+	for (unsigned i = 0; i < class_count; i++) {
+		set->classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[i][j] = reg;
+
+			for (unsigned br = j; br < j + class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+			reg++;
+		}
+	}
+
+	first_half_reg = reg;
+
+	for (unsigned i = 0; i < half_class_count; i++) {
+		set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[class_count + i] =
+				ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[class_count + i][j] = reg;
+
+			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs,
+						br + first_half_reg, reg);
+
+			reg++;
+		}
+	}
+
+	ra_set_finalize(set->regs, q_values);
+
+	ralloc_free(q_values);
+
+	return set;
+}
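(The q(B,C) shortcut quoted above is easy to sanity-check by brute force. A small standalone sketch, using a made-up 64-scalar window with the class-C register pinned at an arbitrary base well away from the edges:)

    #include <stdio.h>

    /* Brute-force check of the formula quoted above: with classes laid
     * out contiguously and unaligned, a register of class C starting at
     * base n conflicts with exactly size(B) + size(C) - 1 registers of
     * class B.
     */
    static unsigned brute_force_q(unsigned size_b, unsigned size_c, unsigned n)
    {
        unsigned count = 0;
        /* count class-B registers whose span [start, start + size_b)
         * overlaps the class-C span [n, n + size_c):
         */
        for (int start = 0; start < 64; start++)
            if (start < (int)(n + size_c) && (int)n < start + (int)size_b)
                count++;
        return count;
    }

    int main(void)
    {
        for (unsigned b = 1; b <= 4; b++)
            for (unsigned c = 1; c <= 4; c++)
                printf("q(B=%u, C=%u): brute=%u, formula=%u\n", b, c,
                        brute_force_q(b, c, 20), b + c - 1);
        return 0;
    }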
+
+/* register-assign context, per-shader */
 struct ir3_ra_ctx {
-	struct ir3_block *block;
+	struct ir3 *ir;
 	enum shader_t type;
-	bool frag_coord;
 	bool frag_face;
-	int cnt;
-	bool error;
-	struct {
-		unsigned base;
-		unsigned size;
-	} arrays[MAX_ARRAYS];
+
+	struct ir3_ra_reg_set *set;
+	struct ra_graph *g;
+	unsigned alloc_count;
+	unsigned class_alloc_count[total_class_count];
+	unsigned class_base[total_class_count];
+	unsigned instr_cnt;
+	unsigned *def, *use;   /* def/use table */
 };
 
-#ifdef DEBUG
-#  include "freedreno_util.h"
-#  define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
-#else
-#  define ra_debug 0
-#endif
-
-#define ra_dump_list(msg, ir) do { \
-		if (ra_debug) { \
-			debug_printf("-- " msg); \
-			ir3_print(ir); \
-		} \
-	} while (0)
-
-#define ra_dump_instr(msg, n) do { \
-		if (ra_debug) { \
-			debug_printf(">> " msg); \
-			ir3_print_instr(n); \
-		} \
-	} while (0)
-
-#define ra_assert(ctx, x) do { \
-		debug_assert(x); \
-		if (!(x)) { \
-			debug_printf("RA: failed assert: %s\n", #x); \
-			(ctx)->error = true; \
-		}; \
-	} while (0)
-
-
-/* sorta ugly way to retrofit half-precision support.. rather than
- * passing extra param around, just OR in a high bit.  All the low
- * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
- * will continue to work as long as you don't underflow (and that
- * would go badly anyways).
- */
-#define REG_HALF  0x8000
+static bool
+is_half(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
 
-#define REG(n, wm, f) (struct ir3_register){ \
-		.flags  = (f), \
-		.num    = (n), \
-		.wrmask = TGSI_WRITEMASK_ ## wm, \
+static int
+size_to_class(unsigned sz, bool half)
+{
+	if (half) {
+		for (unsigned i = 0; i < half_class_count; i++)
+			if (half_class_sizes[i] >= sz)
+				return i + class_count;
+	} else {
+		for (unsigned i = 0; i < class_count; i++)
+			if (class_sizes[i] >= sz)
+				return i;
 	}
+	debug_assert(0);
+	return -1;
+}
 
-/* check that the register exists, is a GPR and is not special (a0/p0) */
-static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+static bool
+is_temp(struct ir3_register *reg)
 {
-	if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
-			!(instr->regs[n]->flags & IR3_REG_SSA))
-		return instr->regs[n];
-	return NULL;
+	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+	if (reg->flags & IR3_REG_RELATIV)  // TODO
+		return false;
+	if ((reg->num == regid(REG_A0, 0)) ||
+			(reg->num == regid(REG_P0, 0)))
+		return false;
+	return true;
 }
 
-/* figure out if an unassigned src register points back to the instr we
- * are assigning:
- */
-static bool instr_used_by(struct ir3_instruction *instr,
-		struct ir3_register *src)
+static bool
+writes_gpr(struct ir3_instruction *instr)
 {
-	struct ir3_instruction *src_instr = ssa(src);
-	unsigned i;
-	if (instr == src_instr)
-		return true;
-	if (src_instr && is_meta(src_instr))
-		for (i = 1; i < src_instr->regs_count; i++)
-			if (instr_used_by(instr, src_instr->regs[i]))
-				return true;
-
-	return false;
+	if (is_store(instr))
+		return false;
+	/* is dest a normal temp register: */
+	return is_temp(instr->regs[0]);
 }
 
-static bool instr_is_output(struct ir3_instruction *instr)
+static struct ir3_instruction *
+get_definer(struct ir3_instruction *instr, int *sz, int *off)
 {
-	struct ir3_block *block = instr->block;
-	unsigned i;
+	struct ir3_instruction *d = NULL;
+	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+		/* What about the case where the collect is a subset of the
+		 * array?  We'd need to find the distance between where the
+		 * actual array starts and the fanin.. but that probably
+		 * doesn't happen currently.
+		 */
+		struct ir3_register *src;
 
-	for (i = 0; i < block->noutputs; i++)
-		if (instr == block->outputs[i])
-			return true;
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears the SSA flag)
+		 */
+		foreach_src(src, instr) {
+			if (!src->instr)
+				continue;
+			if ((!d) || (src->instr->ip < d->ip))
+				d = src->instr;
+		}
 
-	return false;
-}
+		*sz = instr->regs_count - 1;
+		*off = 0;
 
-static void mark_sources(struct ir3_instruction *instr,
-		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
-{
-	unsigned i;
+	} else if (instr->cp.right || instr->cp.left) {
+		/* covers also the meta:fo case, which ends up w/ single
+		 * scalar instructions for each component:
+		 */
+		struct ir3_instruction *f = ir3_neighbor_first(instr);
 
-	for (i = 1; i < n->regs_count; i++) {
-		struct ir3_register *r = reg_check(n, i);
-		if (r)
-			regmask_set_if_not(liveregs, r, written);
+		/* by definition, the entire sequence forms one linked list
+		 * of single scalar register nodes (even if some of them may
+		 * be fanouts from a texture sample (for example) instr.  We
+		 * just need to walk the list finding the first element of
+		 * the group defined (lowest ip)
+		 */
+		int cnt = 0;
+
+		d = f;
+		while (f) {
+			if (f->ip < d->ip)
+				d = f;
+			if (f == instr)
+				*off = cnt;
+			f = f->cp.right;
+			cnt++;
+		}
+
+		*sz = cnt;
 
-	/* if any src points back to the instruction(s) in
-	 * the block of neighbors that we are assigning then
-	 * mark any written (clobbered) registers as live:
+	} else {
+		/* second case is looking directly at the instruction which
+		 * produces multiple values (eg, texture sample), rather
+		 * than the fanout nodes that point back to that instruction.
+		 * This isn't quite right, because it may be part of a larger
+		 * group, such as:
+		 *
+		 *    sam (f32)(xyzw)r0.x, ...
+		 *    add r1.x, ...
+		 *    add r1.y, ...
+		 *    sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+		 *
+		 * need to come up with a better way to handle that case.
 	 */
-		if (instr_used_by(instr, n->regs[i]))
-			regmask_or(liveregs, liveregs, written);
+		if (instr->address) {
+			*sz = instr->regs[0]->size;
+		} else {
+			*sz = util_last_bit(instr->regs[0]->wrmask);
+		}
+		*off = 0;
+		return instr;
+	}
 
+	if (is_meta(d) && (d->opc == OPC_META_FO)) {
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+
+		/* by definition, should come before: */
+		debug_assert(dd->ip < d->ip);
+
+		*sz = MAX2(*sz, dsz);
+
+		d = dd;
 	}
+
+	return d;
 }
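(The neighbor-chain walk in get_definer() is the heart of the 'definer' idea described at the top of the file. Stripped of the ir3 types, the selection logic looks like the sketch below; the node ips are invented for the example.)

    #include <stdio.h>

    /* Sketch of the definer walk over a right-linked neighbor chain,
     * using bare structs instead of ir3_instruction.  The earliest
     * (lowest ip) node in the group is the definer; 'off' is the
     * queried node's scalar offset within the group.
     */
    struct node { unsigned ip; struct node *right; };

    static struct node *definer(struct node *first, struct node *instr,
            int *sz, int *off)
    {
        struct node *d = first, *f = first;
        int cnt = 0;
        while (f) {
            if (f->ip < d->ip)
                d = f;            /* earliest writer defines the group */
            if (f == instr)
                *off = cnt;       /* where 'instr' sits in the group */
            f = f->right;
            cnt++;
        }
        *sz = cnt;
        return d;
    }

    int main(void)
    {
        struct node c = { 12, NULL }, b = { 9, &c }, a = { 10, &b };
        int sz, off;
        struct node *d = definer(&a, &c, &sz, &off);
        printf("definer ip=%u, sz=%d, off=%d\n", d->ip, sz, off); /* 9, 3, 2 */
        return 0;
    }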
 
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, regmask_t *liveregs)
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_block *block = ctx->block;
-	regmask_t written;
-	unsigned i;
-
-	regmask_init(&written);
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		instr->ip = ctx->instr_cnt++;
+	}
 
-	list_for_each_entry (struct ir3_instruction, n, &instr->node, node) {
-		struct ir3_register *r;
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_instruction *defn;
+		int cls, sz, off;
 
-		if (is_meta(n))
+		if (instr->regs_count == 0)
 			continue;
 
-		/* check first src's read: */
-		mark_sources(instr, n, liveregs, &written);
+		if (!writes_gpr(instr))
+			continue;
 
-		/* for instructions that write to an array, we need to
-		 * capture the dependency on the array elements:
-		 */
-		if (n->fanin)
-			mark_sources(instr, n->fanin, liveregs, &written);
+		defn = get_definer(instr, &sz, &off);
 
-		/* meta-instructions don't actually get scheduled,
-		 * so don't let its write confuse us..  what we
-		 * really care about is when the src to the meta
-		 * instr was written:
-		 */
-		if (is_meta(n))
+		if (defn != instr)
 			continue;
 
-		/* then dst written (if assigned already): */
-		r = reg_check(n, 0);
-		if (r) {
-			/* if an instruction *is* an output, then it is live */
-			if (!instr_is_output(n))
-				regmask_set(&written, r);
+		/* arrays which don't fit in one of the pre-defined class
+		 * sizes are pre-colored:
+		 *
+		 * TODO but we still need to allocate names for them, don't we??
+		 */
+		cls = size_to_class(sz, is_half(defn));
+		if (cls >= 0) {
+			instr->name = ctx->class_alloc_count[cls]++;
+			ctx->alloc_count++;
 		}
 	}
+}
 
-	/* be sure to account for output registers too: */
-	for (i = 0; i < block->noutputs; i++) {
-		struct ir3_register *r;
-		if (!block->outputs[i])
-			continue;
-		r = reg_check(block->outputs[i], 0);
-		if (r)
-			regmask_set_if_not(liveregs, r, &written);
-	}
-
-	/* if instruction is output, we need a reg that isn't written
-	 * before the end.. equiv to the instr_used_by() check above
-	 * in the loop body
-	 * TODO maybe should follow fanin/fanout?
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+	ir3_clear_mark(ctx->ir);
+
+	ra_block_name_instructions(ctx, ctx->ir->block);
+
+	/* figure out the base register name for each class.  The
+	 * actual ra name is class_base[cls] + instr->name;
 	 */
-	regmask_or(liveregs, liveregs, &written);
+	ctx->class_base[0] = 0;
+	for (unsigned i = 1; i < total_class_count; i++) {
+		ctx->class_base[i] = ctx->class_base[i-1] +
+				ctx->class_alloc_count[i-1];
+	}
+
+	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
+{
+	ralloc_free(ctx->g);
 }
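(ra_init() lays all the per-class names out in one flat space via a running prefix sum. A tiny sketch of that layout, with invented per-class counts:)

    #include <stdio.h>

    /* Sketch of the flat RA-name layout built in ra_init(): names are
     * grouped by class, with class_base[] as a prefix sum over the
     * per-class allocation counts.  The counts here are made up.
     */
    int main(void)
    {
        unsigned class_alloc_count[] = { 5, 2, 0, 3 }; /* hypothetical */
        unsigned class_base[4];
        unsigned n = sizeof(class_base) / sizeof(class_base[0]);

        class_base[0] = 0;
        for (unsigned i = 1; i < n; i++)
            class_base[i] = class_base[i - 1] + class_alloc_count[i - 1];

        for (unsigned i = 0; i < n; i++)
            printf("class %u: names [%u..%u)\n", i, class_base[i],
                    class_base[i] + class_alloc_count[i]);
        /* a node's flat name is class_base[cls] + instr->name */
        return 0;
    }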
+ */ + + if (writes_gpr(instr)) { + struct ir3_instruction *defn; + int cls, sz, off; + + defn = get_definer(instr, &sz, &off); + if (defn == instr) { + /* arrays which don't fit in one of the pre-defined class + * sizes are pre-colored: + */ + cls = size_to_class(sz, is_half(defn)); + if (cls >= 0) { + unsigned name = ctx->class_base[cls] + defn->name; + ctx->def[name] = defn->ip; + ctx->use[name] = defn->ip; + + debug_assert(name < ctx->alloc_count); + + if (is_half(defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[cls - class_count]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[cls]); + } + } + } + } + + foreach_ssa_src(src, instr) { + if (writes_gpr(src)) { + struct ir3_instruction *srcdefn; + int cls, sz, off; + + srcdefn = get_definer(src, &sz, &off); + cls = size_to_class(sz, is_half(srcdefn)); + if (cls >= 0) { + unsigned name = ctx->class_base[cls] + srcdefn->name; + ctx->use[name] = instr->ip; + } + } } } - assert(0); - return -1; } -static int alloc_block(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, int size) +static void +ra_add_interference(struct ir3_ra_ctx *ctx) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_instruction *n; - regmask_t liveregs; - unsigned name; - - /* should only ever be called w/ head of neighbor list: */ - debug_assert(!instr->cp.left); - - regmask_init(&liveregs); - - for (n = instr; n; n = n->cp.right) - compute_liveregs(ctx, n, &liveregs); - - /* because we do assignment on fanout nodes for wrmask!=0x1, we - * need to handle this special case, where the fanout nodes all - * appear after one or more of the consumers of the src node: - * - * 0098:009: sam _, r2.x - * 0028:010: mul.f r3.z, r4.x, c13.x - * ; we start assigning here for '0098:009: sam'.. 
-	 *   ; we start assigning here for '0098:009: sam'.. but
-	 *   ; would miss the usage at '0028:010: mul.f'
-	 *   0101:009: _meta:fo _, _[0098:009: sam], off=2
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_FO))
-		compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
+	struct ir3_block *block = ctx->ir->block;
 
-	name = find_available(&liveregs, size,
-			!!(dst->flags & IR3_REG_HALF));
+	ra_block_compute_live_ranges(ctx, ctx->ir->block);
 
-	if (dst->flags & IR3_REG_HALF)
-		name |= REG_HALF;
+	/* need to fix things up to keep outputs live: */
+	for (unsigned i = 0; i < block->noutputs; i++) {
+		struct ir3_instruction *instr = block->outputs[i];
+		struct ir3_instruction *defn;
+		int cls, sz, off;
 
-	return name;
+		defn = get_definer(instr, &sz, &off);
+		cls = size_to_class(sz, is_half(defn));
+		if (cls >= 0) {
+			unsigned name = ctx->class_base[cls] + defn->name;
+			ctx->use[name] = ctx->instr_cnt;
+		}
+	}
+
+	for (unsigned i = 0; i < ctx->alloc_count; i++) {
+		for (unsigned j = 0; j < ctx->alloc_count; j++) {
+			if (!((ctx->def[i] >= ctx->use[j]) ||
+					(ctx->def[j] >= ctx->use[i]))) {
+				ra_add_node_interference(ctx->g, i, j);
+			}
+		}
+	}
 }
 
 static type_t half_type(type_t type)
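(The double loop above adds an interference edge whenever two live ranges overlap; with def/use recorded as single [def, use] intervals, the test reduces to a one-line predicate. A standalone check with made-up ips:)

    #include <stdio.h>
    #include <stdbool.h>

    /* The interference test from ra_add_interference(): two live ranges
     * interfere unless one is dead before (or exactly when) the other
     * is defined.
     */
    static bool interferes(unsigned def_i, unsigned use_i,
            unsigned def_j, unsigned use_j)
    {
        return !((def_i >= use_j) || (def_j >= use_i));
    }

    int main(void)
    {
        /* i: [2,10), j: [4,6) -> ranges overlap, so they interfere */
        printf("%d\n", interferes(2, 10, 4, 6));   /* 1 */
        /* i: [2,4), j: [4,6) -> j defined right at i's last use: ok */
        printf("%d\n", interferes(2, 4, 4, 6));    /* 0 */
        return 0;
    }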
@@ -357,324 +623,123 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
 	}
 }
 
-static void reg_assign(struct ir3_instruction *instr,
-		unsigned r, unsigned name)
-{
-	struct ir3_register *reg = instr->regs[r];
-
-	reg->flags &= ~IR3_REG_SSA;
-	reg->num = name & ~REG_HALF;
-
-	if (name & REG_HALF) {
-		reg->flags |= IR3_REG_HALF;
-		/* if dst reg being assigned, patch up the instr: */
-		if (reg == instr->regs[0])
-			fixup_half_instr_dst(instr);
-		else
-			fixup_half_instr_src(instr);
-	}
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name);
-
-static void instr_assign_src(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned r, unsigned name)
-{
-	struct ir3_register *reg = instr->regs[r];
-
-	if (reg->flags & IR3_REG_RELATIV)
-		name += reg->offset;
-
-	reg_assign(instr, r, name);
-
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			/* shader-input does not have a src, only block input: */
-			debug_assert(instr->regs_count == 2);
-			instr_assign(ctx, instr, name);
-			return;
-		case OPC_META_FO:
-			instr_assign(ctx, instr, name + instr->fo.off);
-			return;
-		case OPC_META_FI:
-			instr_assign(ctx, instr, name - (r - 1));
-			return;
-		default:
-			break;
-		}
-	}
-}
-
-static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name)
-{
-	list_for_each_entry (struct ir3_instruction, n, &instr->node, node) {
-		struct ir3_instruction *src;
-		foreach_ssa_src_n(src, i, n) {
-			unsigned r = i + 1;
-
-			/* skip address / etc (non real sources): */
-			if (r >= n->regs_count)
-				continue;
-
-			if (src == instr)
-				instr_assign_src(ctx, n, r, name);
-		}
-		if (ctx->error)
-			break;
-	}
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name)
-{
-	struct ir3_register *reg = instr->regs[0];
-
-	if (reg->flags & IR3_REG_RELATIV)
-		return;
-
-	/* check if already assigned: */
-	if (!(reg->flags & IR3_REG_SSA)) {
-		/* ... and if so, sanity check: */
-		ra_assert(ctx, reg->num == (name & ~REG_HALF));
-		return;
-	}
-
-	/* rename this instruction's dst register: */
-	reg_assign(instr, 0, name);
-
-	/* and rename any subsequent use of the result of this instr: */
-	instr_assign_srcs(ctx, instr, name);
-
-	/* To simplify the neighbor logic, and to "avoid" dealing with
-	 * instructions which write more than one output, we actually
-	 * do register assignment for instructions that produce multiple
-	 * outputs on the fanout nodes and propagate up the assignment
-	 * to the actual instruction:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
-		struct ir3_instruction *src;
-
-		debug_assert(name >= instr->fo.off);
-
-		foreach_ssa_src(src, instr)
-			instr_assign(ctx, src, name - instr->fo.off);
-	}
-}
-
-/* check neighbor list to see if it is already partially (or completely)
- * assigned, in which case the register block is already allocated and
- * we just need to complete the assignment:
- */
-static int check_partial_assignment(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *n;
-	int off = 0;
-
-	debug_assert(!instr->cp.left);
-
-	for (n = instr; n; n = n->cp.right) {
-		struct ir3_register *dst = n->regs[0];
-		if ((n->depth != DEPTH_UNUSED) &&
-				!(dst->flags & IR3_REG_SSA)) {
-			int name = dst->num - off;
-			debug_assert(name >= 0);
-			return name;
-		}
-		off++;
-	}
-
-	return -1;
-}
-
-/* allocate register name(s) for a list of neighboring instructions;
- * instr should point to leftmost neighbor (head of list)
- */
-static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *n;
-	struct ir3_register *dst;
-	int name;
-
-	debug_assert(!instr->cp.left);
-
-	if (instr->regs_count == 0)
-		return;
-
-	dst = instr->regs[0];
-
-	/* For indirect dst, take the register assignment from the
-	 * fanin and propagate it forward.
-	 */
-	if (dst->flags & IR3_REG_RELATIV) {
-		/* NOTE can be grouped, if for example outputs:
-		 * for now disable cp if indirect writes
-		 */
-		instr_alloc_and_assign(ctx, instr->fanin);
-
-		dst->num += instr->fanin->regs[0]->num;
-		dst->flags &= ~IR3_REG_SSA;
-
-		instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
-
-		return;
-	}
-
-	/* for instructions w/ fanouts, do the actual register assignment
-	 * on the group of fanout neighbor nodes and propagate the reg
-	 * name back up to the texture instruction.
-	 */
-	if (dst->wrmask != 0x1)
-		return;
-
-	name = check_partial_assignment(ctx, instr);
-
-	/* allocate register(s): */
-	if (name >= 0) {
-		/* already partially assigned, just finish the job */
-	} else if (reg_gpr(dst)) {
-		int size;
-		/* number of consecutive registers to assign: */
-		size = ir3_neighbor_count(instr);
-		if (dst->wrmask != 0x1)
-			size = MAX2(size, ffs(~dst->wrmask) - 1);
-		name = alloc_block(ctx, instr, size);
-	} else if (dst->flags & IR3_REG_ADDR) {
-		debug_assert(!instr->cp.right);
-		dst->flags &= ~IR3_REG_ADDR;
-		name = regid(REG_A0, 0) | REG_HALF;
-	} else {
-		debug_assert(!instr->cp.right);
-		/* predicate register (p0).. etc */
-		name = regid(REG_P0, 0);
-		debug_assert(dst->num == name);
-	}
-
-	ra_assert(ctx, name >= 0);
-
-	for (n = instr; n && !ctx->error; n = n->cp.right) {
-		instr_assign(ctx, n, name);
-		name++;
-	}
-}
-
-static void instr_assign_array(struct ir3_ra_ctx *ctx,
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *src;
-	int name, aid = instr->fi.aid;
+	struct ir3_instruction *defn;
+	int cls, sz, off;
 
-	if (ctx->arrays[aid].base == ~0) {
-		int size = instr->regs_count - 1;
-		ctx->arrays[aid].base = alloc_block(ctx, instr, size);
-		ctx->arrays[aid].size = size;
-	}
+	defn = get_definer(instr, &sz, &off);
+	cls = size_to_class(sz, is_half(defn));
+	if (cls >= 0) {
+		unsigned name = ctx->class_base[cls] + defn->name;
+		unsigned r = ra_get_node_reg(ctx->g, name);
+		unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
 
-	name = ctx->arrays[aid].base;
+		if (reg->flags & IR3_REG_RELATIV)
+			num += reg->offset;
 
-	foreach_ssa_src_n(src, i, instr) {
-		unsigned r = i + 1;
+		reg->num = num;
+		reg->flags &= ~IR3_REG_SSA;
 
-		/* skip address / etc (non real sources): */
-		if (r >= instr->regs_count)
-			break;
-
-		instr_assign(ctx, src, name);
-		name++;
+		if (is_half(defn))
+			reg->flags |= IR3_REG_HALF;
 	}
 }
 
-static bool
-block_ra(struct ir3_block *block, void *state)
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_ra_ctx *ctx = state;
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_register *reg;
 
-	ra_dump_list("-------\n", block->shader);
+		if (instr->regs_count == 0)
+			continue;
 
-	/* first pass, assign arrays: */
-	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node) {
-		if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
-			debug_assert(!n->cp.left); /* don't think this should happen */
-			ra_dump_instr("ASSIGN ARRAY: ", n);
-			instr_assign_array(ctx, n);
-			ra_dump_list("-------\n", block->shader);
+		if (writes_gpr(instr)) {
+			reg_assign(ctx, instr->regs[0], instr);
+			if (instr->regs[0]->flags & IR3_REG_HALF)
+				fixup_half_instr_dst(instr);
 		}
-		if (ctx->error)
-			return false;
-	}
-
-	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node) {
-		ra_dump_instr("ASSIGN: ", n);
-		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
-		ra_dump_list("-------\n", block->shader);
 
-		if (ctx->error)
-			return false;
+		foreach_src_n(reg, n, instr) {
+			struct ir3_instruction *src = reg->instr;
+			if (!src)
+				continue;
+
+			reg_assign(ctx, instr->regs[n+1], src);
+			if (instr->regs[n+1]->flags & IR3_REG_HALF)
+				fixup_half_instr_src(instr);
+		}
 	}
-
-	return true;
 }
 
 static int
-shader_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+ra_alloc(struct ir3_ra_ctx *ctx)
 {
 	/* frag shader inputs get pre-assigned, since we have some
 	 * constraints/unknowns about setup for some of these regs:
 	 */
 	if (ctx->type == SHADER_FRAGMENT) {
+		struct ir3_block *block = ctx->ir->block;
 		unsigned i = 0, j;
 		if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) {
+			struct ir3_instruction *instr = block->inputs[i];
+			unsigned cls = size_to_class(1, true);
+			unsigned name = ctx->class_base[cls] + instr->name;
+			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
+
 			/* if we have frag_face, it gets hr0.x */
-			instr_assign(ctx, block->inputs[i], REG_HALF | 0);
+			ra_set_node_reg(ctx->g, name, reg);
 			i += 4;
 		}
-		for (j = 0; i < block->ninputs; i++, j++)
-			if (block->inputs[i])
-				instr_assign(ctx, block->inputs[i], j);
+
+		for (j = 0; i < block->ninputs; i++) {
+			struct ir3_instruction *instr = block->inputs[i];
+			if (instr) {
+				struct ir3_instruction *defn;
+				int cls, sz, off;
+
+				defn = get_definer(instr, &sz, &off);
+				if (defn == instr) {
+					unsigned name, reg;
+
+					cls = size_to_class(sz, is_half(defn));
+					debug_assert(cls >= 0);
+					name = ctx->class_base[cls] + defn->name;
+					reg = ctx->set->gpr_to_ra_reg[cls][j];
+
+					ra_set_node_reg(ctx->g, name, reg);
+					j += sz;
+				}
+			}
+		}
 	}
 
-	block_ra(block, ctx);
+	if (!ra_allocate(ctx->g))
+		return -1;
 
-	return ctx->error ? -1 : 0;
-}
+	ra_block_alloc(ctx, ctx->ir->block);
 
-static bool
-block_mark_dst(struct ir3_block *block, void *state)
-{
-	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node)
-		if (n->regs_count > 0)
-			n->regs[0]->flags |= IR3_REG_SSA;
-	return true;
+	return 0;
 }
 
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool frag_coord, bool frag_face)
 {
 	struct ir3_ra_ctx ctx = {
-			.block = block,
+			.ir = block->shader,
 			.type = type,
-			.frag_coord = frag_coord,
 			.frag_face = frag_face,
+			.set = block->shader->compiler->set,
 	};
 	int ret;
 
-	memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
-
-	/* mark dst registers w/ SSA flag so we can see which
-	 * have been assigned so far:
-	 * NOTE: we really should set SSA flag consistently on
-	 * every dst register in the frontend.
-	 */
-	block_mark_dst(block, &ctx);
-
-	ir3_clear_mark(block->shader);
-	ret = shader_ra(&ctx, block);
+	ra_init(&ctx);
+	ra_add_interference(&ctx);
+	ret = ra_alloc(&ctx);
+	ra_destroy(&ctx);
 
 	return ret;
 }
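(For flavor, here is what the assignment in reg_assign() ultimately produces: a flat scalar register index, where, per the note next to the Single/Double table above, r0.x/y/z/w are just four consecutive scalars. The base/offset values below are invented, and the >>2 / &0x3 split is assumed to match the regid() packing used elsewhere in ir3.)

    #include <stdio.h>

    /* Decode a flat scalar register index into the r<N>.<comp> notation
     * the disassembler uses.  Illustration only; values are made up.
     */
    int main(void)
    {
        const char comp[] = "xyzw";
        unsigned base_gpr = 9, off = 2;       /* say RA picked scalar reg 9 */
        unsigned num = base_gpr + off;        /* reg_assign() adds the offset */
        printf("assigned r%u.%c\n", num >> 2, comp[num & 0x3]); /* r2.w */
        return 0;
    }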