[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_ra.c
index aa8ad513e043a5a7fe217d1292cbd31603efddec..26c1508fbd2d177349ec8b86d9f9d8fb9fc6d8eb 100644 (file)
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
+
+#include "freedreno_util.h"
 
 #include "ir3.h"
-#include "ir3_visitor.h"
+#include "ir3_compiler.h"
 
 /*
  * Register Assignment:
  *
- * NOTE: currently only works on a single basic block.. need to think
- * about how multiple basic blocks are going to get scheduled.  But
- * I think I want to re-arrange how blocks work, ie. get rid of the
- * block nesting thing..
+ * Uses the register_allocate util, which implements a graph coloring
+ * algo with interference classes.  To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
+ *
+ * Additionally we create separate classes for half-regs, which
+ * do not conflict with the full-reg classes.  We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg).  At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
+ *
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
  *
- * NOTE: we could do register coalescing (eliminate moves) as part of
- * the RA step.. OTOH I think we need to do scheduling before register
- * assignment.  And if we remove a mov that effects scheduling (unless
- * we leave a placeholder nop, which seems lame), so I'm not really
- * sure how practical this is to do both in a single stage.  But OTOH
- * I'm not really sure a sane way for the CP stage to realize when it
- * cannot remove a mov due to multi-register constraints..
+ *   add r0.z, ...
+ *   sam (f32)(xy)r0.x, ...
+ *   ...
+ *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
  *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction and ignore the
+ * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
+ * defining instruction, as it is the first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment.  But for us that is horrible from a scheduling
+ * standpoint.  Instead what we do is use the idea of a 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the variable is the
+ * one we consider from use/def perspective when building interference
+ * graph.  (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers.  Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored.  In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements.  (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
  */
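
A loose stand-alone sketch of the 'definer' rule described above (hypothetical helper, not the actual ir3 code; assumes only mesa's MAX2 macro):

/* Hypothetical illustration of the 'definer' rule: the write with the
 * lowest ip defines the whole grouped value, and the group must be
 * sized to cover every component written.  Not the actual ir3 code.
 */
struct partial_write {
	unsigned ip;    /* position in program order */
	unsigned comp;  /* component written (0 = x, 1 = y, ...) */
};

static void
pick_definer(const struct partial_write *w, unsigned n,
		unsigned *def_ip, unsigned *size)
{
	*def_ip = w[0].ip;
	*size = w[0].comp + 1;
	for (unsigned i = 1; i < n; i++) {
		if (w[i].ip < *def_ip)
			*def_ip = w[i].ip;          /* earliest write is the def */
		*size = MAX2(*size, w[i].comp + 1); /* cover all written comps */
	}
}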
 
-struct ir3_ra_ctx {
-       struct ir3_block *block;
-       enum shader_t type;
-       bool half_precision;
-       bool frag_coord;
-       bool frag_face;
-       bool has_samp;
-       int cnt;
-       int max_bary;
-       bool error;
+static const unsigned class_sizes[] = {
+       1, 2, 3, 4,
+       4 + 4, /* txd + 1d/2d */
+       4 + 6, /* txd + 3d */
 };
+#define class_count ARRAY_SIZE(class_sizes)
 
-/* sorta ugly way to retrofit half-precision support.. rather than
- * passing extra param around, just OR in a high bit.  All the low
- * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
- * will continue to work as long as you don't underflow (and that
- * would go badly anyways).
- */
-#define REG_HALF  0x8000
-
-struct ir3_ra_assignment {
-       int8_t  off;        /* offset of instruction dst within range */
-       uint8_t num;        /* number of components for the range */
+static const unsigned half_class_sizes[] = {
+       1, 2, 3, 4,
 };
+#define half_class_count  ARRAY_SIZE(half_class_sizes)
 
-static void ra_assign(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *assigner, int num);
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
-
-/*
- * Register Allocation:
+/* High regs seem to just be used for compute shaders?  Seems like vec1
+ * and vec3 classes are sufficient (for now?)
  */
+static const unsigned high_class_sizes[] = {
+       1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+#define total_class_count (class_count + half_class_count + high_class_count)
+
+/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS             (4 * 48)  /* r0 to r47 */
+#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
+#define FIRST_HIGH_REG       (4 * 48)
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+#define HALF_OFFSET          (class_count)
+#define HIGH_OFFSET          (class_count + half_class_count)
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+       struct ra_regs *regs;
+       unsigned int classes[class_count];
+       unsigned int half_classes[half_class_count];
+       unsigned int high_classes[high_class_count];
+       /* maps flat virtual register space to base gpr: */
+       uint16_t *ra_reg_to_gpr;
+       /* maps cls,gpr to flat virtual register space: */
+       uint16_t **gpr_to_ra_reg;
+};
 
-#define REG(n, wm, f) (struct ir3_register){ \
-               .flags  = (f), \
-               .num    = (n), \
-               .wrmask = TGSI_WRITEMASK_ ## wm, \
-       }
-
-/* check that the register exists, is a GPR and is not special (a0/p0) */
-static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+               const unsigned *sizes, unsigned count)
 {
-       if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
-               return instr->regs[n];
-       return NULL;
+       for (unsigned i = 0; i < count; i++) {
+               q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
+
+               /* From register_allocate.c:
+                *
+                * q(B,C) (indexed by C, B is this register class) in
+                * Runeson/Nyström paper.  This is "how many registers of B could
+                * the worst choice register from C conflict with".
+                *
+                * If we just let the register allocation algorithm compute these
+                * values, it is extremely expensive.  However, since all of our
+                * registers are laid out, we can very easily compute them
+                * ourselves.  View the register from C as fixed starting at GRF n
+                * somewhere in the middle, and the register from B as sliding back
+                * and forth.  Then the first register to conflict from B is the
+                * one starting at n - class_size[B] + 1 and the last register to
+                * conflict will start at n + class_size[B] - 1.  Therefore, the
+                * number of conflicts from B is class_size[B] + class_size[C] - 1.
+                *
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                * B | | | | | |n| --> | | | | | | |
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                *             +-+-+-+-+-+
+                * C           |n| | | | |
+                *             +-+-+-+-+-+
+                *
+                * (Idea copied from brw_fs_reg_allocate.cpp)
+                */
+               for (unsigned j = 0; j < count; j++)
+                       q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
+       }
 }
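
For the class sizes above, the formula is easy to sanity-check by hand; a throwaway sketch (illustrative only, not part of the driver):

#include <assert.h>

int
main(void)
{
	const unsigned sizes[] = { 1, 2, 3, 4, 8, 10 };  /* class_sizes[] */
	/* q(B,C) = size(B) + size(C) - 1 */
	assert(sizes[1] + sizes[2] - 1 == 4);   /* worst-placed vec3 hits 4 vec2s */
	assert(sizes[0] + sizes[0] - 1 == 1);   /* scalars only hit one scalar */
	assert(sizes[3] + sizes[5] - 1 == 13);  /* vec4 vs the txd+3d class */
	return 0;
}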
 
-static int output_base(struct ir3_ra_ctx *ctx)
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(void *memctx)
 {
-       /* ugg, for fragment shader we need to have input at r0.x
-        * (or at least if there is a way to configure it, I can't
-        * see how because the blob driver always uses r0.x (ie.
-        * all zeros)
-        */
-       if (ctx->type == SHADER_FRAGMENT) {
-               if (ctx->half_precision)
-                       return ctx->frag_face ? 4 : 3;
-               return ctx->frag_coord ? 8 : 4;
+       struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
+       unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
+       unsigned int **q_values;
+
+       /* calculate # of regs across all classes: */
+       ra_reg_count = 0;
+       for (unsigned i = 0; i < class_count; i++)
+               ra_reg_count += CLASS_REGS(i);
+       for (unsigned i = 0; i < half_class_count; i++)
+               ra_reg_count += HALF_CLASS_REGS(i);
+       for (unsigned i = 0; i < high_class_count; i++)
+               ra_reg_count += HIGH_CLASS_REGS(i);
+
+       /* allocate and populate q_values: */
+       q_values = ralloc_array(set, unsigned *, total_class_count);
+
+       build_q_values(q_values, 0, class_sizes, class_count);
+       build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+       build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
+
+       /* allocate the reg-set.. */
+       set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
+       set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+       set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+       /* .. and classes */
+       reg = 0;
+       for (unsigned i = 0; i < class_count; i++) {
+               set->classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+               for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[i][j] = reg;
+
+                       for (unsigned br = j; br < j + class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+                       reg++;
+               }
        }
-       return 0;
-}
 
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, regmask_t *liveregs)
-{
-       struct ir3_block *block = instr->block;
-       regmask_t written;
-       unsigned i, j;
+       first_half_reg = reg;
+       base = HALF_OFFSET;
 
-       regmask_init(liveregs);
-       regmask_init(&written);
+       for (unsigned i = 0; i < half_class_count; i++) {
+               set->half_classes[i] = ra_alloc_reg_class(set->regs);
 
-       for (instr = instr->next; instr; instr = instr->next) {
-               struct ir3_register *r;
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
 
-               if (is_meta(instr))
-                       continue;
+               for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
+
+                       for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
 
-               /* check first src's read: */
-               for (j = 1; j < instr->regs_count; j++) {
-                       r = reg_check(instr, j);
-                       if (r)
-                               regmask_set_if_not(liveregs, r, &written);
+                       reg++;
                }
+       }
+
+       first_high_reg = reg;
+       base = HIGH_OFFSET;
+
+       for (unsigned i = 0; i < high_class_count; i++) {
+               set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+               for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->high_classes[i], reg);
 
-               /* then dst written (if assigned already): */
-               if (instr->flags & IR3_INSTR_MARK) {
-                       r = reg_check(instr, 0);
-                       if (r)
-                               regmask_set(&written, r);
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
+
+                       for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+                       reg++;
                }
        }
 
-       /* be sure to account for output registers too: */
-       for (i = 0; i < block->noutputs; i++) {
-               struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
-               regmask_set_if_not(liveregs, &reg, &written);
+
+       ra_set_finalize(set->regs, q_values);
+
+       ralloc_free(q_values);
+
+       return set;
+}
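
The two tables built above give O(1) translation in both directions between the flat ra name space and a (class, base gpr) pair; a sketch of how a caller might use them (hedged, using only the struct fields defined above):

/* Sketch: translating between the flat ra register space and a
 * (class, base scalar gpr) pair via the tables populated above.
 */
static unsigned
ra_reg_for_gpr(const struct ir3_ra_reg_set *set, unsigned cls, unsigned gpr)
{
	return set->gpr_to_ra_reg[cls][gpr];
}

static unsigned
gpr_for_ra_reg(const struct ir3_ra_reg_set *set, unsigned ra_reg)
{
	/* round-trip: gpr_for_ra_reg(set, ra_reg_for_gpr(set, c, g)) == g */
	return set->ra_reg_to_gpr[ra_reg];
}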
+
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+       BITSET_WORD *def;        /* variables defined before used in block */
+       BITSET_WORD *use;        /* variables used before defined in block */
+       BITSET_WORD *livein;     /* which defs reach entry point of block */
+       BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction) */
+struct ir3_ra_instr_data {
+       /* cached instruction 'definer' info: */
+       struct ir3_instruction *defn;
+       int off, sz, cls;
+};
+
+/* register-assign context, per-shader */
+struct ir3_ra_ctx {
+       struct ir3 *ir;
+       enum shader_t type;
+       bool frag_face;
+
+       struct ir3_ra_reg_set *set;
+       struct ra_graph *g;
+       unsigned alloc_count;
+       /* one per class, plus one slot for arrays: */
+       unsigned class_alloc_count[total_class_count + 1];
+       unsigned class_base[total_class_count + 1];
+       unsigned instr_cnt;
+       unsigned *def, *use;     /* def/use table */
+       struct ir3_ra_instr_data *instrd;
+};
+
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+       return !((a_start >= b_end) || (b_start >= a_end));
+}
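
The ranges behave as half-open intervals: a range ending at ip N does not overlap one starting at ip N.  For example (assumed ip values, illustration only):

assert( intersects(4, 9, 7, 12));   /* overlap at ips 7..8 */
assert(!intersects(4, 9, 9, 12));   /* back-to-back, no conflict */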
+
+static bool
+is_half(struct ir3_instruction *instr)
+{
+       return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
+
+static bool
+is_high(struct ir3_instruction *instr)
+{
+       return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
+static int
+size_to_class(unsigned sz, bool half, bool high)
+{
+       if (high) {
+               for (unsigned i = 0; i < high_class_count; i++)
+                       if (high_class_sizes[i] >= sz)
+                               return i + HIGH_OFFSET;
+       } else if (half) {
+               for (unsigned i = 0; i < half_class_count; i++)
+                       if (half_class_sizes[i] >= sz)
+                               return i + HALF_OFFSET;
+       } else {
+               for (unsigned i = 0; i < class_count; i++)
+                       if (class_sizes[i] >= sz)
+                               return i;
        }
+       debug_assert(0);
+       return -1;
 }
 
-/* calculate registers that are clobbered before last use of 'assigner'.
- * This needs to be done backwards, although it could possibly be
- * combined into compute_liveregs().  (Ie. compute_liveregs() could
- * reverse the list, then do this part backwards reversing the list
- * again back to original order.)  Otoh, probably I should try to
- * construct a proper interference graph instead.
- *
- * XXX this need to follow the same recursion path that is used for
- * to rename/assign registers (ie. ra_assign_src()).. this is a bit
- * ugly right now, maybe refactor into node iterator sort of things
- * that iterates nodes in the correct order?
- */
-static bool compute_clobbers(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, struct ir3_instruction *assigner,
-               regmask_t *liveregs)
+static bool
+is_temp(struct ir3_register *reg)
 {
-       unsigned i;
-       bool live = false, was_live = false;
+       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+               return false;
+       if ((reg->num == regid(REG_A0, 0)) ||
+                       (reg->num == regid(REG_P0, 0)))
+               return false;
+       return true;
+}
 
-       if (instr == NULL) {
-               struct ir3_block *block = ctx->block;
+static bool
+writes_gpr(struct ir3_instruction *instr)
+{
+       if (is_store(instr))
+               return false;
+       /* is dest a normal temp register: */
+       return is_temp(instr->regs[0]);
+}
 
-               /* if at the end, check outputs: */
-               for (i = 0; i < block->noutputs; i++)
-                       if (block->outputs[i] == assigner)
-                               return true;
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+       if (a->flags & IR3_INSTR_UNUSED)
                return false;
+       return (a->ip < b->ip);
+}
+
+static struct ir3_instruction *
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+               int *sz, int *off)
+{
+       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+       struct ir3_instruction *d = NULL;
+
+       if (id->defn) {
+               *sz = id->sz;
+               *off = id->off;
+               return id->defn;
        }
 
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
-                       if (is_meta(instr)) {
-                               switch (instr->opc) {
-                               case OPC_META_INPUT:
-                                       // TODO
-                                       assert(0);
-                                       break;
-                               case OPC_META_FO:
-                               case OPC_META_FI:
-                                       was_live |= compute_clobbers(ctx, instr->next,
-                                                       instr, liveregs);
-                                       break;
-                               default:
-                                       break;
-                               }
+       if (instr->opc == OPC_META_FI) {
+               /* What about the case where the collect is a subset of the
+                * array?  We would need to find the distance between where
+                * the actual array starts and the fanin..  that probably
+                * doesn't happen currently.
+                */
+               struct ir3_register *src;
+               int dsz, doff;
+
+               /* note: don't use foreach_ssa_src as this gets called once
+                * while assigning regs (which clears SSA flag)
+                */
+               foreach_src_n(src, n, instr) {
+                       struct ir3_instruction *dd;
+                       if (!src->instr)
+                               continue;
+
+                       dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+                       if ((!d) || instr_before(dd, d)) {
+                               d = dd;
+                               *sz = dsz;
+                               *off = doff - n;
                        }
-                       live = true;
-                       break;
                }
+
+       } else if (instr->cp.right || instr->cp.left) {
+               /* covers also the meta:fo case, which ends up w/ single
+                * scalar instructions for each component:
+                */
+               struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+               /* by definition, the entire sequence forms one linked list
+                * of single scalar register nodes (even if some of them may
+                * be fanouts from, for example, a texture sample instr).  We
+                * just need to walk the list to find the first element of
+                * the group defined (lowest ip).
+                */
+               int cnt = 0;
+
+               /* need to skip over unused in the group: */
+               while (f && (f->flags & IR3_INSTR_UNUSED)) {
+                       f = f->cp.right;
+                       cnt++;
+               }
+
+               while (f) {
+                       if ((!d) || instr_before(f, d))
+                               d = f;
+                       if (f == instr)
+                               *off = cnt;
+                       f = f->cp.right;
+                       cnt++;
+               }
+
+               *sz = cnt;
+
+       } else {
+               /* second case is looking directly at the instruction which
+                * produces multiple values (eg, texture sample), rather
+                * than the fanout nodes that point back to that instruction.
+                * This isn't quite right, because it may be part of a larger
+                * group, such as:
+                *
+                *     sam (f32)(xyzw)r0.x, ...
+                *     add r1.x, ...
+                *     add r1.y, ...
+                *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+                *
+                * need to come up with a better way to handle that case.
+                */
+               if (instr->address) {
+                       *sz = instr->regs[0]->size;
+               } else {
+                       *sz = util_last_bit(instr->regs[0]->wrmask);
+               }
+               *off = 0;
+               d = instr;
        }
 
-       was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
+       if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+               struct ir3_instruction *phi = d->regs[0]->instr;
+               struct ir3_instruction *dd;
+               int dsz, doff;
 
-       if (was_live && (instr->regs_count > 0) &&
-                       (instr->flags & IR3_INSTR_MARK) &&
-                       !is_meta(instr))
-               regmask_set(liveregs, instr->regs[0]);
+               dd = get_definer(ctx, phi, &dsz, &doff);
 
-       return live || was_live;
-}
+               *sz = MAX2(*sz, dsz);
+               *off = doff;
 
-static int find_available(regmask_t *liveregs, int size, bool half)
-{
-       unsigned i;
-       unsigned f = half ? IR3_REG_HALF : 0;
-       for (i = 0; i < MAX_REG - size; i++) {
-               if (!regmask_get(liveregs, &REG(i, X, f))) {
-                       unsigned start = i++;
-                       for (; (i < MAX_REG) && ((i - start) < size); i++)
-                               if (regmask_get(liveregs, &REG(i, X, f)))
-                                       break;
-                       if ((i - start) >= size)
-                               return start;
+               if (instr_before(dd, d)) {
+                       d = dd;
                }
        }
-       assert(0);
-       return -1;
-}
 
-static int alloc_block(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, int size)
-{
-       if (!instr) {
-               /* special case, allocating shader outputs.  At this
-                * point, nothing is allocated, just start the shader
-                * outputs at r0.x and let compute_liveregs() take
-                * care of the rest from here:
+       if (d->opc == OPC_META_PHI) {
+               /* we have already inserted parallel-copies into
+                * the phi, so we don't need to chase definers
                 */
-               return 0;
-       } else {
-               struct ir3_register *dst = instr->regs[0];
-               regmask_t liveregs;
-
-               compute_liveregs(ctx, instr, &liveregs);
-
-               // XXX XXX XXX XXX XXX XXX XXX XXX XXX
-               // XXX hack.. maybe ra_calc should give us a list of
-               // instrs to compute_clobbers() on?
-               if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
-                               (instr->regs_count == 1)) {
-                       unsigned i, base = instr->regs[0]->num & ~0x3;
-                       for (i = 0; i < 4; i++) {
-                               struct ir3_instruction *in = NULL;
-                               if ((base + i) < ctx->block->ninputs)
-                                       in = ctx->block->inputs[base + i];
-                               if (in)
-                                       compute_clobbers(ctx, in->next, in, &liveregs);
-                       }
-               } else
-               // XXX XXX XXX XXX XXX XXX XXX XXX XXX
-               compute_clobbers(ctx, instr->next, instr, &liveregs);
+               struct ir3_register *src;
+               struct ir3_instruction *dd = d;
+
+               /* note: don't use foreach_ssa_src as this gets called once
+                * while assigning regs (which clears SSA flag)
+                */
+               foreach_src(src, d) {
+                       if (!src->instr)
+                               continue;
+                       if (instr_before(src->instr, dd))
+                               dd = src->instr;
+               }
 
-               return find_available(&liveregs, size,
-                               !!(dst->flags & IR3_REG_HALF));
+               d = dd;
        }
-}
 
-/*
- * Constraint Calculation:
- */
+       if (d->opc == OPC_META_FO) {
+               struct ir3_instruction *dd;
+               int dsz, doff;
 
-struct ra_calc_visitor {
-       struct ir3_visitor base;
-       struct ir3_ra_assignment a;
-};
+               dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
+
+               /* by definition, should come before: */
+               debug_assert(instr_before(dd, d));
+
+               *sz = MAX2(*sz, dsz);
+
+               debug_assert(instr->opc == OPC_META_FO);
+               *off = MAX2(*off, instr->fo.off);
+
+               d = dd;
+       }
+
+       id->defn = d;
+       id->sz = *sz;
+       id->off = *off;
 
-static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
+       return d;
+}
+
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-       return (struct ra_calc_visitor *)v;
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+               if (instr->regs_count == 0)
+                       continue;
+               /* couple special cases: */
+               if (writes_addr(instr) || writes_pred(instr)) {
+                       id->cls = -1;
+               } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+                       id->cls = total_class_count;
+                       id->defn = instr;
+               } else {
+                       id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+                       id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
+               }
+       }
 }
 
-/* calculate register assignment for the instruction.  If the register
- * written by this instruction is required to be part of a range, to
- * handle other (input/output/sam/bary.f/etc) contiguous register range
- * constraints, that is calculated handled here.
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
  */
-static void ra_calc_dst(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-       struct ra_calc_visitor *c = ra_calc_visitor(v);
-       if (is_tex(instr)) {
-               c->a.off = 0;
-               c->a.num = 4;
-       } else {
-               c->a.off = 0;
-               c->a.num = 1;
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+
+#ifdef DEBUG
+               instr->name = ~0;
+#endif
+
+               ctx->instr_cnt++;
+
+               if (instr->regs_count == 0)
+                       continue;
+
+               if (!writes_gpr(instr))
+                       continue;
+
+               if (id->defn != instr)
+                       continue;
+
+               /* arrays which don't fit in one of the pre-defined class
+                * sizes are pre-colored:
+                */
+               if (id->cls >= 0) {
+                       instr->name = ctx->class_alloc_count[id->cls]++;
+                       ctx->alloc_count++;
+               }
        }
 }
 
 static void
-ra_calc_dst_shader_input(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
+ra_init(struct ir3_ra_ctx *ctx)
 {
-       struct ra_calc_visitor *c = ra_calc_visitor(v);
-       struct ir3_block *block = instr->block;
-       struct ir3_register *dst = instr->regs[0];
-       unsigned base = dst->num & ~0x3;
-       unsigned i, num = 0;
+       unsigned n, base;
 
-       assert(!(dst->flags & IR3_REG_IA));
+       ir3_clear_mark(ctx->ir);
+       n = ir3_count_instructions(ctx->ir);
 
-       /* check what input components we need: */
-       for (i = 0; i < 4; i++) {
-               unsigned idx = base + i;
-               if ((idx < block->ninputs) && block->inputs[idx])
-                       num = i + 1;
+       ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_find_definers(ctx, block);
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_name_instructions(ctx, block);
+       }
+
+       /* figure out the base register name for each class.  The
+        * actual ra name is class_base[cls] + instr->name;
+        */
+       ctx->class_base[0] = 0;
+       for (unsigned i = 1; i <= total_class_count; i++) {
+               ctx->class_base[i] = ctx->class_base[i-1] +
+                               ctx->class_alloc_count[i-1];
+       }
+
+       /* and vreg names for array elements: */
+       base = ctx->class_base[total_class_count];
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               arr->base = base;
+               ctx->class_alloc_count[total_class_count] += arr->length;
+               base += arr->length;
        }
+       ctx->alloc_count += ctx->class_alloc_count[total_class_count];
 
-       c->a.off = dst->num - base;
-       c->a.num = num;
+       ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+       ralloc_steal(ctx->g, ctx->instrd);
+       ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+       ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 }
 
-static void ra_calc_src_fanin(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
+static unsigned
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 {
-       struct ra_calc_visitor *c = ra_calc_visitor(v);
-       unsigned srcn = ir3_instr_regno(instr, reg) - 1;
-       c->a.off += srcn;
-       c->a.num += srcn;
-       c->a.num = MAX2(c->a.num, instr->regs_count - 1);
+       unsigned name;
+       debug_assert(cls >= 0);
+       debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
+       name = ctx->class_base[cls] + defn->name;
+       debug_assert(name < ctx->alloc_count);
+       return name;
 }
 
-static const struct ir3_visitor_funcs calc_visitor_funcs = {
-               .instr = ir3_visit_instr,
-               .dst_shader_input = ra_calc_dst_shader_input,
-               .dst_fanout = ra_calc_dst,
-               .dst_fanin = ra_calc_dst,
-               .dst = ra_calc_dst,
-               .src_fanout = ir3_visit_reg,
-               .src_fanin = ra_calc_src_fanin,
-               .src = ir3_visit_reg,
-};
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+       /* TODO handle name mapping for arrays */
+       return __ra_name(ctx, id->cls, id->defn);
+}
 
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
 {
-       struct ra_calc_visitor v = {
-                       .base.funcs = &calc_visitor_funcs,
-       };
+       ralloc_free(ctx->g);
+}
+
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_ra_block_data *bd;
+       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+#define def(name, instr) \
+               do { \
+                       /* defined on first write: */ \
+                       if (!ctx->def[name]) \
+                               ctx->def[name] = instr->ip; \
+                       ctx->use[name] = instr->ip; \
+                       BITSET_SET(bd->def, name); \
+               } while(0);
+
+#define use(name, instr) \
+               do { \
+                       ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+                       if (!BITSET_TEST(bd->def, name)) \
+                               BITSET_SET(bd->use, name); \
+               } while(0);
+
+       bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+       bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+       block->data = bd;
+
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_instruction *src;
+               struct ir3_register *reg;
+
+               if (instr->regs_count == 0)
+                       continue;
+
+               /* There are a few special cases to deal with here:
+                *
+                * fanout: used to split values from a higher class to a lower
+                *     class, for example split the results of a texture fetch
+                *     into individual scalar values;  We skip over these from
+                *     a 'def' perspective, and for a 'use' we walk the chain
+                *     up to the defining instruction.
+                *
+                * fanin: used to collect values from lower class and assemble
+                *     them together into a higher class, for example arguments
+                *     to texture sample instructions;  We consider these to be
+                *     defined at the earliest fanin source.
+                *
+                * phi: used to merge values from different flow control paths
+                *     to the same reg.  Consider defined at earliest phi src,
+                *     and update all the other phi src's (which may come later
+                *     in the program) as users to extend the var's live range.
+                *
+                * Most of this, other than phi, is completely handled in the
+                * get_definer() helper.
+                *
+                * In all cases, we trace the instruction back to the original
+                * definer and consider that as the def/use ip.
+                */
+
+               if (writes_gpr(instr)) {
+                       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+                       struct ir3_register *dst = instr->regs[0];
+
+                       if (dst->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, dst->array.id);
+                               unsigned i;
+
+                               debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
+
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+                               /* set the node class now.. in case we don't encounter
+                                * this array dst again.  From register_alloc algo's
+                                * perspective, these are all single/scalar regs:
+                                */
+                               for (i = 0; i < arr->length; i++) {
+                                       unsigned name = arr->base + i;
+                                       ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+                               }
+
+                               /* indirect write is treated like a write to all array
+                                * elements, since we don't know which one is actually
+                                * written:
+                                */
+                               if (dst->flags & IR3_REG_RELATIV) {
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               def(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + dst->array.offset;
+                                       def(name, instr);
+                               }
+
+                       } else if (id->defn == instr) {
+                               unsigned name = ra_name(ctx, id);
+
+                               /* since we are in SSA at this point: */
+                               debug_assert(!BITSET_TEST(bd->use, name));
+
+                               def(name, id->defn);
 
-       ir3_visit_instr(&v.base, assigner);
+                               if (is_high(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+                               } else if (is_half(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->half_classes[id->cls - HALF_OFFSET]);
+                               } else {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->classes[id->cls]);
+                               }
+
+                               /* extend the live range for phi srcs, which may come
+                                * from the bottom of the loop
+                                */
+                               if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+                                       struct ir3_instruction *phi = id->defn->regs[0]->instr;
+                                       foreach_ssa_src(src, phi) {
+                                               /* if src is after phi, then we need to extend
+                                                * the liverange to the end of src's block:
+                                                */
+                                               if (src->ip > phi->ip) {
+                                                       struct ir3_instruction *last =
+                                                                       list_last_entry(&src->block->instr_list,
+                                                                                       struct ir3_instruction, node);
+                                                       ctx->use[name] = MAX2(ctx->use[name], last->ip);
+                                               }
+                                       }
+                               }
+                       }
+               }
 
-       return v.a;
+               foreach_src(reg, instr) {
+                       if (reg->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, reg->array.id);
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
+                               /* indirect read is treated like a read from all array
+                                * elements, since we don't know which one is actually
+                                * read:
+                                */
+                               if (reg->flags & IR3_REG_RELATIV) {
+                                       unsigned i;
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               use(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + reg->array.offset;
+                                       use(name, instr);
+                                       debug_assert(reg->array.offset < arr->length);
+                               }
+                       } else if ((src = ssa(reg)) && writes_gpr(src)) {
+                               unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+                               use(name, instr);
+                       }
+               }
+       }
 }
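
In effect the def()/use() macros maintain a conservative [def, use] interval per ra name, plus the per-block bitsets consumed by the dataflow pass below; e.g. (assumed ips, illustration only):

/*   ip 3: write name 5  -> ctx->def[5] = 3
 *   ip 7: read  name 5  -> ctx->use[5] = 7
 *   ip 9: write name 5  -> ctx->def[5] stays 3 (defined on first write),
 *                          ctx->use[5] = 9 (writes also extend the range)
 */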
 
-/*
- * Register Assignment:
- */
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+       bool progress = false;
 
-struct ra_assign_visitor {
-       struct ir3_visitor base;
-       struct ir3_ra_ctx *ctx;
-       int num;
-};
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               struct ir3_ra_block_data *bd = block->data;
+
+               /* update livein: */
+               for (unsigned i = 0; i < bitset_words; i++) {
+                       BITSET_WORD new_livein =
+                               (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
+
+                       if (new_livein & ~bd->livein[i]) {
+                               bd->livein[i] |= new_livein;
+                               progress = true;
+                       }
+               }
+
+               /* update liveout: */
+               for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+                       struct ir3_block *succ = block->successors[j];
+                       struct ir3_ra_block_data *succ_bd;
+
+                       if (!succ)
+                               continue;
+
+                       succ_bd = succ->data;
+
+                       for (unsigned i = 0; i < bitset_words; i++) {
+                               BITSET_WORD new_liveout =
+                                       (succ_bd->livein[i] & ~bd->liveout[i]);
 
-static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
+                               if (new_liveout) {
+                                       bd->liveout[i] |= new_liveout;
+                                       progress = true;
+                               }
+                       }
+               }
+       }
+
+       return progress;
+}
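
One pass of the loop applies the textbook backward-liveness equations; in sketch form:

/* Textbook form of the update performed above, per block B:
 *
 *    livein(B)  = use(B) | (liveout(B) & ~def(B))
 *    liveout(B) = union of livein(S) over all successors S of B
 *
 * The bitsets only ever grow, so iterating to a fixed point terminates.
 */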
+
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
 {
-       return (struct ra_assign_visitor *)v;
+       bool first = true;
+       debug_printf("  %s:", name);
+       for (unsigned i = 0; i < cnt; i++) {
+               if (BITSET_TEST(bs, i)) {
+                       if (!first)
+                               debug_printf(",");
+                       debug_printf(" %04u", i);
+                       first = false;
+               }
+       }
+       debug_printf("\n");
 }
 
-static type_t half_type(type_t type)
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
 {
-       switch (type) {
-       case TYPE_F32: return TYPE_F16;
-       case TYPE_U32: return TYPE_U16;
-       case TYPE_S32: return TYPE_S16;
-       /* instructions may already be fixed up: */
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return type;
-       default:
-               assert(0);
-               return ~0;
+       struct ir3 *ir = ctx->ir;
+
+       /* initialize array live ranges: */
+       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+               arr->start_ip = ~0;
+               arr->end_ip = 0;
+       }
+
+       /* compute live ranges (use/def) on a block level, also updating
+        * block's def/use bitmasks (used below to calculate per-block
+        * livein/liveout):
+        */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               ra_block_compute_live_ranges(ctx, block);
+       }
+
+       /* update per-block livein/liveout: */
+       while (ra_compute_livein_liveout(ctx)) {}
+
+       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+               debug_printf("AFTER LIVEIN/OUT:\n");
+               ir3_print(ir);
+               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+                       struct ir3_ra_block_data *bd = block->data;
+                       debug_printf("block%u:\n", block_id(block));
+                       print_bitset("def", bd->def, ctx->alloc_count);
+                       print_bitset("use", bd->use, ctx->alloc_count);
+                       print_bitset("l/i", bd->livein, ctx->alloc_count);
+                       print_bitset("l/o", bd->liveout, ctx->alloc_count);
+               }
+       }
+
+       /* extend start/end ranges based on livein/liveout info from cfg: */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               struct ir3_ra_block_data *bd = block->data;
+
+               for (unsigned i = 0; i < ctx->alloc_count; i++) {
+                       if (BITSET_TEST(bd->livein, i)) {
+                               ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+                               ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+                       }
+
+                       if (BITSET_TEST(bd->liveout, i)) {
+                               ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+                               ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+                       }
+               }
+       }
+
+       /* need to fix things up to keep outputs live: */
+       for (unsigned i = 0; i < ir->noutputs; i++) {
+               struct ir3_instruction *instr = ir->outputs[i];
+               unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+               ctx->use[name] = ctx->instr_cnt;
+       }
+
+       for (unsigned i = 0; i < ctx->alloc_count; i++) {
+               for (unsigned j = 0; j < ctx->alloc_count; j++) {
+                       if (intersects(ctx->def[i], ctx->use[i],
+                                       ctx->def[j], ctx->use[j])) {
+                               ra_add_node_interference(ctx->g, i, j);
+                       }
+               }
        }
 }
 
 /* some instructions need fix-up if dst register is half precision: */
 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 {
-       switch (instr->category) {
+       switch (opc_cat(instr->opc)) {
        case 1: /* move instructions */
                instr->cat1.dst_type = half_type(instr->cat1.dst_type);
                break;
@@ -432,372 +996,190 @@ static void fixup_half_instr_dst(struct ir3_instruction *instr)
 /* some instructions need fix-up if src register is half precision: */
 static void fixup_half_instr_src(struct ir3_instruction *instr)
 {
-       switch (instr->category) {
-       case 1: /* move instructions */
+       switch (instr->opc) {
+       case OPC_MOV:
                instr->cat1.src_type = half_type(instr->cat1.src_type);
                break;
+       default:
+               break;
        }
 }
 
-static void ra_assign_reg(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
+/* NOTE: instr could be NULL for the IR3_REG_ARRAY case, for the first
+ * array access(es) which do not have any previous access to depend
+ * on from a scheduling point of view
+ */
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+               struct ir3_instruction *instr)
 {
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       struct ir3_ra_instr_data *id;
 
-       if (is_flow(instr) && (instr->opc == OPC_KILL))
-               return;
+       if (reg->flags & IR3_REG_ARRAY) {
+               struct ir3_array *arr =
+                       ir3_lookup_array(ctx->ir, reg->array.id);
+               unsigned name = arr->base + reg->array.offset;
+               unsigned r = ra_get_node_reg(ctx->g, name);
+               unsigned num = ctx->set->ra_reg_to_gpr[r];
 
-       reg->flags &= ~IR3_REG_SSA;
-       reg->num = a->num & ~REG_HALF;
+               if (reg->flags & IR3_REG_RELATIV) {
+                       reg->array.offset = num;
+               } else {
+                       reg->num = num;
+               }
 
-       assert(reg->num >= 0);
+               reg->flags &= ~IR3_REG_ARRAY;
+       } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+               unsigned name = ra_name(ctx, id);
+               unsigned r = ra_get_node_reg(ctx->g, name);
+               unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
-       if (a->num & REG_HALF) {
-               reg->flags |= IR3_REG_HALF;
-               /* if dst reg being assigned, patch up the instr: */
-               if (reg == instr->regs[0])
-                       fixup_half_instr_dst(instr);
-               else
-                       fixup_half_instr_src(instr);
-       }
-}
+               debug_assert(!(reg->flags & IR3_REG_RELATIV));
 
-static void ra_assign_dst_shader_input(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       unsigned i, base = reg->num & ~0x3;
-       int off = base - reg->num;
+               if (is_high(id->defn))
+                       num += FIRST_HIGH_REG;
 
-       ra_assign_reg(v, instr, reg);
-       reg->flags |= IR3_REG_IA;
+               reg->num = num;
+               reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 
-       /* trigger assignment of all our companion input components: */
-       for (i = 0; i < 4; i++) {
-               struct ir3_instruction *in = NULL;
-               if ((base + i) < instr->block->ninputs)
-                       in = instr->block->inputs[base + i];
-               if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
-                       ra_assign(a->ctx, in, a->num + off + i);
+               if (is_half(id->defn))
+                       reg->flags |= IR3_REG_HALF;
        }
 }
 
-static void ra_assign_dst_fanout(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       struct ir3_register *src = instr->regs[1];
-       ra_assign_reg(v, instr, reg);
-       if (src->flags & IR3_REG_SSA)
-               ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
-}
-
-static void ra_assign_src_fanout(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       ra_assign_reg(v, instr, reg);
-       ra_assign(a->ctx, instr, a->num + instr->fo.off);
-}
-
-
-static void ra_assign_src_fanin(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
-       ra_assign_reg(v, instr, reg);
-       ra_assign(a->ctx, instr, a->num - srcn);
-       for (j = 1; j < instr->regs_count; j++) {
-               struct ir3_register *reg = instr->regs[j];
-               if (reg->flags & IR3_REG_SSA)  /* could be renamed already */
-                       ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
-       }
-}
-
-static const struct ir3_visitor_funcs assign_visitor_funcs = {
-               .instr = ir3_visit_instr,
-               .dst_shader_input = ra_assign_dst_shader_input,
-               .dst_fanout = ra_assign_dst_fanout,
-               .dst_fanin = ra_assign_reg,
-               .dst = ra_assign_reg,
-               .src_fanout = ra_assign_src_fanout,
-               .src_fanin = ra_assign_src_fanin,
-               .src = ra_assign_reg,
-};
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_register *reg;
 
-static void ra_assign(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *assigner, int num)
-{
-       struct ra_assign_visitor v = {
-                       .base.funcs = &assign_visitor_funcs,
-                       .ctx = ctx,
-                       .num = num,
-       };
+               if (instr->regs_count == 0)
+                       continue;
 
-       /* if we've already visited this instruction, bail now: */
-       if (ir3_instr_check_mark(assigner)) {
-               debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
-               if (assigner->regs[0]->num != (num & ~REG_HALF)) {
-                       /* impossible situation, should have been resolved
-                        * at an earlier stage by inserting extra mov's:
-                        */
-                       ctx->error = true;
+               if (writes_gpr(instr)) {
+                       reg_assign(ctx, instr->regs[0], instr);
+                       if (instr->regs[0]->flags & IR3_REG_HALF)
+                               fixup_half_instr_dst(instr);
                }
-               return;
-       }
 
-       ir3_visit_instr(&v.base, assigner);
-}
-
-/*
- *
- */
-
-static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       struct ir3_register *dst;
-       unsigned num;
-
-       /* skip over nop's */
-       if (instr->regs_count == 0)
-               return;
-
-       dst = instr->regs[0];
-
-       /* if we've already visited this instruction, bail now: */
-       if (instr->flags & IR3_INSTR_MARK)
-               return;
-
-       /* allocate register(s): */
-       if (is_addr(instr)) {
-               num = instr->regs[2]->num;
-       } else if (reg_gpr(dst)) {
-               struct ir3_ra_assignment a;
-               a = ra_calc(instr);
-               num = alloc_block(ctx, instr, a.num) + a.off;
-       } else if (dst->flags & IR3_REG_ADDR) {
-               dst->flags &= ~IR3_REG_ADDR;
-               num = regid(REG_A0, 0) | REG_HALF;
-       } else {
-               /* predicate register (p0).. etc */
-               return;
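+               /* then assign each src register, based on the instruction
+                * that defines its value (null for direct array access):
+                */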
+               foreach_src_n(reg, n, instr) {
+                       struct ir3_instruction *src = reg->instr;
+                       /* Note: reg->instr could be null for IR3_REG_ARRAY */
+                       if (!(src || (reg->flags & IR3_REG_ARRAY)))
+                               continue;
+                       reg_assign(ctx, instr->regs[n+1], src);
+                       if (instr->regs[n+1]->flags & IR3_REG_HALF)
+                               fixup_half_instr_src(instr);
+               }
        }
-
-       ra_assign(ctx, instr, num);
 }
 
-/* flatten into shader: */
-// XXX this should probably be somewhere else:
-static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
 {
-       struct ir3_instruction *n;
-       struct ir3 *shader = block->shader;
-       struct ir3_instruction *end =
-                       ir3_instr_create(block, 0, OPC_END);
-       struct ir3_instruction *last_input = NULL;
-       struct ir3_instruction *last_rel = NULL;
-       regmask_t needs_ss_war;       /* write after read */
-       regmask_t needs_ss;
-       regmask_t needs_sy;
-
-       regmask_init(&needs_ss_war);
-       regmask_init(&needs_ss);
-       regmask_init(&needs_sy);
-
-       shader->instrs_count = 0;
-
-       for (n = block->head; n; n = n->next) {
-               struct ir3_register *reg;
-               unsigned i;
-
-               if (is_meta(n))
-                       continue;
+       unsigned n = 0;
 
-               if (is_input(n)) {
-                       struct ir3_register *inloc = n->regs[1];
-                       assert(inloc->flags & IR3_REG_IMMED);
-                       ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+       /* frag shader inputs get pre-assigned, since we have some
+        * constraints/unknowns about setup for some of these regs:
+        */
+       if (ctx->type == SHADER_FRAGMENT) {
+               struct ir3 *ir = ctx->ir;
+               unsigned i = 0, j;
+               if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
+                       struct ir3_instruction *instr = ir->inputs[i];
+                       int cls = size_to_class(1, true, false);
+                       unsigned name = __ra_name(ctx, cls, instr);
+                       unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
+
+                       /* if we have frag_face, it gets hr0.x */
+                       ra_set_node_reg(ctx->g, name, reg);
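+                       /* inputs appear to be tracked as scalar components
+                        * (four per slot), so skip past the rest of the
+                        * face's slot:
+                        */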
+                       i += 4;
                }
 
-               for (i = 1; i < n->regs_count; i++) {
-                       reg = n->regs[i];
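+               /* assign the remaining inputs to consecutive regs, using
+                * only the 'definer' of each input group:
+                */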
+               j = 0;
+               for (; i < ir->ninputs; i++) {
+                       struct ir3_instruction *instr = ir->inputs[i];
+                       if (instr) {
+                               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-                       if (reg_gpr(reg)) {
+                               if (id->defn == instr) {
+                                       unsigned name, reg;
 
-                               /* TODO: we probably only need (ss) for alu
-                                * instr consuming sfu result.. need to make
-                                * some tests for both this and (sy)..
-                                */
-                               if (regmask_get(&needs_ss, reg)) {
-                                       n->flags |= IR3_INSTR_SS;
-                                       regmask_init(&needs_ss);
-                               }
+                                       name = ra_name(ctx, id);
+                                       reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 
-                               if (regmask_get(&needs_sy, reg)) {
-                                       n->flags |= IR3_INSTR_SY;
-                                       regmask_init(&needs_sy);
+                                       ra_set_node_reg(ctx->g, name, reg);
+                                       j += id->sz;
                                }
                        }
-
-                       /* TODO: is it valid to have address reg loaded from a
-                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
-                        * last_rel check below should be moved ahead of this:
-                        */
-                       if (reg->flags & IR3_REG_RELATIV)
-                               last_rel = n;
                }
+               n = j;
+       }
 
-               if (n->regs_count > 0) {
-                       reg = n->regs[0];
-                       if (regmask_get(&needs_ss_war, reg)) {
-                               n->flags |= IR3_INSTR_SS;
-                               regmask_init(&needs_ss_war); // ??? I assume?
-                       }
+       /* pre-assign array elements: */
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               unsigned base = n;
 
-                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
-                               last_rel->flags |= IR3_INSTR_UL;
-                               last_rel = NULL;
-                       }
-               }
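+               /* skip arrays with no liverange (ie. never accessed): */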
+               if (arr->end_ip == 0)
+                       continue;
 
-               /* cat5+ does not have an (ss) bit, if needed we need to
-                * insert a nop to carry the sync flag.  Would be kinda
-                * clever if we were aware of this during scheduling, but
-                * this should be a pretty rare case:
+               /* figure out which already-assigned arrays we conflict
+                * with:
                 */
-               if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
-                       struct ir3_instruction *nop;
-                       nop = ir3_instr_create(block, 0, OPC_NOP);
-                       nop->flags |= IR3_INSTR_SS;
-                       n->flags &= ~IR3_INSTR_SS;
-               }
-
-               /* need to be able to set (ss) on first instruction: */
-               if ((shader->instrs_count == 0) && (n->category >= 5))
-                       ir3_instr_create(block, 0, OPC_NOP);
-
-               if (is_nop(n) && shader->instrs_count) {
-                       struct ir3_instruction *last =
-                                       shader->instrs[shader->instrs_count-1];
-                       if (is_nop(last) && (last->repeat < 5)) {
-                               last->repeat++;
-                               last->flags |= n->flags;
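+               /* (intersects() is assumed to test half-open range
+                * overlap, ie. (a_start < b_end) && (b_start < a_end)) */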
+retry:
+               list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+                       if (arr2 == arr)
+                               break;
+                       if (arr2->end_ip == 0)
                                continue;
+                       /* if both the liverange and the register range intersect.. */
+                       if (intersects(arr->start_ip, arr->end_ip,
+                                       arr2->start_ip, arr2->end_ip) &&
+                               intersects(base, base + arr->length,
+                                       arr2->reg, arr2->reg + arr2->length)) {
+                               base = MAX2(base, arr2->reg + arr2->length);
+                               goto retry;
                        }
                }
 
-               shader->instrs[shader->instrs_count++] = n;
-
-               if (is_sfu(n))
-                       regmask_set(&needs_ss, n->regs[0]);
-
-               if (is_tex(n)) {
-                       /* this ends up being the # of samp instructions.. but that
-                        * is ok, everything else only cares whether it is zero or
-                        * not.  We do this here, rather than when we encounter a
-                        * SAMP decl, because (especially in binning pass shader)
-                        * the samp instruction(s) could get eliminated if the
-                        * result is not used.
-                        */
-                       ctx->has_samp = true;
-                       regmask_set(&needs_sy, n->regs[0]);
-               }
-
-               /* both tex/sfu appear to not always immediately consume
-                * their src register(s):
-                */
-               if (is_tex(n) || is_sfu(n)) {
-                       for (i = 1; i < n->regs_count; i++) {
-                               reg = n->regs[i];
-                               if (reg_gpr(reg))
-                                       regmask_set(&needs_ss_war, reg);
-                       }
-               }
-
-               if (is_input(n))
-                       last_input = n;
-       }
-
-       if (last_input)
-               last_input->regs[0]->flags |= IR3_REG_EI;
-
-       if (last_rel)
-               last_rel->flags |= IR3_INSTR_UL;
-
-       shader->instrs[shader->instrs_count++] = end;
-
-       shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-}
-
-static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_instruction *n;
-
-       if (!block->parent) {
-               unsigned i, j;
-               int base, off = output_base(ctx);
-
-               base = alloc_block(ctx, NULL, block->noutputs + off);
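+               /* 'base' is now clear of all previously assigned arrays: */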
+               arr->reg = base;
 
-               if (ctx->half_precision)
-                       base |= REG_HALF;
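+               /* pre-color each array element to the next consecutive
+                * scalar reg:
+                */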
+               for (unsigned i = 0; i < arr->length; i++) {
+                       unsigned name, reg;
 
-               for (i = 0; i < block->noutputs; i++)
-                       if (block->outputs[i] && !is_kill(block->outputs[i]))
-                               ra_assign(ctx, block->outputs[i], base + i + off);
+                       name = arr->base + i;
+                       reg = ctx->set->gpr_to_ra_reg[0][base++];
 
-               if (ctx->type == SHADER_FRAGMENT) {
-                       i = 0;
-                       if (ctx->frag_face) {
-                               /* if we have frag_face, it gets hr0.x */
-                               ra_assign(ctx, block->inputs[i], REG_HALF | 0);
-                               i += 4;
-                       }
-                       for (j = 0; i < block->ninputs; i++, j++)
-                               if (block->inputs[i])
-                                       ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
-               } else {
-                       for (i = 0; i < block->ninputs; i++)
-                               if (block->inputs[i])
-                                       ir3_instr_ra(ctx, block->inputs[i]);
+                       ra_set_node_reg(ctx->g, name, reg);
                }
        }
 
-       /* then loop over instruction list and assign registers:
-        */
-       n = block->head;
-       while (n) {
-               ir3_instr_ra(ctx, n);
-               if (ctx->error)
-                       return -1;
-               n = n->next;
-       }
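+       /* have the graph coloring algo assign everything that was not
+        * pre-colored:
+        */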
+       if (!ra_allocate(ctx->g))
+               return -1;
 
-       legalize(ctx, block);
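+       /* and write the resulting register assignments back into the
+        * instructions:
+        */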
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_alloc(ctx, block);
+       }
 
        return 0;
 }
 
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-               bool half_precision, bool frag_coord, bool frag_face,
-               bool *has_samp, int *max_bary)
+int ir3_ra(struct ir3 *ir, enum shader_t type,
+               bool frag_coord, bool frag_face)
 {
        struct ir3_ra_ctx ctx = {
-                       .block = block,
+                       .ir = ir,
                        .type = type,
-                       .half_precision = half_precision,
-                       .frag_coord = frag_coord,
                        .frag_face = frag_face,
-                       .max_bary = -1,
+                       .set = ir->compiler->set,
        };
        int ret;
 
-       ir3_clear_mark(block->shader);
-       ret = block_ra(&ctx, block);
-       *has_samp = ctx.has_samp;
-       *max_bary = ctx.max_bary;
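+       /* setup, build interference graph, assign registers, cleanup: */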
+       ra_init(&ctx);
+       ra_add_interference(&ctx);
+       ret = ra_alloc(&ctx);
+       ra_destroy(&ctx);
 
        return ret;
 }