freedreno/ir3: stop hard-coding FS input regs
index 0436e01ab2c53374a894f4171e455bfcc312ebb6..83bc375aeb5b8a14242b26addc76e55bbf2a99f8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -31,6 +31,8 @@
 #include "util/ralloc.h"
 #include "util/bitset.h"
 
+#include "freedreno_util.h"
+
 #include "ir3.h"
 #include "ir3_compiler.h"
 
  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
  * register assignment.  But for us that is horrible from a scheduling
  * standpoint.  Instead what we do is use the idea of a 'definer' instruction.
- * Ie. the first instruction (lowest ip) to write to the array is the
+ * Ie. the first instruction (lowest ip) to write to the variable is the
  * one we consider from use/def perspective when building interference
- * graph.  (Other instructions which write other array elements just
- * define the variable some more.)
+ * graph.  (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers.  Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored.  In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements.  (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
+ *
+ * TODO arrays that fit in one of the pre-defined class sizes should
+ * not need to be pre-colored, but instead could be given a normal
+ * vreg name.  (Ignoring this for now since it is a good way to work
+ * out the kinks with arbitrary sized arrays.)
+ *
+ * TODO might be easier for debugging to split this into two passes,
+ * the first assigning vreg names in a way that we could ir3_print()
+ * the result.
  */
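
To make the array naming scheme above concrete, here is a small sketch
(hypothetical helper, not part of the patch; it assumes the struct ir3_array
fields used later in this file, notably arr->base and arr->length):

    /* Direct access to element 'offset' maps to a single scalar RA name: */
    static unsigned
    array_elem_name(const struct ir3_array *arr, unsigned offset)
    {
            return arr->base + offset;
    }

    /* An indirect (a0.x relative) access instead counts as a use/def of
     * every name in [arr->base, arr->base + arr->length).
     */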
 
 static const unsigned class_sizes[] = {
        1, 2, 3, 4,
        4 + 4, /* txd + 1d/2d */
        4 + 6, /* txd + 3d */
-       /* temporary: until we can assign arrays, create classes so we
-        * can round up array to fit.  NOTE with tgsi arrays should
-        * really all be multiples of four:
-        */
-       4 * 4,
-       4 * 8,
-       4 * 16,
-       4 * 32,
-
 };
 #define class_count ARRAY_SIZE(class_sizes)
 
@@ -94,25 +104,78 @@ static const unsigned half_class_sizes[] = {
        1, 2, 3, 4,
 };
 #define half_class_count  ARRAY_SIZE(half_class_sizes)
-#define total_class_count (class_count + half_class_count)
+
+/* high registers seem to just be used for compute shaders?  Seems like
+ * vec1 and vec3 classes are sufficient (for now?)
+ */
+static const unsigned high_class_sizes[] = {
+       1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+#define total_class_count (class_count + half_class_count + high_class_count)
 
 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS             (4 * (REG_A0 - 1))
+#define NUM_REGS             (4 * 48)  /* r0 to r47 */
+#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
+#define FIRST_HIGH_REG       (4 * 48)
 /* Number of virtual regs in a given class: */
 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+#define HALF_OFFSET          (class_count)
+#define HIGH_OFFSET          (class_count + half_class_count)
 
 /* register-set, created one time, used for all shaders: */
 struct ir3_ra_reg_set {
        struct ra_regs *regs;
        unsigned int classes[class_count];
        unsigned int half_classes[half_class_count];
+       unsigned int high_classes[high_class_count];
        /* maps flat virtual register space to base gpr: */
        uint16_t *ra_reg_to_gpr;
        /* maps cls,gpr to flat virtual register space: */
        uint16_t **gpr_to_ra_reg;
 };
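
A rough round-trip sketch of the two mapping tables (hypothetical
check_mapping helper, assuming 'set' was populated by ir3_ra_alloc_reg_set()
below).  For example, with NUM_REGS = 192 as defined above, the vec4 class
has CLASS_REGS(3) = 192 - 3 = 189 valid base gprs:

    #include <assert.h>

    /* flat ra_reg <-> (class, base gpr) round trip: */
    static void
    check_mapping(const struct ir3_ra_reg_set *set, unsigned cls, unsigned gpr)
    {
            unsigned reg = set->gpr_to_ra_reg[cls][gpr];
            assert(set->ra_reg_to_gpr[reg] == gpr);
    }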
 
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+               const unsigned *sizes, unsigned count)
+{
+       for (unsigned i = 0; i < count; i++) {
+               q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
+
+               /* From register_allocate.c:
+                *
+                * q(B,C) (indexed by C, B is this register class) in
+                * Runeson/Nyström paper.  This is "how many registers of B could
+                * the worst choice register from C conflict with".
+                *
+                * If we just let the register allocation algorithm compute these
+                * values, it is extremely expensive.  However, since all of our
+                * registers are laid out, we can very easily compute them
+                * ourselves.  View the register from C as fixed starting at GRF n
+                * somewhere in the middle, and the register from B as sliding back
+                * and forth.  Then the first register to conflict from B is the
+                * one starting at n - class_size[B] + 1 and the last register to
+                * conflict will start at n + class_size[B] - 1.  Therefore, the
+                * number of conflicts from B is class_size[B] + class_size[C] - 1.
+                *
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                * B | | | | | |n| --> | | | | | | |
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                *             +-+-+-+-+-+
+                * C           |n| | | | |
+                *             +-+-+-+-+-+
+                *
+                * (Idea copied from brw_fs_reg_allocate.cpp)
+                */
+               for (unsigned j = 0; j < count; j++)
+                       q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
+       }
+}
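
As a standalone illustration of the loop above (a demo program, not part of
the patch), this prints the q(B,C) table for the full-precision
class_sizes[]; e.g. q(vec4, vec2) = 4 + 2 - 1 = 5, since a sliding vec4 can
overlap a fixed vec2 at five starting positions:

    #include <stdio.h>

    int main(void)
    {
            /* same sizes as class_sizes[] above */
            static const unsigned sizes[] = { 1, 2, 3, 4, 4 + 4, 4 + 6 };
            const unsigned n = sizeof(sizes) / sizeof(sizes[0]);

            for (unsigned i = 0; i < n; i++) {
                    for (unsigned j = 0; j < n; j++)
                            printf(" %3u", sizes[i] + sizes[j] - 1);
                    printf("\n");
            }
            return 0;
    }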
+
 /* One-time setup of RA register-set, which describes all the possible
  * "virtual" registers and their interferences.  Ie. double register
  * occupies (and conflicts with) two single registers, and so forth.
@@ -134,7 +197,7 @@ struct ir3_ra_reg_set *
 ir3_ra_alloc_reg_set(void *memctx)
 {
        struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
-       unsigned ra_reg_count, reg, first_half_reg;
+       unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
        unsigned int **q_values;
 
        /* calculate # of regs across all classes: */
@@ -143,53 +206,18 @@ ir3_ra_alloc_reg_set(void *memctx)
                ra_reg_count += CLASS_REGS(i);
        for (unsigned i = 0; i < half_class_count; i++)
                ra_reg_count += HALF_CLASS_REGS(i);
+       for (unsigned i = 0; i < high_class_count; i++)
+               ra_reg_count += HIGH_CLASS_REGS(i);
 
        /* allocate and populate q_values: */
        q_values = ralloc_array(set, unsigned *, total_class_count);
-       for (unsigned i = 0; i < class_count; i++) {
-               q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
-
-               /* From register_allocate.c:
-                *
-                * q(B,C) (indexed by C, B is this register class) in
-                * Runeson/Nyström paper.  This is "how many registers of B could
-                * the worst choice register from C conflict with".
-                *
-                * If we just let the register allocation algorithm compute these
-                * values, is extremely expensive.  However, since all of our
-                * registers are laid out, we can very easily compute them
-                * ourselves.  View the register from C as fixed starting at GRF n
-                * somewhere in the middle, and the register from B as sliding back
-                * and forth.  Then the first register to conflict from B is the
-                * one starting at n - class_size[B] + 1 and the last register to
-                * conflict will start at n + class_size[B] - 1.  Therefore, the
-                * number of conflicts from B is class_size[B] + class_size[C] - 1.
-                *
-                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-                * B | | | | | |n| --> | | | | | | |
-                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-                *             +-+-+-+-+-+
-                * C           |n| | | | |
-                *             +-+-+-+-+-+
-                *
-                * (Idea copied from brw_fs_reg_allocate.cpp)
-                */
-               for (unsigned j = 0; j < class_count; j++)
-                       q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
-       }
 
-       for (unsigned i = class_count; i < total_class_count; i++) {
-               q_values[i] = ralloc_array(q_values, unsigned, total_class_count);
-
-               /* see comment above: */
-               for (unsigned j = class_count; j < total_class_count; j++) {
-                       q_values[i][j] = half_class_sizes[i - class_count] +
-                                       half_class_sizes[j - class_count] - 1;
-               }
-       }
+       build_q_values(q_values, 0, class_sizes, class_count);
+       build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+       build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
 
        /* allocate the reg-set.. */
-       set->regs = ra_alloc_reg_set(set, ra_reg_count);
+       set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
        set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
        set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
 
@@ -214,18 +242,19 @@ ir3_ra_alloc_reg_set(void *memctx)
        }
 
        first_half_reg = reg;
+       base = HALF_OFFSET;
 
        for (unsigned i = 0; i < half_class_count; i++) {
                set->half_classes[i] = ra_alloc_reg_class(set->regs);
 
-               set->gpr_to_ra_reg[class_count + i] =
-                               ralloc_array(set, uint16_t, CLASS_REGS(i));
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
 
                for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
                        ra_class_add_reg(set->regs, set->half_classes[i], reg);
 
                        set->ra_reg_to_gpr[reg] = j;
-                       set->gpr_to_ra_reg[class_count + i][j] = reg;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
 
                        for (unsigned br = j; br < j + half_class_sizes[i]; br++)
                                ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
@@ -234,6 +263,29 @@ ir3_ra_alloc_reg_set(void *memctx)
                }
        }
 
+       first_high_reg = reg;
+       base = HIGH_OFFSET;
+
+       for (unsigned i = 0; i < high_class_count; i++) {
+               set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+               for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->high_classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
+
+                       for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+                       reg++;
+               }
+       }
+
        ra_set_finalize(set->regs, q_values);
 
        ralloc_free(q_values);
@@ -241,6 +293,21 @@ ir3_ra_alloc_reg_set(void *memctx)
        return set;
 }
 
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+       BITSET_WORD *def;        /* variables defined before used in block */
+       BITSET_WORD *use;        /* variables used before defined in block */
+       BITSET_WORD *livein;     /* which defs reach entry point of block */
+       BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction) */
+struct ir3_ra_instr_data {
+       /* cached instruction 'definer' info: */
+       struct ir3_instruction *defn;
+       int off, sz, cls;
+};
+
 /* register-assign context, per-shader */
 struct ir3_ra_ctx {
        struct ir3 *ir;
@@ -250,19 +317,20 @@ struct ir3_ra_ctx {
        struct ir3_ra_reg_set *set;
        struct ra_graph *g;
        unsigned alloc_count;
-       unsigned class_alloc_count[total_class_count];
-       unsigned class_base[total_class_count];
+       /* one per class, plus one slot for arrays: */
+       unsigned class_alloc_count[total_class_count + 1];
+       unsigned class_base[total_class_count + 1];
        unsigned instr_cnt;
        unsigned *def, *use;     /* def/use table */
+       struct ir3_ra_instr_data *instrd;
 };
 
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
-       BITSET_WORD *def;        /* variables defined before used in block */
-       BITSET_WORD *use;        /* variables used before defined in block */
-       BITSET_WORD *livein;     /* which defs reach entry point of block */
-       BITSET_WORD *liveout;    /* which defs reach exit point of block */
-};
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+       return !((a_start >= b_end) || (b_start >= a_end));
+}
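
The ranges behave as half-open intervals, so back-to-back live ranges do not
conflict.  A standalone check (the helper is repeated so the snippet compiles
on its own; the values stand in for the def/use ips built later in this pass):

    #include <assert.h>
    #include <stdbool.h>

    static inline bool
    intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
    {
            return !((a_start >= b_end) || (b_start >= a_end));
    }

    int main(void)
    {
            assert(!intersects(10, 20, 20, 30)); /* back-to-back: no conflict */
            assert(intersects(10, 21, 20, 30));  /* one ip of overlap */
            return 0;
    }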
 
 static bool
 is_half(struct ir3_instruction *instr)
@@ -270,13 +338,23 @@ is_half(struct ir3_instruction *instr)
        return !!(instr->regs[0]->flags & IR3_REG_HALF);
 }
 
+static bool
+is_high(struct ir3_instruction *instr)
+{
+       return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
 static int
-size_to_class(unsigned sz, bool half)
+size_to_class(unsigned sz, bool half, bool high)
 {
-       if (half) {
+       if (high) {
+               for (unsigned i = 0; i < high_class_count; i++)
+                       if (high_class_sizes[i] >= sz)
+                               return i + HIGH_OFFSET;
+       } else if (half) {
                for (unsigned i = 0; i < half_class_count; i++)
                        if (half_class_sizes[i] >= sz)
-                               return i + class_count;
+                               return i + HALF_OFFSET;
        } else {
                for (unsigned i = 0; i < class_count; i++)
                        if (class_sizes[i] >= sz)
@@ -306,33 +384,51 @@ writes_gpr(struct ir3_instruction *instr)
        return is_temp(instr->regs[0]);
 }
 
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+       if (a->flags & IR3_INSTR_UNUSED)
+               return false;
+       return (a->ip < b->ip);
+}
+
 static struct ir3_instruction *
-get_definer(struct ir3_instruction *instr, int *sz, int *off)
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+               int *sz, int *off)
 {
+       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
        struct ir3_instruction *d = NULL;
 
-       if (instr->fanin)
-               return get_definer(instr->fanin, sz, off);
+       if (id->defn) {
+               *sz = id->sz;
+               *off = id->off;
+               return id->defn;
+       }
 
-       if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+       if (instr->opc == OPC_META_FI) {
                /* What about the case where the collect is a subset of an
                 * array?  We would need to find the distance between where
                 * the actual array starts and the fanin; that probably
                 * doesn't happen currently.
                 */
                struct ir3_register *src;
+               int dsz, doff;
 
                /* note: don't use foreach_ssa_src as this gets called once
                 * while assigning regs (which clears SSA flag)
                 */
-               foreach_src(src, instr) {
+               foreach_src_n(src, n, instr) {
+                       struct ir3_instruction *dd;
                        if (!src->instr)
                                continue;
-                       if ((!d) || (src->instr->ip < d->ip))
-                               d = src->instr;
-               }
 
-               *sz = instr->regs_count - 1;
-               *off = 0;
+                       dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+                       if ((!d) || instr_before(dd, d)) {
+                               d = dd;
+                               *sz = dsz;
+                               *off = doff - n;
+                       }
+               }
 
        } else if (instr->cp.right || instr->cp.left) {
                /* covers also the meta:fo case, which ends up w/ single
@@ -348,9 +444,14 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
                 */
                int cnt = 0;
 
-               d = f;
+               /* need to skip over unused in the group: */
+               while (f && (f->flags & IR3_INSTR_UNUSED)) {
+                       f = f->cp.right;
+                       cnt++;
+               }
+
                while (f) {
-                       if (f->ip < d->ip)
+                       if ((!d) || instr_before(f, d))
                                d = f;
                        if (f == instr)
                                *off = cnt;
@@ -383,57 +484,49 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
                d = instr;
        }
 
-       if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
-               struct ir3_instruction *phi = d->regs[0]->instr;
+       if (d->opc == OPC_META_FO) {
                struct ir3_instruction *dd;
                int dsz, doff;
 
-               dd = get_definer(phi, &dsz, &doff);
-
-               *sz = MAX2(*sz, dsz);
-               *off = doff;
-
-               if (dd->ip < d->ip) {
-                       d = dd;
-               }
-       }
-
-       if (is_meta(d) && (d->opc == OPC_META_PHI)) {
-               /* we have already inserted parallel-copies into
-                * the phi, so we don't need to chase definers
-                */
-               struct ir3_register *src;
-
-               /* note: don't use foreach_ssa_src as this gets called once
-                * while assigning regs (which clears SSA flag)
-                */
-               foreach_src(src, d) {
-                       if (!src->instr)
-                               continue;
-                       if (src->instr->ip < d->ip)
-                               d = src->instr;
-               }
-       }
-
-       if (is_meta(d) && (d->opc == OPC_META_FO)) {
-               struct ir3_instruction *dd;
-               int dsz, doff;
-
-               dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+               dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
 
                /* by definition, should come before: */
-               debug_assert(dd->ip < d->ip);
+               debug_assert(instr_before(dd, d));
 
                *sz = MAX2(*sz, dsz);
 
-               /* Fanout's are grouped, so *off should already valid */
+               debug_assert(instr->opc == OPC_META_FO);
+               *off = MAX2(*off, instr->fo.off);
 
                d = dd;
        }
 
+       id->defn = d;
+       id->sz = *sz;
+       id->off = *off;
+
        return d;
 }
 
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+               if (instr->regs_count == 0)
+                       continue;
+               /* couple special cases: */
+               if (writes_addr(instr) || writes_pred(instr)) {
+                       id->cls = -1;
+               } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+                       id->cls = total_class_count;
+               } else {
+                       id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+                       id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
+               }
+       }
+}
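
In summary, the id->cls values assigned here (a reading aid for the checks
that follow, using the same conventions as the code above):

    /*
     *   id->cls == -1                     .. no RA node (a0.x/p0.x writers)
     *   0 <= id->cls < total_class_count  .. normal value; full/half/high
     *                                        classes split by HALF_OFFSET
     *                                        and HIGH_OFFSET
     *   id->cls == total_class_count      .. array element, named and
     *                                        pre-colored via arr->base
     */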
+
 /* give each instruction a name (and ip), and count up the # of names
  * of each class
  */
@@ -441,8 +534,11 @@ static void
 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               struct ir3_instruction *defn;
-               int cls, sz, off;
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+
+#ifdef DEBUG
+               instr->name = ~0;
+#endif
 
                ctx->instr_cnt++;
 
@@ -452,19 +548,14 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
                if (!writes_gpr(instr))
                        continue;
 
-               defn = get_definer(instr, &sz, &off);
-
-               if (defn != instr)
+               if (id->defn != instr)
                        continue;
 
                /* arrays are pre-colored (see arr->base) rather than given
                 * a normal vreg name:
-                *
-                * TODO but we still need to allocate names for them, don't we??
                 */
-               cls = size_to_class(sz, is_half(defn));
-               if (cls >= 0) {
-                       instr->name = ctx->class_alloc_count[cls]++;
+               if ((id->cls >= 0) && (id->cls < total_class_count)) {
+                       instr->name = ctx->class_alloc_count[id->cls]++;
                        ctx->alloc_count++;
                }
        }
@@ -473,8 +564,16 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static void
 ra_init(struct ir3_ra_ctx *ctx)
 {
+       unsigned n, base;
+
        ir3_clear_mark(ctx->ir);
-       ir3_count_instructions(ctx->ir);
+       n = ir3_count_instructions(ctx->ir);
+
+       ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_find_definers(ctx, block);
+       }
 
        list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
                ra_block_name_instructions(ctx, block);
@@ -484,26 +583,44 @@ ra_init(struct ir3_ra_ctx *ctx)
         * actual ra name is class_base[cls] + instr->name;
         */
        ctx->class_base[0] = 0;
-       for (unsigned i = 1; i < total_class_count; i++) {
+       for (unsigned i = 1; i <= total_class_count; i++) {
                ctx->class_base[i] = ctx->class_base[i-1] +
                                ctx->class_alloc_count[i-1];
        }
 
+       /* and vreg names for array elements: */
+       base = ctx->class_base[total_class_count];
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               arr->base = base;
+               ctx->class_alloc_count[total_class_count] += arr->length;
+               base += arr->length;
+       }
+       ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
        ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+       ralloc_steal(ctx->g, ctx->instrd);
        ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
        ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 }
 
 static unsigned
-ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 {
        unsigned name;
        debug_assert(cls >= 0);
+       debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
        name = ctx->class_base[cls] + defn->name;
        debug_assert(name < ctx->alloc_count);
        return name;
 }
 
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+       /* TODO handle name mapping for arrays */
+       return __ra_name(ctx, id->cls, id->defn);
+}
+
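
A worked example of the naming arithmetic (hypothetical counts):

    /*
     *   class_alloc_count[] = { 3, 1, 0, 2, ... }
     *   class_base[]        = { 0, 3, 4, 4, 6, ... }     (prefix sums)
     *
     * so the three vec1 values get flat names 0..2, the single vec2
     * value (cls 1, name 0) gets 3, the two vec4 values get 4 and 5,
     * and array element names start at class_base[total_class_count],
     * after all of the per-class names.
     */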
 static void
 ra_destroy(struct ir3_ra_ctx *ctx)
 {
@@ -516,6 +633,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
        struct ir3_ra_block_data *bd;
        unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 
+#define def(name, instr) \
+               do { \
+                       /* defined on first write: */ \
+                       if (!ctx->def[name]) \
+                               ctx->def[name] = instr->ip; \
+                       ctx->use[name] = instr->ip; \
+                       BITSET_SET(bd->def, name); \
+               } while(0)
+
+#define use(name, instr) \
+               do { \
+                       ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+                       if (!BITSET_TEST(bd->def, name)) \
+                               BITSET_SET(bd->use, name); \
+               } while(0)
+
        bd = rzalloc(ctx->g, struct ir3_ra_block_data);
 
        bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
@@ -523,10 +656,11 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
        bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
        bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
 
-       block->bd = bd;
+       block->data = bd;
 
        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
                struct ir3_instruction *src;
+               struct ir3_register *reg;
 
                if (instr->regs_count == 0)
                        continue;
@@ -544,81 +678,97 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
                 *     to texture sample instructions;  We consider these to be
                 *     defined at the earliest fanin source.
                 *
-                * phi: used to merge values from different flow control paths
-                *     to the same reg.  Consider defined at earliest phi src,
-                *     and update all the other phi src's (which may come later
-                *     in the program) as users to extend the var's live range.
-                *
-                * Most of this, other than phi, is completely handled in the
-                * get_definer() helper.
+                * Most of this is handled in the get_definer() helper.
                 *
                 * In either case, we trace the instruction back to the original
                 * definer and consider that as the def/use ip.
                 */
 
                if (writes_gpr(instr)) {
-                       struct ir3_instruction *defn;
-                       int cls, sz, off;
+                       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+                       struct ir3_register *dst = instr->regs[0];
+
+                       if (dst->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, dst->array.id);
+                               unsigned i;
+
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
 
-                       defn = get_definer(instr, &sz, &off);
-                       if (defn == instr) {
-                               /* arrays which don't fit in one of the pre-defined class
-                                * sizes are pre-colored:
+                               /* set the node class now.. in case we don't encounter
+                                * this array dst again.  From register_alloc algo's
+                                * perspective, these are all single/scalar regs:
                                 */
-                               cls = size_to_class(sz, is_half(defn));
-                               if (cls >= 0) {
-                                       unsigned name = ra_name(ctx, cls, defn);
+                               for (i = 0; i < arr->length; i++) {
+                                       unsigned name = arr->base + i;
+                                       ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+                               }
 
-                                       ctx->def[name] = defn->ip;
-                                       ctx->use[name] = defn->ip;
+                               /* indirect write is treated like a write to all array
+                                * elements, since we don't know which one is actually
+                                * written:
+                                */
+                               if (dst->flags & IR3_REG_RELATIV) {
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               def(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + dst->array.offset;
+                                       def(name, instr);
+                               }
 
-                                       /* since we are in SSA at this point: */
-                                       debug_assert(!BITSET_TEST(bd->use, name));
+                       } else if (id->defn == instr) {
+                               unsigned name = ra_name(ctx, id);
 
-                                       BITSET_SET(bd->def, name);
+                               /* since we are in SSA at this point: */
+                               debug_assert(!BITSET_TEST(bd->use, name));
 
-                                       if (is_half(defn)) {
-                                               ra_set_node_class(ctx->g, name,
-                                                               ctx->set->half_classes[cls - class_count]);
-                                       } else {
-                                               ra_set_node_class(ctx->g, name,
-                                                               ctx->set->classes[cls]);
-                                       }
+                               def(name, id->defn);
 
-                                       /* extend the live range for phi srcs, which may come
-                                        * from the bottom of the loop
-                                        */
-                                       if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
-                                               struct ir3_instruction *phi = defn->regs[0]->instr;
-                                               foreach_ssa_src(src, phi) {
-                                                       /* if src is after phi, then we need to extend
-                                                        * the liverange to the end of src's block:
-                                                        */
-                                                       if (src->ip > phi->ip) {
-                                                               struct ir3_instruction *last =
-                                                                       list_last_entry(&src->block->instr_list,
-                                                                               struct ir3_instruction, node);
-                                                               ctx->use[name] = MAX2(ctx->use[name], last->ip);
-                                                       }
-                                               }
-                                       }
+                               if (is_high(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+                               } else if (is_half(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->half_classes[id->cls - HALF_OFFSET]);
+                               } else {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->classes[id->cls]);
                                }
                        }
                }
 
-               foreach_ssa_src(src, instr) {
-                       if (writes_gpr(src)) {
-                               struct ir3_instruction *srcdefn;
-                               int cls, sz, off;
-
-                               srcdefn = get_definer(src, &sz, &off);
-                               cls = size_to_class(sz, is_half(srcdefn));
-                               if (cls >= 0) {
-                                       unsigned name = ra_name(ctx, cls, srcdefn);
-                                       ctx->use[name] = MAX2(ctx->use[name], instr->ip);
-                                       if (!BITSET_TEST(bd->def, name))
-                                               BITSET_SET(bd->use, name);
+               foreach_src(reg, instr) {
+                       if (reg->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, reg->array.id);
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+                * indirect read is treated like a read from all array
+                                * elements, since we don't know which one is actually
+                                * read:
+                                */
+                               if (reg->flags & IR3_REG_RELATIV) {
+                                       unsigned i;
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               use(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + reg->array.offset;
+                                       use(name, instr);
+                                       /* NOTE: arrays are not SSA so unconditionally
+                                        * set use bit:
+                                        */
+                                       BITSET_SET(bd->use, name);
+                                       debug_assert(reg->array.offset < arr->length);
                                }
+                       } else if ((src = ssa(reg)) && writes_gpr(src)) {
+                               unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+                               use(name, instr);
                        }
                }
        }
@@ -631,7 +781,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
        bool progress = false;
 
        list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-               struct ir3_ra_block_data *bd = block->bd;
+               struct ir3_ra_block_data *bd = block->data;
 
                /* update livein: */
                for (unsigned i = 0; i < bitset_words; i++) {
@@ -652,7 +802,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
                        if (!succ)
                                continue;
 
-                       succ_bd = succ->bd;
+                       succ_bd = succ->data;
 
                        for (unsigned i = 0; i < bitset_words; i++) {
                                BITSET_WORD new_liveout =
@@ -669,11 +819,33 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
        return progress;
 }
 
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+       bool first = true;
+       debug_printf("  %s:", name);
+       for (unsigned i = 0; i < cnt; i++) {
+               if (BITSET_TEST(bs, i)) {
+                       if (!first)
+                               debug_printf(",");
+                       debug_printf(" %04u", i);
+                       first = false;
+               }
+       }
+       debug_printf("\n");
+}
+
 static void
 ra_add_interference(struct ir3_ra_ctx *ctx)
 {
        struct ir3 *ir = ctx->ir;
 
+       /* initialize array live ranges: */
+       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+               arr->start_ip = ~0;
+               arr->end_ip = 0;
+       }
+
        /* compute live ranges (use/def) on a block level, also updating
         * block's def/use bitmasks (used below to calculate per-block
         * livein/liveout):
@@ -685,12 +857,30 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
        /* update per-block livein/liveout: */
        while (ra_compute_livein_liveout(ctx)) {}
 
+       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+               debug_printf("AFTER LIVEIN/OUT:\n");
+               ir3_print(ir);
+               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+                       struct ir3_ra_block_data *bd = block->data;
+                       debug_printf("block%u:\n", block_id(block));
+                       print_bitset("  def", bd->def, ctx->alloc_count);
+                       print_bitset("  use", bd->use, ctx->alloc_count);
+                       print_bitset("  l/i", bd->livein, ctx->alloc_count);
+                       print_bitset("  l/o", bd->liveout, ctx->alloc_count);
+               }
+               list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+                       debug_printf("array%u:\n", arr->id);
+                       debug_printf("  length:   %u\n", arr->length);
+                       debug_printf("  start_ip: %u\n", arr->start_ip);
+                       debug_printf("  end_ip:   %u\n", arr->end_ip);
+               }
+       }
+
        /* extend start/end ranges based on livein/liveout info from cfg: */
-       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
        list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               struct ir3_ra_block_data *bd = block->bd;
+               struct ir3_ra_block_data *bd = block->data;
 
-               for (unsigned i = 0; i < bitset_words; i++) {
+               for (unsigned i = 0; i < ctx->alloc_count; i++) {
                        if (BITSET_TEST(bd->livein, i)) {
                                ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
                                ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
@@ -701,26 +891,30 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
                                ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
                        }
                }
+
+               list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+                       for (unsigned i = 0; i < arr->length; i++) {
+                               if (BITSET_TEST(bd->livein, i + arr->base)) {
+                                       arr->start_ip = MIN2(arr->start_ip, block->start_ip);
+                               }
+                               if (BITSET_TEST(bd->liveout, i + arr->base)) {
+                                       arr->end_ip = MAX2(arr->end_ip, block->end_ip);
+                               }
+                       }
+               }
        }
 
        /* need to fix things up to keep outputs live: */
        for (unsigned i = 0; i < ir->noutputs; i++) {
                struct ir3_instruction *instr = ir->outputs[i];
-               struct ir3_instruction *defn;
-               int cls, sz, off;
-
-               defn = get_definer(instr, &sz, &off);
-               cls = size_to_class(sz, is_half(defn));
-               if (cls >= 0) {
-                       unsigned name = ra_name(ctx, cls, defn);
-                       ctx->use[name] = ctx->instr_cnt;
-               }
+               unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+               ctx->use[name] = ctx->instr_cnt;
        }
 
        for (unsigned i = 0; i < ctx->alloc_count; i++) {
                for (unsigned j = 0; j < ctx->alloc_count; j++) {
-                       if (!((ctx->def[i] >= ctx->use[j]) ||
-                                       (ctx->def[j] >= ctx->use[i]))) {
+                       if (intersects(ctx->def[i], ctx->use[i],
+                                       ctx->def[j], ctx->use[j])) {
                                ra_add_node_interference(ctx->g, i, j);
                        }
                }
@@ -730,7 +924,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 /* some instructions need fix-up if dst register is half precision: */
 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 {
-       switch (instr->category) {
+       switch (opc_cat(instr->opc)) {
        case 1: /* move instructions */
                instr->cat1.dst_type = half_type(instr->cat1.dst_type);
                break;
@@ -771,34 +965,54 @@ static void fixup_half_instr_dst(struct ir3_instruction *instr)
 /* some instructions need fix-up if src register is half precision: */
 static void fixup_half_instr_src(struct ir3_instruction *instr)
 {
-       switch (instr->category) {
-       case 1: /* move instructions */
+       switch (instr->opc) {
+       case OPC_MOV:
                instr->cat1.src_type = half_type(instr->cat1.src_type);
                break;
+       default:
+               break;
        }
 }
 
+/* NOTE: instr could be NULL for the IR3_REG_ARRAY case, i.e. for the
+ * first array access(es), which do not have any previous access to
+ * depend on from a scheduling point of view.
+ */
 static void
 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
                struct ir3_instruction *instr)
 {
-       struct ir3_instruction *defn;
-       int cls, sz, off;
+       struct ir3_ra_instr_data *id;
+
+       if (reg->flags & IR3_REG_ARRAY) {
+               struct ir3_array *arr =
+                       ir3_lookup_array(ctx->ir, reg->array.id);
+               unsigned name = arr->base + reg->array.offset;
+               unsigned r = ra_get_node_reg(ctx->g, name);
+               unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+               if (reg->flags & IR3_REG_RELATIV) {
+                       reg->array.offset = num;
+               } else {
+                       reg->num = num;
+                       reg->flags &= ~IR3_REG_SSA;
+               }
 
-       defn = get_definer(instr, &sz, &off);
-       cls = size_to_class(sz, is_half(defn));
-       if (cls >= 0) {
-               unsigned name = ra_name(ctx, cls, defn);
+               reg->flags &= ~IR3_REG_ARRAY;
+       } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+               unsigned name = ra_name(ctx, id);
                unsigned r = ra_get_node_reg(ctx->g, name);
-               unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
+               unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
-               if (reg->flags & IR3_REG_RELATIV)
-                       num += reg->offset;
+               debug_assert(!(reg->flags & IR3_REG_RELATIV));
+
+               if (is_high(id->defn))
+                       num += FIRST_HIGH_REG;
 
                reg->num = num;
-               reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
+               reg->flags &= ~IR3_REG_SSA;
 
-               if (is_half(defn))
+               if (is_half(id->defn))
                        reg->flags |= IR3_REG_HALF;
        }
 }
@@ -820,9 +1034,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
                foreach_src_n(reg, n, instr) {
                        struct ir3_instruction *src = reg->instr;
-                       if (!src)
+                       /* Note: reg->instr could be null for IR3_REG_ARRAY */
+                       if (!(src || (reg->flags & IR3_REG_ARRAY)))
                                continue;
-
                        reg_assign(ctx, instr->regs[n+1], src);
                        if (instr->regs[n+1]->flags & IR3_REG_HALF)
                                fixup_half_instr_src(instr);
@@ -833,41 +1047,42 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static int
 ra_alloc(struct ir3_ra_ctx *ctx)
 {
-       /* frag shader inputs get pre-assigned, since we have some
-        * constraints/unknowns about setup for some of these regs:
+       /* pre-assign array elements:
         */
-       if (ctx->type == SHADER_FRAGMENT) {
-               struct ir3 *ir = ctx->ir;
-               unsigned i = 0, j;
-               if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
-                       struct ir3_instruction *instr = ir->inputs[i];
-                       int cls = size_to_class(1, true);
-                       unsigned name = ra_name(ctx, cls, instr);
-                       unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
-
-                       /* if we have frag_face, it gets hr0.x */
-                       ra_set_node_reg(ctx->g, name, reg);
-                       i += 4;
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               unsigned base = 0;
+
+               if (arr->end_ip == 0)
+                       continue;
+
+               /* figure out what else we conflict with which has already
+                * been assigned:
+                */
+retry:
+               list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+                       if (arr2 == arr)
+                               break;
+                       if (arr2->end_ip == 0)
+                               continue;
+                       /* if it intersects with liverange AND register range.. */
+                       if (intersects(arr->start_ip, arr->end_ip,
+                                       arr2->start_ip, arr2->end_ip) &&
+                               intersects(base, base + arr->length,
+                                       arr2->reg, arr2->reg + arr2->length)) {
+                               base = MAX2(base, arr2->reg + arr2->length);
+                               goto retry;
+                       }
                }
 
-               for (j = 0; i < ir->ninputs; i++) {
-                       struct ir3_instruction *instr = ir->inputs[i];
-                       if (instr) {
-                               struct ir3_instruction *defn;
-                               int cls, sz, off;
+               arr->reg = base;
 
-                               defn = get_definer(instr, &sz, &off);
-                               if (defn == instr) {
-                                       unsigned name, reg;
+               for (unsigned i = 0; i < arr->length; i++) {
+                       unsigned name, reg;
 
-                                       cls = size_to_class(sz, is_half(defn));
-                                       name = ra_name(ctx, cls, defn);
-                                       reg = ctx->set->gpr_to_ra_reg[cls][j];
+                       name = arr->base + i;
+                       reg = ctx->set->gpr_to_ra_reg[0][base++];
 
-                                       ra_set_node_reg(ctx->g, name, reg);
-                                       j += sz;
-                               }
-                       }
+                       ra_set_node_reg(ctx->g, name, reg);
                }
        }
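
To illustrate the greedy placement with hypothetical numbers (live ranges
are ips, register ranges are scalar gpr indices, both half-open as in
intersects()):

    /*
     *   arr1: live [0,10),  length 4 -> reg 0 (occupies [0,4))
     *   arr2: live [5,20),  length 2 -> live range overlaps arr1 and
     *         [0,2) intersects [0,4), so base bumps to 4 -> reg 4
     *   arr3: live [12,30), length 8 -> no live overlap with arr1, but
     *         it overlaps arr2 and [0,8) intersects [4,6), so base
     *         bumps to 6 -> reg 6 (occupies [6,14))
     */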