freedreno/ir3: round-robin RA

author Rob Clark <robdclark@chromium.org>

Fri, 28 Feb 2020 20:48:16 +0000 (12:48 -0800)

committer Marge Bot <eric+marge@anholt.net>

Tue, 10 Mar 2020 16:01:39 +0000 (16:01 +0000)
author Rob Clark <robdclark@chromium.org>
Fri, 28 Feb 2020 20:48:16 +0000 (12:48 -0800)
committer Marge Bot <eric+marge@anholt.net>
Tue, 10 Mar 2020 16:01:39 +0000 (16:01 +0000)
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c

index 39507184105a2a86a47c81ea9d9ffbb5bdc024c7..05bcdcc60b1164ad57e3335f0cb988fa4063a971 100644 (file)
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -346,6 +346,9 @@ struct ir3_ra_ctx {
         unsigned *def, *use;     /* def/use table */
         struct ir3_ra_instr_data *instrd;
  
+       /* Mapping vreg name back to instruction, used select reg callback: */
+       struct hash_table *name_to_instr;
+
         /* Tracking for max half/full register assigned.  We don't need to
          * track high registers.
          *
@@ -354,8 +357,14 @@ struct ir3_ra_ctx {
          */
         unsigned max_assigned;
         unsigned max_half_assigned;
+
+       /* Tracking for select_reg callback */
+       unsigned start_search_reg;
+       unsigned max_target;
  };
  
+static int scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n);
+
  /* does it conflict? */
  static inline bool
  intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
@@ -640,6 +649,101 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
         }
  }
  
+static int
+pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max)
+{
+       for (unsigned i = min; i < max; i++) {
+               if (BITSET_TEST(regs, i)) {
+                       return i;
+               }
+       }
+       return -1;
+}
+
+/* register selector for the a6xx+ merged register file: */
+static unsigned int
+ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data)
+{
+       struct ir3_ra_ctx *ctx = data;
+       unsigned int class = ra_get_node_class(ctx->g, n);
+
+       /* dimensions within the register class: */
+       unsigned max_target, start;
+
+       /* the regs bitset will include *all* of the virtual regs, but we lay
+        * out the different classes consecutively in the virtual register
+        * space.  So we just need to think about the base offset of a given
+        * class within the virtual register space, and offset the register
+        * space we search within by that base offset.
+        */
+       unsigned base;
+
+       /* NOTE: this is only used in scalar pass, so the register
+        * class will be one of the scalar classes (ie. idx==0):
+        */
+       if (class == ctx->set->high_classes[0]) {
+               max_target = HIGH_CLASS_REGS(0);
+               start = 0;
+               base = ctx->set->gpr_to_ra_reg[HIGH_OFFSET][0];
+       } else if (class == ctx->set->half_classes[0]) {
+               max_target = ctx->max_target;
+               start = ctx->start_search_reg;
+               base = ctx->set->gpr_to_ra_reg[HALF_OFFSET][0];
+       } else if (class == ctx->set->classes[0]) {
+               max_target = ctx->max_target / 2;
+               start = ctx->start_search_reg;
+               base = ctx->set->gpr_to_ra_reg[0][0];
+       } else {
+               unreachable("unexpected register class!");
+       }
+
+       /* For cat4 instructions, if the src reg is already assigned, and
+        * avail to pick, use it.  Because this doesn't introduce unnecessary
+        * dependencies, and it potentially avoids needing (ss) syncs to
+        * for write after read hazards:
+        */
+       struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &n);
+       if (entry) {
+               struct ir3_instruction *instr = entry->data;
+
+               if (is_sfu(instr) && instr->regs[1]->instr) {
+                       struct ir3_instruction *src = instr->regs[1]->instr;
+                       unsigned src_n = scalar_name(ctx, src, 0);
+
+                       unsigned reg = ra_get_node_reg(ctx->g, src_n);
+
+                       /* Check if the src register has been assigned yet: */
+                       if (reg != NO_REG) {
+                               if (BITSET_TEST(regs, reg)) {
+                                       return reg;
+                               }
+                       }
+               }
+       }
+
+       int r = pick_in_range(regs, base + start, base + max_target);
+       if (r < 0) {
+               /* wrap-around: */
+               r = pick_in_range(regs, base, base + start);
+       }
+
+       if (r < 0) {
+               /* overflow, we need to increase max_target: */
+               ctx->max_target++;
+               return ra_select_reg_merged(n, regs, data);
+       }
+
+       if (class == ctx->set->half_classes[0]) {
+               int n = r - base;
+               ctx->start_search_reg = (n + 1) % ctx->max_target;
+       } else if (class == ctx->set->classes[0]) {
+               int n = (r - base) * 2;
+               ctx->start_search_reg = (n + 1) % ctx->max_target;
+       }
+
+       return r;
+}
+
  static void
  ra_init(struct ir3_ra_ctx *ctx)
  {
@@ -680,6 +784,14 @@ ra_init(struct ir3_ra_ctx *ctx)
         ralloc_steal(ctx->g, ctx->instrd);
         ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
         ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+
+       /* TODO add selector callback for split (pre-a6xx) register file: */
+       if (ctx->scalar_pass && (ctx->ir->compiler->gpu_id >= 600)) {
+               ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx);
+
+               ctx->name_to_instr = _mesa_hash_table_create(ctx->g,
+                               _mesa_hash_int, _mesa_key_int_equal);
+       }
  }
  
  static unsigned
@@ -837,6 +949,16 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
  
                                         def(name, instr);
  
+                                       if (ctx->name_to_instr && is_sfu(instr)) {
+                                               /* this is slightly annoying, we can't just use an
+                                                * integer on the stack
+                                                */
+                                               unsigned *key = ralloc(ctx->name_to_instr, unsigned);
+                                               *key = name;
+                                               debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key));
+                                               _mesa_hash_table_insert(ctx->name_to_instr, key, instr);
+                                       }
+
                                         if ((instr->opc == OPC_META_INPUT) && first_non_input)
                                                 use(name, first_non_input);
  
@@ -1536,9 +1658,32 @@ ra_sanity_check(struct ir3 *ir)
         }
  }
  
+/* Target is calculated in terms of half-regs (with a full reg
+ * consisting of two half-regs).
+ */
+static void
+ra_calc_merged_register_target(struct ir3_ra_ctx *ctx)
+{
+       const unsigned vec4 = 2 * 4;  // 8 half-regs
+       unsigned t = MAX2(2 * ctx->max_assigned, ctx->max_half_assigned);
+
+       /* second RA pass may have saved some regs, let's try to reclaim
+        * the benefit by adjusting the target downwards slightly:
+        */
+       if (ir3_has_latency_to_hide(ctx->ir)) {
+               if (t > 8 * vec4) {
+                       t -= 2 * vec4;
+               } else if (t > 6 * vec4) {
+                       t -= vec4;
+               }
+       }
+
+       ctx->max_target = t;
+}
+
  static int
  ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
-               unsigned nprecolor, bool scalar_pass)
+               unsigned nprecolor, bool scalar_pass, unsigned *target)
  {
         struct ir3_ra_ctx ctx = {
                         .v = v,
@@ -1548,6 +1693,10 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
         };
         int ret;
  
+       if (scalar_pass) {
+               ctx.max_target = *target;
+       }
+
         ra_init(&ctx);
         ra_add_interference(&ctx);
         ra_precolor(&ctx, precolor, nprecolor);
@@ -1556,7 +1705,16 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
         ret = ra_alloc(&ctx);
         ra_destroy(&ctx);
  
-       printf("#### max_assigned=%u, max_half_assigned=%u\n", ctx.max_assigned, ctx.max_half_assigned);
+       /* In the first pass, calculate the target register usage used in the
+        * second (scalar) pass:
+        */
+       if (!scalar_pass) {
+               /* TODO: round-robin support for pre-a6xx: */
+               if (ctx.ir->compiler->gpu_id >= 600) {
+                       ra_calc_merged_register_target(&ctx);
+               }
+               *target = ctx.max_target;
+       }
  
         return ret;
  }
@@ -1565,10 +1723,11 @@ int
  ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
                 unsigned nprecolor)
  {
+       unsigned target = 0;
         int ret;
  
         /* First pass, assign the vecN (non-scalar) registers: */
-       ret = ir3_ra_pass(v, precolor, nprecolor, false);
+       ret = ir3_ra_pass(v, precolor, nprecolor, false, &target);
         if (ret)
                 return ret;
  
@@ -1578,7 +1737,7 @@ ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
         }
  
         /* Second pass, assign the scalar registers: */
-       ret = ir3_ra_pass(v, precolor, nprecolor, true);
+       ret = ir3_ra_pass(v, precolor, nprecolor, true, &target);
         if (ret)
                 return ret;
author	Rob Clark <robdclark@chromium.org>
	Fri, 28 Feb 2020 20:48:16 +0000 (12:48 -0800)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 10 Mar 2020 16:01:39 +0000 (16:01 +0000)