return NULL;
for (; i < nsrcs; i++)
- if (srcs[i] && (srcs[i]->sun > d->sun))
+ if (srcs[i] && (srcs[i]->depth > d->depth))
d = srcs[id = i];
srcs[id] = NULL;
return d;
}
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+ int new_live = dest_regs(instr);
+ int old_live = 0;
+
+ foreach_ssa_src_n(src, n, instr) {
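+ /* a false dep is an ordering-only dependency, not a value that is consumed: */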
+ if (__is_false_dep(instr, n))
+ continue;
+
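+ /* srcs defined in other blocks don't factor into this block's live count: */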
+ if (instr->block != src->block)
+ continue;
+
+ /* for fanout/split, just pass things along to the real src: */
+ if (src->opc == OPC_META_FO)
+ src = ssa(src->regs[1]);
+
+ /* for fanin/collect, if this is the last use of *each* src,
+ * then it will decrease the live values, since RA treats
+ * them as a whole:
+ */
+ if (src->opc == OPC_META_FI) {
+ struct ir3_instruction *src2;
+ bool last_use = true;
+
+ foreach_ssa_src(src2, src) {
+ if (src2->use_count > 1) {
+ last_use = false;
+ break;
+ }
+ }
+
+ if (last_use)
+ old_live += dest_regs(src);
+
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (src->use_count == 1) {
+ old_live += dest_regs(src);
+ }
+ }
+ }
+
+ return new_live - old_live;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
bool soft)
{
struct ir3_instruction *best_instr = NULL;
- unsigned min_delay = ~0;
+ int best_rank = INT_MAX; /* lower is better */
+ unsigned deepest = 0;
/* TODO we'd really rather use the list/array of block outputs. But we
* don't have such a thing. Recursing *every* instruction in the list
* won't scale well.
*/
list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
struct ir3_instruction *candidate;
- unsigned delay;
candidate = find_instr_recursive(ctx, notes, instr);
if (!candidate)
continue;
- if (ctx->live_values > 16*4) {
- /* under register pressure, only care about reducing live values: */
- if (!best_instr || (candidate->sun > best_instr->sun))
- best_instr = candidate;
- } else {
- delay = delay_calc(ctx->block, candidate, soft, false);
- if ((delay < min_delay) ||
- ((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
- best_instr = candidate;
- min_delay = delay;
+ deepest = MAX2(deepest, candidate->depth);
+ }
+
+ /* traverse the list a second time.. but since we cache the result of
+ * find_instr_recursive() it isn't as bad as it looks.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ /* determine net change to # of live values: */
+ int le = live_effect(candidate);
+
+ /* if there is a net increase in # of live values, then apply some
+ * threshold to avoid instructions getting scheduled *too* early
+ * and increasing register pressure.
+ */
+ if (le >= 1) {
+ unsigned threshold;
+
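+ /* be more conservative once the live value count is already elevated: */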
+ if (ctx->live_values > 4*4) {
+ threshold = 4;
+ } else {
+ threshold = 6;
}
+
+ /* Filter out any "shallow" instructions which would otherwise
+ * tend to get scheduled too early to fill delay slots even
+ * when they are not needed for a while. There will probably
+ * be later delay slots that they could just as easily fill.
+ *
+ * A classic case where this comes up is frag shaders that
+ * write a constant value (like 1.0f) to one of the channels
+ * of the output color(s). Since the mov from immed has no
+ * dependencies, it would otherwise get scheduled early to
+ * fill delay slots, occupying a register until the end of
+ * the program.
+ */
+ if ((deepest - candidate->depth) > threshold)
+ continue;
+ }
+
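+ /* rank candidates primarily by how many delay cycles they would need to wait on their srcs: */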
+ int rank = delay_calc(ctx->block, candidate, soft, false);
+
+ /* if too many live values, prioritize instructions that reduce the
+ * number of live values:
+ */
+ if (ctx->live_values > 16*4) {
+ rank = le;
+ } else if (ctx->live_values > 4*4) {
+ rank += le;
+ }
+
+ if (rank < best_rank) {
+ best_instr = candidate;
+ best_rank = rank;
}
}