struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
+ int live_values; /* estimate of current live values */
bool error;
};
return is_sfu(instr) || is_mem(instr);
}
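+/* Account for a newly scheduled instruction consuming its SSA sources:
+ * decrement each source's use_count, recursing through fanin/fanout meta
+ * instructions (which are treated as pass-throughs for the liveness
+ * accounting).  When a source has no remaining uses, its register
+ * footprint is subtracted from the live_values estimate.  Sources in
+ * other blocks are skipped, since live values are tracked per block.
+ */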
+static void
+unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+ if (instr->block != src->block)
+ continue;
+ if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
+ unuse_each_src(ctx, src);
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (--src->use_count == 0) {
+ ctx->live_values -= dest_regs(src);
+ debug_assert(ctx->live_values >= 0);
+ }
+ }
+ }
+}
+
+static void use_instr(struct ir3_instruction *instr);
+
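+/* use_each_src()/use_instr() are mutually recursive: they bump use_count
+ * on each (non-false-dep) SSA source of an instruction, looking through
+ * fanin/fanout meta instructions to the underlying sources.
+ */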
+static void
+use_each_src(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+ use_instr(src);
+ }
+}
+
+static void
+use_instr(struct ir3_instruction *instr)
+{
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
+ use_each_src(instr);
+ } else {
+ instr->use_count++;
+ }
+}
+
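+/* Called as an instruction is scheduled: account for its destination
+ * register footprint becoming live and release its now-consumed sources
+ * (see unuse_each_src()).  Fanin/fanout meta instructions are ignored.
+ */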
+static void
+update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ return;
+
+ ctx->live_values += dest_regs(instr);
+ unuse_each_src(ctx, instr);
+}
+
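+/* (Re)compute use_count for every instruction: each non-meta instruction
+ * contributes a use to each of its (non-false-dep) SSA sources, and each
+ * shader output counts as an additional use.
+ */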
+static void
+update_use_count(struct ir3 *ir)
+{
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ instr->use_count = 0;
+ }
+ }
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
+ continue;
+
+ use_each_src(instr);
+ }
+ }
+
+ /* Shader outputs are also used:
+ */
+ for (unsigned i = 0; i < ir->noutputs; i++) {
+ struct ir3_instruction *out = ir->outputs[i];
+
+ if (!out)
+ continue;
+
+ use_instr(out);
+ }
+}
+
#define NULL_INSTR ((void *)~0)
static void
list_addtail(&instr->node, &instr->block->instr_list);
ctx->scheduled = instr;
+ update_live_values(ctx, instr);
+
if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
clear_cache(ctx, NULL);
} else {
/* find unscheduled srcs: */
foreach_ssa_src(src, instr) {
- if (!is_scheduled(src)) {
+ if (!is_scheduled(src) && (src->block == instr->block)) {
debug_assert(nsrcs < ARRAY_SIZE(srcs));
srcs[nsrcs++] = src;
}
return NULL;
}
+/* find net change to live values if instruction were scheduled: */
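+/* (e.g. a scalar ALU instruction whose two scalar sources are both at
+ * their last use would give new_live = 1 and old_live = 2, ie. a net
+ * live_effect of -1)
+ */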
+static int
+live_effect(struct ir3_instruction *instr)
+{
+ struct ir3_instruction *src;
+ int new_live = dest_regs(instr);
+ int old_live = 0;
+
+ foreach_ssa_src_n(src, n, instr) {
+ if (__is_false_dep(instr, n))
+ continue;
+
+ if (instr->block != src->block)
+ continue;
+
+ /* for fanout/split, just pass things along to the real src: */
+ if (src->opc == OPC_META_FO)
+ src = ssa(src->regs[1]);
+
+ /* for fanin/collect, if this is the last use of *each* src,
+ * then it will decrease the live values, since RA treats
+ * them as a whole:
+ */
+ if (src->opc == OPC_META_FI) {
+ struct ir3_instruction *src2;
+ bool last_use = true;
+
+ foreach_ssa_src(src2, src) {
+ if (src2->use_count > 1) {
+ last_use = false;
+ break;
+ }
+ }
+
+ if (last_use)
+ old_live += dest_regs(src);
+
+ } else {
+ debug_assert(src->use_count > 0);
+
+ if (src->use_count == 1) {
+ old_live += dest_regs(src);
+ }
+ }
+ }
+
+ return new_live - old_live;
+}
+
/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
bool soft)
{
struct ir3_instruction *best_instr = NULL;
- unsigned min_delay = ~0;
+ int best_rank = INT_MAX; /* lower is better */
+ unsigned deepest = 0;
/* TODO we'd really rather use the list/array of block outputs. But we
* don't have such a thing. Recursing *every* instruction in the list
*/
list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
struct ir3_instruction *candidate;
- unsigned delay;
candidate = find_instr_recursive(ctx, notes, instr);
if (!candidate)
continue;
- delay = delay_calc(ctx->block, candidate, soft, false);
- if (delay < min_delay) {
- best_instr = candidate;
- min_delay = delay;
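+ /* candidates that are meta instructions (which do not map to real hw
+ * instructions) are taken immediately; otherwise just note the deepest
+ * candidate for the second pass below:
+ */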
+ if (is_meta(candidate))
+ return candidate;
+
+ deepest = MAX2(deepest, candidate->depth);
+ }
+
+ /* traverse the list a second time.. but since we cache the result of
+ * find_instr_recursive() it isn't as bad as it looks.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
+
+ /* determine net change to # of live values: */
+ int le = live_effect(candidate);
+
+ /* if there is a net increase in # of live values, then apply some
+ * threshold to avoid instructions getting scheduled *too* early
+ * and increasing register pressure.
+ */
+ if (le >= 1) {
+ unsigned threshold;
+
+ if (ctx->live_values > 4*4) {
+ threshold = 4;
+ } else {
+ threshold = 6;
+ }
+
+ /* Filter out any "shallow" instructions which would otherwise
+ * tend to get scheduled too early to fill delay slots even
+ * when they are not needed for a while. There will probably
+ * be later delay slots that they could just as easily fill.
+ *
+ * A classic case where this comes up is frag shaders that
+ * write a constant value (like 1.0f) to one of the channels
+ * of the output color(s). Since the mov from immed has no
+ * dependencies, it would otherwise get scheduled early to
+ * fill delay slots, occupying a register until the end of
+ * the program.
+ */
+ if ((deepest - candidate->depth) > threshold)
+ continue;
}
- if (min_delay == 0)
- break;
+ int rank = delay_calc(ctx->block, candidate, soft, false);
+
+ /* if too many live values, prioritize instructions that reduce the
+ * number of live values:
+ */
+ if (ctx->live_values > 16*4) {
+ rank = le;
+ } else if (ctx->live_values > 4*4) {
+ rank += le;
+ }
+
+ if (rank < best_rank) {
+ best_instr = candidate;
+ best_rank = rank;
+ }
}
return best_instr;
struct ir3_sched_ctx ctx = {0};
ir3_clear_mark(ir);
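+ /* pre-compute SSA use counts, used to track live values while scheduling: */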
+ update_use_count(ir);
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ ctx.live_values = 0;
sched_block(&ctx, block);
}
if (ctx.error)
return -1;
+
return 0;
}
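+/* Find the array id referenced by an instruction's array src or dst
+ * register, used below to limit false dependencies between accesses to
+ * different arrays:
+ */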
+static unsigned
+get_array_id(struct ir3_instruction *instr)
+{
+ /* The expectation is that there is only a single array
+ * src or dst; ir3_cp should enforce this.
+ */
+
+ for (unsigned i = 0; i < instr->regs_count; i++)
+ if (instr->regs[i]->flags & IR3_REG_ARRAY)
+ return instr->regs[i]->array.id;
+
+ unreachable("this was unexpected");
+}
+
/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
return true;
- return !!(instr->barrier_class & prior->barrier_conflict);
+
+ if (instr->barrier_class & prior->barrier_conflict) {
+ if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
+ /* if only array barrier, then we can further limit false-deps
+ * by considering the array-id, ie reads/writes to different
+ * arrays do not depend on each other (no aliasing)
+ */
+ if (get_array_id(instr) != get_array_id(prior)) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ return false;
}
static void