freedreno/ir3: intra-block scheduling

author Rob Clark <robdclark@gmail.com>

Mon, 5 Feb 2018 13:45:29 +0000 (08:45 -0500)

committer Rob Clark <robdclark@gmail.com>

Sat, 10 Feb 2018 19:54:58 +0000 (14:54 -0500)
author Rob Clark <robdclark@gmail.com>
Mon, 5 Feb 2018 13:45:29 +0000 (08:45 -0500)
committer Rob Clark <robdclark@gmail.com>
Sat, 10 Feb 2018 19:54:58 +0000 (14:54 -0500)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c

index 72bd0b2ce0701538bf1f95b911a1167de737e4ba..9269f471c56b3ea45faaeec651653b8ab383af6c 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -136,29 +136,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
         return d;
  }
  
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not yet, ie. it only contains already scheduled
+ *    instructions).  For intra-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled
+ */
  static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-               unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred)
  {
-       struct list_head *instr_list = &ctx->block->instr_list;
         unsigned d = 0;
  
-       list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
                 if ((n == instr) || (d >= maxd))
-                       break;
-               if (is_alu(n) || is_flow(n))
+                       return d;
+               /* NOTE: don't count branch/jump since we don't know yet if they will
+                * be eliminated later in resolve_jumps().. really should do that
+                * earlier so we don't have this constraint.
+                */
+               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
                         d++;
         }
  
+       /* if coming from a predecessor block, assume it is assigned far
+        * enough away.. we'll fix up later.
+        */
+       if (!pred)
+               return maxd;
+
+       if (pred && (block->data != block)) {
+               /* Search into predecessor blocks, finding the one with the
+                * shortest distance, since that will be the worst case
+                */
+               unsigned min = maxd - d;
+
+               /* (ab)use block->data to prevent recursion: */
+               block->data = block;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned n;
+
+                       n = distance(block->predecessors[i], instr, min, pred);
+
+                       min = MIN2(min, n);
+               }
+
+               block->data = NULL;
+               d += min;
+       }
+
         return d;
  }
  
  /* calculate delay for specified src: */
  static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
                 struct ir3_instruction *assigner,
                 struct ir3_instruction *consumer,
-               unsigned srcn, bool soft)
+               unsigned srcn, bool soft, bool pred)
  {
         unsigned delay = 0;
  
@@ -166,9 +210,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
                 struct ir3_instruction *src;
                 foreach_ssa_src(src, assigner) {
                         unsigned d;
-                       if (src->block != assigner->block)
-                               break;
-                       d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
+                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
                         delay = MAX2(delay, d);
                 }
         } else {
@@ -181,7 +223,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
                 } else {
                         delay = ir3_delayslots(assigner, consumer, srcn);
                 }
-               delay -= distance(ctx, assigner, delay);
+               delay -= distance(block, assigner, delay, pred);
         }
  
         return delay;
@@ -189,19 +231,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
  
  /* calculate delay for instruction (maximum of delay for all srcs): */
  static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred)
  {
         unsigned delay = 0;
         struct ir3_instruction *src;
  
         foreach_ssa_src_n(src, i, instr) {
                 unsigned d;
-               /* for array writes, no need to delay on previous write: */
-               if (i == 0)
-                       continue;
-               if (src->block != instr->block)
-                       continue;
-               d = delay_calc_srcn(ctx, src, instr, i, soft);
+               d = delay_calc_srcn(block, src, instr, i, soft, pred);
                 delay = MAX2(delay, d);
         }
  
@@ -396,7 +434,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                 if (!candidate)
                         continue;
  
-               delay = delay_calc(ctx, candidate, soft);
+               delay = delay_calc(ctx->block, candidate, soft, false);
                 if (delay < min_delay) {
                         best_instr = candidate;
                         min_delay = delay;
@@ -537,7 +575,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                         instr = find_eligible_instr(ctx, &notes, false);
  
                 if (instr) {
-                       unsigned delay = delay_calc(ctx, instr, false);
+                       unsigned delay = delay_calc(ctx->block, instr, false, false);
  
                         /* and if we run out of instructions that can be scheduled,
                          * then it is time for nop's:
@@ -594,7 +632,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                 debug_assert(ctx->pred);
                 debug_assert(block->condition);
  
-               delay -= distance(ctx, ctx->pred, delay);
+               delay -= distance(ctx->block, ctx->pred, delay, false);
  
                 while (delay > 0) {
                         ir3_NOP(block);
@@ -633,14 +671,58 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
          */
  }
  
+/* After scheduling individual blocks, we still could have cases where
+ * one (or more) paths into a block, a value produced by a previous
+ * has too few delay slots to be legal.  We can't deal with this in the
+ * first pass, because loops (ie. we can't ensure all predecessor blocks
+ * are already scheduled in the first pass).  All we can really do at
+ * this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+       unsigned n = 0;
+
+       ctx->block = block;
+
+       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+               unsigned delay = 0;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+                       delay = MAX2(d, delay);
+               }
+
+               while (delay > n) {
+                       struct ir3_instruction *nop = ir3_NOP(block);
+
+                       /* move to before instr: */
+                       list_delinit(&nop->node);
+                       list_addtail(&nop->node, &instr->node);
+
+                       n++;
+               }
+
+               /* we can bail once we hit worst case delay: */
+               if (++n > 6)
+                       break;
+       }
+}
+
  int ir3_sched(struct ir3 *ir)
  {
         struct ir3_sched_ctx ctx = {0};
  
         ir3_clear_mark(ir);
+
         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
                 sched_block(&ctx, block);
         }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_intra_block(&ctx, block);
+       }
+
         if (ctx.error)
                 return -1;
         return 0;
author	Rob Clark <robdclark@gmail.com>
	Mon, 5 Feb 2018 13:45:29 +0000 (08:45 -0500)
committer	Rob Clark <robdclark@gmail.com>
	Sat, 10 Feb 2018 19:54:58 +0000 (14:54 -0500)