freedreno: Remove the Emacs mode lines
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_sched.c
index 8f640febc5da6b691a3668278fd85d7b4fb0de13..6552980d90c4317fbabfe34277208aef1255ba65 100644
@@ -1,5 +1,3 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
 /*
  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
  *
@@ -136,28 +134,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
        return d;
 }
 
+/**
+ * @block: the block to search in, starting from the end; in the first
+ *    pass, this will be the block the instruction would be inserted
+ *    into (but has not been yet, ie. it only contains already
+ *    scheduled instructions).  For intra-block scheduling (second
+ *    pass), this would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance; bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
 static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
-               unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred)
 {
-       struct list_head *instr_list = &ctx->block->instr_list;
        unsigned d = 0;
 
-       list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
                if ((n == instr) || (d >= maxd))
-                       break;
-               if (is_alu(n) || is_flow(n))
+                       return d;
+               /* NOTE: don't count branch/jump since we don't know yet if they will
+                * be eliminated later in resolve_jumps().. really should do that
+                * earlier so we don't have this constraint.
+                */
+               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
                        d++;
        }
 
+       /* the assigner must be in a predecessor block; if we aren't
+        * searching into predecessors, assume it is far enough away..
+        * the second (intra-block) pass will fix things up.
+        */
+       if (!pred)
+               return maxd;
+
+       if (block->data != block) {
+               /* Search into predecessor blocks, finding the one with the
+                * shortest distance, since that will be the worst case
+                */
+               unsigned min = maxd - d;
+
+               /* (ab)use block->data to prevent recursion: */
+               block->data = block;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned n;
+
+                       n = distance(block->predecessors[i], instr, min, pred);
+
+                       min = MIN2(min, n);
+               }
+
+               block->data = NULL;
+               d += min;
+       }
+
        return d;
 }
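
The predecessor search above is worth unpacking: of all paths that can
reach a block, the shortest one is the binding constraint, hence taking
MIN2() across predecessors and (ab)using block->data as a recursion
guard against loops. A minimal standalone sketch of the same search
(toy_block, toy_distance, and all counts are invented for illustration;
the real code walks actual ir3 instruction lists):

    #include <stdio.h>

    #define MIN2(a, b) ((a) < (b) ? (a) : (b))
    #define MAX_PREDS 4

    /* toy stand-in for ir3_block; in this model the producer of the
     * value we care about is the last instruction of its block:
     */
    struct toy_block {
        unsigned ninstr;                  /* counted instruction slots */
        struct toy_block *preds[MAX_PREDS];
        unsigned npreds;
        struct toy_block *visiting;       /* recursion guard, like block->data */
    };

    static unsigned
    toy_distance(struct toy_block *block, struct toy_block *producer,
            unsigned maxd)
    {
        if (block == producer)
            return 0;

        unsigned d = MIN2(block->ninstr, maxd);
        if (d >= maxd || block->visiting == block)
            return d;

        /* worst case (shortest) distance over all predecessors: */
        unsigned min = maxd - d;
        block->visiting = block;
        for (unsigned i = 0; i < block->npreds; i++)
            min = MIN2(min, toy_distance(block->preds[i], producer, min));
        block->visiting = NULL;

        return d + min;
    }

    int main(void)
    {
        /* diamond CFG: p -> {b, c} -> d, with b shorter than c */
        struct toy_block p = { .ninstr = 5 };
        struct toy_block b = { .ninstr = 1, .preds = { &p }, .npreds = 1 };
        struct toy_block c = { .ninstr = 3, .preds = { &p }, .npreds = 1 };
        struct toy_block d = { .preds = { &b, &c }, .npreds = 2 };

        /* worst case from the top of d back to p's result is via b: 1 */
        printf("distance = %u\n", toy_distance(&d, &p, 6));
        return 0;
    }

So a consumer at the top of d that needs, say, 3 delay slots from p
still owes 2, which is exactly the kind of gap sched_intra_block()
below pads with nops.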
 
 /* calculate delay for specified src: */
 static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
                struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned srcn)
+               struct ir3_instruction *consumer,
+               unsigned srcn, bool soft, bool pred)
 {
        unsigned delay = 0;
 
@@ -165,14 +208,20 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *src;
                foreach_ssa_src(src, assigner) {
                        unsigned d;
-                       if (src->block != assigner->block)
-                               break;
-                       d = delay_calc_srcn(ctx, src, consumer, srcn);
+                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
                        delay = MAX2(delay, d);
                }
        } else {
-               delay = ir3_delayslots(assigner, consumer, srcn);
-               delay -= distance(ctx, assigner, delay);
+               if (soft) {
+                       /* "soft" delay: prefer (but do not require) enough
+                        * distance to hide sfu latency:
+                        */
+                       if (is_sfu(assigner)) {
+                               delay = 4;
+                       } else {
+                               delay = ir3_delayslots(assigner, consumer, srcn);
+                       }
+               } else {
+                       delay = ir3_delayslots(assigner, consumer, srcn);
+               }
+               delay -= distance(block, assigner, delay, pred);
        }
 
        return delay;
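
The effect of the soft/hard split above: with soft=true an sfu assigner
is charged a flat 4-slot latency estimate, presumably so the first
scheduling pass prefers candidates that cover that latency with other
useful work, while the hard numbers from ir3_delayslots() remain the
legality bound. A tiny sketch of the arithmetic (the TOY_* values are
made up; the real numbers come from ir3_delayslots()):

    #include <stdbool.h>
    #include <stdio.h>

    /* illustrative values only; the real ones depend on the
     * instruction categories involved:
     */
    #define TOY_HARD_DELAY     3
    #define TOY_SOFT_SFU_DELAY 4

    /* remaining delay owed to one assigner, given how many instruction
     * slots already separate it from the consumer (cf. delay_calc_srcn):
     */
    static unsigned
    toy_src_delay(bool assigner_is_sfu, bool soft, unsigned dist)
    {
        unsigned need = (soft && assigner_is_sfu) ? TOY_SOFT_SFU_DELAY
                                                  : TOY_HARD_DELAY;
        return (dist >= need) ? 0 : need - dist;
    }

    int main(void)
    {
        /* an sfu result two slots back: the soft estimate still asks
         * for two more slots of other work, hard legality only one:
         */
        printf("soft: %u\n", toy_src_delay(true, true, 2));   /* -> 2 */
        printf("hard: %u\n", toy_src_delay(true, false, 2));  /* -> 1 */
        return 0;
    }

Note that in the real code the subtraction cannot underflow, since
distance() never reports more than the maxd it was asked about; the
toy clamps explicitly instead.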
@@ -180,19 +229,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
 static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred)
 {
        unsigned delay = 0;
        struct ir3_instruction *src;
 
        foreach_ssa_src_n(src, i, instr) {
                unsigned d;
-               /* for array writes, no need to delay on previous write: */
-               if (i == 0)
-                       continue;
-               if (src->block != instr->block)
-                       continue;
-               d = delay_calc_srcn(ctx, src, instr, i);
+               d = delay_calc_srcn(block, src, instr, i, soft, pred);
                delay = MAX2(delay, d);
        }
 
@@ -367,7 +412,8 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 
 /* find instruction to schedule: */
 static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               bool soft)
 {
        struct ir3_instruction *best_instr = NULL;
        unsigned min_delay = ~0;
@@ -386,7 +432,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
                if (!candidate)
                        continue;
 
-               delay = delay_calc(ctx, candidate);
+               delay = delay_calc(ctx->block, candidate, soft, false);
                if (delay < min_delay) {
                        best_instr = candidate;
                        min_delay = delay;
@@ -506,13 +552,12 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
        list_inithead(&block->instr_list);
        list_inithead(&ctx->depth_list);
 
-       /* first a pre-pass to schedule all meta:input/phi instructions
+       /* first a pre-pass to schedule all meta:input instructions
         * (which need to appear first so that RA knows the register is
         * occupied), and move remaining to depth sorted list:
         */
        list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-               if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
-                               (instr->opc == OPC_META_PHI))) {
+               if (instr->opc == OPC_META_INPUT) {
                        schedule(ctx, instr);
                } else {
                        ir3_insert_by_depth(instr, &ctx->depth_list);
@@ -523,10 +568,12 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                struct ir3_sched_notes notes = {0};
                struct ir3_instruction *instr;
 
-               instr = find_eligible_instr(ctx, &notes);
+               /* first try with "soft" delays, preferring to hide latency;
+                * if nothing qualifies, settle for what is merely legal:
+                */
+               instr = find_eligible_instr(ctx, &notes, true);
+               if (!instr)
+                       instr = find_eligible_instr(ctx, &notes, false);
 
                if (instr) {
-                       unsigned delay = delay_calc(ctx, instr);
+                       unsigned delay = delay_calc(ctx->block, instr, false, false);
 
                        /* and if we run out of instructions that can be scheduled,
                         * then it is time for nop's:
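
The pair of calls above is a simple fallback: first look for a
candidate under the conservative soft delays, and only when that comes
back empty retry under the hard (legality-only) delays; if both fail,
the nop path kicks in. A compact sketch of that shape (toy_cand,
toy_pick, and the delay numbers are invented; the real pass walks the
depth list and keeps the minimum-delay candidate):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_cand { const char *name; unsigned soft, hard; };

    /* pick the minimum-delay candidate under the chosen rule,
     * rejecting anything at or past the cutoff:
     */
    static const struct toy_cand *
    toy_pick(const struct toy_cand *c, size_t n, bool soft, unsigned cutoff)
    {
        const struct toy_cand *best = NULL;
        unsigned min_delay = cutoff;

        for (size_t i = 0; i < n; i++) {
            unsigned d = soft ? c[i].soft : c[i].hard;
            if (d < min_delay) {
                best = &c[i];
                min_delay = d;
            }
        }
        return best;
    }

    int main(void)
    {
        /* soft delay 7 (sfu latency not yet covered), hard delay 3 */
        struct toy_cand cands[] = { { "sfu_user", 7, 3 } };
        const struct toy_cand *pick;

        pick = toy_pick(cands, 1, true, 7);       /* soft pass: nothing */
        if (!pick)
            pick = toy_pick(cands, 1, false, 7);  /* hard pass: ok */
        printf("picked: %s\n", pick ? pick->name : "(insert nop)");
        return 0;
    }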
@@ -583,7 +630,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                debug_assert(ctx->pred);
                debug_assert(block->condition);
 
-               delay -= distance(ctx, ctx->pred, delay);
+               delay -= distance(ctx->block, ctx->pred, delay, false);
 
                while (delay > 0) {
                        ir3_NOP(block);
@@ -622,36 +669,150 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
         */
 }
 
-/* this is needed to ensure later RA stage succeeds: */
+/* After scheduling individual blocks, we could still have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this
+ * in the first pass because of loops (ie. we can't ensure all
+ * predecessor blocks are already scheduled in the first pass).  All we
+ * can really do at this point is stuff in extra nop's until things are
+ * legal.
+ */
 static void
-sched_insert_parallel_copies(struct ir3_block *block)
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
-                       struct ir3_register *reg;
-                       foreach_src(reg, instr) {
-                               struct ir3_instruction *src = reg->instr;
-                               struct ir3_instruction *mov =
-                                       ir3_MOV(src->block, src, TYPE_U32);
-                               mov->regs[0]->flags |= IR3_REG_PHI_SRC;
-                               mov->regs[0]->instr = instr;
-                               reg->instr = mov;
-                       }
+       unsigned n = 0;
+
+       ctx->block = block;
+
+       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+               unsigned delay = 0;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+                       delay = MAX2(d, delay);
                }
+
+               while (delay > n) {
+                       struct ir3_instruction *nop = ir3_NOP(block);
+
+                       /* move to before instr: */
+                       list_delinit(&nop->node);
+                       list_addtail(&nop->node, &instr->node);
+
+                       n++;
+               }
+
+               /* we can bail once we hit worst case delay: */
+               if (++n > 6)
+                       break;
        }
 }
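
The nop placement relies on a property of the util/list.h circular
list: list_addtail(&nop->node, &instr->node) links the nop immediately
before instr, because "addtail" is simply "insert before the given
node" and the list head is itself a node (ir3_NOP() appends to the
block, hence the list_delinit() to unlink it first). A self-contained
miniature (struct node and toy_instr stand in for list_head and
ir3_instruction):

    #include <stdio.h>

    struct node { struct node *prev, *next; };

    static void list_inithead(struct node *n) { n->prev = n->next = n; }

    /* link 'item' immediately before 'before': */
    static void list_addtail(struct node *item, struct node *before)
    {
        item->prev = before->prev;
        item->next = before;
        before->prev->next = item;
        before->prev = item;
    }

    struct toy_instr { struct node node; const char *name; };

    int main(void)
    {
        struct node list;
        struct toy_instr mad = { .name = "mad" };
        struct toy_instr nop = { .name = "nop" };

        list_inithead(&list);
        list_addtail(&mad.node, &list);      /* append at tail */
        list_addtail(&nop.node, &mad.node);  /* pad: nop before mad */

        for (struct node *n = list.next; n != &list; n = n->next)
            printf("%s\n", ((struct toy_instr *)n)->name);
        return 0;
    }

Prints "nop" then "mad", matching the move-to-before-instr comment
above.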
 
 int ir3_sched(struct ir3 *ir)
 {
        struct ir3_sched_ctx ctx = {0};
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               sched_insert_parallel_copies(block);
-       }
+
        ir3_clear_mark(ir);
+
        list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
                sched_block(&ctx, block);
        }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_intra_block(&ctx, block);
+       }
+
        if (ctx.error)
                return -1;
        return 0;
 }
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+       /* TODO for dependencies that are related to a specific object, ie
+        * a specific SSBO/image/array, we could relax this constraint to
+        * make accesses to unrelated objects not depend on each other (at
+        * least as long as not declared coherent)
+        */
+       if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
+                       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
+               return true;
+       return !!(instr->barrier_class & prior->barrier_conflict);
+}
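
To see the conflict test in action, here is a runnable miniature with
invented flag values (TOY_BARRIER_*; the real IR3_BARRIER_* flags live
in ir3.h): a write conflicts with prior reads and writes, while two
reads of the same object stay reorderable.

    #include <stdbool.h>
    #include <stdio.h>

    #define TOY_BARRIER_SSBO_R     (1 << 0)
    #define TOY_BARRIER_SSBO_W     (1 << 1)
    #define TOY_BARRIER_EVERYTHING (1 << 2)

    struct toy_instr {
        const char *name;
        unsigned barrier_class;     /* what kind of access this is */
        unsigned barrier_conflict;  /* what it must not be reordered across */
    };

    /* same shape as depends_on() above: */
    static bool
    toy_depends_on(const struct toy_instr *instr, const struct toy_instr *prior)
    {
        if (((instr->barrier_class & TOY_BARRIER_EVERYTHING) && prior->barrier_class) ||
                ((prior->barrier_class & TOY_BARRIER_EVERYTHING) && instr->barrier_class))
            return true;
        return !!(instr->barrier_class & prior->barrier_conflict);
    }

    int main(void)
    {
        struct toy_instr ld = { "load", TOY_BARRIER_SSBO_R, TOY_BARRIER_SSBO_W };
        struct toy_instr st = { "store", TOY_BARRIER_SSBO_W,
                                TOY_BARRIER_SSBO_R | TOY_BARRIER_SSBO_W };

        printf("store after load: %d\n", toy_depends_on(&st, &ld)); /* 1 */
        printf("load after load:  %d\n", toy_depends_on(&ld, &ld)); /* 0 */
        return 0;
    }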
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+       struct list_head *prev = instr->node.prev;
+       struct list_head *next = instr->node.next;
+
+       /* add dependencies on previous instructions that must be scheduled
+        * prior to the current instruction
+        */
+       while (prev != &block->instr_list) {
+               struct ir3_instruction *pi =
+                       LIST_ENTRY(struct ir3_instruction, prev, node);
+
+               prev = prev->prev;
+
+               if (is_meta(pi))
+                       continue;
+
+               if (instr->barrier_class == pi->barrier_class) {
+                       ir3_instr_add_dep(instr, pi);
+                       break;
+               }
+
+               if (depends_on(instr, pi))
+                       ir3_instr_add_dep(instr, pi);
+       }
+
+       /* add dependencies on this instruction to following instructions
+        * that must be scheduled after the current instruction:
+        */
+       while (next != &block->instr_list) {
+               struct ir3_instruction *ni =
+                       LIST_ENTRY(struct ir3_instruction, next, node);
+
+               next = next->next;
+
+               if (is_meta(ni))
+                       continue;
+
+               if (instr->barrier_class == ni->barrier_class) {
+                       ir3_instr_add_dep(ni, instr);
+                       break;
+               }
+
+               if (depends_on(ni, instr))
+                       ir3_instr_add_dep(ni, instr);
+       }
+}
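
The early break when barrier_class matches exactly is a transitivity
argument: the nearest instruction of the identical class already
carries (directly or transitively) dependencies on everything older
that this one would need, so a single edge suffices. A toy model of
just that pruning (invented types; it also omits the depends_on()
filtering the real loop applies to non-matching classes):

    #include <stdio.h>

    #define MAX_DEPS 4

    struct toy_instr {
        const char *name;
        unsigned barrier_class;
        const struct toy_instr *deps[MAX_DEPS];
        unsigned ndeps;
    };

    static void add_dep(struct toy_instr *instr, const struct toy_instr *dep)
    {
        instr->deps[instr->ndeps++] = dep;
    }

    /* scan backwards over prior instructions, stopping at the first
     * one with an identical barrier_class:
     */
    static void link_priors(struct toy_instr *instr,
            struct toy_instr **priors, unsigned n)
    {
        while (n--) {
            add_dep(instr, priors[n]);
            if (priors[n]->barrier_class == instr->barrier_class)
                break;
        }
    }

    int main(void)
    {
        struct toy_instr a = { "a", 1 }, b = { "b", 1 }, c = { "c", 1 };
        struct toy_instr *priors[] = { &a, &b };

        link_priors(&b, priors, 1);  /* b -> a */
        link_priors(&c, priors, 2);  /* c -> b only; a is reached via b */

        printf("c has %u direct dep(s): %s\n", c.ndeps, c.deps[0]->name);
        return 0;
    }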
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               if (instr->barrier_class) {
+                       add_barrier_deps(block, instr);
+               }
+       }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               calculate_deps(block);
+       }
+}