freedreno/ir3: split out delay helpers
authorRob Clark <robdclark@chromium.org>
Wed, 18 Dec 2019 19:10:12 +0000 (11:10 -0800)
committerMarge Bot <eric+marge@anholt.net>
Sat, 1 Feb 2020 02:40:22 +0000 (02:40 +0000)
We're going to want these also for a post-RA sched pass.  And also to
split nop stuffing out into it's own pass.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>

src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_delay.c [new file with mode: 0644]
src/freedreno/ir3/ir3_depth.c
src/freedreno/ir3/ir3_sched.c
src/freedreno/ir3/meson.build

index a8ce9ea12a6c69ff310352bc3d1d9fd165a73fbe..03abaafa3939f9fd7da78383df99e04e6ea761eb 100644 (file)
@@ -1113,10 +1113,16 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 void ir3_print(struct ir3 *ir);
 void ir3_print_instr(struct ir3_instruction *instr);
 
-/* depth calculation: */
-struct ir3_shader_variant;
+/* delay calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
                struct ir3_instruction *consumer, unsigned n);
+unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred);
+unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred);
+
+/* depth calculation: */
+struct ir3_shader_variant;
 void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
 void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);
 
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
new file mode 100644 (file)
index 0000000..506e296
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2019 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "ir3.h"
+
+/*
+ * Helpers to figure out the necessary delay slots between instructions.  Used
+ * both in scheduling pass(es) and the final pass to insert any required nop's
+ * so that the shader program is valid.
+ *
+ * Note that this needs to work both pre and post RA, so we can't assume ssa
+ * src iterators work.
+ */
+
+/* generally don't count false dependencies, since this can just be
+ * something like a barrier, or SSBO store.  The exception is array
+ * dependencies if the assigner is an array write and the consumer
+ * reads the same array.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n)
+{
+       if (!__is_false_dep(consumer, n))
+               return false;
+
+       if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
+               struct ir3_register *dst = assigner->regs[0];
+               struct ir3_register *src;
+
+               debug_assert(dst->flags & IR3_REG_ARRAY);
+
+               foreach_src (src, consumer) {
+                       if ((src->flags & IR3_REG_ARRAY) &&
+                                       (dst->array.id == src->array.id)) {
+                               return false;
+                       }
+               }
+       }
+
+       return true;
+}
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes
+ */
+int
+ir3_delayslots(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n)
+{
+       if (ignore_dep(assigner, consumer, n))
+               return 0;
+
+       /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+        * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+        * handled with sync bits
+        */
+
+       if (is_meta(assigner) || is_meta(consumer))
+               return 0;
+
+       if (writes_addr(assigner))
+               return 6;
+
+       /* handled via sync flags: */
+       if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+               return 0;
+
+       /* assigner must be alu: */
+       if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+                       is_mem(consumer)) {
+               return 6;
+       } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
+                       (n == 3)) {
+               /* special case, 3rd src to cat3 not required on first cycle */
+               return 1;
+       } else {
+               return 3;
+       }
+}
+
+static bool
+count_instruction(struct ir3_instruction *n)
+{
+       /* NOTE: don't count branch/jump since we don't know yet if they will
+        * be eliminated later in resolve_jumps().. really should do that
+        * earlier so we don't have this constraint.
+        */
+       return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
+}
+
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not yet, ie. it only contains already scheduled
+ *    instructions).  For intra-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
+unsigned
+ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred)
+{
+       unsigned d = 0;
+
+       /* Note that this relies on incrementally building up the block's
+        * instruction list.. but this is how scheduling and nopsched
+        * work.
+        */
+       foreach_instr_rev (n, &block->instr_list) {
+               if ((n == instr) || (d >= maxd))
+                       return MIN2(maxd, d + n->nop);
+               if (count_instruction(n))
+                       d = MIN2(maxd, d + 1 + n->repeat + n->nop);
+       }
+
+       /* if coming from a predecessor block, assume it is assigned far
+        * enough away.. we'll fix up later.
+        */
+       if (!pred)
+               return maxd;
+
+       if (pred && (block->data != block)) {
+               /* Search into predecessor blocks, finding the one with the
+                * shortest distance, since that will be the worst case
+                */
+               unsigned min = maxd - d;
+
+               /* (ab)use block->data to prevent recursion: */
+               block->data = block;
+
+               set_foreach (block->predecessors, entry) {
+                       struct ir3_block *pred = (struct ir3_block *)entry->key;
+                       unsigned n;
+
+                       n = ir3_distance(pred, instr, min, pred);
+
+                       min = MIN2(min, n);
+               }
+
+               block->data = NULL;
+               d += min;
+       }
+
+       return d;
+}
+
+/* calculate delay for specified src: */
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
+               struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer,
+               unsigned srcn, bool soft, bool pred)
+{
+       unsigned delay = 0;
+
+       if (is_meta(assigner)) {
+               struct ir3_register *src;
+               foreach_src (src, assigner) {
+                       unsigned d;
+
+                       if (!src->instr)
+                               continue;
+
+                       d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
+                       delay = MAX2(delay, d);
+               }
+       } else {
+               if (soft) {
+                       if (is_sfu(assigner)) {
+                               delay = 4;
+                       } else {
+                               delay = ir3_delayslots(assigner, consumer, srcn);
+                       }
+               } else {
+                       delay = ir3_delayslots(assigner, consumer, srcn);
+               }
+               delay -= ir3_distance(block, assigner, delay, pred);
+       }
+
+       return delay;
+}
+
+static struct ir3_instruction *
+find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
+{
+       unsigned d = 0;
+
+       /* Note that this relies on incrementally building up the block's
+        * instruction list.. but this is how scheduling and nopsched
+        * work.
+        */
+       foreach_instr_rev (n, &block->instr_list) {
+               if (d >= maxd)
+                       return NULL;
+               if (count_instruction(n))
+                       d++;
+               if (dest_regs(n) == 0)
+                       continue;
+
+               /* note that a dest reg will never be an immediate */
+               if (n->regs[0]->array.id == array_id)
+                       return n;
+       }
+
+       return NULL;
+}
+
+/* like list_length() but only counts instructions which count in the
+ * delay determination:
+ */
+static unsigned
+count_block_delay(struct ir3_block *block)
+{
+       unsigned delay = 0;
+       foreach_instr (n, &block->instr_list) {
+               if (!count_instruction(n))
+                       continue;
+               delay++;
+       }
+       return delay;
+}
+
+static unsigned
+delay_calc_array(struct ir3_block *block, unsigned array_id,
+               struct ir3_instruction *consumer, unsigned srcn,
+               bool soft, bool pred, unsigned maxd)
+{
+       struct ir3_instruction *assigner;
+
+       assigner = find_array_write(block, array_id, maxd);
+       if (assigner)
+               return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
+
+       if (!pred)
+               return 0;
+
+       unsigned len = count_block_delay(block);
+       if (maxd <= len)
+               return 0;
+
+       maxd -= len;
+
+       if (block->data == block) {
+               /* we have a loop, return worst case: */
+               return maxd;
+       }
+
+       /* If we need to search into predecessors, find the one with the
+        * max delay.. the resulting delay is that minus the number of
+        * counted instructions in this block:
+        */
+       unsigned max = 0;
+
+       /* (ab)use block->data to prevent recursion: */
+       block->data = block;
+
+       set_foreach (block->predecessors, entry) {
+               struct ir3_block *pred = (struct ir3_block *)entry->key;
+               unsigned delay =
+                       delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
+
+               max = MAX2(max, delay);
+       }
+
+       block->data = NULL;
+
+       if (max < len)
+               return 0;
+
+       return max - len;
+}
+
+/**
+ * Calculate delay for instruction (maximum of delay for all srcs):
+ *
+ * @soft:  If true, add additional delay for situations where they
+ *    would not be strictly required because a sync flag would be
+ *    used (but scheduler would prefer to schedule some other
+ *    instructions first to avoid stalling on sync flag)
+ * @pred:  If true, recurse into predecessor blocks
+ */
+unsigned
+ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred)
+{
+       unsigned delay = 0;
+       struct ir3_register *src;
+
+       foreach_src_n (src, i, instr) {
+               unsigned d = 0;
+
+               if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
+                       d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
+               } else if (src->instr) {
+                       d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
+               }
+
+               delay = MAX2(delay, d);
+       }
+
+       if (instr->address) {
+               unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
+               delay = MAX2(delay, d);
+       }
+
+       return delay;
+}
index f1f7b94b2fc9a1d3ea52fabbf5da44a6a19d82bb..59ea3af407bbc2de23b9f4d986829b49e4441488 100644 (file)
  * blocks depth sorted list, which is used by the scheduling pass.
  */
 
-/* generally don't count false dependencies, since this can just be
- * something like a barrier, or SSBO store.  The exception is array
- * dependencies if the assigner is an array write and the consumer
- * reads the same array.
- */
-static bool
-ignore_dep(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
-{
-       if (!__is_false_dep(consumer, n))
-               return false;
-
-       if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
-               struct ir3_register *dst = assigner->regs[0];
-               struct ir3_register *src;
-
-               debug_assert(dst->flags & IR3_REG_ARRAY);
-
-               foreach_src(src, consumer) {
-                       if ((src->flags & IR3_REG_ARRAY) &&
-                                       (dst->array.id == src->array.id)) {
-                               return false;
-                       }
-               }
-       }
-
-       return true;
-}
-
-/* calculate required # of delay slots between the instruction that
- * assigns a value and the one that consumes
- */
-int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
-{
-       if (ignore_dep(assigner, consumer, n))
-               return 0;
-
-       /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
-        * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
-        * handled with sync bits
-        */
-
-       if (is_meta(assigner) || is_meta(consumer))
-               return 0;
-
-       if (writes_addr(assigner))
-               return 6;
-
-       /* handled via sync flags: */
-       if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
-               return 0;
-
-       /* assigner must be alu: */
-       if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-                       is_mem(consumer)) {
-               return 6;
-       } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
-                       (n == 3)) {
-               /* special case, 3rd src to cat3 not required on first cycle */
-               return 1;
-       } else {
-               return 3;
-       }
-}
-
 void
 ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
 {
index c2f6b3e020d8aab4e4cf15d90d0448a0b1ba13fb..ec5ad6e872e7eec42a47c545c2562de11c5ff495 100644 (file)
@@ -265,117 +265,6 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
        return d;
 }
 
-/**
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned maxd, bool pred)
-{
-       unsigned d = 0;
-
-       foreach_instr_rev (n, &block->instr_list) {
-               if ((n == instr) || (d >= maxd))
-                       return d;
-               /* NOTE: don't count branch/jump since we don't know yet if they will
-                * be eliminated later in resolve_jumps().. really should do that
-                * earlier so we don't have this constraint.
-                */
-               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
-                       d++;
-       }
-
-       /* if coming from a predecessor block, assume it is assigned far
-        * enough away.. we'll fix up later.
-        */
-       if (!pred)
-               return maxd;
-
-       if (pred && (block->data != block)) {
-               /* Search into predecessor blocks, finding the one with the
-                * shortest distance, since that will be the worst case
-                */
-               unsigned min = maxd - d;
-
-               /* (ab)use block->data to prevent recursion: */
-               block->data = block;
-
-               set_foreach(block->predecessors, entry) {
-                       struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned n;
-
-                       n = distance(pred, instr, min, pred);
-
-                       min = MIN2(min, n);
-               }
-
-               block->data = NULL;
-               d += min;
-       }
-
-       return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer,
-               unsigned srcn, bool soft, bool pred)
-{
-       unsigned delay = 0;
-
-       if (is_meta(assigner)) {
-               struct ir3_instruction *src;
-               foreach_ssa_src(src, assigner) {
-                       unsigned d;
-                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
-                       delay = MAX2(delay, d);
-               }
-       } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
-               delay -= distance(block, assigner, delay, pred);
-       }
-
-       return delay;
-}
-
-/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool pred)
-{
-       unsigned delay = 0;
-       struct ir3_instruction *src;
-
-       foreach_ssa_src_n(src, i, instr) {
-               unsigned d;
-               d = delay_calc_srcn(block, src, instr, i, soft, pred);
-               delay = MAX2(delay, d);
-       }
-
-       return delay;
-}
-
 struct ir3_sched_notes {
        /* there is at least one kill which could be scheduled, except
         * for unscheduled bary.f's:
@@ -658,7 +547,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
                                continue;
                }
 
-               int rank = delay_calc(ctx->block, candidate, soft, false);
+               int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
 
                /* if too many live values, prioritize instructions that reduce the
                 * number of live values:
@@ -827,7 +716,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                        instr = find_eligible_instr(ctx, &notes, false);
 
                if (instr) {
-                       unsigned delay = delay_calc(ctx->block, instr, false, false);
+                       unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 
                        d("delay=%u", delay);
 
@@ -886,7 +775,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
                debug_assert(ctx->pred);
                debug_assert(block->condition);
 
-               delay -= distance(ctx->block, ctx->pred, delay, false);
+               delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
 
                while (delay > 0) {
                        ir3_NOP(block);
@@ -944,7 +833,7 @@ sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
                set_foreach(block->predecessors, entry) {
                        struct ir3_block *pred = (struct ir3_block *)entry->key;
-                       unsigned d = delay_calc(pred, instr, false, true);
+                       unsigned d = ir3_delay_calc(pred, instr, false, true);
                        delay = MAX2(d, delay);
                }
 
index c377767084cf1adf432794505da95f4d0f4a52ac..0c2cb21930ad970f9d27441866cd6257e35f0276 100644 (file)
@@ -54,6 +54,7 @@ libfreedreno_ir3_files = files(
   'ir3_context.c',
   'ir3_context.h',
   'ir3_cp.c',
+  'ir3_delay.c',
   'ir3_depth.c',
   'ir3_group.c',
   'ir3_image.c',