intel/fs/gen12: Introduce software scoreboard lowering pass.

author Francisco Jerez <currojerez@riseup.net>

Fri, 9 Nov 2018 22:13:37 +0000 (14:13 -0800)

committer Francisco Jerez <currojerez@riseup.net>

Fri, 11 Oct 2019 19:24:16 +0000 (12:24 -0700)
author Francisco Jerez <currojerez@riseup.net>
Fri, 9 Nov 2018 22:13:37 +0000 (14:13 -0800)
committer Francisco Jerez <currojerez@riseup.net>
Fri, 11 Oct 2019 19:24:16 +0000 (12:24 -0700)
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources

index a57c0d1bada8b7e66b44099577c0d3146d9c4b79..ebe4efee992ab1237b04844c8b281028400088da 100644 (file)
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -68,6 +68,7 @@ COMPILER_FILES = \
         compiler/brw_fs_reg_allocate.cpp \
         compiler/brw_fs_register_coalesce.cpp \
         compiler/brw_fs_saturate_propagation.cpp \
+       compiler/brw_fs_scoreboard.cpp \
         compiler/brw_fs_sel_peephole.cpp \
         compiler/brw_fs_validate.cpp \
         compiler/brw_fs_visitor.cpp \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index 30a95b419f0b07cefcbf93d2afc8775ad9945cc8..337dde63848c51a87f6d8350d10e79735cfe0ae6 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -3509,6 +3509,8 @@ fs_visitor::emit_repclear_shader()
        assert(mov->src[0].file == FIXED_GRF);
        mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
     }
+
+   lower_scoreboard();
  }
  
  /**
@@ -7557,6 +7559,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
         */
        assert(prog_data->total_scratch < max_scratch_size);
     }
+
+   lower_scoreboard();
  }
  
  bool
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h

index 4d5cdd0c0135de1ac963d7d580e66a7f55e139f9..ede5b0a701cb734c5cb4d897e90e8c0ba37850e6 100644 (file)
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -167,6 +167,7 @@ public:
     bool lower_integer_multiplication();
     bool lower_minmax();
     bool lower_simd_width();
+   bool lower_scoreboard();
     bool opt_combine_constants();
  
     void emit_dummy_fs();
diff --git a/src/intel/compiler/brw_fs_scoreboard.cpp b/src/intel/compiler/brw_fs_scoreboard.cpp

new file mode 100644 (file)

index 0000000..f05a150
--- /dev/null
+++ b/src/intel/compiler/brw_fs_scoreboard.cpp
@@ -0,0 +1,939 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_scoreboard.cpp
+ *
+ * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
+ * data coherency between register reads and writes in previous generations.
+ * This lowering pass runs after register allocation in order to make up for
+ * it.
+ *
+ * It works by performing global dataflow analysis in order to determine the
+ * set of potential dependencies of every instruction in the shader, and then
+ * inserts any required SWSB annotations and additional SYNC instructions in
+ * order to guarantee data coherency.
+ *
+ * WARNING - Access of the following (rarely used) ARF registers is not
+ *           tracked here, and requires the RegDist SWSB annotation to be
+ *           set to 1 by the generator in order to avoid data races:
+ *
+ *  - sp stack pointer
+ *  - sr0 state register
+ *  - cr0 control register
+ *  - ip instruction pointer
+ *  - tm0 timestamp register
+ *  - dbg0 debug register
+ *
+ * The following ARF registers don't need to be tracked here because data
+ * coherency is still provided transparently by the hardware:
+ *
+ *  - f0-1 flag registers
+ *  - n0 notification register
+ *  - tdr0 thread dependency register
+ */
+
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+namespace {
+   /**
+    * In-order instruction accounting.
+    * @{
+    */
+
+   /**
+    * Count of in-order hardware instructions that the IR instruction
+    * \p inst stands for.  Any ordered dependency that crosses \p inst has
+    * its RegDist counter advanced by this amount.
+    */
+   unsigned
+   ordered_unit(const fs_inst *inst)
+   {
+      /* These opcodes never contribute to the in-order instruction count. */
+      const bool uncounted = inst->opcode == BRW_OPCODE_SYNC ||
+                             inst->opcode == BRW_OPCODE_DO ||
+                             inst->opcode == SHADER_OPCODE_UNDEF ||
+                             inst->opcode == FS_OPCODE_PLACEHOLDER_HALT;
+
+      if (uncounted)
+         return 0;
+
+      /* Virtual instructions that expand to more in-order instructions than
+       * the single one assumed here are counted inaccurately.  That only
+       * costs execution ordering quality, data coherency is unaffected.
+       * Exact RegDist counts per virtual instruction would allow better ALU
+       * performance, but would require keeping this function in perfect
+       * sync with the generator in order to avoid data corruption.  Lesson
+       * is (again) don't use virtual instructions if you want optimal
+       * scheduling.
+       */
+      return is_unordered(inst) ? 0 : 1;
+   }
+
+   /**
+    * Counter type that advances for in-order instructions only.  It is
+    * called 'jp' throughout this lowering pass so that it is not confused
+    * with the regular instruction counter.
+    */
+   using ordered_address = int;
+
+   /**
+    * Build a table with the local ordered_address of every instruction of
+    * \p shader, in program order, so that later look-ups take constant
+    * time.
+    */
+   std::vector<ordered_address>
+   ordered_inst_addresses(const fs_visitor *shader)
+   {
+      std::vector<ordered_address> addresses;
+      ordered_address next = 0;
+
+      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
+         /* The address of an instruction is the number of in-order
+          * instructions that precede it.
+          */
+         const ordered_address current = next;
+
+         next += ordered_unit(inst);
+         addresses.push_back(current);
+      }
+
+      return addresses;
+   }
+
+   /**
+    * Synchronization mode required for data manipulated by in-order
+    * instructions.
+    *
+    * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
+    * enum for additional type safety.  The hardware doesn't provide control
+    * over the synchronization mode for RegDist annotations, this is only used
+    * internally in this pass in order to optimize out redundant read
+    * dependencies where possible.
+    *
+    * The values are single bits so that they can be combined with the
+    * bitwise operators defined for this enum.
+    */
+   enum tgl_regdist_mode {
+      /** No in-order dependency present. */
+      TGL_REGDIST_NULL = 0,
+      /** Register is pending a read by an in-order instruction. */
+      TGL_REGDIST_SRC = 1,
+      /** Register is pending a write by an in-order instruction. */
+      TGL_REGDIST_DST = 2
+   };
+
+   /**
+    * Bitwise operators for tgl_regdist_mode, so that mode bitsets can be
+    * combined and masked without casting at every use.
+    */
+   tgl_regdist_mode
+   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
+   {
+      const unsigned bits = unsigned(x) | unsigned(y);
+      return tgl_regdist_mode(bits);
+   }
+
+   tgl_regdist_mode
+   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
+   {
+      const unsigned bits = unsigned(x) & unsigned(y);
+      return tgl_regdist_mode(bits);
+   }
+
+   tgl_regdist_mode &
+   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
+   {
+      x = x | y;
+      return x;
+   }
+
+   tgl_regdist_mode &
+   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
+   {
+      x = x & y;
+      return x;
+   }
+
+   /** @} */
+
+   /**
+    * Equivalence relation over the unsigned integers.
+    *
+    * A freshly constructed object represents the identity relation '~',
+    * i.e. i ~ j holds if and only if i == j.
+    */
+   struct equivalence_relation {
+      /**
+       * Return the index of the equivalence class that \p i belongs to,
+       * which is the numeric value of an arbitrary representative of that
+       * class.
+       *
+       * Two elements are related, i ~ j, if and only if
+       * lookup(i) == lookup(j).
+       */
+      unsigned
+      lookup(unsigned i) const
+      {
+         /* Follow the links until reaching an element that maps to itself
+          * or that was never recorded (and is therefore its own class).
+          */
+         while (i < is.size() && is[i] != i)
+            i = is[i];
+
+         return i;
+      }
+
+      /**
+       * Return an array holding the lookup() result for the entry stored
+       * at each recorded index, for constant-time evaluation.
+       */
+      std::vector<unsigned>
+      flatten() const {
+         std::vector<unsigned> ids;
+
+         for (unsigned n = 0; n < is.size(); n++)
+            ids.push_back(lookup(is[n]));
+
+         return ids;
+      }
+
+      /**
+       * Extend the relation minimally so that i ~ j also holds, and return
+       * the representative of the resulting class.
+       *
+       * The internal representation is updated recursively so that
+       * transitivity holds and every previously requested equivalence is
+       * preserved.
+       */
+      unsigned
+      link(unsigned i, unsigned j)
+      {
+         const unsigned representative = lookup(i);
+
+         assign(i, representative);
+         assign(j, representative);
+
+         return representative;
+      }
+
+   private:
+      /**
+       * Make the representative of \p from equivalent to \p to.
+       *
+       * The chain of links starting at \p from is flattened along the way,
+       * as far as that is possible without extra recursive calls.
+       */
+      void
+      assign(unsigned from, unsigned to)
+      {
+         if (from == to)
+            return;
+
+         /* Redirect the rest of the chain first. */
+         if (from < is.size() && is[from] != from)
+            assign(is[from], to);
+
+         /* Grow the table with identity entries up to and including
+          * \p from.
+          */
+         while (is.size() <= from)
+            is.push_back(is.size());
+
+         is[from] = to;
+      }
+
+      std::vector<unsigned> is;
+   };
+
+   /**
+    * Representation of a data dependency between two instructions in the
+    * program.
+    * @{
+    */
+   struct dependency {
+      /**
+       * Dependency object carrying no information at all.
+       */
+      dependency() :
+         ordered(TGL_REGDIST_NULL), jp(INT_MIN),
+         unordered(TGL_SBID_NULL), id(0)
+      {
+      }
+
+      /**
+       * Dependency on the in-order instruction whose ordered_address
+       * instruction counter is \p jp.
+       */
+      dependency(tgl_regdist_mode mode, ordered_address jp) :
+         ordered(mode), jp(jp),
+         unordered(TGL_SBID_NULL), id(0)
+      {
+      }
+
+      /**
+       * Dependency on the out-of-order instruction identified by the
+       * synchronization token \p id.
+       */
+      dependency(tgl_sbid_mode mode, unsigned id) :
+         ordered(TGL_REGDIST_NULL), jp(INT_MIN),
+         unordered(mode), id(id)
+      {
+      }
+
+      /**
+       * Synchronization mode of the in-order dependency, zero when there is
+       * none.
+       */
+      tgl_regdist_mode ordered;
+
+      /**
+       * Instruction counter of the in-order dependency.
+       *
+       * When the dependency originates in a different block of the program
+       * the value is relative to the control flow path taken between the
+       * dependency and the current block: it is the ordered_address whose
+       * difference from the ordered_address of the first instruction of the
+       * current block equals the number of in-order instructions along that
+       * path.  Except for block-local dependencies it is not guaranteed to
+       * match the local ordered_address of the generating instruction [as
+       * returned by ordered_inst_addresses()].
+       */
+      ordered_address jp;
+
+      /**
+       * Synchronization mode of the unordered dependency, zero when there
+       * is none.
+       */
+      tgl_sbid_mode unordered;
+
+      /** Synchronization token of the out-of-order dependency. */
+      unsigned id;
+
+      /**
+       * Trivial in-order dependency that is always satisfied.
+       *
+       * A default-constructed dependency() is trivially satisfied as well,
+       * but unlike it this object counts as dependency information, so it
+       * can be used to clear a previously pending dependency via shadow().
+       */
+      static const dependency done;
+
+      friend bool
+      operator==(const dependency &dep0, const dependency &dep1)
+      {
+         if (dep0.ordered != dep1.ordered || dep0.jp != dep1.jp)
+            return false;
+
+         return dep0.unordered == dep1.unordered && dep0.id == dep1.id;
+      }
+
+      friend bool
+      operator!=(const dependency &dep0, const dependency &dep1)
+      {
+         return !(dep0 == dep1);
+      }
+   };
+
+   const dependency dependency::done(TGL_REGDIST_SRC, INT_MIN);
+
+   /**
+    * Tell whether \p dep carries any dependency information, ordered or
+    * unordered.
+    */
+   bool
+   is_valid(const dependency &dep)
+   {
+      if (dep.ordered)
+         return true;
+
+      return dep.unordered;
+   }
+
+   /**
+    * Build a single dependency that is satisfied only once both \p dep0 and
+    * \p dep1 are satisfied.  When out-of-order dependencies are involved the
+    * equivalence relation \p eq may be updated so that both end up sharing
+    * the same hardware SBID as synchronization token.
+    */
+   dependency
+   merge(equivalence_relation &eq,
+         const dependency &dep0, const dependency &dep1)
+   {
+      dependency result;
+      const bool any_ordered = dep0.ordered || dep1.ordered;
+      const bool any_unordered = dep0.unordered || dep1.unordered;
+
+      if (any_ordered) {
+         result.ordered = dep0.ordered | dep1.ordered;
+         /* The most recent of the two instructions is the one to wait on. */
+         result.jp = MAX2(dep0.jp, dep1.jp);
+      }
+
+      if (any_unordered) {
+         /* Fall back to the token of the other dependency when only one of
+          * them has an unordered component.
+          */
+         const unsigned id0 = dep0.unordered ? dep0.id : dep1.id;
+         const unsigned id1 = dep1.unordered ? dep1.id : dep0.id;
+
+         result.unordered = dep0.unordered | dep1.unordered;
+         result.id = eq.link(id0, id1);
+      }
+
+      return result;
+   }
+
+   /**
+    * Return \p dep1 when it carries dependency information, otherwise fall
+    * back to \p dep0.
+    */
+   dependency
+   shadow(const dependency &dep0, const dependency &dep1)
+   {
+      if (is_valid(dep1))
+         return dep1;
+
+      return dep0;
+   }
+
+   /**
+    * Move dependency information to another place of the program.
+    *
+    * The result refers to the same instruction as \p dep, expressed in the
+    * ordered_address space of a different block.  To carry a dependency
+    * across an edge of the CFG, \p delta must be the local ordered_address
+    * of the first instruction of the target block minus the local
+    * ordered_address of the instruction immediately after the end of the
+    * origin block.
+    */
+   dependency
+   transport(dependency dep, int delta)
+   {
+      /* INT_MIN marks "no instruction counter" and must not be shifted. */
+      const bool has_counter = dep.ordered && dep.jp > INT_MIN;
+
+      if (has_counter)
+         dep.jp += delta;
+
+      return dep;
+   }
+
+   /**
+    * Strip from \p dep the synchronization modes that do not matter to an
+    * instruction reading the same register location: only the pending
+    * in-order write is kept from the ordered part.
+    */
+   dependency
+   dependency_for_read(dependency dep)
+   {
+      dep.ordered = dep.ordered & TGL_REGDIST_DST;
+      return dep;
+   }
+
+   /**
+    * Strip from \p dep the synchronization modes that do not matter to the
+    * instruction \p inst when it writes the same register location.
+    */
+   dependency
+   dependency_for_write(const fs_inst *inst, dependency dep)
+   {
+      /* Out-of-order writers keep the full dependency. */
+      if (is_unordered(inst))
+         return dep;
+
+      dep.ordered = dep.ordered & TGL_REGDIST_DST;
+      return dep;
+   }
+
+   /** @} */
+
+   /**
+    * Scoreboard representation.  This keeps track of the data dependencies of
+    * registers with GRF granularity.
+    *
+    * One dependency slot is kept per GRF (MRFs share the GRF array at an
+    * offset), a single slot for the address register range and a small
+    * array of slots for the accumulator range.  Registers outside of those
+    * are not tracked: get() reports no dependency for them and set() is a
+    * no-op.
+    */
+   class scoreboard {
+   public:
+      /**
+       * Look up the most current data dependency for register \p r.
+       *
+       * Returns a default-constructed dependency (no information) when
+       * \p r is not tracked by the scoreboard.
+       */
+      dependency
+      get(const fs_reg &r) const
+      {
+         /* dep() is non-const because set() needs a mutable pointer, the
+          * slot is only read here.
+          */
+         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
+            return *p;
+         else
+            return dependency();
+      }
+
+      /**
+       * Specify the most current data dependency for register \p r.
+       *
+       * Silently ignored when \p r is not tracked by the scoreboard.
+       */
+      void
+      set(const fs_reg &r, const dependency &d)
+      {
+         if (dependency *p = dep(r))
+            *p = d;
+      }
+
+      /**
+       * Component-wise merge() of corresponding dependencies from two
+       * scoreboard objects.  \sa merge().
+       */
+      friend scoreboard
+      merge(equivalence_relation &eq,
+            const scoreboard &sb0, const scoreboard &sb1)
+      {
+         scoreboard sb;
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
+            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);
+
+         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
+            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);
+
+         return sb;
+      }
+
+      /**
+       * Component-wise shadow() of corresponding dependencies from two
+       * scoreboard objects.  \sa shadow().
+       */
+      friend scoreboard
+      shadow(const scoreboard &sb0, const scoreboard &sb1)
+      {
+         scoreboard sb;
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
+            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);
+
+         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
+            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);
+
+         return sb;
+      }
+
+      /**
+       * Component-wise transport() of dependencies from a scoreboard
+       * object.  \sa transport().
+       */
+      friend scoreboard
+      transport(const scoreboard &sb0, int delta)
+      {
+         scoreboard sb;
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
+            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);
+
+         sb.addr_dep = transport(sb0.addr_dep, delta);
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
+            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);
+
+         return sb;
+      }
+
+      /**
+       * Two scoreboards compare equal when every tracked slot holds an equal
+       * dependency.
+       */
+      friend bool
+      operator==(const scoreboard &sb0, const scoreboard &sb1)
+      {
+         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
+            if (sb0.grf_deps[i] != sb1.grf_deps[i])
+               return false;
+         }
+
+         if (sb0.addr_dep != sb1.addr_dep)
+            return false;
+
+         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
+            if (sb0.accum_deps[i] != sb1.accum_deps[i])
+               return false;
+         }
+
+         return true;
+      }
+
+      friend bool
+      operator!=(const scoreboard &sb0, const scoreboard &sb1)
+      {
+         return !(sb0 == sb1);
+      }
+
+   private:
+      /** One slot per GRF; MRFs are stored from GEN7_MRF_HACK_START on. */
+      dependency grf_deps[BRW_MAX_GRF];
+      /** Single slot shared by the whole address register range. */
+      dependency addr_dep;
+      /**
+       * Slots for the accumulator range.
+       *
+       * NOTE(review): dep() indexes this with reg - BRW_ARF_ACCUMULATOR for
+       * any reg below BRW_ARF_FLAG; confirm that the range can never exceed
+       * the 10 entries allocated here.
+       */
+      dependency accum_deps[10];
+
+      /**
+       * Return a pointer to the slot tracking register \p r, or NULL if the
+       * register is not tracked.
+       */
+      dependency *
+      dep(const fs_reg &r)
+      {
+         /* Register number in REG_SIZE units, including the offset of the
+          * region within the register.
+          */
+         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
+                               reg_offset(r) / REG_SIZE);
+
+         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
+                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
+                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
+                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
+                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
+                                  reg < BRW_ARF_FLAG ? &accum_deps[
+                                     reg - BRW_ARF_ACCUMULATOR] :
+                 NULL);
+      }
+   };
+
+   /**
+    * Dependency list handling.
+    * @{
+    */
+
+   /**
+    * Add dependency \p dep to the list of dependencies of an instruction
+    * \p deps.
+    *
+    * \param ids   Translation table applied to the synchronization token of
+    *              an unordered dependency; tokens beyond the end of the
+    *              table are left untouched.
+    * \param deps  Dependency list updated in place.
+    * \param dep   Dependency to add; ignored when it carries no
+    *              information.
+    *
+    * The ordered and unordered parts of \p dep are folded into existing
+    * entries where possible, whatever is left is appended as a new entry.
+    */
+   void
+   add_dependency(const std::vector<unsigned> &ids,
+                  std::vector<dependency> &deps, dependency dep)
+   {
+      if (is_valid(dep)) {
+         /* Translate the unordered dependency token first in order to keep
+          * the list minimally redundant.
+          */
+         if (dep.unordered && dep.id < ids.size())
+            dep.id = ids[dep.id];
+
+         /* Try to combine the specified dependency with any existing ones. */
+         for (auto &dep1 : deps) {
+            /* Ordered parts always combine: keep the latest instruction
+             * counter and the union of the modes.
+             */
+            if (dep.ordered && dep1.ordered) {
+               dep1.jp = MAX2(dep1.jp, dep.jp);
+               dep1.ordered |= dep.ordered;
+               dep.ordered = TGL_REGDIST_NULL;
+            }
+
+            /* Unordered parts only combine when they use the same token. */
+            if (dep.unordered && dep1.unordered && dep1.id == dep.id) {
+               dep1.unordered |= dep.unordered;
+               dep.unordered = TGL_SBID_NULL;
+            }
+         }
+
+         /* Add it to the end of the list if necessary. */
+         if (is_valid(dep))
+            deps.push_back(dep);
+      }
+   }
+
+   /**
+    * Construct a tgl_swsb annotation encoding any ordered dependencies from
+    * the dependency list \p deps of an instruction with ordered_address
+    * \p jp.
+    *
+    * The annotation carries the smallest distance to any ordered dependency
+    * of the list.  Dependencies more than 10 in-order instructions away are
+    * ignored, and the encoded distance is clamped to 7.  A distance of zero
+    * is returned when no ordered dependency needs to be encoded.
+    *
+    * NOTE(review): the limits 10 and 7 are presumably the depth of the
+    * in-order pipeline and the largest encodable RegDist value - confirm
+    * against the hardware documentation.
+    */
+   tgl_swsb
+   ordered_dependency_swsb(const std::vector<dependency> &deps,
+                           const ordered_address &jp)
+   {
+      unsigned min_dist = ~0u;
+
+      for (const auto &dep : deps) {
+         if (dep.ordered) {
+            assert(jp > dep.jp);
+
+            /* Widen before subtracting: dep.jp is INT_MIN for trivially
+             * satisfied dependencies like dependency::done, in which case
+             * 'jp - dep.jp' would overflow the range of int, which is
+             * undefined behavior.  The widened difference always fits an
+             * unsigned and ends up beyond max_dist for such dependencies.
+             */
+            const unsigned dist = unsigned(static_cast<long long>(jp) - dep.jp);
+            const unsigned max_dist = 10;
+
+            if (dist <= max_dist)
+               min_dist = MIN3(min_dist, dist, 7);
+         }
+      }
+
+      return { min_dist == ~0u ? 0 : min_dist };
+   }
+
+   /**
+    * Tell whether the dependency list \p deps of an instruction with
+    * ordered_address \p jp contains an ordered dependency that has to be
+    * encoded, i.e. one yielding a non-zero RegDist.
+    */
+   bool
+   find_ordered_dependency(const std::vector<dependency> &deps,
+                           const ordered_address &jp)
+   {
+      const tgl_swsb swsb = ordered_dependency_swsb(deps, jp);
+
+      return swsb.regdist;
+   }
+
+   /**
+    * Search the list \p deps for the first unordered dependency sharing any
+    * mode bit with \p unordered and return its full tgl_sbid_mode bitset.
+    * Zero is returned when there is no match.
+    */
+   tgl_sbid_mode
+   find_unordered_dependency(const std::vector<dependency> &deps,
+                             tgl_sbid_mode unordered)
+   {
+      /* Nothing can match an empty mode. */
+      if (!unordered)
+         return TGL_SBID_NULL;
+
+      for (const auto &candidate : deps) {
+         if (unordered & candidate.unordered)
+            return candidate.unordered;
+      }
+
+      return TGL_SBID_NULL;
+   }
+
+   /**
+    * Pick the unordered dependency from the list \p deps that can be encoded
+    * directly in the SWSB annotation of \p inst, i.e. without an additional
+    * SYNC instruction, and return its tgl_sbid_mode bitset.  Zero is
+    * returned when no dependency of the list qualifies.
+    */
+   tgl_sbid_mode
+   baked_unordered_dependency_mode(const fs_inst *inst,
+                                   const std::vector<dependency> &deps,
+                                   const ordered_address &jp)
+   {
+      const bool has_ordered = find_ordered_dependency(deps, jp);
+
+      /* A SET dependency always takes precedence. */
+      const tgl_sbid_mode set_mode =
+         find_unordered_dependency(deps, TGL_SBID_SET);
+      if (set_mode)
+         return set_mode;
+
+      /* Nothing else is baked into an out-of-order instruction that also
+       * has an ordered dependency.
+       */
+      if (has_ordered && is_unordered(inst))
+         return TGL_SBID_NULL;
+
+      /* From here on either there is no ordered dependency or the
+       * instruction is in-order, so a DST dependency qualifies directly.
+       */
+      const tgl_sbid_mode dst_mode =
+         find_unordered_dependency(deps, TGL_SBID_DST);
+      if (dst_mode)
+         return dst_mode;
+
+      /* A SRC dependency is only baked in the absence of ordered ones. */
+      if (has_ordered)
+         return TGL_SBID_NULL;
+
+      return find_unordered_dependency(deps, TGL_SBID_SRC);
+   }
+
+   /** @} */
+
+   /**
+    * Shader instruction dependency calculation.
+    * @{
+    */
+
+   /**
+    * Update scoreboard object \p sb to account for the execution of
+    * instruction \p inst.
+    *
+    * \param shader  Shader the instruction belongs to (not used by the code
+    *                below).
+    * \param jps     Table of local ordered_address counters indexed by
+    *                instruction.
+    * \param inst    Instruction being executed.
+    * \param ip      Index of \p inst, used to look up \p jps and as the
+    *                synchronization token of unordered dependencies.
+    * \param sb      Scoreboard updated in place.
+    */
+   void
+   update_inst_scoreboard(const fs_visitor *shader,
+                          const std::vector<ordered_address> &jps,
+                          const fs_inst *inst, unsigned ip, scoreboard &sb)
+   {
+      /* Track any source registers that may be fetched asynchronously by this
+       * instruction, otherwise clear the dependency in order to avoid
+       * subsequent redundant synchronization.
+       */
+      for (unsigned i = 0; i < inst->sources; i++) {
+         /* Payload sources and sources of math instructions get an
+          * unordered read dependency, sources of other in-order
+          * instructions an ordered one, anything else is marked as done.
+          */
+         const dependency rd_dep =
+            inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
+            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
+            dependency::done;
+
+         for (unsigned j = 0; j < regs_read(inst, i); j++)
+            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
+      }
+
+      /* Message payload passed through MRF registers. */
+      if (is_send(inst) && inst->base_mrf != -1) {
+         const dependency rd_dep = dependency(TGL_SBID_SRC, ip);
+
+         for (unsigned j = 0; j < inst->mlen; j++)
+            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
+      }
+
+      /* Track any destination registers of this instruction. */
+      const dependency wr_dep =
+         is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
+         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
+         dependency();
+
+      /* Instructions that are neither unordered nor counted as in-order
+       * leave the dependencies of their destination untouched.
+       */
+      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
+          !inst->dst.is_null()) {
+         for (unsigned j = 0; j < regs_written(inst); j++)
+            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
+      }
+   }
+
+   /**
+    * Compute, for each block of the program in isolation, a scoreboard
+    * object representing the dependencies pending (and unconditionally
+    * resolved) at the end of the block.
+    */
+   std::vector<scoreboard>
+   gather_block_scoreboards(const fs_visitor *shader,
+                            const std::vector<ordered_address> &jps)
+   {
+      std::vector<scoreboard> block_sbs(shader->cfg->num_blocks);
+      unsigned ip = 0;
+
+      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
+         scoreboard &block_sb = block_sbs[block->num];
+
+         update_inst_scoreboard(shader, jps, inst, ip, block_sb);
+         ip++;
+      }
+
+      return block_sbs;
+   }
+
+   /**
+    * Propagate data dependencies globally through the control flow graph
+    * until a fixed point is reached.
+    *
+    * Calculates the set of dependencies potentially pending at the beginning
+    * of each block, and returns it as an array of scoreboard objects.
+    *
+    * The second member of the returned pair is the flattened equivalence
+    * relation built while merging unordered dependencies, i.e. the table
+    * used to translate synchronization tokens.
+    */
+   std::pair<std::vector<scoreboard>, std::vector<unsigned>>
+   propagate_block_scoreboards(const fs_visitor *shader,
+                               const std::vector<ordered_address> &jps)
+   {
+      /* Dependencies generated locally by each block. */
+      const std::vector<scoreboard> delta_sbs =
+         gather_block_scoreboards(shader, jps);
+      /* Dependencies pending at the beginning and at the end of each
+       * block according to the current iteration.
+       */
+      std::vector<scoreboard> in_sbs(shader->cfg->num_blocks);
+      std::vector<scoreboard> out_sbs(shader->cfg->num_blocks);
+      equivalence_relation eq;
+
+      /* Iterate until no output scoreboard changes anymore. */
+      for (bool progress = true; progress;) {
+         progress = false;
+
+         foreach_block(block, shader->cfg) {
+            /* Local dependencies override whatever was pending on entry. */
+            const scoreboard sb = shadow(in_sbs[block->num],
+                                         delta_sbs[block->num]);
+
+            if (sb != out_sbs[block->num]) {
+               foreach_list_typed(bblock_link, child_link, link,
+                                  &block->children) {
+                  scoreboard &in_sb = in_sbs[child_link->block->num];
+                  /* Shift between the ordered_address of the first
+                   * instruction of the child and that of the instruction
+                   * right after the end of this block, \sa transport().
+                   */
+                  const int delta =
+                     jps[child_link->block->start_ip] - jps[block->end_ip]
+                     - ordered_unit(static_cast<const fs_inst *>(block->end()));
+
+                  in_sb = merge(eq, in_sb, transport(sb, delta));
+               }
+
+               out_sbs[block->num] = sb;
+               progress = true;
+            }
+         }
+      }
+
+      return { std::move(in_sbs), eq.flatten() };
+   }
+
+   /**
+    * Return the list of potential dependencies of each instruction in the
+    * shader based on the result of global dependency analysis.
+    *
+    * \param shader  Shader to analyze.
+    * \param jps     Table of local ordered_address counters indexed by
+    *                instruction.
+    *
+    * The result holds one dependency list per instruction, in program
+    * order.
+    */
+   std::vector<std::vector<dependency>>
+   gather_inst_dependencies(const fs_visitor *shader,
+                            const std::vector<ordered_address> &jps)
+   {
+      std::vector<scoreboard> sbs;
+      std::vector<unsigned> ids;
+      std::vector<std::vector<dependency>> deps;
+      unsigned ip = 0;
+
+      /* Scoreboards on entry to each block plus the token translation
+       * table.  Each scoreboard is advanced below as the instructions of
+       * its block are walked.
+       */
+      std::tie(sbs, ids) = propagate_block_scoreboards(shader, jps);
+
+      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
+         scoreboard &sb = sbs[block->num];
+         std::vector<dependency> inst_deps;
+
+         /* Dependencies of the registers read by the instruction. */
+         for (unsigned i = 0; i < inst->sources; i++) {
+            for (unsigned j = 0; j < regs_read(inst, i); j++)
+               add_dependency(ids, inst_deps, dependency_for_read(
+                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
+         }
+
+         /* Message payload passed through MRF registers. */
+         if (is_send(inst) && inst->base_mrf != -1) {
+            for (unsigned j = 0; j < inst->mlen; j++)
+               add_dependency(ids, inst_deps, dependency_for_read(
+                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
+         }
+
+         /* Out-of-order instructions set a token of their own. */
+         if (is_unordered(inst))
+            add_dependency(ids, inst_deps, dependency(TGL_SBID_SET, ip));
+
+         /* Dependencies of the registers written by the instruction,
+          * unless the instruction requests them not to be checked.
+          */
+         if (!inst->no_dd_check) {
+            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
+               for (unsigned j = 0; j < regs_written(inst); j++) {
+                  add_dependency(ids, inst_deps, dependency_for_write(inst,
+                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
+               }
+            }
+
+            if (is_send(inst) && inst->base_mrf != -1) {
+               for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
+                  add_dependency(ids, inst_deps, dependency_for_write(inst,
+                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
+            }
+         }
+
+         /* The list isn't needed anymore, hand it over instead of copying
+          * it.
+          */
+         deps.push_back(std::move(inst_deps));
+         update_inst_scoreboard(shader, jps, inst, ip, sb);
+         ip++;
+      }
+
+      return deps;
+   }
+
+   /** @} */
+
+   /**
+    * Allocate SBID tokens to track the execution of every out-of-order
+    * instruction of the shader.
+    *
+    * \param shader  Shader being lowered (not used by the code below).
+    * \param deps0   Per-instruction dependency lists whose unordered
+    *                dependencies carry instruction-based tokens.
+    *
+    * Returns the same lists with every token replaced by a 4-bit SBID,
+    * handed out round-robin in order of first use.
+    */
+   std::vector<std::vector<dependency>>
+   allocate_inst_dependencies(const fs_visitor *shader,
+                              const std::vector<std::vector<dependency>> &deps0)
+   {
+      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
+       *       shaders with a large number of SEND messages.
+       */
+      std::vector<std::vector<dependency>> deps1;
+      /* SBID assigned to each original token, ~0u while unassigned. */
+      std::vector<unsigned> ids(deps0.size(), ~0u);
+      unsigned next_id = 0;
+
+      /* One output list per input list. */
+      deps1.reserve(deps0.size());
+
+      for (const auto &inst_deps0 : deps0) {
+         std::vector<dependency> inst_deps1;
+
+         for (const auto &dep : inst_deps0) {
+            if (dep.unordered && ids[dep.id] == ~0u)
+               ids[dep.id] = (next_id++) & 0xf;
+
+            add_dependency(ids, inst_deps1, dep);
+         }
+
+         /* The list isn't needed anymore, hand it over instead of copying
+          * it.
+          */
+         deps1.push_back(std::move(inst_deps1));
+      }
+
+      return deps1;
+   }
+
+   /**
+    * Emit dependency information provided by \p deps into the shader,
+    * inserting additional SYNC instructions for dependencies that can't be
+    * represented directly by annotating existing instructions.
+    *
+    * \param shader  Shader whose instructions are annotated.
+    * \param jps     Table of local ordered_address counters indexed by
+    *                instruction.
+    * \param deps    Dependency list of each instruction, indexed by
+    *                instruction.
+    */
+   void
+   emit_inst_dependencies(fs_visitor *shader,
+                          const std::vector<ordered_address> &jps,
+                          const std::vector<std::vector<dependency>> &deps)
+   {
+      unsigned ip = 0;
+
+      /* The _safe variant of the iterator is used since instructions are
+       * emitted into the program while walking it.
+       */
+      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
+         /* Start from the ordered part of the annotation, at most one
+          * unordered dependency is added to it below.
+          */
+         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
+         const tgl_sbid_mode unordered_mode =
+            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);
+
+         for (const auto &dep : deps[ip]) {
+            if (dep.unordered) {
+               if (unordered_mode == dep.unordered && !swsb.mode) {
+                  /* Bake unordered dependency into the instruction's SWSB if
+                   * possible.
+                   */
+                  swsb.sbid = dep.id;
+                  swsb.mode = dep.unordered;
+               } else {
+                  /* Emit dependency into the SWSB of an extra SYNC
+                   * instruction.
+                   */
+                  const fs_builder ibld = fs_builder(shader, block, inst)
+                     .exec_all().group(1, 0);
+                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
+                                            brw_imm_ud(TGL_SYNC_NOP));
+                  sync->sched.sbid = dep.id;
+                  sync->sched.mode = dep.unordered;
+                  /* SET dependencies are expected to be always baked. */
+                  assert(!(sync->sched.mode & TGL_SBID_SET));
+               }
+            }
+         }
+
+         /* Update the IR. */
+         inst->sched = swsb;
+         /* The annotation just emitted takes over from these flags. */
+         inst->no_dd_check = inst->no_dd_clear = false;
+         ip++;
+      }
+   }
+}
+
+/**
+ * Run the software scoreboard lowering pass on Gen12+, it is a no-op on
+ * earlier generations.  Always returns true.
+ */
+bool
+fs_visitor::lower_scoreboard()
+{
+   if (devinfo->gen < 12)
+      return true;
+
+   /* Analyze, allocate SBIDs, then annotate the program. */
+   const std::vector<ordered_address> jps = ordered_inst_addresses(this);
+   const std::vector<std::vector<dependency>> pending =
+      gather_inst_dependencies(this, jps);
+   const std::vector<std::vector<dependency>> allocated =
+      allocate_inst_dependencies(this, pending);
+
+   emit_inst_dependencies(this, jps, allocated);
+
+   return true;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build

index d74caa0632ba04cf055c8072da100badebba358e..0a782fa70f191e6ddf532cd40867bf3254c83084 100644 (file)
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -62,6 +62,7 @@ libintel_compiler_files = files(
    'brw_fs_reg_allocate.cpp',
    'brw_fs_register_coalesce.cpp',
    'brw_fs_saturate_propagation.cpp',
+  'brw_fs_scoreboard.cpp',
    'brw_fs_sel_peephole.cpp',
    'brw_fs_validate.cpp',
    'brw_fs_visitor.cpp',
author	Francisco Jerez <currojerez@riseup.net>
	Fri, 9 Nov 2018 22:13:37 +0000 (14:13 -0800)
committer	Francisco Jerez <currojerez@riseup.net>
	Fri, 11 Oct 2019 19:24:16 +0000 (12:24 -0700)
src/intel/Makefile.sources		patch \| blob \| history
src/intel/compiler/brw_fs.cpp		patch \| blob \| history
src/intel/compiler/brw_fs.h		patch \| blob \| history
src/intel/compiler/brw_fs_scoreboard.cpp	[new file with mode: 0644]	patch \| blob
src/intel/compiler/meson.build		patch \| blob \| history