Added a few more stubs so that control reaches DestroyDevice().
[mesa.git] / src / intel / compiler / brw_fs_scoreboard.cpp
index f05a150e00fca757b6fdc5935d1593b185431e61..0e810218db6d3eac24b97db27def957cd8cd032e 100644 (file)
@@ -52,9 +52,6 @@
  *  - tdr0 thread dependency register
  */
 
-#include <tuple>
-#include <vector>
-
 #include "brw_fs.h"
 #include "brw_cfg.h"
 
@@ -103,19 +100,30 @@ namespace {
     */
    typedef int ordered_address;
 
+   /**
+    * Return the number of instructions in the program.
+    */
+   unsigned
+   num_instructions(const backend_shader *shader)
+   {
+      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
+   }
+
    /**
     * Calculate the local ordered_address instruction counter at every
     * instruction of the shader for subsequent constant-time look-up.
     */
-   std::vector<ordered_address>
+   ordered_address *
    ordered_inst_addresses(const fs_visitor *shader)
    {
-      std::vector<ordered_address> jps;
+      ordered_address *jps = new ordered_address[num_instructions(shader)];
       ordered_address jp = 0;
+      unsigned ip = 0;
 
       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
-         jps.push_back(jp);
+         jps[ip] = jp;
          jp += ordered_unit(inst);
+         ip++;
       }
 
       return jps;
@@ -174,6 +182,17 @@ namespace {
     * only if i == j for every pair of unsigned integers i and j.
     */
    struct equivalence_relation {
+      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
+      {
+         for (unsigned i = 0; i < n; i++)
+            is[i] = i;
+      }
+
+      ~equivalence_relation()
+      {
+         delete[] is;
+      }
+
       /**
        * Return equivalence class index of the specified element.  Effectively
        * this is the numeric value of an arbitrary representative from the
@@ -185,7 +204,7 @@ namespace {
       unsigned
       lookup(unsigned i) const
       {
-         if (i < is.size() && is[i] != i)
+         if (i < n && is[i] != i)
             return lookup(is[i]);
          else
             return i;
@@ -195,12 +214,13 @@ namespace {
        * Create an array with the results of the lookup() method for
        * constant-time evaluation.
        */
-      std::vector<unsigned>
-      flatten() const {
-         std::vector<unsigned> ids;
+      unsigned *
+      flatten() const
+      {
+         unsigned *ids = new unsigned[n];
 
-         for (const auto i : is)
-            ids.push_back(lookup(i));
+         for (unsigned i = 0; i < n; i++)
+            ids[i] = lookup(i);
 
          return ids;
       }
@@ -223,6 +243,11 @@ namespace {
       }
 
    private:
+      equivalence_relation(const equivalence_relation &);
+
+      equivalence_relation &
+      operator=(const equivalence_relation &);
+
       /**
        * Assign the representative of \p from to be equivalent to \p to.
        *
@@ -233,17 +258,17 @@ namespace {
       assign(unsigned from, unsigned to)
       {
          if (from != to) {
-            if (from < is.size() && is[from] != from)
-               assign(is[from], to);
+            assert(from < n);
 
-            for (unsigned i = is.size(); i <= from; i++)
-               is.push_back(i);
+            if (is[from] != from)
+               assign(is[from], to);
 
             is[from] = to;
          }
       }
 
-      std::vector<unsigned> is;
+      unsigned *is;
+      unsigned n;
    };
 
    /**
@@ -256,21 +281,24 @@ namespace {
        * No dependency information.
        */
       dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
-                     unordered(TGL_SBID_NULL), id(0) {}
+                     unordered(TGL_SBID_NULL), id(0),
+                     exec_all(false) {}
 
       /**
        * Construct a dependency on the in-order instruction with the provided
        * ordered_address instruction counter.
        */
-      dependency(tgl_regdist_mode mode, ordered_address jp) :
-         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0) {}
+      dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
+         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
+         exec_all(exec_all) {}
 
       /**
        * Construct a dependency on the out-of-order instruction with the
        * specified synchronization token.
        */
-      dependency(tgl_sbid_mode mode, unsigned id) :
-         ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id) {}
+      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
+         ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
+         exec_all(exec_all) {}
 
       /**
        * Synchronization mode of in-order dependency, or zero if no in-order
@@ -302,6 +330,14 @@ namespace {
       /** Synchronization token of out-of-order dependency. */
       unsigned id;
 
+      /**
+       * Whether the dependency could be run with execution masking disabled,
+       * which might lead to the unwanted execution of the generating
+       * instruction in cases where a BB is executed with all channels
+       * disabled due to hardware bug GEN:BUG:1407528679.
+       */
+      bool exec_all;
+
       /**
        * Trivial in-order dependency that's always satisfied.
        *
@@ -318,7 +354,8 @@ namespace {
          return dep0.ordered == dep1.ordered &&
                 dep0.jp == dep1.jp &&
                 dep0.unordered == dep1.unordered &&
-                dep0.id == dep1.id;
+                dep0.id == dep1.id &&
+                dep0.exec_all == dep1.exec_all;
       }
 
       friend bool
@@ -328,7 +365,7 @@ namespace {
       }
    };
 
-   const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN);
+   const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);
 
    /**
     * Return whether \p dep contains any dependency information.
@@ -363,6 +400,8 @@ namespace {
                           dep1.unordered ? dep1.id : dep0.id);
       }
 
+      dep.exec_all = dep0.exec_all || dep1.exec_all;
+
       return dep;
    }
 
@@ -559,32 +598,86 @@ namespace {
     * Dependency list handling.
     * @{
     */
+   struct dependency_list {
+      dependency_list() : deps(NULL), n(0) {}
+
+      ~dependency_list()
+      {
+         free(deps);
+      }
+
+      void
+      push_back(const dependency &dep)
+      {
+         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
+         deps[n++] = dep;
+      }
+
+      unsigned
+      size() const
+      {
+         return n;
+      }
+
+      const dependency &
+      operator[](unsigned i) const
+      {
+         assert(i < n);
+         return deps[i];
+      }
+
+      dependency &
+      operator[](unsigned i)
+      {
+         assert(i < n);
+         return deps[i];
+      }
+
+   private:
+      dependency_list(const dependency_list &);
+      dependency_list &
+      operator=(const dependency_list &);
+
+      dependency *deps;
+      unsigned n;
+   };
 
    /**
     * Add dependency \p dep to the list of dependencies of an instruction
     * \p deps.
     */
    void
-   add_dependency(const std::vector<unsigned> &ids,
-                  std::vector<dependency> &deps, dependency dep)
+   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
    {
       if (is_valid(dep)) {
          /* Translate the unordered dependency token first in order to keep
           * the list minimally redundant.
           */
-         if (dep.unordered && dep.id < ids.size())
+         if (dep.unordered)
             dep.id = ids[dep.id];
 
          /* Try to combine the specified dependency with any existing ones. */
-         for (auto &dep1 : deps) {
-            if (dep.ordered && dep1.ordered) {
-               dep1.jp = MAX2(dep1.jp, dep.jp);
-               dep1.ordered |= dep.ordered;
+         for (unsigned i = 0; i < deps.size(); i++) {
+            /* Don't combine otherwise matching dependencies if there is an
+             * exec_all mismatch which would cause a SET dependency to gain an
+             * exec_all flag, since that would prevent it from being baked
+             * into the instruction we want to allocate an SBID for.
+             */
+            if (deps[i].exec_all != dep.exec_all &&
+                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
+                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
+               continue;
+
+            if (dep.ordered && deps[i].ordered) {
+               deps[i].jp = MAX2(deps[i].jp, dep.jp);
+               deps[i].ordered |= dep.ordered;
+               deps[i].exec_all |= dep.exec_all;
                dep.ordered = TGL_REGDIST_NULL;
             }
 
-            if (dep.unordered && dep1.unordered && dep1.id == dep.id) {
-               dep1.unordered |= dep.unordered;
+            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
+               deps[i].unordered |= dep.unordered;
+               deps[i].exec_all |= dep.exec_all;
                dep.unordered = TGL_SBID_NULL;
             }
          }
@@ -597,20 +690,22 @@ namespace {
 
    /**
     * Construct a tgl_swsb annotation encoding any ordered dependencies from
-    * the dependency list \p deps of an instruction with ordered_address
-    * \p jp.
+    * the dependency list \p deps of an instruction with ordered_address \p
+    * jp.  If \p exec_all is false only dependencies known to be executed with
+    * channel masking applied will be considered in the calculation.
     */
    tgl_swsb
-   ordered_dependency_swsb(const std::vector<dependency> &deps,
-                           const ordered_address &jp)
+   ordered_dependency_swsb(const dependency_list &deps,
+                           const ordered_address &jp,
+                           bool exec_all)
    {
       unsigned min_dist = ~0u;
 
-      for (const auto &dep : deps) {
-         if (dep.ordered) {
-            const unsigned dist = jp - dep.jp;
+      for (unsigned i = 0; i < deps.size(); i++) {
+         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
+            const unsigned dist = jp - deps[i].jp;
             const unsigned max_dist = 10;
-            assert(jp > dep.jp);
+            assert(jp > deps[i].jp);
             if (dist <= max_dist)
                min_dist = MIN3(min_dist, dist, 7);
          }
@@ -621,28 +716,35 @@ namespace {
 
    /**
     * Return whether the dependency list \p deps of an instruction with
-    * ordered_address \p jp has any non-trivial ordered dependencies.
+    * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
+    * exec_all is false only dependencies known to be executed with channel
+    * masking applied will be considered in the calculation.
     */
    bool
-   find_ordered_dependency(const std::vector<dependency> &deps,
-                           const ordered_address &jp)
+   find_ordered_dependency(const dependency_list &deps,
+                           const ordered_address &jp,
+                           bool exec_all)
    {
-      return ordered_dependency_swsb(deps, jp).regdist;
+      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
    }
 
    /**
     * Return the full tgl_sbid_mode bitset for the first unordered dependency
     * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
-    * no such dependency is present.
+    * no such dependency is present.  If \p exec_all is false only
+    * dependencies known to be executed with channel masking applied will be
+    * considered in the calculation.
     */
    tgl_sbid_mode
-   find_unordered_dependency(const std::vector<dependency> &deps,
-                             tgl_sbid_mode unordered)
+   find_unordered_dependency(const dependency_list &deps,
+                             tgl_sbid_mode unordered,
+                             bool exec_all)
    {
       if (unordered) {
-         for (const auto &dep : deps) {
-            if (unordered & dep.unordered)
-               return dep.unordered;
+         for (unsigned i = 0; i < deps.size(); i++) {
+            if ((unordered & deps[i].unordered) &&
+                exec_all >= deps[i].exec_all)
+               return deps[i].unordered;
          }
       }
 
@@ -657,20 +759,21 @@ namespace {
     */
    tgl_sbid_mode
    baked_unordered_dependency_mode(const fs_inst *inst,
-                                   const std::vector<dependency> &deps,
+                                   const dependency_list &deps,
                                    const ordered_address &jp)
    {
-      const bool has_ordered = find_ordered_dependency(deps, jp);
+      const bool exec_all = inst->force_writemask_all;
+      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
 
-      if (find_unordered_dependency(deps, TGL_SBID_SET))
-         return find_unordered_dependency(deps, TGL_SBID_SET);
+      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
+         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
       else if (has_ordered && is_unordered(inst))
          return TGL_SBID_NULL;
-      else if (find_unordered_dependency(deps, TGL_SBID_DST) &&
+      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
                (!has_ordered || !is_unordered(inst)))
-         return find_unordered_dependency(deps, TGL_SBID_DST);
+         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
       else if (!has_ordered)
-         return find_unordered_dependency(deps, TGL_SBID_SRC);
+         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
       else
          return TGL_SBID_NULL;
    }
@@ -687,18 +790,20 @@ namespace {
     * instruction \p inst.
     */
    void
-   update_inst_scoreboard(const fs_visitor *shader,
-                          const std::vector<ordered_address> &jps,
+   update_inst_scoreboard(const ordered_address *jps,
                           const fs_inst *inst, unsigned ip, scoreboard &sb)
    {
+      const bool exec_all = inst->force_writemask_all;
+
       /* Track any source registers that may be fetched asynchronously by this
        * instruction, otherwise clear the dependency in order to avoid
        * subsequent redundant synchronization.
        */
       for (unsigned i = 0; i < inst->sources; i++) {
          const dependency rd_dep =
-            inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
-            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
+            (inst->is_payload(i) ||
+             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
+            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
             dependency::done;
 
          for (unsigned j = 0; j < regs_read(inst, i); j++)
@@ -706,7 +811,7 @@ namespace {
       }
 
       if (is_send(inst) && inst->base_mrf != -1) {
-         const dependency rd_dep = dependency(TGL_SBID_SRC, ip);
+         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);
 
          for (unsigned j = 0; j < inst->mlen; j++)
             sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
@@ -714,8 +819,8 @@ namespace {
 
       /* Track any destination registers of this instruction. */
       const dependency wr_dep =
-         is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
-         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
+         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
+         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
          dependency();
 
       if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
@@ -730,15 +835,15 @@ namespace {
     * unconditionally resolved) dependencies at the end of each block of the
     * program.
     */
-   std::vector<scoreboard>
+   scoreboard *
    gather_block_scoreboards(const fs_visitor *shader,
-                            const std::vector<ordered_address> &jps)
+                            const ordered_address *jps)
    {
-      std::vector<scoreboard> sbs(shader->cfg->num_blocks);
+      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
       unsigned ip = 0;
 
       foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
-         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);
+         update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);
 
       return sbs;
    }
@@ -750,15 +855,14 @@ namespace {
     * Calculates the set of dependencies potentially pending at the beginning
     * of each block, and returns it as an array of scoreboard objects.
     */
-   std::pair<std::vector<scoreboard>, std::vector<unsigned>>
+   scoreboard *
    propagate_block_scoreboards(const fs_visitor *shader,
-                               const std::vector<ordered_address> &jps)
+                               const ordered_address *jps,
+                               equivalence_relation &eq)
    {
-      const std::vector<scoreboard> delta_sbs =
-         gather_block_scoreboards(shader, jps);
-      std::vector<scoreboard> in_sbs(shader->cfg->num_blocks);
-      std::vector<scoreboard> out_sbs(shader->cfg->num_blocks);
-      equivalence_relation eq;
+      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
+      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
+      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];
 
       for (bool progress = true; progress;) {
          progress = false;
@@ -784,63 +888,68 @@ namespace {
          }
       }
 
-      return { std::move(in_sbs), eq.flatten() };
+      delete[] delta_sbs;
+      delete[] out_sbs;
+
+      return in_sbs;
    }
 
    /**
     * Return the list of potential dependencies of each instruction in the
     * shader based on the result of global dependency analysis.
     */
-   std::vector<std::vector<dependency>>
+   dependency_list *
    gather_inst_dependencies(const fs_visitor *shader,
-                            const std::vector<ordered_address> &jps)
+                            const ordered_address *jps)
    {
-      std::vector<scoreboard> sbs;
-      std::vector<unsigned> ids;
-      std::vector<std::vector<dependency>> deps;
+      equivalence_relation eq(num_instructions(shader));
+      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
+      const unsigned *ids = eq.flatten();
+      dependency_list *deps = new dependency_list[num_instructions(shader)];
       unsigned ip = 0;
 
-      std::tie(sbs, ids) = propagate_block_scoreboards(shader, jps);
-
       foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
+         const bool exec_all = inst->force_writemask_all;
          scoreboard &sb = sbs[block->num];
-         std::vector<dependency> inst_deps;
 
          for (unsigned i = 0; i < inst->sources; i++) {
             for (unsigned j = 0; j < regs_read(inst, i); j++)
-               add_dependency(ids, inst_deps, dependency_for_read(
+               add_dependency(ids, deps[ip], dependency_for_read(
                   sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
          }
 
          if (is_send(inst) && inst->base_mrf != -1) {
             for (unsigned j = 0; j < inst->mlen; j++)
-               add_dependency(ids, inst_deps, dependency_for_read(
+               add_dependency(ids, deps[ip], dependency_for_read(
                   sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
          }
 
          if (is_unordered(inst))
-            add_dependency(ids, inst_deps, dependency(TGL_SBID_SET, ip));
+            add_dependency(ids, deps[ip],
+                           dependency(TGL_SBID_SET, ip, exec_all));
 
          if (!inst->no_dd_check) {
             if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
                for (unsigned j = 0; j < regs_written(inst); j++) {
-                  add_dependency(ids, inst_deps, dependency_for_write(inst,
+                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                      sb.get(byte_offset(inst->dst, REG_SIZE * j))));
                }
             }
 
             if (is_send(inst) && inst->base_mrf != -1) {
-               for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
-                  add_dependency(ids, inst_deps, dependency_for_write(inst,
+               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
+                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                      sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
             }
          }
 
-         deps.push_back(inst_deps);
-         update_inst_scoreboard(shader, jps, inst, ip, sb);
+         update_inst_scoreboard(jps, inst, ip, sb);
          ip++;
       }
 
+      delete[] sbs;
+      delete[] ids;
+
       return deps;
    }
 
@@ -850,30 +959,39 @@ namespace {
     * Allocate SBID tokens to track the execution of every out-of-order
     * instruction of the shader.
     */
-   std::vector<std::vector<dependency>>
+   dependency_list *
    allocate_inst_dependencies(const fs_visitor *shader,
-                              const std::vector<std::vector<dependency>> &deps0)
+                              const dependency_list *deps0)
    {
       /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
        *       shaders with a large number of SEND messages.
        */
-      std::vector<std::vector<dependency>> deps1;
-      std::vector<unsigned> ids(deps0.size(), ~0u);
+
+      /* Allocate an unordered dependency ID to hardware SBID translation
+       * table with as many entries as instructions there are in the shader,
+       * which is the maximum number of unordered IDs we can find in the
+       * program.
+       */
+      unsigned *ids = new unsigned[num_instructions(shader)];
+      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
+         ids[ip] = ~0u;
+
+      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
       unsigned next_id = 0;
 
-      for (const auto &inst_deps0 : deps0) {
-         std::vector<dependency> inst_deps1;
+      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
+         for (unsigned i = 0; i < deps0[ip].size(); i++) {
+            const dependency &dep = deps0[ip][i];
 
-         for (const auto &dep : inst_deps0) {
             if (dep.unordered && ids[dep.id] == ~0u)
                ids[dep.id] = (next_id++) & 0xf;
 
-            add_dependency(ids, inst_deps1, dep);
+            add_dependency(ids, deps1[ip], dep);
          }
-
-         deps1.push_back(inst_deps1);
       }
 
+      delete[] ids;
+
       return deps1;
    }
 
@@ -884,21 +1002,28 @@ namespace {
     */
    void
    emit_inst_dependencies(fs_visitor *shader,
-                          const std::vector<ordered_address> &jps,
-                          const std::vector<std::vector<dependency>> &deps)
+                          const ordered_address *jps,
+                          const dependency_list *deps)
    {
       unsigned ip = 0;
 
       foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
-         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
+         const bool exec_all = inst->force_writemask_all;
+         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
          const tgl_sbid_mode unordered_mode =
             baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);
 
-         for (const auto &dep : deps[ip]) {
+         for (unsigned i = 0; i < deps[ip].size(); i++) {
+            const dependency &dep = deps[ip][i];
+
             if (dep.unordered) {
-               if (unordered_mode == dep.unordered && !swsb.mode) {
+               if (unordered_mode == dep.unordered &&
+                   exec_all >= dep.exec_all && !swsb.mode) {
                   /* Bake unordered dependency into the instruction's SWSB if
-                   * possible.
+                   * possible, except in cases where the current instruction
+                   * isn't marked NoMask but the dependency is, since that
+                   * might lead to data coherency issues due to
+                   * GEN:BUG:1407528679.
                    */
                   swsb.sbid = dep.id;
                   swsb.mode = dep.unordered;
@@ -907,7 +1032,7 @@ namespace {
                    * instruction.
                    */
                   const fs_builder ibld = fs_builder(shader, block, inst)
-                     .exec_all().group(1, 0);
+                                          .exec_all().group(1, 0);
                   fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                             brw_imm_ud(TGL_SYNC_NOP));
                   sync->sched.sbid = dep.id;
@@ -917,6 +1042,27 @@ namespace {
             }
          }
 
+         for (unsigned i = 0; i < deps[ip].size(); i++) {
+            const dependency &dep = deps[ip][i];
+
+            if (dep.ordered && dep.exec_all > exec_all &&
+                find_ordered_dependency(deps[ip], jps[ip], true)) {
+               /* If the current instruction is not marked NoMask but an
+                * ordered dependency is, perform the synchronization as a
+                * separate NoMask SYNC instruction in order to avoid data
+                * coherency issues due to GEN:BUG:1407528679.  The similar
+                * scenario with unordered dependencies should have been
+                * handled above.
+                */
+               const fs_builder ibld = fs_builder(shader, block, inst)
+                                       .exec_all().group(1, 0);
+               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
+                                         brw_imm_ud(TGL_SYNC_NOP));
+               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
+               break;
+            }
+         }
+
          /* Update the IR. */
          inst->sched = swsb;
          inst->no_dd_check = inst->no_dd_clear = false;
@@ -929,10 +1075,13 @@ bool
 fs_visitor::lower_scoreboard()
 {
    if (devinfo->gen >= 12) {
-      const std::vector<ordered_address> jps = ordered_inst_addresses(this);
-      emit_inst_dependencies(this, jps,
-         allocate_inst_dependencies(this,
-            gather_inst_dependencies(this, jps)));
+      const ordered_address *jps = ordered_inst_addresses(this);
+      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
+      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
+      emit_inst_dependencies(this, jps, deps1);
+      delete[] deps1;
+      delete[] deps0;
+      delete[] jps;
    }
 
    return true;