intel/fs/gen12: Introduce software scoreboard lowering pass.
[mesa.git] / src / intel / compiler / brw_fs_scoreboard.cpp
1 /*
2 * Copyright © 2019 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_scoreboard.cpp
25 *
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
29 * it.
30 *
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
35 *
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
39 *
40 * - sp stack pointer
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
46 *
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
49 *
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
53 */
54
55 #include <tuple>
56 #include <vector>
57
58 #include "brw_fs.h"
59 #include "brw_cfg.h"
60
61 using namespace brw;
62
63 namespace {
64 /**
65 * In-order instruction accounting.
66 * @{
67 */
68
69 /**
70 * Number of in-order hardware instructions contained in this IR
71 * instruction. This determines the increment applied to the RegDist
72 * counter calculated for any ordered dependency that crosses this
73 * instruction.
74 */
75 unsigned
76 ordered_unit(const fs_inst *inst)
77 {
78 switch (inst->opcode) {
79 case BRW_OPCODE_SYNC:
80 case BRW_OPCODE_DO:
81 case SHADER_OPCODE_UNDEF:
82 case FS_OPCODE_PLACEHOLDER_HALT:
83 return 0;
84 default:
85 /* Note that the following is inaccurate for virtual instructions
86 * that expand to more in-order instructions than assumed here, but
87 * that can only lead to suboptimal execution ordering, data
88 * coherency won't be impacted. Providing exact RegDist counts for
89 * each virtual instruction would allow better ALU performance, but
90 * it would require keeping this switch statement in perfect sync
91 * with the generator in order to avoid data corruption. Lesson is
92 * (again) don't use virtual instructions if you want optimal
93 * scheduling.
94 */
95 return is_unordered(inst) ? 0 : 1;
96 }
97 }
98
   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    *
    * Signed so that INT_MIN can serve as a "no known address" sentinel
    * (see dependency() and transport()).
    */
   typedef int ordered_address;
105
   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    *
    * The result is indexed by the linear instruction counter 'ip': jps[ip]
    * is the number of in-order hardware instructions preceding instruction
    * ip in the program.
    */
   std::vector<ordered_address>
   ordered_inst_addresses(const fs_visitor *shader)
   {
      std::vector<ordered_address> jps;
      ordered_address jp = 0;

      /* Record the counter *before* each instruction, then advance it by
       * the number of in-order hardware instructions this IR instruction
       * expands to (zero for unordered and pseudo instructions).
       */
      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps.push_back(jp);
         jp += ordered_unit(inst);
      }

      return jps;
   }
123
124 /**
125 * Synchronization mode required for data manipulated by in-order
126 * instructions.
127 *
128 * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
129 * enum for additional type safety. The hardware doesn't provide control
130 * over the synchronization mode for RegDist annotations, this is only used
131 * internally in this pass in order to optimize out redundant read
132 * dependencies where possible.
133 */
134 enum tgl_regdist_mode {
135 TGL_REGDIST_NULL = 0,
136 TGL_REGDIST_SRC = 1,
137 TGL_REGDIST_DST = 2
138 };
139
140 /**
141 * Allow bitwise arithmetic of tgl_regdist_mode enums.
142 */
143 tgl_regdist_mode
144 operator|(tgl_regdist_mode x, tgl_regdist_mode y)
145 {
146 return tgl_regdist_mode(unsigned(x) | unsigned(y));
147 }
148
149 tgl_regdist_mode
150 operator&(tgl_regdist_mode x, tgl_regdist_mode y)
151 {
152 return tgl_regdist_mode(unsigned(x) & unsigned(y));
153 }
154
155 tgl_regdist_mode &
156 operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
157 {
158 return x = x | y;
159 }
160
161 tgl_regdist_mode &
162 operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
163 {
164 return x = x & y;
165 }
166
167 /** @} */
168
169 /**
170 * Representation of an equivalence relation among the set of unsigned
171 * integers.
172 *
173 * Its initial state is the identity relation '~' such that i ~ j if and
174 * only if i == j for every pair of unsigned integers i and j.
175 */
176 struct equivalence_relation {
177 /**
178 * Return equivalence class index of the specified element. Effectively
179 * this is the numeric value of an arbitrary representative from the
180 * equivalence class.
181 *
182 * Allows the evaluation of the equivalence relation according to the
183 * rule that i ~ j if and only if lookup(i) == lookup(j).
184 */
185 unsigned
186 lookup(unsigned i) const
187 {
188 if (i < is.size() && is[i] != i)
189 return lookup(is[i]);
190 else
191 return i;
192 }
193
194 /**
195 * Create an array with the results of the lookup() method for
196 * constant-time evaluation.
197 */
198 std::vector<unsigned>
199 flatten() const {
200 std::vector<unsigned> ids;
201
202 for (const auto i : is)
203 ids.push_back(lookup(i));
204
205 return ids;
206 }
207
208 /**
209 * Mutate the existing equivalence relation minimally by imposing the
210 * additional requirement that i ~ j.
211 *
212 * The algorithm updates the internal representation recursively in
213 * order to guarantee transitivity while preserving the previously
214 * specified equivalence requirements.
215 */
216 unsigned
217 link(unsigned i, unsigned j)
218 {
219 const unsigned k = lookup(i);
220 assign(i, k);
221 assign(j, k);
222 return k;
223 }
224
225 private:
226 /**
227 * Assign the representative of \p from to be equivalent to \p to.
228 *
229 * At the same time the data structure is partially flattened as much as
230 * it's possible without increasing the number of recursive calls.
231 */
232 void
233 assign(unsigned from, unsigned to)
234 {
235 if (from != to) {
236 if (from < is.size() && is[from] != from)
237 assign(is[from], to);
238
239 for (unsigned i = is.size(); i <= from; i++)
240 is.push_back(i);
241
242 is[from] = to;
243 }
244 }
245
246 std::vector<unsigned> is;
247 };
248
249 /**
250 * Representation of a data dependency between two instructions in the
251 * program.
252 * @{
253 */
254 struct dependency {
255 /**
256 * No dependency information.
257 */
258 dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
259 unordered(TGL_SBID_NULL), id(0) {}
260
261 /**
262 * Construct a dependency on the in-order instruction with the provided
263 * ordered_address instruction counter.
264 */
265 dependency(tgl_regdist_mode mode, ordered_address jp) :
266 ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0) {}
267
268 /**
269 * Construct a dependency on the out-of-order instruction with the
270 * specified synchronization token.
271 */
272 dependency(tgl_sbid_mode mode, unsigned id) :
273 ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id) {}
274
275 /**
276 * Synchronization mode of in-order dependency, or zero if no in-order
277 * dependency is present.
278 */
279 tgl_regdist_mode ordered;
280
281 /**
282 * Instruction counter of in-order dependency.
283 *
284 * For a dependency part of a different block in the program, this is
285 * relative to the specific control flow path taken between the
286 * dependency and the current block: It is the ordered_address such that
287 * the difference between it and the ordered_address of the first
288 * instruction of the current block is exactly the number of in-order
289 * instructions across that control flow path. It is not guaranteed to
290 * be equal to the local ordered_address of the generating instruction
291 * [as returned by ordered_inst_addresses()], except for block-local
292 * dependencies.
293 */
294 ordered_address jp;
295
296 /**
297 * Synchronization mode of unordered dependency, or zero if no unordered
298 * dependency is present.
299 */
300 tgl_sbid_mode unordered;
301
302 /** Synchronization token of out-of-order dependency. */
303 unsigned id;
304
305 /**
306 * Trivial in-order dependency that's always satisfied.
307 *
308 * Note that unlike a default-constructed dependency() which is also
309 * trivially satisfied, this is considered to provide dependency
310 * information and can be used to clear a previously pending dependency
311 * via shadow().
312 */
313 static const dependency done;
314
315 friend bool
316 operator==(const dependency &dep0, const dependency &dep1)
317 {
318 return dep0.ordered == dep1.ordered &&
319 dep0.jp == dep1.jp &&
320 dep0.unordered == dep1.unordered &&
321 dep0.id == dep1.id;
322 }
323
324 friend bool
325 operator!=(const dependency &dep0, const dependency &dep1)
326 {
327 return !(dep0 == dep1);
328 }
329 };
330
331 const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN);
332
333 /**
334 * Return whether \p dep contains any dependency information.
335 */
336 bool
337 is_valid(const dependency &dep)
338 {
339 return dep.ordered || dep.unordered;
340 }
341
342 /**
343 * Combine \p dep0 and \p dep1 into a single dependency object that is only
344 * satisfied when both original dependencies are satisfied. This might
345 * involve updating the equivalence relation \p eq in order to make sure
346 * that both out-of-order dependencies are assigned the same hardware SBID
347 * as synchronization token.
348 */
349 dependency
350 merge(equivalence_relation &eq,
351 const dependency &dep0, const dependency &dep1)
352 {
353 dependency dep;
354
355 if (dep0.ordered || dep1.ordered) {
356 dep.ordered = dep0.ordered | dep1.ordered;
357 dep.jp = MAX2(dep0.jp, dep1.jp);
358 }
359
360 if (dep0.unordered || dep1.unordered) {
361 dep.unordered = dep0.unordered | dep1.unordered;
362 dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
363 dep1.unordered ? dep1.id : dep0.id);
364 }
365
366 return dep;
367 }
368
369 /**
370 * Override dependency information of \p dep0 with that of \p dep1.
371 */
372 dependency
373 shadow(const dependency &dep0, const dependency &dep1)
374 {
375 return is_valid(dep1) ? dep1 : dep0;
376 }
377
378 /**
379 * Translate dependency information across the program.
380 *
381 * This returns a dependency on the same instruction translated to the
382 * ordered_address space of a different block. The correct shift for
383 * transporting a dependency across an edge of the CFG is the difference
384 * between the local ordered_address of the first instruction of the target
385 * block and the local ordered_address of the instruction immediately after
386 * the end of the origin block.
387 */
388 dependency
389 transport(dependency dep, int delta)
390 {
391 if (dep.ordered && dep.jp > INT_MIN)
392 dep.jp += delta;
393
394 return dep;
395 }
396
397 /**
398 * Return simplified dependency removing any synchronization modes not
399 * applicable to an instruction reading the same register location.
400 */
401 dependency
402 dependency_for_read(dependency dep)
403 {
404 dep.ordered &= TGL_REGDIST_DST;
405 return dep;
406 }
407
408 /**
409 * Return simplified dependency removing any synchronization modes not
410 * applicable to an instruction \p inst writing the same register location.
411 */
412 dependency
413 dependency_for_write(const fs_inst *inst, dependency dep)
414 {
415 if (!is_unordered(inst))
416 dep.ordered &= TGL_REGDIST_DST;
417 return dep;
418 }
419
420 /** @} */
421
   /**
    * Scoreboard representation.  This keeps track of the data dependencies of
    * registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       *
       * Returns a default (invalid) dependency for register locations this
       * pass doesn't track (see dep() below).
       */
      dependency
      get(const fs_reg &r) const
      {
         /* dep() is non-const since it hands back a mutable pointer into the
          * tracking arrays; the const_cast is safe because only reads are
          * performed through the result here.
          */
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       *
       * Writes to untracked register locations are silently dropped.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects.  \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects.  \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object.  \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);

         for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
            sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
            if (sb0.accum_deps[i] != sb1.accum_deps[i])
               return false;
         }

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      /* Per-GRF dependencies; MRF registers are aliased into this array at
       * offset GEN7_MRF_HACK_START (see dep()).
       */
      dependency grf_deps[BRW_MAX_GRF];
      /* Dependency for the address register. */
      dependency addr_dep;
      /* Dependencies for the accumulator registers (ARF range between
       * BRW_ARF_ACCUMULATOR and BRW_ARF_FLAG).
       */
      dependency accum_deps[10];

      /**
       * Return a pointer to the tracking slot for register \p r, or NULL if
       * \p r belongs to a register file this pass doesn't track.
       */
      dependency *
      dep(const fs_reg &r)
      {
         /* VGRFs are addressed by virtual register number plus register-sized
          * offset; everything else by its physical register offset.
          */
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                 reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                 reg < BRW_ARF_FLAG ? &accum_deps[
                    reg - BRW_ARF_ACCUMULATOR] :
                 NULL);
      }
   };
557
558 /**
559 * Dependency list handling.
560 * @{
561 */
562
563 /**
564 * Add dependency \p dep to the list of dependencies of an instruction
565 * \p deps.
566 */
567 void
568 add_dependency(const std::vector<unsigned> &ids,
569 std::vector<dependency> &deps, dependency dep)
570 {
571 if (is_valid(dep)) {
572 /* Translate the unordered dependency token first in order to keep
573 * the list minimally redundant.
574 */
575 if (dep.unordered && dep.id < ids.size())
576 dep.id = ids[dep.id];
577
578 /* Try to combine the specified dependency with any existing ones. */
579 for (auto &dep1 : deps) {
580 if (dep.ordered && dep1.ordered) {
581 dep1.jp = MAX2(dep1.jp, dep.jp);
582 dep1.ordered |= dep.ordered;
583 dep.ordered = TGL_REGDIST_NULL;
584 }
585
586 if (dep.unordered && dep1.unordered && dep1.id == dep.id) {
587 dep1.unordered |= dep.unordered;
588 dep.unordered = TGL_SBID_NULL;
589 }
590 }
591
592 /* Add it to the end of the list if necessary. */
593 if (is_valid(dep))
594 deps.push_back(dep);
595 }
596 }
597
598 /**
599 * Construct a tgl_swsb annotation encoding any ordered dependencies from
600 * the dependency list \p deps of an instruction with ordered_address
601 * \p jp.
602 */
603 tgl_swsb
604 ordered_dependency_swsb(const std::vector<dependency> &deps,
605 const ordered_address &jp)
606 {
607 unsigned min_dist = ~0u;
608
609 for (const auto &dep : deps) {
610 if (dep.ordered) {
611 const unsigned dist = jp - dep.jp;
612 const unsigned max_dist = 10;
613 assert(jp > dep.jp);
614 if (dist <= max_dist)
615 min_dist = MIN3(min_dist, dist, 7);
616 }
617 }
618
619 return { min_dist == ~0u ? 0 : min_dist };
620 }
621
622 /**
623 * Return whether the dependency list \p deps of an instruction with
624 * ordered_address \p jp has any non-trivial ordered dependencies.
625 */
626 bool
627 find_ordered_dependency(const std::vector<dependency> &deps,
628 const ordered_address &jp)
629 {
630 return ordered_dependency_swsb(deps, jp).regdist;
631 }
632
633 /**
634 * Return the full tgl_sbid_mode bitset for the first unordered dependency
635 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
636 * no such dependency is present.
637 */
638 tgl_sbid_mode
639 find_unordered_dependency(const std::vector<dependency> &deps,
640 tgl_sbid_mode unordered)
641 {
642 if (unordered) {
643 for (const auto &dep : deps) {
644 if (unordered & dep.unordered)
645 return dep.unordered;
646 }
647 }
648
649 return TGL_SBID_NULL;
650 }
651
652 /**
653 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
654 * \p deps that can be represented directly in the SWSB annotation of the
655 * instruction without additional SYNC instructions, or zero if no such
656 * dependency is present.
657 */
658 tgl_sbid_mode
659 baked_unordered_dependency_mode(const fs_inst *inst,
660 const std::vector<dependency> &deps,
661 const ordered_address &jp)
662 {
663 const bool has_ordered = find_ordered_dependency(deps, jp);
664
665 if (find_unordered_dependency(deps, TGL_SBID_SET))
666 return find_unordered_dependency(deps, TGL_SBID_SET);
667 else if (has_ordered && is_unordered(inst))
668 return TGL_SBID_NULL;
669 else if (find_unordered_dependency(deps, TGL_SBID_DST) &&
670 (!has_ordered || !is_unordered(inst)))
671 return find_unordered_dependency(deps, TGL_SBID_DST);
672 else if (!has_ordered)
673 return find_unordered_dependency(deps, TGL_SBID_SRC);
674 else
675 return TGL_SBID_NULL;
676 }
677
678 /** @} */
679
680 /**
681 * Shader instruction dependency calculation.
682 * @{
683 */
684
   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    *
    * \param jps Local ordered_address of every instruction, indexed by the
    *            linear instruction counter \p ip.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader,
                          const std::vector<ordered_address> &jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      /* Track any source registers that may be fetched asynchronously by this
       * instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         /* Payload and math sources may be read asynchronously (tracked by
          * SBID token \p ip); other in-order reads are tracked by RegDist;
          * anything else resolves the pending dependency (dependency::done).
          */
         const dependency rd_dep =
            inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
            ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      /* Legacy SEND payloads delivered through the MRF file are read
       * asynchronously as well.
       */
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
         dependency();

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }
727
   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    *
    * \return One scoreboard per basic block, indexed by block number.
    */
   std::vector<scoreboard>
   gather_block_scoreboards(const fs_visitor *shader,
                            const std::vector<ordered_address> &jps)
   {
      std::vector<scoreboard> sbs(shader->cfg->num_blocks);
      /* Linear instruction counter, shared across all blocks, used as
       * virtual SBID token by update_inst_scoreboard().
       */
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }
745
   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.  Also
    * returns the flattened SBID token equivalence relation accumulated while
    * merging scoreboards at control flow joins.
    */
   std::pair<std::vector<scoreboard>, std::vector<unsigned>>
   propagate_block_scoreboards(const fs_visitor *shader,
                               const std::vector<ordered_address> &jps)
   {
      const std::vector<scoreboard> delta_sbs =
         gather_block_scoreboards(shader, jps);
      std::vector<scoreboard> in_sbs(shader->cfg->num_blocks);
      std::vector<scoreboard> out_sbs(shader->cfg->num_blocks);
      equivalence_relation eq;

      /* Forward dataflow iteration: recompute each block's output scoreboard
       * from its input until no output changes anymore.
       */
      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            /* Dependencies generated within the block shadow (override) any
             * dependency still pending on entry to it.
             */
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  /* Shift the ordered_address counters into the successor
                   * block's frame of reference.  \sa transport().
                   */
                  const int delta =
                     jps[child_link->block->start_ip] - jps[block->end_ip]
                     - ordered_unit(static_cast<const fs_inst *>(block->end()));

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      return { std::move(in_sbs), eq.flatten() };
   }
789
   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    *
    * The result is indexed by the linear instruction counter 'ip'.
    */
   std::vector<std::vector<dependency>>
   gather_inst_dependencies(const fs_visitor *shader,
                            const std::vector<ordered_address> &jps)
   {
      std::vector<scoreboard> sbs;
      std::vector<unsigned> ids;
      std::vector<std::vector<dependency>> deps;
      unsigned ip = 0;

      std::tie(sbs, ids) = propagate_block_scoreboards(shader, jps);

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         /* Scoreboard of dependencies pending right before this instruction;
          * mutated in place as we walk the block.
          */
         scoreboard &sb = sbs[block->num];
         std::vector<dependency> inst_deps;

         /* Dependencies on every register location read by the instruction
          * (only write components are relevant, see dependency_for_read()).
          */
         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, inst_deps, dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         /* Legacy SEND payload read through the MRF file. */
         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, inst_deps, dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         /* An out-of-order instruction gets a SET dependency on its own
          * token \p ip, which marks the token for allocation.
          */
         if (is_unordered(inst))
            add_dependency(ids, inst_deps, dependency(TGL_SBID_SET, ip));

         if (!inst->no_dd_check) {
            /* Dependencies on every register location overwritten by the
             * instruction (see dependency_for_write()).
             */
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, inst_deps, dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
                  add_dependency(ids, inst_deps, dependency_for_write(inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         deps.push_back(inst_deps);
         /* Account for this instruction's own reads and writes before moving
          * on to the next instruction.
          */
         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      return deps;
   }
846
847 /** @} */
848
849 /**
850 * Allocate SBID tokens to track the execution of every out-of-order
851 * instruction of the shader.
852 */
853 std::vector<std::vector<dependency>>
854 allocate_inst_dependencies(const fs_visitor *shader,
855 const std::vector<std::vector<dependency>> &deps0)
856 {
857 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
858 * shaders with a large number of SEND messages.
859 */
860 std::vector<std::vector<dependency>> deps1;
861 std::vector<unsigned> ids(deps0.size(), ~0u);
862 unsigned next_id = 0;
863
864 for (const auto &inst_deps0 : deps0) {
865 std::vector<dependency> inst_deps1;
866
867 for (const auto &dep : inst_deps0) {
868 if (dep.unordered && ids[dep.id] == ~0u)
869 ids[dep.id] = (next_id++) & 0xf;
870
871 add_dependency(ids, inst_deps1, dep);
872 }
873
874 deps1.push_back(inst_deps1);
875 }
876
877 return deps1;
878 }
879
   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const std::vector<ordered_address> &jps,
                          const std::vector<std::vector<dependency>> &deps)
   {
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         /* RegDist annotation covering every in-order dependency of the
          * instruction.
          */
         tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
         /* At most one out-of-order dependency can be baked into the
          * instruction's own SWSB field; any others need SYNC instructions.
          */
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

         for (const auto &dep : deps[ip]) {
            if (dep.unordered) {
               if (unordered_mode == dep.unordered && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  /* SET mode belongs to the out-of-order instruction itself,
                   * never to a SYNC emitted in front of it.
                   */
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
926 }
927
928 bool
929 fs_visitor::lower_scoreboard()
930 {
931 if (devinfo->gen >= 12) {
932 const std::vector<ordered_address> jps = ordered_inst_addresses(this);
933 emit_inst_dependencies(this, jps,
934 allocate_inst_dependencies(this,
935 gather_inst_dependencies(this, jps)));
936 }
937
938 return true;
939 }