/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 /** @file brw_fs_scoreboard.cpp
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
62 * In-order instruction accounting.
67 * Number of in-order hardware instructions contained in this IR
68 * instruction. This determines the increment applied to the RegDist
69 * counter calculated for any ordered dependency that crosses this
73 ordered_unit(const fs_inst
*inst
)
75 switch (inst
->opcode
) {
78 case SHADER_OPCODE_UNDEF
:
79 case FS_OPCODE_PLACEHOLDER_HALT
:
82 /* Note that the following is inaccurate for virtual instructions
83 * that expand to more in-order instructions than assumed here, but
84 * that can only lead to suboptimal execution ordering, data
85 * coherency won't be impacted. Providing exact RegDist counts for
86 * each virtual instruction would allow better ALU performance, but
87 * it would require keeping this switch statement in perfect sync
88 * with the generator in order to avoid data corruption. Lesson is
89 * (again) don't use virtual instructions if you want optimal
92 return is_unordered(inst
) ? 0 : 1;
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 */
typedef int ordered_address;
104 * Return the number of instructions in the program.
107 num_instructions(const backend_shader
*shader
)
109 return shader
->cfg
->blocks
[shader
->cfg
->num_blocks
- 1]->end_ip
+ 1;
113 * Calculate the local ordered_address instruction counter at every
114 * instruction of the shader for subsequent constant-time look-up.
117 ordered_inst_addresses(const fs_visitor
*shader
)
119 ordered_address
*jps
= new ordered_address
[num_instructions(shader
)];
120 ordered_address jp
= 0;
123 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
125 jp
+= ordered_unit(inst
);
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations, this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   /* NOTE(review): the SRC/DST enumerators were dropped by the extraction;
    * values reconstructed as single bits so the bitwise operators below
    * compose them -- confirm against upstream.
    */
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};
149 * Allow bitwise arithmetic of tgl_regdist_mode enums.
152 operator|(tgl_regdist_mode x
, tgl_regdist_mode y
)
154 return tgl_regdist_mode(unsigned(x
) | unsigned(y
));
158 operator&(tgl_regdist_mode x
, tgl_regdist_mode y
)
160 return tgl_regdist_mode(unsigned(x
) & unsigned(y
));
164 operator|=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
170 operator&=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   /** Construct the identity relation over \p n elements. */
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    *
    * Elements at or beyond \p n are their own representative.
    */
   unsigned
   lookup(unsigned i) const
   {
      if (i < n && is[i] != i)
         return lookup(is[i]);
      else
         return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.  Caller owns (and must delete[]) the
    * returned array.
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.
    *
    * The algorithm updates the internal representation recursively in
    * order to guarantee transitivity while preserving the previously
    * specified equivalence requirements.
    *
    * \return The representative of the combined equivalence class.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   /* Copying is disabled -- the relation owns a raw array. */
   equivalence_relation(const equivalence_relation &);
   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened as much as
    * it's possible without increasing the number of recursive calls.
    */
   void
   assign(unsigned from, unsigned to)
   {
      if (from != to) {
         assert(from < n);

         if (is[from] != from)
            assign(is[from], to);

         is[from] = to;
      }
   }

   /* is[i] points towards the representative of i's class; is[i] == i for
    * class representatives.
    */
   unsigned *is;
   unsigned n;
};
275 * Representation of a data dependency between two instructions in the
281 * No dependency information.
283 dependency() : ordered(TGL_REGDIST_NULL
), jp(INT_MIN
),
284 unordered(TGL_SBID_NULL
), id(0) {}
287 * Construct a dependency on the in-order instruction with the provided
288 * ordered_address instruction counter.
290 dependency(tgl_regdist_mode mode
, ordered_address jp
) :
291 ordered(mode
), jp(jp
), unordered(TGL_SBID_NULL
), id(0) {}
294 * Construct a dependency on the out-of-order instruction with the
295 * specified synchronization token.
297 dependency(tgl_sbid_mode mode
, unsigned id
) :
298 ordered(TGL_REGDIST_NULL
), jp(INT_MIN
), unordered(mode
), id(id
) {}
301 * Synchronization mode of in-order dependency, or zero if no in-order
302 * dependency is present.
304 tgl_regdist_mode ordered
;
307 * Instruction counter of in-order dependency.
309 * For a dependency part of a different block in the program, this is
310 * relative to the specific control flow path taken between the
311 * dependency and the current block: It is the ordered_address such that
312 * the difference between it and the ordered_address of the first
313 * instruction of the current block is exactly the number of in-order
314 * instructions across that control flow path. It is not guaranteed to
315 * be equal to the local ordered_address of the generating instruction
316 * [as returned by ordered_inst_addresses()], except for block-local
322 * Synchronization mode of unordered dependency, or zero if no unordered
323 * dependency is present.
325 tgl_sbid_mode unordered
;
327 /** Synchronization token of out-of-order dependency. */
331 * Trivial in-order dependency that's always satisfied.
333 * Note that unlike a default-constructed dependency() which is also
334 * trivially satisfied, this is considered to provide dependency
335 * information and can be used to clear a previously pending dependency
338 static const dependency done
;
341 operator==(const dependency
&dep0
, const dependency
&dep1
)
343 return dep0
.ordered
== dep1
.ordered
&&
344 dep0
.jp
== dep1
.jp
&&
345 dep0
.unordered
== dep1
.unordered
&&
350 operator!=(const dependency
&dep0
, const dependency
&dep1
)
352 return !(dep0
== dep1
);
356 const dependency
dependency::done
= dependency(TGL_REGDIST_SRC
, INT_MIN
);
359 * Return whether \p dep contains any dependency information.
362 is_valid(const dependency
&dep
)
364 return dep
.ordered
|| dep
.unordered
;
368 * Combine \p dep0 and \p dep1 into a single dependency object that is only
369 * satisfied when both original dependencies are satisfied. This might
370 * involve updating the equivalence relation \p eq in order to make sure
371 * that both out-of-order dependencies are assigned the same hardware SBID
372 * as synchronization token.
375 merge(equivalence_relation
&eq
,
376 const dependency
&dep0
, const dependency
&dep1
)
380 if (dep0
.ordered
|| dep1
.ordered
) {
381 dep
.ordered
= dep0
.ordered
| dep1
.ordered
;
382 dep
.jp
= MAX2(dep0
.jp
, dep1
.jp
);
385 if (dep0
.unordered
|| dep1
.unordered
) {
386 dep
.unordered
= dep0
.unordered
| dep1
.unordered
;
387 dep
.id
= eq
.link(dep0
.unordered
? dep0
.id
: dep1
.id
,
388 dep1
.unordered
? dep1
.id
: dep0
.id
);
395 * Override dependency information of \p dep0 with that of \p dep1.
398 shadow(const dependency
&dep0
, const dependency
&dep1
)
400 return is_valid(dep1
) ? dep1
: dep0
;
404 * Translate dependency information across the program.
406 * This returns a dependency on the same instruction translated to the
407 * ordered_address space of a different block. The correct shift for
408 * transporting a dependency across an edge of the CFG is the difference
409 * between the local ordered_address of the first instruction of the target
410 * block and the local ordered_address of the instruction immediately after
411 * the end of the origin block.
414 transport(dependency dep
, int delta
)
416 if (dep
.ordered
&& dep
.jp
> INT_MIN
)
423 * Return simplified dependency removing any synchronization modes not
424 * applicable to an instruction reading the same register location.
427 dependency_for_read(dependency dep
)
429 dep
.ordered
&= TGL_REGDIST_DST
;
434 * Return simplified dependency removing any synchronization modes not
435 * applicable to an instruction \p inst writing the same register location.
438 dependency_for_write(const fs_inst
*inst
, dependency dep
)
440 if (!is_unordered(inst
))
441 dep
.ordered
&= TGL_REGDIST_DST
;
448 * Scoreboard representation. This keeps track of the data dependencies of
449 * registers with GRF granularity.
454 * Look up the most current data dependency for register \p r.
457 get(const fs_reg
&r
) const
459 if (const dependency
*p
= const_cast<scoreboard
*>(this)->dep(r
))
466 * Specify the most current data dependency for register \p r.
469 set(const fs_reg
&r
, const dependency
&d
)
471 if (dependency
*p
= dep(r
))
476 * Component-wise merge() of corresponding dependencies from two
477 * scoreboard objects. \sa merge().
480 merge(equivalence_relation
&eq
,
481 const scoreboard
&sb0
, const scoreboard
&sb1
)
485 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
486 sb
.grf_deps
[i
] = merge(eq
, sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
488 sb
.addr_dep
= merge(eq
, sb0
.addr_dep
, sb1
.addr_dep
);
490 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
491 sb
.accum_deps
[i
] = merge(eq
, sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
497 * Component-wise shadow() of corresponding dependencies from two
498 * scoreboard objects. \sa shadow().
501 shadow(const scoreboard
&sb0
, const scoreboard
&sb1
)
505 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
506 sb
.grf_deps
[i
] = shadow(sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
508 sb
.addr_dep
= shadow(sb0
.addr_dep
, sb1
.addr_dep
);
510 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
511 sb
.accum_deps
[i
] = shadow(sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
517 * Component-wise transport() of dependencies from a scoreboard
518 * object. \sa transport().
521 transport(const scoreboard
&sb0
, int delta
)
525 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
526 sb
.grf_deps
[i
] = transport(sb0
.grf_deps
[i
], delta
);
528 sb
.addr_dep
= transport(sb0
.addr_dep
, delta
);
530 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
531 sb
.accum_deps
[i
] = transport(sb0
.accum_deps
[i
], delta
);
537 operator==(const scoreboard
&sb0
, const scoreboard
&sb1
)
539 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.grf_deps
); i
++) {
540 if (sb0
.grf_deps
[i
] != sb1
.grf_deps
[i
])
544 if (sb0
.addr_dep
!= sb1
.addr_dep
)
547 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.accum_deps
); i
++) {
548 if (sb0
.accum_deps
[i
] != sb1
.accum_deps
[i
])
556 operator!=(const scoreboard
&sb0
, const scoreboard
&sb1
)
558 return !(sb0
== sb1
);
562 dependency grf_deps
[BRW_MAX_GRF
];
564 dependency accum_deps
[10];
569 const unsigned reg
= (r
.file
== VGRF
? r
.nr
+ r
.offset
/ REG_SIZE
:
570 reg_offset(r
) / REG_SIZE
);
572 return (r
.file
== VGRF
|| r
.file
== FIXED_GRF
? &grf_deps
[reg
] :
573 r
.file
== MRF
? &grf_deps
[GEN7_MRF_HACK_START
+ reg
] :
574 r
.file
== ARF
&& reg
>= BRW_ARF_ADDRESS
&&
575 reg
< BRW_ARF_ACCUMULATOR
? &addr_dep
:
576 r
.file
== ARF
&& reg
>= BRW_ARF_ACCUMULATOR
&&
577 reg
< BRW_ARF_FLAG
? &accum_deps
[
578 reg
- BRW_ARF_ACCUMULATOR
] :
584 * Dependency list handling.
587 struct dependency_list
{
588 dependency_list() : deps(NULL
), n(0) {}
596 push_back(const dependency
&dep
)
598 deps
= (dependency
*)realloc(deps
, (n
+ 1) * sizeof(*deps
));
609 operator[](unsigned i
) const
616 operator[](unsigned i
)
623 dependency_list(const dependency_list
&);
625 operator=(const dependency_list
&);
632 * Add dependency \p dep to the list of dependencies of an instruction
636 add_dependency(const unsigned *ids
, dependency_list
&deps
, dependency dep
)
639 /* Translate the unordered dependency token first in order to keep
640 * the list minimally redundant.
643 dep
.id
= ids
[dep
.id
];
645 /* Try to combine the specified dependency with any existing ones. */
646 for (unsigned i
= 0; i
< deps
.size(); i
++) {
647 if (dep
.ordered
&& deps
[i
].ordered
) {
648 deps
[i
].jp
= MAX2(deps
[i
].jp
, dep
.jp
);
649 deps
[i
].ordered
|= dep
.ordered
;
650 dep
.ordered
= TGL_REGDIST_NULL
;
653 if (dep
.unordered
&& deps
[i
].unordered
&& deps
[i
].id
== dep
.id
) {
654 deps
[i
].unordered
|= dep
.unordered
;
655 dep
.unordered
= TGL_SBID_NULL
;
659 /* Add it to the end of the list if necessary. */
666 * Construct a tgl_swsb annotation encoding any ordered dependencies from
667 * the dependency list \p deps of an instruction with ordered_address
671 ordered_dependency_swsb(const dependency_list
&deps
,
672 const ordered_address
&jp
)
674 unsigned min_dist
= ~0u;
676 for (unsigned i
= 0; i
< deps
.size(); i
++) {
677 if (deps
[i
].ordered
) {
678 const unsigned dist
= jp
- deps
[i
].jp
;
679 const unsigned max_dist
= 10;
680 assert(jp
> deps
[i
].jp
);
681 if (dist
<= max_dist
)
682 min_dist
= MIN3(min_dist
, dist
, 7);
686 return { min_dist
== ~0u ? 0 : min_dist
};
690 * Return whether the dependency list \p deps of an instruction with
691 * ordered_address \p jp has any non-trivial ordered dependencies.
694 find_ordered_dependency(const dependency_list
&deps
,
695 const ordered_address
&jp
)
697 return ordered_dependency_swsb(deps
, jp
).regdist
;
701 * Return the full tgl_sbid_mode bitset for the first unordered dependency
702 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
703 * no such dependency is present.
706 find_unordered_dependency(const dependency_list
&deps
,
707 tgl_sbid_mode unordered
)
710 for (unsigned i
= 0; i
< deps
.size(); i
++) {
711 if (unordered
& deps
[i
].unordered
)
712 return deps
[i
].unordered
;
716 return TGL_SBID_NULL
;
720 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
721 * \p deps that can be represented directly in the SWSB annotation of the
722 * instruction without additional SYNC instructions, or zero if no such
723 * dependency is present.
726 baked_unordered_dependency_mode(const fs_inst
*inst
,
727 const dependency_list
&deps
,
728 const ordered_address
&jp
)
730 const bool has_ordered
= find_ordered_dependency(deps
, jp
);
732 if (find_unordered_dependency(deps
, TGL_SBID_SET
))
733 return find_unordered_dependency(deps
, TGL_SBID_SET
);
734 else if (has_ordered
&& is_unordered(inst
))
735 return TGL_SBID_NULL
;
736 else if (find_unordered_dependency(deps
, TGL_SBID_DST
) &&
737 (!has_ordered
|| !is_unordered(inst
)))
738 return find_unordered_dependency(deps
, TGL_SBID_DST
);
739 else if (!has_ordered
)
740 return find_unordered_dependency(deps
, TGL_SBID_SRC
);
742 return TGL_SBID_NULL
;
748 * Shader instruction dependency calculation.
753 * Update scoreboard object \p sb to account for the execution of
754 * instruction \p inst.
757 update_inst_scoreboard(const fs_visitor
*shader
, const ordered_address
*jps
,
758 const fs_inst
*inst
, unsigned ip
, scoreboard
&sb
)
760 /* Track any source registers that may be fetched asynchronously by this
761 * instruction, otherwise clear the dependency in order to avoid
762 * subsequent redundant synchronization.
764 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
765 const dependency rd_dep
=
766 inst
->is_payload(i
) || inst
->is_math() ? dependency(TGL_SBID_SRC
, ip
) :
767 ordered_unit(inst
) ? dependency(TGL_REGDIST_SRC
, jps
[ip
]) :
770 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
771 sb
.set(byte_offset(inst
->src
[i
], REG_SIZE
* j
), rd_dep
);
774 if (is_send(inst
) && inst
->base_mrf
!= -1) {
775 const dependency rd_dep
= dependency(TGL_SBID_SRC
, ip
);
777 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
778 sb
.set(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0), rd_dep
);
781 /* Track any destination registers of this instruction. */
782 const dependency wr_dep
=
783 is_unordered(inst
) ? dependency(TGL_SBID_DST
, ip
) :
784 ordered_unit(inst
) ? dependency(TGL_REGDIST_DST
, jps
[ip
]) :
787 if (is_valid(wr_dep
) && inst
->dst
.file
!= BAD_FILE
&&
788 !inst
->dst
.is_null()) {
789 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
790 sb
.set(byte_offset(inst
->dst
, REG_SIZE
* j
), wr_dep
);
795 * Calculate scoreboard objects locally that represent any pending (and
796 * unconditionally resolved) dependencies at the end of each block of the
800 gather_block_scoreboards(const fs_visitor
*shader
,
801 const ordered_address
*jps
)
803 scoreboard
*sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
806 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
)
807 update_inst_scoreboard(shader
, jps
, inst
, ip
++, sbs
[block
->num
]);
813 * Propagate data dependencies globally through the control flow graph
814 * until a fixed point is reached.
816 * Calculates the set of dependencies potentially pending at the beginning
817 * of each block, and returns it as an array of scoreboard objects.
820 propagate_block_scoreboards(const fs_visitor
*shader
,
821 const ordered_address
*jps
,
822 equivalence_relation
&eq
)
824 const scoreboard
*delta_sbs
= gather_block_scoreboards(shader
, jps
);
825 scoreboard
*in_sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
826 scoreboard
*out_sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
828 for (bool progress
= true; progress
;) {
831 foreach_block(block
, shader
->cfg
) {
832 const scoreboard sb
= shadow(in_sbs
[block
->num
],
833 delta_sbs
[block
->num
]);
835 if (sb
!= out_sbs
[block
->num
]) {
836 foreach_list_typed(bblock_link
, child_link
, link
,
838 scoreboard
&in_sb
= in_sbs
[child_link
->block
->num
];
840 jps
[child_link
->block
->start_ip
] - jps
[block
->end_ip
]
841 - ordered_unit(static_cast<const fs_inst
*>(block
->end()));
843 in_sb
= merge(eq
, in_sb
, transport(sb
, delta
));
846 out_sbs
[block
->num
] = sb
;
859 * Return the list of potential dependencies of each instruction in the
860 * shader based on the result of global dependency analysis.
863 gather_inst_dependencies(const fs_visitor
*shader
,
864 const ordered_address
*jps
)
866 equivalence_relation
eq(num_instructions(shader
));
867 scoreboard
*sbs
= propagate_block_scoreboards(shader
, jps
, eq
);
868 const unsigned *ids
= eq
.flatten();
869 dependency_list
*deps
= new dependency_list
[num_instructions(shader
)];
872 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
873 scoreboard
&sb
= sbs
[block
->num
];
875 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
876 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
877 add_dependency(ids
, deps
[ip
], dependency_for_read(
878 sb
.get(byte_offset(inst
->src
[i
], REG_SIZE
* j
))));
881 if (is_send(inst
) && inst
->base_mrf
!= -1) {
882 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
883 add_dependency(ids
, deps
[ip
], dependency_for_read(
884 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
887 if (is_unordered(inst
))
888 add_dependency(ids
, deps
[ip
], dependency(TGL_SBID_SET
, ip
));
890 if (!inst
->no_dd_check
) {
891 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
892 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
893 add_dependency(ids
, deps
[ip
], dependency_for_write(inst
,
894 sb
.get(byte_offset(inst
->dst
, REG_SIZE
* j
))));
898 if (is_send(inst
) && inst
->base_mrf
!= -1) {
899 for (int j
= 0; j
< shader
->implied_mrf_writes(inst
); j
++)
900 add_dependency(ids
, deps
[ip
], dependency_for_write(inst
,
901 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
905 update_inst_scoreboard(shader
, jps
, inst
, ip
, sb
);
918 * Allocate SBID tokens to track the execution of every out-of-order
919 * instruction of the shader.
922 allocate_inst_dependencies(const fs_visitor
*shader
,
923 const dependency_list
*deps0
)
925 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
926 * shaders with a large number of SEND messages.
929 /* Allocate an unordered dependency ID to hardware SBID translation
930 * table with as many entries as instructions there are in the shader,
931 * which is the maximum number of unordered IDs we can find in the
934 unsigned *ids
= new unsigned[num_instructions(shader
)];
935 for (unsigned ip
= 0; ip
< num_instructions(shader
); ip
++)
938 dependency_list
*deps1
= new dependency_list
[num_instructions(shader
)];
939 unsigned next_id
= 0;
941 for (unsigned ip
= 0; ip
< num_instructions(shader
); ip
++) {
942 for (unsigned i
= 0; i
< deps0
[ip
].size(); i
++) {
943 const dependency
&dep
= deps0
[ip
][i
];
945 if (dep
.unordered
&& ids
[dep
.id
] == ~0u)
946 ids
[dep
.id
] = (next_id
++) & 0xf;
948 add_dependency(ids
, deps1
[ip
], dep
);
958 * Emit dependency information provided by \p deps into the shader,
959 * inserting additional SYNC instructions for dependencies that can't be
960 * represented directly by annotating existing instructions.
963 emit_inst_dependencies(fs_visitor
*shader
,
964 const ordered_address
*jps
,
965 const dependency_list
*deps
)
969 foreach_block_and_inst_safe(block
, fs_inst
, inst
, shader
->cfg
) {
970 tgl_swsb swsb
= ordered_dependency_swsb(deps
[ip
], jps
[ip
]);
971 const tgl_sbid_mode unordered_mode
=
972 baked_unordered_dependency_mode(inst
, deps
[ip
], jps
[ip
]);
974 for (unsigned i
= 0; i
< deps
[ip
].size(); i
++) {
975 const dependency
&dep
= deps
[ip
][i
];
978 if (unordered_mode
== dep
.unordered
&& !swsb
.mode
) {
979 /* Bake unordered dependency into the instruction's SWSB if
983 swsb
.mode
= dep
.unordered
;
985 /* Emit dependency into the SWSB of an extra SYNC
988 const fs_builder ibld
= fs_builder(shader
, block
, inst
)
989 .exec_all().group(1, 0);
990 fs_inst
*sync
= ibld
.emit(BRW_OPCODE_SYNC
, ibld
.null_reg_ud(),
991 brw_imm_ud(TGL_SYNC_NOP
));
992 sync
->sched
.sbid
= dep
.id
;
993 sync
->sched
.mode
= dep
.unordered
;
994 assert(!(sync
->sched
.mode
& TGL_SBID_SET
));
1001 inst
->no_dd_check
= inst
->no_dd_clear
= false;
1008 fs_visitor::lower_scoreboard()
1010 if (devinfo
->gen
>= 12) {
1011 const ordered_address
*jps
= ordered_inst_addresses(this);
1012 const dependency_list
*deps0
= gather_inst_dependencies(this, jps
);
1013 const dependency_list
*deps1
= allocate_inst_dependencies(this, deps0
);
1014 emit_inst_dependencies(this, jps
, deps1
);