/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_fs_scoreboard.cpp
 *
 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserting any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *           - sr0 state register
 *           - cr0 control register
 *           - ip instruction pointer
 *           - tm0 timestamp register
 *           - dbg0 debug register
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *           - f0-1 flag registers
 *           - n0 notification register
 *           - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_cfg.h"
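/* For orientation, a sketch of the kind of annotations this pass emits
 * (hypothetical instruction stream, assembly-style syntax only approximate):
 *
 *    add (8)  r10  r2   r3             // in-order ALU write to r10
 *    mul (8)  r11  r4   r5             // unrelated in-order instruction
 *    mov (8)  r12  r10  {@2}           // RegDist 2: wait on the add two
 *                                      // in-order instructions back
 *    send (8) r20  ...  {$0}           // out-of-order access tracked by
 *                                      // SBID token 0
 *    add (8)  r21  r20  r1  {$0.dst}   // wait until the send's destination
 *                                      // has been written
 */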
/**
 * In-order instruction accounting.
 */

/**
 * Number of in-order hardware instructions contained in this IR
 * instruction.  This determines the increment applied to the RegDist
 * counter calculated for any ordered dependency that crosses this
 * instruction.
 */
unsigned
ordered_unit(const fs_inst *inst)
{
   switch (inst->opcode) {
   case SHADER_OPCODE_UNDEF:
   case FS_OPCODE_PLACEHOLDER_HALT:
      return 0;
   default:
      /* Note that the following is inaccurate for virtual instructions
       * that expand to more in-order instructions than assumed here, but
       * that can only lead to suboptimal execution ordering; data
       * coherency won't be impacted.  Providing exact RegDist counts for
       * each virtual instruction would allow better ALU performance, but
       * it would require keeping this switch statement in perfect sync
       * with the generator in order to avoid data corruption.  Lesson is
       * (again) don't use virtual instructions if you want optimal
       * scheduling.
       */
      return is_unordered(inst) ? 0 : 1;
   }
}
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 */
typedef int ordered_address;
/**
 * Return the number of instructions in the program.
 */
unsigned
num_instructions(const backend_shader *shader)
{
   return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
}
/**
 * Calculate the local ordered_address instruction counter at every
 * instruction of the shader for subsequent constant-time look-up.
 */
ordered_address *
ordered_inst_addresses(const fs_visitor *shader)
{
   ordered_address *jps = new ordered_address[num_instructions(shader)];
   ordered_address jp = 0;
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      jps[ip++] = jp;
      jp += ordered_unit(inst);
   }

   return jps;
}
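/* As a worked example (hypothetical instruction stream), the jp counter only
 * advances across in-order instructions, so for the sequence
 *
 *    MOV, UNDEF, ADD, SEND, MUL
 *
 * the per-instruction ordered_addresses are jps = { 0, 1, 1, 2, 2 }: UNDEF
 * contributes no hardware instruction and SEND executes out of order, so
 * neither of them increments the counter.
 */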
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations; this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};
/**
 * Allow bitwise arithmetic of tgl_regdist_mode enums.
 */
tgl_regdist_mode
operator|(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return tgl_regdist_mode(unsigned(x) | unsigned(y));
}

tgl_regdist_mode
operator&(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return tgl_regdist_mode(unsigned(x) & unsigned(y));
}

tgl_regdist_mode &
operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   return x = x | y;
}

tgl_regdist_mode &
operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   return x = x & y;
}
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    */
   unsigned
   lookup(unsigned i) const
   {
      if (i < n && is[i] != i)
         return lookup(is[i]);
      else
         return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.
    *
    * The algorithm updates the internal representation recursively in
    * order to guarantee transitivity while preserving the previously
    * specified equivalence requirements.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   equivalence_relation(const equivalence_relation &);
   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened as much as
    * it's possible without increasing the number of recursive calls.
    */
   void
   assign(unsigned from, unsigned to)
   {
      if (is[from] != from)
         assign(is[from], to);

      is[from] = to;
   }

   unsigned *is;
   unsigned n;
};
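/* A minimal usage sketch of the equivalence relation (hypothetical IDs):
 * imposing 1 ~ 4 and 4 ~ 7 transitively makes 1 ~ 7, which can be checked
 * via lookup() or via the table returned by flatten().
 *
 *    equivalence_relation eq(8);
 *    eq.link(1, 4);
 *    eq.link(4, 7);
 *    assert(eq.lookup(1) == eq.lookup(7));
 *    const unsigned *ids = eq.flatten();   // ids[1] == ids[4] == ids[7]
 *    delete[] ids;
 */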
/**
 * Representation of a data dependency between two instructions in the
 * program.
 */
struct dependency {
   /**
    * No dependency information.
    */
   dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                  unordered(TGL_SBID_NULL), id(0),
                  exec_all(false) {}

   /**
    * Construct a dependency on the in-order instruction with the provided
    * ordered_address instruction counter.
    */
   dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
      ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
      exec_all(exec_all) {}

   /**
    * Construct a dependency on the out-of-order instruction with the
    * specified synchronization token.
    */
   dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
      ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
      exec_all(exec_all) {}

   /**
    * Synchronization mode of in-order dependency, or zero if no in-order
    * dependency is present.
    */
   tgl_regdist_mode ordered;

   /**
    * Instruction counter of in-order dependency.
    *
    * For a dependency part of a different block in the program, this is
    * relative to the specific control flow path taken between the
    * dependency and the current block: It is the ordered_address such that
    * the difference between it and the ordered_address of the first
    * instruction of the current block is exactly the number of in-order
    * instructions across that control flow path.  It is not guaranteed to
    * be equal to the local ordered_address of the generating instruction
    * [as returned by ordered_inst_addresses()], except for block-local
    * dependencies.
    */
   ordered_address jp;

   /**
    * Synchronization mode of unordered dependency, or zero if no unordered
    * dependency is present.
    */
   tgl_sbid_mode unordered;

   /** Synchronization token of out-of-order dependency. */
   unsigned id;

   /**
    * Whether the dependency could be run with execution masking disabled,
    * which might lead to the unwanted execution of the generating
    * instruction in cases where a BB is executed with all channels
    * disabled due to hardware bug GEN:BUG:1407528679.
    */
   bool exec_all;

   /**
    * Trivial in-order dependency that's always satisfied.
    *
    * Note that unlike a default-constructed dependency() which is also
    * trivially satisfied, this is considered to provide dependency
    * information and can be used to clear a previously pending dependency.
    */
   static const dependency done;
   friend bool
   operator==(const dependency &dep0, const dependency &dep1)
   {
      return dep0.ordered == dep1.ordered &&
             dep0.jp == dep1.jp &&
             dep0.unordered == dep1.unordered &&
             dep0.id == dep1.id &&
             dep0.exec_all == dep1.exec_all;
   }

   friend bool
   operator!=(const dependency &dep0, const dependency &dep1)
   {
      return !(dep0 == dep1);
   }
};

const dependency dependency::done =
   dependency(TGL_REGDIST_SRC, INT_MIN, false);
/**
 * Return whether \p dep contains any dependency information.
 */
bool
is_valid(const dependency &dep)
{
   return dep.ordered || dep.unordered;
}
/**
 * Combine \p dep0 and \p dep1 into a single dependency object that is only
 * satisfied when both original dependencies are satisfied.  This might
 * involve updating the equivalence relation \p eq in order to make sure
 * that both out-of-order dependencies are assigned the same hardware SBID
 * as synchronization token.
 */
dependency
merge(equivalence_relation &eq,
      const dependency &dep0, const dependency &dep1)
{
   dependency dep;

   if (dep0.ordered || dep1.ordered) {
      dep.ordered = dep0.ordered | dep1.ordered;
      dep.jp = MAX2(dep0.jp, dep1.jp);
   }

   if (dep0.unordered || dep1.unordered) {
      dep.unordered = dep0.unordered | dep1.unordered;
      dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                       dep1.unordered ? dep1.id : dep0.id);
   }

   dep.exec_all = dep0.exec_all || dep1.exec_all;

   return dep;
}
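/* For intuition: merging a RegDist read dependency on jp == 5 with an SBID
 * destination dependency on token 3 yields a single dependency with
 * ordered == TGL_REGDIST_SRC, jp == 5, unordered == TGL_SBID_DST and
 * id == 3, i.e. an instruction subject to it has to honor both the RegDist
 * count and the SBID token.  When both inputs carry SBID tokens, link()
 * forces the two IDs into the same equivalence class so that they end up
 * mapped to the same hardware SBID.  (Values above are hypothetical.)
 */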
/**
 * Override dependency information of \p dep0 with that of \p dep1.
 */
dependency
shadow(const dependency &dep0, const dependency &dep1)
{
   return is_valid(dep1) ? dep1 : dep0;
}
/**
 * Translate dependency information across the program.
 *
 * This returns a dependency on the same instruction translated to the
 * ordered_address space of a different block.  The correct shift for
 * transporting a dependency across an edge of the CFG is the difference
 * between the local ordered_address of the first instruction of the target
 * block and the local ordered_address of the instruction immediately after
 * the end of the origin block.
 */
dependency
transport(dependency dep, int delta)
{
   if (dep.ordered && dep.jp > INT_MIN)
      dep.jp += delta;

   return dep;
}
/**
 * Return simplified dependency removing any synchronization modes not
 * applicable to an instruction reading the same register location.
 */
dependency
dependency_for_read(dependency dep)
{
   dep.ordered &= TGL_REGDIST_DST;
   return dep;
}
/**
 * Return simplified dependency removing any synchronization modes not
 * applicable to an instruction \p inst writing the same register location.
 */
dependency
dependency_for_write(const fs_inst *inst, dependency dep)
{
   if (!is_unordered(inst))
      dep.ordered &= TGL_REGDIST_DST;
   return dep;
}
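/* In other words, these helpers filter the tracked in-order dependency down
 * to the hazards that actually require synchronization: a read only has to
 * wait for a previous in-order *write* of the location (RAW), and an
 * in-order write likewise only has to wait for a previous in-order write
 * (WAW), because write-after-read ordering among in-order instructions is
 * already guaranteed by the in-order pipeline.  Out-of-order writers (e.g.
 * SENDs) keep the read component as well, hence the is_unordered() check in
 * dependency_for_write().
 */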
/**
 * Scoreboard representation.  This keeps track of the data dependencies of
 * registers with GRF granularity.
 */
class scoreboard {
public:
   /**
    * Look up the most current data dependency for register \p r.
    */
   dependency
   get(const fs_reg &r) const
   {
      if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
         return *p;
      else
         return dependency();
   }
   /**
    * Specify the most current data dependency for register \p r.
    */
   void
   set(const fs_reg &r, const dependency &d)
   {
      if (dependency *p = dep(r))
         *p = d;
   }
   /**
    * Component-wise merge() of corresponding dependencies from two
    * scoreboard objects.  \sa merge().
    */
   friend scoreboard
   merge(equivalence_relation &eq,
         const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }
   /**
    * Component-wise shadow() of corresponding dependencies from two
    * scoreboard objects.  \sa shadow().
    */
   friend scoreboard
   shadow(const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }
   /**
    * Component-wise transport() of dependencies from a scoreboard
    * object.  \sa transport().
    */
   friend scoreboard
   transport(const scoreboard &sb0, int delta)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

      sb.addr_dep = transport(sb0.addr_dep, delta);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

      return sb;
   }
   friend bool
   operator==(const scoreboard &sb0, const scoreboard &sb1)
   {
      for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
         if (sb0.grf_deps[i] != sb1.grf_deps[i])
            return false;
      }

      if (sb0.addr_dep != sb1.addr_dep)
         return false;

      for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
         if (sb0.accum_deps[i] != sb1.accum_deps[i])
            return false;
      }

      return true;
   }

   friend bool
   operator!=(const scoreboard &sb0, const scoreboard &sb1)
   {
      return !(sb0 == sb1);
   }
private:
   dependency grf_deps[BRW_MAX_GRF];
   dependency addr_dep;
   dependency accum_deps[10];

   dependency *
   dep(const fs_reg &r)
   {
      const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                            reg_offset(r) / REG_SIZE);

      return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
              r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
              r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                               reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
              r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                               reg < BRW_ARF_FLAG ? &accum_deps[
                                  reg - BRW_ARF_ACCUMULATOR] :
              NULL);
   }
};
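/* The mapping above tracks dependencies at REG_SIZE (one GRF) granularity:
 * e.g. a FIXED_GRF region starting at byte offset 3 * REG_SIZE lands in
 * grf_deps[3], the address register maps to the single addr_dep slot, and
 * accumulators get one slot each in accum_deps[].  Registers this pass
 * doesn't track (flags, etc.) fall through to NULL, which makes get()
 * return an empty dependency and set() a no-op for them.
 */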
/**
 * Dependency list handling.
 */
struct dependency_list {
   dependency_list() : deps(NULL), n(0) {}

   ~dependency_list() { free(deps); }

   void
   push_back(const dependency &dep)
   {
      deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
      deps[n++] = dep;
   }

   unsigned size() const { return n; }

   const dependency &
   operator[](unsigned i) const
   {
      assert(i < n);
      return deps[i];
   }

   dependency &
   operator[](unsigned i)
   {
      assert(i < n);
      return deps[i];
   }

private:
   dependency_list(const dependency_list &);
   dependency_list &
   operator=(const dependency_list &);

   dependency *deps;
   unsigned n;
};
/**
 * Add dependency \p dep to the list of dependencies of an instruction
 * \p deps.
 */
void
add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
{
   if (is_valid(dep)) {
      /* Translate the unordered dependency token first in order to keep
       * the list minimally redundant.
       */
      if (dep.unordered)
         dep.id = ids[dep.id];

      /* Try to combine the specified dependency with any existing ones. */
      for (unsigned i = 0; i < deps.size(); i++) {
         /* Don't combine otherwise matching dependencies if there is an
          * exec_all mismatch which would cause a SET dependency to gain an
          * exec_all flag, since that would prevent it from being baked
          * into the instruction we want to allocate an SBID for.
          */
         if (deps[i].exec_all != dep.exec_all &&
             (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
             (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
            continue;

         if (dep.ordered && deps[i].ordered) {
            deps[i].jp = MAX2(deps[i].jp, dep.jp);
            deps[i].ordered |= dep.ordered;
            deps[i].exec_all |= dep.exec_all;
            dep.ordered = TGL_REGDIST_NULL;
         }

         if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
            deps[i].unordered |= dep.unordered;
            deps[i].exec_all |= dep.exec_all;
            dep.unordered = TGL_SBID_NULL;
         }
      }

      /* Add it to the end of the list if necessary. */
      if (is_valid(dep))
         deps.push_back(dep);
   }
}
/**
 * Construct a tgl_swsb annotation encoding any ordered dependencies from
 * the dependency list \p deps of an instruction with ordered_address \p
 * jp.  If \p exec_all is false only dependencies known to be executed with
 * channel masking applied will be considered in the calculation.
 */
tgl_swsb
ordered_dependency_swsb(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   unsigned min_dist = ~0u;

   for (unsigned i = 0; i < deps.size(); i++) {
      if (deps[i].ordered && exec_all >= deps[i].exec_all) {
         const unsigned dist = jp - deps[i].jp;
         const unsigned max_dist = 10;
         assert(jp > deps[i].jp);
         if (dist <= max_dist)
            min_dist = MIN3(min_dist, dist, 7);
      }
   }

   return { min_dist == ~0u ? 0 : min_dist };
}
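/* Worked example (hypothetical numbers): an instruction at jp == 20 with
 * ordered dependencies at jp == 18 and jp == 13 has candidate distances 2
 * and 7, so the emitted annotation is RegDist 2 -- satisfying the nearest
 * dependency automatically satisfies the farther one, hence the MIN.
 * Distances are clamped to 7, the largest value representable in the
 * RegDist field, and anything more than max_dist == 10 in-order
 * instructions away is dropped on the assumption that it no longer needs
 * explicit synchronization.
 */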
/**
 * Return whether the dependency list \p deps of an instruction with
 * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
 * exec_all is false only dependencies known to be executed with channel
 * masking applied will be considered in the calculation.
 */
unsigned
find_ordered_dependency(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   return ordered_dependency_swsb(deps, jp, exec_all).regdist;
}
/**
 * Return the full tgl_sbid_mode bitset for the first unordered dependency
 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
 * no such dependency is present.  If \p exec_all is false only
 * dependencies known to be executed with channel masking applied will be
 * considered in the calculation.
 */
tgl_sbid_mode
find_unordered_dependency(const dependency_list &deps,
                          tgl_sbid_mode unordered,
                          bool exec_all)
{
   for (unsigned i = 0; i < deps.size(); i++) {
      if ((unordered & deps[i].unordered) &&
          exec_all >= deps[i].exec_all)
         return deps[i].unordered;
   }

   return TGL_SBID_NULL;
}
/**
 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
 * \p deps that can be represented directly in the SWSB annotation of the
 * instruction without additional SYNC instructions, or zero if no such
 * dependency is present.
 */
tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst *inst,
                                const dependency_list &deps,
                                const ordered_address &jp)
{
   const bool exec_all = inst->force_writemask_all;
   const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

   if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
      return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
   else if (has_ordered && is_unordered(inst))
      return TGL_SBID_NULL;
   else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
            (!has_ordered || !is_unordered(inst)))
      return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
   else if (!has_ordered)
      return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
   else
      return TGL_SBID_NULL;
}
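/* Reading the cascade above: a SET dependency (the token the instruction
 * itself will signal) always wins, since it has to live on the instruction
 * it tracks.  An out-of-order instruction that also carries an ordered
 * dependency can't bake anything else, while an in-order instruction may
 * combine its RegDist annotation with a single $n.dst wait.  $n.src waits
 * are only baked when no RegDist annotation is needed at all; every other
 * combination falls back to a separate SYNC emitted later in this pass.
 */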
/**
 * Shader instruction dependency calculation.
 */

/**
 * Update scoreboard object \p sb to account for the execution of
 * instruction \p inst.
 */
void
update_inst_scoreboard(const ordered_address *jps,
                       const fs_inst *inst, unsigned ip, scoreboard &sb)
{
   const bool exec_all = inst->force_writemask_all;

   /* Track any source registers that may be fetched asynchronously by this
    * instruction; otherwise clear the dependency in order to avoid
    * subsequent redundant synchronization.
    */
   for (unsigned i = 0; i < inst->sources; i++) {
      const dependency rd_dep =
         (inst->is_payload(i) ||
          inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
         dependency::done;

      for (unsigned j = 0; j < regs_read(inst, i); j++)
         sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

      for (unsigned j = 0; j < inst->mlen; j++)
         sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
   }

   /* Track any destination registers of this instruction. */
   const dependency wr_dep =
      is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
      ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
      dependency();

   if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
       !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++)
         sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
   }
}
/**
 * Calculate scoreboard objects locally that represent any pending (and
 * unconditionally resolved) dependencies at the end of each block of the
 * program.
 */
scoreboard *
gather_block_scoreboards(const fs_visitor *shader,
                         const ordered_address *jps)
{
   scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
      update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);

   return sbs;
}
/**
 * Propagate data dependencies globally through the control flow graph
 * until a fixed point is reached.
 *
 * Calculates the set of dependencies potentially pending at the beginning
 * of each block, and returns it as an array of scoreboard objects.
 */
scoreboard *
propagate_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps,
                            equivalence_relation &eq)
{
   const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
   scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
   scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

   for (bool progress = true; progress;) {
      progress = false;

      foreach_block(block, shader->cfg) {
         const scoreboard sb = shadow(in_sbs[block->num],
                                      delta_sbs[block->num]);

         if (sb != out_sbs[block->num]) {
            foreach_list_typed(bblock_link, child_link, link,
                               &block->children) {
               scoreboard &in_sb = in_sbs[child_link->block->num];
               const int delta =
                  jps[child_link->block->start_ip] - jps[block->end_ip]
                  - ordered_unit(static_cast<const fs_inst *>(block->end()));

               in_sb = merge(eq, in_sb, transport(sb, delta));
            }

            out_sbs[block->num] = sb;
            progress = true;
         }
      }
   }

   delete[] delta_sbs;
   delete[] out_sbs;

   return in_sbs;
}
/**
 * Return the list of potential dependencies of each instruction in the
 * shader based on the result of global dependency analysis.
 */
dependency_list *
gather_inst_dependencies(const fs_visitor *shader,
                         const ordered_address *jps)
{
   equivalence_relation eq(num_instructions(shader));
   scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
   const unsigned *ids = eq.flatten();
   dependency_list *deps = new dependency_list[num_instructions(shader)];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      scoreboard &sb = sbs[block->num];

      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
      }

      if (is_unordered(inst))
         add_dependency(ids, deps[ip],
                        dependency(TGL_SBID_SET, ip, exec_all));

      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(byte_offset(inst->dst, REG_SIZE * j))));
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }
      }

      update_inst_scoreboard(jps, inst, ip, sb);
      ip++;
   }

   delete[] sbs;
   delete[] ids;

   return deps;
}
/**
 * Allocate SBID tokens to track the execution of every out-of-order
 * instruction of the shader.
 */
dependency_list *
allocate_inst_dependencies(const fs_visitor *shader,
                           const dependency_list *deps0)
{
   /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
    *       shaders with a large number of SEND messages.
    */

   /* Allocate an unordered dependency ID to hardware SBID translation
    * table with as many entries as instructions there are in the shader,
    * which is the maximum number of unordered IDs we can find in the
    * program.
    */
   unsigned *ids = new unsigned[num_instructions(shader)];
   for (unsigned ip = 0; ip < num_instructions(shader); ip++)
      ids[ip] = ~0u;

   dependency_list *deps1 = new dependency_list[num_instructions(shader)];
   unsigned next_id = 0;

   for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
      for (unsigned i = 0; i < deps0[ip].size(); i++) {
         const dependency &dep = deps0[ip][i];

         if (dep.unordered && ids[dep.id] == ~0u)
            ids[dep.id] = (next_id++) & 0xf;

         add_dependency(ids, deps1[ip], dep);
      }
   }

   delete[] ids;

   return deps1;
}
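/* The '& 0xf' wraps the running counter around the pool of 16 hardware
 * SBID tokens, so tokens are recycled round-robin once more than 16
 * out-of-order instructions are in flight.  E.g. the 17th allocated ID
 * maps back to hardware SBID 0 (hypothetical count); the equivalence
 * classes built during merge() make sure every dependency on the same
 * out-of-order instruction ends up referring to the same token.
 */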
/**
 * Emit dependency information provided by \p deps into the shader,
 * inserting additional SYNC instructions for dependencies that can't be
 * represented directly by annotating existing instructions.
 */
void
emit_inst_dependencies(fs_visitor *shader,
                       const ordered_address *jps,
                       const dependency_list *deps)
{
   unsigned ip = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.unordered) {
            if (unordered_mode == dep.unordered &&
                exec_all >= dep.exec_all && !swsb.mode) {
               /* Bake unordered dependency into the instruction's SWSB if
                * possible, except in cases where the current instruction
                * isn't marked NoMask but the dependency is, since that
                * might lead to data coherency issues due to
                * GEN:BUG:1407528679.
                */
               swsb.sbid = dep.id;
               swsb.mode = dep.unordered;
            } else {
               /* Emit dependency into the SWSB of an extra SYNC
                * instruction.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched.sbid = dep.id;
               sync->sched.mode = dep.unordered;
               assert(!(sync->sched.mode & TGL_SBID_SET));
            }
         }
      }

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.ordered && dep.exec_all > exec_all &&
             find_ordered_dependency(deps[ip], jps[ip], true)) {
            /* If the current instruction is not marked NoMask but an
             * ordered dependency is, perform the synchronization as a
             * separate NoMask SYNC instruction in order to avoid data
             * coherency issues due to GEN:BUG:1407528679.  The similar
             * scenario with unordered dependencies should have been
             * handled above.
             */
            const fs_builder ibld = fs_builder(shader, block, inst)
                                    .exec_all().group(1, 0);
            fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                      brw_imm_ud(TGL_SYNC_NOP));
            sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
            break;
         }
      }

      /* Update the IR. */
      inst->sched = swsb;
      inst->no_dd_check = inst->no_dd_clear = false;
      ip++;
   }
}
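/* Putting it together for a single instruction (hypothetical case): if its
 * dependency list holds a $1.dst dependency that
 * baked_unordered_dependency_mode() allows in the SWSB field plus an
 * additional $3.src dependency, the first one is encoded on the instruction
 * itself and the second one becomes a preceding NoMask "sync nop" carrying
 * $3.src in its own SWSB, since an instruction can only name one SBID token
 * directly.
 */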
bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->gen >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);

      emit_inst_dependencies(this, jps, deps1);

      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}