src/intel/compiler/brw_fs_bank_conflicts.cpp

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_bank_conflicts.cpp
  25  *
  26  * This file contains a GRF bank conflict mitigation pass.  The pass is
  27  * intended to be run after register allocation and works by rearranging the
  28  * layout of the GRF space (without altering the semantics of the program) in
  29  * a way that minimizes the number of GRF bank conflicts incurred by ternary
  30  * instructions.
  31  *
  32  * Unfortunately there is close to no information about bank conflicts in the
  33  * hardware spec, but experimentally on Gen7-Gen9 ternary instructions seem to
  34  * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
  35  * the second and third source are stored in the same GRF bank (\sa bank_of()
  36  * for the exact bank layout) which cannot be fetched during the same cycle by
  37  * the EU, unless the EU logic manages to optimize out the read cycle of a
  38  * duplicate source register (\sa is_conflict_optimized_out()).
  39  *
  40  * The asymptotic run-time of the algorithm is dominated by the
  41  * shader_conflict_weight_matrix() computation below, which is O(n) on the
  42  * number of instructions in the program, however for small and medium-sized
  43  * programs the run-time is likely to be dominated by
  44  * optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
  45  * the program (\sa partitioning), which is bounded (since the program uses a
  46  * bounded number of registers post-regalloc) and of the order of 100.  For
  47  * that reason optimize_reg_permutation() is vectorized in order to keep the
  48  * cubic term within reasonable bounds for m close to its theoretical maximum.
  49  */
  50
  51 #include "brw_fs.h"
  52 #include "brw_cfg.h"
  53
  54 #ifdef __SSE2__
  55
  56 #include <emmintrin.h>
  57
  58 /**
  59  * Thin layer around vector intrinsics so they can be easily replaced with
  60  * e.g. the fall-back scalar path, an implementation with different vector
  61  * width or using different SIMD architectures (AVX-512?!).
  62  *
  63  * This implementation operates on pairs of independent SSE2 integer vectors à
  64  * la SIMD16 for somewhat improved throughput.  SSE2 is supported by virtually
  65  * all platforms that care about bank conflicts, so this path should almost
  66  * always be available in practice.
  67  */
  68 namespace {
  69    /**
  70     * SIMD integer vector data type.
  71     */
  72    struct vector_type {
  73       __m128i v[2];
  74    };
  75
  76    /**
  77     * Scalar data type matching the representation of a single component of \p
  78     * vector_type.
  79     */
  80    typedef int16_t scalar_type;
  81
  82    /**
  83     * Maximum integer value representable as a \p scalar_type.
  84     */
  85    const scalar_type max_scalar = INT16_MAX;
  86
  87    /**
  88     * Number of components of a \p vector_type.
  89     */
  90    const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
  91
  92    /**
  93     * Set the i-th component of vector \p v to \p x.
  94     */
  95    void
  96    set(vector_type &v, unsigned i, scalar_type x)
  97    {
  98       assert(i < vector_width);
  99       memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
 100    }
 101
 102    /**
 103     * Get the i-th component of vector \p v.
 104     */
 105    scalar_type
 106    get(const vector_type &v, unsigned i)
 107    {
 108       assert(i < vector_width);
 109       scalar_type x;
 110       memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
 111       return x;
 112    }
 113
 114    /**
 115     * Add two vectors with saturation.
 116     */
 117    vector_type
 118    adds(const vector_type &v, const vector_type &w)
 119    {
 120       const vector_type u = {{
 121             _mm_adds_epi16(v.v[0], w.v[0]),
 122             _mm_adds_epi16(v.v[1], w.v[1])
 123          }};
 124       return u;
 125    }
 126
 127    /**
 128     * Subtract two vectors with saturation.
 129     */
 130    vector_type
 131    subs(const vector_type &v, const vector_type &w)
 132    {
 133       const vector_type u = {{
 134             _mm_subs_epi16(v.v[0], w.v[0]),
 135             _mm_subs_epi16(v.v[1], w.v[1])
 136          }};
 137       return u;
 138    }
 139
 140    /**
 141     * Compute the bitwise conjunction of two vectors.
 142     */
 143    vector_type
 144    mask(const vector_type &v, const vector_type &w)
 145    {
 146       const vector_type u = {{
 147             _mm_and_si128(v.v[0], w.v[0]),
 148             _mm_and_si128(v.v[1], w.v[1])
 149          }};
 150       return u;
 151    }
 152
 153    /**
 154     * Reduce the components of a vector using saturating addition.
 155     */
 156    scalar_type
 157    sums(const vector_type &v)
 158    {
 159       const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
 160       const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
 161       const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
 162       const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
 163       return _mm_extract_epi16(v1, 0);
 164    }
 165 }
 166
 167 #else
 168
 169 /**
 170  * Thin layer around vector intrinsics so they can be easily replaced with
 171  * e.g. the fall-back scalar path, an implementation with different vector
 172  * width or using different SIMD architectures (AVX-512?!).
 173  *
 174  * This implementation operates on scalar values and doesn't rely on
 175  * any vector extensions.  This is mainly intended for debugging and
 176  * to keep this file building on exotic platforms.
 177  */
 178 namespace {
 179    /**
 180     * SIMD integer vector data type.
 181     */
 182    typedef int16_t vector_type;
 183
 184    /**
 185     * Scalar data type matching the representation of a single component of \p
 186     * vector_type.
 187     */
 188    typedef int16_t scalar_type;
 189
 190    /**
 191     * Maximum integer value representable as a \p scalar_type.
 192     */
 193    const scalar_type max_scalar = INT16_MAX;
 194
 195    /**
 196     * Number of components of a \p vector_type.
 197     */
 198    const unsigned vector_width = 1;
 199
 200    /**
 201     * Set the i-th component of vector \p v to \p x.
 202     */
 203    void
 204    set(vector_type &v, unsigned i, scalar_type x)
 205    {
 206       assert(i < vector_width);
 207       v = x;
 208    }
 209
 210    /**
 211     * Get the i-th component of vector \p v.
 212     */
 213    scalar_type
 214    get(const vector_type &v, unsigned i)
 215    {
 216       assert(i < vector_width);
 217       return v;
 218    }
 219
 220    /**
 221     * Add two vectors with saturation.
 222     */
 223    vector_type
 224    adds(vector_type v, vector_type w)
 225    {
 226       return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
 227    }
 228
 229    /**
 230     * Substract two vectors with saturation.
 231     */
 232    vector_type
 233    subs(vector_type v, vector_type w)
 234    {
 235       return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
 236    }
 237
 238    /**
 239     * Compute the bitwise conjunction of two vectors.
 240     */
 241    vector_type
 242    mask(vector_type v, vector_type w)
 243    {
 244       return v & w;
 245    }
 246
 247    /**
 248     * Reduce the components of a vector using saturating addition.
 249     */
 250    scalar_type
 251    sums(vector_type v)
 252    {
 253       return v;
 254    }
 255 }
 256
 257 #endif
 258
 259 /**
 260  * Swap \p x and \p y.
 261  */
 262 #define SWAP(x, y) do {                          \
 263       __typeof(y) _swap_tmp = y;                 \
 264       y = x;                                     \
 265       x = _swap_tmp;                             \
 266    } while (0)
 267
 268 namespace {
 269    /**
 270     * Variable-length vector type intended to represent cycle-count costs for
 271     * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
 272     * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
 273     * the conflict (respectively, whether the cost is incurred whenever the
 274     * atoms are assigned the same bank b or opposite-parity banks b and b^1).
 275     * \sa shader_conflict_weight_matrix()
 276     */
 277    struct weight_vector_type {
 278       weight_vector_type() : v(NULL), size(0) {}
 279
 280       weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}
 281
 282       weight_vector_type(const weight_vector_type &u) :
 283          v(alloc(u.size)), size(u.size)
 284       {
 285          memcpy(v, u.v,
 286                 DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
 287       }
 288
 289       ~weight_vector_type()
 290       {
 291          free(v);
 292       }
 293
 294       weight_vector_type &
 295       operator=(weight_vector_type u)
 296       {
 297          SWAP(v, u.v);
 298          SWAP(size, u.size);
 299          return *this;
 300       }
 301
 302       vector_type *v;
 303       unsigned size;
 304
 305    private:
 306       static vector_type *
 307       alloc(unsigned n)
 308       {
 309          const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
 310          const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
 311          void *p;
 312          if (posix_memalign(&p, align, size))
 313             return NULL;
 314          memset(p, 0, size);
 315          return reinterpret_cast<vector_type *>(p);
 316       }
 317    };
 318
 319    /**
 320     * Set the (i, p)-th component of weight vector \p v to \p x.
 321     */
 322    void
 323    set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
 324    {
 325       set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
 326    }
 327
 328    /**
 329     * Get the (i, p)-th component of weight vector \p v.
 330     */
 331    scalar_type
 332    get(const weight_vector_type &v, unsigned i, unsigned p)
 333    {
 334       return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
 335    }
 336
 337    /**
 338     * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
 339     */
 340    void
 341    swap(weight_vector_type &v,
 342         unsigned i, unsigned p,
 343         unsigned j, unsigned q)
 344    {
 345       const scalar_type tmp = get(v, i, p);
 346       set(v, i, p, get(v, j, q));
 347       set(v, j, q, tmp);
 348    }
 349 }
 350
 351 namespace {
 352    /**
 353     * Object that represents the partitioning of an arbitrary register space
 354     * into indivisible units (referred to as atoms below) that can potentially
 355     * be rearranged independently from other registers.  The partitioning is
 356     * inferred from a number of contiguity requirements specified using
 357     * require_contiguous().  This allows efficient look-up of the atom index a
 358     * given register address belongs to, or conversely the range of register
 359     * addresses that belong to a given atom.
 360     */
 361    struct partitioning {
 362       /**
 363        * Create a (for the moment unrestricted) partitioning of a register
 364        * file of size \p n.  The units are arbitrary.
 365        */
 366       partitioning(unsigned n) :
 367          max_reg(n),
 368          offsets(new unsigned[n + num_terminator_atoms]),
 369          atoms(new unsigned[n + num_terminator_atoms])
 370       {
 371          for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
 372             offsets[i] = i;
 373             atoms[i] = i;
 374          }
 375       }
 376
 377       partitioning(const partitioning &p) :
 378          max_reg(p.max_reg),
 379          offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
 380          atoms(new unsigned[p.max_reg + num_terminator_atoms])
 381       {
 382          memcpy(offsets, p.offsets,
 383                 sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
 384          memcpy(atoms, p.atoms,
 385                 sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
 386       }
 387
 388       ~partitioning()
 389       {
 390          delete[] offsets;
 391          delete[] atoms;
 392       }
 393
 394       partitioning &
 395       operator=(partitioning p)
 396       {
 397          SWAP(max_reg, p.max_reg);
 398          SWAP(offsets, p.offsets);
 399          SWAP(atoms, p.atoms);
 400          return *this;
 401       }
 402
 403       /**
 404        * Require register range [reg, reg + n[ to be considered part of the
 405        * same atom.
 406        */
 407       void
 408       require_contiguous(unsigned reg, unsigned n)
 409       {
 410          unsigned r = atoms[reg];
 411
 412          /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
 413           * case that the specified contiguity requirement leads to the fusion
 414           * (yay) of one or more existing atoms.
 415           */
 416          for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
 417             if (offsets[atoms[reg1]] < reg + n) {
 418                atoms[reg1] = r;
 419             } else {
 420                if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
 421                   r++;
 422
 423                offsets[r] = offsets[atoms[reg1]];
 424                atoms[reg1] = r;
 425             }
 426          }
 427       }
 428
 429       /**
 430        * Get the atom index register address \p reg belongs to.
 431        */
 432       unsigned
 433       atom_of_reg(unsigned reg) const
 434       {
 435          return atoms[reg];
 436       }
 437
 438       /**
 439        * Get the base register address that belongs to atom \p r.
 440        */
 441       unsigned
 442       reg_of_atom(unsigned r) const
 443       {
 444          return offsets[r];
 445       }
 446
 447       /**
 448        * Get the size of atom \p r in register address units.
 449        */
 450       unsigned
 451       size_of_atom(unsigned r) const
 452       {
 453          assert(r < num_atoms());
 454          return reg_of_atom(r + 1) - reg_of_atom(r);
 455       }
 456
 457       /**
 458        * Get the number of atoms the whole register space is partitioned into.
 459        */
 460       unsigned
 461       num_atoms() const
 462       {
 463          return atoms[max_reg];
 464       }
 465
 466    private:
 467       /**
 468        * Number of trailing atoms inserted for convenience so among other
 469        * things we don't need to special-case the last element in
 470        * size_of_atom().
 471        */
 472       static const unsigned num_terminator_atoms = 1;
 473       unsigned max_reg;
 474       unsigned *offsets;
 475       unsigned *atoms;
 476    };
 477
 478    /**
 479     * Only GRF sources (whether they have been register-allocated or not) can
 480     * possibly incur bank conflicts.
 481     */
 482    bool
 483    is_grf(const fs_reg &r)
 484    {
 485       return r.file == VGRF || r.file == FIXED_GRF;
 486    }
 487
 488    /**
 489     * Register offset of \p r in GRF units.  Useful because the representation
 490     * of GRFs post-register allocation is somewhat inconsistent and depends on
 491     * whether the register already had a fixed GRF offset prior to register
 492     * allocation or whether it was part of a VGRF allocation.
 493     */
 494    unsigned
 495    reg_of(const fs_reg &r)
 496    {
 497       assert(is_grf(r));
 498       if (r.file == VGRF)
 499          return r.nr + r.offset / REG_SIZE;
 500       else
 501          return reg_offset(r) / REG_SIZE;
 502    }
 503
 504    /**
 505     * Calculate the finest partitioning of the GRF space compatible with the
 506     * register contiguity requirements derived from all instructions part of
 507     * the program.
 508     */
 509    partitioning
 510    shader_reg_partitioning(const fs_visitor *v)
 511    {
 512       partitioning p(BRW_MAX_GRF);
 513
 514       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
 515          if (is_grf(inst->dst))
 516             p.require_contiguous(reg_of(inst->dst), regs_written(inst));
 517
 518          for (int i = 0; i < inst->sources; i++) {
 519             if (is_grf(inst->src[i]))
 520                p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
 521          }
 522       }
 523
 524       return p;
 525    }
 526
 527    /**
 528     * Return the set of GRF atoms that should be left untouched at their
 529     * original location to avoid violating hardware or software assumptions.
 530     */
 531    bool *
 532    shader_reg_constraints(const fs_visitor *v, const partitioning &p)
 533    {
 534       bool *constrained = new bool[p.num_atoms()]();
 535
 536       /* These are read implicitly by some send-message instructions without
 537        * any indication at the IR level.  Assume they are unsafe to move
 538        * around.
 539        */
 540       for (unsigned reg = 0; reg < 2; reg++)
 541          constrained[p.atom_of_reg(reg)] = true;
 542
 543       /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
 544        * subsection "EUISA Instructions", Send Message (page 990):
 545        *
 546        * "r127 must not be used for return address when there is a src and
 547        * dest overlap in send instruction."
 548        *
 549        * Register allocation ensures that, so don't move 127 around to avoid
 550        * breaking that property.
 551        */
 552       if (v->devinfo->gen >= 8)
 553          constrained[p.atom_of_reg(127)] = true;
 554
 555       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
 556          /* Assume that anything referenced via fixed GRFs is baked into the
 557           * hardware's fixed-function logic and may be unsafe to move around.
 558           * Also take into account the source GRF restrictions of EOT
 559           * send-message instructions.
 560           */
 561          if (inst->dst.file == FIXED_GRF)
 562             constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
 563
 564          for (int i = 0; i < inst->sources; i++) {
 565             if (inst->src[i].file == FIXED_GRF ||
 566                 (is_grf(inst->src[i]) && inst->eot))
 567                constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
 568          }
 569
 570          /* Preserve the original allocation of VGRFs used by the barycentric
 571           * source of the LINTERP instruction on Gen6, since pair-aligned
 572           * barycentrics allow the PLN instruction to be used.
 573           */
 574          if (v->devinfo->has_pln && v->devinfo->gen <= 6 &&
 575              inst->opcode == FS_OPCODE_LINTERP)
 576             constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
 577
 578          /* The location of the Gen7 MRF hack registers is hard-coded in the
 579           * rest of the compiler back-end.  Don't attempt to move them around.
 580           */
 581          if (v->devinfo->gen >= 7) {
 582             assert(inst->dst.file != MRF);
 583
 584             for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
 585                const unsigned reg = GEN7_MRF_HACK_START + inst->base_mrf + i;
 586                constrained[p.atom_of_reg(reg)] = true;
 587             }
 588          }
 589       }
 590
 591       return constrained;
 592    }
 593
 594    /**
 595     * Return whether the hardware will be able to prevent a bank conflict by
 596     * optimizing out the read cycle of a source register.  The formula was
 597     * found experimentally.
 598     */
 599    bool
 600    is_conflict_optimized_out(const gen_device_info *devinfo, const fs_inst *inst)
 601    {
 602       return devinfo->gen >= 9 &&
 603          ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
 604                                     reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
 605           reg_of(inst->src[1]) == reg_of(inst->src[2]));
 606    }
 607
 608    /**
 609     * Return a matrix that allows reasonably efficient computation of the
 610     * cycle-count cost of bank conflicts incurred throughout the whole program
 611     * for any given atom-to-bank assignment.
 612     *
 613     * More precisely, if C_r_s_p is the result of this function, the total
 614     * cost of all bank conflicts involving any given atom r can be readily
 615     * recovered as follows:
 616     *
 617     *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
 618     *
 619     * where d_i_j is the Kronecker delta, and B_r indicates the bank
 620     * assignment of r.  \sa delta_conflicts() for a vectorized implementation
 621     * of the expression above.
 622     *
 623     * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
 624     *           somewhat more relaxed than on previous generations.  In the
 625     *           meantime optimizing based on Gen9 weights is likely to be more
 626     *           helpful than not optimizing at all.
 627     */
 628    weight_vector_type *
 629    shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
 630    {
 631       weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
 632       for (unsigned r = 0; r < p.num_atoms(); r++)
 633          conflicts[r] = weight_vector_type(2 * p.num_atoms());
 634
 635       /* Crude approximation of the number of times the current basic block
 636        * will be executed at run-time.
 637        */
 638       unsigned block_scale = 1;
 639
 640       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
 641          if (inst->opcode == BRW_OPCODE_DO) {
 642             block_scale *= 10;
 643
 644          } else if (inst->opcode == BRW_OPCODE_WHILE) {
 645             block_scale /= 10;
 646
 647          } else if (inst->is_3src(v->devinfo) &&
 648                     is_grf(inst->src[1]) && is_grf(inst->src[2])) {
 649             const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
 650             const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
 651
 652             /* Estimate of the cycle-count cost of incurring a bank conflict
 653              * for this instruction.  This is only true on the average, for a
 654              * sequence of back-to-back ternary instructions, since the EU
 655              * front-end only seems to be able to issue a new instruction at
 656              * an even cycle.  The cost of a bank conflict incurred by an
 657              * isolated ternary instruction may be higher.
 658              */
 659             const unsigned exec_size = inst->dst.component_size(inst->exec_size);
 660             const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
 661                                                                     REG_SIZE);
 662
 663             /* Neglect same-atom conflicts (since they're either trivial or
 664              * impossible to avoid without splitting the atom), and conflicts
 665              * known to be optimized out by the hardware.
 666              */
 667             if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
 668                /* Calculate the parity of the sources relative to the start of
 669                 * their respective atoms.  If their parity is the same (and
 670                 * none of the atoms straddle the 2KB mark), the instruction
 671                 * will incur a conflict iff both atoms are assigned the same
 672                 * bank b.  If their parity is opposite, the instruction will
 673                 * incur a conflict iff they are assigned opposite banks (b and
 674                 * b^1).
 675                 */
 676                const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
 677                const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
 678                const unsigned p = p_r ^ p_s;
 679
 680                /* Calculate the updated cost of a hypothetical conflict
 681                 * between atoms r and s.  Note that the weight matrix is
 682                 * symmetric with respect to indices r and s by construction.
 683                 */
 684                const scalar_type w = MIN2(unsigned(max_scalar),
 685                                           get(conflicts[r], s, p) + cycle_scale);
 686                set(conflicts[r], s, p, w);
 687                set(conflicts[s], r, p, w);
 688             }
 689          }
 690       }
 691
 692       return conflicts;
 693    }
 694
 695    /**
 696     * Return the set of GRF atoms that could potentially lead to bank
 697     * conflicts if laid out unfavorably in the GRF space according to
 698     * the specified \p conflicts matrix (\sa
 699     * shader_conflict_weight_matrix()).
 700     */
 701    bool *
 702    have_any_conflicts(const partitioning &p,
 703                       const weight_vector_type *conflicts)
 704    {
 705       bool *any_conflicts = new bool[p.num_atoms()]();
 706
 707       for (unsigned r = 0; r < p.num_atoms(); r++) {
 708          const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
 709          for (unsigned s = 0; s < m; s++)
 710             any_conflicts[r] |= sums(conflicts[r].v[s]);
 711       }
 712
 713       return any_conflicts;
 714    }
 715
 716    /**
 717     * Calculate the difference between two S(B) cost estimates as defined
 718     * above (\sa shader_conflict_weight_matrix()).  This represents the
 719     * (partial) cycle-count benefit from moving an atom r from bank p to n.
 720     * The respective bank assignments Bp and Bn are encoded as the \p
 721     * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
 722     * according to the formula:
 723     *
 724     *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
 725     *
 726     * Notice the similarity with the delta function in the S(B) expression
 727     * above, and how bank_mask(B) can be precomputed for every possible
 728     * selection of r since bank_mask(B) only depends on it via B_r that may
 729     * only assume one of four different values, so the caller can keep every
 730     * possible bank_mask(B) vector in memory without much hassle (\sa
 731     * bank_characteristics()).
 732     */
 733    int
 734    delta_conflicts(const weight_vector_type &bank_mask_p,
 735                    const weight_vector_type &bank_mask_n,
 736                    const weight_vector_type &conflicts)
 737    {
 738       const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
 739       vector_type s_p = {}, s_n = {};
 740
 741       for (unsigned r = 0; r < m; r++) {
 742          s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
 743          s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
 744       }
 745
 746       return sums(subs(s_p, s_n));
 747    }
 748
 749    /**
 750     * Register atom permutation, represented as the start GRF offset each atom
 751     * is mapped into.
 752     */
 753    struct permutation {
 754       permutation() : v(NULL), size(0) {}
 755
 756       permutation(unsigned n) :
 757          v(new unsigned[n]()), size(n) {}
 758
 759       permutation(const permutation &p) :
 760          v(new unsigned[p.size]), size(p.size)
 761       {
 762          memcpy(v, p.v, p.size * sizeof(unsigned));
 763       }
 764
 765       ~permutation()
 766       {
 767          delete[] v;
 768       }
 769
 770       permutation &
 771       operator=(permutation p)
 772       {
 773          SWAP(v, p.v);
 774          SWAP(size, p.size);
 775          return *this;
 776       }
 777
 778       unsigned *v;
 779       unsigned size;
 780    };
 781
 782    /**
 783     * Return an identity permutation of GRF atoms.
 784     */
 785    permutation
 786    identity_reg_permutation(const partitioning &p)
 787    {
 788       permutation map(p.num_atoms());
 789
 790       for (unsigned r = 0; r < map.size; r++)
 791          map.v[r] = p.reg_of_atom(r);
 792
 793       return map;
 794    }
 795
 796    /**
 797     * Return the bank index of GRF address \p reg, numbered according to the
 798     * table:
 799     *        Even Odd
 800     *    Lo    0   1
 801     *    Hi    2   3
 802     */
 803    unsigned
 804    bank_of(unsigned reg)
 805    {
 806       return (reg & 0x40) >> 5 | (reg & 1);
 807    }
 808
 809    /**
 810     * Return bitmasks suitable for use as bank mask arguments for the
 811     * delta_conflicts() computation.  Note that this is just the (negative)
 812     * characteristic function of each bank, if you regard it as a set
 813     * containing all atoms assigned to it according to the \p map array.
 814     */
 815    weight_vector_type *
 816    bank_characteristics(const permutation &map)
 817    {
 818       weight_vector_type *banks = new weight_vector_type[4];
 819
 820       for (unsigned b = 0; b < 4; b++) {
 821          banks[b] = weight_vector_type(2 * map.size);
 822
 823          for (unsigned j = 0; j < map.size; j++) {
 824             for (unsigned p = 0; p < 2; p++)
 825                set(banks[b], j, p,
 826                    (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
 827          }
 828       }
 829
 830       return banks;
 831    }
 832
 833    /**
 834     * Return an improved permutation of GRF atoms based on \p map attempting
 835     * to reduce the total cycle-count cost of bank conflicts greedily.
 836     *
 837     * Note that this doesn't attempt to merge multiple atoms into one, which
 838     * may allow it to do a better job in some cases -- It simply reorders
 839     * existing atoms in the GRF space without affecting their identity.
 840     */
 841    permutation
 842    optimize_reg_permutation(const partitioning &p,
 843                             const bool *constrained,
 844                             const weight_vector_type *conflicts,
 845                             permutation map)
 846    {
 847       const bool *any_conflicts = have_any_conflicts(p, conflicts);
 848       weight_vector_type *banks = bank_characteristics(map);
 849
 850       for (unsigned r = 0; r < map.size; r++) {
 851          const unsigned bank_r = bank_of(map.v[r]);
 852
 853          if (!constrained[r]) {
 854             unsigned best_s = r;
 855             int best_benefit = 0;
 856
 857             for (unsigned s = 0; s < map.size; s++) {
 858                const unsigned bank_s = bank_of(map.v[s]);
 859
 860                if (bank_r != bank_s && !constrained[s] &&
 861                    p.size_of_atom(r) == p.size_of_atom(s) &&
 862                    (any_conflicts[r] || any_conflicts[s])) {
 863                   const int benefit =
 864                      delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
 865                      delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
 866
 867                   if (benefit > best_benefit) {
 868                      best_s = s;
 869                      best_benefit = benefit;
 870                   }
 871                }
 872             }
 873
 874             if (best_s != r) {
 875                for (unsigned b = 0; b < 4; b++) {
 876                   for (unsigned p = 0; p < 2; p++)
 877                      swap(banks[b], r, p, best_s, p);
 878                }
 879
 880                SWAP(map.v[r], map.v[best_s]);
 881             }
 882          }
 883       }
 884
 885       delete[] banks;
 886       delete[] any_conflicts;
 887       return map;
 888    }
 889
 890    /**
 891     * Apply the GRF atom permutation given by \p map to register \p r and
 892     * return the result.
 893     */
 894    fs_reg
 895    transform(const partitioning &p, const permutation &map, fs_reg r)
 896    {
 897       if (r.file == VGRF) {
 898          const unsigned reg = reg_of(r);
 899          const unsigned s = p.atom_of_reg(reg);
 900          r.nr = map.v[s] + reg - p.reg_of_atom(s);
 901          r.offset = r.offset % REG_SIZE;
 902       }
 903
 904       return r;
 905    }
 906 }
 907
 908 bool
 909 fs_visitor::opt_bank_conflicts()
 910 {
 911    assert(grf_used || !"Must be called after register allocation");
 912
 913    /* No ternary instructions -- No bank conflicts. */
 914    if (devinfo->gen < 6)
 915       return false;
 916
 917    const partitioning p = shader_reg_partitioning(this);
 918    const bool *constrained = shader_reg_constraints(this, p);
 919    const weight_vector_type *conflicts =
 920       shader_conflict_weight_matrix(this, p);
 921    const permutation map =
 922       optimize_reg_permutation(p, constrained, conflicts,
 923                                identity_reg_permutation(p));
 924
 925    foreach_block_and_inst(block, fs_inst, inst, cfg) {
 926       inst->dst = transform(p, map, inst->dst);
 927
 928       for (int i = 0; i < inst->sources; i++)
 929          inst->src[i] = transform(p, map, inst->src[i]);
 930    }
 931
 932    delete[] conflicts;
 933    delete[] constrained;
 934    return true;
 935 }
 936
 937 /**
 938  * Return whether the instruction incurs GRF bank conflict cycles.
 939  *
 940  * Note that this is only accurate after register allocation because otherwise
 941  * we don't know which bank each VGRF is going to end up aligned to.
 942  */
 943 bool
 944 has_bank_conflict(const gen_device_info *devinfo, const fs_inst *inst)
 945 {
 946    return inst->is_3src(devinfo) &&
 947           is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
 948           bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
 949           !is_conflict_optimized_out(devinfo, inst);
 950 }