/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass.  The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gen7-Gen9 ternary instructions seem
 * to incur an average bank conflict penalty of one cycle per SIMD8 op
 * whenever the second and third source are stored in the same GRF bank (\sa
 * bank_of() for the exact bank layout), which prevents the EU from fetching
 * both sources during the same cycle, unless the EU logic manages to
 * optimize out the read cycle of a duplicate source register (\sa
 * is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) on the
 * number of instructions in the program; however, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation() instead, which is O(m^3) on the number of GRF
 * atoms of the program (\sa partitioning).  The latter quantity is bounded
 * (since the program uses a bounded number of registers post-regalloc) and
 * of the order of 100.  For that reason optimize_reg_permutation() is
 * vectorized in order to keep the cubic term within reasonable bounds for m
 * close to its theoretical maximum.
 */
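/*
 * For instance (an illustrative example in schematic GEN assembly, not taken
 * from the hardware spec): "mad(8) g10 g2 g4 g6" reads its second and third
 * sources g4 and g6 from the same bank (\sa bank_of()), so it's liable to
 * incur the penalty described above, while "mad(8) g10 g2 g4 g7" isn't,
 * since g4 and g7 live in different banks.
 */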
#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>
namespace {
   /**
    * Thin layer around vector intrinsics so they can be easily replaced with
    * e.g. the fall-back scalar path, an implementation with different vector
    * width or using different SIMD architectures (AVX-512?!).
    *
    * This implementation operates on pairs of independent SSE2 integer
    * vectors à la SIMD16 for somewhat improved throughput.  SSE2 is
    * supported by virtually all platforms that care about bank conflicts,
    * so this path should almost always be available in practice.
    */
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of
    * \p vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }
   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }
   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }
   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }
   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }
   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
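   /*
    * A note on the reduction above: _mm_shuffle_epi32(x, 0x4e) swaps the two
    * 64-bit halves of x, _mm_shuffle_epi32(x, 0xb1) swaps adjacent 32-bit
    * elements, and _mm_shufflelo_epi16(x, 0xb1) swaps adjacent 16-bit
    * elements of the low half, so after the four saturating additions lane 0
    * of v1 holds the saturated sum of all sixteen lanes of the input.
    */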
}

#else

namespace {
   /**
    * Thin layer around vector intrinsics so they can be easily replaced with
    * e.g. the fall-back scalar path, an implementation with different vector
    * width or using different SIMD architectures (AVX-512?!).
    *
    * This implementation operates on scalar values and doesn't rely on
    * any vector extensions.  This is mainly intended for debugging and
    * to keep this file building on exotic platforms.
    */
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of
    * \p vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;
   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }
   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }
   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }
   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif
namespace {
   /**
    * Swap \p x and \p y.
    */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity
    * of the conflict (respectively, whether the cost is incurred whenever
    * the atoms are assigned the same bank b or opposite-parity banks b and
    * b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *),
                                     __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) *
                               sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };
   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }
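   /*
    * E.g. component (i, p) == (3, 1) lives at flat index 2 * 3 + 1 == 7,
    * i.e. lane 7 of v.v[0] on the SSE2 path (vector_width == 16) or the
    * scalar element v.v[7] on the fall-back path (vector_width == 1).
    */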
   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can
    * potentially be rearranged independently from other registers.  The
    * partitioning is inferred from a number of contiguity requirements
    * specified using require_contiguous().  This allows efficient look-up of
    * the atom index a given register address belongs to, or conversely the
    * range of register addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n.  The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }
      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }
      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }
      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the
          * fusion (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }
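      /*
       * As a hypothetical illustration, starting from the unrestricted
       * partitioning of an eight-register file (one single-register atom
       * per register), require_contiguous(2, 3) fuses registers 2, 3 and 4
       * into one atom, leaving the partition
       * { {0}, {1}, {2, 3, 4}, {5}, {6}, {7} } with num_atoms() == 6.
       */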
      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned
       * into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;

      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };
   /**
    * Only GRF sources (whether they have been register-allocated or not) can
    * possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }
   /**
    * Register offset of \p r in GRF units.  Useful because the
    * representation of GRFs post-register allocation is somewhat
    * inconsistent and depends on whether the register already had a fixed
    * GRF offset prior to register allocation or whether it was part of a
    * VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }
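   /*
    * E.g. (hypothetical values) a VGRF with nr == 5 and a byte offset of 64
    * maps to GRF 5 + 64 / REG_SIZE == 7, REG_SIZE being 32 bytes here.
    */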
   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]),
                                    regs_read(inst, i));
         }
      }

      return p;
   }
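   /*
    * E.g. (a hypothetical instruction) a send that writes four GRFs
    * starting at g20 forces g20-g23 into a single atom, so the rest of the
    * pass may only move those four registers around as a unit.
    */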
   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level.  Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
       * subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move 127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->gen >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gen6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->gen <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gen7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them
          * around.
          */
         if (v->devinfo->gen >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GEN7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }
   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register.  The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const gen_device_info *devinfo,
                             const fs_inst *inst)
   {
      return devinfo->gen >= 9 &&
         ((is_grf(inst->src[0]) &&
           (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
            reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
          reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }
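   /*
    * E.g. (an illustrative example) on Gen9 "mad(8) g10 g2 g4 g4" reads the
    * same register g4 for both conflicting sources, so the duplicate read
    * cycle is optimized out and no penalty is incurred even though both
    * sources trivially live in the same bank.
    */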
   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole
    * program for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r.  \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations.  In the
    *           meantime optimizing based on Gen9 weights is likely to be
    *           more helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts =
         new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());
      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->devinfo) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction.  This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle.  The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size =
               inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale =
               block_scale * DIV_ROUND_UP(exec_size, REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start
                * of their respective atoms.  If their parity is the same
                * (and none of the atoms straddle the 2KB mark), the
                * instruction will incur a conflict iff both atoms are
                * assigned the same bank b.  If their parity is opposite, the
                * instruction will incur a conflict iff they are assigned
                * opposite banks (b and b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s.  Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) +
                                          cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }
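   /*
    * As a sanity check of the weights above, assuming a packed 32-bit
    * destination: a SIMD16 ternary instruction writes 16 * 4 == 64 bytes,
    * so DIV_ROUND_UP(64, REG_SIZE) gives a weight of two conflict cycles,
    * consistent with the one cycle per SIMD8 op quoted in the file comment.
    */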
   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }
   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()).  This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r since bank_mask(B) only depends on it via B_r that may
    * only assume one of four different values, so the caller can keep every
    * possible bank_mask(B) vector in memory without much hassle (\sa
    * bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }
   /**
    * Register atom permutation, represented as the start GRF offset each
    * atom is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };
   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }
   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *            Even Odd
    *    Lo        0    1
    *    Hi        2    3
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }
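   /*
    * E.g. bank_of(0) == 0, bank_of(1) == 1, bank_of(64) == 2 and
    * bank_of(65) == 3: bit 0 of the GRF address selects the odd bank and
    * bit 6 selects the upper 2KB half of the register file.
    */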
   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation.  Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }
   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- It simply reorders
    * existing atoms in the GRF space without affecting their identity.
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s],
                                     conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r],
                                     conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }
   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}
bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->gen < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}
/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because
 * otherwise we don't know which bank each VGRF is going to end up aligned
 * to.
 */
bool
has_bank_conflict(const gen_device_info *devinfo, const fs_inst *inst)
{
   return inst->is_3src(devinfo) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(devinfo, inst);
}
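/*
 * A possible use of the helper above (a hypothetical sketch, not part of
 * the pass itself): count the ternary instructions that still incur a bank
 * conflict after the pass has run, e.g. for shader-db-style reporting.
 */
#if 0
unsigned
count_bank_conflicts(const fs_visitor *v)
{
   unsigned n = 0;

   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (has_bank_conflict(v->devinfo, inst))
         n++;
   }

   return n;
}
#endif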