2 * Copyright © 2019 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_fs_scoreboard.cpp
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
/* ordered_unit(): number of in-order hardware instructions represented by
 * IR instruction \p inst -- 0 for instructions that emit no in-order HW
 * instruction, otherwise is_unordered(inst) ? 0 : 1 (per the visible
 * return statement).
 * NOTE(review): extraction is garbled -- the opening brace, additional
 * case labels and the 'return 0' arm for the listed cases are missing
 * from this view; verify against the upstream brw_fs_scoreboard.cpp.
 */
65 * In-order instruction accounting.
70 * Number of in-order hardware instructions contained in this IR
71 * instruction. This determines the increment applied to the RegDist
72 * counter calculated for any ordered dependency that crosses this
76 ordered_unit(const fs_inst
*inst
)
78 switch (inst
->opcode
) {
81 case SHADER_OPCODE_UNDEF
:
82 case FS_OPCODE_PLACEHOLDER_HALT
:
85 /* Note that the following is inaccurate for virtual instructions
86 * that expand to more in-order instructions than assumed here, but
87 * that can only lead to suboptimal execution ordering, data
88 * coherency won't be impacted. Providing exact RegDist counts for
89 * each virtual instruction would allow better ALU performance, but
90 * it would require keeping this switch statement in perfect sync
91 * with the generator in order to avoid data corruption. Lesson is
92 * (again) don't use virtual instructions if you want optimal
95 return is_unordered(inst
) ? 0 : 1;
/* 'jp' counter type: incremented only by in-order instructions, used for
 * RegDist distance computations throughout this pass. */
100 * Type for an instruction counter that increments for in-order
101 * instructions only, arbitrarily denoted 'jp' throughout this lowering
102 * pass in order to distinguish it from the regular instruction counter.
104 typedef int ordered_address
;
/* ordered_inst_addresses(): walk every instruction of the CFG accumulating
 * ordered_unit() into a running 'jp' counter, building a per-instruction
 * vector of ordered_address values.
 * NOTE(review): the lines recording each address into 'jps' and returning
 * it are missing from this extraction -- presumably jps.push_back(jp)
 * precedes the visible increment; confirm against upstream.
 */
107 * Calculate the local ordered_address instruction counter at every
108 * instruction of the shader for subsequent constant-time look-up.
110 std::vector
<ordered_address
>
111 ordered_inst_addresses(const fs_visitor
*shader
)
113 std::vector
<ordered_address
> jps
;
114 ordered_address jp
= 0;
116 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
118 jp
+= ordered_unit(inst
);
/* tgl_regdist_mode: internal-only synchronization mode for in-order
 * (RegDist) dependencies; see the original comment below.
 * NOTE(review): the non-NULL enumerators (original lines 136-138) are
 * missing from this extraction. */
125 * Synchronization mode required for data manipulated by in-order
128 * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
129 * enum for additional type safety. The hardware doesn't provide control
130 * over the synchronization mode for RegDist annotations, this is only used
131 * internally in this pass in order to optimize out redundant read
132 * dependencies where possible.
134 enum tgl_regdist_mode
{
135 TGL_REGDIST_NULL
= 0,
141 * Allow bitwise arithmetic of tgl_regdist_mode enums.
144 operator|(tgl_regdist_mode x
, tgl_regdist_mode y
)
146 return tgl_regdist_mode(unsigned(x
) | unsigned(y
));
150 operator&(tgl_regdist_mode x
, tgl_regdist_mode y
)
152 return tgl_regdist_mode(unsigned(x
) & unsigned(y
));
/* Compound OR-assignment for tgl_regdist_mode.
 * NOTE(review): the body (original lines 157-159) is missing from this
 * extraction; presumably 'return x = x | y;' -- confirm upstream. */
156 operator|=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
/* Compound AND-assignment for tgl_regdist_mode.
 * NOTE(review): the body (original lines 163-165) is missing from this
 * extraction; presumably 'return x = x & y;' -- confirm upstream. */
162 operator&=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
/* equivalence_relation: union-find-like structure over unsigned integers.
 * lookup() chases parent links in 'is' until a fixed point; flatten()
 * materializes lookup() results for all tracked elements; link() merges
 * the classes of i and j; assign() repoints a class representative.
 * NOTE(review): extraction is garbled -- several interior lines (braces,
 * base-case returns, the tail of link()/assign(), the is.push_back in the
 * growth loop) are missing from this view; verify against upstream. */
170 * Representation of an equivalence relation among the set of unsigned
173 * Its initial state is the identity relation '~' such that i ~ j if and
174 * only if i == j for every pair of unsigned integers i and j.
176 struct equivalence_relation
{
178 * Return equivalence class index of the specified element. Effectively
179 * this is the numeric value of an arbitrary representative from the
182 * Allows the evaluation of the equivalence relation according to the
183 * rule that i ~ j if and only if lookup(i) == lookup(j).
186 lookup(unsigned i
) const
188 if (i
< is
.size() && is
[i
] != i
)
189 return lookup(is
[i
]);
195 * Create an array with the results of the lookup() method for
196 * constant-time evaluation.
198 std::vector
<unsigned>
200 std::vector
<unsigned> ids
;
202 for (const auto i
: is
)
203 ids
.push_back(lookup(i
));
209 * Mutate the existing equivalence relation minimally by imposing the
210 * additional requirement that i ~ j.
212 * The algorithm updates the internal representation recursively in
213 * order to guarantee transitivity while preserving the previously
214 * specified equivalence requirements.
217 link(unsigned i
, unsigned j
)
219 const unsigned k
= lookup(i
);
227 * Assign the representative of \p from to be equivalent to \p to.
229 * At the same time the data structure is partially flattened as much as
230 * it's possible without increasing the number of recursive calls.
233 assign(unsigned from
, unsigned to
)
236 if (from
< is
.size() && is
[from
] != from
)
237 assign(is
[from
], to
);
239 for (unsigned i
= is
.size(); i
<= from
; i
++)
/* Parent-link array: is[i] == i marks a class representative. */
246 std::vector
<unsigned> is
;
/* struct dependency: a single data dependency, carrying an in-order
 * (RegDist) component {ordered, jp} and an out-of-order (SBID) component
 * {unordered, id}. Default construction yields "no information".
 * NOTE(review): the 'struct dependency {' header line and the 'int jp;' /
 * 'unsigned id;' member declarations are missing from this extraction. */
250 * Representation of a data dependency between two instructions in the
256 * No dependency information.
258 dependency() : ordered(TGL_REGDIST_NULL
), jp(INT_MIN
),
259 unordered(TGL_SBID_NULL
), id(0) {}
262 * Construct a dependency on the in-order instruction with the provided
263 * ordered_address instruction counter.
265 dependency(tgl_regdist_mode mode
, ordered_address jp
) :
266 ordered(mode
), jp(jp
), unordered(TGL_SBID_NULL
), id(0) {}
269 * Construct a dependency on the out-of-order instruction with the
270 * specified synchronization token.
272 dependency(tgl_sbid_mode mode
, unsigned id
) :
273 ordered(TGL_REGDIST_NULL
), jp(INT_MIN
), unordered(mode
), id(id
) {}
276 * Synchronization mode of in-order dependency, or zero if no in-order
277 * dependency is present.
279 tgl_regdist_mode ordered
;
282 * Instruction counter of in-order dependency.
284 * For a dependency part of a different block in the program, this is
285 * relative to the specific control flow path taken between the
286 * dependency and the current block: It is the ordered_address such that
287 * the difference between it and the ordered_address of the first
288 * instruction of the current block is exactly the number of in-order
289 * instructions across that control flow path. It is not guaranteed to
290 * be equal to the local ordered_address of the generating instruction
291 * [as returned by ordered_inst_addresses()], except for block-local
297 * Synchronization mode of unordered dependency, or zero if no unordered
298 * dependency is present.
300 tgl_sbid_mode unordered
;
302 /** Synchronization token of out-of-order dependency. */
306 * Trivial in-order dependency that's always satisfied.
308 * Note that unlike a default-constructed dependency() which is also
309 * trivially satisfied, this is considered to provide dependency
310 * information and can be used to clear a previously pending dependency
313 static const dependency done
;
/* Member-wise equality of two dependency objects.
 * NOTE(review): the final conjunct (original line 321, presumably
 * 'dep0.id == dep1.id;') is missing from this extraction -- the visible
 * text ends with '&&'. */
316 operator==(const dependency
&dep0
, const dependency
&dep1
)
318 return dep0
.ordered
== dep1
.ordered
&&
319 dep0
.jp
== dep1
.jp
&&
320 dep0
.unordered
== dep1
.unordered
&&
325 operator!=(const dependency
&dep0
, const dependency
&dep1
)
327 return !(dep0
== dep1
);
/* Definition of the "trivially satisfied but valid" sentinel declared in
 * struct dependency: an in-order SRC dependency at the minimum counter. */
331 const dependency
dependency::done
= dependency(TGL_REGDIST_SRC
, INT_MIN
);
334 * Return whether \p dep contains any dependency information.
337 is_valid(const dependency
&dep
)
339 return dep
.ordered
|| dep
.unordered
;
/* merge(): combine two dependencies; ordered components OR their modes and
 * take the max jp, unordered components OR their modes and unify their
 * SBIDs through the equivalence relation.
 * NOTE(review): the opening brace, the local 'dependency dep;' declaration
 * and the trailing 'return dep;' are missing from this extraction. */
343 * Combine \p dep0 and \p dep1 into a single dependency object that is only
344 * satisfied when both original dependencies are satisfied. This might
345 * involve updating the equivalence relation \p eq in order to make sure
346 * that both out-of-order dependencies are assigned the same hardware SBID
347 * as synchronization token.
350 merge(equivalence_relation
&eq
,
351 const dependency
&dep0
, const dependency
&dep1
)
355 if (dep0
.ordered
|| dep1
.ordered
) {
356 dep
.ordered
= dep0
.ordered
| dep1
.ordered
;
357 dep
.jp
= MAX2(dep0
.jp
, dep1
.jp
);
360 if (dep0
.unordered
|| dep1
.unordered
) {
361 dep
.unordered
= dep0
.unordered
| dep1
.unordered
;
362 dep
.id
= eq
.link(dep0
.unordered
? dep0
.id
: dep1
.id
,
363 dep1
.unordered
? dep1
.id
: dep0
.id
);
370 * Override dependency information of \p dep0 with that of \p dep1.
373 shadow(const dependency
&dep0
, const dependency
&dep1
)
375 return is_valid(dep1
) ? dep1
: dep0
;
/* transport(): shift an in-order dependency's jp by \p delta when crossing
 * a CFG edge; INT_MIN jp values (no real counter) are left untouched.
 * NOTE(review): the mutation body and 'return dep;' (original lines
 * 392-395) are missing from this extraction. */
379 * Translate dependency information across the program.
381 * This returns a dependency on the same instruction translated to the
382 * ordered_address space of a different block. The correct shift for
383 * transporting a dependency across an edge of the CFG is the difference
384 * between the local ordered_address of the first instruction of the target
385 * block and the local ordered_address of the instruction immediately after
386 * the end of the origin block.
389 transport(dependency dep
, int delta
)
391 if (dep
.ordered
&& dep
.jp
> INT_MIN
)
398 * Return simplified dependency removing any synchronization modes not
399 * applicable to an instruction reading the same register location.
402 dependency_for_read(dependency dep
)
404 dep
.ordered
&= TGL_REGDIST_DST
;
/* dependency_for_write(): a write by an in-order instruction only needs
 * to wait on the DST component of a previous in-order dependency.
 * NOTE(review): the function tail (original lines 417-419, presumably
 * 'return dep;' and the closing brace) is missing from this extraction. */
409 * Return simplified dependency removing any synchronization modes not
410 * applicable to an instruction \p inst writing the same register location.
413 dependency_for_write(const fs_inst
*inst
, dependency dep
)
415 if (!is_unordered(inst
))
416 dep
.ordered
&= TGL_REGDIST_DST
;
/* struct scoreboard: per-register dependency tracking at GRF granularity.
 * get()/set() read and write the dependency slot returned by dep(r);
 * the friend merge()/shadow()/transport() apply the corresponding
 * dependency operation component-wise over grf_deps, addr_dep and
 * accum_deps; operator==/!= compare the same three groups.
 * NOTE(review): this extraction is garbled -- the 'struct scoreboard {'
 * header, method return types, braces, the 'dependency addr_dep;' member
 * and the dep() signature are missing; verify against upstream. */
423 * Scoreboard representation. This keeps track of the data dependencies of
424 * registers with GRF granularity.
429 * Look up the most current data dependency for register \p r.
432 get(const fs_reg
&r
) const
434 if (const dependency
*p
= const_cast<scoreboard
*>(this)->dep(r
))
441 * Specify the most current data dependency for register \p r.
444 set(const fs_reg
&r
, const dependency
&d
)
446 if (dependency
*p
= dep(r
))
451 * Component-wise merge() of corresponding dependencies from two
452 * scoreboard objects. \sa merge().
455 merge(equivalence_relation
&eq
,
456 const scoreboard
&sb0
, const scoreboard
&sb1
)
460 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
461 sb
.grf_deps
[i
] = merge(eq
, sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
463 sb
.addr_dep
= merge(eq
, sb0
.addr_dep
, sb1
.addr_dep
);
465 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
466 sb
.accum_deps
[i
] = merge(eq
, sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
472 * Component-wise shadow() of corresponding dependencies from two
473 * scoreboard objects. \sa shadow().
476 shadow(const scoreboard
&sb0
, const scoreboard
&sb1
)
480 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
481 sb
.grf_deps
[i
] = shadow(sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
483 sb
.addr_dep
= shadow(sb0
.addr_dep
, sb1
.addr_dep
);
485 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
486 sb
.accum_deps
[i
] = shadow(sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
492 * Component-wise transport() of dependencies from a scoreboard
493 * object. \sa transport().
496 transport(const scoreboard
&sb0
, int delta
)
500 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
501 sb
.grf_deps
[i
] = transport(sb0
.grf_deps
[i
], delta
);
503 sb
.addr_dep
= transport(sb0
.addr_dep
, delta
);
505 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
506 sb
.accum_deps
[i
] = transport(sb0
.accum_deps
[i
], delta
);
512 operator==(const scoreboard
&sb0
, const scoreboard
&sb1
)
514 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.grf_deps
); i
++) {
515 if (sb0
.grf_deps
[i
] != sb1
.grf_deps
[i
])
519 if (sb0
.addr_dep
!= sb1
.addr_dep
)
522 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.accum_deps
); i
++) {
523 if (sb0
.accum_deps
[i
] != sb1
.accum_deps
[i
])
531 operator!=(const scoreboard
&sb0
, const scoreboard
&sb1
)
533 return !(sb0
== sb1
);
/* Dependency slots: one per GRF (MRFs mapped in past GEN7_MRF_HACK_START),
 * plus the address register and ten accumulator registers. */
537 dependency grf_deps
[BRW_MAX_GRF
];
539 dependency accum_deps
[10];
/* dep(r): map register r to its tracking slot -- GRF/MRF to grf_deps,
 * address ARF to addr_dep, accumulator ARFs to accum_deps, anything else
 * presumably to NULL (untracked). */
544 const unsigned reg
= (r
.file
== VGRF
? r
.nr
+ r
.offset
/ REG_SIZE
:
545 reg_offset(r
) / REG_SIZE
);
547 return (r
.file
== VGRF
|| r
.file
== FIXED_GRF
? &grf_deps
[reg
] :
548 r
.file
== MRF
? &grf_deps
[GEN7_MRF_HACK_START
+ reg
] :
549 r
.file
== ARF
&& reg
>= BRW_ARF_ADDRESS
&&
550 reg
< BRW_ARF_ACCUMULATOR
? &addr_dep
:
551 r
.file
== ARF
&& reg
>= BRW_ARF_ACCUMULATOR
&&
552 reg
< BRW_ARF_FLAG
? &accum_deps
[
553 reg
- BRW_ARF_ACCUMULATOR
] :
/* add_dependency(): append \p dep to a per-instruction dependency list,
 * first remapping its SBID through \p ids, then folding it into an
 * existing compatible entry (ordered entries merge modes and take the max
 * jp; unordered entries with the same id merge modes) before appending.
 * NOTE(review): the tail that actually appends the residual dependency
 * (original lines 593-595) is missing from this extraction. */
559 * Dependency list handling.
564 * Add dependency \p dep to the list of dependencies of an instruction
568 add_dependency(const std::vector
<unsigned> &ids
,
569 std::vector
<dependency
> &deps
, dependency dep
)
572 /* Translate the unordered dependency token first in order to keep
573 * the list minimally redundant.
575 if (dep
.unordered
&& dep
.id
< ids
.size())
576 dep
.id
= ids
[dep
.id
];
578 /* Try to combine the specified dependency with any existing ones. */
579 for (auto &dep1
: deps
) {
580 if (dep
.ordered
&& dep1
.ordered
) {
581 dep1
.jp
= MAX2(dep1
.jp
, dep
.jp
);
582 dep1
.ordered
|= dep
.ordered
;
583 dep
.ordered
= TGL_REGDIST_NULL
;
586 if (dep
.unordered
&& dep1
.unordered
&& dep1
.id
== dep
.id
) {
587 dep1
.unordered
|= dep
.unordered
;
588 dep
.unordered
= TGL_SBID_NULL
;
592 /* Add it to the end of the list if necessary. */
/* ordered_dependency_swsb(): compute the minimum RegDist (clamped to 7 and
 * ignoring distances beyond 10 in-order instructions) from \p jp to any
 * ordered dependency in \p deps, returning it as a tgl_swsb aggregate;
 * a distance of 0 means no representable ordered dependency.
 * NOTE(review): the guard selecting only ordered entries inside the loop
 * (original line 610) is missing from this extraction. */
599 * Construct a tgl_swsb annotation encoding any ordered dependencies from
600 * the dependency list \p deps of an instruction with ordered_address
604 ordered_dependency_swsb(const std::vector
<dependency
> &deps
,
605 const ordered_address
&jp
)
607 unsigned min_dist
= ~0u;
609 for (const auto &dep
: deps
) {
611 const unsigned dist
= jp
- dep
.jp
;
612 const unsigned max_dist
= 10;
614 if (dist
<= max_dist
)
615 min_dist
= MIN3(min_dist
, dist
, 7);
619 return { min_dist
== ~0u ? 0 : min_dist
};
623 * Return whether the dependency list \p deps of an instruction with
624 * ordered_address \p jp has any non-trivial ordered dependencies.
627 find_ordered_dependency(const std::vector
<dependency
> &deps
,
628 const ordered_address
&jp
)
630 return ordered_dependency_swsb(deps
, jp
).regdist
;
634 * Return the full tgl_sbid_mode bitset for the first unordered dependency
635 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
636 * no such dependency is present.
639 find_unordered_dependency(const std::vector
<dependency
> &deps
,
640 tgl_sbid_mode unordered
)
643 for (const auto &dep
: deps
) {
644 if (unordered
& dep
.unordered
)
645 return dep
.unordered
;
649 return TGL_SBID_NULL
;
653 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
654 * \p deps that can be represented directly in the SWSB annotation of the
655 * instruction without additional SYNC instructions, or zero if no such
656 * dependency is present.
659 baked_unordered_dependency_mode(const fs_inst
*inst
,
660 const std::vector
<dependency
> &deps
,
661 const ordered_address
&jp
)
663 const bool has_ordered
= find_ordered_dependency(deps
, jp
);
665 if (find_unordered_dependency(deps
, TGL_SBID_SET
))
666 return find_unordered_dependency(deps
, TGL_SBID_SET
);
667 else if (has_ordered
&& is_unordered(inst
))
668 return TGL_SBID_NULL
;
669 else if (find_unordered_dependency(deps
, TGL_SBID_DST
) &&
670 (!has_ordered
|| !is_unordered(inst
)))
671 return find_unordered_dependency(deps
, TGL_SBID_DST
);
672 else if (!has_ordered
)
673 return find_unordered_dependency(deps
, TGL_SBID_SRC
);
675 return TGL_SBID_NULL
;
/* update_inst_scoreboard(): record in \p sb the dependencies generated by
 * executing \p inst at instruction counter \p ip -- SBID SRC deps for
 * asynchronously-fetched sources (payloads, math) and send MRF payloads,
 * RegDist SRC deps for ordinary in-order reads, and SBID/RegDist DST deps
 * for every register written.
 * NOTE(review): extraction is garbled -- the 'dependency()' fallback arms
 * of the two conditional chains and several braces are missing. */
681 * Shader instruction dependency calculation.
686 * Update scoreboard object \p sb to account for the execution of
687 * instruction \p inst.
690 update_inst_scoreboard(const fs_visitor
*shader
,
691 const std::vector
<ordered_address
> &jps
,
692 const fs_inst
*inst
, unsigned ip
, scoreboard
&sb
)
694 /* Track any source registers that may be fetched asynchronously by this
695 * instruction, otherwise clear the dependency in order to avoid
696 * subsequent redundant synchronization.
698 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
699 const dependency rd_dep
=
700 inst
->is_payload(i
) || inst
->is_math() ? dependency(TGL_SBID_SRC
, ip
) :
701 ordered_unit(inst
) ? dependency(TGL_REGDIST_SRC
, jps
[ip
]) :
704 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
705 sb
.set(byte_offset(inst
->src
[i
], REG_SIZE
* j
), rd_dep
);
708 if (is_send(inst
) && inst
->base_mrf
!= -1) {
709 const dependency rd_dep
= dependency(TGL_SBID_SRC
, ip
);
711 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
712 sb
.set(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0), rd_dep
);
715 /* Track any destination registers of this instruction. */
716 const dependency wr_dep
=
717 is_unordered(inst
) ? dependency(TGL_SBID_DST
, ip
) :
718 ordered_unit(inst
) ? dependency(TGL_REGDIST_DST
, jps
[ip
]) :
721 if (is_valid(wr_dep
) && inst
->dst
.file
!= BAD_FILE
&&
722 !inst
->dst
.is_null()) {
723 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
724 sb
.set(byte_offset(inst
->dst
, REG_SIZE
* j
), wr_dep
);
/* gather_block_scoreboards(): run update_inst_scoreboard() over every
 * instruction, accumulating one scoreboard per basic block.
 * NOTE(review): the 'unsigned ip' counter declaration and the trailing
 * 'return sbs;' are missing from this extraction. */
729 * Calculate scoreboard objects locally that represent any pending (and
730 * unconditionally resolved) dependencies at the end of each block of the
733 std::vector
<scoreboard
>
734 gather_block_scoreboards(const fs_visitor
*shader
,
735 const std::vector
<ordered_address
> &jps
)
737 std::vector
<scoreboard
> sbs(shader
->cfg
->num_blocks
);
740 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
)
741 update_inst_scoreboard(shader
, jps
, inst
, ip
++, sbs
[block
->num
]);
/* propagate_block_scoreboards(): fixed-point dataflow over the CFG.
 * Each iteration composes a block's incoming scoreboard with its local
 * delta (shadow), and whenever the block's outgoing scoreboard changes,
 * merges the transported result into every successor's incoming
 * scoreboard, adjusting ordered jp values by the edge delta computed from
 * the jps table. Returns the per-block incoming scoreboards and the
 * flattened SBID equivalence classes.
 * NOTE(review): extraction is garbled -- the 'progress' update, the
 * 'const int delta =' declaration and several braces are missing. */
747 * Propagate data dependencies globally through the control flow graph
748 * until a fixed point is reached.
750 * Calculates the set of dependencies potentially pending at the beginning
751 * of each block, and returns it as an array of scoreboard objects.
753 std::pair
<std::vector
<scoreboard
>, std::vector
<unsigned>>
754 propagate_block_scoreboards(const fs_visitor
*shader
,
755 const std::vector
<ordered_address
> &jps
)
757 const std::vector
<scoreboard
> delta_sbs
=
758 gather_block_scoreboards(shader
, jps
);
759 std::vector
<scoreboard
> in_sbs(shader
->cfg
->num_blocks
);
760 std::vector
<scoreboard
> out_sbs(shader
->cfg
->num_blocks
);
761 equivalence_relation eq
;
763 for (bool progress
= true; progress
;) {
766 foreach_block(block
, shader
->cfg
) {
767 const scoreboard sb
= shadow(in_sbs
[block
->num
],
768 delta_sbs
[block
->num
]);
770 if (sb
!= out_sbs
[block
->num
]) {
771 foreach_list_typed(bblock_link
, child_link
, link
,
773 scoreboard
&in_sb
= in_sbs
[child_link
->block
->num
];
775 jps
[child_link
->block
->start_ip
] - jps
[block
->end_ip
]
776 - ordered_unit(static_cast<const fs_inst
*>(block
->end()));
778 in_sb
= merge(eq
, in_sb
, transport(sb
, delta
));
781 out_sbs
[block
->num
] = sb
;
787 return { std::move(in_sbs
), eq
.flatten() };
/* gather_inst_dependencies(): for every instruction, collect read
 * dependencies for each register read (sources and send MRF payload),
 * a self SBID-SET dependency for unordered instructions, and write
 * dependencies for every register written (unless no_dd_check), then
 * advance the scoreboard past the instruction. Returns one dependency
 * list per instruction, SBIDs already remapped through 'ids'.
 * NOTE(review): the 'unsigned ip' counter, the ip increment and the
 * 'return deps;' are missing from this extraction. */
791 * Return the list of potential dependencies of each instruction in the
792 * shader based on the result of global dependency analysis.
794 std::vector
<std::vector
<dependency
>>
795 gather_inst_dependencies(const fs_visitor
*shader
,
796 const std::vector
<ordered_address
> &jps
)
798 std::vector
<scoreboard
> sbs
;
799 std::vector
<unsigned> ids
;
800 std::vector
<std::vector
<dependency
>> deps
;
803 std::tie(sbs
, ids
) = propagate_block_scoreboards(shader
, jps
);
805 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
806 scoreboard
&sb
= sbs
[block
->num
];
807 std::vector
<dependency
> inst_deps
;
809 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
810 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
811 add_dependency(ids
, inst_deps
, dependency_for_read(
812 sb
.get(byte_offset(inst
->src
[i
], REG_SIZE
* j
))));
815 if (is_send(inst
) && inst
->base_mrf
!= -1) {
816 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
817 add_dependency(ids
, inst_deps
, dependency_for_read(
818 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
821 if (is_unordered(inst
))
822 add_dependency(ids
, inst_deps
, dependency(TGL_SBID_SET
, ip
));
824 if (!inst
->no_dd_check
) {
825 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
826 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
827 add_dependency(ids
, inst_deps
, dependency_for_write(inst
,
828 sb
.get(byte_offset(inst
->dst
, REG_SIZE
* j
))));
832 if (is_send(inst
) && inst
->base_mrf
!= -1) {
833 for (int j
= 0; j
< shader
->implied_mrf_writes(inst
); j
++)
834 add_dependency(ids
, inst_deps
, dependency_for_write(inst
,
835 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
839 deps
.push_back(inst_deps
);
840 update_inst_scoreboard(shader
, jps
, inst
, ip
, sb
);
/* allocate_inst_dependencies(): assign hardware SBIDs round-robin (masked
 * to 4 bits, i.e. 16 tokens) to each equivalence class referenced by an
 * unordered dependency, rebuilding every per-instruction list through
 * add_dependency() with the new id mapping.
 * NOTE(review): the trailing 'return deps1;' is missing from this
 * extraction. */
850 * Allocate SBID tokens to track the execution of every out-of-order
851 * instruction of the shader.
853 std::vector
<std::vector
<dependency
>>
854 allocate_inst_dependencies(const fs_visitor
*shader
,
855 const std::vector
<std::vector
<dependency
>> &deps0
)
857 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
858 * shaders with a large number of SEND messages.
860 std::vector
<std::vector
<dependency
>> deps1
;
861 std::vector
<unsigned> ids(deps0
.size(), ~0u);
862 unsigned next_id
= 0;
864 for (const auto &inst_deps0
: deps0
) {
865 std::vector
<dependency
> inst_deps1
;
867 for (const auto &dep
: inst_deps0
) {
868 if (dep
.unordered
&& ids
[dep
.id
] == ~0u)
869 ids
[dep
.id
] = (next_id
++) & 0xf;
871 add_dependency(ids
, inst_deps1
, dep
);
874 deps1
.push_back(inst_deps1
);
/* emit_inst_dependencies(): for each instruction, build its SWSB from the
 * ordered (RegDist) component, bake in at most one compatible unordered
 * dependency, and materialize every remaining unordered dependency as a
 * separate SYNC.NOP instruction carrying the SBID. Finally clears the
 * legacy no_dd_check/no_dd_clear flags, which have no meaning on Gen12+.
 * NOTE(review): the 'ip' counter, the SBID assignment into swsb when
 * baking, and the final 'inst->sched = swsb;' store are missing from
 * this extraction. */
881 * Emit dependency information provided by \p deps into the shader,
882 * inserting additional SYNC instructions for dependencies that can't be
883 * represented directly by annotating existing instructions.
886 emit_inst_dependencies(fs_visitor
*shader
,
887 const std::vector
<ordered_address
> &jps
,
888 const std::vector
<std::vector
<dependency
>> &deps
)
892 foreach_block_and_inst_safe(block
, fs_inst
, inst
, shader
->cfg
) {
893 tgl_swsb swsb
= ordered_dependency_swsb(deps
[ip
], jps
[ip
]);
894 const tgl_sbid_mode unordered_mode
=
895 baked_unordered_dependency_mode(inst
, deps
[ip
], jps
[ip
]);
897 for (const auto &dep
: deps
[ip
]) {
899 if (unordered_mode
== dep
.unordered
&& !swsb
.mode
) {
900 /* Bake unordered dependency into the instruction's SWSB if
904 swsb
.mode
= dep
.unordered
;
906 /* Emit dependency into the SWSB of an extra SYNC
909 const fs_builder ibld
= fs_builder(shader
, block
, inst
)
910 .exec_all().group(1, 0);
911 fs_inst
*sync
= ibld
.emit(BRW_OPCODE_SYNC
, ibld
.null_reg_ud(),
912 brw_imm_ud(TGL_SYNC_NOP
));
913 sync
->sched
.sbid
= dep
.id
;
914 sync
->sched
.mode
= dep
.unordered
;
915 assert(!(sync
->sched
.mode
& TGL_SBID_SET
));
922 inst
->no_dd_check
= inst
->no_dd_clear
= false;
/* lower_scoreboard(): pass entry point. On Gen12+ (which lacks the HW
 * register scoreboard, per the file header) compute per-instruction
 * ordered addresses, gather and allocate dependencies, and emit the
 * resulting SWSB annotations/SYNCs. No-op on earlier gens.
 * NOTE(review): the return type line and closing braces are missing from
 * this extraction. */
929 fs_visitor::lower_scoreboard()
931 if (devinfo
->gen
>= 12) {
932 const std::vector
<ordered_address
> jps
= ordered_inst_addresses(this);
933 emit_inst_dependencies(this, jps
,
934 allocate_inst_dependencies(this,
935 gather_inst_dependencies(this, jps
)));