/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/** @file brw_fs_scoreboard.cpp
 *
 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserting any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *           - sr0 state register
 *           - cr0 control register
 *           - ip instruction pointer
 *           - tm0 timestamp register
 *           - dbg0 debug register
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *           - f0-1 flag registers
 *           - n0 notification register
 *           - tdr0 thread dependency register
 */

#include "brw_fs.h"
#include "brw_cfg.h"
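/* For orientation, a sketch of the kind of annotations this pass emits
 * (hypothetical instruction stream, assembly-style syntax only approximate):
 *
 *    add (8)  r10  r2   r3             // in-order ALU write to r10
 *    mul (8)  r11  r4   r5             // unrelated in-order instruction
 *    mov (8)  r12  r10  {@2}           // RegDist 2: wait on the add two
 *                                      // in-order instructions back
 *    send (8) r20  ...  {$0}           // out-of-order access tracked by
 *                                      // SBID token 0
 *    add (8)  r21  r20  r1  {$0.dst}   // wait until the send's destination
 *                                      // has been written
 */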
/**
 * In-order instruction accounting.
 */

/**
 * Number of in-order hardware instructions contained in this IR
 * instruction.  This determines the increment applied to the RegDist
 * counter calculated for any ordered dependency that crosses this
 * instruction.
 */
unsigned
ordered_unit(const fs_inst *inst)
{
   switch (inst->opcode) {
   case SHADER_OPCODE_UNDEF:
   case FS_OPCODE_PLACEHOLDER_HALT:
      return 0;
   default:
      /* Note that the following is inaccurate for virtual instructions
       * that expand to more in-order instructions than assumed here, but
       * that can only lead to suboptimal execution ordering; data
       * coherency won't be impacted.  Providing exact RegDist counts for
       * each virtual instruction would allow better ALU performance, but
       * it would require keeping this switch statement in perfect sync
       * with the generator in order to avoid data corruption.  Lesson is
       * (again) don't use virtual instructions if you want optimal
       * scheduling.
       */
      return is_unordered(inst) ? 0 : 1;
   }
}
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 */
typedef int ordered_address;
/**
 * Return the number of instructions in the program.
 */
unsigned
num_instructions(const backend_shader *shader)
{
   return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
}
/**
 * Calculate the local ordered_address instruction counter at every
 * instruction of the shader for subsequent constant-time look-up.
 */
ordered_address *
ordered_inst_addresses(const fs_visitor *shader)
{
   ordered_address *jps = new ordered_address[num_instructions(shader)];
   ordered_address jp = 0;
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      jps[ip++] = jp;
      jp += ordered_unit(inst);
   }

   return jps;
}
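/* As a worked example (hypothetical instruction stream), the jp counter only
 * advances across in-order instructions, so for the sequence
 *
 *    MOV, UNDEF, ADD, SEND, MUL
 *
 * the per-instruction ordered_addresses are jps = { 0, 1, 1, 2, 2 }: UNDEF
 * contributes no hardware instruction and SEND executes out of order, so
 * neither of them increments the counter.
 */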
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations; this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};
/**
 * Allow bitwise arithmetic of tgl_regdist_mode enums.
 */
tgl_regdist_mode
operator|(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return tgl_regdist_mode(unsigned(x) | unsigned(y));
}

tgl_regdist_mode
operator&(tgl_regdist_mode x, tgl_regdist_mode y)
{
   return tgl_regdist_mode(unsigned(x) & unsigned(y));
}

tgl_regdist_mode &
operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   return x = x | y;
}

tgl_regdist_mode &
operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
{
   return x = x & y;
}
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    */
   unsigned
   lookup(unsigned i) const
   {
      if (i < n && is[i] != i)
         return lookup(is[i]);
      else
         return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.
    *
    * The algorithm updates the internal representation recursively in
    * order to guarantee transitivity while preserving the previously
    * specified equivalence requirements.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   equivalence_relation(const equivalence_relation &);
   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened as much as
    * it's possible without increasing the number of recursive calls.
    */
   void
   assign(unsigned from, unsigned to)
   {
      if (is[from] != from)
         assign(is[from], to);

      is[from] = to;
   }

   unsigned *is;
   unsigned n;
};
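/* A minimal usage sketch of the equivalence relation (hypothetical IDs):
 * imposing 1 ~ 4 and 4 ~ 7 transitively makes 1 ~ 7, which can be checked
 * via lookup() or via the table returned by flatten().
 *
 *    equivalence_relation eq(8);
 *    eq.link(1, 4);
 *    eq.link(4, 7);
 *    assert(eq.lookup(1) == eq.lookup(7));
 *    const unsigned *ids = eq.flatten();   // ids[1] == ids[4] == ids[7]
 *    delete[] ids;
 */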
/**
 * Representation of a data dependency between two instructions in the
 * program.
 */
struct dependency {
   /**
    * No dependency information.
    */
   dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
                  unordered(TGL_SBID_NULL), id(0),
                  exec_all(false) {}

   /**
    * Construct a dependency on the in-order instruction with the provided
    * ordered_address instruction counter.
    */
   dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
      ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
      exec_all(exec_all) {}

   /**
    * Construct a dependency on the out-of-order instruction with the
    * specified synchronization token.
    */
   dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
      ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
      exec_all(exec_all) {}

   /**
    * Synchronization mode of in-order dependency, or zero if no in-order
    * dependency is present.
    */
   tgl_regdist_mode ordered;

   /**
    * Instruction counter of in-order dependency.
    *
    * For a dependency part of a different block in the program, this is
    * relative to the specific control flow path taken between the
    * dependency and the current block: It is the ordered_address such that
    * the difference between it and the ordered_address of the first
    * instruction of the current block is exactly the number of in-order
    * instructions across that control flow path.  It is not guaranteed to
    * be equal to the local ordered_address of the generating instruction
    * [as returned by ordered_inst_addresses()], except for block-local
    * dependencies.
    */
   ordered_address jp;

   /**
    * Synchronization mode of unordered dependency, or zero if no unordered
    * dependency is present.
    */
   tgl_sbid_mode unordered;

   /** Synchronization token of out-of-order dependency. */
   unsigned id;

   /**
    * Whether the dependency could be run with execution masking disabled,
    * which might lead to the unwanted execution of the generating
    * instruction in cases where a BB is executed with all channels
    * disabled due to hardware bug GEN:BUG:1407528679.
    */
   bool exec_all;

   /**
    * Trivial in-order dependency that's always satisfied.
    *
    * Note that unlike a default-constructed dependency() which is also
    * trivially satisfied, this is considered to provide dependency
    * information and can be used to clear a previously pending dependency.
    */
   static const dependency done;
   friend bool
   operator==(const dependency &dep0, const dependency &dep1)
   {
      return dep0.ordered == dep1.ordered &&
             dep0.jp == dep1.jp &&
             dep0.unordered == dep1.unordered &&
             dep0.id == dep1.id &&
             dep0.exec_all == dep1.exec_all;
   }

   friend bool
   operator!=(const dependency &dep0, const dependency &dep1)
   {
      return !(dep0 == dep1);
   }
};

const dependency dependency::done =
   dependency(TGL_REGDIST_SRC, INT_MIN, false);
/**
 * Return whether \p dep contains any dependency information.
 */
bool
is_valid(const dependency &dep)
{
   return dep.ordered || dep.unordered;
}
/**
 * Combine \p dep0 and \p dep1 into a single dependency object that is only
 * satisfied when both original dependencies are satisfied.  This might
 * involve updating the equivalence relation \p eq in order to make sure
 * that both out-of-order dependencies are assigned the same hardware SBID
 * as synchronization token.
 */
dependency
merge(equivalence_relation &eq,
      const dependency &dep0, const dependency &dep1)
{
   dependency dep;

   if (dep0.ordered || dep1.ordered) {
      dep.ordered = dep0.ordered | dep1.ordered;
      dep.jp = MAX2(dep0.jp, dep1.jp);
   }

   if (dep0.unordered || dep1.unordered) {
      dep.unordered = dep0.unordered | dep1.unordered;
      dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                       dep1.unordered ? dep1.id : dep0.id);
   }

   dep.exec_all = dep0.exec_all || dep1.exec_all;

   return dep;
}
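/* For intuition: merging a RegDist read dependency on jp == 5 with an SBID
 * destination dependency on token 3 yields a single dependency with
 * ordered == TGL_REGDIST_SRC, jp == 5, unordered == TGL_SBID_DST and
 * id == 3, i.e. an instruction subject to it has to honor both the RegDist
 * count and the SBID token.  When both inputs carry SBID tokens, link()
 * forces the two IDs into the same equivalence class so that they end up
 * mapped to the same hardware SBID.  (Values above are hypothetical.)
 */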
/**
 * Override dependency information of \p dep0 with that of \p dep1.
 */
dependency
shadow(const dependency &dep0, const dependency &dep1)
{
   return is_valid(dep1) ? dep1 : dep0;
}
/**
 * Translate dependency information across the program.
 *
 * This returns a dependency on the same instruction translated to the
 * ordered_address space of a different block.  The correct shift for
 * transporting a dependency across an edge of the CFG is the difference
 * between the local ordered_address of the first instruction of the target
 * block and the local ordered_address of the instruction immediately after
 * the end of the origin block.
 */
dependency
transport(dependency dep, int delta)
{
   if (dep.ordered && dep.jp > INT_MIN)
      dep.jp += delta;

   return dep;
}
/**
 * Return simplified dependency removing any synchronization modes not
 * applicable to an instruction reading the same register location.
 */
dependency
dependency_for_read(dependency dep)
{
   dep.ordered &= TGL_REGDIST_DST;
   return dep;
}
/**
 * Return simplified dependency removing any synchronization modes not
 * applicable to an instruction \p inst writing the same register location.
 */
dependency
dependency_for_write(const fs_inst *inst, dependency dep)
{
   if (!is_unordered(inst))
      dep.ordered &= TGL_REGDIST_DST;
   return dep;
}
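/* In other words, these helpers filter the tracked in-order dependency down
 * to the hazards that actually require synchronization: a read only has to
 * wait for a previous in-order *write* of the location (RAW), and an
 * in-order write likewise only has to wait for a previous in-order write
 * (WAW), because write-after-read ordering among in-order instructions is
 * already guaranteed by the in-order pipeline.  Out-of-order writers (e.g.
 * SENDs) keep the read component as well, hence the is_unordered() check in
 * dependency_for_write().
 */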
/**
 * Scoreboard representation.  This keeps track of the data dependencies of
 * registers with GRF granularity.
 */
class scoreboard {
public:
   /**
    * Look up the most current data dependency for register \p r.
    */
   dependency
   get(const fs_reg &r) const
   {
      if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
         return *p;
      else
         return dependency();
   }
   /**
    * Specify the most current data dependency for register \p r.
    */
   void
   set(const fs_reg &r, const dependency &d)
   {
      if (dependency *p = dep(r))
         *p = d;
   }
   /**
    * Component-wise merge() of corresponding dependencies from two
    * scoreboard objects.  \sa merge().
    */
   friend scoreboard
   merge(equivalence_relation &eq,
         const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }
   /**
    * Component-wise shadow() of corresponding dependencies from two
    * scoreboard objects.  \sa shadow().
    */
   friend scoreboard
   shadow(const scoreboard &sb0, const scoreboard &sb1)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

      sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]);

      return sb;
   }
   /**
    * Component-wise transport() of dependencies from a scoreboard
    * object.  \sa transport().
    */
   friend scoreboard
   transport(const scoreboard &sb0, int delta)
   {
      scoreboard sb;

      for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
         sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

      sb.addr_dep = transport(sb0.addr_dep, delta);

      for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++)
         sb.accum_deps[i] = transport(sb0.accum_deps[i], delta);

      return sb;
   }
   friend bool
   operator==(const scoreboard &sb0, const scoreboard &sb1)
   {
      for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
         if (sb0.grf_deps[i] != sb1.grf_deps[i])
            return false;
      }

      if (sb0.addr_dep != sb1.addr_dep)
         return false;

      for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) {
         if (sb0.accum_deps[i] != sb1.accum_deps[i])
            return false;
      }

      return true;
   }

   friend bool
   operator!=(const scoreboard &sb0, const scoreboard &sb1)
   {
      return !(sb0 == sb1);
   }
private:
   dependency grf_deps[BRW_MAX_GRF];
   dependency addr_dep;
   dependency accum_deps[10];

   dependency *
   dep(const fs_reg &r)
   {
      const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                            reg_offset(r) / REG_SIZE);

      return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
              r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] :
              r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                               reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
              r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                               reg < BRW_ARF_FLAG ? &accum_deps[
                                  reg - BRW_ARF_ACCUMULATOR] :
              NULL);
   }
};
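/* The mapping above tracks dependencies at REG_SIZE (one GRF) granularity:
 * e.g. a FIXED_GRF region starting at byte offset 3 * REG_SIZE lands in
 * grf_deps[3], the address register maps to the single addr_dep slot, and
 * accumulators get one slot each in accum_deps[].  Registers this pass
 * doesn't track (flags, etc.) fall through to NULL, which makes get()
 * return an empty dependency and set() a no-op for them.
 */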
/**
 * Dependency list handling.
 */
struct dependency_list {
   dependency_list() : deps(NULL), n(0) {}

   ~dependency_list() { free(deps); }

   void
   push_back(const dependency &dep)
   {
      deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
      deps[n++] = dep;
   }

   unsigned size() const { return n; }

   const dependency &
   operator[](unsigned i) const
   {
      assert(i < n);
      return deps[i];
   }

   dependency &
   operator[](unsigned i)
   {
      assert(i < n);
      return deps[i];
   }

private:
   dependency_list(const dependency_list &);
   dependency_list &
   operator=(const dependency_list &);

   dependency *deps;
   unsigned n;
};
/**
 * Add dependency \p dep to the list of dependencies of an instruction
 * \p deps.
 */
void
add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
{
   if (is_valid(dep)) {
      /* Translate the unordered dependency token first in order to keep
       * the list minimally redundant.
       */
      if (dep.unordered)
         dep.id = ids[dep.id];

      /* Try to combine the specified dependency with any existing ones. */
      for (unsigned i = 0; i < deps.size(); i++) {
         /* Don't combine otherwise matching dependencies if there is an
          * exec_all mismatch which would cause a SET dependency to gain an
          * exec_all flag, since that would prevent it from being baked
          * into the instruction we want to allocate an SBID for.
          */
         if (deps[i].exec_all != dep.exec_all &&
             (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
             (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
            continue;

         if (dep.ordered && deps[i].ordered) {
            deps[i].jp = MAX2(deps[i].jp, dep.jp);
            deps[i].ordered |= dep.ordered;
            deps[i].exec_all |= dep.exec_all;
            dep.ordered = TGL_REGDIST_NULL;
         }

         if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
            deps[i].unordered |= dep.unordered;
            deps[i].exec_all |= dep.exec_all;
            dep.unordered = TGL_SBID_NULL;
         }
      }

      /* Add it to the end of the list if necessary. */
      if (is_valid(dep))
         deps.push_back(dep);
   }
}
/**
 * Construct a tgl_swsb annotation encoding any ordered dependencies from
 * the dependency list \p deps of an instruction with ordered_address \p
 * jp.  If \p exec_all is false only dependencies known to be executed with
 * channel masking applied will be considered in the calculation.
 */
tgl_swsb
ordered_dependency_swsb(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   unsigned min_dist = ~0u;

   for (unsigned i = 0; i < deps.size(); i++) {
      if (deps[i].ordered && exec_all >= deps[i].exec_all) {
         const unsigned dist = jp - deps[i].jp;
         const unsigned max_dist = 10;
         assert(jp > deps[i].jp);
         if (dist <= max_dist)
            min_dist = MIN3(min_dist, dist, 7);
      }
   }

   return { min_dist == ~0u ? 0 : min_dist };
}
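/* Worked example (hypothetical numbers): an instruction at jp == 20 with
 * ordered dependencies at jp == 18 and jp == 13 has candidate distances 2
 * and 7, so the emitted annotation is RegDist 2 -- satisfying the nearest
 * dependency automatically satisfies the farther one, hence the MIN.
 * Distances are clamped to 7, the largest value representable in the
 * RegDist field, and anything more than max_dist == 10 in-order
 * instructions away is dropped on the assumption that it no longer needs
 * explicit synchronization.
 */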
/**
 * Return whether the dependency list \p deps of an instruction with
 * ordered_address \p jp has any non-trivial ordered dependencies.  If \p
 * exec_all is false only dependencies known to be executed with channel
 * masking applied will be considered in the calculation.
 */
unsigned
find_ordered_dependency(const dependency_list &deps,
                        const ordered_address &jp,
                        bool exec_all)
{
   return ordered_dependency_swsb(deps, jp, exec_all).regdist;
}
/**
 * Return the full tgl_sbid_mode bitset for the first unordered dependency
 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
 * no such dependency is present.  If \p exec_all is false only
 * dependencies known to be executed with channel masking applied will be
 * considered in the calculation.
 */
tgl_sbid_mode
find_unordered_dependency(const dependency_list &deps,
                          tgl_sbid_mode unordered,
                          bool exec_all)
{
   for (unsigned i = 0; i < deps.size(); i++) {
      if ((unordered & deps[i].unordered) &&
          exec_all >= deps[i].exec_all)
         return deps[i].unordered;
   }

   return TGL_SBID_NULL;
}
/**
 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
 * \p deps that can be represented directly in the SWSB annotation of the
 * instruction without additional SYNC instructions, or zero if no such
 * dependency is present.
 */
tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst *inst,
                                const dependency_list &deps,
                                const ordered_address &jp)
{
   const bool exec_all = inst->force_writemask_all;
   const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);

   if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
      return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
   else if (has_ordered && is_unordered(inst))
      return TGL_SBID_NULL;
   else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
            (!has_ordered || !is_unordered(inst)))
      return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
   else if (!has_ordered)
      return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
   else
      return TGL_SBID_NULL;
}
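/* Reading the cascade above: a SET dependency (the token the instruction
 * itself will signal) always wins, since it has to live on the instruction
 * it tracks.  An out-of-order instruction that also carries an ordered
 * dependency can't bake anything else, while an in-order instruction may
 * combine its RegDist annotation with a single $n.dst wait.  $n.src waits
 * are only baked when no RegDist annotation is needed at all; every other
 * combination falls back to a separate SYNC emitted later in this pass.
 */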
/**
 * Shader instruction dependency calculation.
 */

/**
 * Update scoreboard object \p sb to account for the execution of
 * instruction \p inst.
 */
void
update_inst_scoreboard(const ordered_address *jps,
                       const fs_inst *inst, unsigned ip, scoreboard &sb)
{
   const bool exec_all = inst->force_writemask_all;

   /* Track any source registers that may be fetched asynchronously by this
    * instruction; otherwise clear the dependency in order to avoid
    * subsequent redundant synchronization.
    */
   for (unsigned i = 0; i < inst->sources; i++) {
      const dependency rd_dep =
         (inst->is_payload(i) ||
          inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
         ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
         dependency::done;

      for (unsigned j = 0; j < regs_read(inst, i); j++)
         sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

      for (unsigned j = 0; j < inst->mlen; j++)
         sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
   }

   /* Track any destination registers of this instruction. */
   const dependency wr_dep =
      is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
      ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
      dependency();

   if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
       !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++)
         sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
   }
}
/**
 * Calculate scoreboard objects locally that represent any pending (and
 * unconditionally resolved) dependencies at the end of each block of the
 * program.
 */
scoreboard *
gather_block_scoreboards(const fs_visitor *shader,
                         const ordered_address *jps)
{
   scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
      update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);

   return sbs;
}
/**
 * Propagate data dependencies globally through the control flow graph
 * until a fixed point is reached.
 *
 * Calculates the set of dependencies potentially pending at the beginning
 * of each block, and returns it as an array of scoreboard objects.
 */
scoreboard *
propagate_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps,
                            equivalence_relation &eq)
{
   const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
   scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
   scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

   for (bool progress = true; progress;) {
      progress = false;

      foreach_block(block, shader->cfg) {
         const scoreboard sb = shadow(in_sbs[block->num],
                                      delta_sbs[block->num]);

         if (sb != out_sbs[block->num]) {
            foreach_list_typed(bblock_link, child_link, link,
                               &block->children) {
               scoreboard &in_sb = in_sbs[child_link->block->num];
               const int delta =
                  jps[child_link->block->start_ip] - jps[block->end_ip]
                  - ordered_unit(static_cast<const fs_inst *>(block->end()));

               in_sb = merge(eq, in_sb, transport(sb, delta));
            }

            out_sbs[block->num] = sb;
            progress = true;
         }
      }
   }

   delete[] delta_sbs;
   delete[] out_sbs;

   return in_sbs;
}
/**
 * Return the list of potential dependencies of each instruction in the
 * shader based on the result of global dependency analysis.
 */
dependency_list *
gather_inst_dependencies(const fs_visitor *shader,
                         const ordered_address *jps)
{
   equivalence_relation eq(num_instructions(shader));
   scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
   const unsigned *ids = eq.flatten();
   dependency_list *deps = new dependency_list[num_instructions(shader)];
   unsigned ip = 0;

   foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      scoreboard &sb = sbs[block->num];

      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            add_dependency(ids, deps[ip], dependency_for_read(
               sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
      }

      if (is_unordered(inst))
         add_dependency(ids, deps[ip],
                        dependency(TGL_SBID_SET, ip, exec_all));

      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(byte_offset(inst->dst, REG_SIZE * j))));
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
               add_dependency(ids, deps[ip], dependency_for_write(inst,
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }
      }

      update_inst_scoreboard(jps, inst, ip, sb);
      ip++;
   }

   delete[] sbs;
   delete[] ids;

   return deps;
}
/**
 * Allocate SBID tokens to track the execution of every out-of-order
 * instruction of the shader.
 */
dependency_list *
allocate_inst_dependencies(const fs_visitor *shader,
                           const dependency_list *deps0)
{
   /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
    *       shaders with a large number of SEND messages.
    */

   /* Allocate an unordered dependency ID to hardware SBID translation
    * table with as many entries as instructions there are in the shader,
    * which is the maximum number of unordered IDs we can find in the
    * program.
    */
   unsigned *ids = new unsigned[num_instructions(shader)];
   for (unsigned ip = 0; ip < num_instructions(shader); ip++)
      ids[ip] = ~0u;

   dependency_list *deps1 = new dependency_list[num_instructions(shader)];
   unsigned next_id = 0;

   for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
      for (unsigned i = 0; i < deps0[ip].size(); i++) {
         const dependency &dep = deps0[ip][i];

         if (dep.unordered && ids[dep.id] == ~0u)
            ids[dep.id] = (next_id++) & 0xf;

         add_dependency(ids, deps1[ip], dep);
      }
   }

   delete[] ids;

   return deps1;
}
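/* The '& 0xf' wraps the running counter around the pool of 16 hardware
 * SBID tokens, so tokens are recycled round-robin once more than 16
 * out-of-order instructions are in flight.  E.g. the 17th allocated ID
 * maps back to hardware SBID 0 (hypothetical count); the equivalence
 * classes built during merge() make sure every dependency on the same
 * out-of-order instruction ends up referring to the same token.
 */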
/**
 * Emit dependency information provided by \p deps into the shader,
 * inserting additional SYNC instructions for dependencies that can't be
 * represented directly by annotating existing instructions.
 */
void
emit_inst_dependencies(fs_visitor *shader,
                       const ordered_address *jps,
                       const dependency_list *deps)
{
   unsigned ip = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
      const bool exec_all = inst->force_writemask_all;
      tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.unordered) {
            if (unordered_mode == dep.unordered &&
                exec_all >= dep.exec_all && !swsb.mode) {
               /* Bake unordered dependency into the instruction's SWSB if
                * possible, except in cases where the current instruction
                * isn't marked NoMask but the dependency is, since that
                * might lead to data coherency issues due to
                * GEN:BUG:1407528679.
                */
               swsb.sbid = dep.id;
               swsb.mode = dep.unordered;
            } else {
               /* Emit dependency into the SWSB of an extra SYNC
                * instruction.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched.sbid = dep.id;
               sync->sched.mode = dep.unordered;
               assert(!(sync->sched.mode & TGL_SBID_SET));
            }
         }
      }

      for (unsigned i = 0; i < deps[ip].size(); i++) {
         const dependency &dep = deps[ip][i];

         if (dep.ordered && dep.exec_all > exec_all &&
             find_ordered_dependency(deps[ip], jps[ip], true)) {
            /* If the current instruction is not marked NoMask but an
             * ordered dependency is, perform the synchronization as a
             * separate NoMask SYNC instruction in order to avoid data
             * coherency issues due to GEN:BUG:1407528679.  The similar
             * scenario with unordered dependencies should have been
             * handled above.
             */
            const fs_builder ibld = fs_builder(shader, block, inst)
                                    .exec_all().group(1, 0);
            fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                      brw_imm_ud(TGL_SYNC_NOP));
            sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
            break;
         }
      }

      /* Update the IR. */
      inst->sched = swsb;
      inst->no_dd_check = inst->no_dd_clear = false;
      ip++;
   }
}
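/* Putting it together for a single instruction (hypothetical case): if its
 * dependency list holds a $1.dst dependency that
 * baked_unordered_dependency_mode() allows in the SWSB field plus an
 * additional $3.src dependency, the first one is encoded on the instruction
 * itself and the second one becomes a preceding NoMask "sync nop" carrying
 * $3.src in its own SWSB, since an instruction can only name one SBID token
 * directly.
 */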
bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->gen >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);

      emit_inst_dependencies(this, jps, deps1);

      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}