/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 /** @file brw_fs_scoreboard.cpp
26 * Gen12+ hardware lacks the register scoreboard logic that used to guarantee
27 * data coherency between register reads and writes in previous generations.
28 * This lowering pass runs after register allocation in order to make up for
31 * It works by performing global dataflow analysis in order to determine the
32 * set of potential dependencies of every instruction in the shader, and then
33 * inserts any required SWSB annotations and additional SYNC instructions in
34 * order to guarantee data coherency.
36 * WARNING - Access of the following (rarely used) ARF registers is not
37 * tracked here, and require the RegDist SWSB annotation to be set
38 * to 1 by the generator in order to avoid data races:
41 * - sr0 state register
42 * - cr0 control register
43 * - ip instruction pointer
44 * - tm0 timestamp register
45 * - dbg0 debug register
47 * The following ARF registers don't need to be tracked here because data
48 * coherency is still provided transparently by the hardware:
50 * - f0-1 flag registers
51 * - n0 notification register
52 * - tdr0 thread dependency register
62 * In-order instruction accounting.
67 * Number of in-order hardware instructions contained in this IR
68 * instruction. This determines the increment applied to the RegDist
69 * counter calculated for any ordered dependency that crosses this
73 ordered_unit(const fs_inst
*inst
)
75 switch (inst
->opcode
) {
78 case SHADER_OPCODE_UNDEF
:
79 case FS_OPCODE_PLACEHOLDER_HALT
:
82 /* Note that the following is inaccurate for virtual instructions
83 * that expand to more in-order instructions than assumed here, but
84 * that can only lead to suboptimal execution ordering, data
85 * coherency won't be impacted. Providing exact RegDist counts for
86 * each virtual instruction would allow better ALU performance, but
87 * it would require keeping this switch statement in perfect sync
88 * with the generator in order to avoid data corruption. Lesson is
89 * (again) don't use virtual instructions if you want optimal
92 return is_unordered(inst
) ? 0 : 1;
/**
 * Type for an instruction counter that increments for in-order
 * instructions only, arbitrarily denoted 'jp' throughout this lowering
 * pass in order to distinguish it from the regular instruction counter.
 */
typedef int ordered_address;
104 * Return the number of instructions in the program.
107 num_instructions(const backend_shader
*shader
)
109 return shader
->cfg
->blocks
[shader
->cfg
->num_blocks
- 1]->end_ip
+ 1;
113 * Calculate the local ordered_address instruction counter at every
114 * instruction of the shader for subsequent constant-time look-up.
117 ordered_inst_addresses(const fs_visitor
*shader
)
119 ordered_address
*jps
= new ordered_address
[num_instructions(shader
)];
120 ordered_address jp
= 0;
123 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
125 jp
+= ordered_unit(inst
);
/**
 * Synchronization mode required for data manipulated by in-order
 * instructions.
 *
 * Similar to tgl_sbid_mode, but without SET mode.  Defined as a separate
 * enum for additional type safety.  The hardware doesn't provide control
 * over the synchronization mode for RegDist annotations, this is only used
 * internally in this pass in order to optimize out redundant read
 * dependencies where possible.
 */
enum tgl_regdist_mode {
   TGL_REGDIST_NULL = 0,
   /* NOTE(review): the SRC/DST enumerators were dropped by the extraction;
    * values reconstructed as single bits so the bitwise operators below
    * compose them -- confirm against upstream.
    */
   TGL_REGDIST_SRC = 1,
   TGL_REGDIST_DST = 2
};
149 * Allow bitwise arithmetic of tgl_regdist_mode enums.
152 operator|(tgl_regdist_mode x
, tgl_regdist_mode y
)
154 return tgl_regdist_mode(unsigned(x
) | unsigned(y
));
158 operator&(tgl_regdist_mode x
, tgl_regdist_mode y
)
160 return tgl_regdist_mode(unsigned(x
) & unsigned(y
));
164 operator|=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
170 operator&=(tgl_regdist_mode
&x
, tgl_regdist_mode y
)
/**
 * Representation of an equivalence relation among the set of unsigned
 * integers.
 *
 * Its initial state is the identity relation '~' such that i ~ j if and
 * only if i == j for every pair of unsigned integers i and j.
 */
struct equivalence_relation {
   /** Construct the identity relation over \p n elements. */
   equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
   {
      for (unsigned i = 0; i < n; i++)
         is[i] = i;
   }

   ~equivalence_relation()
   {
      delete[] is;
   }

   /**
    * Return equivalence class index of the specified element.  Effectively
    * this is the numeric value of an arbitrary representative from the
    * equivalence class.
    *
    * Allows the evaluation of the equivalence relation according to the
    * rule that i ~ j if and only if lookup(i) == lookup(j).
    *
    * Elements at or beyond \p n are their own representative.
    */
   unsigned
   lookup(unsigned i) const
   {
      if (i < n && is[i] != i)
         return lookup(is[i]);
      else
         return i;
   }

   /**
    * Create an array with the results of the lookup() method for
    * constant-time evaluation.  Caller owns (and must delete[]) the
    * returned array.
    */
   unsigned *
   flatten() const
   {
      unsigned *ids = new unsigned[n];

      for (unsigned i = 0; i < n; i++)
         ids[i] = lookup(i);

      return ids;
   }

   /**
    * Mutate the existing equivalence relation minimally by imposing the
    * additional requirement that i ~ j.
    *
    * The algorithm updates the internal representation recursively in
    * order to guarantee transitivity while preserving the previously
    * specified equivalence requirements.
    *
    * \return The representative of the combined equivalence class.
    */
   unsigned
   link(unsigned i, unsigned j)
   {
      const unsigned k = lookup(i);
      assign(i, k);
      assign(j, k);
      return k;
   }

private:
   /* Copying is disabled -- the relation owns a raw array. */
   equivalence_relation(const equivalence_relation &);
   equivalence_relation &
   operator=(const equivalence_relation &);

   /**
    * Assign the representative of \p from to be equivalent to \p to.
    *
    * At the same time the data structure is partially flattened as much as
    * it's possible without increasing the number of recursive calls.
    */
   void
   assign(unsigned from, unsigned to)
   {
      if (from != to) {
         assert(from < n);

         if (is[from] != from)
            assign(is[from], to);

         is[from] = to;
      }
   }

   /* is[i] points towards the representative of i's class; is[i] == i for
    * class representatives.
    */
   unsigned *is;
   unsigned n;
};
275 * Representation of a data dependency between two instructions in the
281 * No dependency information.
283 dependency() : ordered(TGL_REGDIST_NULL
), jp(INT_MIN
),
284 unordered(TGL_SBID_NULL
), id(0) {}
287 * Construct a dependency on the in-order instruction with the provided
288 * ordered_address instruction counter.
290 dependency(tgl_regdist_mode mode
, ordered_address jp
) :
291 ordered(mode
), jp(jp
), unordered(TGL_SBID_NULL
), id(0) {}
294 * Construct a dependency on the out-of-order instruction with the
295 * specified synchronization token.
297 dependency(tgl_sbid_mode mode
, unsigned id
) :
298 ordered(TGL_REGDIST_NULL
), jp(INT_MIN
), unordered(mode
), id(id
) {}
301 * Synchronization mode of in-order dependency, or zero if no in-order
302 * dependency is present.
304 tgl_regdist_mode ordered
;
307 * Instruction counter of in-order dependency.
309 * For a dependency part of a different block in the program, this is
310 * relative to the specific control flow path taken between the
311 * dependency and the current block: It is the ordered_address such that
312 * the difference between it and the ordered_address of the first
313 * instruction of the current block is exactly the number of in-order
314 * instructions across that control flow path. It is not guaranteed to
315 * be equal to the local ordered_address of the generating instruction
316 * [as returned by ordered_inst_addresses()], except for block-local
322 * Synchronization mode of unordered dependency, or zero if no unordered
323 * dependency is present.
325 tgl_sbid_mode unordered
;
327 /** Synchronization token of out-of-order dependency. */
331 * Trivial in-order dependency that's always satisfied.
333 * Note that unlike a default-constructed dependency() which is also
334 * trivially satisfied, this is considered to provide dependency
335 * information and can be used to clear a previously pending dependency
338 static const dependency done
;
341 operator==(const dependency
&dep0
, const dependency
&dep1
)
343 return dep0
.ordered
== dep1
.ordered
&&
344 dep0
.jp
== dep1
.jp
&&
345 dep0
.unordered
== dep1
.unordered
&&
350 operator!=(const dependency
&dep0
, const dependency
&dep1
)
352 return !(dep0
== dep1
);
356 const dependency
dependency::done
= dependency(TGL_REGDIST_SRC
, INT_MIN
);
359 * Return whether \p dep contains any dependency information.
362 is_valid(const dependency
&dep
)
364 return dep
.ordered
|| dep
.unordered
;
368 * Combine \p dep0 and \p dep1 into a single dependency object that is only
369 * satisfied when both original dependencies are satisfied. This might
370 * involve updating the equivalence relation \p eq in order to make sure
371 * that both out-of-order dependencies are assigned the same hardware SBID
372 * as synchronization token.
375 merge(equivalence_relation
&eq
,
376 const dependency
&dep0
, const dependency
&dep1
)
380 if (dep0
.ordered
|| dep1
.ordered
) {
381 dep
.ordered
= dep0
.ordered
| dep1
.ordered
;
382 dep
.jp
= MAX2(dep0
.jp
, dep1
.jp
);
385 if (dep0
.unordered
|| dep1
.unordered
) {
386 dep
.unordered
= dep0
.unordered
| dep1
.unordered
;
387 dep
.id
= eq
.link(dep0
.unordered
? dep0
.id
: dep1
.id
,
388 dep1
.unordered
? dep1
.id
: dep0
.id
);
395 * Override dependency information of \p dep0 with that of \p dep1.
398 shadow(const dependency
&dep0
, const dependency
&dep1
)
400 return is_valid(dep1
) ? dep1
: dep0
;
404 * Translate dependency information across the program.
406 * This returns a dependency on the same instruction translated to the
407 * ordered_address space of a different block. The correct shift for
408 * transporting a dependency across an edge of the CFG is the difference
409 * between the local ordered_address of the first instruction of the target
410 * block and the local ordered_address of the instruction immediately after
411 * the end of the origin block.
414 transport(dependency dep
, int delta
)
416 if (dep
.ordered
&& dep
.jp
> INT_MIN
)
423 * Return simplified dependency removing any synchronization modes not
424 * applicable to an instruction reading the same register location.
427 dependency_for_read(dependency dep
)
429 dep
.ordered
&= TGL_REGDIST_DST
;
434 * Return simplified dependency removing any synchronization modes not
435 * applicable to an instruction \p inst writing the same register location.
438 dependency_for_write(const fs_inst
*inst
, dependency dep
)
440 if (!is_unordered(inst
))
441 dep
.ordered
&= TGL_REGDIST_DST
;
448 * Scoreboard representation. This keeps track of the data dependencies of
449 * registers with GRF granularity.
454 * Look up the most current data dependency for register \p r.
457 get(const fs_reg
&r
) const
459 if (const dependency
*p
= const_cast<scoreboard
*>(this)->dep(r
))
466 * Specify the most current data dependency for register \p r.
469 set(const fs_reg
&r
, const dependency
&d
)
471 if (dependency
*p
= dep(r
))
476 * Component-wise merge() of corresponding dependencies from two
477 * scoreboard objects. \sa merge().
480 merge(equivalence_relation
&eq
,
481 const scoreboard
&sb0
, const scoreboard
&sb1
)
485 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
486 sb
.grf_deps
[i
] = merge(eq
, sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
488 sb
.addr_dep
= merge(eq
, sb0
.addr_dep
, sb1
.addr_dep
);
490 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
491 sb
.accum_deps
[i
] = merge(eq
, sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
497 * Component-wise shadow() of corresponding dependencies from two
498 * scoreboard objects. \sa shadow().
501 shadow(const scoreboard
&sb0
, const scoreboard
&sb1
)
505 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
506 sb
.grf_deps
[i
] = shadow(sb0
.grf_deps
[i
], sb1
.grf_deps
[i
]);
508 sb
.addr_dep
= shadow(sb0
.addr_dep
, sb1
.addr_dep
);
510 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
511 sb
.accum_deps
[i
] = shadow(sb0
.accum_deps
[i
], sb1
.accum_deps
[i
]);
517 * Component-wise transport() of dependencies from a scoreboard
518 * object. \sa transport().
521 transport(const scoreboard
&sb0
, int delta
)
525 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.grf_deps
); i
++)
526 sb
.grf_deps
[i
] = transport(sb0
.grf_deps
[i
], delta
);
528 sb
.addr_dep
= transport(sb0
.addr_dep
, delta
);
530 for (unsigned i
= 0; i
< ARRAY_SIZE(sb
.accum_deps
); i
++)
531 sb
.accum_deps
[i
] = transport(sb0
.accum_deps
[i
], delta
);
537 operator==(const scoreboard
&sb0
, const scoreboard
&sb1
)
539 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.grf_deps
); i
++) {
540 if (sb0
.grf_deps
[i
] != sb1
.grf_deps
[i
])
544 if (sb0
.addr_dep
!= sb1
.addr_dep
)
547 for (unsigned i
= 0; i
< ARRAY_SIZE(sb0
.accum_deps
); i
++) {
548 if (sb0
.accum_deps
[i
] != sb1
.accum_deps
[i
])
556 operator!=(const scoreboard
&sb0
, const scoreboard
&sb1
)
558 return !(sb0
== sb1
);
562 dependency grf_deps
[BRW_MAX_GRF
];
564 dependency accum_deps
[10];
569 const unsigned reg
= (r
.file
== VGRF
? r
.nr
+ r
.offset
/ REG_SIZE
:
570 reg_offset(r
) / REG_SIZE
);
572 return (r
.file
== VGRF
|| r
.file
== FIXED_GRF
? &grf_deps
[reg
] :
573 r
.file
== MRF
? &grf_deps
[GEN7_MRF_HACK_START
+ reg
] :
574 r
.file
== ARF
&& reg
>= BRW_ARF_ADDRESS
&&
575 reg
< BRW_ARF_ACCUMULATOR
? &addr_dep
:
576 r
.file
== ARF
&& reg
>= BRW_ARF_ACCUMULATOR
&&
577 reg
< BRW_ARF_FLAG
? &accum_deps
[
578 reg
- BRW_ARF_ACCUMULATOR
] :
584 * Dependency list handling.
587 struct dependency_list
{
588 dependency_list() : deps(NULL
), n(0) {}
596 push_back(const dependency
&dep
)
598 deps
= (dependency
*)realloc(deps
, (n
+ 1) * sizeof(*deps
));
609 operator[](unsigned i
) const
616 operator[](unsigned i
)
623 dependency_list(const dependency_list
&);
625 operator=(const dependency_list
&);
632 * Add dependency \p dep to the list of dependencies of an instruction
636 add_dependency(const unsigned *ids
, dependency_list
&deps
, dependency dep
)
639 /* Translate the unordered dependency token first in order to keep
640 * the list minimally redundant.
643 dep
.id
= ids
[dep
.id
];
645 /* Try to combine the specified dependency with any existing ones. */
646 for (unsigned i
= 0; i
< deps
.size(); i
++) {
647 if (dep
.ordered
&& deps
[i
].ordered
) {
648 deps
[i
].jp
= MAX2(deps
[i
].jp
, dep
.jp
);
649 deps
[i
].ordered
|= dep
.ordered
;
650 dep
.ordered
= TGL_REGDIST_NULL
;
653 if (dep
.unordered
&& deps
[i
].unordered
&& deps
[i
].id
== dep
.id
) {
654 deps
[i
].unordered
|= dep
.unordered
;
655 dep
.unordered
= TGL_SBID_NULL
;
659 /* Add it to the end of the list if necessary. */
666 * Construct a tgl_swsb annotation encoding any ordered dependencies from
667 * the dependency list \p deps of an instruction with ordered_address
671 ordered_dependency_swsb(const dependency_list
&deps
,
672 const ordered_address
&jp
)
674 unsigned min_dist
= ~0u;
676 for (unsigned i
= 0; i
< deps
.size(); i
++) {
677 if (deps
[i
].ordered
) {
678 const unsigned dist
= jp
- deps
[i
].jp
;
679 const unsigned max_dist
= 10;
680 assert(jp
> deps
[i
].jp
);
681 if (dist
<= max_dist
)
682 min_dist
= MIN3(min_dist
, dist
, 7);
686 return { min_dist
== ~0u ? 0 : min_dist
};
690 * Return whether the dependency list \p deps of an instruction with
691 * ordered_address \p jp has any non-trivial ordered dependencies.
694 find_ordered_dependency(const dependency_list
&deps
,
695 const ordered_address
&jp
)
697 return ordered_dependency_swsb(deps
, jp
).regdist
;
701 * Return the full tgl_sbid_mode bitset for the first unordered dependency
702 * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
703 * no such dependency is present.
706 find_unordered_dependency(const dependency_list
&deps
,
707 tgl_sbid_mode unordered
)
710 for (unsigned i
= 0; i
< deps
.size(); i
++) {
711 if (unordered
& deps
[i
].unordered
)
712 return deps
[i
].unordered
;
716 return TGL_SBID_NULL
;
720 * Return the tgl_sbid_mode bitset of an unordered dependency from the list
721 * \p deps that can be represented directly in the SWSB annotation of the
722 * instruction without additional SYNC instructions, or zero if no such
723 * dependency is present.
726 baked_unordered_dependency_mode(const fs_inst
*inst
,
727 const dependency_list
&deps
,
728 const ordered_address
&jp
)
730 const bool has_ordered
= find_ordered_dependency(deps
, jp
);
732 if (find_unordered_dependency(deps
, TGL_SBID_SET
))
733 return find_unordered_dependency(deps
, TGL_SBID_SET
);
734 else if (has_ordered
&& is_unordered(inst
))
735 return TGL_SBID_NULL
;
736 else if (find_unordered_dependency(deps
, TGL_SBID_DST
) &&
737 (!has_ordered
|| !is_unordered(inst
)))
738 return find_unordered_dependency(deps
, TGL_SBID_DST
);
739 else if (!has_ordered
)
740 return find_unordered_dependency(deps
, TGL_SBID_SRC
);
742 return TGL_SBID_NULL
;
748 * Shader instruction dependency calculation.
753 * Update scoreboard object \p sb to account for the execution of
754 * instruction \p inst.
757 update_inst_scoreboard(const fs_visitor
*shader
, const ordered_address
*jps
,
758 const fs_inst
*inst
, unsigned ip
, scoreboard
&sb
)
760 /* Track any source registers that may be fetched asynchronously by this
761 * instruction, otherwise clear the dependency in order to avoid
762 * subsequent redundant synchronization.
764 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
765 const dependency rd_dep
=
766 inst
->is_payload(i
) || inst
->is_math() ? dependency(TGL_SBID_SRC
, ip
) :
767 ordered_unit(inst
) ? dependency(TGL_REGDIST_SRC
, jps
[ip
]) :
770 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
771 sb
.set(byte_offset(inst
->src
[i
], REG_SIZE
* j
), rd_dep
);
774 if (is_send(inst
) && inst
->base_mrf
!= -1) {
775 const dependency rd_dep
= dependency(TGL_SBID_SRC
, ip
);
777 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
778 sb
.set(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0), rd_dep
);
781 /* Track any destination registers of this instruction. */
782 const dependency wr_dep
=
783 is_unordered(inst
) ? dependency(TGL_SBID_DST
, ip
) :
784 ordered_unit(inst
) ? dependency(TGL_REGDIST_DST
, jps
[ip
]) :
787 if (is_valid(wr_dep
) && inst
->dst
.file
!= BAD_FILE
&&
788 !inst
->dst
.is_null()) {
789 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
790 sb
.set(byte_offset(inst
->dst
, REG_SIZE
* j
), wr_dep
);
795 * Calculate scoreboard objects locally that represent any pending (and
796 * unconditionally resolved) dependencies at the end of each block of the
800 gather_block_scoreboards(const fs_visitor
*shader
,
801 const ordered_address
*jps
)
803 scoreboard
*sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
806 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
)
807 update_inst_scoreboard(shader
, jps
, inst
, ip
++, sbs
[block
->num
]);
813 * Propagate data dependencies globally through the control flow graph
814 * until a fixed point is reached.
816 * Calculates the set of dependencies potentially pending at the beginning
817 * of each block, and returns it as an array of scoreboard objects.
820 propagate_block_scoreboards(const fs_visitor
*shader
,
821 const ordered_address
*jps
,
822 equivalence_relation
&eq
)
824 const scoreboard
*delta_sbs
= gather_block_scoreboards(shader
, jps
);
825 scoreboard
*in_sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
826 scoreboard
*out_sbs
= new scoreboard
[shader
->cfg
->num_blocks
];
828 for (bool progress
= true; progress
;) {
831 foreach_block(block
, shader
->cfg
) {
832 const scoreboard sb
= shadow(in_sbs
[block
->num
],
833 delta_sbs
[block
->num
]);
835 if (sb
!= out_sbs
[block
->num
]) {
836 foreach_list_typed(bblock_link
, child_link
, link
,
838 scoreboard
&in_sb
= in_sbs
[child_link
->block
->num
];
840 jps
[child_link
->block
->start_ip
] - jps
[block
->end_ip
]
841 - ordered_unit(static_cast<const fs_inst
*>(block
->end()));
843 in_sb
= merge(eq
, in_sb
, transport(sb
, delta
));
846 out_sbs
[block
->num
] = sb
;
859 * Return the list of potential dependencies of each instruction in the
860 * shader based on the result of global dependency analysis.
863 gather_inst_dependencies(const fs_visitor
*shader
,
864 const ordered_address
*jps
)
866 equivalence_relation
eq(num_instructions(shader
));
867 scoreboard
*sbs
= propagate_block_scoreboards(shader
, jps
, eq
);
868 const unsigned *ids
= eq
.flatten();
869 dependency_list
*deps
= new dependency_list
[num_instructions(shader
)];
872 foreach_block_and_inst(block
, fs_inst
, inst
, shader
->cfg
) {
873 scoreboard
&sb
= sbs
[block
->num
];
875 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
876 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
877 add_dependency(ids
, deps
[ip
], dependency_for_read(
878 sb
.get(byte_offset(inst
->src
[i
], REG_SIZE
* j
))));
881 if (is_send(inst
) && inst
->base_mrf
!= -1) {
882 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
883 add_dependency(ids
, deps
[ip
], dependency_for_read(
884 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
887 if (is_unordered(inst
))
888 add_dependency(ids
, deps
[ip
], dependency(TGL_SBID_SET
, ip
));
890 if (!inst
->no_dd_check
) {
891 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
892 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
893 add_dependency(ids
, deps
[ip
], dependency_for_write(inst
,
894 sb
.get(byte_offset(inst
->dst
, REG_SIZE
* j
))));
898 if (is_send(inst
) && inst
->base_mrf
!= -1) {
899 for (int j
= 0; j
< shader
->implied_mrf_writes(inst
); j
++)
900 add_dependency(ids
, deps
[ip
], dependency_for_write(inst
,
901 sb
.get(brw_uvec_mrf(8, inst
->base_mrf
+ j
, 0))));
905 update_inst_scoreboard(shader
, jps
, inst
, ip
, sb
);
918 * Allocate SBID tokens to track the execution of every out-of-order
919 * instruction of the shader.
922 allocate_inst_dependencies(const fs_visitor
*shader
,
923 const dependency_list
*deps0
)
925 /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
926 * shaders with a large number of SEND messages.
929 /* Allocate an unordered dependency ID to hardware SBID translation
930 * table with as many entries as instructions there are in the shader,
931 * which is the maximum number of unordered IDs we can find in the
934 unsigned *ids
= new unsigned[num_instructions(shader
)];
935 for (unsigned ip
= 0; ip
< num_instructions(shader
); ip
++)
938 dependency_list
*deps1
= new dependency_list
[num_instructions(shader
)];
939 unsigned next_id
= 0;
941 for (unsigned ip
= 0; ip
< num_instructions(shader
); ip
++) {
942 for (unsigned i
= 0; i
< deps0
[ip
].size(); i
++) {
943 const dependency
&dep
= deps0
[ip
][i
];
945 if (dep
.unordered
&& ids
[dep
.id
] == ~0u)
946 ids
[dep
.id
] = (next_id
++) & 0xf;
948 add_dependency(ids
, deps1
[ip
], dep
);
958 * Emit dependency information provided by \p deps into the shader,
959 * inserting additional SYNC instructions for dependencies that can't be
960 * represented directly by annotating existing instructions.
963 emit_inst_dependencies(fs_visitor
*shader
,
964 const ordered_address
*jps
,
965 const dependency_list
*deps
)
969 foreach_block_and_inst_safe(block
, fs_inst
, inst
, shader
->cfg
) {
970 tgl_swsb swsb
= ordered_dependency_swsb(deps
[ip
], jps
[ip
]);
971 const tgl_sbid_mode unordered_mode
=
972 baked_unordered_dependency_mode(inst
, deps
[ip
], jps
[ip
]);
974 for (unsigned i
= 0; i
< deps
[ip
].size(); i
++) {
975 const dependency
&dep
= deps
[ip
][i
];
978 if (unordered_mode
== dep
.unordered
&& !swsb
.mode
) {
979 /* Bake unordered dependency into the instruction's SWSB if
983 swsb
.mode
= dep
.unordered
;
985 /* Emit dependency into the SWSB of an extra SYNC
988 const fs_builder ibld
= fs_builder(shader
, block
, inst
)
989 .exec_all().group(1, 0);
990 fs_inst
*sync
= ibld
.emit(BRW_OPCODE_SYNC
, ibld
.null_reg_ud(),
991 brw_imm_ud(TGL_SYNC_NOP
));
992 sync
->sched
.sbid
= dep
.id
;
993 sync
->sched
.mode
= dep
.unordered
;
994 assert(!(sync
->sched
.mode
& TGL_SBID_SET
));
1001 inst
->no_dd_check
= inst
->no_dd_clear
= false;
1008 fs_visitor::lower_scoreboard()
1010 if (devinfo
->gen
>= 12) {
1011 const ordered_address
*jps
= ordered_inst_addresses(this);
1012 const dependency_list
*deps0
= gather_inst_dependencies(this, jps
);
1013 const dependency_list
*deps1
= allocate_inst_dependencies(this, deps0
);
1014 emit_inst_dependencies(this, jps
, deps1
);