/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
37 void join(const NOP_ctx_gfx6
&other
) {
38 set_vskip_mode_then_vector
= MAX2(set_vskip_mode_then_vector
, other
.set_vskip_mode_then_vector
);
39 valu_wr_vcc_then_vccz
= MAX2(valu_wr_vcc_then_vccz
, other
.valu_wr_vcc_then_vccz
);
40 valu_wr_exec_then_execz
= MAX2(valu_wr_exec_then_execz
, other
.valu_wr_exec_then_execz
);
41 valu_wr_vcc_then_div_fmas
= MAX2(valu_wr_vcc_then_div_fmas
, other
.valu_wr_vcc_then_div_fmas
);
42 salu_wr_m0_then_gds_msg_ttrace
= MAX2(salu_wr_m0_then_gds_msg_ttrace
, other
.salu_wr_m0_then_gds_msg_ttrace
);
43 valu_wr_exec_then_dpp
= MAX2(valu_wr_exec_then_dpp
, other
.valu_wr_exec_then_dpp
);
44 salu_wr_m0_then_lds
= MAX2(salu_wr_m0_then_lds
, other
.salu_wr_m0_then_lds
);
45 salu_wr_m0_then_moverel
= MAX2(salu_wr_m0_then_moverel
, other
.salu_wr_m0_then_moverel
);
46 setreg_then_getsetreg
= MAX2(setreg_then_getsetreg
, other
.setreg_then_getsetreg
);
47 vmem_store_then_wr_data
|= other
.vmem_store_then_wr_data
;
48 smem_clause
|= other
.smem_clause
;
49 smem_write
|= other
.smem_write
;
50 for (unsigned i
= 0; i
< BITSET_WORDS(128); i
++) {
51 smem_clause_read_write
[i
] |= other
.smem_clause_read_write
[i
];
52 smem_clause_write
[i
] |= other
.smem_clause_write
[i
];
56 bool operator==(const NOP_ctx_gfx6
&other
)
59 set_vskip_mode_then_vector
== other
.set_vskip_mode_then_vector
&&
60 valu_wr_vcc_then_vccz
== other
.valu_wr_vcc_then_vccz
&&
61 valu_wr_exec_then_execz
== other
.valu_wr_exec_then_execz
&&
62 valu_wr_vcc_then_div_fmas
== other
.valu_wr_vcc_then_div_fmas
&&
63 vmem_store_then_wr_data
== other
.vmem_store_then_wr_data
&&
64 salu_wr_m0_then_gds_msg_ttrace
== other
.salu_wr_m0_then_gds_msg_ttrace
&&
65 valu_wr_exec_then_dpp
== other
.valu_wr_exec_then_dpp
&&
66 salu_wr_m0_then_lds
== other
.salu_wr_m0_then_lds
&&
67 salu_wr_m0_then_moverel
== other
.salu_wr_m0_then_moverel
&&
68 setreg_then_getsetreg
== other
.setreg_then_getsetreg
&&
69 smem_clause
== other
.smem_clause
&&
70 smem_write
== other
.smem_write
&&
71 BITSET_EQUAL(smem_clause_read_write
, other
.smem_clause_read_write
) &&
72 BITSET_EQUAL(smem_clause_write
, other
.smem_clause_write
);
75 void add_wait_states(unsigned amount
)
77 if ((set_vskip_mode_then_vector
-= amount
) < 0)
78 set_vskip_mode_then_vector
= 0;
80 if ((valu_wr_vcc_then_vccz
-= amount
) < 0)
81 valu_wr_vcc_then_vccz
= 0;
83 if ((valu_wr_exec_then_execz
-= amount
) < 0)
84 valu_wr_exec_then_execz
= 0;
86 if ((valu_wr_vcc_then_div_fmas
-= amount
) < 0)
87 valu_wr_vcc_then_div_fmas
= 0;
89 if ((salu_wr_m0_then_gds_msg_ttrace
-= amount
) < 0)
90 salu_wr_m0_then_gds_msg_ttrace
= 0;
92 if ((valu_wr_exec_then_dpp
-= amount
) < 0)
93 valu_wr_exec_then_dpp
= 0;
95 if ((salu_wr_m0_then_lds
-= amount
) < 0)
96 salu_wr_m0_then_lds
= 0;
98 if ((salu_wr_m0_then_moverel
-= amount
) < 0)
99 salu_wr_m0_then_moverel
= 0;
101 if ((setreg_then_getsetreg
-= amount
) < 0)
102 setreg_then_getsetreg
= 0;
104 vmem_store_then_wr_data
.reset();
107 /* setting MODE.vskip and then any vector op requires 2 wait states */
108 int8_t set_vskip_mode_then_vector
= 0;
110 /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
111 int8_t valu_wr_vcc_then_vccz
= 0;
112 int8_t valu_wr_exec_then_execz
= 0;
114 /* VALU writing VCC followed by v_div_fmas require 4 wait states */
115 int8_t valu_wr_vcc_then_div_fmas
= 0;
117 /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
118 int8_t salu_wr_m0_then_gds_msg_ttrace
= 0;
120 /* VALU writing EXEC followed by DPP requires 5 wait states */
121 int8_t valu_wr_exec_then_dpp
= 0;
123 /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
124 int8_t salu_wr_m0_then_lds
= 0;
126 /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
127 int8_t salu_wr_m0_then_moverel
= 0;
129 /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states
130 * currently we don't look at the actual register */
131 int8_t setreg_then_getsetreg
= 0;
133 /* some memory instructions writing >64bit followed by a instructions
134 * writing the VGPRs holding the writedata requires 1 wait state */
135 std::bitset
<256> vmem_store_then_wr_data
;
137 /* we break up SMEM clauses that contain stores or overwrite an
138 * operand/definition of another instruction in the clause */
139 bool smem_clause
= false;
140 bool smem_write
= false;
141 BITSET_DECLARE(smem_clause_read_write
, 128) = {0};
142 BITSET_DECLARE(smem_clause_write
, 128) = {0};
/* Hazard-tracking context for GFX10; one instance per basic block,
 * joined over linear predecessors. */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   /* Fix: restored — this member is read/written by join() and operator==
    * below but its declaration was missing from the span. */
   bool has_DS = false;
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merge a predecessor's state: every hazard flag / read-set is the
    * pessimistic union of both paths. */
   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Full state comparison; used to detect the loop fixed point. */
   bool operator==(const NOP_ctx_gfx10 &other)
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
180 int get_wait_states(aco_ptr
<Instruction
>& instr
)
182 if (instr
->opcode
== aco_opcode::s_nop
)
183 return static_cast<SOPP_instruction
*>(instr
.get())->imm
+ 1;
184 else if (instr
->opcode
== aco_opcode::p_constaddr
)
185 return 3; /* lowered to 3 instructions in the assembler */
190 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
192 return a_reg
> b_reg
?
193 (a_reg
- b_reg
< b_size
) :
194 (b_reg
- a_reg
< a_size
);
197 template <bool Valu
, bool Vintrp
, bool Salu
>
198 int handle_raw_hazard_internal(Program
*program
, Block
*block
,
199 int nops_needed
, PhysReg reg
, uint32_t mask
)
201 unsigned mask_size
= util_last_bit(mask
);
202 for (int pred_idx
= block
->instructions
.size() - 1; pred_idx
>= 0; pred_idx
--) {
203 aco_ptr
<Instruction
>& pred
= block
->instructions
[pred_idx
];
205 uint32_t writemask
= 0;
206 for (Definition
& def
: pred
->definitions
) {
207 if (regs_intersect(reg
, mask_size
, def
.physReg(), def
.size())) {
208 unsigned start
= def
.physReg() > reg
? def
.physReg() - reg
: 0;
209 unsigned end
= MIN2(mask_size
, start
+ def
.size());
210 writemask
|= u_bit_consecutive(start
, end
- start
);
214 bool is_hazard
= writemask
!= 0 &&
215 ((pred
->isVALU() && Valu
) ||
216 (pred
->format
== Format::VINTRP
&& Vintrp
) ||
217 (pred
->isSALU() && Salu
));
222 nops_needed
-= get_wait_states(pred
);
224 if (nops_needed
<= 0 || mask
== 0)
231 template <bool Valu
, bool Vintrp
, bool Salu
>
232 void handle_raw_hazard(Program
*program
, Block
*cur_block
, int *NOPs
, int min_states
, Operand op
)
234 if (*NOPs
>= min_states
)
236 int res
= handle_raw_hazard_internal
<Valu
, Vintrp
, Salu
>(program
, cur_block
, min_states
, op
.physReg(), u_bit_consecutive(0, op
.size()));
237 *NOPs
= MAX2(*NOPs
, res
);
240 static auto handle_valu_then_read_hazard
= handle_raw_hazard
<true, true, false>;
241 static auto handle_vintrp_then_read_hazard
= handle_raw_hazard
<false, true, false>;
242 static auto handle_valu_salu_then_read_hazard
= handle_raw_hazard
<true, true, true>;
244 void set_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
245 unsigned end
= start
+ size
- 1;
246 unsigned start_mod
= start
% BITSET_WORDBITS
;
247 if (start_mod
+ size
<= BITSET_WORDBITS
) {
248 BITSET_SET_RANGE(words
, start
, end
);
250 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
251 set_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
);
252 set_bitset_range(words
, start
+ first_size
, size
- first_size
);
256 bool test_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
257 unsigned end
= start
+ size
- 1;
258 unsigned start_mod
= start
% BITSET_WORDBITS
;
259 if (start_mod
+ size
<= BITSET_WORDBITS
) {
260 return BITSET_TEST_RANGE(words
, start
, end
);
262 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
263 return test_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
) ||
264 test_bitset_range(words
, start
+ first_size
, size
- first_size
);
268 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
269 void handle_instruction_gfx6(Program
*program
, Block
*cur_block
, NOP_ctx_gfx6
&ctx
,
270 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
275 if (instr
->format
== Format::SMEM
) {
276 if (program
->chip_class
== GFX6
) {
277 /* A read of an SGPR by SMRD instruction requires 4 wait states
278 * when the SGPR was written by a VALU instruction. According to LLVM,
279 * there is also an undocumented hardware behavior when the buffer
280 * descriptor is written by a SALU instruction */
281 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
282 Operand op
= instr
->operands
[i
];
286 bool is_buffer_desc
= i
== 0 && op
.size() > 2;
288 handle_valu_salu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
290 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
294 /* break off from prevous SMEM clause if needed */
295 if (!NOPs
& (ctx
.smem_clause
|| ctx
.smem_write
)) {
296 /* Don't allow clauses with store instructions since the clause's
297 * instructions may use the same address. */
298 if (ctx
.smem_write
|| instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
301 for (Operand op
: instr
->operands
) {
302 if (!op
.isConstant() && test_bitset_range(ctx
.smem_clause_write
, op
.physReg(), op
.size())) {
307 Definition def
= instr
->definitions
[0];
308 if (!NOPs
&& test_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size()))
312 } else if (instr
->isSALU()) {
313 if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
||
314 instr
->opcode
== aco_opcode::s_getreg_b32
) {
315 NOPs
= MAX2(NOPs
, ctx
.setreg_then_getsetreg
);
318 if (program
->chip_class
== GFX9
) {
319 if (instr
->opcode
== aco_opcode::s_movrels_b32
|| instr
->opcode
== aco_opcode::s_movrels_b64
||
320 instr
->opcode
== aco_opcode::s_movreld_b32
|| instr
->opcode
== aco_opcode::s_movreld_b64
) {
321 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_moverel
);
325 if (instr
->opcode
== aco_opcode::s_sendmsg
|| instr
->opcode
== aco_opcode::s_ttracedata
)
326 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
327 } else if (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
) {
328 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
329 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
330 for (Operand op
: instr
->operands
) {
331 if (op
.physReg() == vccz
)
332 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_vccz
);
333 if (op
.physReg() == execz
)
334 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_execz
);
337 if (instr
->isDPP()) {
338 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_dpp
);
339 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 2, instr
->operands
[0]);
342 for (Definition def
: instr
->definitions
) {
343 if (def
.regClass().type() != RegType::sgpr
) {
344 for (unsigned i
= 0; i
< def
.size(); i
++)
345 NOPs
= MAX2(NOPs
, ctx
.vmem_store_then_wr_data
[(def
.physReg() & 0xff) + i
]);
349 if ((instr
->opcode
== aco_opcode::v_readlane_b32
||
350 instr
->opcode
== aco_opcode::v_readlane_b32_e64
||
351 instr
->opcode
== aco_opcode::v_writelane_b32
||
352 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) &&
353 !instr
->operands
[1].isConstant()) {
354 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, instr
->operands
[1]);
357 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
358 * is followed by a read with v_readfirstlane or v_readlane to fix GPU
359 * hangs on GFX6. Note that v_writelane_* is apparently not affected.
360 * This hazard isn't documented anywhere but AMD confirmed that hazard.
362 if (program
->chip_class
== GFX6
&&
363 (instr
->opcode
== aco_opcode::v_readlane_b32
|| /* GFX6 doesn't have v_readlane_b32_e64 */
364 instr
->opcode
== aco_opcode::v_readfirstlane_b32
)) {
365 handle_vintrp_then_read_hazard(program
, cur_block
, &NOPs
, 1, instr
->operands
[0]);
368 if (instr
->opcode
== aco_opcode::v_div_fmas_f32
|| instr
->opcode
== aco_opcode::v_div_fmas_f64
)
369 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_div_fmas
);
370 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
371 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
372 for (Operand op
: instr
->operands
) {
373 if (!op
.isConstant() && !op
.isUndefined() && op
.regClass().type() == RegType::sgpr
)
374 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 5, op
);
378 if (!instr
->isSALU() && instr
->format
!= Format::SMEM
)
379 NOPs
= MAX2(NOPs
, ctx
.set_vskip_mode_then_vector
);
381 if (program
->chip_class
== GFX9
) {
382 bool lds_scratch_global
= (instr
->format
== Format::SCRATCH
|| instr
->format
== Format::GLOBAL
) &&
383 static_cast<FLAT_instruction
*>(instr
.get())->lds
;
384 if (instr
->format
== Format::VINTRP
||
385 instr
->opcode
== aco_opcode::ds_read_addtid_b32
||
386 instr
->opcode
== aco_opcode::ds_write_addtid_b32
||
387 instr
->opcode
== aco_opcode::buffer_store_lds_dword
||
388 lds_scratch_global
) {
389 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_lds
);
393 ctx
.add_wait_states(NOPs
+ get_wait_states(instr
));
395 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
398 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
401 new_instructions
.emplace_back(std::move(nop
));
404 /* update information to check for later hazards */
405 if ((ctx
.smem_clause
|| ctx
.smem_write
) && (NOPs
|| instr
->format
!= Format::SMEM
)) {
406 ctx
.smem_clause
= false;
407 ctx
.smem_write
= false;
408 BITSET_ZERO(ctx
.smem_clause_read_write
);
409 BITSET_ZERO(ctx
.smem_clause_write
);
412 if (instr
->format
== Format::SMEM
) {
413 if (instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
414 ctx
.smem_write
= true;
416 ctx
.smem_clause
= true;
418 for (Operand op
: instr
->operands
) {
419 if (!op
.isConstant()) {
420 set_bitset_range(ctx
.smem_clause_read_write
, op
.physReg(), op
.size());
424 Definition def
= instr
->definitions
[0];
425 set_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size());
426 set_bitset_range(ctx
.smem_clause_write
, def
.physReg(), def
.size());
428 } else if (instr
->isVALU()) {
429 for (Definition def
: instr
->definitions
) {
430 if (def
.regClass().type() == RegType::sgpr
) {
431 if (def
.physReg() == vcc
|| def
.physReg() == vcc_hi
) {
432 ctx
.valu_wr_vcc_then_vccz
= 5;
433 ctx
.valu_wr_vcc_then_div_fmas
= 4;
435 if (def
.physReg() == exec
|| def
.physReg() == exec_hi
) {
436 ctx
.valu_wr_exec_then_execz
= 5;
437 ctx
.valu_wr_exec_then_dpp
= 5;
441 } else if (instr
->isSALU() && !instr
->definitions
.empty()) {
442 if (!instr
->definitions
.empty()) {
443 /* all other definitions should be SCC */
444 Definition def
= instr
->definitions
[0];
445 if (def
.physReg() == m0
) {
446 ctx
.salu_wr_m0_then_gds_msg_ttrace
= 1;
447 ctx
.salu_wr_m0_then_lds
= 1;
448 ctx
.salu_wr_m0_then_moverel
= 1;
450 } else if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
) {
451 SOPK_instruction
*sopk
= static_cast<SOPK_instruction
*>(instr
.get());
452 unsigned offset
= (sopk
->imm
>> 6) & 0x1f;
453 unsigned size
= ((sopk
->imm
>> 11) & 0x1f) + 1;
454 unsigned reg
= sopk
->imm
& 0x3f;
455 ctx
.setreg_then_getsetreg
= 2;
457 if (reg
== 1 && offset
>= 28 && size
> (28 - offset
))
458 ctx
.set_vskip_mode_then_vector
= 2;
460 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
461 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
462 bool consider_buf
= (instr
->format
== Format::MUBUF
|| instr
->format
== Format::MTBUF
) &&
463 instr
->operands
.size() == 4 &&
464 instr
->operands
[3].size() > 2 &&
465 instr
->operands
[2].physReg() >= 128;
466 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
467 bool consider_mimg
= instr
->format
== Format::MIMG
&&
468 instr
->operands
[1].regClass().type() == RegType::vgpr
&&
469 instr
->operands
[1].size() > 2 &&
470 instr
->operands
[0].size() == 4;
471 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
472 bool consider_flat
= (instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) &&
473 instr
->operands
.size() == 3 &&
474 instr
->operands
[2].size() > 2;
475 if (consider_buf
|| consider_mimg
|| consider_flat
) {
476 PhysReg wrdata
= instr
->operands
[consider_flat
? 2 : 3].physReg();
477 unsigned size
= instr
->operands
[consider_flat
? 2 : 3].size();
478 for (unsigned i
= 0; i
< size
; i
++)
479 ctx
.vmem_store_then_wr_data
[(wrdata
& 0xff) + i
] = 1;
484 template <std::size_t N
>
485 bool check_written_regs(const aco_ptr
<Instruction
> &instr
, const std::bitset
<N
> &check_regs
)
487 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [&check_regs
](const Definition
&def
) -> bool {
488 bool writes_any
= false;
489 for (unsigned i
= 0; i
< def
.size(); i
++) {
490 unsigned def_reg
= def
.physReg() + i
;
491 writes_any
|= def_reg
< check_regs
.size() && check_regs
[def_reg
];
497 template <std::size_t N
>
498 void mark_read_regs(const aco_ptr
<Instruction
> &instr
, std::bitset
<N
> ®_reads
)
500 for (const Operand
&op
: instr
->operands
) {
501 for (unsigned i
= 0; i
< op
.size(); i
++) {
502 unsigned reg
= op
.physReg() + i
;
503 if (reg
< reg_reads
.size())
509 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
511 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
513 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
515 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
||
516 instr
->opcode
== aco_opcode::v_readlane_b32
||
517 instr
->opcode
== aco_opcode::v_readlane_b32_e64
)
522 bool instr_writes_exec(const aco_ptr
<Instruction
>& instr
)
524 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
525 return def
.physReg() == exec_lo
|| def
.physReg() == exec_hi
;
529 bool instr_writes_sgpr(const aco_ptr
<Instruction
>& instr
)
531 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
532 return def
.getTemp().type() == RegType::sgpr
;
536 inline bool instr_is_branch(const aco_ptr
<Instruction
>& instr
)
538 return instr
->opcode
== aco_opcode::s_branch
||
539 instr
->opcode
== aco_opcode::s_cbranch_scc0
||
540 instr
->opcode
== aco_opcode::s_cbranch_scc1
||
541 instr
->opcode
== aco_opcode::s_cbranch_vccz
||
542 instr
->opcode
== aco_opcode::s_cbranch_vccnz
||
543 instr
->opcode
== aco_opcode::s_cbranch_execz
||
544 instr
->opcode
== aco_opcode::s_cbranch_execnz
||
545 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys
||
546 instr
->opcode
== aco_opcode::s_cbranch_cdbguser
||
547 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_or_user
||
548 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_and_user
||
549 instr
->opcode
== aco_opcode::s_subvector_loop_begin
||
550 instr
->opcode
== aco_opcode::s_subvector_loop_end
||
551 instr
->opcode
== aco_opcode::s_setpc_b64
||
552 instr
->opcode
== aco_opcode::s_swappc_b64
||
553 instr
->opcode
== aco_opcode::s_getpc_b64
||
554 instr
->opcode
== aco_opcode::s_call_b64
;
557 void handle_instruction_gfx10(Program
*program
, Block
*cur_block
, NOP_ctx_gfx10
&ctx
,
558 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
560 //TODO: s_dcache_inv needs to be in it's own group on GFX10
562 /* VMEMtoScalarWriteHazard
563 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
565 if (instr
->isVMEM() || instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
||
566 instr
->format
== Format::SCRATCH
|| instr
->format
== Format::DS
) {
567 /* Remember all SGPRs that are read by the VMEM instruction */
568 mark_read_regs(instr
, ctx
.sgprs_read_by_VMEM
);
569 ctx
.sgprs_read_by_VMEM
.set(exec
);
570 if (program
->wave_size
== 64)
571 ctx
.sgprs_read_by_VMEM
.set(exec_hi
);
572 } else if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
573 /* Check if SALU writes an SGPR that was previously read by the VALU */
574 if (check_written_regs(instr
, ctx
.sgprs_read_by_VMEM
)) {
575 ctx
.sgprs_read_by_VMEM
.reset();
577 /* Insert v_nop to mitigate the problem */
578 aco_ptr
<VOP1_instruction
> nop
{create_instruction
<VOP1_instruction
>(aco_opcode::v_nop
, Format::VOP1
, 0, 0)};
579 new_instructions
.emplace_back(std::move(nop
));
581 } else if (instr
->opcode
== aco_opcode::s_waitcnt
) {
582 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
583 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
584 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
586 ctx
.sgprs_read_by_VMEM
.reset();
587 } else if (instr
->isVALU()) {
588 /* Hazard is mitigated by any VALU instruction */
589 ctx
.sgprs_read_by_VMEM
.reset();
592 /* VcmpxPermlaneHazard
593 * Handle any permlane following a VOPC instruction, insert v_mov between them.
595 if (instr
->format
== Format::VOPC
) {
597 } else if (ctx
.has_VOPC
&&
598 (instr
->opcode
== aco_opcode::v_permlane16_b32
||
599 instr
->opcode
== aco_opcode::v_permlanex16_b32
)) {
600 ctx
.has_VOPC
= false;
602 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
603 aco_ptr
<VOP1_instruction
> v_mov
{create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1)};
604 v_mov
->definitions
[0] = Definition(instr
->operands
[0].physReg(), v1
);
605 v_mov
->operands
[0] = Operand(instr
->operands
[0].physReg(), v1
);
606 new_instructions
.emplace_back(std::move(v_mov
));
607 } else if (instr
->isVALU() && instr
->opcode
!= aco_opcode::v_nop
) {
608 ctx
.has_VOPC
= false;
611 /* VcmpxExecWARHazard
612 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
614 if (!instr
->isVALU() && instr
->reads_exec()) {
615 ctx
.has_nonVALU_exec_read
= true;
616 } else if (instr
->isVALU()) {
617 if (instr_writes_exec(instr
)) {
618 ctx
.has_nonVALU_exec_read
= false;
620 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
621 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 0)};
622 depctr
->imm
= 0xfffe;
624 new_instructions
.emplace_back(std::move(depctr
));
625 } else if (instr_writes_sgpr(instr
)) {
626 /* Any VALU instruction that writes an SGPR mitigates the problem */
627 ctx
.has_nonVALU_exec_read
= false;
629 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
630 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
631 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
632 if ((sopp
->imm
& 0xfffe) == 0xfffe)
633 ctx
.has_nonVALU_exec_read
= false;
636 /* SMEMtoVectorWriteHazard
637 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
639 if (instr
->format
== Format::SMEM
) {
640 /* Remember all SGPRs that are read by the SMEM instruction */
641 mark_read_regs(instr
, ctx
.sgprs_read_by_SMEM
);
642 } else if (VALU_writes_sgpr(instr
)) {
643 /* Check if VALU writes an SGPR that was previously read by SMEM */
644 if (check_written_regs(instr
, ctx
.sgprs_read_by_SMEM
)) {
645 ctx
.sgprs_read_by_SMEM
.reset();
647 /* Insert s_mov to mitigate the problem */
648 aco_ptr
<SOP1_instruction
> s_mov
{create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1)};
649 s_mov
->definitions
[0] = Definition(sgpr_null
, s1
);
650 s_mov
->operands
[0] = Operand(0u);
651 new_instructions
.emplace_back(std::move(s_mov
));
653 } else if (instr
->isSALU()) {
654 if (instr
->format
!= Format::SOPP
) {
655 /* SALU can mitigate the hazard */
656 ctx
.sgprs_read_by_SMEM
.reset();
658 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
659 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
660 if (sopp
->opcode
== aco_opcode::s_waitcnt_lgkmcnt
) {
661 if (sopp
->imm
== 0 && sopp
->definitions
[0].physReg() == sgpr_null
)
662 ctx
.sgprs_read_by_SMEM
.reset();
663 } else if (sopp
->opcode
== aco_opcode::s_waitcnt
) {
664 unsigned lgkm
= (sopp
->imm
>> 8) & 0x3f;
666 ctx
.sgprs_read_by_SMEM
.reset();
671 /* LdsBranchVmemWARHazard
672 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
674 if (instr
->isVMEM() || instr
->format
== Format::GLOBAL
|| instr
->format
== Format::SCRATCH
) {
676 ctx
.has_branch_after_VMEM
= false;
677 /* Mitigation for DS is needed only if there was already a branch after */
678 ctx
.has_DS
= ctx
.has_branch_after_DS
;
679 } else if (instr
->format
== Format::DS
) {
681 ctx
.has_branch_after_DS
= false;
682 /* Mitigation for VMEM is needed only if there was already a branch after */
683 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
;
684 } else if (instr_is_branch(instr
)) {
685 ctx
.has_branch_after_VMEM
= ctx
.has_VMEM
;
686 ctx
.has_branch_after_DS
= ctx
.has_DS
;
687 } else if (instr
->opcode
== aco_opcode::s_waitcnt_vscnt
) {
688 /* Only s_waitcnt_vscnt can mitigate the hazard */
689 const SOPK_instruction
*sopk
= static_cast<const SOPK_instruction
*>(instr
.get());
690 if (sopk
->definitions
[0].physReg() == sgpr_null
&& sopk
->imm
== 0)
691 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
693 if ((ctx
.has_VMEM
&& ctx
.has_branch_after_DS
) || (ctx
.has_DS
&& ctx
.has_branch_after_VMEM
)) {
694 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
696 /* Insert s_waitcnt_vscnt to mitigate the problem */
697 aco_ptr
<SOPK_instruction
> wait
{create_instruction
<SOPK_instruction
>(aco_opcode::s_waitcnt_vscnt
, Format::SOPK
, 0, 1)};
698 wait
->definitions
[0] = Definition(sgpr_null
, s1
);
700 new_instructions
.emplace_back(std::move(wait
));
704 template <typename Ctx
>
705 using HandleInstr
= void (*)(Program
*, Block
*block
, Ctx
&, aco_ptr
<Instruction
>&,
706 std::vector
<aco_ptr
<Instruction
>>&);
708 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
709 void handle_block(Program
*program
, Ctx
& ctx
, Block
& block
)
711 if (block
.instructions
.empty())
714 std::vector
<aco_ptr
<Instruction
>> old_instructions
= std::move(block
.instructions
);
716 block
.instructions
.reserve(block
.instructions
.size());
718 for (aco_ptr
<Instruction
>& instr
: old_instructions
) {
719 Handle(program
, &block
, ctx
, instr
, block
.instructions
);
720 block
.instructions
.emplace_back(std::move(instr
));
724 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
725 void mitigate_hazards(Program
*program
)
727 std::vector
<Ctx
> all_ctx(program
->blocks
.size());
728 std::stack
<unsigned> loop_header_indices
;
730 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
731 Block
& block
= program
->blocks
[i
];
732 Ctx
&ctx
= all_ctx
[i
];
734 if (block
.kind
& block_kind_loop_header
) {
735 loop_header_indices
.push(i
);
736 } else if (block
.kind
& block_kind_loop_exit
) {
737 /* Go through the whole loop again */
738 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
740 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
741 loop_block_ctx
.join(all_ctx
[b
]);
743 handle_block
<Ctx
, Handle
>(program
, loop_block_ctx
, program
->blocks
[idx
]);
745 /* We only need to continue if the loop header context changed */
746 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
749 all_ctx
[idx
] = loop_block_ctx
;
752 loop_header_indices
.pop();
755 for (unsigned b
: block
.linear_preds
)
756 ctx
.join(all_ctx
[b
]);
758 handle_block
<Ctx
, Handle
>(program
, ctx
, block
);
} /* end namespace */
764 void insert_NOPs(Program
* program
)
766 if (program
->chip_class
>= GFX10
) {
767 mitigate_hazards
<NOP_ctx_gfx10
, handle_instruction_gfx10
>(program
);
769 for (Block
& block
: program
->blocks
) {
771 handle_block
<NOP_ctx_gfx6
, handle_instruction_gfx6
>(program
, ctx
, block
);