/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <algorithm>
#include <bitset>
#include <stack>
#include <vector>

#include "aco_ir.h"

namespace aco {
namespace {

struct NOP_ctx_gfx6 {
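   /* State tracked between instructions for the GFX6-9 NOP insertion pass.
    * Each counter holds the number of wait states that still have to pass
    * before the corresponding hazard is resolved; join() merges the state of
    * multiple control-flow predecessors by taking the worst case. */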
   void join(const NOP_ctx_gfx6 &other) {
      set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
      valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
      valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
      valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
      salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
      valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
      salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
      salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
      setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
      vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
      smem_clause |= other.smem_clause;
      smem_write |= other.smem_write;
      for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
         smem_clause_read_write[i] |= other.smem_clause_read_write[i];
         smem_clause_write[i] |= other.smem_clause_write[i];
      }
   }

   bool operator==(const NOP_ctx_gfx6 &other)
   {
      return
         set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
         valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
         valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
         valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
         vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
         salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
         valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
         salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
         salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
         setreg_then_getsetreg == other.setreg_then_getsetreg &&
         smem_clause == other.smem_clause &&
         smem_write == other.smem_write &&
         BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
         BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
   }

   void add_wait_states(unsigned amount)
   {
      if ((set_vskip_mode_then_vector -= amount) < 0)
         set_vskip_mode_then_vector = 0;

      if ((valu_wr_vcc_then_vccz -= amount) < 0)
         valu_wr_vcc_then_vccz = 0;

      if ((valu_wr_exec_then_execz -= amount) < 0)
         valu_wr_exec_then_execz = 0;

      if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
         valu_wr_vcc_then_div_fmas = 0;

      if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
         salu_wr_m0_then_gds_msg_ttrace = 0;

      if ((valu_wr_exec_then_dpp -= amount) < 0)
         valu_wr_exec_then_dpp = 0;

      if ((salu_wr_m0_then_lds -= amount) < 0)
         salu_wr_m0_then_lds = 0;

      if ((salu_wr_m0_then_moverel -= amount) < 0)
         salu_wr_m0_then_moverel = 0;

      if ((setreg_then_getsetreg -= amount) < 0)
         setreg_then_getsetreg = 0;

      vmem_store_then_wr_data.reset();
   }

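   /* Remaining wait states for each tracked hazard; a value of 0 means the
    * hazard is not (or no longer) pending. */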
   /* setting MODE.vskip and then any vector op requires 2 wait states */
   int8_t set_vskip_mode_then_vector = 0;

   /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
   int8_t valu_wr_vcc_then_vccz = 0;
   int8_t valu_wr_exec_then_execz = 0;

   /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
   int8_t valu_wr_vcc_then_div_fmas = 0;

   /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
   int8_t salu_wr_m0_then_gds_msg_ttrace = 0;

   /* VALU writing EXEC followed by DPP requires 5 wait states */
   int8_t valu_wr_exec_then_dpp = 0;

   /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
   int8_t salu_wr_m0_then_lds = 0;

   /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
   int8_t salu_wr_m0_then_moverel = 0;

   /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states;
    * currently we don't look at the actual register */
   int8_t setreg_then_getsetreg = 0;

   /* some memory instructions writing >64-bit followed by an instruction
    * writing the VGPRs holding the writedata requires 1 wait state */
   std::bitset<256> vmem_store_then_wr_data;

   /* we break up SMEM clauses that contain stores or overwrite an
    * operand/definition of another instruction in the clause */
   bool smem_clause = false;
   bool smem_write = false;
   BITSET_DECLARE(smem_clause_read_write, 128) = {0};
   BITSET_DECLARE(smem_clause_write, 128) = {0};
};

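/* On GFX10 the hardware hazards are worked around by inserting extra
 * instructions (v_nop, s_mov_b32, s_waitcnt_depctr, s_waitcnt_vscnt) rather
 * than by counting wait states, so this context only tracks which hazard
 * conditions are currently possible. */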
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10 &other)
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};

int get_wait_states(aco_ptr<Instruction>& instr)
{
   return 1;
}

bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ?
          (a_reg - b_reg < b_size) :
          (b_reg - a_reg < a_size);
}

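/* Scans the current block backwards for a write to the given register range by
 * a hazardous producer (selected via the Valu/Vintrp/Salu template parameters).
 * Returns how many wait states are still missing when such a write is found
 * inside the hazard window, and 0 otherwise. */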
template <bool Valu, bool Vintrp, bool Salu>
int handle_raw_hazard_internal(Program *program, Block *block,
                               int nops_needed, PhysReg reg, uint32_t mask)
{
   unsigned mask_size = util_last_bit(mask);
   for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
      aco_ptr<Instruction>& pred = block->instructions[pred_idx];

      uint32_t writemask = 0;
      for (Definition& def : pred->definitions) {
         if (regs_intersect(reg, mask_size, def.physReg(), def.size())) {
            unsigned start = def.physReg() > reg ? def.physReg() - reg : 0;
            unsigned end = MIN2(mask_size, start + def.size());
            writemask |= u_bit_consecutive(start, end - start);
         }
      }

      bool is_hazard = writemask != 0 &&
                       ((pred->isVALU() && Valu) ||
                        (pred->format == Format::VINTRP && Vintrp) ||
                        (pred->isSALU() && Salu));
      if (is_hazard)
         return nops_needed;

      nops_needed -= get_wait_states(pred);

      if (nops_needed <= 0)
         break;
   }

   return 0;
}

template <bool Valu, bool Vintrp, bool Salu>
void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op)
{
   if (*NOPs >= min_states)
      return;
   int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
   *NOPs = MAX2(*NOPs, res);
}

static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;

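/* BITSET_SET_RANGE/BITSET_TEST_RANGE only operate within a single BITSET_WORD,
 * so these helpers split ranges that cross a word boundary. */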
void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      BITSET_SET_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
      set_bitset_range(words, start + first_size, size - first_size);
   }
}

bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
   unsigned end = start + size - 1;
   unsigned start_mod = start % BITSET_WORDBITS;
   if (start_mod + size <= BITSET_WORDBITS) {
      return BITSET_TEST_RANGE(words, start, end);
   } else {
      unsigned first_size = BITSET_WORDBITS - start_mod;
      return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
             test_bitset_range(words, start + first_size, size - first_size);
   }
}

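/* Computes the number of wait states an instruction needs on GFX6-9 based on
 * the hazards tracked in NOP_ctx_gfx6, emits an s_nop if necessary and then
 * updates the context for the hazards this instruction may cause itself. */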
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
                             aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* check hazards and add NOPs if needed */
   int NOPs = 0;

   if (instr->format == Format::SMEM) {
      if (program->chip_class == GFX6) {
         /* A read of an SGPR by SMRD instruction requires 4 wait states
          * when the SGPR was written by a VALU instruction. According to LLVM,
          * there is also an undocumented hardware behavior when the buffer
          * descriptor is written by a SALU instruction */
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand op = instr->operands[i];
            if (op.isConstant())
               continue;

            bool is_buffer_desc = i == 0 && op.size() > 2;
            if (is_buffer_desc)
               handle_valu_salu_then_read_hazard(program, cur_block, &NOPs, 4, op);
            else
               handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, op);
         }
      }

      /* break off from previous SMEM clause if needed */
      if (!NOPs && (ctx.smem_clause || ctx.smem_write)) {
         /* Don't allow clauses with store instructions since the clause's
          * instructions may use the same address. */
         if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
            NOPs = 1;
         } else {
            for (Operand op : instr->operands) {
               if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
                  NOPs = 1;
                  break;
               }
            }

            Definition def = instr->definitions[0];
            if (!NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
               NOPs = 1;
         }
      }
   } else if (instr->isSALU()) {
      if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
          instr->opcode == aco_opcode::s_getreg_b32) {
         NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
      }

      if (program->chip_class == GFX9) {
         if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 ||
             instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) {
            NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
         }
      }

      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->format == Format::DS && static_cast<DS_instruction *>(instr.get())->gds) {
      NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
   } else if (instr->isVALU() || instr->format == Format::VINTRP) {
      for (Operand op : instr->operands) {
         if (op.physReg() == vccz)
            NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_vccz);
         if (op.physReg() == execz)
            NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_execz);
      }

      if (instr->isDPP()) {
         NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
         handle_valu_then_read_hazard(program, cur_block, &NOPs, 2, instr->operands[0]);
      }

      for (Definition def : instr->definitions) {
         if (def.regClass().type() != RegType::sgpr) {
            for (unsigned i = 0; i < def.size(); i++)
               NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
         }
      }

      if ((instr->opcode == aco_opcode::v_readlane_b32 ||
           instr->opcode == aco_opcode::v_readlane_b32_e64 ||
           instr->opcode == aco_opcode::v_writelane_b32 ||
           instr->opcode == aco_opcode::v_writelane_b32_e64) &&
          !instr->operands[1].isConstant()) {
         handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, instr->operands[1]);
      }

      /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
       * is followed by a read with v_readfirstlane or v_readlane to fix GPU
       * hangs on GFX6. Note that v_writelane_* is apparently not affected.
       * This hazard isn't documented anywhere but AMD confirmed that hazard.
       */
      if (program->chip_class == GFX6 &&
          (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
           instr->opcode == aco_opcode::v_readfirstlane_b32)) {
         handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
      }

      if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64)
         NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
   } else if (instr->isVMEM() || instr->isFlatOrGlobal() || instr->format == Format::SCRATCH) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (Operand op : instr->operands) {
         if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
            handle_valu_then_read_hazard(program, cur_block, &NOPs, 5, op);
      }
   }

   if (!instr->isSALU() && instr->format != Format::SMEM)
      NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);

   if (program->chip_class == GFX9) {
      bool lds_scratch_global = (instr->format == Format::SCRATCH || instr->format == Format::GLOBAL) &&
                                static_cast<FLAT_instruction *>(instr.get())->lds;
      if (instr->format == Format::VINTRP ||
          instr->opcode == aco_opcode::ds_read_addtid_b32 ||
          instr->opcode == aco_opcode::ds_write_addtid_b32 ||
          instr->opcode == aco_opcode::buffer_store_lds_dword ||
          lds_scratch_global) {
         NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
      }
   }

   ctx.add_wait_states(NOPs + get_wait_states(instr));

   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
      aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
      nop->imm = NOPs - 1;
      nop->block = -1;
      new_instructions.emplace_back(std::move(nop));
   }

   /* update information to check for later hazards */
   if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
      ctx.smem_clause = false;
      ctx.smem_write = false;
      BITSET_ZERO(ctx.smem_clause_read_write);
      BITSET_ZERO(ctx.smem_clause_write);
   }

   if (instr->format == Format::SMEM) {
      if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
         ctx.smem_write = true;
      } else {
         ctx.smem_clause = true;

         for (Operand op : instr->operands) {
            if (!op.isConstant()) {
               set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
            }
         }

         Definition def = instr->definitions[0];
         set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
         set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
      }
   } else if (instr->isVALU()) {
      for (Definition def : instr->definitions) {
         if (def.regClass().type() == RegType::sgpr) {
            if (def.physReg() == vcc || def.physReg() == vcc_hi) {
               ctx.valu_wr_vcc_then_vccz = 5;
               ctx.valu_wr_vcc_then_div_fmas = 4;
            }
            if (def.physReg() == exec || def.physReg() == exec_hi) {
               ctx.valu_wr_exec_then_execz = 5;
               ctx.valu_wr_exec_then_dpp = 5;
            }
         }
      }
   } else if (instr->isSALU()) {
      if (!instr->definitions.empty()) {
         /* all other definitions should be SCC */
         Definition def = instr->definitions[0];
         if (def.physReg() == m0) {
            ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
            ctx.salu_wr_m0_then_lds = 1;
            ctx.salu_wr_m0_then_moverel = 1;
         }
      } else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) {
         SOPK_instruction *sopk = static_cast<SOPK_instruction *>(instr.get());
         unsigned offset = (sopk->imm >> 6) & 0x1f;
         unsigned size = ((sopk->imm >> 11) & 0x1f) + 1;
         unsigned reg = sopk->imm & 0x3f;
         ctx.setreg_then_getsetreg = 2;

         if (reg == 1 && offset >= 28 && size > (28 - offset))
            ctx.set_vskip_mode_then_vector = 2;
      }
   } else if (instr->isVMEM() || instr->isFlatOrGlobal() || instr->format == Format::SCRATCH) {
      /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
      bool consider_buf = (instr->format == Format::MUBUF || instr->format == Format::MTBUF) &&
                          instr->operands.size() == 4 &&
                          instr->operands[3].size() > 2 &&
                          instr->operands[2].physReg() >= 128;
      /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
      bool consider_mimg = instr->format == Format::MIMG &&
                           instr->operands[1].regClass().type() == RegType::vgpr &&
                           instr->operands[1].size() > 2 &&
                           instr->operands[0].size() == 4;
      /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
      bool consider_flat = (instr->isFlatOrGlobal() || instr->format == Format::SCRATCH) &&
                           instr->operands.size() == 3 &&
                           instr->operands[2].size() > 2;
      if (consider_buf || consider_mimg || consider_flat) {
         PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
         unsigned size = instr->operands[consider_flat ? 2 : 3].size();
         for (unsigned i = 0; i < size; i++)
            ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
      }
   }
}

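/* Helpers for the GFX10 hazard detection below: they check whether an
 * instruction writes any register from a tracked set, and record which
 * registers an instruction reads. */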
template <std::size_t N>
bool check_written_regs(const aco_ptr<Instruction> &instr, const std::bitset<N> &check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool {
      bool writes_any = false;
      for (unsigned i = 0; i < def.size(); i++) {
         unsigned def_reg = def.physReg() + i;
         writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
      }
      return writes_any;
   });
}

template <std::size_t N>
void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads)
{
   for (const Operand &op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}

bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32 ||
       instr->opcode == aco_opcode::v_readlane_b32_e64)
      return true;
   return false;
}

bool instr_writes_exec(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
      return def.physReg() == exec_lo || def.physReg() == exec_hi;
   });
}

bool instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
      return def.getTemp().type() == RegType::sgpr;
   });
}

inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch ||
          instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 ||
          instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 ||
          instr->opcode == aco_opcode::s_call_b64;
}

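/* Mitigates the GFX10 hazards (VMEMtoScalarWriteHazard, VcmpxPermlaneHazard,
 * VcmpxExecWARHazard, SMEMtoVectorWriteHazard, LdsBranchVmemWARHazard) by
 * inserting workaround instructions in front of the affected instruction. */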
void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 &ctx,
                              aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
{
   //TODO: s_dcache_inv needs to be in its own group on GFX10

   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
    */
   if (instr->isVMEM() || instr->format == Format::FLAT || instr->format == Format::GLOBAL ||
       instr->format == Format::SCRATCH || instr->format == Format::DS) {
      /* Remember all SGPRs that are read by the VMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
      ctx.sgprs_read_by_VMEM.set(exec);
      if (program->wave_size == 64)
         ctx.sgprs_read_by_VMEM.set(exec_hi);
   } else if (instr->isSALU() || instr->format == Format::SMEM) {
      /* Check if SALU writes an SGPR that was previously read by the VALU */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
         ctx.sgprs_read_by_VMEM.reset();

         /* Insert v_nop to mitigate the problem */
         aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)};
         new_instructions.emplace_back(std::move(nop));
      }
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
      uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
      unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
      if (vmcnt == 0)
         ctx.sgprs_read_by_VMEM.reset();
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction, insert v_mov between them.
    */
   if (instr->format == Format::VOPC) {
      ctx.has_VOPC = true;
   } else if (ctx.has_VOPC &&
              (instr->opcode == aco_opcode::v_permlane16_b32 ||
               instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
      v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
      v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
      new_instructions.emplace_back(std::move(v_mov));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr->reads_exec()) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU()) {
      if (instr_writes_exec(instr)) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
         depctr->imm = 0xfffe;
         depctr->block = -1;
         new_instructions.emplace_back(std::move(depctr));
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
      const SOPP_instruction *sopp = static_cast<const SOPP_instruction *>(instr.get());
      if ((sopp->imm & 0xfffe) == 0xfffe)
         ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->format == Format::SMEM) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         aco_ptr<SOP1_instruction> s_mov{create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
         s_mov->definitions[0] = Definition(sgpr_null, s1);
         s_mov->operands[0] = Operand(0u);
         new_instructions.emplace_back(std::move(s_mov));
      }
   } else if (instr->isSALU()) {
      if (instr->format != Format::SOPP) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      } else {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         const SOPP_instruction *sopp = static_cast<const SOPP_instruction *>(instr.get());
         if (sopp->opcode == aco_opcode::s_waitcnt_lgkmcnt) {
            if (sopp->imm == 0 && sopp->definitions[0].physReg() == sgpr_null)
               ctx.sgprs_read_by_SMEM.reset();
         } else if (sopp->opcode == aco_opcode::s_waitcnt) {
            unsigned lgkm = (sopp->imm >> 8) & 0x3f;
            if (lgkm == 0)
               ctx.sgprs_read_by_SMEM.reset();
         }
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->format == Format::GLOBAL || instr->format == Format::SCRATCH) {
      ctx.has_VMEM = true;
      ctx.has_branch_after_VMEM = false;
      /* Mitigation for DS is needed only if there was already a branch after */
      ctx.has_DS = ctx.has_branch_after_DS;
   } else if (instr->format == Format::DS) {
      ctx.has_DS = true;
      ctx.has_branch_after_DS = false;
      /* Mitigation for VMEM is needed only if there was already a branch after */
      ctx.has_VMEM = ctx.has_branch_after_VMEM;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM = ctx.has_VMEM;
      ctx.has_branch_after_DS = ctx.has_DS;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SOPK_instruction *sopk = static_cast<const SOPK_instruction *>(instr.get());
      if (sopk->definitions[0].physReg() == sgpr_null && sopk->imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }
   if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) {
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;

      /* Insert s_waitcnt_vscnt to mitigate the problem */
      aco_ptr<SOPK_instruction> wait{create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
      wait->definitions[0] = Definition(sgpr_null, s1);
      wait->imm = 0;
      new_instructions.emplace_back(std::move(wait));
   }
}

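/* handle_block() rebuilds a block's instruction list, letting the per-chip
 * handler prepend any workaround instructions it needs. mitigate_hazards()
 * additionally propagates context across the CFG and re-processes loop bodies
 * until the loop header context stops changing. */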
template <typename Ctx>
using HandleInstr = void (*)(Program *, Block *block, Ctx &, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx, HandleInstr<Ctx> Handle>
void handle_block(Program *program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   std::vector<aco_ptr<Instruction>> old_instructions = std::move(block.instructions);

   block.instructions.reserve(old_instructions.size());

   for (aco_ptr<Instruction>& instr : old_instructions) {
      Handle(program, &block, ctx, instr, block.instructions);
      block.instructions.emplace_back(std::move(instr));
   }
}

, HandleInstr
<Ctx
> Handle
>
719 void mitigate_hazards(Program
*program
)
721 std::vector
<Ctx
> all_ctx(program
->blocks
.size());
722 std::stack
<unsigned> loop_header_indices
;
724 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
725 Block
& block
= program
->blocks
[i
];
726 Ctx
&ctx
= all_ctx
[i
];
728 if (block
.kind
& block_kind_loop_header
) {
729 loop_header_indices
.push(i
);
730 } else if (block
.kind
& block_kind_loop_exit
) {
731 /* Go through the whole loop again */
732 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
734 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
735 loop_block_ctx
.join(all_ctx
[b
]);
737 handle_block
<Ctx
, Handle
>(program
, loop_block_ctx
, program
->blocks
[idx
]);
739 /* We only need to continue if the loop header context changed */
740 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
743 all_ctx
[idx
] = loop_block_ctx
;
746 loop_header_indices
.pop();
749 for (unsigned b
: block
.linear_preds
)
750 ctx
.join(all_ctx
[b
]);
752 handle_block
<Ctx
, Handle
>(program
, ctx
, block
);
756 } /* end namespace */
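/* Pass entry point: GFX10+ uses the CFG-aware mitigate_hazards() path, while
 * earlier chips run the GFX6-9 handler over each block. */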
void insert_NOPs(Program* program)
{
   if (program->chip_class >= GFX10) {
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10>(program);
   } else {
      for (Block& block : program->blocks) {
         NOP_ctx_gfx6 ctx;
         handle_block<NOP_ctx_gfx6, handle_instruction_gfx6>(program, ctx, block);