2 * Copyright © 2019 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
37 void join(const NOP_ctx_gfx6
&other
) {
38 set_vskip_mode_then_vector
= MAX2(set_vskip_mode_then_vector
, other
.set_vskip_mode_then_vector
);
39 valu_wr_vcc_then_vccz
= MAX2(valu_wr_vcc_then_vccz
, other
.valu_wr_vcc_then_vccz
);
40 valu_wr_exec_then_execz
= MAX2(valu_wr_exec_then_execz
, other
.valu_wr_exec_then_execz
);
41 valu_wr_vcc_then_div_fmas
= MAX2(valu_wr_vcc_then_div_fmas
, other
.valu_wr_vcc_then_div_fmas
);
42 salu_wr_m0_then_gds_msg_ttrace
= MAX2(salu_wr_m0_then_gds_msg_ttrace
, other
.salu_wr_m0_then_gds_msg_ttrace
);
43 valu_wr_exec_then_dpp
= MAX2(valu_wr_exec_then_dpp
, other
.valu_wr_exec_then_dpp
);
44 salu_wr_m0_then_lds
= MAX2(salu_wr_m0_then_lds
, other
.salu_wr_m0_then_lds
);
45 salu_wr_m0_then_moverel
= MAX2(salu_wr_m0_then_moverel
, other
.salu_wr_m0_then_moverel
);
46 setreg_then_getsetreg
= MAX2(setreg_then_getsetreg
, other
.setreg_then_getsetreg
);
47 vmem_store_then_wr_data
|= other
.vmem_store_then_wr_data
;
48 smem_clause
|= other
.smem_clause
;
49 smem_write
|= other
.smem_write
;
50 for (unsigned i
= 0; i
< BITSET_WORDS(128); i
++) {
51 smem_clause_read_write
[i
] |= other
.smem_clause_read_write
[i
];
52 smem_clause_write
[i
] |= other
.smem_clause_write
[i
];
56 bool operator==(const NOP_ctx_gfx6
&other
)
59 set_vskip_mode_then_vector
== other
.set_vskip_mode_then_vector
&&
60 valu_wr_vcc_then_vccz
== other
.valu_wr_vcc_then_vccz
&&
61 valu_wr_exec_then_execz
== other
.valu_wr_exec_then_execz
&&
62 valu_wr_vcc_then_div_fmas
== other
.valu_wr_vcc_then_div_fmas
&&
63 vmem_store_then_wr_data
== other
.vmem_store_then_wr_data
&&
64 salu_wr_m0_then_gds_msg_ttrace
== other
.salu_wr_m0_then_gds_msg_ttrace
&&
65 valu_wr_exec_then_dpp
== other
.valu_wr_exec_then_dpp
&&
66 salu_wr_m0_then_lds
== other
.salu_wr_m0_then_lds
&&
67 salu_wr_m0_then_moverel
== other
.salu_wr_m0_then_moverel
&&
68 setreg_then_getsetreg
== other
.setreg_then_getsetreg
&&
69 smem_clause
== other
.smem_clause
&&
70 smem_write
== other
.smem_write
&&
71 BITSET_EQUAL(smem_clause_read_write
, other
.smem_clause_read_write
) &&
72 BITSET_EQUAL(smem_clause_write
, other
.smem_clause_write
);
75 void add_wait_states(unsigned amount
)
77 if ((set_vskip_mode_then_vector
-= amount
) < 0)
78 set_vskip_mode_then_vector
= 0;
80 if ((valu_wr_vcc_then_vccz
-= amount
) < 0)
81 valu_wr_vcc_then_vccz
= 0;
83 if ((valu_wr_exec_then_execz
-= amount
) < 0)
84 valu_wr_exec_then_execz
= 0;
86 if ((valu_wr_vcc_then_div_fmas
-= amount
) < 0)
87 valu_wr_vcc_then_div_fmas
= 0;
89 if ((salu_wr_m0_then_gds_msg_ttrace
-= amount
) < 0)
90 salu_wr_m0_then_gds_msg_ttrace
= 0;
92 if ((valu_wr_exec_then_dpp
-= amount
) < 0)
93 valu_wr_exec_then_dpp
= 0;
95 if ((salu_wr_m0_then_lds
-= amount
) < 0)
96 salu_wr_m0_then_lds
= 0;
98 if ((salu_wr_m0_then_moverel
-= amount
) < 0)
99 salu_wr_m0_then_moverel
= 0;
101 if ((setreg_then_getsetreg
-= amount
) < 0)
102 setreg_then_getsetreg
= 0;
104 vmem_store_then_wr_data
.reset();
107 /* setting MODE.vskip and then any vector op requires 2 wait states */
108 int8_t set_vskip_mode_then_vector
= 0;
110 /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
111 int8_t valu_wr_vcc_then_vccz
= 0;
112 int8_t valu_wr_exec_then_execz
= 0;
114 /* VALU writing VCC followed by v_div_fmas require 4 wait states */
115 int8_t valu_wr_vcc_then_div_fmas
= 0;
117 /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
118 int8_t salu_wr_m0_then_gds_msg_ttrace
= 0;
120 /* VALU writing EXEC followed by DPP requires 5 wait states */
121 int8_t valu_wr_exec_then_dpp
= 0;
123 /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
124 int8_t salu_wr_m0_then_lds
= 0;
126 /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
127 int8_t salu_wr_m0_then_moverel
= 0;
129 /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states
130 * currently we don't look at the actual register */
131 int8_t setreg_then_getsetreg
= 0;
133 /* some memory instructions writing >64bit followed by a instructions
134 * writing the VGPRs holding the writedata requires 1 wait state */
135 std::bitset
<256> vmem_store_then_wr_data
;
137 /* we break up SMEM clauses that contain stores or overwrite an
138 * operand/definition of another instruction in the clause */
139 bool smem_clause
= false;
140 bool smem_write
= false;
141 BITSET_DECLARE(smem_clause_read_write
, 128) = {0};
142 BITSET_DECLARE(smem_clause_write
, 128) = {0};
/* Hazard-tracking state for the GFX10 handler: one flag or SGPR set per
 * hazard that may still need a mitigating instruction. */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   /* join() and operator==() both use has_DS, so it has to be declared here */
   bool has_DS = false;
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merges the state reached along another control-flow path into this one;
    * a hazard stays pending if it is pending on either path. */
   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Field-wise equality. Const-qualified so that const contexts can be
    * compared too; non-const callers are unaffected. */
   bool operator==(const NOP_ctx_gfx10 &other) const
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
180 int get_wait_states(aco_ptr
<Instruction
>& instr
)
182 if (instr
->opcode
== aco_opcode::s_nop
)
183 return static_cast<SOPP_instruction
*>(instr
.get())->imm
+ 1;
184 else if (instr
->opcode
== aco_opcode::p_constaddr
)
185 return 3; /* lowered to 3 instructions in the assembler */
190 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
192 return a_reg
> b_reg
?
193 (a_reg
- b_reg
< b_size
) :
194 (b_reg
- a_reg
< a_size
);
197 template <bool Valu
, bool Vintrp
, bool Salu
>
198 int handle_raw_hazard_internal(Program
*program
, Block
*block
,
199 int nops_needed
, PhysReg reg
, uint32_t mask
)
201 unsigned mask_size
= util_last_bit(mask
);
202 for (int pred_idx
= block
->instructions
.size() - 1; pred_idx
>= 0; pred_idx
--) {
203 aco_ptr
<Instruction
>& pred
= block
->instructions
[pred_idx
];
205 uint32_t writemask
= 0;
206 for (Definition
& def
: pred
->definitions
) {
207 if (regs_intersect(reg
, mask_size
, def
.physReg(), def
.size())) {
208 unsigned start
= def
.physReg() > reg
? def
.physReg() - reg
: 0;
209 unsigned end
= MIN2(mask_size
, start
+ def
.size());
210 writemask
|= u_bit_consecutive(start
, end
- start
);
214 bool is_hazard
= writemask
!= 0 &&
215 ((pred
->isVALU() && Valu
) ||
216 (pred
->format
== Format::VINTRP
&& Vintrp
) ||
217 (pred
->isSALU() && Salu
));
222 nops_needed
-= get_wait_states(pred
);
224 if (nops_needed
<= 0 || mask
== 0)
230 /* Loops require branch instructions, which count towards the wait
231 * states. So even with loops this should finish unless nops_needed is some
233 for (unsigned lin_pred
: block
->linear_preds
) {
234 res
= std::max(res
, handle_raw_hazard_internal
<Valu
, Vintrp
, Salu
>(
235 program
, &program
->blocks
[lin_pred
], nops_needed
, reg
, mask
));
240 template <bool Valu
, bool Vintrp
, bool Salu
>
241 void handle_raw_hazard(Program
*program
, Block
*cur_block
, int *NOPs
, int min_states
, Operand op
)
243 if (*NOPs
>= min_states
)
245 int res
= handle_raw_hazard_internal
<Valu
, Vintrp
, Salu
>(program
, cur_block
, min_states
, op
.physReg(), u_bit_consecutive(0, op
.size()));
246 *NOPs
= MAX2(*NOPs
, res
);
249 static auto handle_valu_then_read_hazard
= handle_raw_hazard
<true, true, false>;
250 static auto handle_vintrp_then_read_hazard
= handle_raw_hazard
<false, true, false>;
251 static auto handle_valu_salu_then_read_hazard
= handle_raw_hazard
<true, true, true>;
253 void set_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
254 unsigned end
= start
+ size
- 1;
255 unsigned start_mod
= start
% BITSET_WORDBITS
;
256 if (start_mod
+ size
<= BITSET_WORDBITS
) {
257 BITSET_SET_RANGE(words
, start
, end
);
259 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
260 set_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
);
261 set_bitset_range(words
, start
+ first_size
, size
- first_size
);
265 bool test_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
266 unsigned end
= start
+ size
- 1;
267 unsigned start_mod
= start
% BITSET_WORDBITS
;
268 if (start_mod
+ size
<= BITSET_WORDBITS
) {
269 return BITSET_TEST_RANGE(words
, start
, end
);
271 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
272 return test_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
) ||
273 test_bitset_range(words
, start
+ first_size
, size
- first_size
);
277 /* A SMEM clause is any group of consecutive SMEM instructions. The
278 * instructions in this group may return out of order and/or may be replayed.
280 * To fix this potential hazard correctly, we have to make sure that when a
281 * clause has more than one instruction, no instruction in the clause writes
282 * to a register that is read by another instruction in the clause (including
283 * itself). In this case, we have to break the SMEM clause by inserting non
286 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
288 void handle_smem_clause_hazards(Program
*program
, NOP_ctx_gfx6
&ctx
,
289 aco_ptr
<Instruction
>& instr
, int *NOPs
)
291 /* break off from previous SMEM clause if needed */
292 if (!*NOPs
& (ctx
.smem_clause
|| ctx
.smem_write
)) {
293 /* Don't allow clauses with store instructions since the clause's
294 * instructions may use the same address. */
295 if (ctx
.smem_write
|| instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
297 } else if (program
->xnack_enabled
) {
298 for (Operand op
: instr
->operands
) {
299 if (!op
.isConstant() && test_bitset_range(ctx
.smem_clause_write
, op
.physReg(), op
.size())) {
305 Definition def
= instr
->definitions
[0];
306 if (!*NOPs
&& test_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size()))
312 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
313 void handle_instruction_gfx6(Program
*program
, Block
*cur_block
, NOP_ctx_gfx6
&ctx
,
314 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
319 if (instr
->format
== Format::SMEM
) {
320 if (program
->chip_class
== GFX6
) {
321 /* A read of an SGPR by SMRD instruction requires 4 wait states
322 * when the SGPR was written by a VALU instruction. According to LLVM,
323 * there is also an undocumented hardware behavior when the buffer
324 * descriptor is written by a SALU instruction */
325 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
326 Operand op
= instr
->operands
[i
];
330 bool is_buffer_desc
= i
== 0 && op
.size() > 2;
332 handle_valu_salu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
334 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
338 handle_smem_clause_hazards(program
, ctx
, instr
, &NOPs
);
339 } else if (instr
->isSALU()) {
340 if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
||
341 instr
->opcode
== aco_opcode::s_getreg_b32
) {
342 NOPs
= MAX2(NOPs
, ctx
.setreg_then_getsetreg
);
345 if (program
->chip_class
== GFX9
) {
346 if (instr
->opcode
== aco_opcode::s_movrels_b32
|| instr
->opcode
== aco_opcode::s_movrels_b64
||
347 instr
->opcode
== aco_opcode::s_movreld_b32
|| instr
->opcode
== aco_opcode::s_movreld_b64
) {
348 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_moverel
);
352 if (instr
->opcode
== aco_opcode::s_sendmsg
|| instr
->opcode
== aco_opcode::s_ttracedata
)
353 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
354 } else if (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
) {
355 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
356 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
357 for (Operand op
: instr
->operands
) {
358 if (op
.physReg() == vccz
)
359 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_vccz
);
360 if (op
.physReg() == execz
)
361 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_execz
);
364 if (instr
->isDPP()) {
365 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_dpp
);
366 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 2, instr
->operands
[0]);
369 for (Definition def
: instr
->definitions
) {
370 if (def
.regClass().type() != RegType::sgpr
) {
371 for (unsigned i
= 0; i
< def
.size(); i
++)
372 NOPs
= MAX2(NOPs
, ctx
.vmem_store_then_wr_data
[(def
.physReg() & 0xff) + i
]);
376 if ((instr
->opcode
== aco_opcode::v_readlane_b32
||
377 instr
->opcode
== aco_opcode::v_readlane_b32_e64
||
378 instr
->opcode
== aco_opcode::v_writelane_b32
||
379 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) &&
380 !instr
->operands
[1].isConstant()) {
381 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, instr
->operands
[1]);
384 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
385 * is followed by a read with v_readfirstlane or v_readlane to fix GPU
386 * hangs on GFX6. Note that v_writelane_* is apparently not affected.
387 * This hazard isn't documented anywhere but AMD confirmed that hazard.
389 if (program
->chip_class
== GFX6
&&
390 (instr
->opcode
== aco_opcode::v_readlane_b32
|| /* GFX6 doesn't have v_readlane_b32_e64 */
391 instr
->opcode
== aco_opcode::v_readfirstlane_b32
)) {
392 handle_vintrp_then_read_hazard(program
, cur_block
, &NOPs
, 1, instr
->operands
[0]);
395 if (instr
->opcode
== aco_opcode::v_div_fmas_f32
|| instr
->opcode
== aco_opcode::v_div_fmas_f64
)
396 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_div_fmas
);
397 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
398 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
399 for (Operand op
: instr
->operands
) {
400 if (!op
.isConstant() && !op
.isUndefined() && op
.regClass().type() == RegType::sgpr
)
401 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 5, op
);
405 if (!instr
->isSALU() && instr
->format
!= Format::SMEM
)
406 NOPs
= MAX2(NOPs
, ctx
.set_vskip_mode_then_vector
);
408 if (program
->chip_class
== GFX9
) {
409 bool lds_scratch_global
= (instr
->format
== Format::SCRATCH
|| instr
->format
== Format::GLOBAL
) &&
410 static_cast<FLAT_instruction
*>(instr
.get())->lds
;
411 if (instr
->format
== Format::VINTRP
||
412 instr
->opcode
== aco_opcode::ds_read_addtid_b32
||
413 instr
->opcode
== aco_opcode::ds_write_addtid_b32
||
414 instr
->opcode
== aco_opcode::buffer_store_lds_dword
||
415 lds_scratch_global
) {
416 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_lds
);
420 ctx
.add_wait_states(NOPs
+ get_wait_states(instr
));
422 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
425 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
428 new_instructions
.emplace_back(std::move(nop
));
431 /* update information to check for later hazards */
432 if ((ctx
.smem_clause
|| ctx
.smem_write
) && (NOPs
|| instr
->format
!= Format::SMEM
)) {
433 ctx
.smem_clause
= false;
434 ctx
.smem_write
= false;
436 if (program
->xnack_enabled
) {
437 BITSET_ZERO(ctx
.smem_clause_read_write
);
438 BITSET_ZERO(ctx
.smem_clause_write
);
442 if (instr
->format
== Format::SMEM
) {
443 if (instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
444 ctx
.smem_write
= true;
446 ctx
.smem_clause
= true;
448 if (program
->xnack_enabled
) {
449 for (Operand op
: instr
->operands
) {
450 if (!op
.isConstant()) {
451 set_bitset_range(ctx
.smem_clause_read_write
, op
.physReg(), op
.size());
455 Definition def
= instr
->definitions
[0];
456 set_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size());
457 set_bitset_range(ctx
.smem_clause_write
, def
.physReg(), def
.size());
460 } else if (instr
->isVALU()) {
461 for (Definition def
: instr
->definitions
) {
462 if (def
.regClass().type() == RegType::sgpr
) {
463 if (def
.physReg() == vcc
|| def
.physReg() == vcc_hi
) {
464 ctx
.valu_wr_vcc_then_vccz
= 5;
465 ctx
.valu_wr_vcc_then_div_fmas
= 4;
467 if (def
.physReg() == exec
|| def
.physReg() == exec_hi
) {
468 ctx
.valu_wr_exec_then_execz
= 5;
469 ctx
.valu_wr_exec_then_dpp
= 5;
473 } else if (instr
->isSALU() && !instr
->definitions
.empty()) {
474 if (!instr
->definitions
.empty()) {
475 /* all other definitions should be SCC */
476 Definition def
= instr
->definitions
[0];
477 if (def
.physReg() == m0
) {
478 ctx
.salu_wr_m0_then_gds_msg_ttrace
= 1;
479 ctx
.salu_wr_m0_then_lds
= 1;
480 ctx
.salu_wr_m0_then_moverel
= 1;
482 } else if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
) {
483 SOPK_instruction
*sopk
= static_cast<SOPK_instruction
*>(instr
.get());
484 unsigned offset
= (sopk
->imm
>> 6) & 0x1f;
485 unsigned size
= ((sopk
->imm
>> 11) & 0x1f) + 1;
486 unsigned reg
= sopk
->imm
& 0x3f;
487 ctx
.setreg_then_getsetreg
= 2;
489 if (reg
== 1 && offset
>= 28 && size
> (28 - offset
))
490 ctx
.set_vskip_mode_then_vector
= 2;
492 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
493 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
494 bool consider_buf
= (instr
->format
== Format::MUBUF
|| instr
->format
== Format::MTBUF
) &&
495 instr
->operands
.size() == 4 &&
496 instr
->operands
[3].size() > 2 &&
497 instr
->operands
[2].physReg() >= 128;
498 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
499 bool consider_mimg
= instr
->format
== Format::MIMG
&&
500 instr
->operands
[1].regClass().type() == RegType::vgpr
&&
501 instr
->operands
[1].size() > 2 &&
502 instr
->operands
[0].size() == 4;
503 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
504 bool consider_flat
= (instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) &&
505 instr
->operands
.size() == 3 &&
506 instr
->operands
[2].size() > 2;
507 if (consider_buf
|| consider_mimg
|| consider_flat
) {
508 PhysReg wrdata
= instr
->operands
[consider_flat
? 2 : 3].physReg();
509 unsigned size
= instr
->operands
[consider_flat
? 2 : 3].size();
510 for (unsigned i
= 0; i
< size
; i
++)
511 ctx
.vmem_store_then_wr_data
[(wrdata
& 0xff) + i
] = 1;
516 template <std::size_t N
>
517 bool check_written_regs(const aco_ptr
<Instruction
> &instr
, const std::bitset
<N
> &check_regs
)
519 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [&check_regs
](const Definition
&def
) -> bool {
520 bool writes_any
= false;
521 for (unsigned i
= 0; i
< def
.size(); i
++) {
522 unsigned def_reg
= def
.physReg() + i
;
523 writes_any
|= def_reg
< check_regs
.size() && check_regs
[def_reg
];
529 template <std::size_t N
>
530 void mark_read_regs(const aco_ptr
<Instruction
> &instr
, std::bitset
<N
> ®_reads
)
532 for (const Operand
&op
: instr
->operands
) {
533 for (unsigned i
= 0; i
< op
.size(); i
++) {
534 unsigned reg
= op
.physReg() + i
;
535 if (reg
< reg_reads
.size())
541 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
543 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
545 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
547 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
||
548 instr
->opcode
== aco_opcode::v_readlane_b32
||
549 instr
->opcode
== aco_opcode::v_readlane_b32_e64
)
554 bool instr_writes_exec(const aco_ptr
<Instruction
>& instr
)
556 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
557 return def
.physReg() == exec_lo
|| def
.physReg() == exec_hi
;
561 bool instr_writes_sgpr(const aco_ptr
<Instruction
>& instr
)
563 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
564 return def
.getTemp().type() == RegType::sgpr
;
568 inline bool instr_is_branch(const aco_ptr
<Instruction
>& instr
)
570 return instr
->opcode
== aco_opcode::s_branch
||
571 instr
->opcode
== aco_opcode::s_cbranch_scc0
||
572 instr
->opcode
== aco_opcode::s_cbranch_scc1
||
573 instr
->opcode
== aco_opcode::s_cbranch_vccz
||
574 instr
->opcode
== aco_opcode::s_cbranch_vccnz
||
575 instr
->opcode
== aco_opcode::s_cbranch_execz
||
576 instr
->opcode
== aco_opcode::s_cbranch_execnz
||
577 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys
||
578 instr
->opcode
== aco_opcode::s_cbranch_cdbguser
||
579 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_or_user
||
580 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_and_user
||
581 instr
->opcode
== aco_opcode::s_subvector_loop_begin
||
582 instr
->opcode
== aco_opcode::s_subvector_loop_end
||
583 instr
->opcode
== aco_opcode::s_setpc_b64
||
584 instr
->opcode
== aco_opcode::s_swappc_b64
||
585 instr
->opcode
== aco_opcode::s_getpc_b64
||
586 instr
->opcode
== aco_opcode::s_call_b64
;
589 void handle_instruction_gfx10(Program
*program
, Block
*cur_block
, NOP_ctx_gfx10
&ctx
,
590 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
592 //TODO: s_dcache_inv needs to be in it's own group on GFX10
594 /* VMEMtoScalarWriteHazard
595 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
597 if (instr
->isVMEM() || instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
||
598 instr
->format
== Format::SCRATCH
|| instr
->format
== Format::DS
) {
599 /* Remember all SGPRs that are read by the VMEM instruction */
600 mark_read_regs(instr
, ctx
.sgprs_read_by_VMEM
);
601 ctx
.sgprs_read_by_VMEM
.set(exec
);
602 if (program
->wave_size
== 64)
603 ctx
.sgprs_read_by_VMEM
.set(exec_hi
);
604 } else if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
605 if (instr
->opcode
== aco_opcode::s_waitcnt
) {
606 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
607 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
608 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
610 ctx
.sgprs_read_by_VMEM
.reset();
611 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
612 /* Hazard is mitigated by a s_waitcnt_depctr with a magic imm */
613 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
614 if (sopp
->imm
== 0xffe3)
615 ctx
.sgprs_read_by_VMEM
.reset();
618 /* Check if SALU writes an SGPR that was previously read by the VALU */
619 if (check_written_regs(instr
, ctx
.sgprs_read_by_VMEM
)) {
620 ctx
.sgprs_read_by_VMEM
.reset();
622 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
623 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 0)};
624 depctr
->imm
= 0xffe3;
626 new_instructions
.emplace_back(std::move(depctr
));
628 } else if (instr
->isVALU()) {
629 /* Hazard is mitigated by any VALU instruction */
630 ctx
.sgprs_read_by_VMEM
.reset();
633 /* VcmpxPermlaneHazard
634 * Handle any permlane following a VOPC instruction, insert v_mov between them.
636 if (instr
->format
== Format::VOPC
) {
638 } else if (ctx
.has_VOPC
&&
639 (instr
->opcode
== aco_opcode::v_permlane16_b32
||
640 instr
->opcode
== aco_opcode::v_permlanex16_b32
)) {
641 ctx
.has_VOPC
= false;
643 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
644 aco_ptr
<VOP1_instruction
> v_mov
{create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1)};
645 v_mov
->definitions
[0] = Definition(instr
->operands
[0].physReg(), v1
);
646 v_mov
->operands
[0] = Operand(instr
->operands
[0].physReg(), v1
);
647 new_instructions
.emplace_back(std::move(v_mov
));
648 } else if (instr
->isVALU() && instr
->opcode
!= aco_opcode::v_nop
) {
649 ctx
.has_VOPC
= false;
652 /* VcmpxExecWARHazard
653 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
655 if (!instr
->isVALU() && instr
->reads_exec()) {
656 ctx
.has_nonVALU_exec_read
= true;
657 } else if (instr
->isVALU()) {
658 if (instr_writes_exec(instr
)) {
659 ctx
.has_nonVALU_exec_read
= false;
661 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
662 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 0)};
663 depctr
->imm
= 0xfffe;
665 new_instructions
.emplace_back(std::move(depctr
));
666 } else if (instr_writes_sgpr(instr
)) {
667 /* Any VALU instruction that writes an SGPR mitigates the problem */
668 ctx
.has_nonVALU_exec_read
= false;
670 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
671 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
672 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
673 if ((sopp
->imm
& 0xfffe) == 0xfffe)
674 ctx
.has_nonVALU_exec_read
= false;
677 /* SMEMtoVectorWriteHazard
678 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
680 if (instr
->format
== Format::SMEM
) {
681 /* Remember all SGPRs that are read by the SMEM instruction */
682 mark_read_regs(instr
, ctx
.sgprs_read_by_SMEM
);
683 } else if (VALU_writes_sgpr(instr
)) {
684 /* Check if VALU writes an SGPR that was previously read by SMEM */
685 if (check_written_regs(instr
, ctx
.sgprs_read_by_SMEM
)) {
686 ctx
.sgprs_read_by_SMEM
.reset();
688 /* Insert s_mov to mitigate the problem */
689 aco_ptr
<SOP1_instruction
> s_mov
{create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1)};
690 s_mov
->definitions
[0] = Definition(sgpr_null
, s1
);
691 s_mov
->operands
[0] = Operand(0u);
692 new_instructions
.emplace_back(std::move(s_mov
));
694 } else if (instr
->isSALU()) {
695 if (instr
->format
!= Format::SOPP
) {
696 /* SALU can mitigate the hazard */
697 ctx
.sgprs_read_by_SMEM
.reset();
699 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
700 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
701 if (sopp
->opcode
== aco_opcode::s_waitcnt_lgkmcnt
) {
702 if (sopp
->imm
== 0 && sopp
->definitions
[0].physReg() == sgpr_null
)
703 ctx
.sgprs_read_by_SMEM
.reset();
704 } else if (sopp
->opcode
== aco_opcode::s_waitcnt
) {
705 unsigned lgkm
= (sopp
->imm
>> 8) & 0x3f;
707 ctx
.sgprs_read_by_SMEM
.reset();
712 /* LdsBranchVmemWARHazard
713 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
715 if (instr
->isVMEM() || instr
->format
== Format::GLOBAL
|| instr
->format
== Format::SCRATCH
) {
717 ctx
.has_branch_after_VMEM
= false;
718 /* Mitigation for DS is needed only if there was already a branch after */
719 ctx
.has_DS
= ctx
.has_branch_after_DS
;
720 } else if (instr
->format
== Format::DS
) {
722 ctx
.has_branch_after_DS
= false;
723 /* Mitigation for VMEM is needed only if there was already a branch after */
724 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
;
725 } else if (instr_is_branch(instr
)) {
726 ctx
.has_branch_after_VMEM
= ctx
.has_VMEM
;
727 ctx
.has_branch_after_DS
= ctx
.has_DS
;
728 } else if (instr
->opcode
== aco_opcode::s_waitcnt_vscnt
) {
729 /* Only s_waitcnt_vscnt can mitigate the hazard */
730 const SOPK_instruction
*sopk
= static_cast<const SOPK_instruction
*>(instr
.get());
731 if (sopk
->definitions
[0].physReg() == sgpr_null
&& sopk
->imm
== 0)
732 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
734 if ((ctx
.has_VMEM
&& ctx
.has_branch_after_DS
) || (ctx
.has_DS
&& ctx
.has_branch_after_VMEM
)) {
735 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
737 /* Insert s_waitcnt_vscnt to mitigate the problem */
738 aco_ptr
<SOPK_instruction
> wait
{create_instruction
<SOPK_instruction
>(aco_opcode::s_waitcnt_vscnt
, Format::SOPK
, 0, 1)};
739 wait
->definitions
[0] = Definition(sgpr_null
, s1
);
741 new_instructions
.emplace_back(std::move(wait
));
745 template <typename Ctx
>
746 using HandleInstr
= void (*)(Program
*, Block
*block
, Ctx
&, aco_ptr
<Instruction
>&,
747 std::vector
<aco_ptr
<Instruction
>>&);
749 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
750 void handle_block(Program
*program
, Ctx
& ctx
, Block
& block
)
752 if (block
.instructions
.empty())
755 std::vector
<aco_ptr
<Instruction
>> old_instructions
= std::move(block
.instructions
);
757 block
.instructions
.reserve(block
.instructions
.size());
759 for (aco_ptr
<Instruction
>& instr
: old_instructions
) {
760 Handle(program
, &block
, ctx
, instr
, block
.instructions
);
761 block
.instructions
.emplace_back(std::move(instr
));
765 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
766 void mitigate_hazards(Program
*program
)
768 std::vector
<Ctx
> all_ctx(program
->blocks
.size());
769 std::stack
<unsigned> loop_header_indices
;
771 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
772 Block
& block
= program
->blocks
[i
];
773 Ctx
&ctx
= all_ctx
[i
];
775 if (block
.kind
& block_kind_loop_header
) {
776 loop_header_indices
.push(i
);
777 } else if (block
.kind
& block_kind_loop_exit
) {
778 /* Go through the whole loop again */
779 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
781 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
782 loop_block_ctx
.join(all_ctx
[b
]);
784 handle_block
<Ctx
, Handle
>(program
, loop_block_ctx
, program
->blocks
[idx
]);
786 /* We only need to continue if the loop header context changed */
787 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
790 all_ctx
[idx
] = loop_block_ctx
;
793 loop_header_indices
.pop();
796 for (unsigned b
: block
.linear_preds
)
797 ctx
.join(all_ctx
[b
]);
799 handle_block
<Ctx
, Handle
>(program
, ctx
, block
);
803 } /* end namespace */
805 void insert_NOPs(Program
* program
)
807 if (program
->chip_class
>= GFX10_3
)
808 ; /* no hazards/bugs to mitigate */
809 else if (program
->chip_class
>= GFX10
)
810 mitigate_hazards
<NOP_ctx_gfx10
, handle_instruction_gfx10
>(program
);
812 mitigate_hazards
<NOP_ctx_gfx6
, handle_instruction_gfx6
>(program
);