/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
37 void join(const NOP_ctx_gfx6
&other
) {
38 set_vskip_mode_then_vector
= MAX2(set_vskip_mode_then_vector
, other
.set_vskip_mode_then_vector
);
39 valu_wr_vcc_then_vccz
= MAX2(valu_wr_vcc_then_vccz
, other
.valu_wr_vcc_then_vccz
);
40 valu_wr_exec_then_execz
= MAX2(valu_wr_exec_then_execz
, other
.valu_wr_exec_then_execz
);
41 valu_wr_vcc_then_div_fmas
= MAX2(valu_wr_vcc_then_div_fmas
, other
.valu_wr_vcc_then_div_fmas
);
42 salu_wr_m0_then_gds_msg_ttrace
= MAX2(salu_wr_m0_then_gds_msg_ttrace
, other
.salu_wr_m0_then_gds_msg_ttrace
);
43 valu_wr_exec_then_dpp
= MAX2(valu_wr_exec_then_dpp
, other
.valu_wr_exec_then_dpp
);
44 salu_wr_m0_then_lds
= MAX2(salu_wr_m0_then_lds
, other
.salu_wr_m0_then_lds
);
45 salu_wr_m0_then_moverel
= MAX2(salu_wr_m0_then_moverel
, other
.salu_wr_m0_then_moverel
);
46 setreg_then_getsetreg
= MAX2(setreg_then_getsetreg
, other
.setreg_then_getsetreg
);
47 vmem_store_then_wr_data
|= other
.vmem_store_then_wr_data
;
48 smem_clause
|= other
.smem_clause
;
49 smem_write
|= other
.smem_write
;
50 for (unsigned i
= 0; i
< BITSET_WORDS(128); i
++) {
51 smem_clause_read_write
[i
] |= other
.smem_clause_read_write
[i
];
52 smem_clause_write
[i
] |= other
.smem_clause_write
[i
];
56 bool operator==(const NOP_ctx_gfx6
&other
)
59 set_vskip_mode_then_vector
== other
.set_vskip_mode_then_vector
&&
60 valu_wr_vcc_then_vccz
== other
.valu_wr_vcc_then_vccz
&&
61 valu_wr_exec_then_execz
== other
.valu_wr_exec_then_execz
&&
62 valu_wr_vcc_then_div_fmas
== other
.valu_wr_vcc_then_div_fmas
&&
63 vmem_store_then_wr_data
== other
.vmem_store_then_wr_data
&&
64 salu_wr_m0_then_gds_msg_ttrace
== other
.salu_wr_m0_then_gds_msg_ttrace
&&
65 valu_wr_exec_then_dpp
== other
.valu_wr_exec_then_dpp
&&
66 salu_wr_m0_then_lds
== other
.salu_wr_m0_then_lds
&&
67 salu_wr_m0_then_moverel
== other
.salu_wr_m0_then_moverel
&&
68 setreg_then_getsetreg
== other
.setreg_then_getsetreg
&&
69 smem_clause
== other
.smem_clause
&&
70 smem_write
== other
.smem_write
&&
71 BITSET_EQUAL(smem_clause_read_write
, other
.smem_clause_read_write
) &&
72 BITSET_EQUAL(smem_clause_write
, other
.smem_clause_write
);
75 void add_wait_states(unsigned amount
)
77 if ((set_vskip_mode_then_vector
-= amount
) < 0)
78 set_vskip_mode_then_vector
= 0;
80 if ((valu_wr_vcc_then_vccz
-= amount
) < 0)
81 valu_wr_vcc_then_vccz
= 0;
83 if ((valu_wr_exec_then_execz
-= amount
) < 0)
84 valu_wr_exec_then_execz
= 0;
86 if ((valu_wr_vcc_then_div_fmas
-= amount
) < 0)
87 valu_wr_vcc_then_div_fmas
= 0;
89 if ((salu_wr_m0_then_gds_msg_ttrace
-= amount
) < 0)
90 salu_wr_m0_then_gds_msg_ttrace
= 0;
92 if ((valu_wr_exec_then_dpp
-= amount
) < 0)
93 valu_wr_exec_then_dpp
= 0;
95 if ((salu_wr_m0_then_lds
-= amount
) < 0)
96 salu_wr_m0_then_lds
= 0;
98 if ((salu_wr_m0_then_moverel
-= amount
) < 0)
99 salu_wr_m0_then_moverel
= 0;
101 if ((setreg_then_getsetreg
-= amount
) < 0)
102 setreg_then_getsetreg
= 0;
104 vmem_store_then_wr_data
.reset();
107 /* setting MODE.vskip and then any vector op requires 2 wait states */
108 int8_t set_vskip_mode_then_vector
= 0;
110 /* VALU writing VCC/EXEC and then a VALU reading VCCZ/EXECZ requires 5 wait states */
111 int8_t valu_wr_vcc_then_vccz
= 0;
112 int8_t valu_wr_exec_then_execz
= 0;
114 /* VALU writing VCC followed by v_div_fmas require 4 wait states */
115 int8_t valu_wr_vcc_then_div_fmas
= 0;
117 /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
118 int8_t salu_wr_m0_then_gds_msg_ttrace
= 0;
120 /* VALU writing EXEC followed by DPP requires 5 wait states */
121 int8_t valu_wr_exec_then_dpp
= 0;
123 /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
124 int8_t salu_wr_m0_then_lds
= 0;
126 /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
127 int8_t salu_wr_m0_then_moverel
= 0;
129 /* s_setreg followed by a s_getreg/s_setreg of the same register needs 2 wait states
130 * currently we don't look at the actual register */
131 int8_t setreg_then_getsetreg
= 0;
133 /* some memory instructions writing >64bit followed by a instructions
134 * writing the VGPRs holding the writedata requires 1 wait state */
135 std::bitset
<256> vmem_store_then_wr_data
;
137 /* we break up SMEM clauses that contain stores or overwrite an
138 * operand/definition of another instruction in the clause */
139 bool smem_clause
= false;
140 bool smem_write
= false;
141 BITSET_DECLARE(smem_clause_read_write
, 128) = {0};
142 BITSET_DECLARE(smem_clause_write
, 128) = {0};
/* Hazard-tracking state for the GFX10 mitigation pass. */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   /* NOTE(review): this member is referenced by join()/operator== below but
    * its declaration was lost in the text this was restored from. */
   bool has_DS = false;
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merge a predecessor's hazard state: flags and register sets are unioned. */
   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Equality of the full hazard state; used for the loop fixed-point test. */
   bool operator==(const NOP_ctx_gfx10 &other)
   {
      return has_VOPC == other.has_VOPC &&
             has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
             has_VMEM == other.has_VMEM &&
             has_branch_after_VMEM == other.has_branch_after_VMEM &&
             has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
180 int get_wait_states(aco_ptr
<Instruction
>& instr
)
182 if (instr
->opcode
== aco_opcode::s_nop
)
183 return static_cast<SOPP_instruction
*>(instr
.get())->imm
+ 1;
184 else if (instr
->opcode
== aco_opcode::p_constaddr
)
185 return 3; /* lowered to 3 instructions in the assembler */
190 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
192 return a_reg
> b_reg
?
193 (a_reg
- b_reg
< b_size
) :
194 (b_reg
- a_reg
< a_size
);
197 template <bool Valu
, bool Vintrp
, bool Salu
>
198 int handle_raw_hazard_internal(Program
*program
, Block
*block
,
199 int nops_needed
, PhysReg reg
, uint32_t mask
)
201 unsigned mask_size
= util_last_bit(mask
);
202 for (int pred_idx
= block
->instructions
.size() - 1; pred_idx
>= 0; pred_idx
--) {
203 aco_ptr
<Instruction
>& pred
= block
->instructions
[pred_idx
];
205 uint32_t writemask
= 0;
206 for (Definition
& def
: pred
->definitions
) {
207 if (regs_intersect(reg
, mask_size
, def
.physReg(), def
.size())) {
208 unsigned start
= def
.physReg() > reg
? def
.physReg() - reg
: 0;
209 unsigned end
= MIN2(mask_size
, start
+ def
.size());
210 writemask
|= u_bit_consecutive(start
, end
- start
);
214 bool is_hazard
= writemask
!= 0 &&
215 ((pred
->isVALU() && Valu
) ||
216 (pred
->format
== Format::VINTRP
&& Vintrp
) ||
217 (pred
->isSALU() && Salu
));
222 nops_needed
-= get_wait_states(pred
);
224 if (nops_needed
<= 0 || mask
== 0)
230 /* Loops require branch instructions, which count towards the wait
231 * states. So even with loops this should finish unless nops_needed is some
233 for (unsigned lin_pred
: block
->linear_preds
) {
234 res
= std::max(res
, handle_raw_hazard_internal
<Valu
, Vintrp
, Salu
>(
235 program
, &program
->blocks
[lin_pred
], nops_needed
, reg
, mask
));
240 template <bool Valu
, bool Vintrp
, bool Salu
>
241 void handle_raw_hazard(Program
*program
, Block
*cur_block
, int *NOPs
, int min_states
, Operand op
)
243 if (*NOPs
>= min_states
)
245 int res
= handle_raw_hazard_internal
<Valu
, Vintrp
, Salu
>(program
, cur_block
, min_states
, op
.physReg(), u_bit_consecutive(0, op
.size()));
246 *NOPs
= MAX2(*NOPs
, res
);
249 static auto handle_valu_then_read_hazard
= handle_raw_hazard
<true, true, false>;
250 static auto handle_vintrp_then_read_hazard
= handle_raw_hazard
<false, true, false>;
251 static auto handle_valu_salu_then_read_hazard
= handle_raw_hazard
<true, true, true>;
253 void set_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
254 unsigned end
= start
+ size
- 1;
255 unsigned start_mod
= start
% BITSET_WORDBITS
;
256 if (start_mod
+ size
<= BITSET_WORDBITS
) {
257 BITSET_SET_RANGE(words
, start
, end
);
259 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
260 set_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
);
261 set_bitset_range(words
, start
+ first_size
, size
- first_size
);
265 bool test_bitset_range(BITSET_WORD
*words
, unsigned start
, unsigned size
) {
266 unsigned end
= start
+ size
- 1;
267 unsigned start_mod
= start
% BITSET_WORDBITS
;
268 if (start_mod
+ size
<= BITSET_WORDBITS
) {
269 return BITSET_TEST_RANGE(words
, start
, end
);
271 unsigned first_size
= BITSET_WORDBITS
- start_mod
;
272 return test_bitset_range(words
, start
, BITSET_WORDBITS
- start_mod
) ||
273 test_bitset_range(words
, start
+ first_size
, size
- first_size
);
277 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
278 void handle_instruction_gfx6(Program
*program
, Block
*cur_block
, NOP_ctx_gfx6
&ctx
,
279 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
284 if (instr
->format
== Format::SMEM
) {
285 if (program
->chip_class
== GFX6
) {
286 /* A read of an SGPR by SMRD instruction requires 4 wait states
287 * when the SGPR was written by a VALU instruction. According to LLVM,
288 * there is also an undocumented hardware behavior when the buffer
289 * descriptor is written by a SALU instruction */
290 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
291 Operand op
= instr
->operands
[i
];
295 bool is_buffer_desc
= i
== 0 && op
.size() > 2;
297 handle_valu_salu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
299 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, op
);
303 /* break off from prevous SMEM clause if needed */
304 if (!NOPs
& (ctx
.smem_clause
|| ctx
.smem_write
)) {
305 /* Don't allow clauses with store instructions since the clause's
306 * instructions may use the same address. */
307 if (ctx
.smem_write
|| instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
310 for (Operand op
: instr
->operands
) {
311 if (!op
.isConstant() && test_bitset_range(ctx
.smem_clause_write
, op
.physReg(), op
.size())) {
316 Definition def
= instr
->definitions
[0];
317 if (!NOPs
&& test_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size()))
321 } else if (instr
->isSALU()) {
322 if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
||
323 instr
->opcode
== aco_opcode::s_getreg_b32
) {
324 NOPs
= MAX2(NOPs
, ctx
.setreg_then_getsetreg
);
327 if (program
->chip_class
== GFX9
) {
328 if (instr
->opcode
== aco_opcode::s_movrels_b32
|| instr
->opcode
== aco_opcode::s_movrels_b64
||
329 instr
->opcode
== aco_opcode::s_movreld_b32
|| instr
->opcode
== aco_opcode::s_movreld_b64
) {
330 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_moverel
);
334 if (instr
->opcode
== aco_opcode::s_sendmsg
|| instr
->opcode
== aco_opcode::s_ttracedata
)
335 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
336 } else if (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
) {
337 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_gds_msg_ttrace
);
338 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
339 for (Operand op
: instr
->operands
) {
340 if (op
.physReg() == vccz
)
341 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_vccz
);
342 if (op
.physReg() == execz
)
343 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_execz
);
346 if (instr
->isDPP()) {
347 NOPs
= MAX2(NOPs
, ctx
.valu_wr_exec_then_dpp
);
348 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 2, instr
->operands
[0]);
351 for (Definition def
: instr
->definitions
) {
352 if (def
.regClass().type() != RegType::sgpr
) {
353 for (unsigned i
= 0; i
< def
.size(); i
++)
354 NOPs
= MAX2(NOPs
, ctx
.vmem_store_then_wr_data
[(def
.physReg() & 0xff) + i
]);
358 if ((instr
->opcode
== aco_opcode::v_readlane_b32
||
359 instr
->opcode
== aco_opcode::v_readlane_b32_e64
||
360 instr
->opcode
== aco_opcode::v_writelane_b32
||
361 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) &&
362 !instr
->operands
[1].isConstant()) {
363 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 4, instr
->operands
[1]);
366 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
367 * is followed by a read with v_readfirstlane or v_readlane to fix GPU
368 * hangs on GFX6. Note that v_writelane_* is apparently not affected.
369 * This hazard isn't documented anywhere but AMD confirmed that hazard.
371 if (program
->chip_class
== GFX6
&&
372 (instr
->opcode
== aco_opcode::v_readlane_b32
|| /* GFX6 doesn't have v_readlane_b32_e64 */
373 instr
->opcode
== aco_opcode::v_readfirstlane_b32
)) {
374 handle_vintrp_then_read_hazard(program
, cur_block
, &NOPs
, 1, instr
->operands
[0]);
377 if (instr
->opcode
== aco_opcode::v_div_fmas_f32
|| instr
->opcode
== aco_opcode::v_div_fmas_f64
)
378 NOPs
= MAX2(NOPs
, ctx
.valu_wr_vcc_then_div_fmas
);
379 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
380 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
381 for (Operand op
: instr
->operands
) {
382 if (!op
.isConstant() && !op
.isUndefined() && op
.regClass().type() == RegType::sgpr
)
383 handle_valu_then_read_hazard(program
, cur_block
, &NOPs
, 5, op
);
387 if (!instr
->isSALU() && instr
->format
!= Format::SMEM
)
388 NOPs
= MAX2(NOPs
, ctx
.set_vskip_mode_then_vector
);
390 if (program
->chip_class
== GFX9
) {
391 bool lds_scratch_global
= (instr
->format
== Format::SCRATCH
|| instr
->format
== Format::GLOBAL
) &&
392 static_cast<FLAT_instruction
*>(instr
.get())->lds
;
393 if (instr
->format
== Format::VINTRP
||
394 instr
->opcode
== aco_opcode::ds_read_addtid_b32
||
395 instr
->opcode
== aco_opcode::ds_write_addtid_b32
||
396 instr
->opcode
== aco_opcode::buffer_store_lds_dword
||
397 lds_scratch_global
) {
398 NOPs
= MAX2(NOPs
, ctx
.salu_wr_m0_then_lds
);
402 ctx
.add_wait_states(NOPs
+ get_wait_states(instr
));
404 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
407 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
410 new_instructions
.emplace_back(std::move(nop
));
413 /* update information to check for later hazards */
414 if ((ctx
.smem_clause
|| ctx
.smem_write
) && (NOPs
|| instr
->format
!= Format::SMEM
)) {
415 ctx
.smem_clause
= false;
416 ctx
.smem_write
= false;
417 BITSET_ZERO(ctx
.smem_clause_read_write
);
418 BITSET_ZERO(ctx
.smem_clause_write
);
421 if (instr
->format
== Format::SMEM
) {
422 if (instr
->definitions
.empty() || instr_info
.is_atomic
[(unsigned)instr
->opcode
]) {
423 ctx
.smem_write
= true;
425 ctx
.smem_clause
= true;
427 for (Operand op
: instr
->operands
) {
428 if (!op
.isConstant()) {
429 set_bitset_range(ctx
.smem_clause_read_write
, op
.physReg(), op
.size());
433 Definition def
= instr
->definitions
[0];
434 set_bitset_range(ctx
.smem_clause_read_write
, def
.physReg(), def
.size());
435 set_bitset_range(ctx
.smem_clause_write
, def
.physReg(), def
.size());
437 } else if (instr
->isVALU()) {
438 for (Definition def
: instr
->definitions
) {
439 if (def
.regClass().type() == RegType::sgpr
) {
440 if (def
.physReg() == vcc
|| def
.physReg() == vcc_hi
) {
441 ctx
.valu_wr_vcc_then_vccz
= 5;
442 ctx
.valu_wr_vcc_then_div_fmas
= 4;
444 if (def
.physReg() == exec
|| def
.physReg() == exec_hi
) {
445 ctx
.valu_wr_exec_then_execz
= 5;
446 ctx
.valu_wr_exec_then_dpp
= 5;
450 } else if (instr
->isSALU() && !instr
->definitions
.empty()) {
451 if (!instr
->definitions
.empty()) {
452 /* all other definitions should be SCC */
453 Definition def
= instr
->definitions
[0];
454 if (def
.physReg() == m0
) {
455 ctx
.salu_wr_m0_then_gds_msg_ttrace
= 1;
456 ctx
.salu_wr_m0_then_lds
= 1;
457 ctx
.salu_wr_m0_then_moverel
= 1;
459 } else if (instr
->opcode
== aco_opcode::s_setreg_b32
|| instr
->opcode
== aco_opcode::s_setreg_imm32_b32
) {
460 SOPK_instruction
*sopk
= static_cast<SOPK_instruction
*>(instr
.get());
461 unsigned offset
= (sopk
->imm
>> 6) & 0x1f;
462 unsigned size
= ((sopk
->imm
>> 11) & 0x1f) + 1;
463 unsigned reg
= sopk
->imm
& 0x3f;
464 ctx
.setreg_then_getsetreg
= 2;
466 if (reg
== 1 && offset
>= 28 && size
> (28 - offset
))
467 ctx
.set_vskip_mode_then_vector
= 2;
469 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) {
470 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
471 bool consider_buf
= (instr
->format
== Format::MUBUF
|| instr
->format
== Format::MTBUF
) &&
472 instr
->operands
.size() == 4 &&
473 instr
->operands
[3].size() > 2 &&
474 instr
->operands
[2].physReg() >= 128;
475 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
476 bool consider_mimg
= instr
->format
== Format::MIMG
&&
477 instr
->operands
[1].regClass().type() == RegType::vgpr
&&
478 instr
->operands
[1].size() > 2 &&
479 instr
->operands
[0].size() == 4;
480 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
481 bool consider_flat
= (instr
->isFlatOrGlobal() || instr
->format
== Format::SCRATCH
) &&
482 instr
->operands
.size() == 3 &&
483 instr
->operands
[2].size() > 2;
484 if (consider_buf
|| consider_mimg
|| consider_flat
) {
485 PhysReg wrdata
= instr
->operands
[consider_flat
? 2 : 3].physReg();
486 unsigned size
= instr
->operands
[consider_flat
? 2 : 3].size();
487 for (unsigned i
= 0; i
< size
; i
++)
488 ctx
.vmem_store_then_wr_data
[(wrdata
& 0xff) + i
] = 1;
493 template <std::size_t N
>
494 bool check_written_regs(const aco_ptr
<Instruction
> &instr
, const std::bitset
<N
> &check_regs
)
496 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [&check_regs
](const Definition
&def
) -> bool {
497 bool writes_any
= false;
498 for (unsigned i
= 0; i
< def
.size(); i
++) {
499 unsigned def_reg
= def
.physReg() + i
;
500 writes_any
|= def_reg
< check_regs
.size() && check_regs
[def_reg
];
506 template <std::size_t N
>
507 void mark_read_regs(const aco_ptr
<Instruction
> &instr
, std::bitset
<N
> ®_reads
)
509 for (const Operand
&op
: instr
->operands
) {
510 for (unsigned i
= 0; i
< op
.size(); i
++) {
511 unsigned reg
= op
.physReg() + i
;
512 if (reg
< reg_reads
.size())
518 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
520 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
522 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
524 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
||
525 instr
->opcode
== aco_opcode::v_readlane_b32
||
526 instr
->opcode
== aco_opcode::v_readlane_b32_e64
)
531 bool instr_writes_exec(const aco_ptr
<Instruction
>& instr
)
533 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
534 return def
.physReg() == exec_lo
|| def
.physReg() == exec_hi
;
538 bool instr_writes_sgpr(const aco_ptr
<Instruction
>& instr
)
540 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
541 return def
.getTemp().type() == RegType::sgpr
;
545 inline bool instr_is_branch(const aco_ptr
<Instruction
>& instr
)
547 return instr
->opcode
== aco_opcode::s_branch
||
548 instr
->opcode
== aco_opcode::s_cbranch_scc0
||
549 instr
->opcode
== aco_opcode::s_cbranch_scc1
||
550 instr
->opcode
== aco_opcode::s_cbranch_vccz
||
551 instr
->opcode
== aco_opcode::s_cbranch_vccnz
||
552 instr
->opcode
== aco_opcode::s_cbranch_execz
||
553 instr
->opcode
== aco_opcode::s_cbranch_execnz
||
554 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys
||
555 instr
->opcode
== aco_opcode::s_cbranch_cdbguser
||
556 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_or_user
||
557 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_and_user
||
558 instr
->opcode
== aco_opcode::s_subvector_loop_begin
||
559 instr
->opcode
== aco_opcode::s_subvector_loop_end
||
560 instr
->opcode
== aco_opcode::s_setpc_b64
||
561 instr
->opcode
== aco_opcode::s_swappc_b64
||
562 instr
->opcode
== aco_opcode::s_getpc_b64
||
563 instr
->opcode
== aco_opcode::s_call_b64
;
566 void handle_instruction_gfx10(Program
*program
, Block
*cur_block
, NOP_ctx_gfx10
&ctx
,
567 aco_ptr
<Instruction
>& instr
, std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
569 //TODO: s_dcache_inv needs to be in it's own group on GFX10
571 /* VMEMtoScalarWriteHazard
572 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
574 if (instr
->isVMEM() || instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
||
575 instr
->format
== Format::SCRATCH
|| instr
->format
== Format::DS
) {
576 /* Remember all SGPRs that are read by the VMEM instruction */
577 mark_read_regs(instr
, ctx
.sgprs_read_by_VMEM
);
578 ctx
.sgprs_read_by_VMEM
.set(exec
);
579 if (program
->wave_size
== 64)
580 ctx
.sgprs_read_by_VMEM
.set(exec_hi
);
581 } else if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
582 /* Check if SALU writes an SGPR that was previously read by the VALU */
583 if (check_written_regs(instr
, ctx
.sgprs_read_by_VMEM
)) {
584 ctx
.sgprs_read_by_VMEM
.reset();
586 /* Insert v_nop to mitigate the problem */
587 aco_ptr
<VOP1_instruction
> nop
{create_instruction
<VOP1_instruction
>(aco_opcode::v_nop
, Format::VOP1
, 0, 0)};
588 new_instructions
.emplace_back(std::move(nop
));
590 } else if (instr
->opcode
== aco_opcode::s_waitcnt
) {
591 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
592 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
593 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
595 ctx
.sgprs_read_by_VMEM
.reset();
596 } else if (instr
->isVALU()) {
597 /* Hazard is mitigated by any VALU instruction */
598 ctx
.sgprs_read_by_VMEM
.reset();
601 /* VcmpxPermlaneHazard
602 * Handle any permlane following a VOPC instruction, insert v_mov between them.
604 if (instr
->format
== Format::VOPC
) {
606 } else if (ctx
.has_VOPC
&&
607 (instr
->opcode
== aco_opcode::v_permlane16_b32
||
608 instr
->opcode
== aco_opcode::v_permlanex16_b32
)) {
609 ctx
.has_VOPC
= false;
611 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
612 aco_ptr
<VOP1_instruction
> v_mov
{create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1)};
613 v_mov
->definitions
[0] = Definition(instr
->operands
[0].physReg(), v1
);
614 v_mov
->operands
[0] = Operand(instr
->operands
[0].physReg(), v1
);
615 new_instructions
.emplace_back(std::move(v_mov
));
616 } else if (instr
->isVALU() && instr
->opcode
!= aco_opcode::v_nop
) {
617 ctx
.has_VOPC
= false;
620 /* VcmpxExecWARHazard
621 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
623 if (!instr
->isVALU() && instr
->reads_exec()) {
624 ctx
.has_nonVALU_exec_read
= true;
625 } else if (instr
->isVALU()) {
626 if (instr_writes_exec(instr
)) {
627 ctx
.has_nonVALU_exec_read
= false;
629 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
630 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 0)};
631 depctr
->imm
= 0xfffe;
633 new_instructions
.emplace_back(std::move(depctr
));
634 } else if (instr_writes_sgpr(instr
)) {
635 /* Any VALU instruction that writes an SGPR mitigates the problem */
636 ctx
.has_nonVALU_exec_read
= false;
638 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
639 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
640 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
641 if ((sopp
->imm
& 0xfffe) == 0xfffe)
642 ctx
.has_nonVALU_exec_read
= false;
645 /* SMEMtoVectorWriteHazard
646 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
648 if (instr
->format
== Format::SMEM
) {
649 /* Remember all SGPRs that are read by the SMEM instruction */
650 mark_read_regs(instr
, ctx
.sgprs_read_by_SMEM
);
651 } else if (VALU_writes_sgpr(instr
)) {
652 /* Check if VALU writes an SGPR that was previously read by SMEM */
653 if (check_written_regs(instr
, ctx
.sgprs_read_by_SMEM
)) {
654 ctx
.sgprs_read_by_SMEM
.reset();
656 /* Insert s_mov to mitigate the problem */
657 aco_ptr
<SOP1_instruction
> s_mov
{create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1)};
658 s_mov
->definitions
[0] = Definition(sgpr_null
, s1
);
659 s_mov
->operands
[0] = Operand(0u);
660 new_instructions
.emplace_back(std::move(s_mov
));
662 } else if (instr
->isSALU()) {
663 if (instr
->format
!= Format::SOPP
) {
664 /* SALU can mitigate the hazard */
665 ctx
.sgprs_read_by_SMEM
.reset();
667 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
668 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
669 if (sopp
->opcode
== aco_opcode::s_waitcnt_lgkmcnt
) {
670 if (sopp
->imm
== 0 && sopp
->definitions
[0].physReg() == sgpr_null
)
671 ctx
.sgprs_read_by_SMEM
.reset();
672 } else if (sopp
->opcode
== aco_opcode::s_waitcnt
) {
673 unsigned lgkm
= (sopp
->imm
>> 8) & 0x3f;
675 ctx
.sgprs_read_by_SMEM
.reset();
680 /* LdsBranchVmemWARHazard
681 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
683 if (instr
->isVMEM() || instr
->format
== Format::GLOBAL
|| instr
->format
== Format::SCRATCH
) {
685 ctx
.has_branch_after_VMEM
= false;
686 /* Mitigation for DS is needed only if there was already a branch after */
687 ctx
.has_DS
= ctx
.has_branch_after_DS
;
688 } else if (instr
->format
== Format::DS
) {
690 ctx
.has_branch_after_DS
= false;
691 /* Mitigation for VMEM is needed only if there was already a branch after */
692 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
;
693 } else if (instr_is_branch(instr
)) {
694 ctx
.has_branch_after_VMEM
= ctx
.has_VMEM
;
695 ctx
.has_branch_after_DS
= ctx
.has_DS
;
696 } else if (instr
->opcode
== aco_opcode::s_waitcnt_vscnt
) {
697 /* Only s_waitcnt_vscnt can mitigate the hazard */
698 const SOPK_instruction
*sopk
= static_cast<const SOPK_instruction
*>(instr
.get());
699 if (sopk
->definitions
[0].physReg() == sgpr_null
&& sopk
->imm
== 0)
700 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
702 if ((ctx
.has_VMEM
&& ctx
.has_branch_after_DS
) || (ctx
.has_DS
&& ctx
.has_branch_after_VMEM
)) {
703 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
705 /* Insert s_waitcnt_vscnt to mitigate the problem */
706 aco_ptr
<SOPK_instruction
> wait
{create_instruction
<SOPK_instruction
>(aco_opcode::s_waitcnt_vscnt
, Format::SOPK
, 0, 1)};
707 wait
->definitions
[0] = Definition(sgpr_null
, s1
);
709 new_instructions
.emplace_back(std::move(wait
));
713 template <typename Ctx
>
714 using HandleInstr
= void (*)(Program
*, Block
*block
, Ctx
&, aco_ptr
<Instruction
>&,
715 std::vector
<aco_ptr
<Instruction
>>&);
717 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
718 void handle_block(Program
*program
, Ctx
& ctx
, Block
& block
)
720 if (block
.instructions
.empty())
723 std::vector
<aco_ptr
<Instruction
>> old_instructions
= std::move(block
.instructions
);
725 block
.instructions
.reserve(block
.instructions
.size());
727 for (aco_ptr
<Instruction
>& instr
: old_instructions
) {
728 Handle(program
, &block
, ctx
, instr
, block
.instructions
);
729 block
.instructions
.emplace_back(std::move(instr
));
733 template <typename Ctx
, HandleInstr
<Ctx
> Handle
>
734 void mitigate_hazards(Program
*program
)
736 std::vector
<Ctx
> all_ctx(program
->blocks
.size());
737 std::stack
<unsigned> loop_header_indices
;
739 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
740 Block
& block
= program
->blocks
[i
];
741 Ctx
&ctx
= all_ctx
[i
];
743 if (block
.kind
& block_kind_loop_header
) {
744 loop_header_indices
.push(i
);
745 } else if (block
.kind
& block_kind_loop_exit
) {
746 /* Go through the whole loop again */
747 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
749 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
750 loop_block_ctx
.join(all_ctx
[b
]);
752 handle_block
<Ctx
, Handle
>(program
, loop_block_ctx
, program
->blocks
[idx
]);
754 /* We only need to continue if the loop header context changed */
755 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
758 all_ctx
[idx
] = loop_block_ctx
;
761 loop_header_indices
.pop();
764 for (unsigned b
: block
.linear_preds
)
765 ctx
.join(all_ctx
[b
]);
767 handle_block
<Ctx
, Handle
>(program
, ctx
, block
);
771 } /* end namespace */
773 void insert_NOPs(Program
* program
)
775 if (program
->chip_class
>= GFX10
)
776 mitigate_hazards
<NOP_ctx_gfx10
, handle_instruction_gfx10
>(program
);
778 mitigate_hazards
<NOP_ctx_gfx6
, handle_instruction_gfx6
>(program
);