 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
33 struct NOP_ctx_gfx8_9
{
34 enum chip_class chip_class
;
35 unsigned vcc_physical
;
37 /* just initialize these with something less than max NOPs */
38 int VALU_wrexec
= -10;
40 int VALU_wrsgpr
= -10;
42 NOP_ctx_gfx8_9(Program
* program
) : chip_class(program
->chip_class
) {
43 vcc_physical
= program
->config
->num_sgprs
- 2;
/* Tracking state for the GFX10 hazard-mitigation pass.
 * One instance exists per block; states from linear predecessors are merged
 * with join() and compared with operator== for loop fixed-point detection. */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;               /* VcmpxPermlaneHazard: VOPC seen without mitigation */
   bool has_nonVALU_exec_read = false;  /* VcmpxExecWARHazard: non-VALU read of EXEC pending */
   bool has_VMEM = false;               /* LdsBranchVmemWARHazard: unmitigated VMEM */
   bool has_branch_after_VMEM = false;  /* branch seen after the VMEM above */
   bool has_DS = false;                 /* LdsBranchVmemWARHazard: unmitigated DS */
   bool has_branch_after_DS = false;    /* branch seen after the DS above */

   /* SGPRs read by VMEM/SMEM since the last mitigating instruction;
    * a later scalar write to any of these triggers a hazard fix. */
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merge the state of a predecessor block into this one (union of hazards). */
   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   /* Field-wise equality, used to detect when a loop's context stops changing. */
   bool operator==(const NOP_ctx_gfx10 &other) const
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
82 template <std::size_t N
>
83 bool check_written_regs(const aco_ptr
<Instruction
> &instr
, const std::bitset
<N
> &check_regs
)
85 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [&check_regs
](const Definition
&def
) -> bool {
86 bool writes_any
= false;
87 for (unsigned i
= 0; i
< def
.size(); i
++) {
88 unsigned def_reg
= def
.physReg() + i
;
89 writes_any
|= def_reg
< check_regs
.size() && check_regs
[def_reg
];
95 template <std::size_t N
>
96 void mark_read_regs(const aco_ptr
<Instruction
> &instr
, std::bitset
<N
> ®_reads
)
98 for (const Operand
&op
: instr
->operands
) {
99 for (unsigned i
= 0; i
< op
.size(); i
++) {
100 unsigned reg
= op
.physReg() + i
;
101 if (reg
< reg_reads
.size())
107 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
109 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
111 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
113 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
||
114 instr
->opcode
== aco_opcode::v_readlane_b32
||
115 instr
->opcode
== aco_opcode::v_readlane_b32_e64
)
120 bool instr_writes_exec(const aco_ptr
<Instruction
>& instr
)
122 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
123 return def
.physReg() == exec_lo
|| def
.physReg() == exec_hi
;
127 bool instr_writes_sgpr(const aco_ptr
<Instruction
>& instr
)
129 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
130 return def
.getTemp().type() == RegType::sgpr
;
134 inline bool instr_is_branch(const aco_ptr
<Instruction
>& instr
)
136 return instr
->opcode
== aco_opcode::s_branch
||
137 instr
->opcode
== aco_opcode::s_cbranch_scc0
||
138 instr
->opcode
== aco_opcode::s_cbranch_scc1
||
139 instr
->opcode
== aco_opcode::s_cbranch_vccz
||
140 instr
->opcode
== aco_opcode::s_cbranch_vccnz
||
141 instr
->opcode
== aco_opcode::s_cbranch_execz
||
142 instr
->opcode
== aco_opcode::s_cbranch_execnz
||
143 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys
||
144 instr
->opcode
== aco_opcode::s_cbranch_cdbguser
||
145 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_or_user
||
146 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_and_user
||
147 instr
->opcode
== aco_opcode::s_subvector_loop_begin
||
148 instr
->opcode
== aco_opcode::s_subvector_loop_end
||
149 instr
->opcode
== aco_opcode::s_setpc_b64
||
150 instr
->opcode
== aco_opcode::s_swappc_b64
||
151 instr
->opcode
== aco_opcode::s_getpc_b64
||
152 instr
->opcode
== aco_opcode::s_call_b64
;
155 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
157 return a_reg
> b_reg
?
158 (a_reg
- b_reg
< b_size
) :
159 (b_reg
- a_reg
< a_size
);
162 unsigned handle_SMEM_clause(aco_ptr
<Instruction
>& instr
, int new_idx
,
163 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
165 //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?)
166 const bool is_store
= instr
->definitions
.empty();
167 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0; pred_idx
--) {
168 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
169 if (pred
->format
!= Format::SMEM
)
172 /* Don't allow clauses with store instructions since the clause's
173 * instructions may use the same address. */
174 if (is_store
|| pred
->definitions
.empty())
177 Definition
& instr_def
= instr
->definitions
[0];
178 Definition
& pred_def
= pred
->definitions
[0];
180 /* ISA reference doesn't say anything about this, but best to be safe */
181 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), pred_def
.physReg(), pred_def
.size()))
184 for (const Operand
& op
: pred
->operands
) {
185 if (op
.isConstant() || !op
.isFixed())
187 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), op
.physReg(), op
.size()))
190 for (const Operand
& op
: instr
->operands
) {
191 if (op
.isConstant() || !op
.isFixed())
193 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
201 int handle_instruction_gfx8_9(NOP_ctx_gfx8_9
& ctx
, aco_ptr
<Instruction
>& instr
,
202 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
203 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
205 int new_idx
= new_instructions
.size();
207 // TODO: setreg / getreg / m0 writes
208 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
211 if (instr
->format
== Format::SMEM
) {
212 if (ctx
.chip_class
== GFX6
) {
213 bool is_buffer_load
= instr
->operands
.size() && instr
->operands
[0].size() > 2;
214 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 4; pred_idx
--) {
215 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
216 /* A read of an SGPR by SMRD instruction requires 4 wait states
217 * when the SGPR was written by a VALU instruction. */
218 if (VALU_writes_sgpr(pred
)) {
219 Definition pred_def
= pred
->definitions
[pred
->definitions
.size() - 1];
220 for (const Operand
& op
: instr
->operands
) {
221 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
222 return 4 + pred_idx
- new_idx
+ 1;
225 /* According to LLVM, this is an undocumented hardware behavior */
226 if (is_buffer_load
&& pred
->isSALU() && pred
->definitions
.size()) {
227 Definition pred_def
= pred
->definitions
[0];
228 Operand
& op
= instr
->operands
[0];
229 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
230 return 4 + pred_idx
- new_idx
+ 1;
235 /* break off from prevous SMEM clause if needed */
236 return handle_SMEM_clause(instr
, new_idx
, new_instructions
);
238 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
241 if (instr
->isDPP()) {
242 /* VALU does not forward EXEC to DPP. */
243 if (ctx
.VALU_wrexec
+ 5 >= new_idx
)
244 NOPs
= 5 + ctx
.VALU_wrexec
- new_idx
+ 1;
246 /* VALU DPP reads VGPR written by VALU */
247 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 2; pred_idx
--) {
248 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
249 if ((pred
->isVALU() || pred
->format
== Format::VINTRP
) &&
250 !pred
->definitions
.empty() &&
251 pred
->definitions
[0].physReg() == instr
->operands
[0].physReg()) {
252 NOPs
= std::max(NOPs
, 2 + pred_idx
- new_idx
+ 1);
259 if (instr
->format
== Format::VINTRP
&& new_idx
> 0 && ctx
.chip_class
>= GFX9
) {
260 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
261 if (pred
->isSALU() &&
262 !pred
->definitions
.empty() &&
263 pred
->definitions
[0].physReg() == m0
)
264 NOPs
= std::max(NOPs
, 1);
267 for (const Operand
& op
: instr
->operands
) {
268 /* VALU which uses VCCZ */
269 if (op
.physReg() == PhysReg
{251} &&
270 ctx
.VALU_wrvcc
+ 5 >= new_idx
)
271 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrvcc
- new_idx
+ 1);
273 /* VALU which uses EXECZ */
274 if (op
.physReg() == PhysReg
{252} &&
275 ctx
.VALU_wrexec
+ 5 >= new_idx
)
276 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrexec
- new_idx
+ 1);
278 /* VALU which reads VCC as a constant */
279 if (ctx
.VALU_wrvcc
+ 1 >= new_idx
) {
280 for (unsigned k
= 0; k
< op
.size(); k
++) {
281 unsigned reg
= op
.physReg() + k
;
282 if (reg
== ctx
.vcc_physical
|| reg
== ctx
.vcc_physical
+ 1)
283 NOPs
= std::max(NOPs
, 1);
288 switch (instr
->opcode
) {
289 case aco_opcode::v_readlane_b32
:
290 case aco_opcode::v_readlane_b32_e64
:
291 case aco_opcode::v_writelane_b32
:
292 case aco_opcode::v_writelane_b32_e64
: {
293 if (ctx
.VALU_wrsgpr
+ 4 < new_idx
)
295 PhysReg reg
= instr
->operands
[1].physReg();
296 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 4; pred_idx
--) {
297 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
298 if (!pred
->isVALU() || !VALU_writes_sgpr(pred
))
300 for (const Definition
& def
: pred
->definitions
) {
301 if (def
.physReg() == reg
)
302 NOPs
= std::max(NOPs
, 4 + pred_idx
- new_idx
+ 1);
307 case aco_opcode::v_div_fmas_f32
:
308 case aco_opcode::v_div_fmas_f64
: {
309 if (ctx
.VALU_wrvcc
+ 4 >= new_idx
)
310 NOPs
= std::max(NOPs
, 4 + ctx
.VALU_wrvcc
- new_idx
+ 1);
317 /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
318 // FIXME: handle case if the last instruction of a block without branch is such store
320 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
321 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
322 bool consider_buf
= (pred
->format
== Format::MUBUF
|| pred
->format
== Format::MTBUF
) &&
323 pred
->operands
.size() == 4 &&
324 pred
->operands
[3].size() > 2 &&
325 pred
->operands
[2].physReg() >= 128;
326 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
327 bool consider_mimg
= pred
->format
== Format::MIMG
&&
328 pred
->operands
[1].regClass().type() == RegType::vgpr
&&
329 pred
->operands
[1].size() > 2 &&
330 pred
->operands
[0].size() == 4;
331 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
332 bool consider_flat
= (pred
->isFlatOrGlobal() || pred
->format
== Format::SCRATCH
) &&
333 pred
->operands
.size() == 3 &&
334 pred
->operands
[2].size() > 2;
335 if (consider_buf
|| consider_mimg
|| consider_flat
) {
336 PhysReg wrdata
= pred
->operands
[consider_flat
? 2 : 3].physReg();
337 unsigned size
= pred
->operands
[consider_flat
? 2 : 3].size();
338 assert(wrdata
>= 256);
339 for (const Definition
& def
: instr
->definitions
) {
340 if (regs_intersect(def
.physReg(), def
.size(), wrdata
, size
))
341 NOPs
= std::max(NOPs
, 1);
346 if (VALU_writes_sgpr(instr
)) {
347 for (const Definition
& def
: instr
->definitions
) {
348 if (def
.physReg() == vcc
)
349 ctx
.VALU_wrvcc
= NOPs
? new_idx
: new_idx
+ 1;
350 else if (def
.physReg() == exec
)
351 ctx
.VALU_wrexec
= NOPs
? new_idx
: new_idx
+ 1;
352 else if (def
.physReg() <= 102)
353 ctx
.VALU_wrsgpr
= NOPs
? new_idx
: new_idx
+ 1;
357 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
358 * is followed by a read with v_readfirstlane or v_readlane to fix GPU
359 * hangs on GFX6. Note that v_writelane_* is apparently not affected.
360 * This hazard isn't documented anywhere but AMD confirmed that hazard.
362 if (ctx
.chip_class
== GFX6
&&
363 !new_instructions
.empty() &&
364 (instr
->opcode
== aco_opcode::v_readfirstlane_b32
||
365 instr
->opcode
== aco_opcode::v_readlane_b32
)) {
366 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
367 if (pred
->format
== Format::VINTRP
) {
368 Definition pred_def
= pred
->definitions
[0];
369 Operand
& op
= instr
->operands
[0];
370 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
371 NOPs
= std::max(NOPs
, 1);
375 } else if (instr
->isVMEM() && ctx
.VALU_wrsgpr
+ 5 >= new_idx
) {
376 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
377 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 5; pred_idx
--) {
378 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
379 // TODO: break if something else writes the SGPR
380 if (!(pred
->isVALU() && VALU_writes_sgpr(pred
)))
383 for (const Definition
& def
: pred
->definitions
) {
384 if (def
.physReg() > 102)
387 for (const Operand
& op
: instr
->operands
) {
388 if (regs_intersect(op
.physReg(), op
.size(), def
.physReg(), def
.size()))
389 return 5 + pred_idx
- new_idx
+ 1;
394 } else if (instr
->format
== Format::SOPP
) {
395 if (instr
->opcode
== aco_opcode::s_sendmsg
&& new_idx
> 0) {
396 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
397 if (pred
->isSALU() &&
398 !pred
->definitions
.empty() &&
399 pred
->definitions
[0].physReg() == m0
)
407 void handle_block_gfx8_9(NOP_ctx_gfx8_9
& ctx
, Block
& block
)
409 std::vector
<aco_ptr
<Instruction
>> instructions
;
410 instructions
.reserve(block
.instructions
.size());
411 for (unsigned i
= 0; i
< block
.instructions
.size(); i
++) {
412 aco_ptr
<Instruction
>& instr
= block
.instructions
[i
];
413 unsigned NOPs
= handle_instruction_gfx8_9(ctx
, instr
, block
.instructions
, instructions
);
415 // TODO: try to move the instruction down
417 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
420 instructions
.emplace_back(std::move(nop
));
423 instructions
.emplace_back(std::move(instr
));
426 ctx
.VALU_wrvcc
-= instructions
.size();
427 ctx
.VALU_wrexec
-= instructions
.size();
428 ctx
.VALU_wrsgpr
-= instructions
.size();
429 block
.instructions
= std::move(instructions
);
432 void insert_NOPs_gfx8_9(Program
* program
)
434 NOP_ctx_gfx8_9
ctx(program
);
436 for (Block
& block
: program
->blocks
) {
437 if (block
.instructions
.empty())
440 handle_block_gfx8_9(ctx
, block
);
444 void handle_instruction_gfx10(Program
*program
, NOP_ctx_gfx10
&ctx
, aco_ptr
<Instruction
>& instr
,
445 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
446 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
448 /* VMEMtoScalarWriteHazard
449 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
451 if (instr
->isVMEM() || instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
||
452 instr
->format
== Format::SCRATCH
|| instr
->format
== Format::DS
) {
453 /* Remember all SGPRs that are read by the VMEM instruction */
454 mark_read_regs(instr
, ctx
.sgprs_read_by_VMEM
);
455 ctx
.sgprs_read_by_VMEM
.set(exec
);
456 if (program
->wave_size
== 64)
457 ctx
.sgprs_read_by_VMEM
.set(exec_hi
);
458 } else if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
459 /* Check if SALU writes an SGPR that was previously read by the VALU */
460 if (check_written_regs(instr
, ctx
.sgprs_read_by_VMEM
)) {
461 ctx
.sgprs_read_by_VMEM
.reset();
463 /* Insert v_nop to mitigate the problem */
464 aco_ptr
<VOP1_instruction
> nop
{create_instruction
<VOP1_instruction
>(aco_opcode::v_nop
, Format::VOP1
, 0, 0)};
465 new_instructions
.emplace_back(std::move(nop
));
467 } else if (instr
->opcode
== aco_opcode::s_waitcnt
) {
468 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
469 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
470 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
472 ctx
.sgprs_read_by_VMEM
.reset();
473 } else if (instr
->isVALU()) {
474 /* Hazard is mitigated by any VALU instruction */
475 ctx
.sgprs_read_by_VMEM
.reset();
478 /* VcmpxPermlaneHazard
479 * Handle any permlane following a VOPC instruction, insert v_mov between them.
481 if (instr
->format
== Format::VOPC
) {
483 } else if (ctx
.has_VOPC
&&
484 (instr
->opcode
== aco_opcode::v_permlane16_b32
||
485 instr
->opcode
== aco_opcode::v_permlanex16_b32
)) {
486 ctx
.has_VOPC
= false;
488 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
489 aco_ptr
<VOP1_instruction
> v_mov
{create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1)};
490 v_mov
->definitions
[0] = Definition(instr
->operands
[0].physReg(), v1
);
491 v_mov
->operands
[0] = Operand(instr
->operands
[0].physReg(), v1
);
492 new_instructions
.emplace_back(std::move(v_mov
));
493 } else if (instr
->isVALU() && instr
->opcode
!= aco_opcode::v_nop
) {
494 ctx
.has_VOPC
= false;
497 /* VcmpxExecWARHazard
498 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
500 if (!instr
->isVALU() && instr
->reads_exec()) {
501 ctx
.has_nonVALU_exec_read
= true;
502 } else if (instr
->isVALU()) {
503 if (instr_writes_exec(instr
)) {
504 ctx
.has_nonVALU_exec_read
= false;
506 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
507 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 0)};
508 depctr
->imm
= 0xfffe;
510 new_instructions
.emplace_back(std::move(depctr
));
511 } else if (instr_writes_sgpr(instr
)) {
512 /* Any VALU instruction that writes an SGPR mitigates the problem */
513 ctx
.has_nonVALU_exec_read
= false;
515 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
516 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
517 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
518 if ((sopp
->imm
& 0xfffe) == 0xfffe)
519 ctx
.has_nonVALU_exec_read
= false;
522 /* SMEMtoVectorWriteHazard
523 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
525 if (instr
->format
== Format::SMEM
) {
526 /* Remember all SGPRs that are read by the SMEM instruction */
527 mark_read_regs(instr
, ctx
.sgprs_read_by_SMEM
);
528 } else if (VALU_writes_sgpr(instr
)) {
529 /* Check if VALU writes an SGPR that was previously read by SMEM */
530 if (check_written_regs(instr
, ctx
.sgprs_read_by_SMEM
)) {
531 ctx
.sgprs_read_by_SMEM
.reset();
533 /* Insert s_mov to mitigate the problem */
534 aco_ptr
<SOP1_instruction
> s_mov
{create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1)};
535 s_mov
->definitions
[0] = Definition(sgpr_null
, s1
);
536 s_mov
->operands
[0] = Operand(0u);
537 new_instructions
.emplace_back(std::move(s_mov
));
539 } else if (instr
->isSALU()) {
540 if (instr
->format
!= Format::SOPP
) {
541 /* SALU can mitigate the hazard */
542 ctx
.sgprs_read_by_SMEM
.reset();
544 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
545 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
546 if (sopp
->opcode
== aco_opcode::s_waitcnt_lgkmcnt
) {
547 if (sopp
->imm
== 0 && sopp
->definitions
[0].physReg() == sgpr_null
)
548 ctx
.sgprs_read_by_SMEM
.reset();
549 } else if (sopp
->opcode
== aco_opcode::s_waitcnt
) {
550 unsigned lgkm
= (sopp
->imm
>> 8) & 0x3f;
552 ctx
.sgprs_read_by_SMEM
.reset();
557 /* LdsBranchVmemWARHazard
558 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
560 if (instr
->isVMEM() || instr
->format
== Format::GLOBAL
|| instr
->format
== Format::SCRATCH
) {
562 ctx
.has_branch_after_VMEM
= false;
563 /* Mitigation for DS is needed only if there was already a branch after */
564 ctx
.has_DS
= ctx
.has_branch_after_DS
;
565 } else if (instr
->format
== Format::DS
) {
567 ctx
.has_branch_after_DS
= false;
568 /* Mitigation for VMEM is needed only if there was already a branch after */
569 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
;
570 } else if (instr_is_branch(instr
)) {
571 ctx
.has_branch_after_VMEM
= ctx
.has_VMEM
;
572 ctx
.has_branch_after_DS
= ctx
.has_DS
;
573 } else if (instr
->opcode
== aco_opcode::s_waitcnt_vscnt
) {
574 /* Only s_waitcnt_vscnt can mitigate the hazard */
575 const SOPK_instruction
*sopk
= static_cast<const SOPK_instruction
*>(instr
.get());
576 if (sopk
->definitions
[0].physReg() == sgpr_null
&& sopk
->imm
== 0)
577 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
579 if ((ctx
.has_VMEM
&& ctx
.has_branch_after_DS
) || (ctx
.has_DS
&& ctx
.has_branch_after_VMEM
)) {
580 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
582 /* Insert s_waitcnt_vscnt to mitigate the problem */
583 aco_ptr
<SOPK_instruction
> wait
{create_instruction
<SOPK_instruction
>(aco_opcode::s_waitcnt_vscnt
, Format::SOPK
, 0, 1)};
584 wait
->definitions
[0] = Definition(sgpr_null
, s1
);
586 new_instructions
.emplace_back(std::move(wait
));
590 void handle_block_gfx10(Program
*program
, NOP_ctx_gfx10
& ctx
, Block
& block
)
592 if (block
.instructions
.empty())
595 std::vector
<aco_ptr
<Instruction
>> instructions
;
596 instructions
.reserve(block
.instructions
.size());
598 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
599 handle_instruction_gfx10(program
, ctx
, instr
, block
.instructions
, instructions
);
600 instructions
.emplace_back(std::move(instr
));
603 block
.instructions
= std::move(instructions
);
606 void mitigate_hazards_gfx10(Program
*program
)
608 NOP_ctx_gfx10 all_ctx
[program
->blocks
.size()];
609 std::stack
<unsigned> loop_header_indices
;
611 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
612 Block
& block
= program
->blocks
[i
];
613 NOP_ctx_gfx10
&ctx
= all_ctx
[i
];
615 if (block
.kind
& block_kind_loop_header
) {
616 loop_header_indices
.push(i
);
617 } else if (block
.kind
& block_kind_loop_exit
) {
618 /* Go through the whole loop again */
619 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
620 NOP_ctx_gfx10 loop_block_ctx
;
621 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
622 loop_block_ctx
.join(all_ctx
[b
]);
624 handle_block_gfx10(program
, loop_block_ctx
, program
->blocks
[idx
]);
626 /* We only need to continue if the loop header context changed */
627 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
630 all_ctx
[idx
] = loop_block_ctx
;
633 loop_header_indices
.pop();
636 for (unsigned b
: block
.linear_preds
)
637 ctx
.join(all_ctx
[b
]);
639 handle_block_gfx10(program
, ctx
, block
);
643 } /* end namespace */
645 void insert_NOPs(Program
* program
)
647 if (program
->chip_class
>= GFX10
)
648 mitigate_hazards_gfx10(program
);
650 insert_NOPs_gfx8_9(program
);