/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <algorithm>
#include <bitset>
#include <stack>
#include <vector>

#include "aco_ir.h"

namespace aco {
namespace {
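/* Hazard-tracking state for GFX8/GFX9: remembers at which instruction index
 * the most recent VALU writes to EXEC, VCC and SGPRs happened. */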
struct NOP_ctx_gfx8_9 {
   enum chip_class chip_class;
   unsigned vcc_physical;

   /* just initialize these with something less than max NOPs */
   int VALU_wrexec = -10;
   int VALU_wrvcc = -10;
   int VALU_wrsgpr = -10;

   NOP_ctx_gfx8_9(Program* program) : chip_class(program->chip_class) {
      vcc_physical = program->config->num_sgprs - 2;
   }
};
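/* Hazard-tracking state for GFX10: per-hazard flags plus the sets of SGPRs
 * last read by VMEM and SMEM instructions. join() merges the states of
 * multiple control-flow predecessors; operator== is used to detect when the
 * state of a loop header has stopped changing. */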
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false;
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10 &other)
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
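/* Returns whether any register written by the instruction is set in check_regs. */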
template <std::size_t N>
bool check_written_regs(const aco_ptr<Instruction> &instr, const std::bitset<N> &check_regs)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool {
      bool writes_any = false;
      for (unsigned i = 0; i < def.size(); i++) {
         unsigned def_reg = def.physReg() + i;
         writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
      }
      return writes_any;
   });
}
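/* Sets the bits of reg_reads corresponding to every register read by the instruction. */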
template <std::size_t N>
void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads)
{
   for (const Operand &op : instr->operands) {
      for (unsigned i = 0; i < op.size(); i++) {
         unsigned reg = op.physReg() + i;
         if (reg < reg_reads.size())
            reg_reads.set(reg);
      }
   }
}
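/* Returns whether the VALU instruction can write an SGPR: VOPC compares,
 * VOP3b carry-out definitions, and v_readfirstlane/v_readlane. */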
bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
   if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
      return true;
   if (instr->isVOP3() && instr->definitions.size() == 2)
      return true;
   if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
      return true;
   return false;
}
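/* Returns whether any operand reads exec_lo or exec_hi. */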
bool instr_reads_exec(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->operands.begin(), instr->operands.end(), [](const Operand &op) -> bool {
      return op.physReg() == exec_lo || op.physReg() == exec_hi;
   });
}
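/* Returns whether any definition writes exec_lo or exec_hi. */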
bool instr_writes_exec(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
      return def.physReg() == exec_lo || def.physReg() == exec_hi;
   });
}
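/* Returns whether any definition is an SGPR temporary. */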
bool instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
   return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
      return def.getTemp().type() == RegType::sgpr;
   });
}
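/* Returns whether the instruction changes (or can change) the program counter. */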
inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::s_branch ||
          instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz ||
          instr->opcode == aco_opcode::s_cbranch_execz ||
          instr->opcode == aco_opcode::s_cbranch_execnz ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
          instr->opcode == aco_opcode::s_cbranch_cdbguser ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
          instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
          instr->opcode == aco_opcode::s_subvector_loop_begin ||
          instr->opcode == aco_opcode::s_subvector_loop_end ||
          instr->opcode == aco_opcode::s_setpc_b64 ||
          instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_getpc_b64 ||
          instr->opcode == aco_opcode::s_call_b64;
}
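/* Returns whether the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) overlap. */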
bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
   return a_reg > b_reg ?
          (a_reg - b_reg < b_size) :
          (b_reg - a_reg < a_size);
}
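/* Decides whether the SMEM instruction may join the preceding SMEM clause;
 * returns 1 to request a NOP that breaks the clause, 0 if it can stay in it. */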
unsigned handle_SMEM_clause(aco_ptr<Instruction>& instr, int new_idx,
                            std::vector<aco_ptr<Instruction>>& new_instructions)
{
   //TODO: s_dcache_inv needs to be in its own group on GFX10 (and previous versions?)
   const bool is_store = instr->definitions.empty();
   for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) {
      aco_ptr<Instruction>& pred = new_instructions[pred_idx];
      if (pred->format != Format::SMEM)
         break;

      /* Don't allow clauses with store instructions since the clause's
       * instructions may use the same address. */
      if (is_store || pred->definitions.empty())
         return 1;

      Definition& instr_def = instr->definitions[0];
      Definition& pred_def = pred->definitions[0];

      /* ISA reference doesn't say anything about this, but best to be safe */
      if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size()))
         return 1;

      for (const Operand& op : pred->operands) {
         if (op.isConstant() || !op.isFixed())
            continue;
         if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size()))
            return 1;
      }

      for (const Operand& op : instr->operands) {
         if (op.isConstant() || !op.isFixed())
            continue;
         if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size()))
            return 1;
      }
   }

   return 0;
}
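/* Computes how many NOPs must precede the instruction on GFX8/GFX9 and
 * updates ctx with the indices of the VALU writes it tracks. */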
int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
                              std::vector<aco_ptr<Instruction>>& old_instructions,
                              std::vector<aco_ptr<Instruction>>& new_instructions)
{
   int new_idx = new_instructions.size();

   // TODO: setreg / getreg / m0 writes
   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles

   /* break off from previous SMEM clause if needed */
   if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
      return handle_SMEM_clause(instr, new_idx, new_instructions);
   } else if (instr->isVALU() || instr->format == Format::VINTRP) {
      int NOPs = 0;

      if (instr->isDPP()) {
         /* VALU does not forward EXEC to DPP. */
         if (ctx.VALU_wrexec + 5 >= new_idx)
            NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;

         /* VALU DPP reads VGPR written by VALU */
         for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
            aco_ptr<Instruction>& pred = new_instructions[pred_idx];
            if ((pred->isVALU() || pred->format == Format::VINTRP) &&
                !pred->definitions.empty() &&
                pred->definitions[0].physReg() == instr->operands[0].physReg()) {
               NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
               break;
            }
         }
      }

      /* VINTRP following an SALU write to M0 */
      if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
         aco_ptr<Instruction>& pred = new_instructions.back();
         if (pred->isSALU() &&
             !pred->definitions.empty() &&
             pred->definitions[0].physReg() == m0)
            NOPs = std::max(NOPs, 1);
      }

      for (const Operand& op : instr->operands) {
         /* VALU which uses VCCZ */
         if (op.physReg() == PhysReg{251} &&
             ctx.VALU_wrvcc + 5 >= new_idx)
            NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);

         /* VALU which uses EXECZ */
         if (op.physReg() == PhysReg{252} &&
             ctx.VALU_wrexec + 5 >= new_idx)
            NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);

         /* VALU which reads VCC as a constant */
         if (ctx.VALU_wrvcc + 1 >= new_idx) {
            for (unsigned k = 0; k < op.size(); k++) {
               unsigned reg = op.physReg() + k;
               if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
                  NOPs = std::max(NOPs, 1);
            }
         }
      }

      switch (instr->opcode) {
      case aco_opcode::v_readlane_b32:
      case aco_opcode::v_writelane_b32: {
         if (ctx.VALU_wrsgpr + 4 < new_idx)
            break;
         PhysReg reg = instr->operands[1].physReg();
         for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
            aco_ptr<Instruction>& pred = new_instructions[pred_idx];
            if (!pred->isVALU() || !VALU_writes_sgpr(pred))
               continue;
            for (const Definition& def : pred->definitions) {
               if (def.physReg() == reg)
                  NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
            }
         }
         break;
      }
      case aco_opcode::v_div_fmas_f32:
      case aco_opcode::v_div_fmas_f64: {
         if (ctx.VALU_wrvcc + 4 >= new_idx)
            NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
         break;
      }
      default:
         break;
      }

      /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
      // FIXME: handle case if the last instruction of a block without branch is such store
      // TODO: confirm that DS instructions cannot cause WAR hazards here
      if (new_idx > 0) {
         aco_ptr<Instruction>& pred = new_instructions.back();
         if (pred->isVMEM() &&
             pred->operands.size() == 4 &&
             pred->operands[3].size() > 2 &&
             pred->operands[1].size() != 8 &&
             (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
            /* Ops that use a 256-bit T# do not need a wait state.
             * BUFFER_STORE_* operations that use an SGPR for "offset"
             * do not require any wait states. */
            PhysReg wrdata = pred->operands[3].physReg();
            unsigned size = pred->operands[3].size();
            assert(wrdata >= 256);
            for (const Definition& def : instr->definitions) {
               if (regs_intersect(def.physReg(), def.size(), wrdata, size))
                  NOPs = std::max(NOPs, 1);
            }
         }
      }

      if (VALU_writes_sgpr(instr)) {
         for (const Definition& def : instr->definitions) {
            if (def.physReg() == vcc)
               ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1;
            else if (def.physReg() == exec)
               ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1;
            else if (def.physReg() <= 102)
               ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1;
         }
      }

      return NOPs;
   } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
      /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
      for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
         aco_ptr<Instruction>& pred = new_instructions[pred_idx];
         if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
            continue;

         for (const Definition& def : pred->definitions) {
            if (def.physReg() > 102)
               continue;

            if (instr->operands.size() > 1 &&
                regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
                               def.physReg(), def.size())) {
               return 5 + pred_idx - new_idx + 1;
            }

            if (instr->operands.size() > 2 &&
                regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
                               def.physReg(), def.size())) {
               return 5 + pred_idx - new_idx + 1;
            }
         }
      }
   }

   return 0;
}
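/* Rebuilds the block's instruction list, inserting an s_nop wherever
 * handle_instruction_gfx8_9() requests wait states. */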
void handle_block_gfx8_9(NOP_ctx_gfx8_9& ctx, Block& block)
{
   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block.instructions.size());
   for (unsigned i = 0; i < block.instructions.size(); i++) {
      aco_ptr<Instruction>& instr = block.instructions[i];
      unsigned NOPs = handle_instruction_gfx8_9(ctx, instr, block.instructions, instructions);
      if (NOPs) {
         // TODO: try to move the instruction down
         /* create NOP */
         aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
         nop->imm = NOPs - 1;
         nop->block = -1;
         instructions.emplace_back(std::move(nop));
      }

      instructions.emplace_back(std::move(instr));
   }

   ctx.VALU_wrvcc -= instructions.size();
   ctx.VALU_wrexec -= instructions.size();
   ctx.VALU_wrsgpr -= instructions.size();
   block.instructions = std::move(instructions);
}
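/* Entry point for GFX8/GFX9: walks all blocks with a single shared context,
 * whose indices are rebased at the end of each block. */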
void insert_NOPs_gfx8_9(Program* program)
{
   NOP_ctx_gfx8_9 ctx(program);

   for (Block& block : program->blocks) {
      if (block.instructions.empty())
         continue;

      handle_block_gfx8_9(ctx, block);
   }
}
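/* Checks the instruction against the known GFX10 hazards
 * (VMEMtoScalarWriteHazard, VcmpxPermlaneHazard, VcmpxExecWARHazard,
 * SMEMtoVectorWriteHazard, LdsBranchVmemWARHazard) and appends mitigating
 * instructions to new_instructions where needed. */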
void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
                              std::vector<aco_ptr<Instruction>>& old_instructions,
                              std::vector<aco_ptr<Instruction>>& new_instructions)
{
   /* VMEMtoScalarWriteHazard
    * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
    */
   if (instr->isVMEM() || instr->format == Format::FLAT || instr->format == Format::GLOBAL ||
       instr->format == Format::SCRATCH || instr->format == Format::DS) {
      /* Remember all SGPRs that are read by the VMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
   } else if (instr->isSALU() || instr->format == Format::SMEM) {
      /* Check if SALU writes an SGPR that was previously read by a VMEM instruction */
      if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
         ctx.sgprs_read_by_VMEM.reset();

         /* Insert v_nop to mitigate the problem */
         aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)};
         new_instructions.emplace_back(std::move(nop));
      }
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
      uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
      unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
      if (vmcnt == 0)
         ctx.sgprs_read_by_VMEM.reset();
   } else if (instr->isVALU()) {
      /* Hazard is mitigated by any VALU instruction */
      ctx.sgprs_read_by_VMEM.reset();
   }

   /* VcmpxPermlaneHazard
    * Handle any permlane following a VOPC instruction, insert v_mov between them.
    */
   if (instr->format == Format::VOPC) {
      ctx.has_VOPC = true;
   } else if (ctx.has_VOPC &&
              (instr->opcode == aco_opcode::v_permlane16_b32 ||
               instr->opcode == aco_opcode::v_permlanex16_b32)) {
      ctx.has_VOPC = false;

      /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
      aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
      v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
      v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
      new_instructions.emplace_back(std::move(v_mov));
   } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
      ctx.has_VOPC = false;
   }

   /* VcmpxExecWARHazard
    * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
    */
   if (!instr->isVALU() && instr_reads_exec(instr)) {
      ctx.has_nonVALU_exec_read = true;
   } else if (instr->isVALU()) {
      if (instr_writes_exec(instr)) {
         ctx.has_nonVALU_exec_read = false;

         /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
         aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 1)};
         depctr->imm = 0xfffe;
         depctr->definitions[0] = Definition(sgpr_null, s1);
         new_instructions.emplace_back(std::move(depctr));
      } else if (instr_writes_sgpr(instr)) {
         /* Any VALU instruction that writes an SGPR mitigates the problem */
         ctx.has_nonVALU_exec_read = false;
      }
   } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
      /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
      const SOPP_instruction *sopp = static_cast<const SOPP_instruction *>(instr.get());
      if ((sopp->imm & 0xfffe) == 0xfffe)
         ctx.has_nonVALU_exec_read = false;
   }

   /* SMEMtoVectorWriteHazard
    * Handle any VALU instruction writing an SGPR after an SMEM reads it.
    */
   if (instr->format == Format::SMEM) {
      /* Remember all SGPRs that are read by the SMEM instruction */
      mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
   } else if (VALU_writes_sgpr(instr)) {
      /* Check if VALU writes an SGPR that was previously read by SMEM */
      if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
         ctx.sgprs_read_by_SMEM.reset();

         /* Insert s_mov to mitigate the problem */
         aco_ptr<SOP1_instruction> s_mov{create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
         s_mov->definitions[0] = Definition(sgpr_null, s1);
         s_mov->operands[0] = Operand(0u);
         new_instructions.emplace_back(std::move(s_mov));
      }
   } else if (instr->isSALU()) {
      if (instr->format != Format::SOPP) {
         /* SALU can mitigate the hazard */
         ctx.sgprs_read_by_SMEM.reset();
      } else {
         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
         const SOPP_instruction *sopp = static_cast<const SOPP_instruction *>(instr.get());
         if (sopp->opcode == aco_opcode::s_waitcnt_lgkmcnt) {
            if (sopp->imm == 0 && sopp->definitions[0].physReg() == sgpr_null)
               ctx.sgprs_read_by_SMEM.reset();
         } else if (sopp->opcode == aco_opcode::s_waitcnt) {
            unsigned lgkm = (sopp->imm >> 8) & 0x3f;
            if (lgkm == 0)
               ctx.sgprs_read_by_SMEM.reset();
         }
      }
   }

   /* LdsBranchVmemWARHazard
    * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
    */
   if (instr->isVMEM() || instr->format == Format::GLOBAL || instr->format == Format::SCRATCH) {
      ctx.has_VMEM = true;
      ctx.has_branch_after_VMEM = false;
      /* Mitigation for DS is needed only if there was already a branch after */
      ctx.has_DS = ctx.has_branch_after_DS;
   } else if (instr->format == Format::DS) {
      ctx.has_DS = true;
      ctx.has_branch_after_DS = false;
      /* Mitigation for VMEM is needed only if there was already a branch after */
      ctx.has_VMEM = ctx.has_branch_after_VMEM;
   } else if (instr_is_branch(instr)) {
      ctx.has_branch_after_VMEM = ctx.has_VMEM;
      ctx.has_branch_after_DS = ctx.has_DS;
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      /* Only s_waitcnt_vscnt can mitigate the hazard */
      const SOPK_instruction *sopk = static_cast<const SOPK_instruction *>(instr.get());
      if (sopk->definitions[0].physReg() == sgpr_null && sopk->imm == 0)
         ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
   }
   if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) {
      ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;

      /* Insert s_waitcnt_vscnt to mitigate the problem */
      aco_ptr<SOPK_instruction> wait{create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
      wait->definitions[0] = Definition(sgpr_null, s1);
      wait->imm = 0;
      new_instructions.emplace_back(std::move(wait));
   }
}
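/* Applies handle_instruction_gfx10() to every instruction of the block. */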
void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   std::vector<aco_ptr<Instruction>> instructions;
   instructions.reserve(block.instructions.size());

   for (aco_ptr<Instruction>& instr : block.instructions) {
      handle_instruction_gfx10(ctx, instr, block.instructions, instructions);
      instructions.emplace_back(std::move(instr));
   }

   block.instructions = std::move(instructions);
}
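/* Processes the blocks in program order, joining the contexts of all linear
 * predecessors. At each loop exit the loop body is walked again so that
 * hazards carried in through back-edges are also covered. */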
void mitigate_hazards_gfx10(Program *program)
{
   NOP_ctx_gfx10 all_ctx[program->blocks.size()];
   std::stack<unsigned> loop_header_indices;

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      NOP_ctx_gfx10 &ctx = all_ctx[i];

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            NOP_ctx_gfx10 loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block_gfx10(loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block_gfx10(ctx, block);
   }
}
} /* end namespace */
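/* Main entry point of the pass: dispatches to the hazard mitigation that
 * matches the target generation. */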
void insert_NOPs(Program* program)
{
   if (program->chip_class >= GFX10)
      mitigate_hazards_gfx10(program);
   else
      insert_NOPs_gfx8_9(program);
}

}