/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
33 struct NOP_ctx_gfx8_9
{
34 enum chip_class chip_class
;
35 unsigned vcc_physical
;
37 /* just initialize these with something less than max NOPs */
38 int VALU_wrexec
= -10;
40 int VALU_wrsgpr
= -10;
42 NOP_ctx_gfx8_9(Program
* program
) : chip_class(program
->chip_class
) {
43 vcc_physical
= program
->config
->num_sgprs
- 2;
/* Hazard-tracking state for GFX10; join()/operator== support the fixed-point
 * iteration over loop bodies in mitigate_hazards_gfx10(). */
struct NOP_ctx_gfx10 {
   bool has_VOPC = false;
   bool has_nonVALU_exec_read = false;
   bool has_VMEM = false;
   bool has_branch_after_VMEM = false;
   bool has_DS = false; /* restored: referenced in join()/operator== below */
   bool has_branch_after_DS = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_SMEM;

   /* Merge the state of a linear predecessor block into this one. */
   void join(const NOP_ctx_gfx10 &other) {
      has_VOPC |= other.has_VOPC;
      has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
      has_VMEM |= other.has_VMEM;
      has_branch_after_VMEM |= other.has_branch_after_VMEM;
      has_DS |= other.has_DS;
      has_branch_after_DS |= other.has_branch_after_DS;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
   }

   bool operator==(const NOP_ctx_gfx10 &other)
   {
      return
         has_VOPC == other.has_VOPC &&
         has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
         has_VMEM == other.has_VMEM &&
         has_branch_after_VMEM == other.has_branch_after_VMEM &&
         has_DS == other.has_DS &&
         has_branch_after_DS == other.has_branch_after_DS &&
         sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
         sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
   }
};
82 template <std::size_t N
>
83 bool check_written_regs(const aco_ptr
<Instruction
> &instr
, const std::bitset
<N
> &check_regs
)
85 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [&check_regs
](const Definition
&def
) -> bool {
86 bool writes_any
= false;
87 for (unsigned i
= 0; i
< def
.size(); i
++) {
88 unsigned def_reg
= def
.physReg() + i
;
89 writes_any
|= def_reg
< check_regs
.size() && check_regs
[def_reg
];
95 template <std::size_t N
>
96 void mark_read_regs(const aco_ptr
<Instruction
> &instr
, std::bitset
<N
> ®_reads
)
98 for (const Operand
&op
: instr
->operands
) {
99 for (unsigned i
= 0; i
< op
.size(); i
++) {
100 unsigned reg
= op
.physReg() + i
;
101 if (reg
< reg_reads
.size())
107 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
109 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
111 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
113 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
|| instr
->opcode
== aco_opcode::v_readlane_b32
)
118 bool instr_writes_exec(const aco_ptr
<Instruction
>& instr
)
120 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
121 return def
.physReg() == exec_lo
|| def
.physReg() == exec_hi
;
125 bool instr_writes_sgpr(const aco_ptr
<Instruction
>& instr
)
127 return std::any_of(instr
->definitions
.begin(), instr
->definitions
.end(), [](const Definition
&def
) -> bool {
128 return def
.getTemp().type() == RegType::sgpr
;
132 inline bool instr_is_branch(const aco_ptr
<Instruction
>& instr
)
134 return instr
->opcode
== aco_opcode::s_branch
||
135 instr
->opcode
== aco_opcode::s_cbranch_scc0
||
136 instr
->opcode
== aco_opcode::s_cbranch_scc1
||
137 instr
->opcode
== aco_opcode::s_cbranch_vccz
||
138 instr
->opcode
== aco_opcode::s_cbranch_vccnz
||
139 instr
->opcode
== aco_opcode::s_cbranch_execz
||
140 instr
->opcode
== aco_opcode::s_cbranch_execnz
||
141 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys
||
142 instr
->opcode
== aco_opcode::s_cbranch_cdbguser
||
143 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_or_user
||
144 instr
->opcode
== aco_opcode::s_cbranch_cdbgsys_and_user
||
145 instr
->opcode
== aco_opcode::s_subvector_loop_begin
||
146 instr
->opcode
== aco_opcode::s_subvector_loop_end
||
147 instr
->opcode
== aco_opcode::s_setpc_b64
||
148 instr
->opcode
== aco_opcode::s_swappc_b64
||
149 instr
->opcode
== aco_opcode::s_getpc_b64
||
150 instr
->opcode
== aco_opcode::s_call_b64
;
153 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
155 return a_reg
> b_reg
?
156 (a_reg
- b_reg
< b_size
) :
157 (b_reg
- a_reg
< a_size
);
160 unsigned handle_SMEM_clause(aco_ptr
<Instruction
>& instr
, int new_idx
,
161 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
163 //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?)
164 const bool is_store
= instr
->definitions
.empty();
165 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0; pred_idx
--) {
166 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
167 if (pred
->format
!= Format::SMEM
)
170 /* Don't allow clauses with store instructions since the clause's
171 * instructions may use the same address. */
172 if (is_store
|| pred
->definitions
.empty())
175 Definition
& instr_def
= instr
->definitions
[0];
176 Definition
& pred_def
= pred
->definitions
[0];
178 /* ISA reference doesn't say anything about this, but best to be safe */
179 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), pred_def
.physReg(), pred_def
.size()))
182 for (const Operand
& op
: pred
->operands
) {
183 if (op
.isConstant() || !op
.isFixed())
185 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), op
.physReg(), op
.size()))
188 for (const Operand
& op
: instr
->operands
) {
189 if (op
.isConstant() || !op
.isFixed())
191 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
199 int handle_instruction_gfx8_9(NOP_ctx_gfx8_9
& ctx
, aco_ptr
<Instruction
>& instr
,
200 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
201 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
203 int new_idx
= new_instructions
.size();
205 // TODO: setreg / getreg / m0 writes
206 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
208 /* break off from prevous SMEM clause if needed */
209 if (instr
->format
== Format::SMEM
&& ctx
.chip_class
>= GFX8
) {
210 return handle_SMEM_clause(instr
, new_idx
, new_instructions
);
211 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
214 if (instr
->isDPP()) {
215 /* VALU does not forward EXEC to DPP. */
216 if (ctx
.VALU_wrexec
+ 5 >= new_idx
)
217 NOPs
= 5 + ctx
.VALU_wrexec
- new_idx
+ 1;
219 /* VALU DPP reads VGPR written by VALU */
220 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 2; pred_idx
--) {
221 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
222 if ((pred
->isVALU() || pred
->format
== Format::VINTRP
) &&
223 !pred
->definitions
.empty() &&
224 pred
->definitions
[0].physReg() == instr
->operands
[0].physReg()) {
225 NOPs
= std::max(NOPs
, 2 + pred_idx
- new_idx
+ 1);
232 if (instr
->format
== Format::VINTRP
&& new_idx
> 0 && ctx
.chip_class
>= GFX9
) {
233 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
234 if (pred
->isSALU() &&
235 !pred
->definitions
.empty() &&
236 pred
->definitions
[0].physReg() == m0
)
237 NOPs
= std::max(NOPs
, 1);
240 for (const Operand
& op
: instr
->operands
) {
241 /* VALU which uses VCCZ */
242 if (op
.physReg() == PhysReg
{251} &&
243 ctx
.VALU_wrvcc
+ 5 >= new_idx
)
244 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrvcc
- new_idx
+ 1);
246 /* VALU which uses EXECZ */
247 if (op
.physReg() == PhysReg
{252} &&
248 ctx
.VALU_wrexec
+ 5 >= new_idx
)
249 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrexec
- new_idx
+ 1);
251 /* VALU which reads VCC as a constant */
252 if (ctx
.VALU_wrvcc
+ 1 >= new_idx
) {
253 for (unsigned k
= 0; k
< op
.size(); k
++) {
254 unsigned reg
= op
.physReg() + k
;
255 if (reg
== ctx
.vcc_physical
|| reg
== ctx
.vcc_physical
+ 1)
256 NOPs
= std::max(NOPs
, 1);
261 switch (instr
->opcode
) {
262 case aco_opcode::v_readlane_b32
:
263 case aco_opcode::v_writelane_b32
: {
264 if (ctx
.VALU_wrsgpr
+ 4 < new_idx
)
266 PhysReg reg
= instr
->operands
[1].physReg();
267 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 4; pred_idx
--) {
268 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
269 if (!pred
->isVALU() || !VALU_writes_sgpr(pred
))
271 for (const Definition
& def
: pred
->definitions
) {
272 if (def
.physReg() == reg
)
273 NOPs
= std::max(NOPs
, 4 + pred_idx
- new_idx
+ 1);
278 case aco_opcode::v_div_fmas_f32
:
279 case aco_opcode::v_div_fmas_f64
: {
280 if (ctx
.VALU_wrvcc
+ 4 >= new_idx
)
281 NOPs
= std::max(NOPs
, 4 + ctx
.VALU_wrvcc
- new_idx
+ 1);
288 /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
289 // FIXME: handle case if the last instruction of a block without branch is such store
290 // TODO: confirm that DS instructions cannot cause WAR hazards here
292 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
293 if (pred
->isVMEM() &&
294 pred
->operands
.size() == 4 &&
295 pred
->operands
[3].size() > 2 &&
296 pred
->operands
[1].size() != 8 &&
297 (pred
->format
!= Format::MUBUF
|| pred
->operands
[2].physReg() >= 102)) {
298 /* Ops that use a 256-bit T# do not need a wait state.
299 * BUFFER_STORE_* operations that use an SGPR for "offset"
300 * do not require any wait states. */
301 PhysReg wrdata
= pred
->operands
[3].physReg();
302 unsigned size
= pred
->operands
[3].size();
303 assert(wrdata
>= 256);
304 for (const Definition
& def
: instr
->definitions
) {
305 if (regs_intersect(def
.physReg(), def
.size(), wrdata
, size
))
306 NOPs
= std::max(NOPs
, 1);
311 if (VALU_writes_sgpr(instr
)) {
312 for (const Definition
& def
: instr
->definitions
) {
313 if (def
.physReg() == vcc
)
314 ctx
.VALU_wrvcc
= NOPs
? new_idx
: new_idx
+ 1;
315 else if (def
.physReg() == exec
)
316 ctx
.VALU_wrexec
= NOPs
? new_idx
: new_idx
+ 1;
317 else if (def
.physReg() <= 102)
318 ctx
.VALU_wrsgpr
= NOPs
? new_idx
: new_idx
+ 1;
322 } else if (instr
->isVMEM() && ctx
.VALU_wrsgpr
+ 5 >= new_idx
) {
323 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
324 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 5; pred_idx
--) {
325 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
326 if (!(pred
->isVALU() && VALU_writes_sgpr(pred
)))
329 for (const Definition
& def
: pred
->definitions
) {
330 if (def
.physReg() > 102)
333 if (instr
->operands
.size() > 1 &&
334 regs_intersect(instr
->operands
[1].physReg(), instr
->operands
[1].size(),
335 def
.physReg(), def
.size())) {
336 return 5 + pred_idx
- new_idx
+ 1;
339 if (instr
->operands
.size() > 2 &&
340 regs_intersect(instr
->operands
[2].physReg(), instr
->operands
[2].size(),
341 def
.physReg(), def
.size())) {
342 return 5 + pred_idx
- new_idx
+ 1;
351 void handle_block_gfx8_9(NOP_ctx_gfx8_9
& ctx
, Block
& block
)
353 std::vector
<aco_ptr
<Instruction
>> instructions
;
354 instructions
.reserve(block
.instructions
.size());
355 for (unsigned i
= 0; i
< block
.instructions
.size(); i
++) {
356 aco_ptr
<Instruction
>& instr
= block
.instructions
[i
];
357 unsigned NOPs
= handle_instruction_gfx8_9(ctx
, instr
, block
.instructions
, instructions
);
359 // TODO: try to move the instruction down
361 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
364 instructions
.emplace_back(std::move(nop
));
367 instructions
.emplace_back(std::move(instr
));
370 ctx
.VALU_wrvcc
-= instructions
.size();
371 ctx
.VALU_wrexec
-= instructions
.size();
372 ctx
.VALU_wrsgpr
-= instructions
.size();
373 block
.instructions
= std::move(instructions
);
376 void insert_NOPs_gfx8_9(Program
* program
)
378 NOP_ctx_gfx8_9
ctx(program
);
380 for (Block
& block
: program
->blocks
) {
381 if (block
.instructions
.empty())
384 handle_block_gfx8_9(ctx
, block
);
388 void handle_instruction_gfx10(Program
*program
, NOP_ctx_gfx10
&ctx
, aco_ptr
<Instruction
>& instr
,
389 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
390 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
392 /* VMEMtoScalarWriteHazard
393 * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
395 if (instr
->isVMEM() || instr
->format
== Format::FLAT
|| instr
->format
== Format::GLOBAL
||
396 instr
->format
== Format::SCRATCH
|| instr
->format
== Format::DS
) {
397 /* Remember all SGPRs that are read by the VMEM instruction */
398 mark_read_regs(instr
, ctx
.sgprs_read_by_VMEM
);
399 ctx
.sgprs_read_by_VMEM
.set(exec
);
400 if (program
->wave_size
== 64)
401 ctx
.sgprs_read_by_VMEM
.set(exec_hi
);
402 } else if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
403 /* Check if SALU writes an SGPR that was previously read by the VALU */
404 if (check_written_regs(instr
, ctx
.sgprs_read_by_VMEM
)) {
405 ctx
.sgprs_read_by_VMEM
.reset();
407 /* Insert v_nop to mitigate the problem */
408 aco_ptr
<VOP1_instruction
> nop
{create_instruction
<VOP1_instruction
>(aco_opcode::v_nop
, Format::VOP1
, 0, 0)};
409 new_instructions
.emplace_back(std::move(nop
));
411 } else if (instr
->opcode
== aco_opcode::s_waitcnt
) {
412 /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
413 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
414 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
416 ctx
.sgprs_read_by_VMEM
.reset();
417 } else if (instr
->isVALU()) {
418 /* Hazard is mitigated by any VALU instruction */
419 ctx
.sgprs_read_by_VMEM
.reset();
422 /* VcmpxPermlaneHazard
423 * Handle any permlane following a VOPC instruction, insert v_mov between them.
425 if (instr
->format
== Format::VOPC
) {
427 } else if (ctx
.has_VOPC
&&
428 (instr
->opcode
== aco_opcode::v_permlane16_b32
||
429 instr
->opcode
== aco_opcode::v_permlanex16_b32
)) {
430 ctx
.has_VOPC
= false;
432 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
433 aco_ptr
<VOP1_instruction
> v_mov
{create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1)};
434 v_mov
->definitions
[0] = Definition(instr
->operands
[0].physReg(), v1
);
435 v_mov
->operands
[0] = Operand(instr
->operands
[0].physReg(), v1
);
436 new_instructions
.emplace_back(std::move(v_mov
));
437 } else if (instr
->isVALU() && instr
->opcode
!= aco_opcode::v_nop
) {
438 ctx
.has_VOPC
= false;
441 /* VcmpxExecWARHazard
442 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
444 if (!instr
->isVALU() && instr
->reads_exec()) {
445 ctx
.has_nonVALU_exec_read
= true;
446 } else if (instr
->isVALU()) {
447 if (instr_writes_exec(instr
)) {
448 ctx
.has_nonVALU_exec_read
= false;
450 /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
451 aco_ptr
<SOPP_instruction
> depctr
{create_instruction
<SOPP_instruction
>(aco_opcode::s_waitcnt_depctr
, Format::SOPP
, 0, 1)};
452 depctr
->imm
= 0xfffe;
453 depctr
->definitions
[0] = Definition(sgpr_null
, s1
);
454 new_instructions
.emplace_back(std::move(depctr
));
455 } else if (instr_writes_sgpr(instr
)) {
456 /* Any VALU instruction that writes an SGPR mitigates the problem */
457 ctx
.has_nonVALU_exec_read
= false;
459 } else if (instr
->opcode
== aco_opcode::s_waitcnt_depctr
) {
460 /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */
461 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
462 if ((sopp
->imm
& 0xfffe) == 0xfffe)
463 ctx
.has_nonVALU_exec_read
= false;
466 /* SMEMtoVectorWriteHazard
467 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
469 if (instr
->format
== Format::SMEM
) {
470 /* Remember all SGPRs that are read by the SMEM instruction */
471 mark_read_regs(instr
, ctx
.sgprs_read_by_SMEM
);
472 } else if (VALU_writes_sgpr(instr
)) {
473 /* Check if VALU writes an SGPR that was previously read by SMEM */
474 if (check_written_regs(instr
, ctx
.sgprs_read_by_SMEM
)) {
475 ctx
.sgprs_read_by_SMEM
.reset();
477 /* Insert s_mov to mitigate the problem */
478 aco_ptr
<SOP1_instruction
> s_mov
{create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1)};
479 s_mov
->definitions
[0] = Definition(sgpr_null
, s1
);
480 s_mov
->operands
[0] = Operand(0u);
481 new_instructions
.emplace_back(std::move(s_mov
));
483 } else if (instr
->isSALU()) {
484 if (instr
->format
!= Format::SOPP
) {
485 /* SALU can mitigate the hazard */
486 ctx
.sgprs_read_by_SMEM
.reset();
488 /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
489 const SOPP_instruction
*sopp
= static_cast<const SOPP_instruction
*>(instr
.get());
490 if (sopp
->opcode
== aco_opcode::s_waitcnt_lgkmcnt
) {
491 if (sopp
->imm
== 0 && sopp
->definitions
[0].physReg() == sgpr_null
)
492 ctx
.sgprs_read_by_SMEM
.reset();
493 } else if (sopp
->opcode
== aco_opcode::s_waitcnt
) {
494 unsigned lgkm
= (sopp
->imm
>> 8) & 0x3f;
496 ctx
.sgprs_read_by_SMEM
.reset();
501 /* LdsBranchVmemWARHazard
502 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
504 if (instr
->isVMEM() || instr
->format
== Format::GLOBAL
|| instr
->format
== Format::SCRATCH
) {
506 ctx
.has_branch_after_VMEM
= false;
507 /* Mitigation for DS is needed only if there was already a branch after */
508 ctx
.has_DS
= ctx
.has_branch_after_DS
;
509 } else if (instr
->format
== Format::DS
) {
511 ctx
.has_branch_after_DS
= false;
512 /* Mitigation for VMEM is needed only if there was already a branch after */
513 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
;
514 } else if (instr_is_branch(instr
)) {
515 ctx
.has_branch_after_VMEM
= ctx
.has_VMEM
;
516 ctx
.has_branch_after_DS
= ctx
.has_DS
;
517 } else if (instr
->opcode
== aco_opcode::s_waitcnt_vscnt
) {
518 /* Only s_waitcnt_vscnt can mitigate the hazard */
519 const SOPK_instruction
*sopk
= static_cast<const SOPK_instruction
*>(instr
.get());
520 if (sopk
->definitions
[0].physReg() == sgpr_null
&& sopk
->imm
== 0)
521 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
523 if ((ctx
.has_VMEM
&& ctx
.has_branch_after_DS
) || (ctx
.has_DS
&& ctx
.has_branch_after_VMEM
)) {
524 ctx
.has_VMEM
= ctx
.has_branch_after_VMEM
= ctx
.has_DS
= ctx
.has_branch_after_DS
= false;
526 /* Insert s_waitcnt_vscnt to mitigate the problem */
527 aco_ptr
<SOPK_instruction
> wait
{create_instruction
<SOPK_instruction
>(aco_opcode::s_waitcnt_vscnt
, Format::SOPK
, 0, 1)};
528 wait
->definitions
[0] = Definition(sgpr_null
, s1
);
530 new_instructions
.emplace_back(std::move(wait
));
534 void handle_block_gfx10(Program
*program
, NOP_ctx_gfx10
& ctx
, Block
& block
)
536 if (block
.instructions
.empty())
539 std::vector
<aco_ptr
<Instruction
>> instructions
;
540 instructions
.reserve(block
.instructions
.size());
542 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
543 handle_instruction_gfx10(program
, ctx
, instr
, block
.instructions
, instructions
);
544 instructions
.emplace_back(std::move(instr
));
547 block
.instructions
= std::move(instructions
);
550 void mitigate_hazards_gfx10(Program
*program
)
552 NOP_ctx_gfx10 all_ctx
[program
->blocks
.size()];
553 std::stack
<unsigned> loop_header_indices
;
555 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
556 Block
& block
= program
->blocks
[i
];
557 NOP_ctx_gfx10
&ctx
= all_ctx
[i
];
559 if (block
.kind
& block_kind_loop_header
) {
560 loop_header_indices
.push(i
);
561 } else if (block
.kind
& block_kind_loop_exit
) {
562 /* Go through the whole loop again */
563 for (unsigned idx
= loop_header_indices
.top(); idx
< i
; idx
++) {
564 NOP_ctx_gfx10 loop_block_ctx
;
565 for (unsigned b
: program
->blocks
[idx
].linear_preds
)
566 loop_block_ctx
.join(all_ctx
[b
]);
568 handle_block_gfx10(program
, loop_block_ctx
, program
->blocks
[idx
]);
570 /* We only need to continue if the loop header context changed */
571 if (idx
== loop_header_indices
.top() && loop_block_ctx
== all_ctx
[idx
])
574 all_ctx
[idx
] = loop_block_ctx
;
577 loop_header_indices
.pop();
580 for (unsigned b
: block
.linear_preds
)
581 ctx
.join(all_ctx
[b
]);
583 handle_block_gfx10(program
, ctx
, block
);
587 } /* end namespace */
589 void insert_NOPs(Program
* program
)
591 if (program
->chip_class
>= GFX10
)
592 mitigate_hazards_gfx10(program
);
594 insert_NOPs_gfx8_9(program
);