/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
31 enum chip_class chip_class
;
32 unsigned vcc_physical
;
35 /* just initialize these with something less than max NOPs */
36 int VALU_wrexec
= -10;
38 int VALU_wrsgpr
= -10;
41 int last_VMEM_since_scalar_write
= -1;
43 NOP_ctx(Program
* program
) : chip_class(program
->chip_class
) {
44 vcc_physical
= program
->config
->num_sgprs
- 2;
48 bool VALU_writes_sgpr(aco_ptr
<Instruction
>& instr
)
50 if ((uint32_t) instr
->format
& (uint32_t) Format::VOPC
)
52 if (instr
->isVOP3() && instr
->definitions
.size() == 2)
54 if (instr
->opcode
== aco_opcode::v_readfirstlane_b32
|| instr
->opcode
== aco_opcode::v_readlane_b32
)
59 bool regs_intersect(PhysReg a_reg
, unsigned a_size
, PhysReg b_reg
, unsigned b_size
)
61 return a_reg
> b_reg
?
62 (a_reg
- b_reg
< b_size
) :
63 (b_reg
- a_reg
< a_size
);
66 unsigned handle_SMEM_clause(aco_ptr
<Instruction
>& instr
, int new_idx
,
67 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
69 //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?)
70 const bool is_store
= instr
->definitions
.empty();
71 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0; pred_idx
--) {
72 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
73 if (pred
->format
!= Format::SMEM
)
76 /* Don't allow clauses with store instructions since the clause's
77 * instructions may use the same address. */
78 if (is_store
|| pred
->definitions
.empty())
81 Definition
& instr_def
= instr
->definitions
[0];
82 Definition
& pred_def
= pred
->definitions
[0];
84 /* ISA reference doesn't say anything about this, but best to be safe */
85 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), pred_def
.physReg(), pred_def
.size()))
88 for (const Operand
& op
: pred
->operands
) {
89 if (op
.isConstant() || !op
.isFixed())
91 if (regs_intersect(instr_def
.physReg(), instr_def
.size(), op
.physReg(), op
.size()))
94 for (const Operand
& op
: instr
->operands
) {
95 if (op
.isConstant() || !op
.isFixed())
97 if (regs_intersect(pred_def
.physReg(), pred_def
.size(), op
.physReg(), op
.size()))
105 int handle_instruction(NOP_ctx
& ctx
, aco_ptr
<Instruction
>& instr
,
106 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
107 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
109 int new_idx
= new_instructions
.size();
111 // TODO: setreg / getreg / m0 writes
112 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
114 /* break off from prevous SMEM clause if needed */
115 if (instr
->format
== Format::SMEM
&& ctx
.chip_class
>= GFX8
) {
116 return handle_SMEM_clause(instr
, new_idx
, new_instructions
);
117 } else if (instr
->isVALU() || instr
->format
== Format::VINTRP
) {
120 if (instr
->isDPP()) {
121 /* VALU does not forward EXEC to DPP. */
122 if (ctx
.VALU_wrexec
+ 5 >= new_idx
)
123 NOPs
= 5 + ctx
.VALU_wrexec
- new_idx
+ 1;
125 /* VALU DPP reads VGPR written by VALU */
126 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 2; pred_idx
--) {
127 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
128 if ((pred
->isVALU() || pred
->format
== Format::VINTRP
) &&
129 !pred
->definitions
.empty() &&
130 pred
->definitions
[0].physReg() == instr
->operands
[0].physReg()) {
131 NOPs
= std::max(NOPs
, 2 + pred_idx
- new_idx
+ 1);
138 if (instr
->format
== Format::VINTRP
&& new_idx
> 0 && ctx
.chip_class
>= GFX9
) {
139 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
140 if (pred
->isSALU() &&
141 !pred
->definitions
.empty() &&
142 pred
->definitions
[0].physReg() == m0
)
143 NOPs
= std::max(NOPs
, 1);
146 for (const Operand
& op
: instr
->operands
) {
147 /* VALU which uses VCCZ */
148 if (op
.physReg() == PhysReg
{251} &&
149 ctx
.VALU_wrvcc
+ 5 >= new_idx
)
150 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrvcc
- new_idx
+ 1);
152 /* VALU which uses EXECZ */
153 if (op
.physReg() == PhysReg
{252} &&
154 ctx
.VALU_wrexec
+ 5 >= new_idx
)
155 NOPs
= std::max(NOPs
, 5 + ctx
.VALU_wrexec
- new_idx
+ 1);
157 /* VALU which reads VCC as a constant */
158 if (ctx
.VALU_wrvcc
+ 1 >= new_idx
) {
159 for (unsigned k
= 0; k
< op
.size(); k
++) {
160 unsigned reg
= op
.physReg() + k
;
161 if (reg
== ctx
.vcc_physical
|| reg
== ctx
.vcc_physical
+ 1)
162 NOPs
= std::max(NOPs
, 1);
167 switch (instr
->opcode
) {
168 case aco_opcode::v_readlane_b32
:
169 case aco_opcode::v_writelane_b32
: {
170 if (ctx
.VALU_wrsgpr
+ 4 < new_idx
)
172 PhysReg reg
= instr
->operands
[1].physReg();
173 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 4; pred_idx
--) {
174 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
175 if (!pred
->isVALU() || !VALU_writes_sgpr(pred
))
177 for (const Definition
& def
: pred
->definitions
) {
178 if (def
.physReg() == reg
)
179 NOPs
= std::max(NOPs
, 4 + pred_idx
- new_idx
+ 1);
184 case aco_opcode::v_div_fmas_f32
:
185 case aco_opcode::v_div_fmas_f64
: {
186 if (ctx
.VALU_wrvcc
+ 4 >= new_idx
)
187 NOPs
= std::max(NOPs
, 4 + ctx
.VALU_wrvcc
- new_idx
+ 1);
194 /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
195 // FIXME: handle case if the last instruction of a block without branch is such store
196 // TODO: confirm that DS instructions cannot cause WAR hazards here
198 aco_ptr
<Instruction
>& pred
= new_instructions
.back();
199 if (pred
->isVMEM() &&
200 pred
->operands
.size() == 4 &&
201 pred
->operands
[3].size() > 2 &&
202 pred
->operands
[1].size() != 8 &&
203 (pred
->format
!= Format::MUBUF
|| pred
->operands
[2].physReg() >= 102)) {
204 /* Ops that use a 256-bit T# do not need a wait state.
205 * BUFFER_STORE_* operations that use an SGPR for "offset"
206 * do not require any wait states. */
207 PhysReg wrdata
= pred
->operands
[3].physReg();
208 unsigned size
= pred
->operands
[3].size();
209 assert(wrdata
>= 256);
210 for (const Definition
& def
: instr
->definitions
) {
211 if (regs_intersect(def
.physReg(), def
.size(), wrdata
, size
))
212 NOPs
= std::max(NOPs
, 1);
217 if (VALU_writes_sgpr(instr
)) {
218 for (const Definition
& def
: instr
->definitions
) {
219 if (def
.physReg() == vcc
)
220 ctx
.VALU_wrvcc
= NOPs
? new_idx
: new_idx
+ 1;
221 else if (def
.physReg() == exec
)
222 ctx
.VALU_wrexec
= NOPs
? new_idx
: new_idx
+ 1;
223 else if (def
.physReg() <= 102)
224 ctx
.VALU_wrsgpr
= NOPs
? new_idx
: new_idx
+ 1;
228 } else if (instr
->isVMEM() && ctx
.VALU_wrsgpr
+ 5 >= new_idx
) {
229 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
230 for (int pred_idx
= new_idx
- 1; pred_idx
>= 0 && pred_idx
>= new_idx
- 5; pred_idx
--) {
231 aco_ptr
<Instruction
>& pred
= new_instructions
[pred_idx
];
232 if (!(pred
->isVALU() && VALU_writes_sgpr(pred
)))
235 for (const Definition
& def
: pred
->definitions
) {
236 if (def
.physReg() > 102)
239 if (instr
->operands
.size() > 1 &&
240 regs_intersect(instr
->operands
[1].physReg(), instr
->operands
[1].size(),
241 def
.physReg(), def
.size())) {
242 return 5 + pred_idx
- new_idx
+ 1;
245 if (instr
->operands
.size() > 2 &&
246 regs_intersect(instr
->operands
[2].physReg(), instr
->operands
[2].size(),
247 def
.physReg(), def
.size())) {
248 return 5 + pred_idx
- new_idx
+ 1;
257 std::pair
<int, int> handle_instruction_gfx10(NOP_ctx
& ctx
, aco_ptr
<Instruction
>& instr
,
258 std::vector
<aco_ptr
<Instruction
>>& old_instructions
,
259 std::vector
<aco_ptr
<Instruction
>>& new_instructions
)
261 int new_idx
= new_instructions
.size();
265 /* break off from prevous SMEM group ("clause" seems to mean something different in RDNA) if needed */
266 if (instr
->format
== Format::SMEM
)
267 sNOPs
= std::max(sNOPs
, handle_SMEM_clause(instr
, new_idx
, new_instructions
));
269 /* handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between */
270 if (instr
->isSALU() || instr
->format
== Format::SMEM
) {
271 if (!instr
->definitions
.empty() && ctx
.last_VMEM_since_scalar_write
!= -1) {
272 ctx
.last_VMEM_since_scalar_write
= -1;
275 } else if (instr
->isVMEM() || instr
->isFlatOrGlobal()) {
276 ctx
.last_VMEM_since_scalar_write
= new_idx
;
277 } else if (instr
->opcode
== aco_opcode::s_waitcnt
) {
278 uint16_t imm
= static_cast<SOPP_instruction
*>(instr
.get())->imm
;
279 unsigned vmcnt
= (imm
& 0xF) | ((imm
& (0x3 << 14)) >> 10);
281 ctx
.last_VMEM_since_scalar_write
= -1;
282 } else if (instr
->isVALU()) {
283 ctx
.last_VMEM_since_scalar_write
= -1;
286 return std::make_pair(sNOPs
, vNOPs
);
290 void handle_block(NOP_ctx
& ctx
, Block
& block
)
292 std::vector
<aco_ptr
<Instruction
>> instructions
;
293 instructions
.reserve(block
.instructions
.size());
294 for (unsigned i
= 0; i
< block
.instructions
.size(); i
++) {
295 aco_ptr
<Instruction
>& instr
= block
.instructions
[i
];
296 unsigned NOPs
= handle_instruction(ctx
, instr
, block
.instructions
, instructions
);
298 // TODO: try to move the instruction down
300 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
303 instructions
.emplace_back(std::move(nop
));
306 instructions
.emplace_back(std::move(instr
));
309 ctx
.VALU_wrvcc
-= instructions
.size();
310 ctx
.VALU_wrexec
-= instructions
.size();
311 ctx
.VALU_wrsgpr
-= instructions
.size();
312 block
.instructions
= std::move(instructions
);
315 void handle_block_gfx10(NOP_ctx
& ctx
, Block
& block
)
317 std::vector
<aco_ptr
<Instruction
>> instructions
;
318 instructions
.reserve(block
.instructions
.size());
319 for (unsigned i
= 0; i
< block
.instructions
.size(); i
++) {
320 aco_ptr
<Instruction
>& instr
= block
.instructions
[i
];
321 std::pair
<int, int> NOPs
= handle_instruction_gfx10(ctx
, instr
, block
.instructions
, instructions
);
322 for (int i
= 0; i
< NOPs
.second
; i
++) {
323 // TODO: try to move the instruction down
325 aco_ptr
<VOP1_instruction
> nop
{create_instruction
<VOP1_instruction
>(aco_opcode::v_nop
, Format::VOP1
, 0, 0)};
326 instructions
.emplace_back(std::move(nop
));
329 // TODO: try to move the instruction down
331 aco_ptr
<SOPP_instruction
> nop
{create_instruction
<SOPP_instruction
>(aco_opcode::s_nop
, Format::SOPP
, 0, 0)};
332 nop
->imm
= NOPs
.first
- 1;
334 instructions
.emplace_back(std::move(nop
));
337 instructions
.emplace_back(std::move(instr
));
340 block
.instructions
= std::move(instructions
);
343 } /* end namespace */
346 void insert_NOPs(Program
* program
)
348 NOP_ctx
ctx(program
);
350 for (Block
& block
: program
->blocks
) {
351 if (block
.instructions
.empty())
354 if (ctx
.chip_class
>= GFX10
)
355 handle_block_gfx10(ctx
, block
);
357 handle_block(ctx
, block
);