aco: Initial commit of independent AMD compiler
[mesa.git] / src / amd / compiler / aco_insert_NOPs.cpp
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26
27 namespace aco {
28 namespace {
29
30 struct NOP_ctx {
31 /* just initialize these with something less than max NOPs */
32 int VALU_wrexec = -10;
33 int VALU_wrvcc = -10;
34 int VALU_wrsgpr = -10;
35 enum chip_class chip_class;
36 unsigned vcc_physical;
37 NOP_ctx(Program* program) : chip_class(program->chip_class) {
38 vcc_physical = program->config->num_sgprs - 2;
39 }
40 };
41
42 bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
43 {
44 if ((uint32_t) instr->format & (uint32_t) Format::VOPC)
45 return true;
46 if (instr->isVOP3() && instr->definitions.size() == 2)
47 return true;
48 if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
49 return true;
50 return false;
51 }
52
53 bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
54 {
55 return a_reg > b_reg ?
56 (a_reg - b_reg < b_size) :
57 (b_reg - a_reg < a_size);
58 }
59
60 int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr,
61 std::vector<aco_ptr<Instruction>>& old_instructions,
62 std::vector<aco_ptr<Instruction>>& new_instructions)
63 {
64 int new_idx = new_instructions.size();
65
66 // TODO: setreg / getreg / m0 writes
67 // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
68
69 /* break off from prevous SMEM clause if needed */
70 if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) {
71 const bool is_store = instr->definitions.empty();
72 for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) {
73 aco_ptr<Instruction>& pred = new_instructions[pred_idx];
74 if (pred->format != Format::SMEM)
75 break;
76
77 /* Don't allow clauses with store instructions since the clause's
78 * instructions may use the same address. */
79 if (is_store || pred->definitions.empty())
80 return 1;
81
82 Definition& instr_def = instr->definitions[0];
83 Definition& pred_def = pred->definitions[0];
84
85 /* ISA reference doesn't say anything about this, but best to be safe */
86 if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size()))
87 return 1;
88
89 for (const Operand& op : pred->operands) {
90 if (op.isConstant() || !op.isFixed())
91 continue;
92 if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size()))
93 return 1;
94 }
95 for (const Operand& op : instr->operands) {
96 if (op.isConstant() || !op.isFixed())
97 continue;
98 if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size()))
99 return 1;
100 }
101 }
102 } else if (instr->isVALU() || instr->format == Format::VINTRP) {
103 int NOPs = 0;
104
105 if (instr->isDPP()) {
106 /* VALU does not forward EXEC to DPP. */
107 if (ctx.VALU_wrexec + 5 >= new_idx)
108 NOPs = 5 + ctx.VALU_wrexec - new_idx + 1;
109
110 /* VALU DPP reads VGPR written by VALU */
111 for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) {
112 aco_ptr<Instruction>& pred = new_instructions[pred_idx];
113 if ((pred->isVALU() || pred->format == Format::VINTRP) &&
114 !pred->definitions.empty() &&
115 pred->definitions[0].physReg() == instr->operands[0].physReg()) {
116 NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1);
117 break;
118 }
119 }
120 }
121
122 /* SALU writes M0 */
123 if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) {
124 aco_ptr<Instruction>& pred = new_instructions.back();
125 if (pred->isSALU() &&
126 !pred->definitions.empty() &&
127 pred->definitions[0].physReg() == m0)
128 NOPs = std::max(NOPs, 1);
129 }
130
131 for (const Operand& op : instr->operands) {
132 /* VALU which uses VCCZ */
133 if (op.physReg() == PhysReg{251} &&
134 ctx.VALU_wrvcc + 5 >= new_idx)
135 NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1);
136
137 /* VALU which uses EXECZ */
138 if (op.physReg() == PhysReg{252} &&
139 ctx.VALU_wrexec + 5 >= new_idx)
140 NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1);
141
142 /* VALU which reads VCC as a constant */
143 if (ctx.VALU_wrvcc + 1 >= new_idx) {
144 for (unsigned k = 0; k < op.size(); k++) {
145 unsigned reg = op.physReg() + k;
146 if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1)
147 NOPs = std::max(NOPs, 1);
148 }
149 }
150 }
151
152 switch (instr->opcode) {
153 case aco_opcode::v_readlane_b32:
154 case aco_opcode::v_writelane_b32: {
155 if (ctx.VALU_wrsgpr + 4 < new_idx)
156 break;
157 PhysReg reg = instr->operands[1].physReg();
158 for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) {
159 aco_ptr<Instruction>& pred = new_instructions[pred_idx];
160 if (!pred->isVALU() || !VALU_writes_sgpr(pred))
161 continue;
162 for (const Definition& def : pred->definitions) {
163 if (def.physReg() == reg)
164 NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1);
165 }
166 }
167 break;
168 }
169 case aco_opcode::v_div_fmas_f32:
170 case aco_opcode::v_div_fmas_f64: {
171 if (ctx.VALU_wrvcc + 4 >= new_idx)
172 NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1);
173 break;
174 }
175 default:
176 break;
177 }
178
179 /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */
180 // FIXME: handle case if the last instruction of a block without branch is such store
181 // TODO: confirm that DS instructions cannot cause WAR hazards here
182 if (new_idx > 0) {
183 aco_ptr<Instruction>& pred = new_instructions.back();
184 if (pred->isVMEM() &&
185 pred->operands.size() == 4 &&
186 pred->operands[3].size() > 2 &&
187 pred->operands[1].size() != 8 &&
188 (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) {
189 /* Ops that use a 256-bit T# do not need a wait state.
190 * BUFFER_STORE_* operations that use an SGPR for "offset"
191 * do not require any wait states. */
192 PhysReg wrdata = pred->operands[3].physReg();
193 unsigned size = pred->operands[3].size();
194 assert(wrdata >= 256);
195 for (const Definition& def : instr->definitions) {
196 if (regs_intersect(def.physReg(), def.size(), wrdata, size))
197 NOPs = std::max(NOPs, 1);
198 }
199 }
200 }
201
202 if (VALU_writes_sgpr(instr)) {
203 for (const Definition& def : instr->definitions) {
204 if (def.physReg() == vcc)
205 ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1;
206 else if (def.physReg() == exec)
207 ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1;
208 else if (def.physReg() <= 102)
209 ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1;
210 }
211 }
212 return NOPs;
213 } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) {
214 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
215 for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) {
216 aco_ptr<Instruction>& pred = new_instructions[pred_idx];
217 if (!(pred->isVALU() && VALU_writes_sgpr(pred)))
218 continue;
219
220 for (const Definition& def : pred->definitions) {
221 if (def.physReg() > 102)
222 continue;
223
224 if (instr->operands.size() > 1 &&
225 regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(),
226 def.physReg(), def.size())) {
227 return 5 + pred_idx - new_idx + 1;
228 }
229
230 if (instr->operands.size() > 2 &&
231 regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(),
232 def.physReg(), def.size())) {
233 return 5 + pred_idx - new_idx + 1;
234 }
235 }
236 }
237 }
238
239 return 0;
240 }
241
242
243 void handle_block(NOP_ctx& ctx, Block& block)
244 {
245 std::vector<aco_ptr<Instruction>> instructions;
246 instructions.reserve(block.instructions.size());
247 for (unsigned i = 0; i < block.instructions.size(); i++) {
248 aco_ptr<Instruction>& instr = block.instructions[i];
249 unsigned NOPs = handle_instruction(ctx, instr, block.instructions, instructions);
250 if (NOPs) {
251 // TODO: try to move the instruction down
252 /* create NOP */
253 aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
254 nop->imm = NOPs - 1;
255 nop->block = -1;
256 instructions.emplace_back(std::move(nop));
257 }
258
259 instructions.emplace_back(std::move(instr));
260 }
261
262 ctx.VALU_wrvcc -= instructions.size();
263 ctx.VALU_wrexec -= instructions.size();
264 ctx.VALU_wrsgpr -= instructions.size();
265 block.instructions = std::move(instructions);
266 }
267
268 } /* end namespace */
269
270
271 void insert_NOPs(Program* program)
272 {
273 NOP_ctx ctx(program);
274 for (Block& block : program->blocks) {
275 if (block.instructions.empty())
276 continue;
277
278 handle_block(ctx, block);
279 }
280 }
281
282 }