aco: allow overflow for some SMEM instructions
[mesa.git] / src / amd / compiler / aco_ssa_elimination.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25
26 #include "aco_ir.h"
27
28 #include <map>
29
30 namespace aco {
31 namespace {
32
33 /* map: block-id -> pair (dest, src) to store phi information */
34 typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;
35
36 struct ssa_elimination_ctx {
37 phi_info logical_phi_info;
38 phi_info linear_phi_info;
39 std::vector<bool> empty_blocks;
40 Program* program;
41
42 ssa_elimination_ctx(Program* program) : empty_blocks(program->blocks.size(), true), program(program) {}
43 };
44
45 void collect_phi_info(ssa_elimination_ctx& ctx)
46 {
47 for (Block& block : ctx.program->blocks) {
48 for (aco_ptr<Instruction>& phi : block.instructions) {
49 if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
50 break;
51
52 for (unsigned i = 0; i < phi->operands.size(); i++) {
53 if (phi->operands[i].isUndefined())
54 continue;
55 if (phi->operands[i].isTemp() && phi->operands[i].physReg() == phi->definitions[0].physReg())
56 continue;
57
58 std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
59 phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info;
60 const auto result = info.emplace(preds[i], std::vector<std::pair<Definition, Operand>>());
61 assert(phi->definitions[0].size() == phi->operands[i].size());
62 result.first->second.emplace_back(phi->definitions[0], phi->operands[i]);
63 ctx.empty_blocks[preds[i]] = false;
64 }
65 }
66 }
67 }
68
69 void insert_parallelcopies(ssa_elimination_ctx& ctx)
70 {
71 /* insert the parallelcopies from logical phis before p_logical_end */
72 for (auto&& entry : ctx.logical_phi_info) {
73 Block& block = ctx.program->blocks[entry.first];
74 unsigned idx = block.instructions.size() - 1;
75 while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
76 assert(idx > 0);
77 idx--;
78 }
79
80 std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
81 aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())};
82 unsigned i = 0;
83 for (std::pair<Definition, Operand>& pair : entry.second)
84 {
85 pc->definitions[i] = pair.first;
86 pc->operands[i] = pair.second;
87 i++;
88 }
89 /* this shouldn't be needed since we're only copying vgprs */
90 pc->tmp_in_scc = false;
91 block.instructions.insert(it, std::move(pc));
92 }
93
94 /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
95 for (auto&& entry : ctx.linear_phi_info) {
96 Block& block = ctx.program->blocks[entry.first];
97 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
98 --it;
99 assert((*it)->format == Format::PSEUDO_BRANCH);
100 aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())};
101 unsigned i = 0;
102 for (std::pair<Definition, Operand>& pair : entry.second)
103 {
104 pc->definitions[i] = pair.first;
105 pc->operands[i] = pair.second;
106 i++;
107 }
108 pc->tmp_in_scc = block.scc_live_out;
109 pc->scratch_sgpr = block.scratch_sgpr;
110 block.instructions.insert(it, std::move(pc));
111 }
112 }
113
114 bool is_empty_block(Block* block, bool ignore_exec_writes)
115 {
116 /* check if this block is empty and the exec mask is not needed */
117 for (aco_ptr<Instruction>& instr : block->instructions) {
118 switch (instr->opcode) {
119 case aco_opcode::p_linear_phi:
120 case aco_opcode::p_phi:
121 case aco_opcode::p_logical_start:
122 case aco_opcode::p_logical_end:
123 case aco_opcode::p_branch:
124 break;
125 case aco_opcode::p_parallelcopy:
126 for (unsigned i = 0; i < instr->definitions.size(); i++) {
127 if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
128 continue;
129 if (instr->definitions[i].physReg() != instr->operands[i].physReg())
130 return false;
131 }
132 break;
133 case aco_opcode::s_andn2_b64:
134 case aco_opcode::s_andn2_b32:
135 if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
136 break;
137 default:
138 return false;
139 }
140 }
141 return true;
142 }
143
144 void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
145 {
146 /* check if the successor is another merge block which restores exec */
147 // TODO: divergent loops also restore exec
148 if (block->linear_succs.size() != 1 ||
149 !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))
150 return;
151
152 /* check if this block is empty */
153 if (!is_empty_block(block, true))
154 return;
155
156 /* keep the branch instruction and remove the rest */
157 aco_ptr<Instruction> branch = std::move(block->instructions.back());
158 block->instructions.clear();
159 block->instructions.emplace_back(std::move(branch));
160 }
161
162 void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
163 {
164 assert(block->linear_succs.size() == 2);
165 /* only remove this block if the successor got removed as well */
166 if (block->linear_succs[0] != block->linear_succs[1])
167 return;
168
169 /* check if block is otherwise empty */
170 if (!is_empty_block(block, true))
171 return;
172
173 unsigned succ_idx = block->linear_succs[0];
174 assert(block->linear_preds.size() == 2);
175 for (unsigned i = 0; i < 2; i++) {
176 Block *pred = &ctx.program->blocks[block->linear_preds[i]];
177 pred->linear_succs[0] = succ_idx;
178 ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;
179
180 Pseudo_branch_instruction *branch = static_cast<Pseudo_branch_instruction*>(pred->instructions.back().get());
181 assert(branch->format == Format::PSEUDO_BRANCH);
182 branch->target[0] = succ_idx;
183 branch->target[1] = succ_idx;
184 }
185
186 block->instructions.clear();
187 block->linear_preds.clear();
188 block->linear_succs.clear();
189 }
190
191 void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
192 {
193 if (!is_empty_block(block, false))
194 return;
195
196 Block& pred = ctx.program->blocks[block->linear_preds[0]];
197 Block& succ = ctx.program->blocks[block->linear_succs[0]];
198 Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(pred.instructions.back().get());
199 if (branch->opcode == aco_opcode::p_branch) {
200 branch->target[0] = succ.index;
201 branch->target[1] = succ.index;
202 } else if (branch->target[0] == block->index) {
203 branch->target[0] = succ.index;
204 } else if (branch->target[0] == succ.index) {
205 assert(branch->target[1] == block->index);
206 branch->target[1] = succ.index;
207 branch->opcode = aco_opcode::p_branch;
208 } else if (branch->target[1] == block->index) {
209 /* check if there is a fall-through path from block to succ */
210 bool falls_through = block->index < succ.index;
211 for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {
212 assert(ctx.program->blocks[j].index == j);
213 if (!ctx.program->blocks[j].instructions.empty())
214 falls_through = false;
215 }
216 if (falls_through) {
217 branch->target[1] = succ.index;
218 } else {
219 /* check if there is a fall-through path for the alternative target */
220 if (block->index >= branch->target[0])
221 return;
222 for (unsigned j = block->index + 1; j < branch->target[0]; j++) {
223 if (!ctx.program->blocks[j].instructions.empty())
224 return;
225 }
226
227 /* This is a (uniform) break or continue block. The branch condition has to be inverted. */
228 if (branch->opcode == aco_opcode::p_cbranch_z)
229 branch->opcode = aco_opcode::p_cbranch_nz;
230 else if (branch->opcode == aco_opcode::p_cbranch_nz)
231 branch->opcode = aco_opcode::p_cbranch_z;
232 else
233 assert(false);
234 /* also invert the linear successors */
235 pred.linear_succs[0] = pred.linear_succs[1];
236 pred.linear_succs[1] = succ.index;
237 branch->target[1] = branch->target[0];
238 branch->target[0] = succ.index;
239 }
240 } else {
241 assert(false);
242 }
243
244 if (branch->target[0] == branch->target[1])
245 branch->opcode = aco_opcode::p_branch;
246
247 for (unsigned i = 0; i < pred.linear_succs.size(); i++)
248 if (pred.linear_succs[i] == block->index)
249 pred.linear_succs[i] = succ.index;
250
251 for (unsigned i = 0; i < succ.linear_preds.size(); i++)
252 if (succ.linear_preds[i] == block->index)
253 succ.linear_preds[i] = pred.index;
254
255 block->instructions.clear();
256 block->linear_preds.clear();
257 block->linear_succs.clear();
258 }
259
260 void jump_threading(ssa_elimination_ctx& ctx)
261 {
262 for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
263 Block* block = &ctx.program->blocks[i];
264
265 if (!ctx.empty_blocks[i])
266 continue;
267
268 if (block->kind & block_kind_invert) {
269 try_remove_invert_block(ctx, block);
270 continue;
271 }
272
273 if (block->linear_succs.size() > 1)
274 continue;
275
276 if (block->kind & block_kind_merge ||
277 block->kind & block_kind_loop_exit)
278 try_remove_merge_block(ctx, block);
279
280 if (block->linear_preds.size() == 1)
281 try_remove_simple_block(ctx, block);
282 }
283 }
284
285 } /* end namespace */
286
287
288 void ssa_elimination(Program* program)
289 {
290 ssa_elimination_ctx ctx(program);
291
292 /* Collect information about every phi-instruction */
293 collect_phi_info(ctx);
294
295 /* eliminate empty blocks */
296 jump_threading(ctx);
297
298 /* insert parallelcopies from SSA elimination */
299 insert_parallelcopies(ctx);
300
301 }
302 }