aco: Remove superfluous argument from emit_boolean_logic.
[mesa.git] / src / amd / compiler / aco_lower_to_cssa.cpp
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <map>
26 #include "aco_ir.h"
27 #include "aco_builder.h"
28
29 /*
30 * Implements an algorithm to lower to Concentional SSA Form (CSSA).
31 * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
32 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
33 *
34 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
35 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
36 * The algorithm tries to find beneficial insertion points by checking if a basic block
37 * is empty and if the variable already has a new definition in a dominating block.
38 */
39
40
41 namespace aco {
42 namespace {
43
44 typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;
45
46 struct cssa_ctx {
47 Program* program;
48 live& live_vars;
49 phi_info logical_phi_info;
50 phi_info linear_phi_info;
51
52 cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
53 };
54
55 unsigned get_lca(cssa_ctx& ctx, unsigned x, unsigned y, bool is_logical)
56 {
57 while (x != y) {
58 if (x > y) {
59 x = is_logical ? ctx.program->blocks[x].logical_idom : ctx.program->blocks[x].linear_idom;
60 } else {
61 y = is_logical ? ctx.program->blocks[y].logical_idom : ctx.program->blocks[y].linear_idom;
62 }
63 }
64 return x;
65 }
66
67 bool collect_phi_info(cssa_ctx& ctx)
68 {
69 bool progress = false;
70 for (Block& block : ctx.program->blocks) {
71 for (aco_ptr<Instruction>& phi : block.instructions) {
72 bool is_logical;
73 if (phi->opcode == aco_opcode::p_phi)
74 is_logical = true;
75 else if (phi->opcode == aco_opcode::p_linear_phi)
76 is_logical = false;
77 else
78 break;
79
80 /* no CSSA for the exec mask as we don't spill it anyway */
81 if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
82 continue;
83 std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;
84
85 /* collect definition's block per Operand */
86 std::vector<unsigned> def_points(phi->operands.size());
87 for (unsigned i = 0; i < phi->operands.size(); i++) {
88 Operand& op = phi->operands[i];
89 if (op.isUndefined()) {
90 def_points[i] = preds[i];
91 } else if (op.isConstant()) {
92 /* in theory, we could insert the definition there... */
93 def_points[i] = 0;
94 } else {
95 assert(op.isTemp());
96 unsigned pred = preds[i];
97 do {
98 def_points[i] = pred;
99 pred = is_logical ?
100 ctx.program->blocks[pred].logical_idom :
101 ctx.program->blocks[pred].linear_idom;
102 } while (def_points[i] != pred &&
103 ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
104 }
105 }
106
107 /* check live-range intersections */
108 for (unsigned i = 0; i < phi->operands.size(); i++) {
109 Operand op = phi->operands[i];
110 if (op.isUndefined())
111 continue;
112 /* check if the operand comes from the exec mask of a predecessor */
113 if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
114 op.setFixed(exec);
115
116 /* calculate the earliest block where we can insert a copy if needed */
117 bool intersects = false;
118 unsigned upper_bound = preds[i];
119 while (def_points[i] < upper_bound) {
120 unsigned new_ub = is_logical ?
121 ctx.program->blocks[upper_bound].logical_idom :
122 ctx.program->blocks[upper_bound].linear_idom;
123
124 for (unsigned j = 0; j < phi->operands.size(); j++) {
125 /* same operands cannot intersect */
126 if (phi->operands[j].isTemp() && op.getTemp() == phi->operands[j].getTemp())
127 continue;
128 /* live-ranges intersect if any other definition point is dominated by the current definition */
129 intersects |= def_points[j] == new_ub;
130 }
131 if (intersects)
132 break;
133 else
134 upper_bound = new_ub;
135 }
136
137 if (!intersects) {
138 /* constants and live-through definitions can get a copy
139 * at their definition point if there is no other intersection */
140 if (op.isConstant() || !op.isKill() || op.regClass().type() != phi->definitions[0].regClass().type()) {
141 upper_bound = def_points[i];
142 } else {
143 continue;
144 }
145 }
146
147 progress = true;
148 unsigned insert_block = preds[i];
149
150 /* if the predecessor block is empty, try to insert at the dominator */
151 bool is_empty = (is_logical && ctx.program->blocks[insert_block].instructions.size() == 3) ||
152 ctx.program->blocks[insert_block].instructions.size() == 1;
153 if (is_empty) {
154 unsigned idom = is_logical ?
155 ctx.program->blocks[insert_block].logical_idom :
156 ctx.program->blocks[insert_block].linear_idom;
157 if (idom > upper_bound)
158 insert_block = idom;
159 }
160
161 /* if other operands share the same value, try to insert at LCA */
162 std::vector<unsigned> indices = {i};
163 for (unsigned j = i + 1; j < phi->operands.size(); j++) {
164 if ((phi->operands[j].isTemp() && op.isTemp() && phi->operands[j].getTemp() == op.getTemp()) ||
165 (phi->operands[j].isConstant() && op.isConstant() && phi->operands[j].constantValue() == op.constantValue())) {
166 unsigned lca = get_lca(ctx, insert_block, preds[j], is_logical);
167 if (lca > upper_bound) {
168 insert_block = lca;
169 indices.push_back(j);
170 }
171 }
172 }
173
174 /* create new temporary and rename operands */
175 Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
176 if (is_logical)
177 ctx.logical_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
178 else
179 ctx.linear_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
180 for (unsigned index : indices) {
181 phi->operands[index] = Operand(new_tmp);
182 phi->operands[index].setKill(true);
183 def_points[index] = insert_block;
184 }
185 }
186 }
187 }
188 return progress;
189 }
190
191 void insert_parallelcopies(cssa_ctx& ctx)
192 {
193 /* insert the parallelcopies from logical phis before p_logical_end */
194 for (auto&& entry : ctx.logical_phi_info) {
195 Block& block = ctx.program->blocks[entry.first];
196 unsigned idx = block.instructions.size() - 1;
197 while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
198 assert(idx > 0);
199 idx--;
200 }
201
202 Builder bld(ctx.program);
203 bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
204 for (std::pair<Definition, Operand>& pair : entry.second)
205 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
206 }
207
208 /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
209 for (auto&& entry : ctx.linear_phi_info) {
210 Block& block = ctx.program->blocks[entry.first];
211 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
212 --it;
213 assert((*it)->format == Format::PSEUDO_BRANCH);
214
215 Builder bld(ctx.program);
216 bld.reset(&block.instructions, it);
217 for (std::pair<Definition, Operand>& pair : entry.second)
218 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
219 }
220 }
221
222 } /* end namespace */
223
224
225 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
226 {
227 cssa_ctx ctx = {program, live_vars};
228 /* collect information about all interfering phi operands */
229 bool progress = collect_phi_info(ctx);
230
231 if (!progress)
232 return;
233
234 insert_parallelcopies(ctx);
235
236 /* update live variable information */
237 live_vars = live_var_analysis(program, options);
238 }
239 }
240