aco: print ACO IR before scheduling instead of after
[mesa.git] / src / amd / compiler / aco_lower_to_cssa.cpp
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <map>
26 #include "aco_ir.h"
27 #include "aco_builder.h"
28
/*
 * Implements an algorithm to lower to Conventional SSA Form (CSSA).
 * After "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon.
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm tries to find beneficial insertion points by checking if a basic block
 * is empty and if the variable already has a new definition in a dominating block.
 */
39
40
41 namespace aco {
42 namespace {
43
/* Maps a block index to the list of parallelcopies (new Definition, source
 * Operand) that must be inserted at the end of that block. */
typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;

/* Pass-wide state shared between collect_phi_info() and insert_parallelcopies(). */
struct cssa_ctx {
   Program* program;
   live& live_vars;           /* live-out sets per block, queried during interference checks */
   phi_info logical_phi_info; /* copies for p_phi operands, inserted before p_logical_end */
   phi_info linear_phi_info;  /* copies for p_linear_phi operands, inserted before the branch */

   cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
};
54
/* Walks all phis in the program and decides, per operand, whether the operand's
 * live range might intersect the live range of the phi definition (or of another
 * operand of the same phi). Interfering operands get a fresh temporary recorded in
 * ctx.logical_phi_info / ctx.linear_phi_info (as a parallelcopy to be inserted in
 * the predecessor) and the phi is rewritten to read the new temporary.
 * Returns true iff at least one copy was recorded. */
bool collect_phi_info(cssa_ctx& ctx)
{
   bool progress = false;
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         bool is_logical;
         if (phi->opcode == aco_opcode::p_phi)
            is_logical = true;
         else if (phi->opcode == aco_opcode::p_linear_phi)
            is_logical = false;
         else
            break; /* phis are grouped at the block start: first non-phi ends the scan */

         /* no CSSA for the exec mask as we don't spill it anyway */
         if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
            continue;
         std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;

         /* collect definition's block per Operand: for temps, hoist the point up the
          * dominator tree as long as the operand is still live-out of the dominator,
          * so a copy can be placed as early (and as shared) as possible */
         std::vector<unsigned> def_points(phi->operands.size());
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand& op = phi->operands[i];
            if (op.isUndefined()) {
               def_points[i] = preds[i];
            } else if (op.isConstant()) {
               /* in theory, we could insert the definition there... */
               def_points[i] = 0;
            } else {
               assert(op.isTemp());
               unsigned pred = preds[i];
               do {
                  def_points[i] = pred;
                  pred = is_logical ?
                         ctx.program->blocks[pred].logical_idom :
                         ctx.program->blocks[pred].linear_idom;
               } while (def_points[i] != pred && /* stop at the dominator-tree root */
                        ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
            }
         }

         /* check live-range intersections */
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand op = phi->operands[i]; /* local copy: op may get fixed to exec below */
            if (op.isUndefined())
               continue;
            /* check if the operand comes from the exec mask of a predecessor */
            if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
               op.setFixed(exec);

            bool interferes = false;
            unsigned idom = is_logical ?
                            ctx.program->blocks[def_points[i]].logical_idom :
                            ctx.program->blocks[def_points[i]].linear_idom;
            /* live-through operands definitely interfere */
            if (op.isTemp() && !op.isKill()) {
               interferes = true;
            /* create copies for constants to ease spilling */
            } else if (op.isConstant()) {
               interferes = true;
            /* create copies for SGPR -> VGPR moves */
            } else if (op.regClass() != phi->definitions[0].regClass()) {
               interferes = true;
            /* operand might interfere with any phi-def */
            } else if (def_points[i] == block.index) {
               interferes = true;
            /* operand might interfere with phi-def */
            } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
               interferes = true;
            /* else check for interferences with other operands */
            } else {
               for (unsigned j = 0; !interferes && j < phi->operands.size(); j++) {
                  /* don't care about other register classes */
                  if (!phi->operands[j].isTemp() || phi->operands[j].regClass() != phi->definitions[0].regClass())
                     continue;
                  /* same operands cannot interfere */
                  if (op.getTemp() == phi->operands[j].getTemp())
                     continue;
                  /* if def_points[i] dominates any other def_point, assume they interfere.
                   * As live-through operands are checked above, only test up the current block. */
                  unsigned other_def_point = def_points[j];
                  while (def_points[i] < other_def_point && other_def_point != block.index)
                     other_def_point = is_logical ?
                                       ctx.program->blocks[other_def_point].logical_idom :
                                       ctx.program->blocks[other_def_point].linear_idom;
                  interferes = def_points[i] == other_def_point;
               }
            }

            if (!interferes)
               continue;

            progress = true;

            /* create new temporary and rename operands: the copy (new_tmp <- op) is
             * recorded for the predecessor block; the phi now reads new_tmp, which
             * dies at the phi, so it never interferes with the phi definition */
            Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
            if (is_logical)
               ctx.logical_phi_info[preds[i]].emplace_back(Definition(new_tmp), op);
            else
               ctx.linear_phi_info[preds[i]].emplace_back(Definition(new_tmp), op);
            phi->operands[i] = Operand(new_tmp);
            phi->operands[i].setKill(true);
            def_points[i] = preds[i];
         }
      }
   }
   return progress;
}
162
163 void insert_parallelcopies(cssa_ctx& ctx)
164 {
165 /* insert the parallelcopies from logical phis before p_logical_end */
166 for (auto&& entry : ctx.logical_phi_info) {
167 Block& block = ctx.program->blocks[entry.first];
168 unsigned idx = block.instructions.size() - 1;
169 while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
170 assert(idx > 0);
171 idx--;
172 }
173
174 Builder bld(ctx.program);
175 bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
176 for (std::pair<Definition, Operand>& pair : entry.second)
177 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
178 }
179
180 /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
181 for (auto&& entry : ctx.linear_phi_info) {
182 Block& block = ctx.program->blocks[entry.first];
183 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
184 --it;
185 assert((*it)->format == Format::PSEUDO_BRANCH);
186
187 Builder bld(ctx.program);
188 bld.reset(&block.instructions, it);
189 for (std::pair<Definition, Operand>& pair : entry.second)
190 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
191 }
192 }
193
194 } /* end namespace */
195
196
197 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
198 {
199 cssa_ctx ctx = {program, live_vars};
200 /* collect information about all interfering phi operands */
201 bool progress = collect_phi_info(ctx);
202
203 if (!progress)
204 return;
205
206 insert_parallelcopies(ctx);
207
208 /* update live variable information */
209 live_vars = live_var_analysis(program, options);
210 }
211 }
212