aco: add a helper for building a trap handler shader
[mesa.git] / src / amd / compiler / aco_live_var_analysis.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 * Authors:
25 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
26 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
27 *
28 */
29
30 #include "aco_ir.h"
31 #include "util/u_math.h"
32
33 #include <set>
34 #include <vector>
35
36 #include "vulkan/radv_shader.h"
37
38 namespace aco {
39 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
40 {
41 RegisterDemand changes;
42 for (const Definition& def : instr->definitions) {
43 if (!def.isTemp() || def.isKill())
44 continue;
45 changes += def.getTemp();
46 }
47
48 for (const Operand& op : instr->operands) {
49 if (!op.isTemp() || !op.isFirstKill())
50 continue;
51 changes -= op.getTemp();
52 }
53
54 return changes;
55 }
56
57 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
58 {
59 RegisterDemand temp_registers;
60
61 for (Definition def : instr->definitions) {
62 if (!def.isTemp())
63 continue;
64 if (def.isKill())
65 temp_registers += def.getTemp();
66 }
67
68 for (Operand op : instr->operands) {
69 if (op.isTemp() && op.isLateKill() && op.isFirstKill())
70 temp_registers += op.getTemp();
71 }
72
73 return temp_registers;
74 }
75
76 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
77 {
78 demand -= get_live_changes(instr);
79 demand -= get_temp_registers(instr);
80 if (instr_before)
81 demand += get_temp_registers(instr_before);
82 return demand;
83 }
84
85 namespace {
/* Recomputes liveness and per-instruction register demand for one block by
 * walking its instructions from back to front, starting from the block's
 * current live-out set. Predecessors whose live-out set grows are queued on
 * \p worklist for (re-)processing — the caller iterates to a fixed point.
 *
 * \param lives        live-out sets and per-block register demand, updated in place
 * \param phi_sgpr_ops per-block count of SGPRs consumed by logical-phi
 *                     operands; these are accounted at the predecessor's
 *                     p_logical_end rather than across the whole block
 */
void process_live_temps_per_block(Program *program, live& lives, Block* block,
                                  std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
{
   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
   RegisterDemand new_demand;

   register_demand.resize(block->instructions.size());
   block->register_demand = RegisterDemand();
   /* working set: temporaries live at the current (backwards) position */
   TempSet live = lives.live_out[block->index];

   /* add the live_out_exec to live */
   bool exec_live = false;
   if (block->live_out_exec != Temp()) {
      live.insert(block->live_out_exec);
      exec_live = true;
   }

   /* initialize register demand */
   for (Temp t : live)
      new_demand += t;
   /* SGPRs feeding successor phis are counted at p_logical_end instead */
   new_demand.sgpr -= phi_sgpr_ops[block->index];

   /* traverse the instructions backwards */
   int idx;
   for (idx = block->instructions.size() -1; idx >= 0; idx--) {
      Instruction *insn = block->instructions[idx].get();
      if (is_phi(insn))
         break; /* phis at the block start are handled separately below */

      /* subtract the 1 or 2 sgprs from exec */
      if (exec_live)
         assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));

      /* KILL: definitions end (when walking backwards) their temp's live range */
      for (Definition& definition : insn->definitions) {
         if (!definition.isTemp()) {
            continue;
         }
         if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
            program->needs_vcc = true;

         const Temp temp = definition.getTemp();
         const size_t n = live.erase(temp);

         if (n) {
            new_demand -= temp;
            definition.setKill(false);
         } else {
            /* result is never read: it only occupies a register during this
             * instruction, so add it to the local demand only */
            register_demand[idx] += temp;
            definition.setKill(true);
         }

         if (definition.isFixed() && definition.physReg() == exec)
            exec_live = false;
      }

      /* GEN: operands begin (when walking backwards) their temp's live range */
      if (insn->opcode == aco_opcode::p_logical_end) {
         new_demand.sgpr += phi_sgpr_ops[block->index];
      } else {
         /* we need to do this in a separate loop because the next one can
          * setKill() for several operands at once and we don't want to
          * overwrite that in a later iteration */
         for (Operand& op : insn->operands)
            op.setKill(false);

         for (unsigned i = 0; i < insn->operands.size(); ++i)
         {
            Operand& operand = insn->operands[i];
            if (!operand.isTemp())
               continue;
            if (operand.isFixed() && operand.physReg() == vcc)
               program->needs_vcc = true;
            const Temp temp = operand.getTemp();
            const bool inserted = live.insert(temp).second;
            if (inserted) {
               /* first use seen walking backwards == last use in program
                * order: this is where the temp dies */
               operand.setFirstKill(true);
               /* further uses of the same temp within this instruction are
                * plain kills, not first-kills */
               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
                  if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
                     insn->operands[j].setFirstKill(false);
                     insn->operands[j].setKill(true);
                  }
               }
               if (operand.isLateKill())
                  register_demand[idx] += temp;
               new_demand += temp;
            }

            if (operand.isFixed() && operand.physReg() == exec)
               exec_live = true;
         }
      }

      block->register_demand.update(register_demand[idx]);
   }

   /* update block's register demand for a last time */
   if (exec_live)
      assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
   new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
   block->register_demand.update(new_demand);

   /* handle phi definitions */
   int phi_idx = idx;
   while (phi_idx >= 0) {
      /* all phis at the block start share the live-in demand */
      register_demand[phi_idx] = new_demand;
      Instruction *insn = block->instructions[phi_idx].get();

      assert(is_phi(insn));
      assert(insn->definitions.size() == 1 && insn->definitions[0].isTemp());
      Definition& definition = insn->definitions[0];
      if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
         program->needs_vcc = true;
      const Temp temp = definition.getTemp();
      const size_t n = live.erase(temp);

      if (n)
         definition.setKill(false);
      else
         definition.setKill(true);

      phi_idx--;
   }

   /* now, we need to merge the live-ins into the live-out sets */
   for (Temp t : live) {
      /* linear temps propagate along linear edges, logical temps along
       * logical edges */
      std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;

#ifndef NDEBUG
      if (preds.empty())
         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
#endif

      for (unsigned pred_idx : preds) {
         auto it = lives.live_out[pred_idx].insert(t);
         if (it.second)
            worklist.insert(pred_idx); /* predecessor's live-out grew: re-process it */
      }
   }

   /* handle phi operands */
   phi_idx = idx;
   while (phi_idx >= 0) {
      Instruction *insn = block->instructions[phi_idx].get();
      assert(is_phi(insn));
      /* directly insert into the predecessors live-out set */
      std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
                                     ? block->logical_preds
                                     : block->linear_preds;
      for (unsigned i = 0; i < preds.size(); ++i) {
         Operand &operand = insn->operands[i];
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && operand.physReg() == vcc)
            program->needs_vcc = true;
         /* check if we changed an already processed block */
         const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
         if (inserted) {
            operand.setKill(true);
            worklist.insert(preds[i]);
            /* track SGPR demand of logical-phi operands for the predecessor */
            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
               phi_sgpr_ops[preds[i]] += operand.size();
         }
      }
      phi_idx--;
   }

   /* the entry block must have no live-in temporaries */
   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}
256
257 unsigned calc_waves_per_workgroup(Program *program)
258 {
259 /* When workgroup size is not known, just go with wave_size */
260 unsigned workgroup_size = program->workgroup_size == UINT_MAX
261 ? program->wave_size
262 : program->workgroup_size;
263
264 return align(workgroup_size, program->wave_size) / program->wave_size;
265 }
266 } /* end namespace */
267
268 uint16_t get_extra_sgprs(Program *program)
269 {
270 if (program->chip_class >= GFX10) {
271 assert(!program->needs_flat_scr);
272 assert(!program->xnack_enabled);
273 return 2;
274 } else if (program->chip_class >= GFX8) {
275 if (program->needs_flat_scr)
276 return 6;
277 else if (program->xnack_enabled)
278 return 4;
279 else if (program->needs_vcc)
280 return 2;
281 else
282 return 0;
283 } else {
284 assert(!program->xnack_enabled);
285 if (program->needs_flat_scr)
286 return 4;
287 else if (program->needs_vcc)
288 return 2;
289 else
290 return 0;
291 }
292 }
293
294 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
295 {
296 assert(addressable_sgprs <= program->sgpr_limit);
297 uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
298 uint16_t granule = program->sgpr_alloc_granule + 1;
299 return align(std::max(sgprs, granule), granule);
300 }
301
302 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
303 {
304 assert(addressable_vgprs <= program->vgpr_limit);
305 uint16_t granule = program->vgpr_alloc_granule + 1;
306 return align(std::max(addressable_vgprs, granule), granule);
307 }
308
309 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
310 {
311 uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
312 sgprs -= get_extra_sgprs(program);
313 return std::min(sgprs, program->sgpr_limit);
314 }
315
316 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
317 {
318 uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule;
319 return std::min(vgprs, program->vgpr_limit);
320 }
321
322 void calc_min_waves(Program* program)
323 {
324 unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
325 /* currently min_waves is in wave64 waves */
326 if (program->wave_size == 32)
327 waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);
328
329 unsigned simd_per_cu = 4; /* TODO: different on Navi */
330 bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
331 unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
332
333 program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
334 }
335
/* Derives program->num_waves, program->max_waves and program->max_reg_demand
 * from the given peak register demand, taking the per-generation wave limits,
 * workgroup size and LDS usage into account. A demand exceeding the hardware
 * limits results in num_waves == 0 (the program cannot be compiled as-is). */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
   unsigned max_waves_per_simd = 10;
   if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) || program->chip_class >= GFX10_3)
      max_waves_per_simd = 8;
   unsigned simd_per_cu = 4;

   bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
   unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
   /* in WGP mode, twice the per-CU LDS budget is available */
   unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;

   /* this won't compile, register pressure reduction necessary */
   if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      /* waves per SIMD: limited first by SGPR allocation, then by VGPRs */
      program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      program->num_waves = std::min<uint16_t>(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr));
      program->max_waves = max_waves_per_simd;

      /* adjust max_waves for workgroup and LDS limits */
      unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
      if (program->config->lds_size) {
         /* each concurrent workgroup needs its own LDS allocation */
         unsigned lds = program->config->lds_size * program->lds_alloc_granule;
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
      }
      if (waves_per_workgroup > 1 && program->chip_class < GFX10)
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */

      /* in cases like waves_per_workgroup=3 or lds=65536 and
       * waves_per_workgroup=1, we want the maximum possible number of waves per
       * SIMD and not the minimum. so DIV_ROUND_UP is used */
      program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));

      /* incorporate max_waves and calculate max_reg_demand */
      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}
378
379 live live_var_analysis(Program* program,
380 const struct radv_nir_compiler_options *options)
381 {
382 live result;
383 result.live_out.resize(program->blocks.size());
384 result.register_demand.resize(program->blocks.size());
385 std::set<unsigned> worklist;
386 std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
387 RegisterDemand new_demand;
388
389 program->needs_vcc = false;
390
391 /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
392 for (Block& block : program->blocks)
393 worklist.insert(block.index);
394 while (!worklist.empty()) {
395 std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
396 unsigned block_idx = *b_it;
397 worklist.erase(block_idx);
398 process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
399 new_demand.update(program->blocks[block_idx].register_demand);
400 }
401
402 /* calculate the program's register demand and number of waves */
403 update_vgpr_sgpr_demand(program, new_demand);
404
405 return result;
406 }
407
408 }
409