aco: rematerialize s_movk instructions
[mesa.git] / src / amd / compiler / aco_spill.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include "aco_ir.h"
27 #include "aco_builder.h"
28 #include "sid.h"
29
30 #include <map>
31 #include <stack>
32
33 /*
34 * Implements the spilling algorithm on SSA-form from
35 * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
36 * by Matthias Braun and Sebastian Hack.
37 */
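/*
 * Rough structure of this pass (as implemented below): next-use distances are
 * computed globally per block, rematerializable definitions are collected, then
 * each block decides which live-in variables stay spilled, coupling code
 * (spills/reloads/phis) is inserted on incoming edges, and finally spill ids are
 * packed into slots and the p_spill/p_reload pseudo instructions are lowered.
 */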
38
39 namespace aco {
40
41 namespace {
42
43 struct remat_info {
44 Instruction *instr;
45 };
46
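/* Per-program spilling state: per-block spill sets at entry/exit, rename maps for
 * reloaded values, next-use distances, and the interference/affinity information
 * between spill ids that is later used to assign spill slots. */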
47 struct spill_ctx {
48 RegisterDemand target_pressure;
49 Program* program;
50 std::vector<std::vector<RegisterDemand>> register_demand;
51 std::vector<std::map<Temp, Temp>> renames;
52 std::vector<std::map<Temp, uint32_t>> spills_entry;
53 std::vector<std::map<Temp, uint32_t>> spills_exit;
54 std::vector<bool> processed;
55 std::stack<Block*> loop_header;
56 std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start;
57 std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end;
58 std::vector<std::pair<RegClass, std::set<uint32_t>>> interferences;
59 std::vector<std::vector<uint32_t>> affinities;
60 std::vector<bool> is_reloaded;
61 std::map<Temp, remat_info> remat;
62 std::map<Instruction *, bool> remat_used;
63
64 spill_ctx(const RegisterDemand target_pressure, Program* program,
65 std::vector<std::vector<RegisterDemand>> register_demand)
66 : target_pressure(target_pressure), program(program),
67 register_demand(register_demand), renames(program->blocks.size()),
68 spills_entry(program->blocks.size()), spills_exit(program->blocks.size()),
69 processed(program->blocks.size(), false) {}
70
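/* Affinities group spill ids that should end up in the same spill slot (e.g. a
 * spilled phi definition and its spilled operands). Groups are kept disjoint:
 * adding a pair either extends an existing group or merges two of them. */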
71 void add_affinity(uint32_t first, uint32_t second)
72 {
73 unsigned found_first = affinities.size();
74 unsigned found_second = affinities.size();
75 for (unsigned i = 0; i < affinities.size(); i++) {
76 std::vector<uint32_t>& vec = affinities[i];
77 for (uint32_t entry : vec) {
78 if (entry == first)
79 found_first = i;
80 else if (entry == second)
81 found_second = i;
82 }
83 }
84 if (found_first == affinities.size() && found_second == affinities.size()) {
85 affinities.emplace_back(std::vector<uint32_t>({first, second}));
86 } else if (found_first < affinities.size() && found_second == affinities.size()) {
87 affinities[found_first].push_back(second);
88 } else if (found_second < affinities.size() && found_first == affinities.size()) {
89 affinities[found_second].push_back(first);
90 } else if (found_first != found_second) {
91 /* merge second into first */
92 affinities[found_first].insert(affinities[found_first].end(), affinities[found_second].begin(), affinities[found_second].end());
93 affinities.erase(std::next(affinities.begin(), found_second));
94 } else {
95 assert(found_first == found_second);
96 }
97 }
98
99 uint32_t allocate_spill_id(RegClass rc)
100 {
101 interferences.emplace_back(rc, std::set<uint32_t>());
102 is_reloaded.push_back(false);
103 return next_spill_id++;
104 }
105
106 uint32_t next_spill_id = 0;
107 };
108
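/* Returns the common (linear or logical) dominator of two blocks by walking up
 * the respective immediate-dominator chain. -1 acts as "no block yet". */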
109 int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear)
110 {
111
112 if (idx_a == -1)
113 return idx_b;
114 if (idx_b == -1)
115 return idx_a;
116 if (is_linear) {
117 while (idx_a != idx_b) {
118 if (idx_a > idx_b)
119 idx_a = program->blocks[idx_a].linear_idom;
120 else
121 idx_b = program->blocks[idx_b].linear_idom;
122 }
123 } else {
124 while (idx_a != idx_b) {
125 if (idx_a > idx_b)
126 idx_a = program->blocks[idx_a].logical_idom;
127 else
128 idx_b = program->blocks[idx_b].logical_idom;
129 }
130 }
131 assert(idx_a != -1);
132 return idx_a;
133 }
134
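/* Updates the next-use distances at the start and end of one block from the
 * distances at its end, walking the instructions backwards. Phi operands count
 * as uses at the end of the respective predecessor; predecessors whose end
 * state changed are added to the worklist so the fixpoint iteration continues. */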
135 void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist)
136 {
137 Block* block = &ctx.program->blocks[block_idx];
138 std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx];
139
140 /* to compute the next use distance at the beginning of the block, we have to add the block's size */
141 for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin(); it != next_uses.end(); ++it)
142 it->second.second = it->second.second + block->instructions.size();
143
144 int idx = block->instructions.size() - 1;
145 while (idx >= 0) {
146 aco_ptr<Instruction>& instr = block->instructions[idx];
147
148 if (instr->opcode == aco_opcode::p_linear_phi ||
149 instr->opcode == aco_opcode::p_phi)
150 break;
151
152 for (const Definition& def : instr->definitions) {
153 if (def.isTemp())
154 next_uses.erase(def.getTemp());
155 }
156
157 for (const Operand& op : instr->operands) {
158 /* omit exec mask */
159 if (op.isFixed() && op.physReg() == exec)
160 continue;
161 if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
162 continue;
163 if (op.isTemp())
164 next_uses[op.getTemp()] = {block_idx, idx};
165 }
166 idx--;
167 }
168
169 assert(block_idx != 0 || next_uses.empty());
170 ctx.next_use_distances_start[block_idx] = next_uses;
171 while (idx >= 0) {
172 aco_ptr<Instruction>& instr = block->instructions[idx];
173 assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi);
174
175 for (unsigned i = 0; i < instr->operands.size(); i++) {
176 unsigned pred_idx = instr->opcode == aco_opcode::p_phi ?
177 block->logical_preds[i] :
178 block->linear_preds[i];
179 if (instr->operands[i].isTemp()) {
180 if (instr->operands[i].getTemp() == ctx.program->blocks[pred_idx].live_out_exec)
181 continue;
182 if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() ||
183 ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != std::pair<uint32_t, uint32_t>{block_idx, 0})
184 worklist.insert(pred_idx);
185 ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = {block_idx, 0};
186 }
187 }
188 next_uses.erase(instr->definitions[0].getTemp());
189 idx--;
190 }
191
192 /* all remaining live vars must be live-out at the predecessors */
193 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) {
194 Temp temp = pair.first;
195 uint32_t distance = pair.second.second;
196 uint32_t dom = pair.second.first;
197 std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds;
198 for (unsigned pred_idx : preds) {
199 if (temp == ctx.program->blocks[pred_idx].live_out_exec)
200 continue;
201 if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth)
202 distance += 0xFFFF;
203 if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) {
204 dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear());
205 distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance);
206 }
207 if (ctx.next_use_distances_end[pred_idx][temp] != std::pair<uint32_t, uint32_t>{dom, distance})
208 worklist.insert(pred_idx);
209 ctx.next_use_distances_end[pred_idx][temp] = {dom, distance};
210 }
211 }
212
213 }
214
215 void compute_global_next_uses(spill_ctx& ctx, std::vector<std::set<Temp>>& live_out)
216 {
217 ctx.next_use_distances_start.resize(ctx.program->blocks.size());
218 ctx.next_use_distances_end.resize(ctx.program->blocks.size());
219 std::set<uint32_t> worklist;
220 for (Block& block : ctx.program->blocks)
221 worklist.insert(block.index);
222
223 while (!worklist.empty()) {
224 std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
225 unsigned block_idx = *b_it;
226 worklist.erase(block_idx);
227 next_uses_per_block(ctx, block_idx, worklist);
228 }
229 }
230
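/* An instruction is considered rematerializable if it is cheap to re-emit at the
 * reload point: a single-definition VOP1/SOP1/s_movk_i32/p_create_vector whose
 * operands are all constants. */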
231 bool should_rematerialize(aco_ptr<Instruction>& instr)
232 {
233 /* TODO: rematerialization is only supported for VOP1, SOP1, SOPK and PSEUDO */
234 if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK)
235 return false;
236 /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */
237 if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector)
238 return false;
239 if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32)
240 return false;
241
242 for (const Operand& op : instr->operands) {
243 /* TODO: rematerialization using temporaries isn't yet supported */
244 if (op.isTemp())
245 return false;
246 }
247
248 /* TODO: rematerialization with multiple definitions isn't yet supported */
249 if (instr->definitions.size() > 1)
250 return false;
251
252 return true;
253 }
254
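/* Either re-emits the rematerializable instruction with a new definition or, if
 * the value cannot be rematerialized, creates a p_reload referencing the spill id. */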
255 aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
256 {
257 std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);
258 if (remat != ctx.remat.end()) {
259 Instruction *instr = remat->second.instr;
260 assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported");
261 assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported");
262 assert(instr->definitions.size() == 1 && "unsupported");
263
264 aco_ptr<Instruction> res;
265 if (instr->format == Format::VOP1) {
266 res.reset(create_instruction<VOP1_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
267 } else if (instr->format == Format::SOP1) {
268 res.reset(create_instruction<SOP1_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
269 } else if (instr->format == Format::PSEUDO) {
270 res.reset(create_instruction<Pseudo_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
271 } else if (instr->format == Format::SOPK) {
272 res.reset(create_instruction<SOPK_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));
273 static_cast<SOPK_instruction*>(res.get())->imm = static_cast<SOPK_instruction*>(instr)->imm;
274 }
275 for (unsigned i = 0; i < instr->operands.size(); i++) {
276 res->operands[i] = instr->operands[i];
277 if (instr->operands[i].isTemp()) {
278 assert(false && "unsupported");
279 if (ctx.remat.count(instr->operands[i].getTemp()))
280 ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true;
281 }
282 }
283 res->definitions[0] = Definition(new_name);
284 return res;
285 } else {
286 aco_ptr<Pseudo_instruction> reload{create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
287 reload->operands[0] = Operand(spill_id);
288 reload->definitions[0] = Definition(new_name);
289 ctx.is_reloaded[spill_id] = true;
290 return reload;
291 }
292 }
293
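/* Collects all rematerializable definitions inside logical sections so that
 * do_reload() can re-emit them instead of reloading from a spill slot. */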
294 void get_rematerialize_info(spill_ctx& ctx)
295 {
296 for (Block& block : ctx.program->blocks) {
297 bool logical = false;
298 for (aco_ptr<Instruction>& instr : block.instructions) {
299 if (instr->opcode == aco_opcode::p_logical_start)
300 logical = true;
301 else if (instr->opcode == aco_opcode::p_logical_end)
302 logical = false;
303 if (logical && should_rematerialize(instr)) {
304 for (const Definition& def : instr->definitions) {
305 if (def.isTemp()) {
306 ctx.remat[def.getTemp()] = (remat_info){instr.get()};
307 ctx.remat_used[instr.get()] = false;
308 }
309 }
310 }
311 }
312 }
313 }
314
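/* Computes, per instruction of the block, the index of the next use of each live
 * temporary. Used to pick spill candidates while processing a block whose
 * register demand exceeds the target. */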
315 std::vector<std::map<Temp, uint32_t>> local_next_uses(spill_ctx& ctx, Block* block)
316 {
317 std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size());
318
319 std::map<Temp, uint32_t> next_uses;
320 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block->index])
321 next_uses[pair.first] = pair.second.second + block->instructions.size();
322
323 for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
324 aco_ptr<Instruction>& instr = block->instructions[idx];
325 if (!instr)
326 break;
327 if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi)
328 break;
329
330 for (const Operand& op : instr->operands) {
331 if (op.isFixed() && op.physReg() == exec)
332 continue;
333 if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())
334 continue;
335 if (op.isTemp())
336 next_uses[op.getTemp()] = idx;
337 }
338 for (const Definition& def : instr->definitions) {
339 if (def.isTemp())
340 next_uses.erase(def.getTemp());
341 }
342 local_next_uses[idx] = next_uses;
343 }
344 return local_next_uses;
345 }
346
347
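/* Decides which live-in variables are spilled at the beginning of the block and
 * returns the register demand this frees up. Loop headers prefer variables that
 * live through the whole loop, branch blocks keep the predecessor's spills, and
 * merge blocks keep variables that are spilled on all (or, if rematerializable,
 * on any) incoming path. */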
348 RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
349 {
350 RegisterDemand spilled_registers;
351
352 /* first block, nothing was spilled before */
353 if (block_idx == 0)
354 return {0, 0};
355
356 /* loop header block */
357 if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) {
358 assert(block->linear_preds[0] == block_idx - 1);
359 assert(block->logical_preds[0] == block_idx - 1);
360
361 /* create new loop_info */
362 ctx.loop_header.emplace(block);
363
364 /* check how many live-through variables should be spilled */
365 RegisterDemand new_demand;
366 unsigned i = block_idx;
367 while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {
368 assert(ctx.program->blocks.size() > i);
369 new_demand.update(ctx.program->blocks[i].register_demand);
370 i++;
371 }
372 unsigned loop_end = i;
373
374 /* select live-through vgpr variables */
375 while (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) {
376 unsigned distance = 0;
377 Temp to_spill;
378 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) {
379 if (pair.first.type() == RegType::vgpr &&
380 pair.second.first >= loop_end &&
381 pair.second.second > distance &&
382 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
383 to_spill = pair.first;
384 distance = pair.second.second;
385 }
386 }
387 if (distance == 0)
388 break;
389
390 uint32_t spill_id;
391 if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) {
392 spill_id = ctx.allocate_spill_id(to_spill.regClass());
393 } else {
394 spill_id = ctx.spills_exit[block_idx - 1][to_spill];
395 }
396
397 ctx.spills_entry[block_idx][to_spill] = spill_id;
398 spilled_registers.vgpr += to_spill.size();
399 }
400
401 /* select live-through sgpr variables */
402 while (new_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
403 unsigned distance = 0;
404 Temp to_spill;
405 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) {
406 if (pair.first.type() == RegType::sgpr &&
407 pair.second.first >= loop_end &&
408 pair.second.second > distance &&
409 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
410 to_spill = pair.first;
411 distance = pair.second.second;
412 }
413 }
414 if (distance == 0)
415 break;
416
417 uint32_t spill_id;
418 if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) {
419 spill_id = ctx.allocate_spill_id(to_spill.regClass());
420 } else {
421 spill_id = ctx.spills_exit[block_idx - 1][to_spill];
422 }
423
424 ctx.spills_entry[block_idx][to_spill] = spill_id;
425 spilled_registers.sgpr += to_spill.size();
426 }
427
428
429
430 /* shortcut */
431 if (!RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure))
432 return spilled_registers;
433
434 /* if reg pressure is too high at beginning of loop, add variables with furthest use */
435 unsigned idx = 0;
436 while (block->instructions[idx]->opcode == aco_opcode::p_phi || block->instructions[idx]->opcode == aco_opcode::p_linear_phi)
437 idx++;
438
439 assert(idx != 0 && "loop without phis: TODO");
440 idx--;
441 RegisterDemand reg_pressure = ctx.register_demand[block_idx][idx] - spilled_registers;
442 while (reg_pressure.sgpr > ctx.target_pressure.sgpr) {
443 unsigned distance = 0;
444 Temp to_spill;
445 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
446 if (pair.first.type() == RegType::sgpr &&
447 pair.second.second > distance &&
448 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
449 to_spill = pair.first;
450 distance = pair.second.second;
451 }
452 }
453 assert(distance != 0);
454
455 ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
456 spilled_registers.sgpr += to_spill.size();
457 reg_pressure.sgpr -= to_spill.size();
458 }
459 while (reg_pressure.vgpr > ctx.target_pressure.vgpr) {
460 unsigned distance = 0;
461 Temp to_spill;
462 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
463 if (pair.first.type() == RegType::vgpr &&
464 pair.second.second > distance &&
465 ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {
466 to_spill = pair.first;
467 distance = pair.second.second;
468 }
469 }
470 assert(distance != 0);
471 ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
472 spilled_registers.vgpr += to_spill.size();
473 reg_pressure.vgpr -= to_spill.size();
474 }
475
476 return spilled_registers;
477 }
478
479 /* branch block */
480 if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
481 /* keep variables spilled if they are alive and not used in the current block */
482 unsigned pred_idx = block->linear_preds[0];
483 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
484 if (pair.first.type() == RegType::sgpr &&
485 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
486 ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) {
487 ctx.spills_entry[block_idx].insert(pair);
488 spilled_registers.sgpr += pair.first.size();
489 }
490 }
491 if (block->logical_preds.size() == 1) {
492 pred_idx = block->logical_preds[0];
493 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
494 if (pair.first.type() == RegType::vgpr &&
495 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
496 ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) {
497 ctx.spills_entry[block_idx].insert(pair);
498 spilled_registers.vgpr += pair.first.size();
499 }
500 }
501 }
502
503 /* if register demand is still too high, we just keep all spilled live vars and process the block */
504 if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {
505 pred_idx = block->linear_preds[0];
506 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
507 if (pair.first.type() == RegType::sgpr &&
508 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
509 ctx.spills_entry[block_idx].insert(pair).second) {
510 spilled_registers.sgpr += pair.first.size();
511 }
512 }
513 }
514 if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) {
515 pred_idx = block->logical_preds[0];
516 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
517 if (pair.first.type() == RegType::vgpr &&
518 ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() &&
519 ctx.spills_entry[block_idx].insert(pair).second) {
520 spilled_registers.vgpr += pair.first.size();
521 }
522 }
523 }
524
525 return spilled_registers;
526 }
527
528 /* else: merge block */
529 std::set<Temp> partial_spills;
530
531 /* keep variables spilled on all incoming paths */
532 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
533 std::vector<unsigned>& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
534 /* If the variable can be rematerialized, keep it spilled at the block entry whenever at least one predecessor leaves it spilled.
535 * Otherwise, if any predecessor reloads it, ensure it is reloaded on all other predecessors as well.
536 * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. */
537 /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */
538 bool remat = ctx.remat.count(pair.first);
539 bool spill = !remat;
540 uint32_t spill_id = 0;
541 for (unsigned pred_idx : preds) {
542 /* variable is not even live at the predecessor: probably from a phi */
543 if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) {
544 spill = false;
545 break;
546 }
547 if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) {
548 if (!remat)
549 spill = false;
550 } else {
551 partial_spills.insert(pair.first);
552 /* it might be that on one incoming path, the variable has a different spill_id, but add_coupling_code() will take care of that. */
553 spill_id = ctx.spills_exit[pred_idx][pair.first];
554 if (remat)
555 spill = true;
556 }
557 }
558 if (spill) {
559 ctx.spills_entry[block_idx][pair.first] = spill_id;
560 partial_spills.erase(pair.first);
561 spilled_registers += pair.first;
562 }
563 }
564
565 /* same for phis */
566 unsigned idx = 0;
567 while (block->instructions[idx]->opcode == aco_opcode::p_linear_phi ||
568 block->instructions[idx]->opcode == aco_opcode::p_phi) {
569 aco_ptr<Instruction>& phi = block->instructions[idx];
570 std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
571 bool spill = true;
572
573 for (unsigned i = 0; i < phi->operands.size(); i++) {
574 if (phi->operands[i].isUndefined())
575 continue;
576 assert(phi->operands[i].isTemp());
577 if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
578 spill = false;
579 else
580 partial_spills.insert(phi->definitions[0].getTemp());
581 }
582 if (spill) {
583 ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass());
584 partial_spills.erase(phi->definitions[0].getTemp());
585 spilled_registers += phi->definitions[0].getTemp();
586 }
587
588 idx++;
589 }
590
591 /* if reg pressure at first instruction is still too high, add partially spilled variables */
592 RegisterDemand reg_pressure;
593 if (idx == 0) {
594 for (const Definition& def : block->instructions[idx]->definitions) {
595 if (def.isTemp()) {
596 reg_pressure -= def.getTemp();
597 }
598 }
599 for (const Operand& op : block->instructions[idx]->operands) {
600 if (op.isTemp() && op.isFirstKill()) {
601 reg_pressure += op.getTemp();
602 }
603 }
604 } else {
605 idx--;
606 }
607 reg_pressure += ctx.register_demand[block_idx][idx] - spilled_registers;
608
609 while (reg_pressure.sgpr > ctx.target_pressure.sgpr) {
610 assert(!partial_spills.empty());
611
612 std::set<Temp>::iterator it = partial_spills.begin();
613 Temp to_spill = *it;
614 unsigned distance = ctx.next_use_distances_start[block_idx][*it].second;
615 while (it != partial_spills.end()) {
616 assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());
617
618 if (it->type() == RegType::sgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) {
619 distance = ctx.next_use_distances_start[block_idx][*it].second;
620 to_spill = *it;
621 }
622 ++it;
623 }
624 assert(distance != 0);
625
626 ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
627 partial_spills.erase(to_spill);
628 spilled_registers.sgpr += to_spill.size();
629 reg_pressure.sgpr -= to_spill.size();
630 }
631
632 while (reg_pressure.vgpr > ctx.target_pressure.vgpr) {
633 assert(!partial_spills.empty());
634
635 std::set<Temp>::iterator it = partial_spills.begin();
636 Temp to_spill = *it;
637 unsigned distance = ctx.next_use_distances_start[block_idx][*it].second;
638 while (it != partial_spills.end()) {
639 assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());
640
641 if (it->type() == RegType::vgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) {
642 distance = ctx.next_use_distances_start[block_idx][*it].second;
643 to_spill = *it;
644 }
645 ++it;
646 }
647 assert(distance != 0);
648
649 ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());
650 partial_spills.erase(to_spill);
651 spilled_registers.vgpr += to_spill.size();
652 reg_pressure.vgpr -= to_spill.size();
653 }
654
655 return spilled_registers;
656 }
657
658
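/* Inserts spill and reload code on the incoming control flow edges so that the
 * predecessors' exit state matches this block's entry spill set, lowers spilled
 * phis into spills of their operands, creates new phis where a value was renamed
 * differently per predecessor, and records interferences/affinities between
 * spill ids. */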
659 void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
660 {
661 /* no coupling code necessary */
662 if (block->linear_preds.size() == 0)
663 return;
664
665 std::vector<aco_ptr<Instruction>> instructions;
666 /* branch block: TODO take other branch into consideration */
667 if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {
668 assert(ctx.processed[block->linear_preds[0]]);
669 assert(ctx.register_demand[block_idx].size() == block->instructions.size());
670 std::vector<RegisterDemand> reg_demand;
671 unsigned insert_idx = 0;
672 unsigned pred_idx = block->linear_preds[0];
673
674 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) {
675 if (!live.first.is_linear())
676 continue;
677 /* still spilled */
678 if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
679 continue;
680
681 /* in register at end of predecessor */
682 if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
683 std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
684 if (it != ctx.renames[pred_idx].end())
685 ctx.renames[block_idx].insert(*it);
686 continue;
687 }
688
689 /* variable is spilled at predecessor and live at current block: create reload instruction */
690 Temp new_name = {ctx.program->allocateId(), live.first.regClass()};
691 aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
692 instructions.emplace_back(std::move(reload));
693 reg_demand.push_back(RegisterDemand());
694 ctx.renames[block_idx][live.first] = new_name;
695 }
696
697 if (block->logical_preds.size() == 1) {
698 do {
699 assert(insert_idx < block->instructions.size());
700 instructions.emplace_back(std::move(block->instructions[insert_idx]));
701 reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]);
702 insert_idx++;
703 } while (instructions.back()->opcode != aco_opcode::p_logical_start);
704
705 unsigned pred_idx = block->logical_preds[0];
706 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) {
707 if (live.first.is_linear())
708 continue;
709 /* still spilled */
710 if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
711 continue;
712
713 /* in register at end of predecessor */
714 if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
715 std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
716 if (it != ctx.renames[pred_idx].end())
717 ctx.renames[block_idx].insert(*it);
718 continue;
719 }
720
721 /* variable is spilled at predecessor and live at current block: create reload instruction */
722 Temp new_name = {ctx.program->allocateId(), live.first.regClass()};
723 aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
724 instructions.emplace_back(std::move(reload));
725 reg_demand.emplace_back(reg_demand.back());
726 ctx.renames[block_idx][live.first] = new_name;
727 }
728 }
729
730 /* combine new reload instructions with original block */
731 if (!instructions.empty()) {
732 reg_demand.insert(reg_demand.end(), std::next(ctx.register_demand[block->index].begin(), insert_idx),
733 ctx.register_demand[block->index].end());
734 ctx.register_demand[block_idx] = std::move(reg_demand);
735 instructions.insert(instructions.end(),
736 std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(std::next(block->instructions.begin(), insert_idx)),
737 std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
738 block->instructions = std::move(instructions);
739 }
740 return;
741 }
742
743 /* loop header and merge blocks: check if all (linear) predecessors have been processed */
744 for (ASSERTED unsigned pred : block->linear_preds)
745 assert(ctx.processed[pred]);
746
747 /* iterate the phi nodes: if a phi definition is spilled at block entry, spill its operands at the predecessors */
748 for (aco_ptr<Instruction>& phi : block->instructions) {
749 if (phi->opcode != aco_opcode::p_phi &&
750 phi->opcode != aco_opcode::p_linear_phi)
751 break;
752
753 /* if the phi is not spilled, add to instructions */
754 if (ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) {
755 instructions.emplace_back(std::move(phi));
756 continue;
757 }
758
759 std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
760 uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
761
762 for (unsigned i = 0; i < phi->operands.size(); i++) {
763 if (phi->operands[i].isUndefined())
764 continue;
765
766 unsigned pred_idx = preds[i];
767 assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
768 Temp var = phi->operands[i].getTemp();
769
770 /* build interferences between the phi def and all spilled variables at the predecessor blocks */
771 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
772 if (var == pair.first)
773 continue;
774 ctx.interferences[def_spill_id].second.emplace(pair.second);
775 ctx.interferences[pair.second].second.emplace(def_spill_id);
776 }
777
778 /* check if variable is already spilled at predecessor */
779 std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
780 if (spilled != ctx.spills_exit[pred_idx].end()) {
781 if (spilled->second != def_spill_id)
782 ctx.add_affinity(def_spill_id, spilled->second);
783 continue;
784 }
785
786 /* rename if necessary */
787 std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
788 if (rename_it != ctx.renames[pred_idx].end()) {
789 var = rename_it->second;
790 ctx.renames[pred_idx].erase(rename_it);
791 }
792
793 uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
794 ctx.add_affinity(def_spill_id, spill_id);
795 aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
796 spill->operands[0] = Operand(var);
797 spill->operands[1] = Operand(spill_id);
798 Block& pred = ctx.program->blocks[pred_idx];
799 unsigned idx = pred.instructions.size();
800 do {
801 assert(idx != 0);
802 idx--;
803 } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
804 std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
805 pred.instructions.insert(it, std::move(spill));
806 ctx.spills_exit[pred_idx][phi->operands[i].getTemp()] = spill_id;
807 }
808
809 /* remove phi from instructions */
810 phi.reset();
811 }
812
813 /* iterate all other variables spilled at block entry and spill them at the predecessors where necessary */
814 // TODO: would be better to have them sorted: first vgprs and first with longest distance
815 for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {
816 std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
817
818 for (unsigned pred_idx : preds) {
819 /* variable is already spilled at predecessor */
820 std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);
821 if (spilled != ctx.spills_exit[pred_idx].end()) {
822 if (spilled->second != pair.second)
823 ctx.add_affinity(pair.second, spilled->second);
824 continue;
825 }
826
827 /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
828 if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
829 continue;
830
831 /* add interferences between the spilled variable and the predecessor's exit spills */
832 for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) {
833 if (exit_spill.first == pair.first)
834 continue;
835 ctx.interferences[exit_spill.second].second.emplace(pair.second);
836 ctx.interferences[pair.second].second.emplace(exit_spill.second);
837 }
838
839 /* variable is in register at predecessor and has to be spilled */
840 /* rename if necessary */
841 Temp var = pair.first;
842 std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
843 if (rename_it != ctx.renames[pred_idx].end()) {
844 var = rename_it->second;
845 ctx.renames[pred_idx].erase(rename_it);
846 }
847
848 aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
849 spill->operands[0] = Operand(var);
850 spill->operands[1] = Operand(pair.second);
851 Block& pred = ctx.program->blocks[pred_idx];
852 unsigned idx = pred.instructions.size();
853 do {
854 assert(idx != 0);
855 idx--;
856 } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
857 std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
858 pred.instructions.insert(it, std::move(spill));
859 ctx.spills_exit[pred.index][pair.first] = pair.second;
860 }
861 }
862
863 /* iterate the remaining (non-spilled) phis and reload/rename their operands */
864 for (aco_ptr<Instruction>& phi : instructions) {
865 assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);
866 assert(ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end());
867
868 std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
869 for (unsigned i = 0; i < phi->operands.size(); i++) {
870 if (!phi->operands[i].isTemp())
871 continue;
872 unsigned pred_idx = preds[i];
873
874 /* rename operand */
875 if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) {
876 std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp());
877 if (it != ctx.renames[pred_idx].end())
878 phi->operands[i].setTemp(it->second);
879 continue;
880 }
881
882 Temp tmp = phi->operands[i].getTemp();
883
884 /* reload phi operand at end of predecessor block */
885 Temp new_name = {ctx.program->allocateId(), tmp.regClass()};
886 Block& pred = ctx.program->blocks[pred_idx];
887 unsigned idx = pred.instructions.size();
888 do {
889 assert(idx != 0);
890 idx--;
891 } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
892 std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
893
894 aco_ptr<Instruction> reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);
895 pred.instructions.insert(it, std::move(reload));
896
897 ctx.spills_exit[pred_idx].erase(tmp);
898 ctx.renames[pred_idx][tmp] = new_name;
899 phi->operands[i].setTemp(new_name);
900 }
901 }
902
903 /* iterate live-in variables that are in registers here and reload them at predecessors where they are spilled */
904 // TODO: reload at current block if variable is spilled on all predecessors
905 for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) {
906 /* skip spilled variables */
907 if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
908 continue;
909 std::vector<unsigned> preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds;
910
911 /* variable is dead at predecessor, it must be from a phi */
912 bool is_dead = false;
913 for (unsigned pred_idx : preds) {
914 if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
915 is_dead = true;
916 }
917 if (is_dead)
918 continue;
919 for (unsigned pred_idx : preds) {
920 /* the variable is not spilled at the predecessor */
921 if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())
922 continue;
923
924 /* variable is spilled at predecessor and has to be reloaded */
925 Temp new_name = {ctx.program->allocateId(), pair.first.regClass()};
926 Block& pred = ctx.program->blocks[pred_idx];
927 unsigned idx = pred.instructions.size();
928 do {
929 assert(idx != 0);
930 idx--;
931 } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
932 std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
933
934 aco_ptr<Instruction> reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]);
935 pred.instructions.insert(it, std::move(reload));
936
937 ctx.spills_exit[pred.index].erase(pair.first);
938 ctx.renames[pred.index][pair.first] = new_name;
939 }
940
941 /* check if we have to create a new phi for this variable */
942 Temp rename = Temp();
943 bool is_same = true;
944 for (unsigned pred_idx : preds) {
945 if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {
946 if (rename == Temp())
947 rename = pair.first;
948 else
949 is_same = rename == pair.first;
950 } else {
951 if (rename == Temp())
952 rename = ctx.renames[pred_idx][pair.first];
953 else
954 is_same = rename == ctx.renames[pred_idx][pair.first];
955 }
956
957 if (!is_same)
958 break;
959 }
960
961 if (!is_same) {
962 /* the variable was renamed differently in the predecessors: we have to create a phi */
963 aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
964 aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
965 rename = {ctx.program->allocateId(), pair.first.regClass()};
966 for (unsigned i = 0; i < phi->operands.size(); i++) {
967 Temp tmp;
968 if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end())
969 tmp = ctx.renames[preds[i]][pair.first];
970 else if (preds[i] >= block_idx)
971 tmp = rename;
972 else
973 tmp = pair.first;
974 phi->operands[i] = Operand(tmp);
975 }
976 phi->definitions[0] = Definition(rename);
977 instructions.emplace_back(std::move(phi));
978 }
979
980 /* the variable was renamed: add new name to renames */
981 if (!(rename == Temp() || rename == pair.first))
982 ctx.renames[block_idx][pair.first] = rename;
983 }
984
985 /* combine phis with instructions */
986 unsigned idx = 0;
987 while (!block->instructions[idx]) {
988 idx++;
989 }
990
991 ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx);
992 ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), RegisterDemand());
993
994 std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx);
995 instructions.insert(instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start),
996 std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
997 block->instructions = std::move(instructions);
998 }
999
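/* Processes the instructions of one block: operands of spilled variables are
 * reloaded (and renamed) right before their use, and while the register demand
 * still exceeds the target, the variable with the furthest next use is spilled,
 * preferring rematerializable ones. */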
1000 void process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
1001 std::map<Temp, uint32_t> &current_spills, RegisterDemand spilled_registers)
1002 {
1003 std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
1004 std::vector<aco_ptr<Instruction>> instructions;
1005 unsigned idx = 0;
1006
1007 /* phis are handled separately */
1008 while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
1009 block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
1010 aco_ptr<Instruction>& instr = block->instructions[idx];
1011 for (const Operand& op : instr->operands) {
1012 /* prevent its defining instruction from being DCE'd if it could be rematerialized */
1013 if (op.isTemp() && ctx.remat.count(op.getTemp()))
1014 ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
1015 }
1016 instructions.emplace_back(std::move(instr));
1017 idx++;
1018 }
1019
1020 if (block->register_demand.exceeds(ctx.target_pressure))
1021 local_next_use_distance = local_next_uses(ctx, block);
1022
1023 while (idx < block->instructions.size()) {
1024 aco_ptr<Instruction>& instr = block->instructions[idx];
1025
1026 std::map<Temp, std::pair<Temp, uint32_t>> reloads;
1027 std::map<Temp, uint32_t> spills;
1028 /* rename and reload operands */
1029 for (Operand& op : instr->operands) {
1030 if (!op.isTemp())
1031 continue;
1032 if (current_spills.find(op.getTemp()) == current_spills.end()) {
1033 /* the Operand is in register: check if it was renamed */
1034 if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
1035 op.setTemp(ctx.renames[block_idx][op.getTemp()]);
1036 /* prevent its defining instruction from being DCE'd if it could be rematerialized */
1037 if (ctx.remat.count(op.getTemp()))
1038 ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
1039 continue;
1040 }
1041 /* the Operand is spilled: add it to reloads */
1042 Temp new_tmp = {ctx.program->allocateId(), op.regClass()};
1043 ctx.renames[block_idx][op.getTemp()] = new_tmp;
1044 reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
1045 current_spills.erase(op.getTemp());
1046 op.setTemp(new_tmp);
1047 spilled_registers -= new_tmp;
1048 }
1049
1050 /* check if register demand is low enough before and after the current instruction */
1051 if (block->register_demand.exceeds(ctx.target_pressure)) {
1052
1053 RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
1054 if (idx == 0) {
1055 RegisterDemand demand_before = new_demand;
1056 for (const Definition& def : instr->definitions)
1057 demand_before -= def.getTemp();
1058 for (const Operand& op : instr->operands) {
1059 if (op.isFirstKill())
1060 demand_before += op.getTemp();
1061 }
1062 new_demand.update(demand_before);
1063 } else {
1064 new_demand.update(ctx.register_demand[block_idx][idx - 1]);
1065 }
1066
1067 assert(!local_next_use_distance.empty());
1068
1069 /* if reg pressure is too high, spill variable with furthest next use */
1070 while (RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
1071 unsigned distance = 0;
1072 Temp to_spill;
1073 bool do_rematerialize = false;
1074 if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) {
1075 for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
1076 bool can_rematerialize = ctx.remat.count(pair.first);
1077 if (pair.first.type() == RegType::vgpr &&
1078 ((pair.second > distance && can_rematerialize == do_rematerialize) ||
1079 (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
1080 current_spills.find(pair.first) == current_spills.end() &&
1081 ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
1082 to_spill = pair.first;
1083 distance = pair.second;
1084 do_rematerialize = can_rematerialize;
1085 }
1086 }
1087 } else {
1088 for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
1089 bool can_rematerialize = ctx.remat.count(pair.first);
1090 if (pair.first.type() == RegType::sgpr &&
1091 ((pair.second > distance && can_rematerialize == do_rematerialize) ||
1092 (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
1093 current_spills.find(pair.first) == current_spills.end() &&
1094 ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
1095 to_spill = pair.first;
1096 distance = pair.second;
1097 do_rematerialize = can_rematerialize;
1098 }
1099 }
1100 }
1101
1102 assert(distance != 0 && distance > idx);
1103 uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());
1104
1105 /* add interferences with currently spilled variables */
1106 for (std::pair<Temp, uint32_t> pair : current_spills) {
1107 ctx.interferences[spill_id].second.emplace(pair.second);
1108 ctx.interferences[pair.second].second.emplace(spill_id);
1109 }
1110 for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
1111 ctx.interferences[spill_id].second.emplace(pair.second.second);
1112 ctx.interferences[pair.second.second].second.emplace(spill_id);
1113 }
1114
1115 current_spills[to_spill] = spill_id;
1116 spilled_registers += to_spill;
1117
1118 /* rename if necessary */
1119 if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
1120 to_spill = ctx.renames[block_idx][to_spill];
1121 }
1122
1123 /* add spill to new instructions */
1124 aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
1125 spill->operands[0] = Operand(to_spill);
1126 spill->operands[1] = Operand(spill_id);
1127 instructions.emplace_back(std::move(spill));
1128 }
1129 }
1130
1131 /* add reloads and instruction to new instructions */
1132 for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
1133 aco_ptr<Instruction> reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second);
1134 instructions.emplace_back(std::move(reload));
1135 }
1136 instructions.emplace_back(std::move(instr));
1137 idx++;
1138 }
1139
1140 block->instructions = std::move(instructions);
1141 ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1142 }
1143
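/* Per-block driver: determines the entry spill set, adds coupling code (deferred
 * for loop headers until the loop has been processed), processes the block if
 * needed, and, when leaving a loop, propagates the renames created inside the
 * loop back through its blocks to repair SSA. */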
1144 void spill_block(spill_ctx& ctx, unsigned block_idx)
1145 {
1146 Block* block = &ctx.program->blocks[block_idx];
1147 ctx.processed[block_idx] = true;
1148
1149 /* determine set of variables which are spilled at the beginning of the block */
1150 RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);
1151
1152 /* add interferences for spilled variables */
1153 for (std::pair<Temp, uint32_t> x : ctx.spills_entry[block_idx]) {
1154 for (std::pair<Temp, uint32_t> y : ctx.spills_entry[block_idx])
1155 if (x.second != y.second)
1156 ctx.interferences[x.second].second.emplace(y.second);
1157 }
1158
1159 bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;
1160 if (!is_loop_header) {
1161 /* add spill/reload code on incoming control flow edges */
1162 add_coupling_code(ctx, block, block_idx);
1163 }
1164
1165 std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];
1166
1167 /* check conditions to process this block */
1168 bool process = RegisterDemand(block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
1169 !ctx.renames[block_idx].empty() ||
1170 ctx.remat_used.size();
1171
1172 std::map<Temp, uint32_t>::iterator it = current_spills.begin();
1173 while (!process && it != current_spills.end()) {
1174 if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
1175 process = true;
1176 ++it;
1177 }
1178
1179 if (process)
1180 process_block(ctx, block_idx, block, current_spills, spilled_registers);
1181 else
1182 ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
1183
1184 /* check if the next block leaves the current loop */
1185 if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
1186 return;
1187
1188 Block* loop_header = ctx.loop_header.top();
1189
1190 /* preserve original renames at end of loop header block */
1191 std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);
1192
1193 /* add coupling code to all loop header predecessors */
1194 add_coupling_code(ctx, loop_header, loop_header->index);
1195
1196 /* update remat_used for phis added in add_coupling_code() */
1197 for (aco_ptr<Instruction>& instr : loop_header->instructions) {
1198 if (!is_phi(instr))
1199 break;
1200 for (const Operand& op : instr->operands) {
1201 if (op.isTemp() && ctx.remat.count(op.getTemp()))
1202 ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
1203 }
1204 }
1205
1206 /* propagate new renames through loop: i.e. repair the SSA */
1207 renames.swap(ctx.renames[loop_header->index]);
1208 for (std::pair<Temp, Temp> rename : renames) {
1209 for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {
1210 Block& current = ctx.program->blocks[idx];
1211 std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();
1212
1213 /* first rename phis */
1214 while (instr_it != current.instructions.end()) {
1215 aco_ptr<Instruction>& phi = *instr_it;
1216 if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
1217 break;
1218 /* no need to rename the loop header phis once again. this happened in add_coupling_code() */
1219 if (idx == loop_header->index) {
1220 instr_it++;
1221 continue;
1222 }
1223
1224 for (Operand& op : phi->operands) {
1225 if (!op.isTemp())
1226 continue;
1227 if (op.getTemp() == rename.first)
1228 op.setTemp(rename.second);
1229 }
1230 instr_it++;
1231 }
1232
1233 std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = ctx.next_use_distances_start[idx].find(rename.first);
1234
1235 /* variable is not live at beginning of this block */
1236 if (it == ctx.next_use_distances_start[idx].end())
1237 continue;
1238
1239 /* if the variable is live at the block's exit, add rename */
1240 if (ctx.next_use_distances_end[idx].find(rename.first) != ctx.next_use_distances_end[idx].end())
1241 ctx.renames[idx].insert(rename);
1242
1243 /* rename all uses in this block */
1244 bool renamed = false;
1245 while (!renamed && instr_it != current.instructions.end()) {
1246 aco_ptr<Instruction>& instr = *instr_it;
1247 for (Operand& op : instr->operands) {
1248 if (!op.isTemp())
1249 continue;
1250 if (op.getTemp() == rename.first) {
1251 op.setTemp(rename.second);
1252 /* we can stop with this block as soon as the variable is spilled */
1253 if (instr->opcode == aco_opcode::p_spill)
1254 renamed = true;
1255 }
1256 }
1257 instr_it++;
1258 }
1259 }
1260 }
1261
1262 /* remove loop header info from stack */
1263 ctx.loop_header.pop();
1264 }
1265
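/* Builds the buffer resource descriptor used for scratch access: the private
 * segment buffer combined with a swizzled, ADD_TID_ENABLE configuration, with
 * the scratch offset advanced by the given byte offset if necessary. */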
1266 Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
1267 std::vector<aco_ptr<Instruction>>& instructions,
1268 unsigned offset, bool is_top_level)
1269 {
1270 Builder bld(ctx.program);
1271 if (is_top_level) {
1272 bld.reset(&instructions);
1273 } else {
1274 /* find p_logical_end */
1275 unsigned idx = instructions.size() - 1;
1276 while (instructions[idx]->opcode != aco_opcode::p_logical_end)
1277 idx--;
1278 bld.reset(&instructions, std::next(instructions.begin(), idx));
1279 }
1280
1281 Temp private_segment_buffer = ctx.program->private_segment_buffer;
1282 if (ctx.program->stage != compute_cs)
1283 private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u));
1284
1285 if (offset)
1286 scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset));
1287
1288 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
1289 S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);
1290
1291 if (ctx.program->chip_class >= GFX10) {
1292 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
1293 S_008F0C_OOB_SELECT(3) |
1294 S_008F0C_RESOURCE_LEVEL(1);
1295 } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
1296 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1297 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1298 }
1299 /* older generations need element size = 4 bytes; element size was removed in GFX9 */
1300 if (ctx.program->chip_class <= GFX8)
1301 rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
1302
1303 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
1304 private_segment_buffer, Operand(-1u),
1305 Operand(rsrc_conf));
1306 }
1307
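/* Assigns spill slots to all reloaded spill ids: affinity groups share a slot,
 * interfering ids must not overlap. SGPR slots are grouped in chunks of 64 so
 * that each chunk can be backed by one linear VGPR; the pseudo spill/reload
 * instructions are then replaced by actual hardware instructions. */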
1308 void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
1309 std::map<uint32_t, uint32_t> sgpr_slot;
1310 std::map<uint32_t, uint32_t> vgpr_slot;
1311 std::vector<bool> is_assigned(ctx.interferences.size());
1312
1313 /* first, handle affinities: just merge all interferences into both spill ids */
1314 for (std::vector<uint32_t>& vec : ctx.affinities) {
1315 for (unsigned i = 0; i < vec.size(); i++) {
1316 for (unsigned j = i + 1; j < vec.size(); j++) {
1317 assert(vec[i] != vec[j]);
1318 for (uint32_t id : ctx.interferences[vec[i]].second)
1319 ctx.interferences[id].second.insert(vec[j]);
1320 for (uint32_t id : ctx.interferences[vec[j]].second)
1321 ctx.interferences[id].second.insert(vec[i]);
1322 ctx.interferences[vec[i]].second.insert(ctx.interferences[vec[j]].second.begin(), ctx.interferences[vec[j]].second.end());
1323 ctx.interferences[vec[j]].second.insert(ctx.interferences[vec[i]].second.begin(), ctx.interferences[vec[i]].second.end());
1324
1325 bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];
1326 ctx.is_reloaded[vec[i]] = reloaded;
1327 ctx.is_reloaded[vec[j]] = reloaded;
1328 }
1329 }
1330 }
1331 for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)
1332 for (ASSERTED uint32_t id : ctx.interferences[i].second)
1333 assert(i != id);
1334
1335 /* for each spill slot, assign as many spill ids as possible */
1336 std::vector<std::set<uint32_t>> spill_slot_interferences;
1337 unsigned slot_idx = 0;
1338 bool done = false;
1339
1340 /* assign sgpr spill slots */
1341 while (!done) {
1342 done = true;
1343 for (unsigned id = 0; id < ctx.interferences.size(); id++) {
1344 if (is_assigned[id] || !ctx.is_reloaded[id])
1345 continue;
1346 if (ctx.interferences[id].first.type() != RegType::sgpr)
1347 continue;
1348
1349 /* check interferences */
1350 bool interferes = false;
1351 for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) {
1352 if (i == spill_slot_interferences.size())
1353 spill_slot_interferences.emplace_back(std::set<uint32_t>());
1354 if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / 64 != slot_idx / 64) {
1355 interferes = true;
1356 break;
1357 }
1358 }
1359 if (interferes) {
1360 done = false;
1361 continue;
1362 }
1363
1364 /* we found a spill id which can be assigned to current spill slot */
1365 sgpr_slot[id] = slot_idx;
1366 is_assigned[id] = true;
1367 for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++)
1368 spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end());
1369
1370 /* add all affinities: there are no additional interferences */
1371 for (std::vector<uint32_t>& vec : ctx.affinities) {
1372 bool found_affinity = false;
1373 for (uint32_t entry : vec) {
1374 if (entry == id) {
1375 found_affinity = true;
1376 break;
1377 }
1378 }
1379 if (!found_affinity)
1380 continue;
1381 for (uint32_t entry : vec) {
1382 sgpr_slot[entry] = slot_idx;
1383 is_assigned[entry] = true;
1384 }
1385 }
1386 }
1387 slot_idx++;
1388 }
1389
1390 unsigned sgpr_spill_slots = spill_slot_interferences.size();
1391 spill_slot_interferences.clear();
1392 slot_idx = 0;
1393 done = false;
1394
1395 /* assign vgpr spill slots */
1396 while (!done) {
1397 done = true;
1398 for (unsigned id = 0; id < ctx.interferences.size(); id++) {
1399 if (is_assigned[id] || !ctx.is_reloaded[id])
1400 continue;
1401 if (ctx.interferences[id].first.type() != RegType::vgpr)
1402 continue;
1403
1404 /* check interferences */
1405 bool interferes = false;
1406 for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) {
1407 if (i == spill_slot_interferences.size())
1408 spill_slot_interferences.emplace_back(std::set<uint32_t>());
1409 /* check for interference and ensure that vector regs are stored next to each other */
1410 if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end()) {
1411 interferes = true;
1412 break;
1413 }
1414 }
1415 if (interferes) {
1416 done = false;
1417 continue;
1418 }
1419
1420 /* we found a spill id which can be assigned to the current spill slot */
1421 vgpr_slot[id] = slot_idx;
1422 is_assigned[id] = true;
1423 for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++)
1424 spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end());
1425
1426 /* add all affinities: there are no additional interferences */
1427 for (std::vector<uint32_t>& vec : ctx.affinities) {
1428 bool found_affinity = false;
1429 for (uint32_t entry : vec) {
1430 if (entry == id) {
1431 found_affinity = true;
1432 break;
1433 }
1434 }
1435 if (!found_affinity)
1436 continue;
1437 for (uint32_t entry : vec) {
1438 vgpr_slot[entry] = slot_idx;
1439 is_assigned[entry] = true;
1440 }
1441 }
1442 }
1443 slot_idx++;
1444 }
1445
1446 unsigned vgpr_spill_slots = spill_slot_interferences.size();
1447
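/* consistency checks: every reloaded spill id must have received a slot, and
 * affine spill ids must agree on reload status, register type and slot */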
1448 for (unsigned id = 0; id < is_assigned.size(); id++)
1449 assert(is_assigned[id] || !ctx.is_reloaded[id]);
1450
1451 for (std::vector<uint32_t>& vec : ctx.affinities) {
1452 for (unsigned i = 0; i < vec.size(); i++) {
1453 for (unsigned j = i + 1; j < vec.size(); j++) {
1454 assert(is_assigned[vec[i]] == is_assigned[vec[j]]);
1455 if (!is_assigned[vec[i]])
1456 continue;
1457 assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);
1458 assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type());
1459 if (ctx.interferences[vec[i]].first.type() == RegType::sgpr)
1460 assert(sgpr_slot[vec[i]] == sgpr_slot[vec[j]]);
1461 else
1462 assert(vgpr_slot[vec[i]] == vgpr_slot[vec[j]]);
1463 }
1464 }
1465 }
1466
1467 /* all sgpr spill slots must fit into the linear vgprs reserved for spilling */
1468 std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + 63) / 64);
1469 assert(vgpr_spill_temps.size() <= spills_to_vgpr);
1470
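/* Lowering of the spill/reload pseudos:
 *  - vgpr spills/reloads become buffer_store_dword/buffer_load_dword against a
 *    scratch resource descriptor, one dword per slot,
 *  - sgpr spills/reloads stay as p_spill/p_reload but now target a lane
 *    (slot % 64) of a shared linear vgpr (slot / 64); these are turned into
 *    lane writes/reads later in the backend. */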
1471 /* replace pseudo instructions with actual hardware instructions */
1472 Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();
1473 unsigned last_top_level_block_idx = 0;
1474 std::vector<bool> reload_in_loop(vgpr_spill_temps.size());
1475 for (Block& block : ctx.program->blocks) {
1476
1477 /* after a loop, insert a p_end_linear_vgpr user for every linear vgpr that was reloaded inside the loop, so it stays live across the whole loop */
1478 if (block.loop_nest_depth == 0) {
1479 int end_vgprs = 0;
1480 for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1481 if (reload_in_loop[i])
1482 end_vgprs++;
1483 }
1484
1485 if (end_vgprs > 0) {
1486 aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)};
1487 int k = 0;
1488 for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1489 if (reload_in_loop[i])
1490 destr->operands[k++] = Operand(vgpr_spill_temps[i]);
1491 reload_in_loop[i] = false;
1492 }
1493 /* find insertion point */
1494 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1495 while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1496 ++it;
1497 block.instructions.insert(it, std::move(destr));
1498 }
1499 }
1500
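/* the last top-level block is remembered so that linear-vgpr definitions and
 * the scratch rsrc setup can be hoisted out of divergent control flow */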
1501 if (block.kind & block_kind_top_level && !block.linear_preds.empty()) {
1502 last_top_level_block_idx = block.index;
1503
1504 /* check if any spilled variable still uses a created linear vgpr; otherwise forget the temp so a fresh linear vgpr is created for later spills */
1505 for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
1506 if (vgpr_spill_temps[i] == Temp())
1507 continue;
1508
1509 bool can_destroy = true;
1510 for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) {
1511
1512 if (sgpr_slot.find(pair.second) != sgpr_slot.end() &&
1513 sgpr_slot[pair.second] / 64 == i) {
1514 can_destroy = false;
1515 break;
1516 }
1517 }
1518 if (can_destroy)
1519 vgpr_spill_temps[i] = Temp();
1520 }
1521 }
1522
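/* rebuild the block's instruction list, lowering p_spill/p_reload on the fly */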
1523 std::vector<aco_ptr<Instruction>>::iterator it;
1524 std::vector<aco_ptr<Instruction>> instructions;
1525 instructions.reserve(block.instructions.size());
1526 Builder bld(ctx.program, &instructions);
1527 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1528
1529 if ((*it)->opcode == aco_opcode::p_spill) {
1530 uint32_t spill_id = (*it)->operands[1].constantValue();
1531
1532 if (!ctx.is_reloaded[spill_id]) {
1533 /* never reloaded, so don't spill */
1534 } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
1535 /* spill vgpr */
1536 ctx.program->config->spilled_vgprs += (*it)->operands[0].size();
1537 uint32_t spill_slot = vgpr_slot[spill_id];
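/* The MUBUF immediate offset is limited (the check below uses 4096), so if the
 * per-wave scratch bytes plus the spill area don't fit, the per-wave base is
 * presumably folded into the resource/soffset set up by load_scratch_resource
 * and the immediate offset starts at 0. */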
1538 bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
1539 unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1540
1541 /* check if the scratch resource descriptor already exists */
1542 if (scratch_rsrc == Temp()) {
1543 unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1544 scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
1545 last_top_level_block_idx == block.index ?
1546 instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
1547 offset,
1548 last_top_level_block_idx == block.index);
1549 }
1550
1551 unsigned offset = base_offset + spill_slot * 4;
1552 aco_opcode opcode = aco_opcode::buffer_store_dword;
1553 assert((*it)->operands[0].isTemp());
1554 Temp temp = (*it)->operands[0].getTemp();
1555 assert(temp.type() == RegType::vgpr && !temp.is_linear());
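/* multi-dword values are split into single dwords and stored 4 bytes apart;
 * enough consecutive slots were reserved for this during slot assignment */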
1556 if (temp.size() > 1) {
1557 Instruction* split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
1558 split->operands[0] = Operand(temp);
1559 for (unsigned i = 0; i < temp.size(); i++)
1560 split->definitions[i] = bld.def(v1);
1561 bld.insert(split);
1562 for (unsigned i = 0; i < temp.size(); i++)
1563 bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
1564 } else {
1565 bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false);
1566 }
1567 } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
1568 ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
1569
1570 uint32_t spill_slot = sgpr_slot[spill_id];
1571
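/* sgpr spills live in lanes of shared linear vgprs: slot / 64 picks the vgpr,
 * slot % 64 the lane. The p_start_linear_vgpr definition is hoisted to the
 * last top-level block so that it dominates every spill/reload using it. */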
1572 /* check if the linear vgpr already exists */
1573 if (vgpr_spill_temps[spill_slot / 64] == Temp()) {
1574 Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()};
1575 vgpr_spill_temps[spill_slot / 64] = linear_vgpr;
1576 aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1577 create->definitions[0] = Definition(linear_vgpr);
1578 /* find the right place to insert this definition */
1579 if (last_top_level_block_idx == block.index) {
1580 /* insert right before the current instruction */
1581 instructions.emplace_back(std::move(create));
1582 } else {
1583 assert(last_top_level_block_idx < block.index);
1584 /* insert before the branch at the last top-level block */
1585 std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions;
1586 instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
1587 }
1588 }
1589
1590 /* spill sgpr: just add the vgpr temp to operands */
1591 Pseudo_instruction* spill = create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
1592 spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]);
1593 spill->operands[1] = Operand(spill_slot % 64);
1594 spill->operands[2] = (*it)->operands[0];
1595 instructions.emplace_back(aco_ptr<Instruction>(spill));
1596 } else {
1597 unreachable("No spill slot assigned for spill id");
1598 }
1599
1600 } else if ((*it)->opcode == aco_opcode::p_reload) {
1601 uint32_t spill_id = (*it)->operands[0].constantValue();
1602 assert(ctx.is_reloaded[spill_id]);
1603
1604 if (vgpr_slot.find(spill_id) != vgpr_slot.end()) {
1605 /* reload vgpr */
1606 uint32_t spill_slot = vgpr_slot[spill_id];
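/* same scratch offset handling as on the spill path above */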
1607 bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096;
1608 unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
1609
1610 /* check if the scratch resource descriptor already exists */
1611 if (scratch_rsrc == Temp()) {
1612 unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
1613 scratch_rsrc = load_scratch_resource(ctx, scratch_offset,
1614 last_top_level_block_idx == block.index ?
1615 instructions : ctx.program->blocks[last_top_level_block_idx].instructions,
1616 offset,
1617 last_top_level_block_idx == block.index);
1618 }
1619
1620 unsigned offset = base_offset + spill_slot * 4;
1621 aco_opcode opcode = aco_opcode::buffer_load_dword;
1622 Definition def = (*it)->definitions[0];
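/* multi-dword reloads load each dword separately and recombine them with
 * p_create_vector, mirroring the split on the spill side */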
1623 if (def.size() > 1) {
1624 Instruction* vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};
1625 vec->definitions[0] = def;
1626 for (unsigned i = 0; i < def.size(); i++) {
1627 Temp tmp = bld.tmp(v1);
1628 vec->operands[i] = Operand(tmp);
1629 bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false);
1630 }
1631 bld.insert(vec);
1632 } else {
1633 bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false);
1634 }
1635 } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) {
1636 uint32_t spill_slot = sgpr_slot[spill_id];
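/* remember reloads inside loops: the p_end_linear_vgpr inserted after the loop
 * (see the top of the block loop) keeps the linear vgpr alive across the loop */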
1637 reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0;
1638
1639 /* check if the linear vgpr already exists */
1640 if (vgpr_spill_temps[spill_slot / 64] == Temp()) {
1641 Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()};
1642 vgpr_spill_temps[spill_slot / 64] = linear_vgpr;
1643 aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
1644 create->definitions[0] = Definition(linear_vgpr);
1645 /* find the right place to insert this definition */
1646 if (last_top_level_block_idx == block.index) {
1647 /* insert right before the current instruction */
1648 instructions.emplace_back(std::move(create));
1649 } else {
1650 assert(last_top_level_block_idx < block.index);
1651 /* insert before the branch at the last top-level block */
1652 std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions;
1653 instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
1654 }
1655 }
1656
1657 /* reload sgpr: just add the vgpr temp to operands */
1658 Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
1659 reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]);
1660 reload->operands[1] = Operand(spill_slot % 64);
1661 reload->definitions[0] = (*it)->definitions[0];
1662 instructions.emplace_back(aco_ptr<Instruction>(reload));
1663 } else {
1664 unreachable("No spill slot assigned for spill id");
1665 }
1666 } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {
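/* keep every instruction that is not a dead rematerialization candidate;
 * rematerializable instructions whose result was never needed after spilling
 * are dropped here (simple DCE) */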
1667 instructions.emplace_back(std::move(*it));
1668 }
1669
1670 }
1671 block.instructions = std::move(instructions);
1672 }
1673
1674 /* update required scratch memory */
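/* each vgpr spill slot needs one dword per lane, hence the wave_size factor;
 * the result is rounded up to 1 KiB, presumably to match the granularity the
 * scratch setup expects */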
1675 ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);
1676
1677 /* SSA elimination inserts copies for logical phis right before p_logical_end
1678 * So if a linear vgpr is used between that p_logical_end and the branch,
1679 * we need to ensure logical phis don't choose a definition which aliases
1680 * the linear vgpr.
1681 * TODO: Moving the spills and reloads to before p_logical_end might produce
1682 * slightly better code. */
1683 for (Block& block : ctx.program->blocks) {
1684 /* loop exits are already handled */
1685 if (block.logical_preds.size() <= 1)
1686 continue;
1687
1688 bool has_logical_phis = false;
1689 for (aco_ptr<Instruction>& instr : block.instructions) {
1690 if (instr->opcode == aco_opcode::p_phi) {
1691 has_logical_phis = true;
1692 break;
1693 } else if (instr->opcode != aco_opcode::p_linear_phi) {
1694 break;
1695 }
1696 }
1697 if (!has_logical_phis)
1698 continue;
1699
1700 std::set<Temp> vgprs;
1701 for (unsigned pred_idx : block.logical_preds) {
1702 Block& pred = ctx.program->blocks[pred_idx];
1703 for (int i = pred.instructions.size() - 1; i >= 0; i--) {
1704 aco_ptr<Instruction>& pred_instr = pred.instructions[i];
1705 if (pred_instr->opcode == aco_opcode::p_logical_end) {
1706 break;
1707 } else if (pred_instr->opcode == aco_opcode::p_spill ||
1708 pred_instr->opcode == aco_opcode::p_reload) {
1709 vgprs.insert(pred_instr->operands[0].getTemp());
1710 }
1711 }
1712 }
1713 if (vgprs.empty())
1714 continue;
1715
1716 aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)};
1717 int k = 0;
1718 for (Temp tmp : vgprs) {
1719 destr->operands[k++] = Operand(tmp);
1720 }
1721 /* find insertion point */
1722 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
1723 while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
1724 ++it;
1725 block.instructions.insert(it, std::move(destr));
1726 }
1727 }
1728
1729 } /* end namespace */
1730
1731
1732 void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
1733 {
1734 program->config->spilled_vgprs = 0;
1735 program->config->spilled_sgprs = 0;
1736
1737 /* no spilling when register pressure is low enough */
1738 if (program->num_waves > 0)
1739 return;
1740
1741 /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
1742 lower_to_cssa(program, live_vars, options);
1743
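/* Spilled sgprs are parked in lanes of linear vgprs, so lowering the sgpr
 * target below adds roughly one vgpr per 64 spilled sgprs (plus some slack) to
 * the vgpr demand. If vgprs then have to be spilled to scratch as well, a few
 * sgprs are kept free, presumably for the scratch resource descriptor and
 * offset. The final vgpr target leaves room for the linear vgprs added above. */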
1744 /* calculate target register demand */
1745 RegisterDemand register_target = program->max_reg_demand;
1746 if (register_target.sgpr > program->sgpr_limit)
1747 register_target.vgpr += (register_target.sgpr - program->sgpr_limit + 63 + 32) / 64;
1748 register_target.sgpr = program->sgpr_limit;
1749
1750 if (register_target.vgpr > program->vgpr_limit)
1751 register_target.sgpr = program->sgpr_limit - 5;
1752 register_target.vgpr = program->vgpr_limit - (register_target.vgpr - program->max_reg_demand.vgpr);
1753
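/* upper bound on the number of linear vgprs needed for sgpr spill lanes;
 * assign_spill_slots() asserts that it never reserves more than this */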
1754 int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + 63 + 32) / 64;
1755
1756 /* initialize ctx */
1757 spill_ctx ctx(register_target, program, live_vars.register_demand);
1758 compute_global_next_uses(ctx, live_vars.live_out);
1759 get_rematerialize_info(ctx);
1760
1761 /* create spills and reloads */
1762 for (unsigned i = 0; i < program->blocks.size(); i++)
1763 spill_block(ctx, i);
1764
1765 /* assign spill slots and DCE rematerialized code */
1766 assign_spill_slots(ctx, spills_to_vgpr);
1767
1768 /* update live variable information */
1769 live_vars = live_var_analysis(program, options);
1770
1771 assert(program->num_waves > 0);
1772 }
1773
1774 }
1775