/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
27 #include "aco_builder.h"
/*
 * Implements the spilling algorithm on SSA-form from
 * "Register Spilling and Live-Range Splitting for SSA-Form Programs"
 * by Matthias Braun and Sebastian Hack.
 */
49 RegisterDemand target_pressure
;
51 std::vector
<std::vector
<RegisterDemand
>> register_demand
;
52 std::vector
<std::map
<Temp
, Temp
>> renames
;
53 std::vector
<std::map
<Temp
, uint32_t>> spills_entry
;
54 std::vector
<std::map
<Temp
, uint32_t>> spills_exit
;
55 std::vector
<bool> processed
;
56 std::stack
<Block
*> loop_header
;
57 std::vector
<std::map
<Temp
, std::pair
<uint32_t, uint32_t>>> next_use_distances_start
;
58 std::vector
<std::map
<Temp
, std::pair
<uint32_t, uint32_t>>> next_use_distances_end
;
59 std::vector
<std::pair
<RegClass
, std::unordered_set
<uint32_t>>> interferences
;
60 std::vector
<std::vector
<uint32_t>> affinities
;
61 std::vector
<bool> is_reloaded
;
62 std::map
<Temp
, remat_info
> remat
;
63 std::map
<Instruction
*, bool> remat_used
;
66 spill_ctx(const RegisterDemand target_pressure
, Program
* program
,
67 std::vector
<std::vector
<RegisterDemand
>> register_demand
)
68 : target_pressure(target_pressure
), program(program
),
69 register_demand(std::move(register_demand
)), renames(program
->blocks
.size()),
70 spills_entry(program
->blocks
.size()), spills_exit(program
->blocks
.size()),
71 processed(program
->blocks
.size(), false), wave_size(program
->wave_size
) {}
73 void add_affinity(uint32_t first
, uint32_t second
)
75 unsigned found_first
= affinities
.size();
76 unsigned found_second
= affinities
.size();
77 for (unsigned i
= 0; i
< affinities
.size(); i
++) {
78 std::vector
<uint32_t>& vec
= affinities
[i
];
79 for (uint32_t entry
: vec
) {
82 else if (entry
== second
)
86 if (found_first
== affinities
.size() && found_second
== affinities
.size()) {
87 affinities
.emplace_back(std::vector
<uint32_t>({first
, second
}));
88 } else if (found_first
< affinities
.size() && found_second
== affinities
.size()) {
89 affinities
[found_first
].push_back(second
);
90 } else if (found_second
< affinities
.size() && found_first
== affinities
.size()) {
91 affinities
[found_second
].push_back(first
);
92 } else if (found_first
!= found_second
) {
93 /* merge second into first */
94 affinities
[found_first
].insert(affinities
[found_first
].end(), affinities
[found_second
].begin(), affinities
[found_second
].end());
95 affinities
.erase(std::next(affinities
.begin(), found_second
));
97 assert(found_first
== found_second
);
101 void add_interference(uint32_t first
, uint32_t second
)
103 if (interferences
[first
].first
.type() != interferences
[second
].first
.type())
106 bool inserted
= interferences
[first
].second
.insert(second
).second
;
108 interferences
[second
].second
.insert(first
);
111 uint32_t allocate_spill_id(RegClass rc
)
113 interferences
.emplace_back(rc
, std::unordered_set
<uint32_t>());
114 is_reloaded
.push_back(false);
115 return next_spill_id
++;
118 uint32_t next_spill_id
= 0;
121 int32_t get_dominator(int idx_a
, int idx_b
, Program
* program
, bool is_linear
)
129 while (idx_a
!= idx_b
) {
131 idx_a
= program
->blocks
[idx_a
].linear_idom
;
133 idx_b
= program
->blocks
[idx_b
].linear_idom
;
136 while (idx_a
!= idx_b
) {
138 idx_a
= program
->blocks
[idx_a
].logical_idom
;
140 idx_b
= program
->blocks
[idx_b
].logical_idom
;
147 void next_uses_per_block(spill_ctx
& ctx
, unsigned block_idx
, std::set
<uint32_t>& worklist
)
149 Block
* block
= &ctx
.program
->blocks
[block_idx
];
150 std::map
<Temp
, std::pair
<uint32_t, uint32_t>> next_uses
= ctx
.next_use_distances_end
[block_idx
];
152 /* to compute the next use distance at the beginning of the block, we have to add the block's size */
153 for (std::map
<Temp
, std::pair
<uint32_t, uint32_t>>::iterator it
= next_uses
.begin(); it
!= next_uses
.end(); ++it
)
154 it
->second
.second
= it
->second
.second
+ block
->instructions
.size();
156 int idx
= block
->instructions
.size() - 1;
158 aco_ptr
<Instruction
>& instr
= block
->instructions
[idx
];
160 if (instr
->opcode
== aco_opcode::p_linear_phi
||
161 instr
->opcode
== aco_opcode::p_phi
)
164 for (const Definition
& def
: instr
->definitions
) {
166 next_uses
.erase(def
.getTemp());
169 for (const Operand
& op
: instr
->operands
) {
171 if (op
.isFixed() && op
.physReg() == exec
)
173 if (op
.regClass().type() == RegType::vgpr
&& op
.regClass().is_linear())
176 next_uses
[op
.getTemp()] = {block_idx
, idx
};
181 assert(block_idx
!= 0 || next_uses
.empty());
182 ctx
.next_use_distances_start
[block_idx
] = next_uses
;
184 aco_ptr
<Instruction
>& instr
= block
->instructions
[idx
];
185 assert(instr
->opcode
== aco_opcode::p_linear_phi
|| instr
->opcode
== aco_opcode::p_phi
);
187 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
188 unsigned pred_idx
= instr
->opcode
== aco_opcode::p_phi
?
189 block
->logical_preds
[i
] :
190 block
->linear_preds
[i
];
191 if (instr
->operands
[i
].isTemp()) {
192 if (instr
->operands
[i
].getTemp() == ctx
.program
->blocks
[pred_idx
].live_out_exec
)
194 if (ctx
.next_use_distances_end
[pred_idx
].find(instr
->operands
[i
].getTemp()) == ctx
.next_use_distances_end
[pred_idx
].end() ||
195 ctx
.next_use_distances_end
[pred_idx
][instr
->operands
[i
].getTemp()] != std::pair
<uint32_t, uint32_t>{block_idx
, 0})
196 worklist
.insert(pred_idx
);
197 ctx
.next_use_distances_end
[pred_idx
][instr
->operands
[i
].getTemp()] = {block_idx
, 0};
200 next_uses
.erase(instr
->definitions
[0].getTemp());
204 /* all remaining live vars must be live-out at the predecessors */
205 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: next_uses
) {
206 Temp temp
= pair
.first
;
207 uint32_t distance
= pair
.second
.second
;
208 uint32_t dom
= pair
.second
.first
;
209 std::vector
<unsigned>& preds
= temp
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
210 for (unsigned pred_idx
: preds
) {
211 if (temp
== ctx
.program
->blocks
[pred_idx
].live_out_exec
)
213 if (ctx
.program
->blocks
[pred_idx
].loop_nest_depth
> block
->loop_nest_depth
)
215 if (ctx
.next_use_distances_end
[pred_idx
].find(temp
) != ctx
.next_use_distances_end
[pred_idx
].end()) {
216 dom
= get_dominator(dom
, ctx
.next_use_distances_end
[pred_idx
][temp
].first
, ctx
.program
, temp
.is_linear());
217 distance
= std::min(ctx
.next_use_distances_end
[pred_idx
][temp
].second
, distance
);
219 if (ctx
.next_use_distances_end
[pred_idx
][temp
] != std::pair
<uint32_t, uint32_t>{dom
, distance
})
220 worklist
.insert(pred_idx
);
221 ctx
.next_use_distances_end
[pred_idx
][temp
] = {dom
, distance
};
227 void compute_global_next_uses(spill_ctx
& ctx
)
229 ctx
.next_use_distances_start
.resize(ctx
.program
->blocks
.size());
230 ctx
.next_use_distances_end
.resize(ctx
.program
->blocks
.size());
231 std::set
<uint32_t> worklist
;
232 for (Block
& block
: ctx
.program
->blocks
)
233 worklist
.insert(block
.index
);
235 while (!worklist
.empty()) {
236 std::set
<unsigned>::reverse_iterator b_it
= worklist
.rbegin();
237 unsigned block_idx
= *b_it
;
238 worklist
.erase(block_idx
);
239 next_uses_per_block(ctx
, block_idx
, worklist
);
243 bool should_rematerialize(aco_ptr
<Instruction
>& instr
)
245 /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
246 if (instr
->format
!= Format::VOP1
&& instr
->format
!= Format::SOP1
&& instr
->format
!= Format::PSEUDO
&& instr
->format
!= Format::SOPK
)
248 /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */
249 if (instr
->format
== Format::PSEUDO
&& instr
->opcode
!= aco_opcode::p_create_vector
)
251 if (instr
->format
== Format::SOPK
&& instr
->opcode
!= aco_opcode::s_movk_i32
)
254 for (const Operand
& op
: instr
->operands
) {
255 /* TODO: rematerialization using temporaries isn't yet supported */
260 /* TODO: rematerialization with multiple definitions isn't yet supported */
261 if (instr
->definitions
.size() > 1)
267 aco_ptr
<Instruction
> do_reload(spill_ctx
& ctx
, Temp tmp
, Temp new_name
, uint32_t spill_id
)
269 std::map
<Temp
, remat_info
>::iterator remat
= ctx
.remat
.find(tmp
);
270 if (remat
!= ctx
.remat
.end()) {
271 Instruction
*instr
= remat
->second
.instr
;
272 assert((instr
->format
== Format::VOP1
|| instr
->format
== Format::SOP1
|| instr
->format
== Format::PSEUDO
|| instr
->format
== Format::SOPK
) && "unsupported");
273 assert((instr
->format
!= Format::PSEUDO
|| instr
->opcode
== aco_opcode::p_create_vector
) && "unsupported");
274 assert(instr
->definitions
.size() == 1 && "unsupported");
276 aco_ptr
<Instruction
> res
;
277 if (instr
->format
== Format::VOP1
) {
278 res
.reset(create_instruction
<VOP1_instruction
>(instr
->opcode
, instr
->format
, instr
->operands
.size(), instr
->definitions
.size()));
279 } else if (instr
->format
== Format::SOP1
) {
280 res
.reset(create_instruction
<SOP1_instruction
>(instr
->opcode
, instr
->format
, instr
->operands
.size(), instr
->definitions
.size()));
281 } else if (instr
->format
== Format::PSEUDO
) {
282 res
.reset(create_instruction
<Pseudo_instruction
>(instr
->opcode
, instr
->format
, instr
->operands
.size(), instr
->definitions
.size()));
283 } else if (instr
->format
== Format::SOPK
) {
284 res
.reset(create_instruction
<SOPK_instruction
>(instr
->opcode
, instr
->format
, instr
->operands
.size(), instr
->definitions
.size()));
285 static_cast<SOPK_instruction
*>(res
.get())->imm
= static_cast<SOPK_instruction
*>(instr
)->imm
;
287 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
288 res
->operands
[i
] = instr
->operands
[i
];
289 if (instr
->operands
[i
].isTemp()) {
290 assert(false && "unsupported");
291 if (ctx
.remat
.count(instr
->operands
[i
].getTemp()))
292 ctx
.remat_used
[ctx
.remat
[instr
->operands
[i
].getTemp()].instr
] = true;
295 res
->definitions
[0] = Definition(new_name
);
298 aco_ptr
<Pseudo_instruction
> reload
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_reload
, Format::PSEUDO
, 1, 1)};
299 reload
->operands
[0] = Operand(spill_id
);
300 reload
->definitions
[0] = Definition(new_name
);
301 ctx
.is_reloaded
[spill_id
] = true;
306 void get_rematerialize_info(spill_ctx
& ctx
)
308 for (Block
& block
: ctx
.program
->blocks
) {
309 bool logical
= false;
310 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
311 if (instr
->opcode
== aco_opcode::p_logical_start
)
313 else if (instr
->opcode
== aco_opcode::p_logical_end
)
315 if (logical
&& should_rematerialize(instr
)) {
316 for (const Definition
& def
: instr
->definitions
) {
318 ctx
.remat
[def
.getTemp()] = (remat_info
){instr
.get()};
319 ctx
.remat_used
[instr
.get()] = false;
327 std::vector
<std::map
<Temp
, uint32_t>> local_next_uses(spill_ctx
& ctx
, Block
* block
)
329 std::vector
<std::map
<Temp
, uint32_t>> local_next_uses(block
->instructions
.size());
331 std::map
<Temp
, uint32_t> next_uses
;
332 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_end
[block
->index
])
333 next_uses
[pair
.first
] = pair
.second
.second
+ block
->instructions
.size();
335 for (int idx
= block
->instructions
.size() - 1; idx
>= 0; idx
--) {
336 aco_ptr
<Instruction
>& instr
= block
->instructions
[idx
];
339 if (instr
->opcode
== aco_opcode::p_phi
|| instr
->opcode
== aco_opcode::p_linear_phi
)
342 for (const Operand
& op
: instr
->operands
) {
343 if (op
.isFixed() && op
.physReg() == exec
)
345 if (op
.regClass().type() == RegType::vgpr
&& op
.regClass().is_linear())
348 next_uses
[op
.getTemp()] = idx
;
350 for (const Definition
& def
: instr
->definitions
) {
352 next_uses
.erase(def
.getTemp());
354 local_next_uses
[idx
] = next_uses
;
356 return local_next_uses
;
360 RegisterDemand
init_live_in_vars(spill_ctx
& ctx
, Block
* block
, unsigned block_idx
)
362 RegisterDemand spilled_registers
;
364 /* first block, nothing was spilled before */
368 /* loop header block */
369 if (block
->loop_nest_depth
> ctx
.program
->blocks
[block_idx
- 1].loop_nest_depth
) {
370 assert(block
->linear_preds
[0] == block_idx
- 1);
371 assert(block
->logical_preds
[0] == block_idx
- 1);
373 /* create new loop_info */
374 ctx
.loop_header
.emplace(block
);
376 /* check how many live-through variables should be spilled */
377 RegisterDemand new_demand
;
378 unsigned i
= block_idx
;
379 while (ctx
.program
->blocks
[i
].loop_nest_depth
>= block
->loop_nest_depth
) {
380 assert(ctx
.program
->blocks
.size() > i
);
381 new_demand
.update(ctx
.program
->blocks
[i
].register_demand
);
384 unsigned loop_end
= i
;
386 /* keep live-through spilled */
387 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_end
[block_idx
- 1]) {
388 if (pair
.second
.first
< loop_end
)
391 Temp to_spill
= pair
.first
;
392 auto it
= ctx
.spills_exit
[block_idx
- 1].find(to_spill
);
393 if (it
== ctx
.spills_exit
[block_idx
- 1].end())
396 ctx
.spills_entry
[block_idx
][to_spill
] = it
->second
;
397 spilled_registers
+= to_spill
;
400 /* select live-through vgpr variables */
401 while (new_demand
.vgpr
- spilled_registers
.vgpr
> ctx
.target_pressure
.vgpr
) {
402 unsigned distance
= 0;
404 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_end
[block_idx
- 1]) {
405 if (pair
.first
.type() == RegType::vgpr
&&
406 pair
.second
.first
>= loop_end
&&
407 pair
.second
.second
> distance
&&
408 ctx
.spills_entry
[block_idx
].find(pair
.first
) == ctx
.spills_entry
[block_idx
].end()) {
409 to_spill
= pair
.first
;
410 distance
= pair
.second
.second
;
417 if (ctx
.spills_exit
[block_idx
- 1].find(to_spill
) == ctx
.spills_exit
[block_idx
- 1].end()) {
418 spill_id
= ctx
.allocate_spill_id(to_spill
.regClass());
420 spill_id
= ctx
.spills_exit
[block_idx
- 1][to_spill
];
423 ctx
.spills_entry
[block_idx
][to_spill
] = spill_id
;
424 spilled_registers
.vgpr
+= to_spill
.size();
427 /* select live-through sgpr variables */
428 while (new_demand
.sgpr
- spilled_registers
.sgpr
> ctx
.target_pressure
.sgpr
) {
429 unsigned distance
= 0;
431 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_end
[block_idx
- 1]) {
432 if (pair
.first
.type() == RegType::sgpr
&&
433 pair
.second
.first
>= loop_end
&&
434 pair
.second
.second
> distance
&&
435 ctx
.spills_entry
[block_idx
].find(pair
.first
) == ctx
.spills_entry
[block_idx
].end()) {
436 to_spill
= pair
.first
;
437 distance
= pair
.second
.second
;
444 if (ctx
.spills_exit
[block_idx
- 1].find(to_spill
) == ctx
.spills_exit
[block_idx
- 1].end()) {
445 spill_id
= ctx
.allocate_spill_id(to_spill
.regClass());
447 spill_id
= ctx
.spills_exit
[block_idx
- 1][to_spill
];
450 ctx
.spills_entry
[block_idx
][to_spill
] = spill_id
;
451 spilled_registers
.sgpr
+= to_spill
.size();
457 if (!RegisterDemand(new_demand
- spilled_registers
).exceeds(ctx
.target_pressure
))
458 return spilled_registers
;
460 /* if reg pressure is too high at beginning of loop, add variables with furthest use */
462 while (block
->instructions
[idx
]->opcode
== aco_opcode::p_phi
|| block
->instructions
[idx
]->opcode
== aco_opcode::p_linear_phi
)
465 assert(idx
!= 0 && "loop without phis: TODO");
467 RegisterDemand reg_pressure
= ctx
.register_demand
[block_idx
][idx
] - spilled_registers
;
468 /* Consider register pressure from linear predecessors. This can affect
469 * reg_pressure if the branch instructions define sgprs. */
470 for (unsigned pred
: block
->linear_preds
) {
471 reg_pressure
.sgpr
= std::max
<int16_t>(
472 reg_pressure
.sgpr
, ctx
.register_demand
[pred
].back().sgpr
- spilled_registers
.sgpr
);
475 while (reg_pressure
.sgpr
> ctx
.target_pressure
.sgpr
) {
476 unsigned distance
= 0;
478 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_start
[block_idx
]) {
479 if (pair
.first
.type() == RegType::sgpr
&&
480 pair
.second
.second
> distance
&&
481 ctx
.spills_entry
[block_idx
].find(pair
.first
) == ctx
.spills_entry
[block_idx
].end()) {
482 to_spill
= pair
.first
;
483 distance
= pair
.second
.second
;
486 assert(distance
!= 0);
488 ctx
.spills_entry
[block_idx
][to_spill
] = ctx
.allocate_spill_id(to_spill
.regClass());
489 spilled_registers
.sgpr
+= to_spill
.size();
490 reg_pressure
.sgpr
-= to_spill
.size();
492 while (reg_pressure
.vgpr
> ctx
.target_pressure
.vgpr
) {
493 unsigned distance
= 0;
495 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_start
[block_idx
]) {
496 if (pair
.first
.type() == RegType::vgpr
&&
497 pair
.second
.second
> distance
&&
498 ctx
.spills_entry
[block_idx
].find(pair
.first
) == ctx
.spills_entry
[block_idx
].end()) {
499 to_spill
= pair
.first
;
500 distance
= pair
.second
.second
;
503 assert(distance
!= 0);
504 ctx
.spills_entry
[block_idx
][to_spill
] = ctx
.allocate_spill_id(to_spill
.regClass());
505 spilled_registers
.vgpr
+= to_spill
.size();
506 reg_pressure
.vgpr
-= to_spill
.size();
509 return spilled_registers
;
513 if (block
->linear_preds
.size() == 1 && !(block
->kind
& block_kind_loop_exit
)) {
514 /* keep variables spilled if they are alive and not used in the current block */
515 unsigned pred_idx
= block
->linear_preds
[0];
516 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[pred_idx
]) {
517 if (pair
.first
.type() == RegType::sgpr
&&
518 ctx
.next_use_distances_start
[block_idx
].find(pair
.first
) != ctx
.next_use_distances_start
[block_idx
].end() &&
519 ctx
.next_use_distances_start
[block_idx
][pair
.first
].first
!= block_idx
) {
520 ctx
.spills_entry
[block_idx
].insert(pair
);
521 spilled_registers
.sgpr
+= pair
.first
.size();
524 if (block
->logical_preds
.size() == 1) {
525 pred_idx
= block
->logical_preds
[0];
526 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[pred_idx
]) {
527 if (pair
.first
.type() == RegType::vgpr
&&
528 ctx
.next_use_distances_start
[block_idx
].find(pair
.first
) != ctx
.next_use_distances_start
[block_idx
].end() &&
529 ctx
.next_use_distances_start
[block_idx
][pair
.first
].first
!= block_idx
) {
530 ctx
.spills_entry
[block_idx
].insert(pair
);
531 spilled_registers
.vgpr
+= pair
.first
.size();
536 /* if register demand is still too high, we just keep all spilled live vars and process the block */
537 if (block
->register_demand
.sgpr
- spilled_registers
.sgpr
> ctx
.target_pressure
.sgpr
) {
538 pred_idx
= block
->linear_preds
[0];
539 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[pred_idx
]) {
540 if (pair
.first
.type() == RegType::sgpr
&&
541 ctx
.next_use_distances_start
[block_idx
].find(pair
.first
) != ctx
.next_use_distances_start
[block_idx
].end() &&
542 ctx
.spills_entry
[block_idx
].insert(pair
).second
) {
543 spilled_registers
.sgpr
+= pair
.first
.size();
547 if (block
->register_demand
.vgpr
- spilled_registers
.vgpr
> ctx
.target_pressure
.vgpr
&& block
->logical_preds
.size() == 1) {
548 pred_idx
= block
->logical_preds
[0];
549 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[pred_idx
]) {
550 if (pair
.first
.type() == RegType::vgpr
&&
551 ctx
.next_use_distances_start
[block_idx
].find(pair
.first
) != ctx
.next_use_distances_start
[block_idx
].end() &&
552 ctx
.spills_entry
[block_idx
].insert(pair
).second
) {
553 spilled_registers
.vgpr
+= pair
.first
.size();
558 return spilled_registers
;
561 /* else: merge block */
562 std::set
<Temp
> partial_spills
;
564 /* keep variables spilled on all incoming paths */
565 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_start
[block_idx
]) {
566 std::vector
<unsigned>& preds
= pair
.first
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
567 /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it.
568 * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors.
569 * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. */
570 /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */
571 bool remat
= ctx
.remat
.count(pair
.first
);
573 uint32_t spill_id
= 0;
574 for (unsigned pred_idx
: preds
) {
575 /* variable is not even live at the predecessor: probably from a phi */
576 if (ctx
.next_use_distances_end
[pred_idx
].find(pair
.first
) == ctx
.next_use_distances_end
[pred_idx
].end()) {
580 if (ctx
.spills_exit
[pred_idx
].find(pair
.first
) == ctx
.spills_exit
[pred_idx
].end()) {
584 partial_spills
.insert(pair
.first
);
585 /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */
586 spill_id
= ctx
.spills_exit
[pred_idx
][pair
.first
];
592 ctx
.spills_entry
[block_idx
][pair
.first
] = spill_id
;
593 partial_spills
.erase(pair
.first
);
594 spilled_registers
+= pair
.first
;
600 while (block
->instructions
[idx
]->opcode
== aco_opcode::p_linear_phi
||
601 block
->instructions
[idx
]->opcode
== aco_opcode::p_phi
) {
602 aco_ptr
<Instruction
>& phi
= block
->instructions
[idx
];
603 std::vector
<unsigned>& preds
= phi
->opcode
== aco_opcode::p_phi
? block
->logical_preds
: block
->linear_preds
;
606 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
607 if (phi
->operands
[i
].isUndefined())
609 assert(phi
->operands
[i
].isTemp());
610 if (ctx
.spills_exit
[preds
[i
]].find(phi
->operands
[i
].getTemp()) == ctx
.spills_exit
[preds
[i
]].end())
613 partial_spills
.insert(phi
->definitions
[0].getTemp());
616 ctx
.spills_entry
[block_idx
][phi
->definitions
[0].getTemp()] = ctx
.allocate_spill_id(phi
->definitions
[0].regClass());
617 partial_spills
.erase(phi
->definitions
[0].getTemp());
618 spilled_registers
+= phi
->definitions
[0].getTemp();
624 /* if reg pressure at first instruction is still too high, add partially spilled variables */
625 RegisterDemand reg_pressure
;
627 for (const Definition
& def
: block
->instructions
[idx
]->definitions
) {
629 reg_pressure
-= def
.getTemp();
632 for (const Operand
& op
: block
->instructions
[idx
]->operands
) {
633 if (op
.isTemp() && op
.isFirstKill()) {
634 reg_pressure
+= op
.getTemp();
638 for (unsigned i
= 0; i
< idx
; i
++) {
639 aco_ptr
<Instruction
>& instr
= block
->instructions
[i
];
640 assert(is_phi(instr
));
641 /* Killed phi definitions increase pressure in the predecessor but not
642 * the block they're in. Since the loops below are both to control
643 * pressure of the start of this block and the ends of it's
644 * predecessors, we need to count killed unspilled phi definitions here. */
645 if (instr
->definitions
[0].isKill() &&
646 !ctx
.spills_entry
[block_idx
].count(instr
->definitions
[0].getTemp()))
647 reg_pressure
+= instr
->definitions
[0].getTemp();
651 reg_pressure
+= ctx
.register_demand
[block_idx
][idx
] - spilled_registers
;
653 /* Consider register pressure from linear predecessors. This can affect
654 * reg_pressure if the branch instructions define sgprs. */
655 for (unsigned pred
: block
->linear_preds
) {
656 reg_pressure
.sgpr
= std::max
<int16_t>(
657 reg_pressure
.sgpr
, ctx
.register_demand
[pred
].back().sgpr
- spilled_registers
.sgpr
);
660 while (reg_pressure
.sgpr
> ctx
.target_pressure
.sgpr
) {
661 assert(!partial_spills
.empty());
663 std::set
<Temp
>::iterator it
= partial_spills
.begin();
664 Temp to_spill
= Temp();
665 unsigned distance
= 0;
666 while (it
!= partial_spills
.end()) {
667 assert(ctx
.spills_entry
[block_idx
].find(*it
) == ctx
.spills_entry
[block_idx
].end());
669 if (it
->type() == RegType::sgpr
&& ctx
.next_use_distances_start
[block_idx
][*it
].second
> distance
) {
670 distance
= ctx
.next_use_distances_start
[block_idx
][*it
].second
;
675 assert(distance
!= 0);
677 ctx
.spills_entry
[block_idx
][to_spill
] = ctx
.allocate_spill_id(to_spill
.regClass());
678 partial_spills
.erase(to_spill
);
679 spilled_registers
.sgpr
+= to_spill
.size();
680 reg_pressure
.sgpr
-= to_spill
.size();
683 while (reg_pressure
.vgpr
> ctx
.target_pressure
.vgpr
) {
684 assert(!partial_spills
.empty());
686 std::set
<Temp
>::iterator it
= partial_spills
.begin();
687 Temp to_spill
= Temp();
688 unsigned distance
= 0;
689 while (it
!= partial_spills
.end()) {
690 assert(ctx
.spills_entry
[block_idx
].find(*it
) == ctx
.spills_entry
[block_idx
].end());
692 if (it
->type() == RegType::vgpr
&& ctx
.next_use_distances_start
[block_idx
][*it
].second
> distance
) {
693 distance
= ctx
.next_use_distances_start
[block_idx
][*it
].second
;
698 assert(distance
!= 0);
700 ctx
.spills_entry
[block_idx
][to_spill
] = ctx
.allocate_spill_id(to_spill
.regClass());
701 partial_spills
.erase(to_spill
);
702 spilled_registers
.vgpr
+= to_spill
.size();
703 reg_pressure
.vgpr
-= to_spill
.size();
706 return spilled_registers
;
710 RegisterDemand
get_demand_before(spill_ctx
& ctx
, unsigned block_idx
, unsigned idx
)
713 RegisterDemand demand
= ctx
.register_demand
[block_idx
][idx
];
714 aco_ptr
<Instruction
>& instr
= ctx
.program
->blocks
[block_idx
].instructions
[idx
];
715 aco_ptr
<Instruction
> instr_before(nullptr);
716 return get_demand_before(demand
, instr
, instr_before
);
718 return ctx
.register_demand
[block_idx
][idx
- 1];
722 void add_coupling_code(spill_ctx
& ctx
, Block
* block
, unsigned block_idx
)
724 /* no coupling code necessary */
725 if (block
->linear_preds
.size() == 0)
728 std::vector
<aco_ptr
<Instruction
>> instructions
;
729 /* branch block: TODO take other branch into consideration */
730 if (block
->linear_preds
.size() == 1 && !(block
->kind
& (block_kind_loop_exit
| block_kind_loop_header
))) {
731 assert(ctx
.processed
[block
->linear_preds
[0]]);
732 assert(ctx
.register_demand
[block_idx
].size() == block
->instructions
.size());
733 std::vector
<RegisterDemand
> reg_demand
;
734 unsigned insert_idx
= 0;
735 unsigned pred_idx
= block
->linear_preds
[0];
736 RegisterDemand demand_before
= get_demand_before(ctx
, block_idx
, 0);
738 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> live
: ctx
.next_use_distances_start
[block_idx
]) {
739 if (!live
.first
.is_linear())
742 if (ctx
.spills_entry
[block_idx
].find(live
.first
) != ctx
.spills_entry
[block_idx
].end())
745 /* in register at end of predecessor */
746 if (ctx
.spills_exit
[pred_idx
].find(live
.first
) == ctx
.spills_exit
[pred_idx
].end()) {
747 std::map
<Temp
, Temp
>::iterator it
= ctx
.renames
[pred_idx
].find(live
.first
);
748 if (it
!= ctx
.renames
[pred_idx
].end())
749 ctx
.renames
[block_idx
].insert(*it
);
753 /* variable is spilled at predecessor and live at current block: create reload instruction */
754 Temp new_name
= {ctx
.program
->allocateId(), live
.first
.regClass()};
755 aco_ptr
<Instruction
> reload
= do_reload(ctx
, live
.first
, new_name
, ctx
.spills_exit
[pred_idx
][live
.first
]);
756 instructions
.emplace_back(std::move(reload
));
757 reg_demand
.push_back(demand_before
);
758 ctx
.renames
[block_idx
][live
.first
] = new_name
;
761 if (block
->logical_preds
.size() == 1) {
763 assert(insert_idx
< block
->instructions
.size());
764 instructions
.emplace_back(std::move(block
->instructions
[insert_idx
]));
765 reg_demand
.push_back(ctx
.register_demand
[block_idx
][insert_idx
]);
767 } while (instructions
.back()->opcode
!= aco_opcode::p_logical_start
);
769 unsigned pred_idx
= block
->logical_preds
[0];
770 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> live
: ctx
.next_use_distances_start
[block_idx
]) {
771 if (live
.first
.is_linear())
774 if (ctx
.spills_entry
[block_idx
].find(live
.first
) != ctx
.spills_entry
[block_idx
].end())
777 /* in register at end of predecessor */
778 if (ctx
.spills_exit
[pred_idx
].find(live
.first
) == ctx
.spills_exit
[pred_idx
].end()) {
779 std::map
<Temp
, Temp
>::iterator it
= ctx
.renames
[pred_idx
].find(live
.first
);
780 if (it
!= ctx
.renames
[pred_idx
].end())
781 ctx
.renames
[block_idx
].insert(*it
);
785 /* variable is spilled at predecessor and live at current block: create reload instruction */
786 Temp new_name
= {ctx
.program
->allocateId(), live
.first
.regClass()};
787 aco_ptr
<Instruction
> reload
= do_reload(ctx
, live
.first
, new_name
, ctx
.spills_exit
[pred_idx
][live
.first
]);
788 instructions
.emplace_back(std::move(reload
));
789 reg_demand
.emplace_back(reg_demand
.back());
790 ctx
.renames
[block_idx
][live
.first
] = new_name
;
794 /* combine new reload instructions with original block */
795 if (!instructions
.empty()) {
796 reg_demand
.insert(reg_demand
.end(), std::next(ctx
.register_demand
[block
->index
].begin(), insert_idx
),
797 ctx
.register_demand
[block
->index
].end());
798 ctx
.register_demand
[block_idx
] = std::move(reg_demand
);
799 instructions
.insert(instructions
.end(),
800 std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(std::next(block
->instructions
.begin(), insert_idx
)),
801 std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(block
->instructions
.end()));
802 block
->instructions
= std::move(instructions
);
807 /* loop header and merge blocks: check if all (linear) predecessors have been processed */
808 for (ASSERTED
unsigned pred
: block
->linear_preds
)
809 assert(ctx
.processed
[pred
]);
811 /* iterate the phi nodes for which operands to spill at the predecessor */
812 for (aco_ptr
<Instruction
>& phi
: block
->instructions
) {
813 if (phi
->opcode
!= aco_opcode::p_phi
&&
814 phi
->opcode
!= aco_opcode::p_linear_phi
)
817 /* if the phi is not spilled, add to instructions */
818 if (ctx
.spills_entry
[block_idx
].find(phi
->definitions
[0].getTemp()) == ctx
.spills_entry
[block_idx
].end()) {
819 instructions
.emplace_back(std::move(phi
));
823 std::vector
<unsigned>& preds
= phi
->opcode
== aco_opcode::p_phi
? block
->logical_preds
: block
->linear_preds
;
824 uint32_t def_spill_id
= ctx
.spills_entry
[block_idx
][phi
->definitions
[0].getTemp()];
826 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
827 if (phi
->operands
[i
].isUndefined())
830 unsigned pred_idx
= preds
[i
];
831 assert(phi
->operands
[i
].isTemp() && phi
->operands
[i
].isKill());
832 Temp var
= phi
->operands
[i
].getTemp();
834 /* build interferences between the phi def and all spilled variables at the predecessor blocks */
835 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[pred_idx
]) {
836 if (var
== pair
.first
)
838 ctx
.add_interference(def_spill_id
, pair
.second
);
841 /* check if variable is already spilled at predecessor */
842 std::map
<Temp
, uint32_t>::iterator spilled
= ctx
.spills_exit
[pred_idx
].find(var
);
843 if (spilled
!= ctx
.spills_exit
[pred_idx
].end()) {
844 if (spilled
->second
!= def_spill_id
)
845 ctx
.add_affinity(def_spill_id
, spilled
->second
);
849 /* rename if necessary */
850 std::map
<Temp
, Temp
>::iterator rename_it
= ctx
.renames
[pred_idx
].find(var
);
851 if (rename_it
!= ctx
.renames
[pred_idx
].end()) {
852 var
= rename_it
->second
;
853 ctx
.renames
[pred_idx
].erase(rename_it
);
856 uint32_t spill_id
= ctx
.allocate_spill_id(phi
->definitions
[0].regClass());
857 ctx
.add_affinity(def_spill_id
, spill_id
);
858 aco_ptr
<Pseudo_instruction
> spill
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_spill
, Format::PSEUDO
, 2, 0)};
859 spill
->operands
[0] = Operand(var
);
860 spill
->operands
[1] = Operand(spill_id
);
861 Block
& pred
= ctx
.program
->blocks
[pred_idx
];
862 unsigned idx
= pred
.instructions
.size();
866 } while (phi
->opcode
== aco_opcode::p_phi
&& pred
.instructions
[idx
]->opcode
!= aco_opcode::p_logical_end
);
867 std::vector
<aco_ptr
<Instruction
>>::iterator it
= std::next(pred
.instructions
.begin(), idx
);
868 pred
.instructions
.insert(it
, std::move(spill
));
869 ctx
.spills_exit
[pred_idx
][phi
->operands
[i
].getTemp()] = spill_id
;
872 /* remove phi from instructions */
876 /* iterate all (other) spilled variables for which to spill at the predecessor */
877 // TODO: would be better to have them sorted: first vgprs and first with longest distance
878 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_entry
[block_idx
]) {
879 std::vector
<unsigned> preds
= pair
.first
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
881 for (unsigned pred_idx
: preds
) {
882 /* variable is already spilled at predecessor */
883 std::map
<Temp
, uint32_t>::iterator spilled
= ctx
.spills_exit
[pred_idx
].find(pair
.first
);
884 if (spilled
!= ctx
.spills_exit
[pred_idx
].end()) {
885 if (spilled
->second
!= pair
.second
)
886 ctx
.add_affinity(pair
.second
, spilled
->second
);
890 /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
891 if (ctx
.next_use_distances_end
[pred_idx
].find(pair
.first
) == ctx
.next_use_distances_end
[pred_idx
].end())
894 /* add interferences between spilled variable and predecessors exit spills */
895 for (std::pair
<Temp
, uint32_t> exit_spill
: ctx
.spills_exit
[pred_idx
]) {
896 if (exit_spill
.first
== pair
.first
)
898 ctx
.add_interference(exit_spill
.second
, pair
.second
);
901 /* variable is in register at predecessor and has to be spilled */
902 /* rename if necessary */
903 Temp var
= pair
.first
;
904 std::map
<Temp
, Temp
>::iterator rename_it
= ctx
.renames
[pred_idx
].find(var
);
905 if (rename_it
!= ctx
.renames
[pred_idx
].end()) {
906 var
= rename_it
->second
;
907 ctx
.renames
[pred_idx
].erase(rename_it
);
910 aco_ptr
<Pseudo_instruction
> spill
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_spill
, Format::PSEUDO
, 2, 0)};
911 spill
->operands
[0] = Operand(var
);
912 spill
->operands
[1] = Operand(pair
.second
);
913 Block
& pred
= ctx
.program
->blocks
[pred_idx
];
914 unsigned idx
= pred
.instructions
.size();
918 } while (pair
.first
.type() == RegType::vgpr
&& pred
.instructions
[idx
]->opcode
!= aco_opcode::p_logical_end
);
919 std::vector
<aco_ptr
<Instruction
>>::iterator it
= std::next(pred
.instructions
.begin(), idx
);
920 pred
.instructions
.insert(it
, std::move(spill
));
921 ctx
.spills_exit
[pred
.index
][pair
.first
] = pair
.second
;
925 /* iterate phis for which operands to reload */
926 for (aco_ptr
<Instruction
>& phi
: instructions
) {
927 assert(phi
->opcode
== aco_opcode::p_phi
|| phi
->opcode
== aco_opcode::p_linear_phi
);
928 assert(ctx
.spills_entry
[block_idx
].find(phi
->definitions
[0].getTemp()) == ctx
.spills_entry
[block_idx
].end());
930 std::vector
<unsigned>& preds
= phi
->opcode
== aco_opcode::p_phi
? block
->logical_preds
: block
->linear_preds
;
931 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
932 if (!phi
->operands
[i
].isTemp())
934 unsigned pred_idx
= preds
[i
];
937 if (ctx
.spills_exit
[pred_idx
].find(phi
->operands
[i
].getTemp()) == ctx
.spills_exit
[pred_idx
].end()) {
938 std::map
<Temp
, Temp
>::iterator it
= ctx
.renames
[pred_idx
].find(phi
->operands
[i
].getTemp());
939 if (it
!= ctx
.renames
[pred_idx
].end())
940 phi
->operands
[i
].setTemp(it
->second
);
944 Temp tmp
= phi
->operands
[i
].getTemp();
946 /* reload phi operand at end of predecessor block */
947 Temp new_name
= {ctx
.program
->allocateId(), tmp
.regClass()};
948 Block
& pred
= ctx
.program
->blocks
[pred_idx
];
949 unsigned idx
= pred
.instructions
.size();
953 } while (phi
->opcode
== aco_opcode::p_phi
&& pred
.instructions
[idx
]->opcode
!= aco_opcode::p_logical_end
);
954 std::vector
<aco_ptr
<Instruction
>>::iterator it
= std::next(pred
.instructions
.begin(), idx
);
956 aco_ptr
<Instruction
> reload
= do_reload(ctx
, tmp
, new_name
, ctx
.spills_exit
[pred_idx
][tmp
]);
957 pred
.instructions
.insert(it
, std::move(reload
));
959 ctx
.spills_exit
[pred_idx
].erase(tmp
);
960 ctx
.renames
[pred_idx
][tmp
] = new_name
;
961 phi
->operands
[i
].setTemp(new_name
);
965 /* iterate live variables for which to reload */
966 // TODO: reload at current block if variable is spilled on all predecessors
967 for (std::pair
<Temp
, std::pair
<uint32_t, uint32_t>> pair
: ctx
.next_use_distances_start
[block_idx
]) {
968 /* skip spilled variables */
969 if (ctx
.spills_entry
[block_idx
].find(pair
.first
) != ctx
.spills_entry
[block_idx
].end())
971 std::vector
<unsigned> preds
= pair
.first
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
973 /* variable is dead at predecessor, it must be from a phi */
974 bool is_dead
= false;
975 for (unsigned pred_idx
: preds
) {
976 if (ctx
.next_use_distances_end
[pred_idx
].find(pair
.first
) == ctx
.next_use_distances_end
[pred_idx
].end())
981 for (unsigned pred_idx
: preds
) {
982 /* the variable is not spilled at the predecessor */
983 if (ctx
.spills_exit
[pred_idx
].find(pair
.first
) == ctx
.spills_exit
[pred_idx
].end())
986 /* variable is spilled at predecessor and has to be reloaded */
987 Temp new_name
= {ctx
.program
->allocateId(), pair
.first
.regClass()};
988 Block
& pred
= ctx
.program
->blocks
[pred_idx
];
989 unsigned idx
= pred
.instructions
.size();
993 } while (pair
.first
.type() == RegType::vgpr
&& pred
.instructions
[idx
]->opcode
!= aco_opcode::p_logical_end
);
994 std::vector
<aco_ptr
<Instruction
>>::iterator it
= std::next(pred
.instructions
.begin(), idx
);
996 aco_ptr
<Instruction
> reload
= do_reload(ctx
, pair
.first
, new_name
, ctx
.spills_exit
[pred
.index
][pair
.first
]);
997 pred
.instructions
.insert(it
, std::move(reload
));
999 ctx
.spills_exit
[pred
.index
].erase(pair
.first
);
1000 ctx
.renames
[pred
.index
][pair
.first
] = new_name
;
1003 /* check if we have to create a new phi for this variable */
1004 Temp rename
= Temp();
1005 bool is_same
= true;
1006 for (unsigned pred_idx
: preds
) {
1007 if (ctx
.renames
[pred_idx
].find(pair
.first
) == ctx
.renames
[pred_idx
].end()) {
1008 if (rename
== Temp())
1009 rename
= pair
.first
;
1011 is_same
= rename
== pair
.first
;
1013 if (rename
== Temp())
1014 rename
= ctx
.renames
[pred_idx
][pair
.first
];
1016 is_same
= rename
== ctx
.renames
[pred_idx
][pair
.first
];
1024 /* the variable was renamed differently in the predecessors: we have to create a phi */
1025 aco_opcode opcode
= pair
.first
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1026 aco_ptr
<Pseudo_instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1027 rename
= {ctx
.program
->allocateId(), pair
.first
.regClass()};
1028 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
1030 if (ctx
.renames
[preds
[i
]].find(pair
.first
) != ctx
.renames
[preds
[i
]].end())
1031 tmp
= ctx
.renames
[preds
[i
]][pair
.first
];
1032 else if (preds
[i
] >= block_idx
)
1036 phi
->operands
[i
] = Operand(tmp
);
1038 phi
->definitions
[0] = Definition(rename
);
1039 instructions
.emplace_back(std::move(phi
));
1042 /* the variable was renamed: add new name to renames */
1043 if (!(rename
== Temp() || rename
== pair
.first
))
1044 ctx
.renames
[block_idx
][pair
.first
] = rename
;
1047 /* combine phis with instructions */
1049 while (!block
->instructions
[idx
]) {
1053 if (!ctx
.processed
[block_idx
]) {
1054 assert(!(block
->kind
& block_kind_loop_header
));
1055 RegisterDemand demand_before
= get_demand_before(ctx
, block_idx
, idx
);
1056 ctx
.register_demand
[block
->index
].erase(ctx
.register_demand
[block
->index
].begin(), ctx
.register_demand
[block
->index
].begin() + idx
);
1057 ctx
.register_demand
[block
->index
].insert(ctx
.register_demand
[block
->index
].begin(), instructions
.size(), demand_before
);
1060 std::vector
<aco_ptr
<Instruction
>>::iterator start
= std::next(block
->instructions
.begin(), idx
);
1061 instructions
.insert(instructions
.end(), std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(start
),
1062 std::move_iterator
<std::vector
<aco_ptr
<Instruction
>>::iterator
>(block
->instructions
.end()));
1063 block
->instructions
= std::move(instructions
);
/* Lowers register pressure within a single basic block by inserting
 * p_spill / p_reload pseudo-instructions and renaming spilled temporaries
 * (part of the Braun/Hack SSA spilling algorithm named in the file header).
 *
 * NOTE(review): this chunk is a lossy extraction — several short lines of
 * the original (loop-index declaration/increments, `continue;` statements,
 * some braces) were dropped.  Gaps are marked inline; verify each against
 * the upstream file before treating this text as authoritative.
 *
 * ctx               spiller-wide state (renames, spill ids, demand, remat info)
 * block_idx/block   the block being processed
 * current_spills    map variable -> spill id for temps spilled on entry;
 *                   updated in place and merged into ctx.spills_exit at the end
 * spilled_registers register demand currently parked in spill slots
 */
void process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
                   std::map<Temp, uint32_t>& current_spills, RegisterDemand spilled_registers)
{
   assert(!ctx.processed[block_idx]);

   std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
   std::vector<aco_ptr<Instruction>> instructions;
   /* NOTE(review): extraction gap — the instruction index used below
    * (likely `unsigned idx = 0;`) is declared here in the original. */

   /* phis are handled separetely */
   while (block->instructions[idx]->opcode == aco_opcode::p_phi ||
          block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {
      aco_ptr<Instruction>& instr = block->instructions[idx];
      for (const Operand& op : instr->operands) {
         /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
         if (op.isTemp() && ctx.remat.count(op.getTemp()))
            ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
      }
      instructions.emplace_back(std::move(instr));
      /* NOTE(review): extraction gap — the increment of `idx` was dropped here. */
   }

   /* local next-use distances are only needed (and only computed) when this
    * block can exceed the pressure target */
   if (block->register_demand.exceeds(ctx.target_pressure))
      local_next_use_distance = local_next_uses(ctx, block);

   while (idx < block->instructions.size()) {
      aco_ptr<Instruction>& instr = block->instructions[idx];

      std::map<Temp, std::pair<Temp, uint32_t>> reloads;
      std::map<Temp, uint32_t> spills;
      /* rename and reload operands */
      for (Operand& op : instr->operands) {
         /* NOTE(review): extraction gap — the original skips non-temp
          * operands here before the map lookup. */
         if (current_spills.find(op.getTemp()) == current_spills.end()) {
            /* the Operand is in register: check if it was renamed */
            if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
               op.setTemp(ctx.renames[block_idx][op.getTemp()]);
            /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
            if (ctx.remat.count(op.getTemp()))
               ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
            /* NOTE(review): extraction gap — in-register operands are done at
             * this point; a `continue;` (or else-branch) was dropped, without
             * which control would wrongly fall through to the reload path. */
         }
         /* the Operand is spilled: add it to reloads */
         Temp new_tmp = {ctx.program->allocateId(), op.regClass()};
         ctx.renames[block_idx][op.getTemp()] = new_tmp;
         reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
         current_spills.erase(op.getTemp());
         op.setTemp(new_tmp);
         spilled_registers -= new_tmp;
      }

      /* check if register demand is low enough before and after the current instruction */
      if (block->register_demand.exceeds(ctx.target_pressure)) {
         /* demand is checked at both the pre- and post-instruction points */
         RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
         new_demand.update(get_demand_before(ctx, block_idx, idx));

         assert(!local_next_use_distance.empty());

         /* if reg pressure is too high, spill variable with furthest next use */
         while (RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
            unsigned distance = 0;
            /* NOTE(review): extraction gap — `Temp to_spill;` is declared
             * here in the original. */
            bool do_rematerialize = false;
            if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) {
               /* VGPR pressure is the problem: pick a VGPR candidate,
                * preferring rematerializable temps, then furthest next use */
               for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
                  bool can_rematerialize = ctx.remat.count(pair.first);
                  if (pair.first.type() == RegType::vgpr &&
                      ((pair.second > distance && can_rematerialize == do_rematerialize) ||
                       (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
                      current_spills.find(pair.first) == current_spills.end() &&
                      ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
                     to_spill = pair.first;
                     distance = pair.second;
                     do_rematerialize = can_rematerialize;
                  }
               }
            } else {
               /* NOTE(review): restored — the `} else {` introducing this
                * SGPR branch was dropped by the extraction. */
               for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
                  bool can_rematerialize = ctx.remat.count(pair.first);
                  if (pair.first.type() == RegType::sgpr &&
                      ((pair.second > distance && can_rematerialize == do_rematerialize) ||
                       (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
                      current_spills.find(pair.first) == current_spills.end() &&
                      ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) {
                     to_spill = pair.first;
                     distance = pair.second;
                     do_rematerialize = can_rematerialize;
                  }
               }
            }

            /* a candidate must exist and must be used strictly after idx */
            assert(distance != 0 && distance > idx);
            uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());

            /* add interferences with currently spilled variables */
            for (std::pair<Temp, uint32_t> pair : current_spills)
               ctx.add_interference(spill_id, pair.second);
            for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads)
               ctx.add_interference(spill_id, pair.second.second);

            current_spills[to_spill] = spill_id;
            spilled_registers += to_spill;

            /* rename if necessary */
            if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
               to_spill = ctx.renames[block_idx][to_spill];
            }

            /* add spill to new instructions */
            aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
            spill->operands[0] = Operand(to_spill);
            spill->operands[1] = Operand(spill_id);
            instructions.emplace_back(std::move(spill));
         }
      }

      /* add reloads and instruction to new instructions */
      for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
         aco_ptr<Instruction> reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second);
         instructions.emplace_back(std::move(reload));
      }
      instructions.emplace_back(std::move(instr));
      /* NOTE(review): extraction gap — the increment of `idx` was dropped here. */
   }

   /* replace the block body with the rewritten instruction list and publish
    * the still-spilled set as this block's exit spills */
   block->instructions = std::move(instructions);
   ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
}
/* Top-level per-block driver of the spiller: computes the entry spill set,
 * inserts coupling (spill/reload) code on incoming edges, lowers the block
 * itself, and — when the next block leaves the current loop — repairs SSA
 * for renames created inside the loop body.
 *
 * NOTE(review): lossy extraction — short statements (`return;`, `continue;`,
 * `break;`, iterator increments) and some braces were dropped; each gap is
 * marked inline.  Verify against upstream before relying on this text.
 */
void spill_block(spill_ctx& ctx, unsigned block_idx)
{
   Block* block = &ctx.program->blocks[block_idx];

   /* determine set of variables which are spilled at the beginning of the block */
   RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);

   /* add interferences for spilled variables */
   for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); ++it) {
      /* all entry spills are simultaneously live, hence pairwise interference */
      for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2)
         ctx.add_interference(it->second, it2->second);
   }

   /* loop headers get their coupling code later, once the whole loop has
    * been processed (see the loop-repair code at the bottom) */
   bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;
   if (!is_loop_header) {
      /* add spill/reload code on incoming control flow edges */
      add_coupling_code(ctx, block, block_idx);
   }

   std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];

   /* check conditions to process this block */
   bool process = RegisterDemand(block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
                  !ctx.renames[block_idx].empty() ||
                  ctx.remat_used.size();

   std::map<Temp, uint32_t>::iterator it = current_spills.begin();
   while (!process && it != current_spills.end()) {
      /* a spilled variable whose next use is inside this block forces processing */
      if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
      /* NOTE(review): extraction gap — the body of this `if` (setting
       * `process`) and the `++it` increment were dropped here. */
   }

   /* NOTE(review): in the original, process_block() is reached only when
    * `process` is true; the guarding branch/else was dropped by the
    * extraction. */
   process_block(ctx, block_idx, block, current_spills, spilled_registers);

   ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());

   ctx.processed[block_idx] = true;

   /* check if the next block leaves the current loop */
   if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)
      /* NOTE(review): extraction gap — the early `return;` taken when we are
       * not at a loop exit was dropped here; everything below only runs at
       * the last block of a loop. */

   Block* loop_header = ctx.loop_header.top();

   /* preserve original renames at end of loop header block */
   std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);

   /* add coupling code to all loop header predecessors */
   add_coupling_code(ctx, loop_header, loop_header->index);

   /* update remat_used for phis added in add_coupling_code() */
   for (aco_ptr<Instruction>& instr : loop_header->instructions) {
      /* NOTE(review): extraction gap — the original stops this scan at the
       * first non-phi instruction. */
      for (const Operand& op : instr->operands) {
         if (op.isTemp() && ctx.remat.count(op.getTemp()))
            ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
      }
   }

   /* propagate new renames through loop: i.e. repair the SSA */
   renames.swap(ctx.renames[loop_header->index]);
   for (std::pair<Temp, Temp> rename : renames) {
      for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {
         Block& current = ctx.program->blocks[idx];
         std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();

         /* first rename phis */
         while (instr_it != current.instructions.end()) {
            aco_ptr<Instruction>& phi = *instr_it;
            if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
               /* NOTE(review): extraction gap — the `break;` ending the phi
                * scan was dropped. */
            /* no need to rename the loop header phis once again. this happened in add_coupling_code() */
            if (idx == loop_header->index) {
               /* NOTE(review): extraction gap — advancing past header phis
                * (iterator increment + continue) was dropped. */
            }
            for (Operand& op : phi->operands) {
               /* NOTE(review): extraction gap — non-temp operands are
                * skipped here in the original. */
               if (op.getTemp() == rename.first)
                  op.setTemp(rename.second);
            }
            /* NOTE(review): extraction gap — the iterator increment was dropped. */
         }

         std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = ctx.next_use_distances_start[idx].find(rename.first);

         /* variable is not live at beginning of this block */
         if (it == ctx.next_use_distances_start[idx].end())
            /* NOTE(review): extraction gap — the `continue;` skipping to the
             * next block was dropped. */

         /* if the variable is live at the block's exit, add rename */
         if (ctx.next_use_distances_end[idx].find(rename.first) != ctx.next_use_distances_end[idx].end())
            ctx.renames[idx].insert(rename);

         /* rename all uses in this block */
         bool renamed = false;
         while (!renamed && instr_it != current.instructions.end()) {
            aco_ptr<Instruction>& instr = *instr_it;
            for (Operand& op : instr->operands) {
               /* NOTE(review): extraction gap — non-temp operands are
                * skipped here in the original. */
               if (op.getTemp() == rename.first) {
                  op.setTemp(rename.second);
                  /* we can stop with this block as soon as the variable is spilled */
                  if (instr->opcode == aco_opcode::p_spill)
                     /* NOTE(review): extraction gap — setting `renamed` was
                      * dropped here. */
               }
            }
            /* NOTE(review): extraction gap — the iterator increment was dropped. */
         }
      }
   }

   /* remove loop header info from stack */
   ctx.loop_header.pop();
}
/* Builds the scratch buffer resource descriptor (s4) used by the spill
 * MUBUF loads/stores, and bumps scratch_offset by `offset`.  Generated
 * code is inserted into `instructions` — at the end for top-level blocks,
 * otherwise before the trailing p_logical_end.
 *
 * NOTE(review): lossy extraction — the `is_top_level` branch structure,
 * the `idx` decrement and an `if (offset)` guard appear to have been
 * dropped; gaps are marked inline.
 */
Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,
                           std::vector<aco_ptr<Instruction>>& instructions,
                           unsigned offset, bool is_top_level)
{
   Builder bld(ctx.program);
   /* NOTE(review): extraction gap — in the original the two bld.reset()
    * calls below sit on the two sides of an `if (is_top_level) { ... } else
    * { ... }`; the branch lines were dropped. */
   bld.reset(&instructions);
   /* find p_logical_end */
   unsigned idx = instructions.size() - 1;
   while (instructions[idx]->opcode != aco_opcode::p_logical_end)
      /* NOTE(review): extraction gap — the decrement of `idx` was dropped. */
   bld.reset(&instructions, std::next(instructions.begin(), idx));

   Temp private_segment_buffer = ctx.program->private_segment_buffer;
   /* non-compute stages receive a pointer to the descriptor rather than the
    * descriptor itself, so load it first */
   if (ctx.program->stage != compute_cs)
      private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u));

   /* NOTE(review): extraction gap — upstream appears to guard this add
    * (e.g. `if (offset)`); the guard line was dropped. */
   scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset));

   /* swizzled per-lane addressing; index stride encodes wave size */
   uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
                        S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);

   if (ctx.program->chip_class >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
                   S_008F0C_RESOURCE_LEVEL(1);
   } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx.program->chip_class <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   /* num_records = -1 (unbounded); spill accesses are bounded by their offsets */
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
                     private_segment_buffer, Operand(-1u),
                     Operand(rsrc_conf));
}
/* Marks as occupied every spill slot used by an already-assigned spill id
 * that interferes with `id`, so a following find_available_slot() call will
 * skip those slots.
 *
 * NOTE(review): lossy extraction — the final parameter line (the spill id
 * used as `id` below) and the `continue;` for unassigned ids were dropped;
 * both restorations are marked.
 */
void add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned,
                       std::vector<uint32_t>& slots, std::vector<bool>& slots_used,
                       unsigned id) /* NOTE(review): restored — parameter line dropped by the extraction */
{
   for (unsigned other : ctx.interferences[id].second) {
      /* ids without an assigned slot cannot occupy anything yet */
      if (!is_assigned[other])
         continue; /* NOTE(review): restored — dropped by the extraction */

      RegClass other_rc = ctx.interferences[other].first;
      unsigned slot = slots[other];
      /* a spill id occupies other_rc.size() consecutive slot dwords */
      std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);
   }
}
/* Scans the slot-usage bitmap for a free run of `size` dwords.  SGPR slot
 * groups must not straddle a wave_size-aligned boundary, since SGPR spills
 * are backed by lanes of a linear VGPR.
 *
 * NOTE(review): lossy extraction — the candidate-slot scan loop header
 * (declaring `slot`), the `available = false; break;` arm, the `continue;`
 * statements and the final `return` were all dropped; gaps are marked
 * inline and the surviving statements must be read in that light.
 */
unsigned find_available_slot(std::vector<bool>& used, unsigned wave_size,
                             unsigned size, bool is_sgpr, unsigned *num_slots)
{
   unsigned wave_size_minus_one = wave_size - 1;
   /* NOTE(review): extraction gap — the loop iterating candidate `slot`
    * values begins here in the original (declares `slot`). */

   bool available = true;
   for (unsigned i = 0; i < size; i++) {
      /* any used dword inside the candidate window disqualifies it */
      if (slot + i < used.size() && used[slot + i]) {
         /* NOTE(review): extraction gap — marking the candidate unavailable
          * and breaking out of this loop was dropped. */
      }
   }
   /* NOTE(review): extraction gap — the advance-and-retry path for an
    * unavailable candidate was dropped here. */

   /* an SGPR slot group must fit inside one wave_size-aligned chunk */
   if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {
      slot = align(slot, wave_size);
      /* NOTE(review): extraction gap — a `continue;` re-testing the aligned
       * candidate was dropped. */
   }

   std::fill(used.begin(), used.end(), false);
   /* NOTE(review): the std::fill arguments above look suspicious for a
    * "reserve this slot" operation — the extraction may have mangled this
    * statement; verify against upstream. */

   /* grow the bitmap when the chosen run extends past its current end */
   if (slot + size > used.size())
      used.resize(slot + size);
   /* NOTE(review): extraction gap — the `return slot;` (and the scan-loop
    * close) were dropped here. */
}
/* Assigns a spill slot to every reloaded spill id whose register type is
 * `type`.  Affinity groups (ids that must share one slot, e.g. values
 * joined by phis) are placed first so all members receive the same slot;
 * remaining ids are placed individually.  *num_slots is updated to the
 * high-water mark of slots actually used.
 *
 * NOTE(review): lossy extraction — `continue;` statements and the
 * `slots[id] = slot;` stores were dropped; gaps/restorations are marked.
 */
void assign_spill_slots_helper(spill_ctx& ctx, RegType type,
                               std::vector<bool>& is_assigned,
                               std::vector<uint32_t>& slots,
                               unsigned *num_slots)
{
   std::vector<bool> slots_used(*num_slots);

   /* assign slots for ids with affinities first */
   for (std::vector<uint32_t>& vec : ctx.affinities) {
      /* affinity groups are homogeneous; checking the first member suffices */
      if (ctx.interferences[vec[0]].first.type() != type)
         continue; /* NOTE(review): restored — dropped by the extraction */

      for (unsigned id : vec) {
         /* never-reloaded spills need no slot */
         if (!ctx.is_reloaded[id])
            continue; /* NOTE(review): restored — dropped by the extraction */

         add_interferences(ctx, is_assigned, slots, slots_used, id);
      }

      /* one slot serves the whole group */
      unsigned slot = find_available_slot(slots_used, ctx.wave_size,
                                          ctx.interferences[vec[0]].first.size(),
                                          type == RegType::sgpr, num_slots);

      for (unsigned id : vec) {
         assert(!is_assigned[id]);

         if (ctx.is_reloaded[id]) {
            /* NOTE(review): extraction gap — the `slots[id] = slot;` store
             * was dropped here. */
            is_assigned[id] = true;
         }
      }
   }

   /* assign slots for ids without affinities */
   for (unsigned id = 0; id < ctx.interferences.size(); id++) {
      if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)
         continue; /* NOTE(review): restored — dropped by the extraction */

      add_interferences(ctx, is_assigned, slots, slots_used, id);

      unsigned slot = find_available_slot(slots_used, ctx.wave_size,
                                          ctx.interferences[id].first.size(),
                                          type == RegType::sgpr, num_slots);
      /* NOTE(review): extraction gap — the `slots[id] = slot;` store was
       * dropped here. */
      is_assigned[id] = true;
   }

   /* publish the high-water mark so the caller can size backing storage */
   *num_slots = slots_used.size();
}
1459 void assign_spill_slots(spill_ctx
& ctx
, unsigned spills_to_vgpr
) {
1460 std::vector
<uint32_t> slots(ctx
.interferences
.size());
1461 std::vector
<bool> is_assigned(ctx
.interferences
.size());
1463 /* first, handle affinities: just merge all interferences into both spill ids */
1464 for (std::vector
<uint32_t>& vec
: ctx
.affinities
) {
1465 for (unsigned i
= 0; i
< vec
.size(); i
++) {
1466 for (unsigned j
= i
+ 1; j
< vec
.size(); j
++) {
1467 assert(vec
[i
] != vec
[j
]);
1468 bool reloaded
= ctx
.is_reloaded
[vec
[i
]] || ctx
.is_reloaded
[vec
[j
]];
1469 ctx
.is_reloaded
[vec
[i
]] = reloaded
;
1470 ctx
.is_reloaded
[vec
[j
]] = reloaded
;
1474 for (ASSERTED
uint32_t i
= 0; i
< ctx
.interferences
.size(); i
++)
1475 for (ASSERTED
uint32_t id
: ctx
.interferences
[i
].second
)
1478 /* for each spill slot, assign as many spill ids as possible */
1479 unsigned sgpr_spill_slots
= 0, vgpr_spill_slots
= 0;
1480 assign_spill_slots_helper(ctx
, RegType::sgpr
, is_assigned
, slots
, &sgpr_spill_slots
);
1481 assign_spill_slots_helper(ctx
, RegType::vgpr
, is_assigned
, slots
, &vgpr_spill_slots
);
1483 for (unsigned id
= 0; id
< is_assigned
.size(); id
++)
1484 assert(is_assigned
[id
] || !ctx
.is_reloaded
[id
]);
1486 for (std::vector
<uint32_t>& vec
: ctx
.affinities
) {
1487 for (unsigned i
= 0; i
< vec
.size(); i
++) {
1488 for (unsigned j
= i
+ 1; j
< vec
.size(); j
++) {
1489 assert(is_assigned
[vec
[i
]] == is_assigned
[vec
[j
]]);
1490 if (!is_assigned
[vec
[i
]])
1492 assert(ctx
.is_reloaded
[vec
[i
]] == ctx
.is_reloaded
[vec
[j
]]);
1493 assert(ctx
.interferences
[vec
[i
]].first
.type() == ctx
.interferences
[vec
[j
]].first
.type());
1494 assert(slots
[vec
[i
]] == slots
[vec
[j
]]);
1499 /* hope, we didn't mess up */
1500 std::vector
<Temp
> vgpr_spill_temps((sgpr_spill_slots
+ ctx
.wave_size
- 1) / ctx
.wave_size
);
1501 assert(vgpr_spill_temps
.size() <= spills_to_vgpr
);
1503 /* replace pseudo instructions with actual hardware instructions */
1504 Temp scratch_offset
= ctx
.program
->scratch_offset
, scratch_rsrc
= Temp();
1505 unsigned last_top_level_block_idx
= 0;
1506 std::vector
<bool> reload_in_loop(vgpr_spill_temps
.size());
1507 for (Block
& block
: ctx
.program
->blocks
) {
1509 /* after loops, we insert a user if there was a reload inside the loop */
1510 if (block
.loop_nest_depth
== 0) {
1512 for (unsigned i
= 0; i
< vgpr_spill_temps
.size(); i
++) {
1513 if (reload_in_loop
[i
])
1517 if (end_vgprs
> 0) {
1518 aco_ptr
<Instruction
> destr
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_end_linear_vgpr
, Format::PSEUDO
, end_vgprs
, 0)};
1520 for (unsigned i
= 0; i
< vgpr_spill_temps
.size(); i
++) {
1521 if (reload_in_loop
[i
])
1522 destr
->operands
[k
++] = Operand(vgpr_spill_temps
[i
]);
1523 reload_in_loop
[i
] = false;
1525 /* find insertion point */
1526 std::vector
<aco_ptr
<Instruction
>>::iterator it
= block
.instructions
.begin();
1527 while ((*it
)->opcode
== aco_opcode::p_linear_phi
|| (*it
)->opcode
== aco_opcode::p_phi
)
1529 block
.instructions
.insert(it
, std::move(destr
));
1533 if (block
.kind
& block_kind_top_level
&& !block
.linear_preds
.empty()) {
1534 last_top_level_block_idx
= block
.index
;
1536 /* check if any spilled variables use a created linear vgpr, otherwise destroy them */
1537 for (unsigned i
= 0; i
< vgpr_spill_temps
.size(); i
++) {
1538 if (vgpr_spill_temps
[i
] == Temp())
1541 bool can_destroy
= true;
1542 for (std::pair
<Temp
, uint32_t> pair
: ctx
.spills_exit
[block
.linear_preds
[0]]) {
1544 if (ctx
.interferences
[pair
.second
].first
.type() == RegType::sgpr
&&
1545 slots
[pair
.second
] / ctx
.wave_size
== i
) {
1546 can_destroy
= false;
1551 vgpr_spill_temps
[i
] = Temp();
1555 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1556 std::vector
<aco_ptr
<Instruction
>> instructions
;
1557 instructions
.reserve(block
.instructions
.size());
1558 Builder
bld(ctx
.program
, &instructions
);
1559 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1561 if ((*it
)->opcode
== aco_opcode::p_spill
) {
1562 uint32_t spill_id
= (*it
)->operands
[1].constantValue();
1564 if (!ctx
.is_reloaded
[spill_id
]) {
1565 /* never reloaded, so don't spill */
1566 } else if (!is_assigned
[spill_id
]) {
1567 unreachable("No spill slot assigned for spill id");
1568 } else if (ctx
.interferences
[spill_id
].first
.type() == RegType::vgpr
) {
1570 ctx
.program
->config
->spilled_vgprs
+= (*it
)->operands
[0].size();
1571 uint32_t spill_slot
= slots
[spill_id
];
1572 bool add_offset_to_sgpr
= ctx
.program
->config
->scratch_bytes_per_wave
/ ctx
.program
->wave_size
+ vgpr_spill_slots
* 4 > 4096;
1573 unsigned base_offset
= add_offset_to_sgpr
? 0 : ctx
.program
->config
->scratch_bytes_per_wave
/ ctx
.program
->wave_size
;
1575 /* check if the scratch resource descriptor already exists */
1576 if (scratch_rsrc
== Temp()) {
1577 unsigned offset
= add_offset_to_sgpr
? ctx
.program
->config
->scratch_bytes_per_wave
: 0;
1578 scratch_rsrc
= load_scratch_resource(ctx
, scratch_offset
,
1579 last_top_level_block_idx
== block
.index
?
1580 instructions
: ctx
.program
->blocks
[last_top_level_block_idx
].instructions
,
1582 last_top_level_block_idx
== block
.index
);
1585 unsigned offset
= base_offset
+ spill_slot
* 4;
1586 aco_opcode opcode
= aco_opcode::buffer_store_dword
;
1587 assert((*it
)->operands
[0].isTemp());
1588 Temp temp
= (*it
)->operands
[0].getTemp();
1589 assert(temp
.type() == RegType::vgpr
&& !temp
.is_linear());
1590 if (temp
.size() > 1) {
1591 Instruction
* split
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_split_vector
, Format::PSEUDO
, 1, temp
.size())};
1592 split
->operands
[0] = Operand(temp
);
1593 for (unsigned i
= 0; i
< temp
.size(); i
++)
1594 split
->definitions
[i
] = bld
.def(v1
);
1596 for (unsigned i
= 0; i
< temp
.size(); i
++) {
1597 Instruction
*instr
= bld
.mubuf(opcode
, scratch_rsrc
, Operand(v1
), scratch_offset
, split
->definitions
[i
].getTemp(), offset
+ i
* 4, false, true);
1598 static_cast<MUBUF_instruction
*>(instr
)->sync
= memory_sync_info(storage_vgpr_spill
, semantic_private
);
1601 Instruction
*instr
= bld
.mubuf(opcode
, scratch_rsrc
, Operand(v1
), scratch_offset
, temp
, offset
, false, true);
1602 static_cast<MUBUF_instruction
*>(instr
)->sync
= memory_sync_info(storage_vgpr_spill
, semantic_private
);
1605 ctx
.program
->config
->spilled_sgprs
+= (*it
)->operands
[0].size();
1607 uint32_t spill_slot
= slots
[spill_id
];
1609 /* check if the linear vgpr already exists */
1610 if (vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
] == Temp()) {
1611 Temp linear_vgpr
= {ctx
.program
->allocateId(), v1
.as_linear()};
1612 vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
] = linear_vgpr
;
1613 aco_ptr
<Pseudo_instruction
> create
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_start_linear_vgpr
, Format::PSEUDO
, 0, 1)};
1614 create
->definitions
[0] = Definition(linear_vgpr
);
1615 /* find the right place to insert this definition */
1616 if (last_top_level_block_idx
== block
.index
) {
1617 /* insert right before the current instruction */
1618 instructions
.emplace_back(std::move(create
));
1620 assert(last_top_level_block_idx
< block
.index
);
1621 /* insert before the branch at last top level block */
1622 std::vector
<aco_ptr
<Instruction
>>& instructions
= ctx
.program
->blocks
[last_top_level_block_idx
].instructions
;
1623 instructions
.insert(std::next(instructions
.begin(), instructions
.size() - 1), std::move(create
));
1627 /* spill sgpr: just add the vgpr temp to operands */
1628 Pseudo_instruction
* spill
= create_instruction
<Pseudo_instruction
>(aco_opcode::p_spill
, Format::PSEUDO
, 3, 0);
1629 spill
->operands
[0] = Operand(vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
]);
1630 spill
->operands
[1] = Operand(spill_slot
% ctx
.wave_size
);
1631 spill
->operands
[2] = (*it
)->operands
[0];
1632 instructions
.emplace_back(aco_ptr
<Instruction
>(spill
));
1635 } else if ((*it
)->opcode
== aco_opcode::p_reload
) {
1636 uint32_t spill_id
= (*it
)->operands
[0].constantValue();
1637 assert(ctx
.is_reloaded
[spill_id
]);
1639 if (!is_assigned
[spill_id
]) {
1640 unreachable("No spill slot assigned for spill id");
1641 } else if (ctx
.interferences
[spill_id
].first
.type() == RegType::vgpr
) {
1643 uint32_t spill_slot
= slots
[spill_id
];
1644 bool add_offset_to_sgpr
= ctx
.program
->config
->scratch_bytes_per_wave
/ ctx
.program
->wave_size
+ vgpr_spill_slots
* 4 > 4096;
1645 unsigned base_offset
= add_offset_to_sgpr
? 0 : ctx
.program
->config
->scratch_bytes_per_wave
/ ctx
.program
->wave_size
;
1647 /* check if the scratch resource descriptor already exists */
1648 if (scratch_rsrc
== Temp()) {
1649 unsigned offset
= add_offset_to_sgpr
? ctx
.program
->config
->scratch_bytes_per_wave
: 0;
1650 scratch_rsrc
= load_scratch_resource(ctx
, scratch_offset
,
1651 last_top_level_block_idx
== block
.index
?
1652 instructions
: ctx
.program
->blocks
[last_top_level_block_idx
].instructions
,
1654 last_top_level_block_idx
== block
.index
);
1657 unsigned offset
= base_offset
+ spill_slot
* 4;
1658 aco_opcode opcode
= aco_opcode::buffer_load_dword
;
1659 Definition def
= (*it
)->definitions
[0];
1660 if (def
.size() > 1) {
1661 Instruction
* vec
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_create_vector
, Format::PSEUDO
, def
.size(), 1)};
1662 vec
->definitions
[0] = def
;
1663 for (unsigned i
= 0; i
< def
.size(); i
++) {
1664 Temp tmp
= bld
.tmp(v1
);
1665 vec
->operands
[i
] = Operand(tmp
);
1666 Instruction
*instr
= bld
.mubuf(opcode
, Definition(tmp
), scratch_rsrc
, Operand(v1
), scratch_offset
, offset
+ i
* 4, false, true);
1667 static_cast<MUBUF_instruction
*>(instr
)->sync
= memory_sync_info(storage_vgpr_spill
, semantic_private
);
1671 Instruction
*instr
= bld
.mubuf(opcode
, def
, scratch_rsrc
, Operand(v1
), scratch_offset
, offset
, false, true);
1672 static_cast<MUBUF_instruction
*>(instr
)->sync
= memory_sync_info(storage_vgpr_spill
, semantic_private
);
1675 uint32_t spill_slot
= slots
[spill_id
];
1676 reload_in_loop
[spill_slot
/ ctx
.wave_size
] = block
.loop_nest_depth
> 0;
1678 /* check if the linear vgpr already exists */
1679 if (vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
] == Temp()) {
1680 Temp linear_vgpr
= {ctx
.program
->allocateId(), v1
.as_linear()};
1681 vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
] = linear_vgpr
;
1682 aco_ptr
<Pseudo_instruction
> create
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_start_linear_vgpr
, Format::PSEUDO
, 0, 1)};
1683 create
->definitions
[0] = Definition(linear_vgpr
);
1684 /* find the right place to insert this definition */
1685 if (last_top_level_block_idx
== block
.index
) {
1686 /* insert right before the current instruction */
1687 instructions
.emplace_back(std::move(create
));
1689 assert(last_top_level_block_idx
< block
.index
);
1690 /* insert before the branch at last top level block */
1691 std::vector
<aco_ptr
<Instruction
>>& instructions
= ctx
.program
->blocks
[last_top_level_block_idx
].instructions
;
1692 instructions
.insert(std::next(instructions
.begin(), instructions
.size() - 1), std::move(create
));
1696 /* reload sgpr: just add the vgpr temp to operands */
1697 Pseudo_instruction
* reload
= create_instruction
<Pseudo_instruction
>(aco_opcode::p_reload
, Format::PSEUDO
, 2, 1);
1698 reload
->operands
[0] = Operand(vgpr_spill_temps
[spill_slot
/ ctx
.wave_size
]);
1699 reload
->operands
[1] = Operand(spill_slot
% ctx
.wave_size
);
1700 reload
->definitions
[0] = (*it
)->definitions
[0];
1701 instructions
.emplace_back(aco_ptr
<Instruction
>(reload
));
1703 } else if (!ctx
.remat_used
.count(it
->get()) || ctx
.remat_used
[it
->get()]) {
1704 instructions
.emplace_back(std::move(*it
));
1708 block
.instructions
= std::move(instructions
);
1711 /* update required scratch memory */
1712 ctx
.program
->config
->scratch_bytes_per_wave
+= align(vgpr_spill_slots
* 4 * ctx
.program
->wave_size
, 1024);
1714 /* SSA elimination inserts copies for logical phis right before p_logical_end
1715 * So if a linear vgpr is used between that p_logical_end and the branch,
1716 * we need to ensure logical phis don't choose a definition which aliases
1717 * the linear vgpr.
1718 * TODO: Moving the spills and reloads to before p_logical_end might produce
1719 * slightly better code. */
1720 for (Block
& block
: ctx
.program
->blocks
) {
1721 /* loops exits are already handled */
1722 if (block
.logical_preds
.size() <= 1)
1725 bool has_logical_phis
= false;
1726 for (aco_ptr
<Instruction
>& instr
: block
.instructions
) {
1727 if (instr
->opcode
== aco_opcode::p_phi
) {
1728 has_logical_phis
= true;
1730 } else if (instr
->opcode
!= aco_opcode::p_linear_phi
) {
1734 if (!has_logical_phis
)
1737 std::set
<Temp
> vgprs
;
1738 for (unsigned pred_idx
: block
.logical_preds
) {
1739 Block
& pred
= ctx
.program
->blocks
[pred_idx
];
1740 for (int i
= pred
.instructions
.size() - 1; i
>= 0; i
--) {
1741 aco_ptr
<Instruction
>& pred_instr
= pred
.instructions
[i
];
1742 if (pred_instr
->opcode
== aco_opcode::p_logical_end
) {
1744 } else if (pred_instr
->opcode
== aco_opcode::p_spill
||
1745 pred_instr
->opcode
== aco_opcode::p_reload
) {
1746 vgprs
.insert(pred_instr
->operands
[0].getTemp());
1753 aco_ptr
<Instruction
> destr
{create_instruction
<Pseudo_instruction
>(aco_opcode::p_end_linear_vgpr
, Format::PSEUDO
, vgprs
.size(), 0)};
1755 for (Temp tmp
: vgprs
) {
1756 destr
->operands
[k
++] = Operand(tmp
);
1758 /* find insertion point */
1759 std::vector
<aco_ptr
<Instruction
>>::iterator it
= block
.instructions
.begin();
1760 while ((*it
)->opcode
== aco_opcode::p_linear_phi
|| (*it
)->opcode
== aco_opcode::p_phi
)
1762 block
.instructions
.insert(it
, std::move(destr
));
1766 } /* end namespace */
1769 void spill(Program
* program
, live
& live_vars
, const struct radv_nir_compiler_options
*options
)
1771 program
->config
->spilled_vgprs
= 0;
1772 program
->config
->spilled_sgprs
= 0;
1774 /* no spilling when register pressure is low enough */
1775 if (program
->num_waves
> 0)
1778 /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
1779 lower_to_cssa(program
, live_vars
, options
);
1781 /* calculate target register demand */
1782 RegisterDemand register_target
= program
->max_reg_demand
;
1783 if (register_target
.sgpr
> program
->sgpr_limit
)
1784 register_target
.vgpr
+= (register_target
.sgpr
- program
->sgpr_limit
+ program
->wave_size
- 1 + 32) / program
->wave_size
;
1785 register_target
.sgpr
= program
->sgpr_limit
;
1787 if (register_target
.vgpr
> program
->vgpr_limit
)
1788 register_target
.sgpr
= program
->sgpr_limit
- 5;
1789 int spills_to_vgpr
= (program
->max_reg_demand
.sgpr
- register_target
.sgpr
+ program
->wave_size
- 1 + 32) / program
->wave_size
;
1790 register_target
.vgpr
= program
->vgpr_limit
- spills_to_vgpr
;
1792 /* initialize ctx */
1793 spill_ctx
ctx(register_target
, program
, live_vars
.register_demand
);
1794 compute_global_next_uses(ctx
);
1795 get_rematerialize_info(ctx
);
1797 /* create spills and reloads */
1798 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++)
1799 spill_block(ctx
, i
);
1801 /* assign spill slots and DCE rematerialized code */
1802 assign_spill_slots(ctx
, spills_to_vgpr
);
1804 /* update live variable information */
1805 live_vars
= live_var_analysis(program
, options
);
1807 assert(program
->num_waves
> 0);