aco: create pseudo dummy instruction in RA to be used for live-range splits
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37
38 namespace aco {
39 namespace {
40
41 struct assignment {
42 PhysReg reg;
43 RegClass rc;
44 uint8_t assigned = 0;
45 assignment() = default;
46 assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
47 };
48
49 struct phi_info {
50 Instruction* phi;
51 unsigned block_idx;
52 std::set<Instruction*> uses;
53 };
54
55 struct ra_ctx {
56 std::bitset<512> war_hint;
57 Program* program;
58 std::vector<assignment> assignments;
59 std::vector<std::unordered_map<unsigned, Temp>> renames;
60 std::vector<std::vector<Instruction*>> incomplete_phis;
61 std::vector<bool> filled;
62 std::vector<bool> sealed;
63 std::unordered_map<unsigned, Temp> orig_names;
64 std::unordered_map<unsigned, phi_info> phi_map;
65 std::unordered_map<unsigned, unsigned> affinities;
66 std::unordered_map<unsigned, Instruction*> vectors;
67 aco_ptr<Instruction> pseudo_dummy;
68 unsigned max_used_sgpr = 0;
69 unsigned max_used_vgpr = 0;
70 std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
71
72 ra_ctx(Program* program) : program(program),
73 assignments(program->peekAllocationId()),
74 renames(program->blocks.size()),
75 incomplete_phis(program->blocks.size()),
76 filled(program->blocks.size()),
77 sealed(program->blocks.size())
78 {
79 pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
80 }
81 };
82
83 class RegisterFile {
84 public:
85 RegisterFile() {regs.fill(0);}
86
87 std::array<uint32_t, 512> regs;
88 std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
89
90 const uint32_t& operator [] (unsigned index) const {
91 return regs[index];
92 }
93
94 uint32_t& operator [] (unsigned index) {
95 return regs[index];
96 }
97
98 unsigned count_zero(PhysReg start, unsigned size) {
99 unsigned res = 0;
100 for (unsigned i = 0; i < size; i++)
101 res += !regs[start + i];
102 return res;
103 }
104
105 bool test(PhysReg start, unsigned num_bytes) {
106 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
107 if (regs[i] & 0x0FFFFFFF)
108 return true;
109 if (regs[i] == 0xF0000000) {
110 assert(subdword_regs.find(i) != subdword_regs.end());
111 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
112 if (subdword_regs[i][j])
113 return true;
114 }
115 }
116 }
117 return false;
118 }
119
120 void block(PhysReg start, unsigned num_bytes) {
121 if (start.byte() || num_bytes % 4)
122 fill_subdword(start, num_bytes, 0xFFFFFFFF);
123 else
124 fill(start, num_bytes / 4, 0xFFFFFFFF);
125 }
126
127 bool is_blocked(PhysReg start) {
128 if (regs[start] == 0xFFFFFFFF)
129 return true;
130 if (regs[start] == 0xF0000000) {
131 for (unsigned i = start.byte(); i < 4; i++)
132 if (subdword_regs[start][i] == 0xFFFFFFFF)
133 return true;
134 }
135 return false;
136 }
137
138 void clear(PhysReg start, RegClass rc) {
139 if (rc.is_subdword())
140 fill_subdword(start, rc.bytes(), 0);
141 else
142 fill(start, rc.size(), 0);
143 }
144
145 void fill(Operand op) {
146 if (op.regClass().is_subdword())
147 fill_subdword(op.physReg(), op.bytes(), op.tempId());
148 else
149 fill(op.physReg(), op.size(), op.tempId());
150 }
151
152 void clear(Operand op) {
153 clear(op.physReg(), op.regClass());
154 }
155
156 void fill(Definition def) {
157 if (def.regClass().is_subdword())
158 fill_subdword(def.physReg(), def.bytes(), def.tempId());
159 else
160 fill(def.physReg(), def.size(), def.tempId());
161 }
162
163 void clear(Definition def) {
164 clear(def.physReg(), def.regClass());
165 }
166
167 private:
168 void fill(PhysReg start, unsigned size, uint32_t val) {
169 for (unsigned i = 0; i < size; i++)
170 regs[start + i] = val;
171 }
172
173 void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
174 fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
175 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
176 /* emplace or get */
177 std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
178 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
179 sub[j] = val;
180
181 if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
182 subdword_regs.erase(i);
183 regs[i] = 0;
184 }
185 }
186 }
187 };
188
189
190 /* helper function for debugging */
191 #if 0
192 void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
193 {
194 unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
195 unsigned lb = vgprs ? 256 : 0;
196 unsigned ub = lb + max;
197 char reg_char = vgprs ? 'v' : 's';
198
199 /* print markers */
200 printf(" ");
201 for (unsigned i = lb; i < ub; i += 3) {
202 printf("%.2u ", i - lb);
203 }
204 printf("\n");
205
206 /* print usage */
207 printf("%cgprs: ", reg_char);
208 unsigned free_regs = 0;
209 unsigned prev = 0;
210 bool char_select = false;
211 for (unsigned i = lb; i < ub; i++) {
212 if (reg_file[i] == 0xFFFF) {
213 printf("~");
214 } else if (reg_file[i]) {
215 if (reg_file[i] != prev) {
216 prev = reg_file[i];
217 char_select = !char_select;
218 }
219 printf(char_select ? "#" : "@");
220 } else {
221 free_regs++;
222 printf(".");
223 }
224 }
225 printf("\n");
226
227 printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);
228
229 /* print assignments */
230 prev = 0;
231 unsigned size = 0;
232 for (unsigned i = lb; i < ub; i++) {
233 if (reg_file[i] != prev) {
234 if (prev && size > 1)
235 printf("-%d]\n", i - 1 - lb);
236 else if (prev)
237 printf("]\n");
238 prev = reg_file[i];
239 if (prev && prev != 0xFFFF) {
240 if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
241 printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
242 else
243 printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
244 }
245 size = 1;
246 } else {
247 size++;
248 }
249 }
250 if (prev && size > 1)
251 printf("-%d]\n", ub - lb - 1);
252 else if (prev)
253 printf("]\n");
254 }
255 #endif
256
257
258 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
259 {
260 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
261 unsigned size = rc.size();
262 if (rc.type() == RegType::vgpr) {
263 assert(reg >= 256);
264 unsigned hi = reg - 256 + size - 1;
265 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
266 } else if (reg + rc.size() <= max_addressible_sgpr) {
267 unsigned hi = reg + size - 1;
268 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
269 }
270 }
271
272
273 void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
274 std::vector<std::pair<Operand, Definition>>& parallelcopies,
275 aco_ptr<Instruction>& instr)
276 {
277 /* allocate id's and rename operands: this is done transparently here */
278 for (std::pair<Operand, Definition>& copy : parallelcopies) {
279 /* the definitions with id are not from this function and already handled */
280 if (copy.second.isTemp())
281 continue;
282
283 /* check if we we moved another parallelcopy definition */
284 for (std::pair<Operand, Definition>& other : parallelcopies) {
285 if (!other.second.isTemp())
286 continue;
287 if (copy.first.getTemp() == other.second.getTemp()) {
288 copy.first.setTemp(other.first.getTemp());
289 copy.first.setFixed(other.first.physReg());
290 }
291 }
292 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
293 copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
294 ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
295 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
296 reg_file.fill(copy.second);
297
298 /* check if we moved an operand */
299 for (Operand& op : instr->operands) {
300 if (!op.isTemp())
301 continue;
302 if (op.tempId() == copy.first.tempId()) {
303 bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
304 for (std::pair<Operand, Definition>& pc : parallelcopies) {
305 PhysReg def_reg = pc.second.physReg();
306 omit_renaming &= def_reg > copy.first.physReg() ?
307 (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
308 (def_reg + pc.second.size() <= copy.first.physReg().reg());
309 }
310 if (omit_renaming)
311 continue;
312 op.setTemp(copy.second.getTemp());
313 op.setFixed(copy.second.physReg());
314 }
315 }
316 }
317 }
318
319 bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
320 {
321 return instr->isSDWA() || instr->format == Format::PSEUDO;
322 }
323
324 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
325 RegisterFile& reg_file,
326 uint32_t lb, uint32_t ub,
327 uint32_t size, uint32_t stride,
328 RegClass rc)
329 {
330 if (rc.is_subdword()) {
331 for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
332 assert(reg_file[entry.first] == 0xF0000000);
333 if (lb > entry.first || entry.first >= ub)
334 continue;
335
336 for (unsigned i = 0; i < 4; i+= stride) {
337 if (entry.second[i] != 0)
338 continue;
339
340 bool reg_found = true;
341 for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
342 reg_found &= entry.second[i + j] == 0;
343
344 /* check neighboring reg if needed */
345 reg_found &= (i <= 4 - rc.bytes() || reg_file[entry.first + 1] == 0);
346 if (reg_found) {
347 PhysReg res{entry.first};
348 res.reg_b += i;
349 return {res, true};
350 }
351 }
352 }
353
354 stride = 1; /* stride in full registers */
355 }
356
357 /* best fit algorithm: find the smallest gap to fit in the variable */
358 if (stride == 1) {
359 unsigned best_pos = 0xFFFF;
360 unsigned gap_size = 0xFFFF;
361 unsigned next_pos = 0xFFFF;
362
363 for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
364 if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
365 if (next_pos == 0xFFFF)
366 continue;
367
368 /* check if the variable fits */
369 if (next_pos + size > current_reg) {
370 next_pos = 0xFFFF;
371 continue;
372 }
373
374 /* check if the tested gap is smaller */
375 if (current_reg - next_pos < gap_size) {
376 best_pos = next_pos;
377 gap_size = current_reg - next_pos;
378 }
379 next_pos = 0xFFFF;
380 continue;
381 }
382
383 if (next_pos == 0xFFFF)
384 next_pos = current_reg;
385 }
386
387 /* final check */
388 if (next_pos != 0xFFFF &&
389 next_pos + size <= ub &&
390 ub - next_pos < gap_size) {
391 best_pos = next_pos;
392 gap_size = ub - next_pos;
393 }
394 if (best_pos != 0xFFFF) {
395 adjust_max_used_regs(ctx, rc, best_pos);
396 return {PhysReg{best_pos}, true};
397 }
398 return {{}, false};
399 }
400
401 bool found = false;
402 unsigned reg_lo = lb;
403 unsigned reg_hi = lb + size - 1;
404 while (!found && reg_lo + size <= ub) {
405 if (reg_file[reg_lo] != 0) {
406 reg_lo += stride;
407 continue;
408 }
409 reg_hi = reg_lo + size - 1;
410 found = true;
411 for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
412 if (reg_file[reg] != 0 || ctx.war_hint[reg])
413 found = false;
414 }
415 if (found) {
416 adjust_max_used_regs(ctx, rc, reg_lo);
417 return {PhysReg{reg_lo}, true};
418 }
419
420 reg_lo += stride;
421 }
422
423 return {{}, false};
424 }
425
426 /* collect variables from a register area and clear reg_file */
427 std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
428 PhysReg reg, unsigned size)
429 {
430 std::set<std::pair<unsigned, unsigned>> vars;
431 for (unsigned j = reg; j < reg + size; j++) {
432 if (reg_file.is_blocked(PhysReg{j}))
433 continue;
434 if (reg_file[j] == 0xF0000000) {
435 for (unsigned k = 0; k < 4; k++) {
436 unsigned id = reg_file.subdword_regs[j][k];
437 if (id) {
438 assignment& var = ctx.assignments[id];
439 vars.emplace(var.rc.bytes(), id);
440 reg_file.clear(var.reg, var.rc);
441 if (!reg_file[j])
442 break;
443 }
444 }
445 } else if (reg_file[j] != 0) {
446 unsigned id = reg_file[j];
447 assignment& var = ctx.assignments[id];
448 vars.emplace(var.rc.bytes(), id);
449 reg_file.clear(var.reg, var.rc);
450 }
451 }
452 return vars;
453 }
454
455 bool get_regs_for_copies(ra_ctx& ctx,
456 RegisterFile& reg_file,
457 std::vector<std::pair<Operand, Definition>>& parallelcopies,
458 const std::set<std::pair<unsigned, unsigned>> &vars,
459 uint32_t lb, uint32_t ub,
460 aco_ptr<Instruction>& instr,
461 uint32_t def_reg_lo,
462 uint32_t def_reg_hi)
463 {
464
465 /* variables are sorted from small sized to large */
466 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
467 for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
468 unsigned id = it->second;
469 assignment& var = ctx.assignments[id];
470 uint32_t size = var.rc.size();
471 uint32_t stride = 1;
472 if (var.rc.type() == RegType::sgpr) {
473 if (size == 2)
474 stride = 2;
475 if (size > 3)
476 stride = 4;
477 }
478
479 /* check if this is a dead operand, then we can re-use the space from the definition */
480 bool is_dead_operand = false;
481 for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
482 if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
483 is_dead_operand = true;
484 }
485
486 std::pair<PhysReg, bool> res;
487 if (is_dead_operand) {
488 if (instr->opcode == aco_opcode::p_create_vector) {
489 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
490 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
491 PhysReg reg(def_reg_lo);
492 reg.reg_b += offset;
493 assert(!reg_file.test(reg, var.rc.bytes()));
494 res = {reg, true};
495 break;
496 }
497 }
498 } else {
499 res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.rc);
500 }
501 } else {
502 res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.rc);
503 if (!res.second) {
504 unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
505 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.rc);
506 }
507 }
508
509 if (res.second) {
510 /* mark the area as blocked */
511 reg_file.block(res.first, var.rc.bytes());
512
513 /* create parallelcopy pair (without definition id) */
514 Temp tmp = Temp(id, var.rc);
515 Operand pc_op = Operand(tmp);
516 pc_op.setFixed(var.reg);
517 Definition pc_def = Definition(res.first, pc_op.regClass());
518 parallelcopies.emplace_back(pc_op, pc_def);
519 continue;
520 }
521
522 unsigned best_pos = lb;
523 unsigned num_moves = 0xFF;
524 unsigned num_vars = 0;
525
526 /* we use a sliding window to find potential positions */
527 unsigned reg_lo = lb;
528 unsigned reg_hi = lb + size - 1;
529 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
530 if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
531 (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
532 continue;
533
534 /* second, check that we have at most k=num_moves elements in the window
535 * and no element is larger than the currently processed one */
536 unsigned k = 0;
537 unsigned n = 0;
538 unsigned last_var = 0;
539 bool found = true;
540 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
541 if (reg_file[j] == 0 || reg_file[j] == last_var)
542 continue;
543
544 if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
545 found = false;
546 break;
547 }
548 if (reg_file[j] == 0xF0000000) {
549 k += 1;
550 n++;
551 continue;
552 }
553 /* we cannot split live ranges of linear vgprs */
554 if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
555 found = false;
556 break;
557 }
558 bool is_kill = false;
559 for (const Operand& op : instr->operands) {
560 if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
561 is_kill = true;
562 break;
563 }
564 }
565 if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
566 found = false;
567 break;
568 }
569
570 k += ctx.assignments[reg_file[j]].rc.size();
571 last_var = reg_file[j];
572 n++;
573 if (k > num_moves || (k == num_moves && n <= num_vars)) {
574 found = false;
575 break;
576 }
577 }
578
579 if (found) {
580 best_pos = reg_lo;
581 num_moves = k;
582 num_vars = n;
583 }
584 }
585
586 /* FIXME: we messed up and couldn't find space for the variables to be copied */
587 if (num_moves == 0xFF)
588 return false;
589
590 reg_lo = best_pos;
591 reg_hi = best_pos + size - 1;
592
593 /* collect variables and block reg file */
594 std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);
595
596 /* mark the area as blocked */
597 reg_file.block(PhysReg{reg_lo}, size * 4);
598
599 if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
600 return false;
601
602 adjust_max_used_regs(ctx, var.rc, reg_lo);
603
604 /* create parallelcopy pair (without definition id) */
605 Temp tmp = Temp(id, var.rc);
606 Operand pc_op = Operand(tmp);
607 pc_op.setFixed(var.reg);
608 Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
609 parallelcopies.emplace_back(pc_op, pc_def);
610 }
611
612 return true;
613 }
614
615
616 std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
617 RegisterFile& reg_file,
618 std::vector<std::pair<Operand, Definition>>& parallelcopies,
619 uint32_t lb, uint32_t ub,
620 uint32_t size, uint32_t stride,
621 RegClass rc,
622 aco_ptr<Instruction>& instr)
623 {
624 /* check how many free regs we have */
625 unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);
626
627 /* mark and count killed operands */
628 unsigned killed_ops = 0;
629 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
630 if (instr->operands[j].isTemp() &&
631 instr->operands[j].isFirstKillBeforeDef() &&
632 instr->operands[j].physReg() >= lb &&
633 instr->operands[j].physReg() < ub) {
634 assert(instr->operands[j].isFixed());
635 assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
636 reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
637 killed_ops += instr->operands[j].getTemp().size();
638 }
639 }
640
641 assert(regs_free >= size);
642 /* we might have to move dead operands to dst in order to make space */
643 unsigned op_moves = 0;
644
645 if (size > (regs_free - killed_ops))
646 op_moves = size - (regs_free - killed_ops);
647
648 /* find the best position to place the definition */
649 unsigned best_pos = lb;
650 unsigned num_moves = 0xFF;
651 unsigned num_vars = 0;
652
653 /* we use a sliding window to check potential positions */
654 unsigned reg_lo = lb;
655 unsigned reg_hi = lb + size - 1;
656 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
657 /* first check the edges: this is what we have to fix to allow for num_moves > size */
658 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
659 continue;
660 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
661 continue;
662
663 /* second, check that we have at most k=num_moves elements in the window
664 * and no element is larger than the currently processed one */
665 unsigned k = op_moves;
666 unsigned n = 0;
667 unsigned remaining_op_moves = op_moves;
668 unsigned last_var = 0;
669 bool found = true;
670 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
671 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
672 if (reg_file[j] == 0 || reg_file[j] == last_var)
673 continue;
674
675 /* dead operands effectively reduce the number of estimated moves */
676 if (reg_file.is_blocked(PhysReg{j})) {
677 if (remaining_op_moves) {
678 k--;
679 remaining_op_moves--;
680 }
681 continue;
682 }
683
684 if (reg_file[j] == 0xF0000000) {
685 k += 1;
686 n++;
687 continue;
688 }
689
690 if (ctx.assignments[reg_file[j]].rc.size() >= size) {
691 found = false;
692 break;
693 }
694
695 /* we cannot split live ranges of linear vgprs */
696 if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
697 found = false;
698 break;
699 }
700
701 k += ctx.assignments[reg_file[j]].rc.size();
702 n++;
703 last_var = reg_file[j];
704 }
705
706 if (!found || k > num_moves)
707 continue;
708 if (k == num_moves && n < num_vars)
709 continue;
710 if (!aligned && k == num_moves && n == num_vars)
711 continue;
712
713 if (found) {
714 best_pos = reg_lo;
715 num_moves = k;
716 num_vars = n;
717 }
718 }
719
720 if (num_moves == 0xFF) {
721 /* remove killed operands from reg_file once again */
722 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
723 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
724 reg_file.clear(instr->operands[i]);
725 }
726 for (unsigned i = 0; i < instr->definitions.size(); i++) {
727 Definition def = instr->definitions[i];
728 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
729 reg_file.fill(def);
730 }
731 return {{}, false};
732 }
733
734 RegisterFile register_file = reg_file;
735
736 /* now, we figured the placement for our definition */
737 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
738
739 if (instr->opcode == aco_opcode::p_create_vector) {
740 /* move killed operands which aren't yet at the correct position */
741 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
742 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
743 instr->operands[i].getTemp().type() == rc.type()) {
744
745 if (instr->operands[i].physReg() != best_pos + offset) {
746 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
747 reg_file.clear(instr->operands[i]);
748 } else {
749 reg_file.fill(instr->operands[i]);
750 }
751 }
752 }
753 } else {
754 /* re-enable the killed operands */
755 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
756 if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
757 reg_file.fill(instr->operands[j]);
758 }
759 }
760
761 std::vector<std::pair<Operand, Definition>> pc;
762 if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
763 reg_file = std::move(register_file);
764 /* remove killed operands from reg_file once again */
765 if (!is_phi(instr)) {
766 for (const Operand& op : instr->operands) {
767 if (op.isTemp() && op.isFirstKillBeforeDef())
768 reg_file.clear(op);
769 }
770 }
771 for (unsigned i = 0; i < instr->definitions.size(); i++) {
772 Definition& def = instr->definitions[i];
773 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
774 reg_file.fill(def);
775 }
776 return {{}, false};
777 }
778
779 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
780
781 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
782 reg_file.clear(PhysReg{best_pos}, rc);
783
784 update_renames(ctx, reg_file, parallelcopies, instr);
785
786 /* remove killed operands from reg_file once again */
787 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
788 if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
789 continue;
790 assert(!instr->operands[i].isUndefined());
791 if (instr->operands[i].isFirstKillBeforeDef())
792 reg_file.clear(instr->operands[i]);
793 }
794 for (unsigned i = 0; i < instr->definitions.size(); i++) {
795 Definition def = instr->definitions[i];
796 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
797 reg_file.fill(def);
798 }
799
800 adjust_max_used_regs(ctx, rc, best_pos);
801 return {PhysReg{best_pos}, true};
802 }
803
804 bool get_reg_specified(ra_ctx& ctx,
805 RegisterFile& reg_file,
806 RegClass rc,
807 std::vector<std::pair<Operand, Definition>>& parallelcopies,
808 aco_ptr<Instruction>& instr,
809 PhysReg reg)
810 {
811 if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
812 return false;
813
814 uint32_t size = rc.size();
815 uint32_t stride = 1;
816 uint32_t lb, ub;
817
818 if (rc.type() == RegType::vgpr) {
819 lb = 256;
820 ub = 256 + ctx.program->max_reg_demand.vgpr;
821 } else {
822 if (size == 2)
823 stride = 2;
824 else if (size >= 4)
825 stride = 4;
826 if (reg % stride != 0)
827 return false;
828 lb = 0;
829 ub = ctx.program->max_reg_demand.sgpr;
830 }
831
832 uint32_t reg_lo = reg.reg();
833 uint32_t reg_hi = reg + (size - 1);
834
835 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
836 return false;
837
838 if (reg_file.test(reg, rc.bytes()))
839 return false;
840
841 adjust_max_used_regs(ctx, rc, reg_lo);
842 return true;
843 }
844
845 std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
846 RegisterFile& reg_file,
847 RegClass rc)
848 {
849 uint32_t size = rc.size();
850 uint32_t stride = 1;
851 uint32_t lb, ub;
852 if (rc.type() == RegType::vgpr) {
853 lb = 256;
854 ub = 256 + ctx.program->max_reg_demand.vgpr;
855 } else {
856 lb = 0;
857 ub = ctx.program->max_reg_demand.sgpr;
858 if (size == 2)
859 stride = 2;
860 else if (size >= 4)
861 stride = 4;
862 }
863 return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
864 }
865
866 PhysReg get_reg(ra_ctx& ctx,
867 RegisterFile& reg_file,
868 Temp temp,
869 std::vector<std::pair<Operand, Definition>>& parallelcopies,
870 aco_ptr<Instruction>& instr)
871 {
872 if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
873 ctx.assignments[ctx.affinities[temp.id()]].assigned) {
874 PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
875 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
876 return reg;
877 }
878
879 if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
880 Instruction* vec = ctx.vectors[temp.id()];
881 unsigned byte_offset = 0;
882 for (const Operand& op : vec->operands) {
883 if (op.isTemp() && op.tempId() == temp.id())
884 break;
885 else
886 byte_offset += op.bytes();
887 }
888 unsigned k = 0;
889 for (const Operand& op : vec->operands) {
890 if (op.isTemp() &&
891 op.tempId() != temp.id() &&
892 op.getTemp().type() == temp.type() &&
893 ctx.assignments[op.tempId()].assigned) {
894 PhysReg reg = ctx.assignments[op.tempId()].reg;
895 reg.reg_b += (byte_offset - k);
896 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
897 return reg;
898 }
899 k += op.bytes();
900 }
901
902 std::pair<PhysReg, bool> res = get_reg_vec(ctx, reg_file, vec->definitions[0].regClass());
903 PhysReg reg = res.first;
904 if (res.second) {
905 reg.reg_b += byte_offset;
906 /* make sure to only use byte offset if the instruction supports it */
907 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
908 return reg;
909 }
910 }
911
912 RegClass rc = temp.regClass();
913 uint32_t size = rc.size();
914 uint32_t stride = 1;
915 uint32_t lb, ub;
916 if (rc.type() == RegType::vgpr) {
917 lb = 256;
918 ub = 256 + ctx.program->max_reg_demand.vgpr;
919 } else {
920 lb = 0;
921 ub = ctx.program->max_reg_demand.sgpr;
922 if (size == 2)
923 stride = 2;
924 else if (size >= 4)
925 stride = 4;
926 }
927
928 if (rc.is_subdword()) {
929 /* stride in bytes */
930 if(!instr_can_access_subdword(instr))
931 stride = 4;
932 else if (rc.bytes() % 4 == 0)
933 stride = 4;
934 else if (rc.bytes() % 2 == 0)
935 stride = 2;
936 }
937
938 std::pair<PhysReg, bool> res = {{}, false};
939 /* try to find space without live-range splits */
940 if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
941 res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
942 if (!res.second)
943 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
944 if (res.second)
945 return res.first;
946
947 /* try to find space with live-range splits */
948 res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
949
950 if (res.second)
951 return res.first;
952
953 /* try using more registers */
954
955 /* We should only fail here because keeping under the limit would require
956 * too many moves. */
957 assert(reg_file.count_zero(PhysReg{lb}, ub-lb) >= size);
958
959 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
960 uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
961 if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
962 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
963 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
964 } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
965 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
966 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
967 }
968
969 //FIXME: if nothing helps, shift-rotate the registers to make space
970
971 unreachable("did not find a register");
972 }
973
974 PhysReg get_reg_create_vector(ra_ctx& ctx,
975 RegisterFile& reg_file,
976 Temp temp,
977 std::vector<std::pair<Operand, Definition>>& parallelcopies,
978 aco_ptr<Instruction>& instr)
979 {
980 RegClass rc = temp.regClass();
981 /* create_vector instructions have different costs w.r.t. register coalescing */
982 uint32_t size = rc.size();
983 uint32_t bytes = rc.bytes();
984 uint32_t stride = 1;
985 uint32_t lb, ub;
986 if (rc.type() == RegType::vgpr) {
987 lb = 256;
988 ub = 256 + ctx.program->max_reg_demand.vgpr;
989 } else {
990 lb = 0;
991 ub = ctx.program->max_reg_demand.sgpr;
992 if (size == 2)
993 stride = 2;
994 else if (size >= 4)
995 stride = 4;
996 }
997
998 //TODO: improve p_create_vector for sub-dword vectors
999
1000 unsigned best_pos = -1;
1001 unsigned num_moves = 0xFF;
1002 bool best_war_hint = true;
1003
1004 /* test for each operand which definition placement causes the least shuffle instructions */
1005 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1006 // TODO: think about, if we can alias live operands on the same register
1007 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
1008 continue;
1009
1010 if (offset > instr->operands[i].physReg().reg_b)
1011 continue;
1012
1013 unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
1014 if (reg_lo % 4)
1015 continue;
1016 reg_lo /= 4;
1017 unsigned reg_hi = reg_lo + size - 1;
1018 unsigned k = 0;
1019
1020 /* no need to check multiple times */
1021 if (reg_lo == best_pos)
1022 continue;
1023
1024 /* check borders */
1025 // TODO: this can be improved */
1026 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
1027 continue;
1028 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
1029 continue;
1030 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
1031 continue;
1032
1033 /* count variables to be moved and check war_hint */
1034 bool war_hint = false;
1035 bool linear_vgpr = false;
1036 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
1037 if (reg_file[j] != 0) {
1038 if (reg_file[j] == 0xF0000000) {
1039 PhysReg reg;
1040 reg.reg_b = j * 4;
1041 unsigned bytes_left = bytes - (j - reg_lo) * 4;
1042 for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
1043 k += reg_file.test(reg, 1);
1044 } else {
1045 k += 4;
1046 /* we cannot split live ranges of linear vgprs */
1047 if (ctx.assignments[reg_file[j]].rc & (1 << 6))
1048 linear_vgpr = true;
1049 }
1050 }
1051 war_hint |= ctx.war_hint[j];
1052 }
1053 if (linear_vgpr || (war_hint && !best_war_hint))
1054 continue;
1055
1056 /* count operands in wrong positions */
1057 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
1058 if (j == i ||
1059 !instr->operands[j].isTemp() ||
1060 instr->operands[j].getTemp().type() != rc.type())
1061 continue;
1062 if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
1063 k += instr->operands[j].bytes();
1064 }
1065 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
1066 if (k > num_moves || (!aligned && k == num_moves))
1067 continue;
1068
1069 best_pos = reg_lo;
1070 num_moves = k;
1071 best_war_hint = war_hint;
1072 }
1073
1074 if (num_moves >= bytes)
1075 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1076
1077 /* collect variables to be moved */
1078 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1079
1080 /* move killed operands which aren't yet at the correct position */
1081 uint64_t moved_operand_mask = 0;
1082 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1083 if (instr->operands[i].isTemp() &&
1084 instr->operands[i].isFirstKillBeforeDef() &&
1085 instr->operands[i].getTemp().type() == rc.type() &&
1086 instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
1087 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1088 moved_operand_mask |= (uint64_t)1 << i;
1089 }
1090 }
1091
1092 ASSERTED bool success = false;
1093 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1094 assert(success);
1095
1096 update_renames(ctx, reg_file, parallelcopies, instr);
1097 adjust_max_used_regs(ctx, rc, best_pos);
1098
1099 while (moved_operand_mask) {
1100 unsigned i = u_bit_scan64(&moved_operand_mask);
1101 assert(instr->operands[i].isFirstKillBeforeDef());
1102 reg_file.clear(instr->operands[i]);
1103 }
1104
1105 return PhysReg{best_pos};
1106 }
1107
1108 void handle_pseudo(ra_ctx& ctx,
1109 const RegisterFile& reg_file,
1110 Instruction* instr)
1111 {
1112 if (instr->format != Format::PSEUDO)
1113 return;
1114
1115 /* all instructions which use handle_operands() need this information */
1116 switch (instr->opcode) {
1117 case aco_opcode::p_extract_vector:
1118 case aco_opcode::p_create_vector:
1119 case aco_opcode::p_split_vector:
1120 case aco_opcode::p_parallelcopy:
1121 case aco_opcode::p_wqm:
1122 break;
1123 default:
1124 return;
1125 }
1126
1127 /* if all definitions are vgpr, no need to care for SCC */
1128 bool writes_sgpr = false;
1129 for (Definition& def : instr->definitions) {
1130 if (def.getTemp().type() == RegType::sgpr) {
1131 writes_sgpr = true;
1132 break;
1133 }
1134 }
1135 /* if all operands are constant, no need to care either */
1136 bool reads_sgpr = false;
1137 for (Operand& op : instr->operands) {
1138 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1139 reads_sgpr = true;
1140 break;
1141 }
1142 }
1143 if (!(writes_sgpr && reads_sgpr))
1144 return;
1145
1146 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1147 if (reg_file[scc.reg()]) {
1148 pi->tmp_in_scc = true;
1149
1150 int reg = ctx.max_used_sgpr;
1151 for (; reg >= 0 && reg_file[reg]; reg--)
1152 ;
1153 if (reg < 0) {
1154 reg = ctx.max_used_sgpr + 1;
1155 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1156 ;
1157 assert(reg < ctx.program->max_reg_demand.sgpr);
1158 }
1159
1160 adjust_max_used_regs(ctx, s1, reg);
1161 pi->scratch_sgpr = PhysReg{(unsigned)reg};
1162 } else {
1163 pi->tmp_in_scc = false;
1164 }
1165 }
1166
1167 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1168 {
1169 if (instr->operands[idx].isFixed())
1170 return instr->operands[idx].physReg() == reg;
1171
1172 if (!instr_can_access_subdword(instr) && reg.byte())
1173 return false;
1174
1175 switch (instr->format) {
1176 case Format::SMEM:
1177 return reg != scc &&
1178 reg != exec &&
1179 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1180 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1181 default:
1182 // TODO: there are more instructions with restrictions on registers
1183 return true;
1184 }
1185 }
1186
1187 void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
1188 std::vector<std::pair<Operand, Definition>>& parallelcopy,
1189 aco_ptr<Instruction>& instr, Operand& operand)
1190 {
1191 /* check if the operand is fixed */
1192 PhysReg dst;
1193 if (operand.isFixed()) {
1194 assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);
1195
1196 /* check if target reg is blocked, and move away the blocking var */
1197 if (register_file[operand.physReg().reg()]) {
1198 assert(register_file[operand.physReg()] != 0xF0000000);
1199 uint32_t blocking_id = register_file[operand.physReg().reg()];
1200 RegClass rc = ctx.assignments[blocking_id].rc;
1201 Operand pc_op = Operand(Temp{blocking_id, rc});
1202 pc_op.setFixed(operand.physReg());
1203
1204 /* find free reg */
1205 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
1206 Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
1207 register_file.clear(pc_op);
1208 parallelcopy.emplace_back(pc_op, pc_def);
1209 }
1210 dst = operand.physReg();
1211
1212 } else {
1213 dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr);
1214 }
1215
1216 Operand pc_op = operand;
1217 pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
1218 Definition pc_def = Definition(dst, pc_op.regClass());
1219 register_file.clear(pc_op);
1220 parallelcopy.emplace_back(pc_op, pc_def);
1221 update_renames(ctx, register_file, parallelcopy, instr);
1222 }
1223
1224 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1225 {
1226 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1227 if (it == ctx.renames[block_idx].end())
1228 return val;
1229 else
1230 return it->second;
1231 }
1232
1233 Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
1234 {
1235 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1236 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
1237 return val;
1238
1239 assert(preds.size() > 0);
1240
1241 Temp new_val;
1242 if (!ctx.sealed[block->index]) {
1243 /* consider rename from already processed predecessor */
1244 Temp tmp = read_variable(ctx, val, preds[0]);
1245
1246 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1247 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1248 ctx.assignments.emplace_back();
1249 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1250 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1251 phi->definitions[0] = Definition(new_val);
1252 for (unsigned i = 0; i < preds.size(); i++)
1253 phi->operands[i] = Operand(val);
1254 if (tmp.regClass() == new_val.regClass())
1255 ctx.affinities[new_val.id()] = tmp.id();
1256
1257 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1258 ctx.incomplete_phis[block->index].emplace_back(phi.get());
1259 block->instructions.insert(block->instructions.begin(), std::move(phi));
1260
1261 } else if (preds.size() == 1) {
1262 /* if the block has only one predecessor, just look there for the name */
1263 new_val = read_variable(ctx, val, preds[0]);
1264 } else {
1265 /* there are multiple predecessors and the block is sealed */
1266 Temp ops[preds.size()];
1267
1268 /* get the rename from each predecessor and check if they are the same */
1269 bool needs_phi = false;
1270 for (unsigned i = 0; i < preds.size(); i++) {
1271 ops[i] = read_variable(ctx, val, preds[i]);
1272 if (i == 0)
1273 new_val = ops[i];
1274 else
1275 needs_phi |= !(new_val == ops[i]);
1276 }
1277
1278 if (needs_phi) {
1279 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1280 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1281 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1282 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1283 phi->definitions[0] = Definition(new_val);
1284 for (unsigned i = 0; i < preds.size(); i++) {
1285 phi->operands[i] = Operand(ops[i]);
1286 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
1287 if (ops[i].regClass() == new_val.regClass())
1288 ctx.affinities[new_val.id()] = ops[i].id();
1289 }
1290 ctx.assignments.emplace_back();
1291 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1292 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1293 block->instructions.insert(block->instructions.begin(), std::move(phi));
1294 }
1295 }
1296
1297 if (new_val != val) {
1298 ctx.renames[block->index][val.id()] = new_val;
1299 ctx.orig_names[new_val.id()] = val;
1300 }
1301 return new_val;
1302 }
1303
1304 void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
1305 {
1306 std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());
1307
1308 if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
1309 return;
1310
1311 assert(info->second.block_idx != 0);
1312 Instruction* phi = info->second.phi;
1313 Temp same = Temp();
1314 Definition def = phi->definitions[0];
1315
1316 /* a phi node is trivial if all operands are the same as the definition of the phi */
1317 for (const Operand& op : phi->operands) {
1318 const Temp t = op.getTemp();
1319 if (t == same || t == def.getTemp()) {
1320 assert(t == same || op.physReg() == def.physReg());
1321 continue;
1322 }
1323 if (same != Temp())
1324 return;
1325
1326 same = t;
1327 }
1328 assert(same != Temp() || same == def.getTemp());
1329
1330 /* reroute all uses to same and remove phi */
1331 std::vector<Temp> phi_users;
1332 std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
1333 for (Instruction* instr : info->second.uses) {
1334 assert(phi != instr);
1335 /* recursively try to remove trivial phis */
1336 if (is_phi(instr)) {
1337 /* ignore if the phi was already flagged trivial */
1338 if (instr->definitions.empty())
1339 continue;
1340
1341 if (instr->definitions[0].getTemp() != temp)
1342 phi_users.emplace_back(instr->definitions[0].getTemp());
1343 }
1344 for (Operand& op : instr->operands) {
1345 if (op.isTemp() && op.tempId() == def.tempId()) {
1346 op.setTemp(same);
1347 if (same_phi_info != ctx.phi_map.end())
1348 same_phi_info->second.uses.emplace(instr);
1349 }
1350 }
1351 }
1352
1353 auto it = ctx.orig_names.find(same.id());
1354 unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
1355 for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
1356 auto it = ctx.renames[i].find(orig_var);
1357 if (it != ctx.renames[i].end() && it->second == def.getTemp())
1358 ctx.renames[i][orig_var] = same;
1359 }
1360
1361 phi->definitions.clear(); /* this indicates that the phi can be removed */
1362 ctx.phi_map.erase(info);
1363 for (Temp t : phi_users)
1364 try_remove_trivial_phi(ctx, t);
1365
1366 return;
1367 }
1368
1369 } /* end namespace */
1370
1371
1372 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
1373 {
1374 ra_ctx ctx(program);
1375 std::vector<std::vector<Temp>> phi_ressources;
1376 std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1377
1378 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1379 Block& block = *it;
1380
1381 /* first, compute the death points of all live vars within the block */
1382 TempSet& live = live_out_per_block[block.index];
1383
1384 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1385 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1386 aco_ptr<Instruction>& instr = *rit;
1387 if (is_phi(instr)) {
1388 live.erase(instr->definitions[0].getTemp());
1389 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
1390 continue;
1391 /* collect information about affinity-related temporaries */
1392 std::vector<Temp> affinity_related;
1393 /* affinity_related[0] is the last seen affinity-related temp */
1394 affinity_related.emplace_back(instr->definitions[0].getTemp());
1395 affinity_related.emplace_back(instr->definitions[0].getTemp());
1396 for (const Operand& op : instr->operands) {
1397 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1398 affinity_related.emplace_back(op.getTemp());
1399 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1400 }
1401 }
1402 phi_ressources.emplace_back(std::move(affinity_related));
1403 continue;
1404 }
1405
1406 /* add vector affinities */
1407 if (instr->opcode == aco_opcode::p_create_vector) {
1408 for (const Operand& op : instr->operands) {
1409 if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1410 ctx.vectors[op.tempId()] = instr.get();
1411 }
1412 }
1413
1414 /* add operands to live variables */
1415 for (const Operand& op : instr->operands) {
1416 if (op.isTemp())
1417 live.emplace(op.getTemp());
1418 }
1419
1420 /* erase definitions from live */
1421 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1422 const Definition& def = instr->definitions[i];
1423 if (!def.isTemp())
1424 continue;
1425 live.erase(def.getTemp());
1426 /* mark last-seen phi operand */
1427 std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1428 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1429 phi_ressources[it->second][0] = def.getTemp();
1430 /* try to coalesce phi affinities with parallelcopies */
1431 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
1432 Operand op = instr->operands[i];
1433 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1434 phi_ressources[it->second].emplace_back(op.getTemp());
1435 temp_to_phi_ressources[op.tempId()] = it->second;
1436 }
1437 }
1438 }
1439 }
1440 }
1441 }
1442 /* create affinities */
1443 for (std::vector<Temp>& vec : phi_ressources) {
1444 assert(vec.size() > 1);
1445 for (unsigned i = 1; i < vec.size(); i++)
1446 if (vec[i].id() != vec[0].id())
1447 ctx.affinities[vec[i].id()] = vec[0].id();
1448 }
1449
1450 /* state of register file after phis */
1451 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1452
1453 for (Block& block : program->blocks) {
1454 TempSet& live = live_out_per_block[block.index];
1455 /* initialize register file */
1456 assert(block.index != 0 || live.empty());
1457 RegisterFile register_file;
1458 ctx.war_hint.reset();
1459
1460 for (Temp t : live) {
1461 Temp renamed = handle_live_in(ctx, t, &block);
1462 assignment& var = ctx.assignments[renamed.id()];
1463 /* due to live-range splits, the live-in might be a phi, now */
1464 if (var.assigned)
1465 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1466 }
1467
1468 std::vector<aco_ptr<Instruction>> instructions;
1469 std::vector<aco_ptr<Instruction>>::iterator it;
1470
1471 /* this is a slight adjustment from the paper as we already have phi nodes:
1472 * We consider them incomplete phis and only handle the definition. */
1473
1474 /* handle fixed phi definitions */
1475 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1476 aco_ptr<Instruction>& phi = *it;
1477 if (!is_phi(phi))
1478 break;
1479 Definition& definition = phi->definitions[0];
1480 if (!definition.isFixed())
1481 continue;
1482
1483 /* check if a dead exec mask phi is needed */
1484 if (definition.isKill()) {
1485 for (Operand& op : phi->operands) {
1486 assert(op.isTemp());
1487 if (!ctx.assignments[op.tempId()].assigned ||
1488 ctx.assignments[op.tempId()].reg != exec) {
1489 definition.setKill(false);
1490 break;
1491 }
1492 }
1493 }
1494
1495 if (definition.isKill())
1496 continue;
1497
1498 assert(definition.physReg() == exec);
1499 assert(!register_file.test(definition.physReg(), definition.bytes()));
1500 register_file.fill(definition);
1501 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1502 }
1503
1504 /* look up the affinities */
1505 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1506 aco_ptr<Instruction>& phi = *it;
1507 if (!is_phi(phi))
1508 break;
1509 Definition& definition = phi->definitions[0];
1510 if (definition.isKill() || definition.isFixed())
1511 continue;
1512
1513 if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1514 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1515 assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1516 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1517 bool try_use_special_reg = reg == scc || reg == exec;
1518 if (try_use_special_reg) {
1519 for (const Operand& op : phi->operands) {
1520 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1521 ctx.assignments[op.tempId()].reg == reg)) {
1522 try_use_special_reg = false;
1523 break;
1524 }
1525 }
1526 if (!try_use_special_reg)
1527 continue;
1528 }
1529 /* only assign if register is still free */
1530 if (!register_file.test(reg, definition.bytes())) {
1531 definition.setFixed(reg);
1532 register_file.fill(definition);
1533 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1534 }
1535 }
1536 }
1537
1538 /* find registers for phis without affinity or where the register was blocked */
1539 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1540 aco_ptr<Instruction>& phi = *it;
1541 if (!is_phi(phi))
1542 break;
1543
1544 Definition& definition = phi->definitions[0];
1545 if (definition.isKill())
1546 continue;
1547
1548 if (!definition.isFixed()) {
1549 std::vector<std::pair<Operand, Definition>> parallelcopy;
1550 /* try to find a register that is used by at least one operand */
1551 for (const Operand& op : phi->operands) {
1552 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1553 continue;
1554 PhysReg reg = ctx.assignments[op.tempId()].reg;
1555 /* we tried this already on the previous loop */
1556 if (reg == scc || reg == exec)
1557 continue;
1558 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1559 definition.setFixed(reg);
1560 break;
1561 }
1562 }
1563 if (!definition.isFixed())
1564 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
1565
1566 /* process parallelcopy */
1567 for (std::pair<Operand, Definition> pc : parallelcopy) {
1568 /* see if it's a copy from a different phi */
1569 //TODO: prefer moving some previous phis over live-ins
1570 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1571 Instruction *prev_phi = NULL;
1572 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1573 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1574 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1575 prev_phi = phi_it->get();
1576 }
1577 phi_it = it;
1578 while (!prev_phi && is_phi(*++phi_it)) {
1579 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1580 prev_phi = phi_it->get();
1581 }
1582 if (prev_phi) {
1583 /* if so, just update that phi's register */
1584 register_file.clear(prev_phi->definitions[0]);
1585 prev_phi->definitions[0].setFixed(pc.second.physReg());
1586 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1587 register_file.fill(prev_phi->definitions[0]);
1588 continue;
1589 }
1590
1591 /* rename */
1592 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1593 Temp orig = pc.first.getTemp();
1594 if (orig_it != ctx.orig_names.end())
1595 orig = orig_it->second;
1596 else
1597 ctx.orig_names[pc.second.tempId()] = orig;
1598 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1599
1600 /* otherwise, this is a live-in and we need to create a new phi
1601 * to move it in this block's predecessors */
1602 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1603 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1604 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1605 new_phi->definitions[0] = pc.second;
1606 for (unsigned i = 0; i < preds.size(); i++)
1607 new_phi->operands[i] = Operand(pc.first);
1608 instructions.emplace_back(std::move(new_phi));
1609 }
1610
1611 register_file.fill(definition);
1612 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1613 }
1614 live.emplace(definition.getTemp());
1615
1616 /* update phi affinities */
1617 for (const Operand& op : phi->operands) {
1618 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1619 ctx.affinities[op.tempId()] = definition.tempId();
1620 }
1621
1622 instructions.emplace_back(std::move(*it));
1623 }
1624
1625 /* fill in sgpr_live_in */
1626 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1627 sgpr_live_in[block.index][i] = register_file[i];
1628 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1629
1630 /* Handle all other instructions of the block */
1631 for (; it != block.instructions.end(); ++it) {
1632 aco_ptr<Instruction>& instr = *it;
1633
1634 /* parallelcopies from p_phi are inserted here which means
1635 * live ranges of killed operands end here as well */
1636 if (instr->opcode == aco_opcode::p_logical_end) {
1637 /* no need to process this instruction any further */
1638 if (block.logical_succs.size() != 1) {
1639 instructions.emplace_back(std::move(instr));
1640 continue;
1641 }
1642
1643 Block& succ = program->blocks[block.logical_succs[0]];
1644 unsigned idx = 0;
1645 for (; idx < succ.logical_preds.size(); idx++) {
1646 if (succ.logical_preds[idx] == block.index)
1647 break;
1648 }
1649 for (aco_ptr<Instruction>& phi : succ.instructions) {
1650 if (phi->opcode == aco_opcode::p_phi) {
1651 if (phi->operands[idx].isTemp() &&
1652 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1653 phi->operands[idx].isFirstKillBeforeDef()) {
1654 Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
1655 PhysReg reg = ctx.assignments[phi_op.id()].reg;
1656 assert(register_file[reg] == phi_op.id());
1657 register_file[reg] = 0;
1658 }
1659 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1660 break;
1661 }
1662 }
1663 instructions.emplace_back(std::move(instr));
1664 continue;
1665 }
1666
1667 std::vector<std::pair<Operand, Definition>> parallelcopy;
1668
1669 assert(!is_phi(instr));
1670
1671 /* handle operands */
1672 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1673 auto& operand = instr->operands[i];
1674 if (!operand.isTemp())
1675 continue;
1676
1677 /* rename operands */
1678 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
1679 assert(ctx.assignments[operand.tempId()].assigned);
1680
1681 PhysReg reg = ctx.assignments[operand.tempId()].reg;
1682 if (operand_can_use_reg(instr, i, reg))
1683 operand.setFixed(reg);
1684 else
1685 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
1686
1687 if (instr->format == Format::EXP ||
1688 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
1689 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1690 for (unsigned j = 0; j < operand.size(); j++)
1691 ctx.war_hint.set(operand.physReg().reg() + j);
1692 }
1693
1694 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
1695 if (phi != ctx.phi_map.end())
1696 phi->second.uses.emplace(instr.get());
1697 }
1698
1699 /* remove dead vars from register file */
1700 for (const Operand& op : instr->operands) {
1701 if (op.isTemp() && op.isFirstKillBeforeDef())
1702 register_file.clear(op);
1703 }
1704
1705 /* try to optimize v_mad_f32 -> v_mac_f32 */
1706 if (instr->opcode == aco_opcode::v_mad_f32 &&
1707 instr->operands[2].isTemp() &&
1708 instr->operands[2].isKillBeforeDef() &&
1709 instr->operands[2].getTemp().type() == RegType::vgpr &&
1710 instr->operands[1].isTemp() &&
1711 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1712 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1713 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1714 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1715 vop3->clamp || vop3->omod || vop3->opsel);
1716 if (can_use_mac) {
1717 instr->format = Format::VOP2;
1718 instr->opcode = aco_opcode::v_mac_f32;
1719 }
1720 }
1721
1722 /* handle definitions which must have the same register as an operand */
1723 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1724 instr->opcode == aco_opcode::v_mac_f32 ||
1725 instr->opcode == aco_opcode::v_writelane_b32 ||
1726 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1727 instr->definitions[0].setFixed(instr->operands[2].physReg());
1728 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1729 instr->opcode == aco_opcode::s_mulk_i32) {
1730 instr->definitions[0].setFixed(instr->operands[0].physReg());
1731 } else if (instr->format == Format::MUBUF &&
1732 instr->definitions.size() == 1 &&
1733 instr->operands.size() == 4) {
1734 instr->definitions[0].setFixed(instr->operands[3].physReg());
1735 } else if (instr->format == Format::MIMG &&
1736 instr->definitions.size() == 1 &&
1737 instr->operands[1].regClass().type() == RegType::vgpr) {
1738 instr->definitions[0].setFixed(instr->operands[1].physReg());
1739 }
1740
1741 ctx.defs_done.reset();
1742
1743 /* handle fixed definitions first */
1744 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1745 auto& definition = instr->definitions[i];
1746 if (!definition.isFixed())
1747 continue;
1748
1749 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1750 /* check if the target register is blocked */
1751 if (register_file[definition.physReg().reg()] != 0) {
1752 /* create parallelcopy pair to move blocking var */
1753 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
1754 Operand pc_op = Operand(tmp);
1755 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
1756 RegClass rc = pc_op.regClass();
1757 tmp = Temp{program->allocateId(), rc};
1758 Definition pc_def = Definition(tmp);
1759
1760 /* re-enable the killed operands, so that we don't move the blocking var there */
1761 for (const Operand& op : instr->operands) {
1762 if (op.isTemp() && op.isFirstKillBeforeDef())
1763 register_file.fill(op);
1764 }
1765
1766 /* find a new register for the blocking variable */
1767 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
1768 /* once again, disable killed operands */
1769 for (const Operand& op : instr->operands) {
1770 if (op.isTemp() && op.isFirstKillBeforeDef())
1771 register_file.clear(op);
1772 }
1773 for (unsigned k = 0; k < i; k++) {
1774 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1775 register_file.fill(instr->definitions[k]);
1776 }
1777 pc_def.setFixed(reg);
1778
1779 /* finish assignment of parallelcopy */
1780 ctx.assignments.emplace_back(reg, pc_def.regClass());
1781 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1782 parallelcopy.emplace_back(pc_op, pc_def);
1783
1784 /* add changes to reg_file */
1785 register_file.clear(pc_op);
1786 register_file.fill(pc_def);
1787 }
1788 ctx.defs_done.set(i);
1789
1790 if (!definition.isTemp())
1791 continue;
1792
1793 /* set live if it has a kill point */
1794 if (!definition.isKill())
1795 live.emplace(definition.getTemp());
1796
1797 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1798 register_file.fill(definition);
1799 }
1800
1801 /* handle all other definitions */
1802 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1803 auto& definition = instr->definitions[i];
1804
1805 if (definition.isFixed() || !definition.isTemp())
1806 continue;
1807
1808 /* find free reg */
1809 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1810 definition.setFixed(definition.physReg());
1811 else if (instr->opcode == aco_opcode::p_split_vector) {
1812 PhysReg reg = instr->operands[0].physReg();
1813 reg.reg_b += i * definition.bytes();
1814 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1815 definition.setFixed(reg);
1816 } else if (instr->opcode == aco_opcode::p_wqm) {
1817 PhysReg reg;
1818 if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1819 reg = instr->operands[0].physReg();
1820 definition.setFixed(reg);
1821 assert(register_file[reg.reg()] == 0);
1822 }
1823 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1824 PhysReg reg;
1825 if (instr->operands[0].isKillBeforeDef() &&
1826 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1827 reg = instr->operands[0].physReg();
1828 reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
1829 assert(!register_file.test(reg, definition.bytes()));
1830 definition.setFixed(reg);
1831 }
1832 } else if (instr->opcode == aco_opcode::p_create_vector) {
1833 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
1834 parallelcopy, instr);
1835 definition.setFixed(reg);
1836 }
1837
1838 if (!definition.isFixed())
1839 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
1840
1841 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1842 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1843 ctx.defs_done.set(i);
1844
1845 /* set live if it has a kill point */
1846 if (!definition.isKill())
1847 live.emplace(definition.getTemp());
1848
1849 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1850 register_file.fill(definition);
1851 }
1852
1853 handle_pseudo(ctx, register_file, instr.get());
1854
1855 /* kill definitions and late-kill operands */
1856 for (const Definition& def : instr->definitions) {
1857 if (def.isTemp() && def.isKill())
1858 register_file.clear(def);
1859 }
1860 for (const Operand& op : instr->operands) {
1861 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1862 register_file.clear(op);
1863 }
1864
1865 /* emit parallelcopy */
1866 if (!parallelcopy.empty()) {
1867 aco_ptr<Pseudo_instruction> pc;
1868 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1869 bool temp_in_scc = register_file[scc.reg()];
1870 bool sgpr_operands_alias_defs = false;
1871 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1872 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1873 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1874 if (!sgpr_operands_alias_defs) {
1875 unsigned reg = parallelcopy[i].first.physReg().reg();
1876 unsigned size = parallelcopy[i].first.getTemp().size();
1877 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1878
1879 reg = parallelcopy[i].second.physReg().reg();
1880 size = parallelcopy[i].second.getTemp().size();
1881 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1882 sgpr_operands_alias_defs = true;
1883 }
1884 }
1885
1886 pc->operands[i] = parallelcopy[i].first;
1887 pc->definitions[i] = parallelcopy[i].second;
1888 assert(pc->operands[i].size() == pc->definitions[i].size());
1889
1890 /* it might happen that the operand is already renamed. we have to restore the original name. */
1891 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1892 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1893 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1894 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1895
1896 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
1897 if (phi != ctx.phi_map.end())
1898 phi->second.uses.emplace(pc.get());
1899 }
1900
1901 if (temp_in_scc && sgpr_operands_alias_defs) {
1902 /* disable definitions and re-enable operands */
1903 for (const Definition& def : instr->definitions) {
1904 if (def.isTemp() && !def.isKill())
1905 register_file.clear(def);
1906 }
1907 for (const Operand& op : instr->operands) {
1908 if (op.isTemp() && op.isFirstKill())
1909 register_file.block(op.physReg(), op.bytes());
1910 }
1911
1912 handle_pseudo(ctx, register_file, pc.get());
1913
1914 /* re-enable live vars */
1915 for (const Operand& op : instr->operands) {
1916 if (op.isTemp() && op.isFirstKill())
1917 register_file.clear(op);
1918 }
1919 for (const Definition& def : instr->definitions) {
1920 if (def.isTemp() && !def.isKill())
1921 register_file.fill(def);
1922 }
1923 } else {
1924 pc->tmp_in_scc = false;
1925 }
1926
1927 instructions.emplace_back(std::move(pc));
1928 }
1929
1930 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1931 bool instr_needs_vop3 = !instr->isVOP3() &&
1932 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1933 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1934 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1935 instr->opcode == aco_opcode::v_addc_co_u32 ||
1936 instr->opcode == aco_opcode::v_sub_co_u32 ||
1937 instr->opcode == aco_opcode::v_subb_co_u32 ||
1938 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1939 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1940 !(instr->definitions[1].physReg() == vcc)) ||
1941 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1942 instr->opcode == aco_opcode::v_subb_co_u32 ||
1943 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1944 !(instr->operands[2].physReg() == vcc)));
1945 if (instr_needs_vop3) {
1946
1947 /* if the first operand is a literal, we have to move it to a reg */
1948 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
1949 bool can_sgpr = true;
1950 /* check, if we have to move to vgpr */
1951 for (const Operand& op : instr->operands) {
1952 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1953 can_sgpr = false;
1954 break;
1955 }
1956 }
1957 /* disable definitions and re-enable operands */
1958 for (const Definition& def : instr->definitions)
1959 register_file.clear(def);
1960 for (const Operand& op : instr->operands) {
1961 if (op.isTemp() && op.isFirstKill())
1962 register_file.block(op.physReg(), op.bytes());
1963 }
1964 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
1965 ctx.assignments.emplace_back();
1966 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
1967
1968 aco_ptr<Instruction> mov;
1969 if (can_sgpr)
1970 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1971 else
1972 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1973 mov->operands[0] = instr->operands[0];
1974 mov->definitions[0] = Definition(tmp);
1975 mov->definitions[0].setFixed(reg);
1976
1977 instr->operands[0] = Operand(tmp);
1978 instr->operands[0].setFixed(reg);
1979 instructions.emplace_back(std::move(mov));
1980 /* re-enable live vars */
1981 for (const Operand& op : instr->operands) {
1982 if (op.isTemp() && op.isFirstKill())
1983 register_file.clear(op);
1984 }
1985 for (const Definition& def : instr->definitions) {
1986 if (def.isTemp() && !def.isKill())
1987 register_file.fill(def);
1988 }
1989 }
1990
1991 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1992 aco_ptr<Instruction> tmp = std::move(instr);
1993 Format format = asVOP3(tmp->format);
1994 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
1995 for (unsigned i = 0; i < instr->operands.size(); i++) {
1996 Operand& operand = tmp->operands[i];
1997 instr->operands[i] = operand;
1998 /* keep phi_map up to date */
1999 if (operand.isTemp()) {
2000 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.tempId());
2001 if (phi != ctx.phi_map.end()) {
2002 phi->second.uses.erase(tmp.get());
2003 phi->second.uses.emplace(instr.get());
2004 }
2005 }
2006 }
2007 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2008 }
2009 instructions.emplace_back(std::move(*it));
2010
2011 } /* end for Instr */
2012
2013 block.instructions = std::move(instructions);
2014
2015 ctx.filled[block.index] = true;
2016 for (unsigned succ_idx : block.linear_succs) {
2017 Block& succ = program->blocks[succ_idx];
2018 /* seal block if all predecessors are filled */
2019 bool all_filled = true;
2020 for (unsigned pred_idx : succ.linear_preds) {
2021 if (!ctx.filled[pred_idx]) {
2022 all_filled = false;
2023 break;
2024 }
2025 }
2026 if (all_filled) {
2027 ctx.sealed[succ_idx] = true;
2028
2029 /* finish incomplete phis and check if they became trivial */
2030 for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2031 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2032 for (unsigned i = 0; i < phi->operands.size(); i++) {
2033 phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2034 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2035 }
2036 try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2037 }
2038 /* complete the original phi nodes, but no need to check triviality */
2039 for (aco_ptr<Instruction>& instr : succ.instructions) {
2040 if (!is_phi(instr))
2041 break;
2042 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2043
2044 for (unsigned i = 0; i < instr->operands.size(); i++) {
2045 auto& operand = instr->operands[i];
2046 if (!operand.isTemp())
2047 continue;
2048 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2049 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2050 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2051 if (phi != ctx.phi_map.end())
2052 phi->second.uses.emplace(instr.get());
2053 }
2054 }
2055 }
2056 }
2057 } /* end for BB */
2058
2059 /* remove trivial phis */
2060 for (Block& block : program->blocks) {
2061 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2062 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2063 auto middle = std::remove_if(block.instructions.begin(), end,
2064 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2065 block.instructions.erase(middle, end);
2066 }
2067
2068 /* find scc spill registers which may be needed for parallelcopies created by phis */
2069 for (Block& block : program->blocks) {
2070 if (block.linear_preds.size() <= 1)
2071 continue;
2072
2073 std::bitset<128> regs = sgpr_live_in[block.index];
2074 if (!regs[127])
2075 continue;
2076
2077 /* choose a register */
2078 int16_t reg = 0;
2079 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2080 ;
2081 assert(reg < ctx.program->max_reg_demand.sgpr);
2082 adjust_max_used_regs(ctx, s1, reg);
2083
2084 /* update predecessors */
2085 for (unsigned& pred_index : block.linear_preds) {
2086 Block& pred = program->blocks[pred_index];
2087 pred.scc_live_out = true;
2088 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2089 }
2090 }
2091
2092 /* num_gpr = rnd_up(max_used_gpr + 1) */
2093 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2094 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2095 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2096 else
2097 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2098 }
2099
2100 }