aco: use DefInfo in more places to simplify RA
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37
38 namespace aco {
39 namespace {
40
41 struct assignment {
42 PhysReg reg;
43 RegClass rc;
44 uint8_t assigned = 0;
45 assignment() = default;
46 assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
47 };
48
/* Bookkeeping for one phi instruction: the phi itself, the index of the
 * block containing it, and the set of instructions that use its result
 * (maintained so later phi rewriting can patch the users). */
struct phi_info {
   Instruction* phi;
   unsigned block_idx;
   std::set<Instruction*> uses;
};
54
/* Global state of the register allocator for one Program. */
struct ra_ctx {
   std::bitset<512> war_hint;         /* registers to avoid (write-after-read hazard hints) */
   Program* program;
   std::vector<assignment> assignments;                     /* indexed by temp id */
   std::vector<std::unordered_map<unsigned, Temp>> renames; /* per-block: original temp id -> renamed temp */
   std::vector<std::vector<Instruction*>> incomplete_phis;  /* per-block; presumably phis awaiting sealing — TODO confirm against SSA-repair code outside this chunk */
   std::vector<bool> filled;          /* per-block processing flag (set outside this chunk) */
   std::vector<bool> sealed;          /* per-block processing flag (set outside this chunk) */
   std::unordered_map<unsigned, Temp> orig_names;   /* renamed temp id -> original temp */
   std::unordered_map<unsigned, phi_info> phi_map;
   std::unordered_map<unsigned, unsigned> affinities; /* temp id -> temp id whose register we'd like to share */
   std::unordered_map<unsigned, Instruction*> vectors; /* temp id -> vector instruction consuming it */
   aco_ptr<Instruction> pseudo_dummy; /* stand-in instruction for queries without a real one */
   unsigned max_used_sgpr = 0;
   unsigned max_used_vgpr = 0;
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_ctx(Program* program) : program(program),
                              assignments(program->peekAllocationId()),
                              renames(program->blocks.size()),
                              incomplete_phis(program->blocks.size()),
                              filled(program->blocks.size()),
                              sealed(program->blocks.size())
   {
      pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
   }
};
82
83 bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
84 {
85 return instr->isSDWA() || instr->format == Format::PSEUDO;
86 }
87
/* Describes the register-file search window and alignment requirement for
 * allocating a definition of a given register class. */
struct DefInfo {
   uint16_t lb;    /* inclusive lower bound of the search window */
   uint16_t ub;    /* exclusive upper bound of the search window */
   uint8_t size;   /* size in full dwords */
   uint8_t stride; /* alignment in dwords; in bytes for sub-dword classes */
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
      size = rc.size();
      stride = 1;

      if (rc.type() == RegType::vgpr) {
         /* VGPRs occupy indices 256..511 of the PhysReg space */
         lb = 256;
         ub = 256 + ctx.program->max_reg_demand.vgpr;
      } else {
         lb = 0;
         ub = ctx.program->max_reg_demand.sgpr;
         /* multi-dword SGPRs carry 2- resp. 4-dword alignment requirements */
         if (size == 2)
            stride = 2;
         else if (size >= 4)
            stride = 4;
      }

      if (rc.is_subdword()) {
         /* stride in bytes */
         if(!instr_can_access_subdword(instr))
            stride = 4; /* must start on a dword boundary */
         else if (rc.bytes() % 4 == 0)
            stride = 4;
         else if (rc.bytes() % 2 == 0)
            stride = 2;
      }
   }
};
122
/* Tracks the current occupancy of the physical register file.
 *
 * regs[i] holds the temp id occupying dword register i, with two sentinels:
 *  - 0xFFFFFFFF: the register is blocked (reserved, not assignable)
 *  - 0xF0000000: the register's bytes are tracked individually in
 *    subdword_regs[i] (same id/sentinel encoding per byte). */
class RegisterFile {
public:
   RegisterFile() {regs.fill(0);}

   /* 512 slots: SGPRs at 0..255, VGPRs at 256..511 */
   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator [] (unsigned index) const {
      return regs[index];
   }

   uint32_t& operator [] (unsigned index) {
      return regs[index];
   }

   /* number of completely free dword registers in [start, start+size) */
   unsigned count_zero(PhysReg start, unsigned size) {
      unsigned res = 0;
      for (unsigned i = 0; i < size; i++)
         res += !regs[start + i];
      return res;
   }

   /* returns true if any byte in [start, start+num_bytes) is occupied */
   bool test(PhysReg start, unsigned num_bytes) {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* masking with 0x0FFFFFFF catches real ids and 0xFFFFFFFF, but not
          * the 0xF0000000 subdword marker itself */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* mark [start, start+num_bytes) as blocked */
   void block(PhysReg start, unsigned num_bytes) {
      if (start.byte() || num_bytes % 4)
         fill_subdword(start, num_bytes, 0xFFFFFFFF);
      else
         fill(start, num_bytes / 4, 0xFFFFFFFF);
   }

   /* returns true if the dword at start (or any of its tracked bytes from
    * start.byte() onward) is blocked */
   bool is_blocked(PhysReg start) {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   void clear(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op) {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) {
      clear(op.physReg(), op.regClass());
   }

   void fill(Definition def) {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) {
      clear(def.physReg(), def.regClass());
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val) {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   /* write 'val' into each byte of [start, start+num_bytes), switching the
    * touched dwords to subdword tracking; dwords that end up all-zero are
    * collapsed back to plain free registers */
   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
      /* NOTE(review): DIV_ROUND_UP(num_bytes, 4) does not account for
       * start.byte(), so a range that spills into the next dword may leave
       * that dword's regs[] entry unmarked — verify against callers */
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
228
229
230 /* helper function for debugging */
231 #if 0
/* Dump one bank (SGPR or VGPR) of the register file: an occupancy map
 * followed by the ranges each temp id occupies. Compiled out via #if 0. */
void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf(" ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      /* NOTE(review): compares against 0xFFFF, but the live RegisterFile code
       * uses 0xFFFFFFFF as the blocked sentinel — this dead debug code looks
       * stale; confirm before re-enabling */
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         /* alternate '#'/'@' whenever the occupying temp changes */
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         /* close the previous range before opening a new one */
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   /* close the final open range, if any */
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
295 #endif
296
297
298 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
299 {
300 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
301 unsigned size = rc.size();
302 if (rc.type() == RegType::vgpr) {
303 assert(reg >= 256);
304 unsigned hi = reg - 256 + size - 1;
305 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
306 } else if (reg + rc.size() <= max_addressible_sgpr) {
307 unsigned hi = reg + size - 1;
308 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
309 }
310 }
311
312
/* Allocate fresh temp ids for the parallelcopy definitions created so far and
 * rewrite any operands of 'instr' that were moved by those copies. */
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    aco_ptr<Instruction>& instr)
{
   /* allocate id's and rename operands: this is done transparently here */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (copy.first.getTemp() == other.second.getTemp()) {
            /* chain the copies: read from the other copy's source instead */
            copy.first.setTemp(other.first.getTemp());
            copy.first.setFixed(other.first.physReg());
         }
      }
      // FIXME: if a definition got moved, change the target location and remove the parallelcopy
      copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      reg_file.fill(copy.second);

      /* check if we moved an operand */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* renaming can be skipped for a surviving p_create_vector operand
             * whose register range overlaps no copy destination */
            bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg() ?
                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming)
               continue;
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());
         }
      }
   }
}
358
/* Search the window described by 'info' for a free location, without moving
 * any already-assigned variable. Returns {reg, true} on success and
 * {undefined, false} otherwise. */
std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
                                        RegisterFile& reg_file,
                                        DefInfo info)
{
   uint32_t lb = info.lb;
   uint32_t ub = info.ub;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   if (rc.is_subdword()) {
      /* first try to pack into a register already holding subdword variables;
       * here 'stride' is still a byte stride */
      for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
         assert(reg_file[entry.first] == 0xF0000000);
         if (lb > entry.first || entry.first >= ub)
            continue;

         for (unsigned i = 0; i < 4; i+= stride) {
            if (entry.second[i] != 0)
               continue;

            /* all bytes of the candidate slot must be free */
            bool reg_found = true;
            for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
               reg_found &= entry.second[i + j] == 0;

            /* check neighboring reg if needed */
            reg_found &= (i <= 4 - rc.bytes() || reg_file[entry.first + 1] == 0);
            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               return {res, true};
            }
         }
      }

      stride = 1; /* stride in full registers */
   }

   /* best fit algorithm: find the smallest gap to fit in the variable */
   if (stride == 1) {
      unsigned best_pos = 0xFFFF;
      unsigned gap_size = 0xFFFF;
      unsigned next_pos = 0xFFFF; /* start of the gap currently being scanned */

      for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
         if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
            if (next_pos == 0xFFFF)
               continue;

            /* check if the variable fits */
            if (next_pos + size > current_reg) {
               next_pos = 0xFFFF;
               continue;
            }

            /* check if the tested gap is smaller */
            if (current_reg - next_pos < gap_size) {
               best_pos = next_pos;
               gap_size = current_reg - next_pos;
            }
            next_pos = 0xFFFF;
            continue;
         }

         if (next_pos == 0xFFFF)
            next_pos = current_reg;
      }

      /* final check: a gap that runs to the end of the window */
      if (next_pos != 0xFFFF &&
          next_pos + size <= ub &&
          ub - next_pos < gap_size) {
         best_pos = next_pos;
         gap_size = ub - next_pos;
      }
      if (best_pos != 0xFFFF) {
         adjust_max_used_regs(ctx, rc, best_pos);
         return {PhysReg{best_pos}, true};
      }
      return {{}, false};
   }

   /* strided (aligned) case: take the first fitting slot */
   bool found = false;
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   while (!found && reg_lo + size <= ub) {
      if (reg_file[reg_lo] != 0) {
         reg_lo += stride;
         continue;
      }
      reg_hi = reg_lo + size - 1;
      found = true;
      for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
         if (reg_file[reg] != 0 || ctx.war_hint[reg])
            found = false;
      }
      if (found) {
         adjust_max_used_regs(ctx, rc, reg_lo);
         return {PhysReg{reg_lo}, true};
      }

      reg_lo += stride;
   }

   return {{}, false};
}
464
465 /* collect variables from a register area and clear reg_file */
/* collect variables from a register area and clear reg_file.
 * The returned set holds (size-in-bytes, temp id) pairs, so iteration order
 * is by increasing size (std::set orders pairs lexicographically). */
std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
                                                     PhysReg reg, unsigned size)
{
   std::set<std::pair<unsigned, unsigned>> vars;
   for (unsigned j = reg; j < reg + size; j++) {
      if (reg_file.is_blocked(PhysReg{j}))
         continue;
      if (reg_file[j] == 0xF0000000) {
         /* subdword register: collect the variable behind each occupied byte */
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs[j][k];
            if (id) {
               assignment& var = ctx.assignments[id];
               vars.emplace(var.rc.bytes(), id);
               reg_file.clear(var.reg, var.rc);
               /* clearing may have collapsed the dword back to free */
               if (!reg_file[j])
                  break;
            }
         }
      } else if (reg_file[j] != 0) {
         unsigned id = reg_file[j];
         assignment& var = ctx.assignments[id];
         vars.emplace(var.rc.bytes(), id);
         reg_file.clear(var.reg, var.rc);
      }
   }
   return vars;
}
493
/* Find new homes for the variables in 'vars' — which were evicted from the
 * area [def_reg_lo, def_reg_hi] chosen for a new definition — and append the
 * parallelcopies that realize the moves. May recursively evict further
 * variables. Returns false if no valid placement exists. */
bool get_regs_for_copies(ra_ctx& ctx,
                         RegisterFile& reg_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopies,
                         const std::set<std::pair<unsigned, unsigned>> &vars,
                         uint32_t lb, uint32_t ub,
                         aco_ptr<Instruction>& instr,
                         uint32_t def_reg_lo,
                         uint32_t def_reg_hi)
{

   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
      unsigned id = it->second;
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
            is_dead_operand = true;
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* a vector operand's position inside the definition is fixed by
             * its byte offset */
            for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  PhysReg reg(def_reg_lo);
                  reg.reg_b += offset;
                  assert(!reg_file.test(reg, var.rc.bytes()));
                  res = {reg, true};
                  break;
               }
            }
         } else {
            /* dead operands may be placed anywhere within the definition */
            info.lb = def_reg_lo;
            info.ub = def_reg_hi + 1;
            res = get_reg_simple(ctx, reg_file, info);
         }
      } else {
         /* live variables must avoid the definition: search below it, then
          * above it (aligned up to the variable's stride) */
         info.lb = lb;
         info.ub = def_reg_lo;
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second) {
            info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
            info.ub = ub;
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc.bytes());

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free spot: pick a window whose occupants are cheapest to evict */
      unsigned best_pos = lb;
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned reg_lo = lb;
      unsigned reg_hi = lb + size - 1;
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
         /* live variables must not land back inside the definition's area */
         if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
                                  (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* never evict a surviving variable at least as large as this one */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_lo;
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      reg_lo = best_pos;
      reg_hi = best_pos + size - 1;

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);

      /* mark the area as blocked */
      reg_file.block(PhysReg{reg_lo}, size * 4);

      /* recursively place the variables we just evicted */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
         return false;

      adjust_max_used_regs(ctx, var.rc, reg_lo);

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
653
654
/* Find a location for a definition by splitting live ranges: choose the
 * window that requires the fewest moves, evict its occupants via
 * get_regs_for_copies(), and append the resulting parallelcopies.
 * On failure reg_file is restored and {undefined, false} is returned. */
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
                                      RegisterFile& reg_file,
                                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                                      DefInfo info,
                                      aco_ptr<Instruction>& instr)
{
   uint32_t lb = info.lb;
   uint32_t ub = info.ub;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      if (instr->operands[j].isTemp() &&
          instr->operands[j].isFirstKillBeforeDef() &&
          instr->operands[j].physReg() >= lb &&
          instr->operands[j].physReg() < ub) {
         assert(instr->operands[j].isFixed());
         assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
         reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
         killed_ops += instr->operands[j].getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   unsigned best_pos = lb;
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
      /* first check the edges: this is what we have to fix to allow for num_moves > size */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* dead operands effectively reduce the number of estimated moves */
         if (reg_file.is_blocked(PhysReg{j})) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         /* never evict a variable at least as large as this definition */
         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_pos = reg_lo;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF) {
      /* remove killed operands from reg_file once again */
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
            reg_file.clear(instr->operands[i]);
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   /* snapshot for rollback if the eviction below fails */
   RegisterFile register_file = reg_file;

   /* now, we figured the placement for our definition */
   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);

   if (instr->opcode == aco_opcode::p_create_vector) {
      /* move killed operands which aren't yet at the correct position */
      for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
             instr->operands[i].getTemp().type() == rc.type()) {

            if (instr->operands[i].physReg() != best_pos + offset) {
               vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
               reg_file.clear(instr->operands[i]);
            } else {
               reg_file.fill(instr->operands[i]);
            }
         }
      }
   } else {
      /* re-enable the killed operands */
      for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
         if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
            reg_file.fill(instr->operands[j]);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
      /* roll back and restore the pre-call operand/definition state */
      reg_file = std::move(register_file);
      /* remove killed operands from reg_file once again */
      if (!is_phi(instr)) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               reg_file.clear(op);
         }
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition& def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   /* we set the definition regs == 0. the actual caller is responsible for correct setting */
   reg_file.clear(PhysReg{best_pos}, rc);

   update_renames(ctx, reg_file, parallelcopies, instr);

   /* remove killed operands from reg_file once again */
   for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
         continue;
      assert(!instr->operands[i].isUndefined());
      if (instr->operands[i].isFirstKillBeforeDef())
         reg_file.clear(instr->operands[i]);
   }
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition def = instr->definitions[i];
      if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
         reg_file.fill(def);
   }

   adjust_max_used_regs(ctx, rc, best_pos);
   return {PhysReg{best_pos}, true};
}
846
847 bool get_reg_specified(ra_ctx& ctx,
848 RegisterFile& reg_file,
849 RegClass rc,
850 std::vector<std::pair<Operand, Definition>>& parallelcopies,
851 aco_ptr<Instruction>& instr,
852 PhysReg reg)
853 {
854 if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
855 return false;
856
857 uint32_t size = rc.size();
858 uint32_t stride = 1;
859 uint32_t lb, ub;
860
861 if (rc.type() == RegType::vgpr) {
862 lb = 256;
863 ub = 256 + ctx.program->max_reg_demand.vgpr;
864 } else {
865 if (size == 2)
866 stride = 2;
867 else if (size >= 4)
868 stride = 4;
869 if (reg % stride != 0)
870 return false;
871 lb = 0;
872 ub = ctx.program->max_reg_demand.sgpr;
873 }
874
875 uint32_t reg_lo = reg.reg();
876 uint32_t reg_hi = reg + (size - 1);
877
878 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
879 return false;
880
881 if (reg_file.test(reg, rc.bytes()))
882 return false;
883
884 adjust_max_used_regs(ctx, rc, reg_lo);
885 return true;
886 }
887
888 PhysReg get_reg(ra_ctx& ctx,
889 RegisterFile& reg_file,
890 Temp temp,
891 std::vector<std::pair<Operand, Definition>>& parallelcopies,
892 aco_ptr<Instruction>& instr)
893 {
894 if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
895 ctx.assignments[ctx.affinities[temp.id()]].assigned) {
896 PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
897 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
898 return reg;
899 }
900
901 if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
902 Instruction* vec = ctx.vectors[temp.id()];
903 unsigned byte_offset = 0;
904 for (const Operand& op : vec->operands) {
905 if (op.isTemp() && op.tempId() == temp.id())
906 break;
907 else
908 byte_offset += op.bytes();
909 }
910 unsigned k = 0;
911 for (const Operand& op : vec->operands) {
912 if (op.isTemp() &&
913 op.tempId() != temp.id() &&
914 op.getTemp().type() == temp.type() &&
915 ctx.assignments[op.tempId()].assigned) {
916 PhysReg reg = ctx.assignments[op.tempId()].reg;
917 reg.reg_b += (byte_offset - k);
918 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
919 return reg;
920 }
921 k += op.bytes();
922 }
923
924 DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
925 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
926 PhysReg reg = res.first;
927 if (res.second) {
928 reg.reg_b += byte_offset;
929 /* make sure to only use byte offset if the instruction supports it */
930 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
931 return reg;
932 }
933 }
934
935 DefInfo info(ctx, instr, temp.regClass());
936
937 /* try to find space without live-range splits */
938 std::pair<PhysReg, bool> res;
939 if (info.rc.type() == RegType::vgpr && (info.size == 4 || info.size == 8)) {
940 DefInfo info_strided = {info.lb, info.ub, info.size, 4, info.rc};
941 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info_strided);
942 }
943 if (!res.second)
944 res = get_reg_simple(ctx, reg_file, info);
945
946
947 if (res.second)
948 return res.first;
949
950 /* try to find space with live-range splits */
951 res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
952
953 if (res.second)
954 return res.first;
955
956 /* try using more registers */
957
958 /* We should only fail here because keeping under the limit would require
959 * too many moves. */
960 assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);
961
962 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
963 uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
964 if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
965 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
966 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
967 } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
968 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
969 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
970 }
971
972 //FIXME: if nothing helps, shift-rotate the registers to make space
973
974 unreachable("did not find a register");
975 }
976
977 PhysReg get_reg_create_vector(ra_ctx& ctx,
978 RegisterFile& reg_file,
979 Temp temp,
980 std::vector<std::pair<Operand, Definition>>& parallelcopies,
981 aco_ptr<Instruction>& instr)
982 {
983 RegClass rc = temp.regClass();
984 /* create_vector instructions have different costs w.r.t. register coalescing */
985 uint32_t size = rc.size();
986 uint32_t bytes = rc.bytes();
987 uint32_t stride = 1;
988 uint32_t lb, ub;
989 if (rc.type() == RegType::vgpr) {
990 lb = 256;
991 ub = 256 + ctx.program->max_reg_demand.vgpr;
992 } else {
993 lb = 0;
994 ub = ctx.program->max_reg_demand.sgpr;
995 if (size == 2)
996 stride = 2;
997 else if (size >= 4)
998 stride = 4;
999 }
1000
1001 //TODO: improve p_create_vector for sub-dword vectors
1002
1003 unsigned best_pos = -1;
1004 unsigned num_moves = 0xFF;
1005 bool best_war_hint = true;
1006
1007 /* test for each operand which definition placement causes the least shuffle instructions */
1008 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1009 // TODO: think about, if we can alias live operands on the same register
1010 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
1011 continue;
1012
1013 if (offset > instr->operands[i].physReg().reg_b)
1014 continue;
1015
1016 unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
1017 if (reg_lo % 4)
1018 continue;
1019 reg_lo /= 4;
1020 unsigned reg_hi = reg_lo + size - 1;
1021 unsigned k = 0;
1022
1023 /* no need to check multiple times */
1024 if (reg_lo == best_pos)
1025 continue;
1026
1027 /* check borders */
1028 // TODO: this can be improved */
1029 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
1030 continue;
1031 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
1032 continue;
1033 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
1034 continue;
1035
1036 /* count variables to be moved and check war_hint */
1037 bool war_hint = false;
1038 bool linear_vgpr = false;
1039 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
1040 if (reg_file[j] != 0) {
1041 if (reg_file[j] == 0xF0000000) {
1042 PhysReg reg;
1043 reg.reg_b = j * 4;
1044 unsigned bytes_left = bytes - (j - reg_lo) * 4;
1045 for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
1046 k += reg_file.test(reg, 1);
1047 } else {
1048 k += 4;
1049 /* we cannot split live ranges of linear vgprs */
1050 if (ctx.assignments[reg_file[j]].rc & (1 << 6))
1051 linear_vgpr = true;
1052 }
1053 }
1054 war_hint |= ctx.war_hint[j];
1055 }
1056 if (linear_vgpr || (war_hint && !best_war_hint))
1057 continue;
1058
1059 /* count operands in wrong positions */
1060 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
1061 if (j == i ||
1062 !instr->operands[j].isTemp() ||
1063 instr->operands[j].getTemp().type() != rc.type())
1064 continue;
1065 if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
1066 k += instr->operands[j].bytes();
1067 }
1068 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
1069 if (k > num_moves || (!aligned && k == num_moves))
1070 continue;
1071
1072 best_pos = reg_lo;
1073 num_moves = k;
1074 best_war_hint = war_hint;
1075 }
1076
1077 if (num_moves >= bytes)
1078 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1079
1080 /* collect variables to be moved */
1081 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1082
1083 /* move killed operands which aren't yet at the correct position */
1084 uint64_t moved_operand_mask = 0;
1085 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1086 if (instr->operands[i].isTemp() &&
1087 instr->operands[i].isFirstKillBeforeDef() &&
1088 instr->operands[i].getTemp().type() == rc.type() &&
1089 instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
1090 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1091 moved_operand_mask |= (uint64_t)1 << i;
1092 }
1093 }
1094
1095 ASSERTED bool success = false;
1096 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1097 assert(success);
1098
1099 update_renames(ctx, reg_file, parallelcopies, instr);
1100 adjust_max_used_regs(ctx, rc, best_pos);
1101
1102 while (moved_operand_mask) {
1103 unsigned i = u_bit_scan64(&moved_operand_mask);
1104 assert(instr->operands[i].isFirstKillBeforeDef());
1105 reg_file.clear(instr->operands[i]);
1106 }
1107
1108 return PhysReg{best_pos};
1109 }
1110
1111 void handle_pseudo(ra_ctx& ctx,
1112 const RegisterFile& reg_file,
1113 Instruction* instr)
1114 {
1115 if (instr->format != Format::PSEUDO)
1116 return;
1117
1118 /* all instructions which use handle_operands() need this information */
1119 switch (instr->opcode) {
1120 case aco_opcode::p_extract_vector:
1121 case aco_opcode::p_create_vector:
1122 case aco_opcode::p_split_vector:
1123 case aco_opcode::p_parallelcopy:
1124 case aco_opcode::p_wqm:
1125 break;
1126 default:
1127 return;
1128 }
1129
1130 /* if all definitions are vgpr, no need to care for SCC */
1131 bool writes_sgpr = false;
1132 for (Definition& def : instr->definitions) {
1133 if (def.getTemp().type() == RegType::sgpr) {
1134 writes_sgpr = true;
1135 break;
1136 }
1137 }
1138 /* if all operands are constant, no need to care either */
1139 bool reads_sgpr = false;
1140 for (Operand& op : instr->operands) {
1141 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1142 reads_sgpr = true;
1143 break;
1144 }
1145 }
1146 if (!(writes_sgpr && reads_sgpr))
1147 return;
1148
1149 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1150 if (reg_file[scc.reg()]) {
1151 pi->tmp_in_scc = true;
1152
1153 int reg = ctx.max_used_sgpr;
1154 for (; reg >= 0 && reg_file[reg]; reg--)
1155 ;
1156 if (reg < 0) {
1157 reg = ctx.max_used_sgpr + 1;
1158 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1159 ;
1160 assert(reg < ctx.program->max_reg_demand.sgpr);
1161 }
1162
1163 adjust_max_used_regs(ctx, s1, reg);
1164 pi->scratch_sgpr = PhysReg{(unsigned)reg};
1165 } else {
1166 pi->tmp_in_scc = false;
1167 }
1168 }
1169
1170 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1171 {
1172 if (instr->operands[idx].isFixed())
1173 return instr->operands[idx].physReg() == reg;
1174
1175 if (!instr_can_access_subdword(instr) && reg.byte())
1176 return false;
1177
1178 switch (instr->format) {
1179 case Format::SMEM:
1180 return reg != scc &&
1181 reg != exec &&
1182 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1183 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1184 default:
1185 // TODO: there are more instructions with restrictions on registers
1186 return true;
1187 }
1188 }
1189
1190 void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
1191 std::vector<std::pair<Operand, Definition>>& parallelcopy,
1192 aco_ptr<Instruction>& instr, Operand& operand)
1193 {
1194 /* check if the operand is fixed */
1195 PhysReg dst;
1196 if (operand.isFixed()) {
1197 assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);
1198
1199 /* check if target reg is blocked, and move away the blocking var */
1200 if (register_file[operand.physReg().reg()]) {
1201 assert(register_file[operand.physReg()] != 0xF0000000);
1202 uint32_t blocking_id = register_file[operand.physReg().reg()];
1203 RegClass rc = ctx.assignments[blocking_id].rc;
1204 Operand pc_op = Operand(Temp{blocking_id, rc});
1205 pc_op.setFixed(operand.physReg());
1206
1207 /* find free reg */
1208 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
1209 Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
1210 register_file.clear(pc_op);
1211 parallelcopy.emplace_back(pc_op, pc_def);
1212 }
1213 dst = operand.physReg();
1214
1215 } else {
1216 dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr);
1217 }
1218
1219 Operand pc_op = operand;
1220 pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
1221 Definition pc_def = Definition(dst, pc_op.regClass());
1222 register_file.clear(pc_op);
1223 parallelcopy.emplace_back(pc_op, pc_def);
1224 update_renames(ctx, register_file, parallelcopy, instr);
1225 }
1226
1227 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1228 {
1229 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1230 if (it == ctx.renames[block_idx].end())
1231 return val;
1232 else
1233 return it->second;
1234 }
1235
/* SSA-reconstruction step: returns the current name of the live-in 'val' at
 * the start of 'block', inserting a (possibly incomplete) phi when the
 * predecessors disagree on the rename. Also records rename/orig-name and
 * phi-affinity bookkeeping in ctx. */
Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   /* linear temps join over linear predecessors, others over logical ones */
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   /* no predecessors (entry) or fully-linear regclass: nothing to reconcile */
   if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
      return val;

   assert(preds.size() > 0);

   Temp new_val;
   if (!ctx.sealed[block->index]) {
      /* consider rename from already processed predecessor */
      Temp tmp = read_variable(ctx, val, preds[0]);

      /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
      new_val = Temp{ctx.program->allocateId(), val.regClass()};
      ctx.assignments.emplace_back();
      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      phi->definitions[0] = Definition(new_val);
      /* placeholder operands; they get fixed up once the block is sealed */
      for (unsigned i = 0; i < preds.size(); i++)
         phi->operands[i] = Operand(val);
      if (tmp.regClass() == new_val.regClass())
         ctx.affinities[new_val.id()] = tmp.id();

      ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
      ctx.incomplete_phis[block->index].emplace_back(phi.get());
      block->instructions.insert(block->instructions.begin(), std::move(phi));

   } else if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      new_val = read_variable(ctx, val, preds[0]);
   } else {
      /* there are multiple predecessors and the block is sealed */
      Temp ops[preds.size()];

      /* get the rename from each predecessor and check if they are the same */
      bool needs_phi = false;
      for (unsigned i = 0; i < preds.size(); i++) {
         ops[i] = read_variable(ctx, val, preds[i]);
         if (i == 0)
            new_val = ops[i];
         else
            needs_phi |= !(new_val == ops[i]);
      }

      if (needs_phi) {
         /* the variable has been renamed differently in the predecessors: we need to insert a phi */
         aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
         aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
         new_val = Temp{ctx.program->allocateId(), val.regClass()};
         phi->definitions[0] = Definition(new_val);
         for (unsigned i = 0; i < preds.size(); i++) {
            phi->operands[i] = Operand(ops[i]);
            /* predecessor renames already carry register assignments */
            phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
            if (ops[i].regClass() == new_val.regClass())
               ctx.affinities[new_val.id()] = ops[i].id();
         }
         ctx.assignments.emplace_back();
         assert(ctx.assignments.size() == ctx.program->peekAllocationId());
         ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
         block->instructions.insert(block->instructions.begin(), std::move(phi));
      }
   }

   /* record the rename and remember the original name for later lookups */
   if (new_val != val) {
      ctx.renames[block->index][val.id()] = new_val;
      ctx.orig_names[new_val.id()] = val;
   }
   return new_val;
}
1306
/* Removes the phi defining 'temp' if it is trivial (all operands are the same
 * value or the phi's own definition), rerouting all uses to that value and
 * recursively retrying the phis that used it. A removed phi is flagged by
 * clearing its definitions; the actual instruction removal happens later. */
void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
{
   std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());

   /* only tracked phis in sealed blocks can be judged trivial */
   if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
      return;

   assert(info->second.block_idx != 0);
   Instruction* phi = info->second.phi;
   Temp same = Temp();
   Definition def = phi->definitions[0];

   /* a phi node is trivial if all operands are the same as the definition of the phi */
   for (const Operand& op : phi->operands) {
      const Temp t = op.getTemp();
      if (t == same || t == def.getTemp()) {
         assert(t == same || op.physReg() == def.physReg());
         continue;
      }
      /* a second distinct value makes the phi non-trivial */
      if (same != Temp())
         return;

      same = t;
   }
   assert(same != Temp() || same == def.getTemp());

   /* reroute all uses to same and remove phi */
   std::vector<Temp> phi_users;
   std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
   for (Instruction* instr : info->second.uses) {
      assert(phi != instr);
      /* recursively try to remove trivial phis */
      if (is_phi(instr)) {
         /* ignore if the phi was already flagged trivial */
         if (instr->definitions.empty())
            continue;

         if (instr->definitions[0].getTemp() != temp)
            phi_users.emplace_back(instr->definitions[0].getTemp());
      }
      /* replace uses of the phi's definition with 'same', and transfer the
       * use to same's phi (if it has one) so its use list stays accurate */
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.tempId() == def.tempId()) {
            op.setTemp(same);
            if (same_phi_info != ctx.phi_map.end())
               same_phi_info->second.uses.emplace(instr);
         }
      }
   }

   /* update all rename tables that pointed at the removed phi's definition */
   auto it = ctx.orig_names.find(same.id());
   unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
   for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
      auto it = ctx.renames[i].find(orig_var);
      if (it != ctx.renames[i].end() && it->second == def.getTemp())
         ctx.renames[i][orig_var] = same;
   }

   phi->definitions.clear(); /* this indicates that the phi can be removed */
   ctx.phi_map.erase(info);
   /* the users might have become trivial themselves now */
   for (Temp t : phi_users)
      try_remove_trivial_phi(ctx, t);

   return;
}
1371
1372 } /* end namespace */
1373
1374
1375 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
1376 {
1377 ra_ctx ctx(program);
1378 std::vector<std::vector<Temp>> phi_ressources;
1379 std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1380
1381 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1382 Block& block = *it;
1383
1384 /* first, compute the death points of all live vars within the block */
1385 TempSet& live = live_out_per_block[block.index];
1386
1387 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1388 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1389 aco_ptr<Instruction>& instr = *rit;
1390 if (is_phi(instr)) {
1391 live.erase(instr->definitions[0].getTemp());
1392 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
1393 continue;
1394 /* collect information about affinity-related temporaries */
1395 std::vector<Temp> affinity_related;
1396 /* affinity_related[0] is the last seen affinity-related temp */
1397 affinity_related.emplace_back(instr->definitions[0].getTemp());
1398 affinity_related.emplace_back(instr->definitions[0].getTemp());
1399 for (const Operand& op : instr->operands) {
1400 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1401 affinity_related.emplace_back(op.getTemp());
1402 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1403 }
1404 }
1405 phi_ressources.emplace_back(std::move(affinity_related));
1406 continue;
1407 }
1408
1409 /* add vector affinities */
1410 if (instr->opcode == aco_opcode::p_create_vector) {
1411 for (const Operand& op : instr->operands) {
1412 if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1413 ctx.vectors[op.tempId()] = instr.get();
1414 }
1415 }
1416
1417 /* add operands to live variables */
1418 for (const Operand& op : instr->operands) {
1419 if (op.isTemp())
1420 live.emplace(op.getTemp());
1421 }
1422
1423 /* erase definitions from live */
1424 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1425 const Definition& def = instr->definitions[i];
1426 if (!def.isTemp())
1427 continue;
1428 live.erase(def.getTemp());
1429 /* mark last-seen phi operand */
1430 std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1431 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1432 phi_ressources[it->second][0] = def.getTemp();
1433 /* try to coalesce phi affinities with parallelcopies */
1434 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
1435 Operand op = instr->operands[i];
1436 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1437 phi_ressources[it->second].emplace_back(op.getTemp());
1438 temp_to_phi_ressources[op.tempId()] = it->second;
1439 }
1440 }
1441 }
1442 }
1443 }
1444 }
1445 /* create affinities */
1446 for (std::vector<Temp>& vec : phi_ressources) {
1447 assert(vec.size() > 1);
1448 for (unsigned i = 1; i < vec.size(); i++)
1449 if (vec[i].id() != vec[0].id())
1450 ctx.affinities[vec[i].id()] = vec[0].id();
1451 }
1452
1453 /* state of register file after phis */
1454 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1455
1456 for (Block& block : program->blocks) {
1457 TempSet& live = live_out_per_block[block.index];
1458 /* initialize register file */
1459 assert(block.index != 0 || live.empty());
1460 RegisterFile register_file;
1461 ctx.war_hint.reset();
1462
1463 for (Temp t : live) {
1464 Temp renamed = handle_live_in(ctx, t, &block);
1465 assignment& var = ctx.assignments[renamed.id()];
1466 /* due to live-range splits, the live-in might be a phi, now */
1467 if (var.assigned)
1468 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1469 }
1470
1471 std::vector<aco_ptr<Instruction>> instructions;
1472 std::vector<aco_ptr<Instruction>>::iterator it;
1473
1474 /* this is a slight adjustment from the paper as we already have phi nodes:
1475 * We consider them incomplete phis and only handle the definition. */
1476
1477 /* handle fixed phi definitions */
1478 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1479 aco_ptr<Instruction>& phi = *it;
1480 if (!is_phi(phi))
1481 break;
1482 Definition& definition = phi->definitions[0];
1483 if (!definition.isFixed())
1484 continue;
1485
1486 /* check if a dead exec mask phi is needed */
1487 if (definition.isKill()) {
1488 for (Operand& op : phi->operands) {
1489 assert(op.isTemp());
1490 if (!ctx.assignments[op.tempId()].assigned ||
1491 ctx.assignments[op.tempId()].reg != exec) {
1492 definition.setKill(false);
1493 break;
1494 }
1495 }
1496 }
1497
1498 if (definition.isKill())
1499 continue;
1500
1501 assert(definition.physReg() == exec);
1502 assert(!register_file.test(definition.physReg(), definition.bytes()));
1503 register_file.fill(definition);
1504 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1505 }
1506
1507 /* look up the affinities */
1508 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1509 aco_ptr<Instruction>& phi = *it;
1510 if (!is_phi(phi))
1511 break;
1512 Definition& definition = phi->definitions[0];
1513 if (definition.isKill() || definition.isFixed())
1514 continue;
1515
1516 if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1517 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1518 assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1519 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1520 bool try_use_special_reg = reg == scc || reg == exec;
1521 if (try_use_special_reg) {
1522 for (const Operand& op : phi->operands) {
1523 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1524 ctx.assignments[op.tempId()].reg == reg)) {
1525 try_use_special_reg = false;
1526 break;
1527 }
1528 }
1529 if (!try_use_special_reg)
1530 continue;
1531 }
1532 /* only assign if register is still free */
1533 if (!register_file.test(reg, definition.bytes())) {
1534 definition.setFixed(reg);
1535 register_file.fill(definition);
1536 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1537 }
1538 }
1539 }
1540
1541 /* find registers for phis without affinity or where the register was blocked */
1542 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1543 aco_ptr<Instruction>& phi = *it;
1544 if (!is_phi(phi))
1545 break;
1546
1547 Definition& definition = phi->definitions[0];
1548 if (definition.isKill())
1549 continue;
1550
1551 if (!definition.isFixed()) {
1552 std::vector<std::pair<Operand, Definition>> parallelcopy;
1553 /* try to find a register that is used by at least one operand */
1554 for (const Operand& op : phi->operands) {
1555 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1556 continue;
1557 PhysReg reg = ctx.assignments[op.tempId()].reg;
1558 /* we tried this already on the previous loop */
1559 if (reg == scc || reg == exec)
1560 continue;
1561 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1562 definition.setFixed(reg);
1563 break;
1564 }
1565 }
1566 if (!definition.isFixed())
1567 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
1568
1569 /* process parallelcopy */
1570 for (std::pair<Operand, Definition> pc : parallelcopy) {
1571 /* see if it's a copy from a different phi */
1572 //TODO: prefer moving some previous phis over live-ins
1573 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1574 Instruction *prev_phi = NULL;
1575 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1576 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1577 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1578 prev_phi = phi_it->get();
1579 }
1580 phi_it = it;
1581 while (!prev_phi && is_phi(*++phi_it)) {
1582 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1583 prev_phi = phi_it->get();
1584 }
1585 if (prev_phi) {
1586 /* if so, just update that phi's register */
1587 register_file.clear(prev_phi->definitions[0]);
1588 prev_phi->definitions[0].setFixed(pc.second.physReg());
1589 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1590 register_file.fill(prev_phi->definitions[0]);
1591 continue;
1592 }
1593
1594 /* rename */
1595 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1596 Temp orig = pc.first.getTemp();
1597 if (orig_it != ctx.orig_names.end())
1598 orig = orig_it->second;
1599 else
1600 ctx.orig_names[pc.second.tempId()] = orig;
1601 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1602
1603 /* otherwise, this is a live-in and we need to create a new phi
1604 * to move it in this block's predecessors */
1605 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1606 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1607 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1608 new_phi->definitions[0] = pc.second;
1609 for (unsigned i = 0; i < preds.size(); i++)
1610 new_phi->operands[i] = Operand(pc.first);
1611 instructions.emplace_back(std::move(new_phi));
1612 }
1613
1614 register_file.fill(definition);
1615 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1616 }
1617 live.emplace(definition.getTemp());
1618
1619 /* update phi affinities */
1620 for (const Operand& op : phi->operands) {
1621 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1622 ctx.affinities[op.tempId()] = definition.tempId();
1623 }
1624
1625 instructions.emplace_back(std::move(*it));
1626 }
1627
1628 /* fill in sgpr_live_in */
1629 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1630 sgpr_live_in[block.index][i] = register_file[i];
1631 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1632
1633 /* Handle all other instructions of the block */
1634 for (; it != block.instructions.end(); ++it) {
1635 aco_ptr<Instruction>& instr = *it;
1636
1637 /* parallelcopies from p_phi are inserted here which means
1638 * live ranges of killed operands end here as well */
1639 if (instr->opcode == aco_opcode::p_logical_end) {
1640 /* no need to process this instruction any further */
1641 if (block.logical_succs.size() != 1) {
1642 instructions.emplace_back(std::move(instr));
1643 continue;
1644 }
1645
1646 Block& succ = program->blocks[block.logical_succs[0]];
1647 unsigned idx = 0;
1648 for (; idx < succ.logical_preds.size(); idx++) {
1649 if (succ.logical_preds[idx] == block.index)
1650 break;
1651 }
1652 for (aco_ptr<Instruction>& phi : succ.instructions) {
1653 if (phi->opcode == aco_opcode::p_phi) {
1654 if (phi->operands[idx].isTemp() &&
1655 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1656 phi->operands[idx].isFirstKillBeforeDef()) {
1657 Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
1658 PhysReg reg = ctx.assignments[phi_op.id()].reg;
1659 assert(register_file[reg] == phi_op.id());
1660 register_file[reg] = 0;
1661 }
1662 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1663 break;
1664 }
1665 }
1666 instructions.emplace_back(std::move(instr));
1667 continue;
1668 }
1669
1670 std::vector<std::pair<Operand, Definition>> parallelcopy;
1671
1672 assert(!is_phi(instr));
1673
1674 /* handle operands */
1675 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1676 auto& operand = instr->operands[i];
1677 if (!operand.isTemp())
1678 continue;
1679
1680 /* rename operands */
1681 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
1682 assert(ctx.assignments[operand.tempId()].assigned);
1683
1684 PhysReg reg = ctx.assignments[operand.tempId()].reg;
1685 if (operand_can_use_reg(instr, i, reg))
1686 operand.setFixed(reg);
1687 else
1688 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
1689
1690 if (instr->format == Format::EXP ||
1691 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
1692 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1693 for (unsigned j = 0; j < operand.size(); j++)
1694 ctx.war_hint.set(operand.physReg().reg() + j);
1695 }
1696
1697 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
1698 if (phi != ctx.phi_map.end())
1699 phi->second.uses.emplace(instr.get());
1700 }
1701
1702 /* remove dead vars from register file */
1703 for (const Operand& op : instr->operands) {
1704 if (op.isTemp() && op.isFirstKillBeforeDef())
1705 register_file.clear(op);
1706 }
1707
1708 /* try to optimize v_mad_f32 -> v_mac_f32 */
1709 if (instr->opcode == aco_opcode::v_mad_f32 &&
1710 instr->operands[2].isTemp() &&
1711 instr->operands[2].isKillBeforeDef() &&
1712 instr->operands[2].getTemp().type() == RegType::vgpr &&
1713 instr->operands[1].isTemp() &&
1714 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1715 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1716 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1717 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1718 vop3->clamp || vop3->omod || vop3->opsel);
1719 if (can_use_mac) {
1720 instr->format = Format::VOP2;
1721 instr->opcode = aco_opcode::v_mac_f32;
1722 }
1723 }
1724
1725 /* handle definitions which must have the same register as an operand */
1726 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1727 instr->opcode == aco_opcode::v_mac_f32 ||
1728 instr->opcode == aco_opcode::v_writelane_b32 ||
1729 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1730 instr->definitions[0].setFixed(instr->operands[2].physReg());
1731 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1732 instr->opcode == aco_opcode::s_mulk_i32) {
1733 instr->definitions[0].setFixed(instr->operands[0].physReg());
1734 } else if (instr->format == Format::MUBUF &&
1735 instr->definitions.size() == 1 &&
1736 instr->operands.size() == 4) {
1737 instr->definitions[0].setFixed(instr->operands[3].physReg());
1738 } else if (instr->format == Format::MIMG &&
1739 instr->definitions.size() == 1 &&
1740 instr->operands[1].regClass().type() == RegType::vgpr) {
1741 instr->definitions[0].setFixed(instr->operands[1].physReg());
1742 }
1743
1744 ctx.defs_done.reset();
1745
1746 /* handle fixed definitions first */
1747 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1748 auto& definition = instr->definitions[i];
1749 if (!definition.isFixed())
1750 continue;
1751
1752 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1753 /* check if the target register is blocked */
1754 if (register_file[definition.physReg().reg()] != 0) {
1755 /* create parallelcopy pair to move blocking var */
1756 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
1757 Operand pc_op = Operand(tmp);
1758 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
1759 RegClass rc = pc_op.regClass();
1760 tmp = Temp{program->allocateId(), rc};
1761 Definition pc_def = Definition(tmp);
1762
1763 /* re-enable the killed operands, so that we don't move the blocking var there */
1764 for (const Operand& op : instr->operands) {
1765 if (op.isTemp() && op.isFirstKillBeforeDef())
1766 register_file.fill(op);
1767 }
1768
1769 /* find a new register for the blocking variable */
1770 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
1771 /* once again, disable killed operands */
1772 for (const Operand& op : instr->operands) {
1773 if (op.isTemp() && op.isFirstKillBeforeDef())
1774 register_file.clear(op);
1775 }
1776 for (unsigned k = 0; k < i; k++) {
1777 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1778 register_file.fill(instr->definitions[k]);
1779 }
1780 pc_def.setFixed(reg);
1781
1782 /* finish assignment of parallelcopy */
1783 ctx.assignments.emplace_back(reg, pc_def.regClass());
1784 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1785 parallelcopy.emplace_back(pc_op, pc_def);
1786
1787 /* add changes to reg_file */
1788 register_file.clear(pc_op);
1789 register_file.fill(pc_def);
1790 }
1791 ctx.defs_done.set(i);
1792
1793 if (!definition.isTemp())
1794 continue;
1795
1796 /* set live if it has a kill point */
1797 if (!definition.isKill())
1798 live.emplace(definition.getTemp());
1799
1800 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1801 register_file.fill(definition);
1802 }
1803
1804 /* handle all other definitions */
1805 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1806 auto& definition = instr->definitions[i];
1807
1808 if (definition.isFixed() || !definition.isTemp())
1809 continue;
1810
1811 /* find free reg */
1812 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1813 definition.setFixed(definition.physReg());
1814 else if (instr->opcode == aco_opcode::p_split_vector) {
1815 PhysReg reg = instr->operands[0].physReg();
1816 reg.reg_b += i * definition.bytes();
1817 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1818 definition.setFixed(reg);
1819 } else if (instr->opcode == aco_opcode::p_wqm) {
1820 PhysReg reg;
1821 if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1822 reg = instr->operands[0].physReg();
1823 definition.setFixed(reg);
1824 assert(register_file[reg.reg()] == 0);
1825 }
1826 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1827 PhysReg reg;
1828 if (instr->operands[0].isKillBeforeDef() &&
1829 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1830 reg = instr->operands[0].physReg();
1831 reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
1832 assert(!register_file.test(reg, definition.bytes()));
1833 definition.setFixed(reg);
1834 }
1835 } else if (instr->opcode == aco_opcode::p_create_vector) {
1836 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
1837 parallelcopy, instr);
1838 definition.setFixed(reg);
1839 }
1840
1841 if (!definition.isFixed())
1842 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
1843
1844 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1845 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1846 ctx.defs_done.set(i);
1847
1848 /* set live if it has a kill point */
1849 if (!definition.isKill())
1850 live.emplace(definition.getTemp());
1851
1852 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1853 register_file.fill(definition);
1854 }
1855
1856 handle_pseudo(ctx, register_file, instr.get());
1857
1858 /* kill definitions and late-kill operands */
1859 for (const Definition& def : instr->definitions) {
1860 if (def.isTemp() && def.isKill())
1861 register_file.clear(def);
1862 }
1863 for (const Operand& op : instr->operands) {
1864 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1865 register_file.clear(op);
1866 }
1867
1868 /* emit parallelcopy */
1869 if (!parallelcopy.empty()) {
1870 aco_ptr<Pseudo_instruction> pc;
1871 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1872 bool temp_in_scc = register_file[scc.reg()];
1873 bool sgpr_operands_alias_defs = false;
1874 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1875 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1876 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1877 if (!sgpr_operands_alias_defs) {
1878 unsigned reg = parallelcopy[i].first.physReg().reg();
1879 unsigned size = parallelcopy[i].first.getTemp().size();
1880 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1881
1882 reg = parallelcopy[i].second.physReg().reg();
1883 size = parallelcopy[i].second.getTemp().size();
1884 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1885 sgpr_operands_alias_defs = true;
1886 }
1887 }
1888
1889 pc->operands[i] = parallelcopy[i].first;
1890 pc->definitions[i] = parallelcopy[i].second;
1891 assert(pc->operands[i].size() == pc->definitions[i].size());
1892
1893 /* it might happen that the operand is already renamed. we have to restore the original name. */
1894 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1895 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1896 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1897 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1898
1899 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
1900 if (phi != ctx.phi_map.end())
1901 phi->second.uses.emplace(pc.get());
1902 }
1903
1904 if (temp_in_scc && sgpr_operands_alias_defs) {
1905 /* disable definitions and re-enable operands */
1906 for (const Definition& def : instr->definitions) {
1907 if (def.isTemp() && !def.isKill())
1908 register_file.clear(def);
1909 }
1910 for (const Operand& op : instr->operands) {
1911 if (op.isTemp() && op.isFirstKill())
1912 register_file.block(op.physReg(), op.bytes());
1913 }
1914
1915 handle_pseudo(ctx, register_file, pc.get());
1916
1917 /* re-enable live vars */
1918 for (const Operand& op : instr->operands) {
1919 if (op.isTemp() && op.isFirstKill())
1920 register_file.clear(op);
1921 }
1922 for (const Definition& def : instr->definitions) {
1923 if (def.isTemp() && !def.isKill())
1924 register_file.fill(def);
1925 }
1926 } else {
1927 pc->tmp_in_scc = false;
1928 }
1929
1930 instructions.emplace_back(std::move(pc));
1931 }
1932
1933 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1934 bool instr_needs_vop3 = !instr->isVOP3() &&
1935 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1936 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1937 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1938 instr->opcode == aco_opcode::v_addc_co_u32 ||
1939 instr->opcode == aco_opcode::v_sub_co_u32 ||
1940 instr->opcode == aco_opcode::v_subb_co_u32 ||
1941 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1942 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1943 !(instr->definitions[1].physReg() == vcc)) ||
1944 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1945 instr->opcode == aco_opcode::v_subb_co_u32 ||
1946 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1947 !(instr->operands[2].physReg() == vcc)));
1948 if (instr_needs_vop3) {
1949
1950 /* if the first operand is a literal, we have to move it to a reg */
1951 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
1952 bool can_sgpr = true;
1953 /* check, if we have to move to vgpr */
1954 for (const Operand& op : instr->operands) {
1955 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1956 can_sgpr = false;
1957 break;
1958 }
1959 }
1960 /* disable definitions and re-enable operands */
1961 for (const Definition& def : instr->definitions)
1962 register_file.clear(def);
1963 for (const Operand& op : instr->operands) {
1964 if (op.isTemp() && op.isFirstKill())
1965 register_file.block(op.physReg(), op.bytes());
1966 }
1967 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
1968 ctx.assignments.emplace_back();
1969 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
1970
1971 aco_ptr<Instruction> mov;
1972 if (can_sgpr)
1973 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1974 else
1975 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1976 mov->operands[0] = instr->operands[0];
1977 mov->definitions[0] = Definition(tmp);
1978 mov->definitions[0].setFixed(reg);
1979
1980 instr->operands[0] = Operand(tmp);
1981 instr->operands[0].setFixed(reg);
1982 instructions.emplace_back(std::move(mov));
1983 /* re-enable live vars */
1984 for (const Operand& op : instr->operands) {
1985 if (op.isTemp() && op.isFirstKill())
1986 register_file.clear(op);
1987 }
1988 for (const Definition& def : instr->definitions) {
1989 if (def.isTemp() && !def.isKill())
1990 register_file.fill(def);
1991 }
1992 }
1993
1994 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1995 aco_ptr<Instruction> tmp = std::move(instr);
1996 Format format = asVOP3(tmp->format);
1997 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
1998 for (unsigned i = 0; i < instr->operands.size(); i++) {
1999 Operand& operand = tmp->operands[i];
2000 instr->operands[i] = operand;
2001 /* keep phi_map up to date */
2002 if (operand.isTemp()) {
2003 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.tempId());
2004 if (phi != ctx.phi_map.end()) {
2005 phi->second.uses.erase(tmp.get());
2006 phi->second.uses.emplace(instr.get());
2007 }
2008 }
2009 }
2010 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2011 }
2012 instructions.emplace_back(std::move(*it));
2013
2014 } /* end for Instr */
2015
2016 block.instructions = std::move(instructions);
2017
2018 ctx.filled[block.index] = true;
2019 for (unsigned succ_idx : block.linear_succs) {
2020 Block& succ = program->blocks[succ_idx];
2021 /* seal block if all predecessors are filled */
2022 bool all_filled = true;
2023 for (unsigned pred_idx : succ.linear_preds) {
2024 if (!ctx.filled[pred_idx]) {
2025 all_filled = false;
2026 break;
2027 }
2028 }
2029 if (all_filled) {
2030 ctx.sealed[succ_idx] = true;
2031
2032 /* finish incomplete phis and check if they became trivial */
2033 for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2034 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2035 for (unsigned i = 0; i < phi->operands.size(); i++) {
2036 phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2037 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2038 }
2039 try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2040 }
2041 /* complete the original phi nodes, but no need to check triviality */
2042 for (aco_ptr<Instruction>& instr : succ.instructions) {
2043 if (!is_phi(instr))
2044 break;
2045 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2046
2047 for (unsigned i = 0; i < instr->operands.size(); i++) {
2048 auto& operand = instr->operands[i];
2049 if (!operand.isTemp())
2050 continue;
2051 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2052 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2053 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2054 if (phi != ctx.phi_map.end())
2055 phi->second.uses.emplace(instr.get());
2056 }
2057 }
2058 }
2059 }
2060 } /* end for BB */
2061
2062 /* remove trivial phis */
2063 for (Block& block : program->blocks) {
2064 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2065 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2066 auto middle = std::remove_if(block.instructions.begin(), end,
2067 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2068 block.instructions.erase(middle, end);
2069 }
2070
2071 /* find scc spill registers which may be needed for parallelcopies created by phis */
2072 for (Block& block : program->blocks) {
2073 if (block.linear_preds.size() <= 1)
2074 continue;
2075
2076 std::bitset<128> regs = sgpr_live_in[block.index];
2077 if (!regs[127])
2078 continue;
2079
2080 /* choose a register */
2081 int16_t reg = 0;
2082 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2083 ;
2084 assert(reg < ctx.program->max_reg_demand.sgpr);
2085 adjust_max_used_regs(ctx, s1, reg);
2086
2087 /* update predecessors */
2088 for (unsigned& pred_index : block.linear_preds) {
2089 Block& pred = program->blocks[pred_index];
2090 pred.scc_live_out = true;
2091 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2092 }
2093 }
2094
2095 /* num_gpr = rnd_up(max_used_gpr + 1) */
2096 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2097 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2098 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2099 else
2100 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2101 }
2102
2103 }