aco: add notion of subdword registers to register allocator
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33 #include <functional>
34
35 #include "aco_ir.h"
36 #include "sid.h"
37 #include "util/u_math.h"
38
39 namespace aco {
40 namespace {
41
/* State carried through one full register-allocation run over a Program. */
struct ra_ctx {
   /* registers the allocator should avoid for new definitions
    * ('war' presumably = write-after-read hazard — consulted in
    * get_reg_simple()/get_reg_create_vector(); TODO confirm where it is set) */
   std::bitset<512> war_hint;
   Program* program;
   /* temp-id -> (assigned physical register, register class) */
   std::unordered_map<unsigned, std::pair<PhysReg, RegClass>> assignments;
   /* temp-id of a renamed temp -> the original Temp it replaced (debug/printing) */
   std::map<unsigned, Temp> orig_names;
   /* high-water marks of registers actually handed out (indices, not counts) */
   unsigned max_used_sgpr = 0;
   unsigned max_used_vgpr = 0;
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_ctx(Program* program) : program(program) {}
};
53
54 class RegisterFile {
55 public:
56 RegisterFile() {regs.fill(0);}
57
58 std::array<uint32_t, 512> regs;
59 std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
60
61 const uint32_t& operator [] (unsigned index) const {
62 return regs[index];
63 }
64
65 uint32_t& operator [] (unsigned index) {
66 return regs[index];
67 }
68
69 unsigned count_zero(PhysReg start, unsigned size) {
70 unsigned res = 0;
71 for (unsigned i = 0; i < size; i++)
72 res += !regs[start + i];
73 return res;
74 }
75
76 bool test(PhysReg start, unsigned num_bytes) {
77 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
78 if (regs[i] & 0x0FFFFFFF)
79 return true;
80 if (regs[i] == 0xF0000000) {
81 assert(subdword_regs.find(i) != subdword_regs.end());
82 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
83 if (subdword_regs[i][j])
84 return true;
85 }
86 }
87 }
88 return false;
89 }
90
91 void fill(PhysReg start, unsigned size, uint32_t val) {
92 for (unsigned i = 0; i < size; i++)
93 regs[start + i] = val;
94 }
95
96 void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
97 fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
98 for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
99 /* emplace or get */
100 std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
101 for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
102 sub[j] = val;
103
104 if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
105 subdword_regs.erase(i);
106 regs[i] = 0;
107 }
108 }
109 }
110
111 void block(PhysReg start, unsigned num_bytes) {
112 if (start.byte() || num_bytes % 4)
113 fill_subdword(start, num_bytes, 0xFFFFFFFF);
114 else
115 fill(start, num_bytes / 4, 0xFFFFFFFF);
116 }
117
118 bool is_blocked(PhysReg start) {
119 if (regs[start] == 0xFFFFFFFF)
120 return true;
121 if (regs[start] == 0xF0000000) {
122 for (unsigned i = start.byte(); i < 4; i++)
123 if (subdword_regs[start][i] == 0xFFFFFFFF)
124 return true;
125 }
126 return false;
127 }
128
129 void clear(PhysReg start, RegClass rc) {
130 if (rc.is_subdword())
131 fill_subdword(start, rc.bytes(), 0);
132 else
133 fill(start, rc.size(), 0);
134 }
135
136 void fill(Operand op) {
137 if (op.regClass().is_subdword())
138 fill_subdword(op.physReg(), op.bytes(), op.tempId());
139 else
140 fill(op.physReg(), op.size(), op.tempId());
141 }
142
143 void clear(Operand op) {
144 clear(op.physReg(), op.regClass());
145 }
146
147 void fill(Definition def) {
148 if (def.regClass().is_subdword())
149 fill_subdword(def.physReg(), def.bytes(), def.tempId());
150 else
151 fill(def.physReg(), def.size(), def.tempId());
152 }
153
154 void clear(Definition def) {
155 clear(def.physReg(), def.regClass());
156 }
157 };
158
159
/* helper function for debugging */
#if 0
/* Prints a one-character-per-register occupancy map ('.' free, '~' blocked,
 * '#'/'@' alternating per variable) and then the live assignments of either
 * the sgpr or the vgpr file.
 * NOTE(review): the comparisons against 0xFFFF look stale - blocked registers
 * are marked 0xFFFFFFFF elsewhere in this file; confirm before re-enabling. */
void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf("       ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         /* flip the fill character whenever a new variable starts */
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         /* close the previous variable's register range */
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            /* show the pre-rename name when this temp is a rename */
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   /* close the last open range */
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
#endif
226
227
228 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
229 {
230 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
231 unsigned size = rc.size();
232 if (rc.type() == RegType::vgpr) {
233 assert(reg >= 256);
234 unsigned hi = reg - 256 + size - 1;
235 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
236 } else if (reg + rc.size() <= max_addressible_sgpr) {
237 unsigned hi = reg + size - 1;
238 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
239 }
240 }
241
242
/* Allocates fresh temp-ids for the parallelcopy definitions created during
 * register allocation of `instr`, records their assignments, marks their
 * target registers in `reg_file`, and rewrites affected operands of `instr`
 * (and of chained copies) to use the renamed temps. */
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    aco_ptr<Instruction>& instr)
{
   /* allocate id's and rename operands: this is done transparently here */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;

      /* check if we moved another parallelcopy definition: if so, this copy
       * must read from that copy's source instead */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (copy.first.getTemp() == other.second.getTemp()) {
            copy.first.setTemp(other.first.getTemp());
            copy.first.setFixed(other.first.physReg());
         }
      }
      // FIXME: if a definition got moved, change the target location and remove the parallelcopy
      copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
      ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()};
      reg_file.fill(copy.second);

      /* check if we moved an operand */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* a live (non-killed) operand of p_create_vector can keep its old
             * name if no copy destination overlaps its register range */
            bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg() ?
                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming)
               continue;
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());
         }
      }
   }
}
287
288 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
289 RegisterFile& reg_file,
290 uint32_t lb, uint32_t ub,
291 uint32_t size, uint32_t stride,
292 RegClass rc)
293 {
294 /* best fit algorithm: find the smallest gap to fit in the variable */
295 if (stride == 1) {
296 unsigned best_pos = 0xFFFF;
297 unsigned gap_size = 0xFFFF;
298 unsigned next_pos = 0xFFFF;
299
300 for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
301 if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
302 if (next_pos == 0xFFFF)
303 continue;
304
305 /* check if the variable fits */
306 if (next_pos + size > current_reg) {
307 next_pos = 0xFFFF;
308 continue;
309 }
310
311 /* check if the tested gap is smaller */
312 if (current_reg - next_pos < gap_size) {
313 best_pos = next_pos;
314 gap_size = current_reg - next_pos;
315 }
316 next_pos = 0xFFFF;
317 continue;
318 }
319
320 if (next_pos == 0xFFFF)
321 next_pos = current_reg;
322 }
323
324 /* final check */
325 if (next_pos != 0xFFFF &&
326 next_pos + size <= ub &&
327 ub - next_pos < gap_size) {
328 best_pos = next_pos;
329 gap_size = ub - next_pos;
330 }
331 if (best_pos != 0xFFFF) {
332 adjust_max_used_regs(ctx, rc, best_pos);
333 return {PhysReg{best_pos}, true};
334 }
335 return {{}, false};
336 }
337
338 bool found = false;
339 unsigned reg_lo = lb;
340 unsigned reg_hi = lb + size - 1;
341 while (!found && reg_lo + size <= ub) {
342 if (reg_file[reg_lo] != 0) {
343 reg_lo += stride;
344 continue;
345 }
346 reg_hi = reg_lo + size - 1;
347 found = true;
348 for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
349 if (reg_file[reg] != 0 || ctx.war_hint[reg])
350 found = false;
351 }
352 if (found) {
353 adjust_max_used_regs(ctx, rc, reg_lo);
354 return {PhysReg{reg_lo}, true};
355 }
356
357 reg_lo += stride;
358 }
359
360 return {{}, false};
361 }
362
/* Finds new locations for the displaced variables in `vars` (set of
 * (size, temp-id) pairs) and records the required moves in `parallelcopies`.
 * [def_reg_lo, def_reg_hi] is the register range just claimed by the
 * definition that displaced them. If no free spot exists for a variable,
 * smaller variables are displaced recursively. Returns false on failure. */
bool get_regs_for_copies(ra_ctx& ctx,
                         RegisterFile& reg_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopies,
                         const std::set<std::pair<unsigned, unsigned>> &vars,
                         uint32_t lb, uint32_t ub,
                         aco_ptr<Instruction>& instr,
                         uint32_t def_reg_lo,
                         uint32_t def_reg_hi)
{

   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
   /* the reverse iteration processes the largest variables first */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
      unsigned id = it->second;
      std::pair<PhysReg, RegClass> var = ctx.assignments[id];
      uint32_t size = it->first;
      uint32_t stride = 1;
      if (var.second.type() == RegType::sgpr) {
         if (size == 2)
            stride = 2;
         if (size > 3)
            stride = 4;
      }

      /* check if this is a dead operand, then we can re-use the space from the definition */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
            is_dead_operand = true;
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* a dead operand of p_create_vector must end up in its exact slot
             * within the new vector */
            for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  for (unsigned j = 0; j < size; j++)
                     assert(reg_file[def_reg_lo + offset + j] == 0);
                  res = {PhysReg{def_reg_lo + offset}, true};
                  break;
               }
            }
         } else {
            res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second);
         }
      } else {
         /* try a free spot below the definition first, then above it */
         res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second);
         if (!res.second) {
            /* align the lower bound of the upper range to the stride */
            unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
            res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.second.bytes());

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.second);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.first);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free spot: displace other (strictly smaller) variables instead */
      unsigned best_pos = lb;
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned reg_lo = lb;
      unsigned reg_hi = lb + size - 1;
      for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
         /* don't overlap the definition range unless this is a dead operand */
         if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
                                  (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;      /* dwords that would have to move */
         unsigned n = 0;      /* distinct variables in the window */
         unsigned last_var = 0;
         bool found = true;
         for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
               found = false;
               break;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* non-killed variables in the window must be smaller than the one
             * being placed, otherwise the recursion cannot make progress */
            if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].second.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_lo;
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      reg_lo = best_pos;
      reg_hi = best_pos + size - 1;

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars;
      for (unsigned j = reg_lo; j <= reg_hi; j++) {
         if (reg_file[j] != 0) {
            unsigned size = ctx.assignments[reg_file[j]].second.size();
            unsigned id = reg_file[j];
            new_vars.emplace(size, id);
            reg_file.clear(ctx.assignments[id].first, ctx.assignments[id].second);
         }
      }

      /* mark the area as blocked */
      reg_file.block(PhysReg{reg_lo}, size * 4);

      /* recursively place the variables we just displaced */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
         return false;

      adjust_max_used_regs(ctx, var.second, reg_lo);

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.second);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.first);
      Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
524
525
/* Places a definition of class `rc` by splitting live ranges: picks the
 * window [best_pos, best_pos + size) requiring the fewest moved dwords,
 * displaces the variables inside it (moves recorded in `parallelcopies`)
 * and returns the position. Returns {_, false} if no placement with an
 * acceptable number of moves exists; in that case reg_file is restored. */
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
                                      RegisterFile& reg_file,
                                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                                      uint32_t lb, uint32_t ub,
                                      uint32_t size, uint32_t stride,
                                      RegClass rc,
                                      aco_ptr<Instruction>& instr)
{
   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      if (instr->operands[j].isTemp() &&
          instr->operands[j].isFirstKillBeforeDef() &&
          instr->operands[j].physReg() >= lb &&
          instr->operands[j].physReg() < ub) {
         assert(instr->operands[j].isFixed());
         assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
         reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
         killed_ops += instr->operands[j].getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   unsigned best_pos = lb;
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
      /* first check the edges: this is what we have to fix to allow for num_moves > size */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* dead operands effectively reduce the number of estimated moves */
         if (remaining_op_moves && reg_file.is_blocked(PhysReg{j})) {
            k--;
            remaining_op_moves--;
            continue;
         }

         /* displaced variables must be smaller than the definition */
         if (ctx.assignments[reg_file[j]].second.size() >= size) {
            found = false;
            break;
         }


         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].second.size();
         n++;
         last_var = reg_file[j];
      }

      /* keep the window with the fewest moves; fewer distinct variables and
       * 4-alignment (for v4) break ties */
      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_pos = reg_lo;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF) {
      /* remove killed operands from reg_file once again */
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
            reg_file.clear(instr->operands[i]);
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   /* keep a copy so the file can be restored if copy placement fails below */
   RegisterFile register_file = reg_file;

   /* now, we figured the placement for our definition */
   std::set<std::pair<unsigned, unsigned>> vars;
   for (unsigned j = best_pos; j < best_pos + size; j++) {
      if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0)
         vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]);
      /* NOTE(review): clear() is also invoked for free/blocked slots and then
       * relies on ctx.assignments[] default-constructed entries - confirm
       * this is intentional */
      reg_file.clear(ctx.assignments[reg_file[j]].first, ctx.assignments[reg_file[j]].second);
   }

   if (instr->opcode == aco_opcode::p_create_vector) {
      /* move killed operands which aren't yet at the correct position */
      for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
             instr->operands[i].getTemp().type() == rc.type()) {

            if (instr->operands[i].physReg() != best_pos + offset) {
               vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
               reg_file.clear(instr->operands[i]);
            } else {
               reg_file.fill(instr->operands[i]);
            }
         }
      }
   } else {
      /* re-enable the killed operands */
      for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
         if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
            reg_file.fill(instr->operands[j]);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
      /* copy placement failed: restore the register file */
      reg_file = std::move(register_file);
      /* remove killed operands from reg_file once again */
      if (!is_phi(instr)) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               reg_file.clear(op);
         }
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition& def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   /* we set the definition regs == 0. the actual caller is responsible for correct setting */
   reg_file.clear(PhysReg{best_pos}, rc);

   update_renames(ctx, reg_file, parallelcopies, instr);

   /* remove killed operands from reg_file once again */
   for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
         continue;
      assert(!instr->operands[i].isUndefined());
      if (instr->operands[i].isFirstKillBeforeDef())
         reg_file.clear(instr->operands[i]);
   }
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition def = instr->definitions[i];
      if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
         reg_file.fill(def);
   }

   adjust_max_used_regs(ctx, rc, best_pos);
   return {PhysReg{best_pos}, true};
}
711
712 PhysReg get_reg(ra_ctx& ctx,
713 RegisterFile& reg_file,
714 RegClass rc,
715 std::vector<std::pair<Operand, Definition>>& parallelcopies,
716 aco_ptr<Instruction>& instr)
717 {
718 uint32_t size = rc.size();
719 uint32_t stride = 1;
720 uint32_t lb, ub;
721 if (rc.type() == RegType::vgpr) {
722 lb = 256;
723 ub = 256 + ctx.program->max_reg_demand.vgpr;
724 } else {
725 lb = 0;
726 ub = ctx.program->max_reg_demand.sgpr;
727 if (size == 2)
728 stride = 2;
729 else if (size >= 4)
730 stride = 4;
731 }
732
733 std::pair<PhysReg, bool> res = {{}, false};
734 /* try to find space without live-range splits */
735 if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
736 res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
737 if (!res.second)
738 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
739 if (res.second)
740 return res.first;
741
742 /* try to find space with live-range splits */
743 res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
744
745 if (res.second)
746 return res.first;
747
748 /* try using more registers */
749
750 /* We should only fail here because keeping under the limit would require
751 * too many moves. */
752 assert(reg_file.count_zero(PhysReg{lb}, ub-lb) >= size);
753
754 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
755 uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
756 if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
757 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
758 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
759 } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
760 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
761 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
762 }
763
764 //FIXME: if nothing helps, shift-rotate the registers to make space
765
766 unreachable("did not find a register");
767 }
768
769
770 std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
771 RegisterFile& reg_file,
772 RegClass rc)
773 {
774 uint32_t size = rc.size();
775 uint32_t stride = 1;
776 uint32_t lb, ub;
777 if (rc.type() == RegType::vgpr) {
778 lb = 256;
779 ub = 256 + ctx.program->max_reg_demand.vgpr;
780 } else {
781 lb = 0;
782 ub = ctx.program->max_reg_demand.sgpr;
783 if (size == 2)
784 stride = 2;
785 else if (size >= 4)
786 stride = 4;
787 }
788 return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
789 }
790
791
/* Chooses a position for the result of p_create_vector: tries to place the
 * vector on top of one of its killed operands so that operand needs no copy,
 * picking the candidate that requires the fewest moved dwords. Falls back to
 * get_reg() when no candidate beats a plain allocation. */
PhysReg get_reg_create_vector(ra_ctx& ctx,
                              RegisterFile& reg_file,
                              RegClass rc,
                              std::vector<std::pair<Operand, Definition>>& parallelcopies,
                              aco_ptr<Instruction>& instr)
{
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t stride = 1;
   uint32_t lb, ub;
   if (rc.type() == RegType::vgpr) {
      lb = 256;
      ub = 256 + ctx.program->max_reg_demand.vgpr;
   } else {
      lb = 0;
      ub = ctx.program->max_reg_demand.sgpr;
      if (size == 2)
         stride = 2;
      else if (size >= 4)
         stride = 4;
   }

   unsigned best_pos = -1;
   unsigned num_moves = 0xFF;
   bool best_war_hint = true;

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
         continue;

      /* the vector would start below register 0/256 if the operand sits lower
       * than its offset within the vector */
      if (offset > instr->operands[i].physReg())
         continue;

      unsigned reg_lo = instr->operands[i].physReg() - offset;
      unsigned reg_hi = reg_lo + size - 1;
      unsigned k = 0;

      /* no need to check multiple times */
      if (reg_lo == best_pos)
         continue;

      /* check borders */
      // TODO: this can be improved */
      if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
         continue;
      /* don't cut another variable in half at either edge of the window */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* count variables to be moved and check war_hint */
      bool war_hint = false;
      bool linear_vgpr = false;
      for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
         if (reg_file[j] != 0) {
            k++;
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].second & (1 << 6))
               linear_vgpr = true;
         }
         war_hint |= ctx.war_hint[j];
      }
      if (linear_vgpr || (war_hint && !best_war_hint))
         continue;

      /* count operands in wrong positions */
      for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
         if (j == i ||
             !instr->operands[j].isTemp() ||
             instr->operands[j].getTemp().type() != rc.type())
            continue;
         if (instr->operands[j].physReg() != reg_lo + offset)
            k += instr->operands[j].size();
      }
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_lo;
      num_moves = k;
      best_war_hint = war_hint;
   }

   /* only worth it if placing over an operand saves moves vs. a fresh slot */
   if (num_moves >= size)
      return get_reg(ctx, reg_file, rc, parallelcopies, instr);

   /* collect variables to be moved */
   std::set<std::pair<unsigned, unsigned>> vars;
   for (unsigned i = best_pos; i < best_pos + size; i++) {
      if (reg_file[i] != 0)
         vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]);
      reg_file[i] = 0;
   }

   /* move killed operands which aren't yet at the correct position */
   for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
      if (instr->operands[i].isTemp() &&
          instr->operands[i].isFirstKillBeforeDef() &&
          instr->operands[i].getTemp().type() == rc.type() &&
          instr->operands[i].physReg() != best_pos + offset)
         vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
   }

   ASSERTED bool success = false;
   success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
   assert(success);

   update_renames(ctx, reg_file, parallelcopies, instr);
   adjust_max_used_regs(ctx, rc, best_pos);
   return PhysReg{best_pos};
}
905
906 bool get_reg_specified(ra_ctx& ctx,
907 RegisterFile& reg_file,
908 RegClass rc,
909 std::vector<std::pair<Operand, Definition>>& parallelcopies,
910 aco_ptr<Instruction>& instr,
911 PhysReg reg)
912 {
913 uint32_t size = rc.size();
914 uint32_t stride = 1;
915 uint32_t lb, ub;
916
917 if (rc.type() == RegType::vgpr) {
918 lb = 256;
919 ub = 256 + ctx.program->max_reg_demand.vgpr;
920 } else {
921 if (size == 2)
922 stride = 2;
923 else if (size >= 4)
924 stride = 4;
925 if (reg % stride != 0)
926 return false;
927 lb = 0;
928 ub = ctx.program->max_reg_demand.sgpr;
929 }
930
931 uint32_t reg_lo = reg.reg();
932 uint32_t reg_hi = reg + (size - 1);
933
934 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
935 return false;
936
937 if (reg_file.test(reg, rc.bytes()))
938 return false;
939
940 adjust_max_used_regs(ctx, rc, reg_lo);
941 return true;
942 }
943
944 void handle_pseudo(ra_ctx& ctx,
945 const RegisterFile& reg_file,
946 Instruction* instr)
947 {
948 if (instr->format != Format::PSEUDO)
949 return;
950
951 /* all instructions which use handle_operands() need this information */
952 switch (instr->opcode) {
953 case aco_opcode::p_extract_vector:
954 case aco_opcode::p_create_vector:
955 case aco_opcode::p_split_vector:
956 case aco_opcode::p_parallelcopy:
957 case aco_opcode::p_wqm:
958 break;
959 default:
960 return;
961 }
962
963 /* if all definitions are vgpr, no need to care for SCC */
964 bool writes_sgpr = false;
965 for (Definition& def : instr->definitions) {
966 if (def.getTemp().type() == RegType::sgpr) {
967 writes_sgpr = true;
968 break;
969 }
970 }
971 /* if all operands are constant, no need to care either */
972 bool reads_sgpr = false;
973 for (Operand& op : instr->operands) {
974 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
975 reads_sgpr = true;
976 break;
977 }
978 }
979 if (!(writes_sgpr && reads_sgpr))
980 return;
981
982 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
983 if (reg_file[scc.reg()]) {
984 pi->tmp_in_scc = true;
985
986 int reg = ctx.max_used_sgpr;
987 for (; reg >= 0 && reg_file[reg]; reg--)
988 ;
989 if (reg < 0) {
990 reg = ctx.max_used_sgpr + 1;
991 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
992 ;
993 assert(reg < ctx.program->max_reg_demand.sgpr);
994 }
995
996 adjust_max_used_regs(ctx, s1, reg);
997 pi->scratch_sgpr = PhysReg{(unsigned)reg};
998 } else {
999 pi->tmp_in_scc = false;
1000 }
1001 }
1002
1003 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1004 {
1005 switch (instr->format) {
1006 case Format::SMEM:
1007 return reg != scc &&
1008 reg != exec &&
1009 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1010 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1011 default:
1012 // TODO: there are more instructions with restrictions on registers
1013 return true;
1014 }
1015 }
1016
1017 } /* end namespace */
1018
1019
1020 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block)
1021 {
1022 ra_ctx ctx(program);
1023
1024 std::vector<std::unordered_map<unsigned, Temp>> renames(program->blocks.size());
1025
1026 struct phi_info {
1027 Instruction* phi;
1028 unsigned block_idx;
1029 std::set<Instruction*> uses;
1030 };
1031
1032 bool filled[program->blocks.size()];
1033 bool sealed[program->blocks.size()];
1034 memset(filled, 0, sizeof filled);
1035 memset(sealed, 0, sizeof sealed);
1036 std::vector<std::vector<Instruction*>> incomplete_phis(program->blocks.size());
1037 std::map<unsigned, phi_info> phi_map;
1038 std::map<unsigned, unsigned> affinities;
1039 std::function<Temp(Temp,unsigned)> read_variable;
1040 std::function<Temp(Temp,Block*)> handle_live_in;
1041 std::function<Temp(std::map<unsigned, phi_info>::iterator)> try_remove_trivial_phi;
1042
1043 read_variable = [&](Temp val, unsigned block_idx) -> Temp {
1044 std::unordered_map<unsigned, Temp>::iterator it = renames[block_idx].find(val.id());
1045 assert(it != renames[block_idx].end());
1046 return it->second;
1047 };
1048
1049 handle_live_in = [&](Temp val, Block *block) -> Temp {
1050 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1051 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) {
1052 renames[block->index][val.id()] = val;
1053 return val;
1054 }
1055 assert(preds.size() > 0);
1056
1057 Temp new_val;
1058 if (!sealed[block->index]) {
1059 /* consider rename from already processed predecessor */
1060 Temp tmp = read_variable(val, preds[0]);
1061
1062 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1063 new_val = Temp{program->allocateId(), val.regClass()};
1064 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1065 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1066 phi->definitions[0] = Definition(new_val);
1067 for (unsigned i = 0; i < preds.size(); i++)
1068 phi->operands[i] = Operand(val);
1069 if (tmp.regClass() == new_val.regClass())
1070 affinities[new_val.id()] = tmp.id();
1071
1072 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1073 incomplete_phis[block->index].emplace_back(phi.get());
1074 block->instructions.insert(block->instructions.begin(), std::move(phi));
1075
1076 } else if (preds.size() == 1) {
1077 /* if the block has only one predecessor, just look there for the name */
1078 new_val = read_variable(val, preds[0]);
1079 } else {
1080 /* there are multiple predecessors and the block is sealed */
1081 Temp ops[preds.size()];
1082
1083 /* we start assuming that the name is the same from all predecessors */
1084 renames[block->index][val.id()] = val;
1085 bool needs_phi = false;
1086
1087 /* get the rename from each predecessor and check if they are the same */
1088 for (unsigned i = 0; i < preds.size(); i++) {
1089 ops[i] = read_variable(val, preds[i]);
1090 if (i == 0)
1091 new_val = ops[i];
1092 else
1093 needs_phi |= !(new_val == ops[i]);
1094 }
1095
1096 if (needs_phi) {
1097 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1098 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1099 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1100 new_val = Temp{program->allocateId(), val.regClass()};
1101 phi->definitions[0] = Definition(new_val);
1102 for (unsigned i = 0; i < preds.size(); i++) {
1103 phi->operands[i] = Operand(ops[i]);
1104 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first);
1105 if (ops[i].regClass() == new_val.regClass())
1106 affinities[new_val.id()] = ops[i].id();
1107 }
1108 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1109 block->instructions.insert(block->instructions.begin(), std::move(phi));
1110 }
1111 }
1112
1113 renames[block->index][val.id()] = new_val;
1114 renames[block->index][new_val.id()] = new_val;
1115 ctx.orig_names[new_val.id()] = val;
1116 return new_val;
1117 };
1118
1119 try_remove_trivial_phi = [&] (std::map<unsigned, phi_info>::iterator info) -> Temp {
1120 assert(info->second.block_idx != 0);
1121 Instruction* phi = info->second.phi;
1122 Temp same = Temp();
1123
1124 Definition def = phi->definitions[0];
1125 /* a phi node is trivial if all operands are the same as the definition of the phi */
1126 for (const Operand& op : phi->operands) {
1127 const Temp t = op.getTemp();
1128 if (t == same || t == def.getTemp())
1129 continue;
1130 if (!(same == Temp()) || !(op.physReg() == def.physReg())) {
1131 /* phi is not trivial */
1132 return def.getTemp();
1133 }
1134 same = t;
1135 }
1136 assert(!(same == Temp() || same == def.getTemp()));
1137
1138 /* reroute all uses to same and remove phi */
1139 std::vector<std::map<unsigned, phi_info>::iterator> phi_users;
1140 std::map<unsigned, phi_info>::iterator same_phi_info = phi_map.find(same.id());
1141 for (Instruction* instr : info->second.uses) {
1142 assert(phi != instr);
1143 /* recursively try to remove trivial phis */
1144 if (is_phi(instr)) {
1145 /* ignore if the phi was already flagged trivial */
1146 if (instr->definitions.empty())
1147 continue;
1148
1149 std::map<unsigned, phi_info>::iterator it = phi_map.find(instr->definitions[0].tempId());
1150 if (it != phi_map.end() && it != info)
1151 phi_users.emplace_back(it);
1152 }
1153 for (Operand& op : instr->operands) {
1154 if (op.isTemp() && op.tempId() == def.tempId()) {
1155 op.setTemp(same);
1156 if (same_phi_info != phi_map.end())
1157 same_phi_info->second.uses.emplace(instr);
1158 }
1159 }
1160 }
1161
1162 auto it = ctx.orig_names.find(same.id());
1163 unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
1164 for (unsigned i = 0; i < program->blocks.size(); i++) {
1165 auto it = renames[i].find(orig_var);
1166 if (it != renames[i].end() && it->second == def.getTemp())
1167 renames[i][orig_var] = same;
1168 }
1169
1170 unsigned block_idx = info->second.block_idx;
1171 phi->definitions.clear(); /* this indicates that the phi can be removed */
1172 phi_map.erase(info);
1173 for (auto it : phi_users) {
1174 if (sealed[it->second.block_idx])
1175 try_remove_trivial_phi(it);
1176 }
1177
1178 /* due to the removal of other phis, the name might have changed once again! */
1179 return renames[block_idx][orig_var];
1180 };
1181
1182 std::map<unsigned, Instruction*> vectors;
1183 std::vector<std::vector<Temp>> phi_ressources;
1184 std::map<unsigned, unsigned> temp_to_phi_ressources;
1185
1186 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1187 Block& block = *it;
1188
1189 /* first, compute the death points of all live vars within the block */
1190 std::set<Temp>& live = live_out_per_block[block.index];
1191
1192 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1193 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1194 aco_ptr<Instruction>& instr = *rit;
1195 if (is_phi(instr)) {
1196 live.erase(instr->definitions[0].getTemp());
1197 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
1198 continue;
1199 /* collect information about affinity-related temporaries */
1200 std::vector<Temp> affinity_related;
1201 /* affinity_related[0] is the last seen affinity-related temp */
1202 affinity_related.emplace_back(instr->definitions[0].getTemp());
1203 affinity_related.emplace_back(instr->definitions[0].getTemp());
1204 for (const Operand& op : instr->operands) {
1205 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1206 affinity_related.emplace_back(op.getTemp());
1207 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1208 }
1209 }
1210 phi_ressources.emplace_back(std::move(affinity_related));
1211 continue;
1212 }
1213
1214 /* add vector affinities */
1215 if (instr->opcode == aco_opcode::p_create_vector) {
1216 for (const Operand& op : instr->operands) {
1217 if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1218 vectors[op.tempId()] = instr.get();
1219 }
1220 }
1221
1222 /* add operands to live variables */
1223 for (const Operand& op : instr->operands) {
1224 if (op.isTemp())
1225 live.emplace(op.getTemp());
1226 }
1227
1228 /* erase definitions from live */
1229 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1230 const Definition& def = instr->definitions[i];
1231 if (!def.isTemp())
1232 continue;
1233 live.erase(def.getTemp());
1234 /* mark last-seen phi operand */
1235 std::map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1236 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1237 phi_ressources[it->second][0] = def.getTemp();
1238 /* try to coalesce phi affinities with parallelcopies */
1239 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
1240 Operand op = instr->operands[i];
1241 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1242 phi_ressources[it->second].emplace_back(op.getTemp());
1243 temp_to_phi_ressources[op.tempId()] = it->second;
1244 }
1245 }
1246 }
1247 }
1248 }
1249 }
1250 /* create affinities */
1251 for (std::vector<Temp>& vec : phi_ressources) {
1252 assert(vec.size() > 1);
1253 for (unsigned i = 1; i < vec.size(); i++)
1254 if (vec[i].id() != vec[0].id())
1255 affinities[vec[i].id()] = vec[0].id();
1256 }
1257
1258 /* state of register file after phis */
1259 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1260
1261 for (Block& block : program->blocks) {
1262 std::set<Temp>& live = live_out_per_block[block.index];
1263 /* initialize register file */
1264 assert(block.index != 0 || live.empty());
1265 RegisterFile register_file;
1266 ctx.war_hint.reset();
1267
1268 for (Temp t : live) {
1269 Temp renamed = handle_live_in(t, &block);
1270 if (ctx.assignments.find(renamed.id()) != ctx.assignments.end())
1271 register_file.fill(ctx.assignments[renamed.id()].first, t.size(), renamed.id());
1272 }
1273
1274 std::vector<aco_ptr<Instruction>> instructions;
1275 std::vector<aco_ptr<Instruction>>::iterator it;
1276
1277 /* this is a slight adjustment from the paper as we already have phi nodes:
1278 * We consider them incomplete phis and only handle the definition. */
1279
1280 /* handle fixed phi definitions */
1281 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1282 aco_ptr<Instruction>& phi = *it;
1283 if (!is_phi(phi))
1284 break;
1285 Definition& definition = phi->definitions[0];
1286 if (!definition.isFixed())
1287 continue;
1288
1289 /* check if a dead exec mask phi is needed */
1290 if (definition.isKill()) {
1291 for (Operand& op : phi->operands) {
1292 assert(op.isTemp());
1293 if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1294 ctx.assignments[op.tempId()].first != exec) {
1295 definition.setKill(false);
1296 break;
1297 }
1298 }
1299 }
1300
1301 if (definition.isKill())
1302 continue;
1303
1304 assert(definition.physReg() == exec);
1305 assert(!register_file.test(definition.physReg(), definition.bytes()));
1306 register_file.fill(definition);
1307 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1308 }
1309
1310 /* look up the affinities */
1311 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1312 aco_ptr<Instruction>& phi = *it;
1313 if (!is_phi(phi))
1314 break;
1315 Definition& definition = phi->definitions[0];
1316 if (definition.isKill() || definition.isFixed())
1317 continue;
1318
1319 if (affinities.find(definition.tempId()) != affinities.end() &&
1320 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1321 assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass());
1322 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1323 bool try_use_special_reg = reg == scc || reg == exec;
1324 if (try_use_special_reg) {
1325 for (const Operand& op : phi->operands) {
1326 if (!op.isTemp() ||
1327 ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1328 !(ctx.assignments[op.tempId()].first == reg)) {
1329 try_use_special_reg = false;
1330 break;
1331 }
1332 }
1333 if (!try_use_special_reg)
1334 continue;
1335 }
1336 /* only assign if register is still free */
1337 if (!register_file.test(reg, definition.bytes())) {
1338 definition.setFixed(reg);
1339 register_file.fill(definition);
1340 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1341 }
1342 }
1343 }
1344
1345 /* find registers for phis without affinity or where the register was blocked */
1346 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1347 aco_ptr<Instruction>& phi = *it;
1348 if (!is_phi(phi))
1349 break;
1350
1351 Definition& definition = phi->definitions[0];
1352 if (definition.isKill())
1353 continue;
1354
1355 renames[block.index][definition.tempId()] = definition.getTemp();
1356
1357 if (!definition.isFixed()) {
1358 std::vector<std::pair<Operand, Definition>> parallelcopy;
1359 /* try to find a register that is used by at least one operand */
1360 for (const Operand& op : phi->operands) {
1361 if (!op.isTemp() ||
1362 ctx.assignments.find(op.tempId()) == ctx.assignments.end())
1363 continue;
1364 PhysReg reg = ctx.assignments[op.tempId()].first;
1365 /* we tried this already on the previous loop */
1366 if (reg == scc || reg == exec)
1367 continue;
1368 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1369 definition.setFixed(reg);
1370 break;
1371 }
1372 }
1373 if (!definition.isFixed())
1374 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi));
1375
1376 /* process parallelcopy */
1377 for (std::pair<Operand, Definition> pc : parallelcopy) {
1378 /* see if it's a copy from a different phi */
1379 //TODO: prefer moving some previous phis over live-ins
1380 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1381 Instruction *prev_phi = NULL;
1382 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1383 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1384 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1385 prev_phi = phi_it->get();
1386 }
1387 phi_it = it;
1388 while (!prev_phi && is_phi(*++phi_it)) {
1389 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1390 prev_phi = phi_it->get();
1391 }
1392 if (prev_phi) {
1393 /* if so, just update that phi's register */
1394 prev_phi->definitions[0].setFixed(pc.second.physReg());
1395 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1396 register_file.fill(pc.second.physReg(), pc.second.size(), prev_phi->definitions[0].tempId());
1397 continue;
1398 }
1399
1400 /* rename */
1401 std::map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1402 Temp orig = pc.first.getTemp();
1403 if (orig_it != ctx.orig_names.end())
1404 orig = orig_it->second;
1405 else
1406 ctx.orig_names[pc.second.tempId()] = orig;
1407 renames[block.index][orig.id()] = pc.second.getTemp();
1408 renames[block.index][pc.second.tempId()] = pc.second.getTemp();
1409
1410 /* otherwise, this is a live-in and we need to create a new phi
1411 * to move it in this block's predecessors */
1412 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1413 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1414 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1415 new_phi->definitions[0] = pc.second;
1416 for (unsigned i = 0; i < preds.size(); i++)
1417 new_phi->operands[i] = Operand(pc.first);
1418 instructions.emplace_back(std::move(new_phi));
1419 }
1420
1421 register_file.fill(definition);
1422 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1423 }
1424 live.emplace(definition.getTemp());
1425
1426 /* update phi affinities */
1427 for (const Operand& op : phi->operands) {
1428 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1429 affinities[op.tempId()] = definition.tempId();
1430 }
1431
1432 instructions.emplace_back(std::move(*it));
1433 }
1434
1435 /* fill in sgpr_live_in */
1436 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1437 sgpr_live_in[block.index][i] = register_file[i];
1438 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1439
1440 /* Handle all other instructions of the block */
1441 for (; it != block.instructions.end(); ++it) {
1442 aco_ptr<Instruction>& instr = *it;
1443
1444 /* parallelcopies from p_phi are inserted here which means
1445 * live ranges of killed operands end here as well */
1446 if (instr->opcode == aco_opcode::p_logical_end) {
1447 /* no need to process this instruction any further */
1448 if (block.logical_succs.size() != 1) {
1449 instructions.emplace_back(std::move(instr));
1450 continue;
1451 }
1452
1453 Block& succ = program->blocks[block.logical_succs[0]];
1454 unsigned idx = 0;
1455 for (; idx < succ.logical_preds.size(); idx++) {
1456 if (succ.logical_preds[idx] == block.index)
1457 break;
1458 }
1459 for (aco_ptr<Instruction>& phi : succ.instructions) {
1460 if (phi->opcode == aco_opcode::p_phi) {
1461 if (phi->operands[idx].isTemp() &&
1462 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1463 phi->operands[idx].isFirstKillBeforeDef()) {
1464 Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index);
1465 PhysReg reg = ctx.assignments[phi_op.id()].first;
1466 assert(register_file[reg] == phi_op.id());
1467 register_file[reg] = 0;
1468 }
1469 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1470 break;
1471 }
1472 }
1473 instructions.emplace_back(std::move(instr));
1474 continue;
1475 }
1476
1477 std::vector<std::pair<Operand, Definition>> parallelcopy;
1478
1479 assert(!is_phi(instr));
1480
1481 /* handle operands */
1482 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1483 auto& operand = instr->operands[i];
1484 if (!operand.isTemp())
1485 continue;
1486
1487 /* rename operands */
1488 operand.setTemp(read_variable(operand.getTemp(), block.index));
1489
1490 /* check if the operand is fixed */
1491 if (operand.isFixed()) {
1492
1493 if (operand.physReg() == ctx.assignments[operand.tempId()].first) {
1494 /* we are fine: the operand is already assigned the correct reg */
1495
1496 } else {
1497 /* check if target reg is blocked, and move away the blocking var */
1498 if (register_file[operand.physReg().reg()]) {
1499 uint32_t blocking_id = register_file[operand.physReg().reg()];
1500 RegClass rc = ctx.assignments[blocking_id].second;
1501 Operand pc_op = Operand(Temp{blocking_id, rc});
1502 pc_op.setFixed(operand.physReg());
1503 Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()});
1504 /* find free reg */
1505 PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr);
1506 pc_def.setFixed(reg);
1507 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1508 register_file.clear(pc_op);
1509 register_file.fill(pc_def);
1510 parallelcopy.emplace_back(pc_op, pc_def);
1511
1512 /* handle renames of previous operands */
1513 for (unsigned j = 0; j < i; j++) {
1514 Operand& op = instr->operands[j];
1515 if (op.isTemp() && op.tempId() == blocking_id) {
1516 op.setTemp(pc_def.getTemp());
1517 op.setFixed(reg);
1518 }
1519 }
1520 }
1521 /* move operand to fixed reg and create parallelcopy pair */
1522 Operand pc_op = operand;
1523 Temp tmp = Temp{program->allocateId(), operand.regClass()};
1524 Definition pc_def = Definition(tmp);
1525 pc_def.setFixed(operand.physReg());
1526 pc_op.setFixed(ctx.assignments[operand.tempId()].first);
1527 operand.setTemp(tmp);
1528 ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()};
1529 operand.setFixed(pc_def.physReg());
1530 register_file.clear(pc_op);
1531 register_file.fill(pc_def);
1532 parallelcopy.emplace_back(pc_op, pc_def);
1533 }
1534 } else {
1535 assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end());
1536 PhysReg reg = ctx.assignments[operand.tempId()].first;
1537
1538 if (operand_can_use_reg(instr, i, reg)) {
1539 operand.setFixed(ctx.assignments[operand.tempId()].first);
1540 } else {
1541 Operand pc_op = operand;
1542 pc_op.setFixed(reg);
1543 PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr);
1544 Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass());
1545 ctx.assignments[pc_def.tempId()] = {new_reg, pc_def.regClass()};
1546 register_file.clear(pc_op);
1547 register_file.fill(pc_def);
1548 parallelcopy.emplace_back(pc_op, pc_def);
1549 operand.setTemp(pc_def.getTemp());
1550 operand.setFixed(new_reg);
1551 }
1552
1553 if (instr->format == Format::EXP ||
1554 (instr->isVMEM() && i == 3 && program->chip_class == GFX6) ||
1555 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1556 for (unsigned j = 0; j < operand.size(); j++)
1557 ctx.war_hint.set(operand.physReg().reg() + j);
1558 }
1559 }
1560 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1561 if (phi != phi_map.end())
1562 phi->second.uses.emplace(instr.get());
1563
1564 }
1565 /* remove dead vars from register file */
1566 for (const Operand& op : instr->operands) {
1567 if (op.isTemp() && op.isFirstKillBeforeDef())
1568 register_file.clear(op);
1569 }
1570
1571 /* try to optimize v_mad_f32 -> v_mac_f32 */
1572 if (instr->opcode == aco_opcode::v_mad_f32 &&
1573 instr->operands[2].isTemp() &&
1574 instr->operands[2].isKillBeforeDef() &&
1575 instr->operands[2].getTemp().type() == RegType::vgpr &&
1576 instr->operands[1].isTemp() &&
1577 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1578 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1579 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1580 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1581 vop3->clamp || vop3->omod || vop3->opsel);
1582 if (can_use_mac) {
1583 instr->format = Format::VOP2;
1584 instr->opcode = aco_opcode::v_mac_f32;
1585 }
1586 }
1587
1588 /* handle definitions which must have the same register as an operand */
1589 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1590 instr->opcode == aco_opcode::v_mac_f32 ||
1591 instr->opcode == aco_opcode::v_writelane_b32 ||
1592 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1593 instr->definitions[0].setFixed(instr->operands[2].physReg());
1594 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1595 instr->opcode == aco_opcode::s_mulk_i32) {
1596 instr->definitions[0].setFixed(instr->operands[0].physReg());
1597 } else if (instr->format == Format::MUBUF &&
1598 instr->definitions.size() == 1 &&
1599 instr->operands.size() == 4) {
1600 instr->definitions[0].setFixed(instr->operands[3].physReg());
1601 } else if (instr->format == Format::MIMG &&
1602 instr->definitions.size() == 1 &&
1603 instr->operands[1].regClass().type() == RegType::vgpr) {
1604 instr->definitions[0].setFixed(instr->operands[1].physReg());
1605 }
1606
1607 ctx.defs_done.reset();
1608
1609 /* handle fixed definitions first */
1610 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1611 auto& definition = instr->definitions[i];
1612 if (!definition.isFixed())
1613 continue;
1614
1615 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1616 /* check if the target register is blocked */
1617 if (register_file[definition.physReg().reg()] != 0) {
1618 /* create parallelcopy pair to move blocking var */
1619 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second};
1620 Operand pc_op = Operand(tmp);
1621 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].first);
1622 RegClass rc = pc_op.regClass();
1623 tmp = Temp{program->allocateId(), rc};
1624 Definition pc_def = Definition(tmp);
1625
1626 /* re-enable the killed operands, so that we don't move the blocking var there */
1627 for (const Operand& op : instr->operands) {
1628 if (op.isTemp() && op.isFirstKillBeforeDef())
1629 register_file.fill(op.physReg(), op.size(), 0xFFFF);
1630 }
1631
1632 /* find a new register for the blocking variable */
1633 PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr);
1634 /* once again, disable killed operands */
1635 for (const Operand& op : instr->operands) {
1636 if (op.isTemp() && op.isFirstKillBeforeDef())
1637 register_file.clear(op);
1638 }
1639 for (unsigned k = 0; k < i; k++) {
1640 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1641 register_file.fill(instr->definitions[k]);
1642 }
1643 pc_def.setFixed(reg);
1644
1645 /* finish assignment of parallelcopy */
1646 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1647 parallelcopy.emplace_back(pc_op, pc_def);
1648
1649 /* add changes to reg_file */
1650 register_file.clear(pc_op);
1651 register_file.fill(pc_def);
1652 }
1653 ctx.defs_done.set(i);
1654
1655 if (!definition.isTemp())
1656 continue;
1657
1658 /* set live if it has a kill point */
1659 if (!definition.isKill())
1660 live.emplace(definition.getTemp());
1661
1662 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1663 renames[block.index][definition.tempId()] = definition.getTemp();
1664 register_file.fill(definition);
1665 }
1666
1667 /* handle all other definitions */
1668 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1669 auto& definition = instr->definitions[i];
1670
1671 if (definition.isFixed() || !definition.isTemp())
1672 continue;
1673
1674 /* find free reg */
1675 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1676 definition.setFixed(definition.physReg());
1677 else if (instr->opcode == aco_opcode::p_split_vector) {
1678 PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()};
1679 if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1680 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1681 definition.setFixed(reg);
1682 } else if (instr->opcode == aco_opcode::p_wqm) {
1683 PhysReg reg;
1684 if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1685 reg = instr->operands[0].physReg();
1686 assert(register_file[reg.reg()] == 0);
1687 } else {
1688 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1689 }
1690 definition.setFixed(reg);
1691 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1692 PhysReg reg;
1693 if (instr->operands[0].isKillBeforeDef() &&
1694 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1695 reg = instr->operands[0].physReg();
1696 reg = PhysReg(reg.reg() + definition.size() * instr->operands[1].constantValue());
1697 assert(register_file[reg.reg()] == 0);
1698 } else {
1699 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1700 }
1701 definition.setFixed(reg);
1702 } else if (instr->opcode == aco_opcode::p_create_vector) {
1703 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(),
1704 parallelcopy, instr);
1705 definition.setFixed(reg);
1706 } else if (affinities.find(definition.tempId()) != affinities.end() &&
1707 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1708 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1709 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1710 definition.setFixed(reg);
1711 else
1712 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1713
1714 } else if (vectors.find(definition.tempId()) != vectors.end()) {
1715 Instruction* vec = vectors[definition.tempId()];
1716 unsigned offset = 0;
1717 for (const Operand& op : vec->operands) {
1718 if (op.isTemp() && op.tempId() == definition.tempId())
1719 break;
1720 else
1721 offset += op.size();
1722 }
1723 unsigned k = 0;
1724 for (const Operand& op : vec->operands) {
1725 if (op.isTemp() &&
1726 op.tempId() != definition.tempId() &&
1727 op.getTemp().type() == definition.getTemp().type() &&
1728 ctx.assignments.find(op.tempId()) != ctx.assignments.end()) {
1729 PhysReg reg = ctx.assignments[op.tempId()].first;
1730 reg = PhysReg(reg.reg() - k + offset);
1731 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
1732 definition.setFixed(reg);
1733 break;
1734 }
1735 }
1736 k += op.size();
1737 }
1738 if (!definition.isFixed()) {
1739 std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
1740 PhysReg reg = res.first;
1741 if (res.second) {
1742 reg = PhysReg(reg.reg() + offset);
1743 } else {
1744 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1745 }
1746 definition.setFixed(reg);
1747 }
1748 } else
1749 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1750
1751 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1752 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1753 ctx.defs_done.set(i);
1754
1755 /* set live if it has a kill point */
1756 if (!definition.isKill())
1757 live.emplace(definition.getTemp());
1758
1759 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1760 renames[block.index][definition.tempId()] = definition.getTemp();
1761 register_file.fill(definition);
1762 }
1763
1764 handle_pseudo(ctx, register_file, instr.get());
1765
1766 /* kill definitions and late-kill operands */
1767 for (const Definition& def : instr->definitions) {
1768 if (def.isTemp() && def.isKill())
1769 register_file.clear(def);
1770 }
1771 for (const Operand& op : instr->operands) {
1772 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1773 register_file.clear(op);
1774 }
1775
1776 /* emit parallelcopy */
1777 if (!parallelcopy.empty()) {
1778 aco_ptr<Pseudo_instruction> pc;
1779 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1780 bool temp_in_scc = register_file[scc.reg()];
1781 bool sgpr_operands_alias_defs = false;
1782 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1783 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1784 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1785 if (!sgpr_operands_alias_defs) {
1786 unsigned reg = parallelcopy[i].first.physReg().reg();
1787 unsigned size = parallelcopy[i].first.getTemp().size();
1788 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1789
1790 reg = parallelcopy[i].second.physReg().reg();
1791 size = parallelcopy[i].second.getTemp().size();
1792 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1793 sgpr_operands_alias_defs = true;
1794 }
1795 }
1796
1797 pc->operands[i] = parallelcopy[i].first;
1798 pc->definitions[i] = parallelcopy[i].second;
1799 assert(pc->operands[i].size() == pc->definitions[i].size());
1800
1801 /* it might happen that the operand is already renamed. we have to restore the original name. */
1802 std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1803 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1804 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1805 renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1806 renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp();
1807
1808 std::map<unsigned, phi_info>::iterator phi = phi_map.find(pc->operands[i].tempId());
1809 if (phi != phi_map.end())
1810 phi->second.uses.emplace(pc.get());
1811 }
1812
1813 if (temp_in_scc && sgpr_operands_alias_defs) {
1814 /* disable definitions and re-enable operands */
1815 for (const Definition& def : instr->definitions) {
1816 if (def.isTemp() && !def.isKill())
1817 register_file.clear(def);
1818 }
1819 for (const Operand& op : instr->operands) {
1820 if (op.isTemp() && op.isFirstKill())
1821 register_file.fill(op.physReg(), op.size(), 0xFFFF);
1822 }
1823
1824 handle_pseudo(ctx, register_file, pc.get());
1825
1826 /* re-enable live vars */
1827 for (const Operand& op : instr->operands) {
1828 if (op.isTemp() && op.isFirstKill())
1829 register_file.clear(op);
1830 }
1831 for (const Definition& def : instr->definitions) {
1832 if (def.isTemp() && !def.isKill())
1833 register_file.fill(def);
1834 }
1835 } else {
1836 pc->tmp_in_scc = false;
1837 }
1838
1839 instructions.emplace_back(std::move(pc));
1840 }
1841
1842 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1843 bool instr_needs_vop3 = !instr->isVOP3() &&
1844 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1845 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1846 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1847 instr->opcode == aco_opcode::v_addc_co_u32 ||
1848 instr->opcode == aco_opcode::v_sub_co_u32 ||
1849 instr->opcode == aco_opcode::v_subb_co_u32 ||
1850 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1851 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1852 !(instr->definitions[1].physReg() == vcc)) ||
1853 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1854 instr->opcode == aco_opcode::v_subb_co_u32 ||
1855 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1856 !(instr->operands[2].physReg() == vcc)));
1857 if (instr_needs_vop3) {
1858
1859 /* if the first operand is a literal, we have to move it to a reg */
1860 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
1861 bool can_sgpr = true;
1862 /* check, if we have to move to vgpr */
1863 for (const Operand& op : instr->operands) {
1864 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1865 can_sgpr = false;
1866 break;
1867 }
1868 }
1869 aco_ptr<Instruction> mov;
1870 if (can_sgpr)
1871 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1872 else
1873 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1874 mov->operands[0] = instr->operands[0];
1875 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
1876 mov->definitions[0] = Definition(tmp);
1877 /* disable definitions and re-enable operands */
1878 for (const Definition& def : instr->definitions)
1879 register_file.clear(def);
1880 for (const Operand& op : instr->operands) {
1881 if (op.isTemp() && op.isFirstKill())
1882 register_file.fill(op.physReg(), op.size(), 0xFFFF);
1883 }
1884 mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov));
1885 instr->operands[0] = Operand(tmp);
1886 instr->operands[0].setFixed(mov->definitions[0].physReg());
1887 instructions.emplace_back(std::move(mov));
1888 /* re-enable live vars */
1889 for (const Operand& op : instr->operands) {
1890 if (op.isTemp() && op.isFirstKill())
1891 register_file.clear(op);
1892 }
1893 for (const Definition& def : instr->definitions) {
1894 if (def.isTemp() && !def.isKill())
1895 register_file.fill(def);
1896 }
1897 }
1898
1899 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1900 aco_ptr<Instruction> tmp = std::move(instr);
1901 Format format = asVOP3(tmp->format);
1902 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
1903 for (unsigned i = 0; i < instr->operands.size(); i++) {
1904 Operand& operand = tmp->operands[i];
1905 instr->operands[i] = operand;
1906 /* keep phi_map up to date */
1907 if (operand.isTemp()) {
1908 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.tempId());
1909 if (phi != phi_map.end()) {
1910 phi->second.uses.erase(tmp.get());
1911 phi->second.uses.emplace(instr.get());
1912 }
1913 }
1914 }
1915 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
1916 }
1917 instructions.emplace_back(std::move(*it));
1918
1919 } /* end for Instr */
1920
1921 block.instructions = std::move(instructions);
1922
1923 filled[block.index] = true;
1924 for (unsigned succ_idx : block.linear_succs) {
1925 Block& succ = program->blocks[succ_idx];
1926 /* seal block if all predecessors are filled */
1927 bool all_filled = true;
1928 for (unsigned pred_idx : succ.linear_preds) {
1929 if (!filled[pred_idx]) {
1930 all_filled = false;
1931 break;
1932 }
1933 }
1934 if (all_filled) {
1935 /* finish incomplete phis and check if they became trivial */
1936 for (Instruction* phi : incomplete_phis[succ_idx]) {
1937 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
1938 for (unsigned i = 0; i < phi->operands.size(); i++) {
1939 phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i]));
1940 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first);
1941 }
1942 try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId()));
1943 }
1944 /* complete the original phi nodes, but no need to check triviality */
1945 for (aco_ptr<Instruction>& instr : succ.instructions) {
1946 if (!is_phi(instr))
1947 break;
1948 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
1949
1950 for (unsigned i = 0; i < instr->operands.size(); i++) {
1951 auto& operand = instr->operands[i];
1952 if (!operand.isTemp())
1953 continue;
1954 operand.setTemp(read_variable(operand.getTemp(), preds[i]));
1955 operand.setFixed(ctx.assignments[operand.tempId()].first);
1956 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1957 if (phi != phi_map.end())
1958 phi->second.uses.emplace(instr.get());
1959 }
1960 }
1961 sealed[succ_idx] = true;
1962 }
1963 }
1964 } /* end for BB */
1965
1966 /* remove trivial phis */
1967 for (Block& block : program->blocks) {
1968 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
1969 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
1970 auto middle = std::remove_if(block.instructions.begin(), end,
1971 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
1972 block.instructions.erase(middle, end);
1973 }
1974
1975 /* find scc spill registers which may be needed for parallelcopies created by phis */
1976 for (Block& block : program->blocks) {
1977 if (block.linear_preds.size() <= 1)
1978 continue;
1979
1980 std::bitset<128> regs = sgpr_live_in[block.index];
1981 if (!regs[127])
1982 continue;
1983
1984 /* choose a register */
1985 int16_t reg = 0;
1986 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
1987 ;
1988 assert(reg < ctx.program->max_reg_demand.sgpr);
1989 adjust_max_used_regs(ctx, s1, reg);
1990
1991 /* update predecessors */
1992 for (unsigned& pred_index : block.linear_preds) {
1993 Block& pred = program->blocks[pred_index];
1994 pred.scc_live_out = true;
1995 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
1996 }
1997 }
1998
1999 /* num_gpr = rnd_up(max_used_gpr + 1) */
2000 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2001 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2002 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2003 else
2004 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2005 }
2006
2007 }