// aco: simplify operand handling in RA
// [mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37
38 namespace aco {
39 namespace {
40
/* Per-temporary register assignment: the physical register, the temp's
 * register class and a flag recording whether an assignment exists yet.
 * Note: `assigned` is a uint8_t used as a boolean; the converting ctor
 * stores -1, which narrows to 0xFF (truthy). */
struct assignment {
   PhysReg reg;
   RegClass rc;
   uint8_t assigned = 0; /* 0 = not yet assigned */
   assignment() = default;
   assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
};
48
/* Bookkeeping for one phi instruction: the phi itself, the index of the
 * block that contains it, and the set of instructions that use its
 * definition (so uses can be updated if the phi is renamed/eliminated). */
struct phi_info {
   Instruction* phi;
   unsigned block_idx;
   std::set<Instruction*> uses;
};
54
/* Mutable state shared by the whole register-allocation pass. */
struct ra_ctx {
   std::bitset<512> war_hint;                 /* registers to avoid (write-after-read hints) */
   Program* program;
   std::vector<assignment> assignments;       /* indexed by temp id */
   std::vector<std::unordered_map<unsigned, Temp>> renames;   /* per-block: orig temp id -> current temp */
   std::vector<std::vector<Instruction*>> incomplete_phis;    /* per-block; presumably phis awaiting
                                                               * operands from unsealed preds — confirm */
   std::vector<bool> filled;                  /* per-block flag (set when block processed — verify) */
   std::vector<bool> sealed;                  /* per-block flag (set when all preds known — verify) */
   std::unordered_map<unsigned, Temp> orig_names;  /* renamed temp id -> original temp */
   std::unordered_map<unsigned, phi_info> phi_map; /* temp id -> info about the defining phi */
   std::unordered_map<unsigned, unsigned> affinities; /* temp id -> temp id it should share a reg with */
   unsigned max_used_sgpr = 0;                /* highest sgpr index handed out so far */
   unsigned max_used_vgpr = 0;                /* highest vgpr index handed out so far */
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_ctx(Program* program) : program(program),
                              assignments(program->peekAllocationId()),
                              renames(program->blocks.size()),
                              incomplete_phis(program->blocks.size()),
                              filled(program->blocks.size()),
                              sealed(program->blocks.size()) {}
};
77
/* Tracks which physical registers are occupied and by what.
 *
 * Encoding of each dword entry in regs[]:
 *   0          - free
 *   0xFFFFFFFF - blocked (reserved; must not be allocated or moved away)
 *   0xF0000000 - occupied at sub-dword granularity; the per-byte owner ids
 *                live in subdword_regs[reg][byte]
 *   otherwise  - id of the temporary occupying the whole dword
 *
 * Indices 0..255 are SGPRs, 256..511 VGPRs (matching PhysReg numbering
 * used throughout this file).
 */
class RegisterFile {
public:
   RegisterFile() {regs.fill(0);}

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator [] (unsigned index) const {
      return regs[index];
   }

   uint32_t& operator [] (unsigned index) {
      return regs[index];
   }

   /* number of entirely-free dword registers in [start, start+size) */
   unsigned count_zero(PhysReg start, unsigned size) {
      unsigned res = 0;
      for (unsigned i = 0; i < size; i++)
         res += !regs[start + i];
      return res;
   }

   /* returns true if any byte in [start, start+num_bytes) is in use.
    * 0x0FFFFFFF masks out the subdword marker but still catches both
    * real ids and the 0xFFFFFFFF blocked value. */
   bool test(PhysReg start, unsigned num_bytes) {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            /* i.byte() is only non-zero on the first iteration; PhysReg(i + 1)
             * advances to the next full register */
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* reserve [start, start+num_bytes) so nothing gets allocated/moved there */
   void block(PhysReg start, unsigned num_bytes) {
      if (start.byte() || num_bytes % 4)
         fill_subdword(start, num_bytes, 0xFFFFFFFF);
      else
         fill(start, num_bytes / 4, 0xFFFFFFFF);
   }

   /* true if the register (or any of its bytes from start.byte() on) is blocked */
   bool is_blocked(PhysReg start) {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   void clear(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   /* mark the operand's registers as occupied by its temp id */
   void fill(Operand op) {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) {
      clear(op.physReg(), op.regClass());
   }

   /* mark the definition's registers as occupied by its temp id */
   void fill(Definition def) {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) {
      clear(def.physReg(), def.regClass());
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val) {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   /* write `val` into the per-byte map for [start, start+num_bytes); the
    * covered dwords are tagged 0xF0000000, and a dword whose four bytes all
    * become free is collapsed back to a plain free entry */
   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
183

/* helper function for debugging
 * (compiled out by default; dumps an occupancy map of the register file and
 * the current temp assignments for either the vgpr or sgpr half) */
#if 0
void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf("       ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
#endif
251
252
253 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
254 {
255 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
256 unsigned size = rc.size();
257 if (rc.type() == RegType::vgpr) {
258 assert(reg >= 256);
259 unsigned hi = reg - 256 + size - 1;
260 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
261 } else if (reg + rc.size() <= max_addressible_sgpr) {
262 unsigned hi = reg + size - 1;
263 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
264 }
265 }
266
267
/* Assign fresh temp ids to the parallelcopy definitions created so far and
 * rename the current instruction's operands to the copied temps.
 *
 * Copies created by get_regs_for_copies() arrive here without a definition
 * id; this function allocates one, registers the assignment, marks the
 * destination in reg_file and rewrites any operand of `instr` that referred
 * to the moved temp. */
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    aco_ptr<Instruction>& instr)
{
   /* allocate id's and rename operands: this is done transparently here */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;

      /* check if we moved another parallelcopy definition: if so, the source
       * of this copy must be redirected to that copy's original location */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (copy.first.getTemp() == other.second.getTemp()) {
            copy.first.setTemp(other.first.getTemp());
            copy.first.setFixed(other.first.physReg());
         }
      }
      // FIXME: if a definition got moved, change the target location and remove the parallelcopy
      copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      reg_file.fill(copy.second);

      /* check if we moved an operand */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* for p_create_vector, a live (non-killed) operand may keep its old
             * name as long as no copy destination overlaps its registers */
            bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg() ?
                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming)
               continue;
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());
         }
      }
   }
}
313
314 bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
315 {
316 return instr->isSDWA() || instr->format == Format::PSEUDO;
317 }
318
/* Try to find a free location for a variable of class `rc` inside [lb, ub)
 * without moving anything. Returns {reg, true} on success, {_, false}
 * otherwise.
 *
 * Strategy:
 *  - subdword classes: first try to reuse a byte-gap in an already
 *    partially-occupied register, then fall back to full registers;
 *  - stride == 1: best-fit, i.e. the smallest free gap that still holds
 *    the variable (reduces fragmentation);
 *  - stride > 1: first-fit at stride-aligned positions. */
std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
                                        RegisterFile& reg_file,
                                        uint32_t lb, uint32_t ub,
                                        uint32_t size, uint32_t stride,
                                        RegClass rc)
{
   if (rc.is_subdword()) {
      for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
         assert(reg_file[entry.first] == 0xF0000000);
         if (lb > entry.first || entry.first >= ub)
            continue;

         /* here, stride is in bytes (set by the caller for subdword classes) */
         for (unsigned i = 0; i < 4; i+= stride) {
            if (entry.second[i] != 0)
               continue;

            bool reg_found = true;
            for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
               reg_found &= entry.second[i + j] == 0;

            /* check neighboring reg if needed */
            reg_found &= (i <= 4 - rc.bytes() || reg_file[entry.first + 1] == 0);
            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               return {res, true};
            }
         }
      }

      stride = 1; /* stride in full registers */
   }

   /* best fit algorithm: find the smallest gap to fit in the variable */
   if (stride == 1) {
      unsigned best_pos = 0xFFFF;
      unsigned gap_size = 0xFFFF;
      unsigned next_pos = 0xFFFF; /* start of the gap currently being scanned */

      for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
         if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
            if (next_pos == 0xFFFF)
               continue;

            /* check if the variable fits */
            if (next_pos + size > current_reg) {
               next_pos = 0xFFFF;
               continue;
            }

            /* check if the tested gap is smaller */
            if (current_reg - next_pos < gap_size) {
               best_pos = next_pos;
               gap_size = current_reg - next_pos;
            }
            next_pos = 0xFFFF;
            continue;
         }

         if (next_pos == 0xFFFF)
            next_pos = current_reg;
      }

      /* final check: the last gap runs to the upper bound */
      if (next_pos != 0xFFFF &&
          next_pos + size <= ub &&
          ub - next_pos < gap_size) {
         best_pos = next_pos;
         gap_size = ub - next_pos;
      }
      if (best_pos != 0xFFFF) {
         adjust_max_used_regs(ctx, rc, best_pos);
         return {PhysReg{best_pos}, true};
      }
      return {{}, false};
   }

   /* first fit at stride-aligned positions */
   bool found = false;
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   while (!found && reg_lo + size <= ub) {
      if (reg_file[reg_lo] != 0) {
         reg_lo += stride;
         continue;
      }
      reg_hi = reg_lo + size - 1;
      found = true;
      for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
         if (reg_file[reg] != 0 || ctx.war_hint[reg])
            found = false;
      }
      if (found) {
         adjust_max_used_regs(ctx, rc, reg_lo);
         return {PhysReg{reg_lo}, true};
      }

      reg_lo += stride;
   }

   return {{}, false};
}
420
/* Collect all variables (temp ids) assigned inside [reg, reg+size) and
 * remove them from reg_file. Blocked registers are skipped. The returned
 * set is ordered by (size-in-bytes, id), which get_regs_for_copies relies
 * on to process larger variables first (it iterates in reverse). */
std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
                                                     PhysReg reg, unsigned size)
{
   std::set<std::pair<unsigned, unsigned>> vars;
   for (unsigned j = reg; j < reg + size; j++) {
      if (reg_file.is_blocked(PhysReg{j}))
         continue;
      if (reg_file[j] == 0xF0000000) {
         /* subdword register: collect each byte's owner; clearing an owner
          * may free the whole dword, in which case we can stop early */
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs[j][k];
            if (id) {
               assignment& var = ctx.assignments[id];
               vars.emplace(var.rc.bytes(), id);
               reg_file.clear(var.reg, var.rc);
               if (!reg_file[j])
                  break;
            }
         }
      } else if (reg_file[j] != 0) {
         unsigned id = reg_file[j];
         assignment& var = ctx.assignments[id];
         vars.emplace(var.rc.bytes(), id);
         reg_file.clear(var.reg, var.rc);
      }
   }
   return vars;
}
449
/* Find destinations for all variables in `vars` (which have already been
 * removed from reg_file) and append the required parallelcopies. The
 * definition area [def_reg_lo, def_reg_hi] must stay free unless the
 * variable is a dead operand of `instr`, in which case its space may be
 * reused. May recurse: making room for one variable can displace others.
 * Returns false if no placement could be found. */
bool get_regs_for_copies(ra_ctx& ctx,
                         RegisterFile& reg_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopies,
                         const std::set<std::pair<unsigned, unsigned>> &vars,
                         uint32_t lb, uint32_t ub,
                         aco_ptr<Instruction>& instr,
                         uint32_t def_reg_lo,
                         uint32_t def_reg_hi)
{

   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
      unsigned id = it->second;
      assignment& var = ctx.assignments[id];
      uint32_t size = var.rc.size();
      uint32_t stride = 1;
      if (var.rc.type() == RegType::sgpr) {
         if (size == 2)
            stride = 2;
         if (size > 3)
            stride = 4;
      }

      /* check if this is a dead operand, then we can re-use the space from the definition */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
            is_dead_operand = true;
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* dead create_vector operands must land exactly at their slot
             * inside the definition */
            for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  PhysReg reg(def_reg_lo);
                  reg.reg_b += offset;
                  assert(!reg_file.test(reg, var.rc.bytes()));
                  res = {reg, true};
                  break;
               }
            }
         } else {
            res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.rc);
         }
      } else {
         /* try below the definition area first, then above it */
         res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.rc);
         if (!res.second) {
            unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
            res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.rc);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc.bytes());

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free spot: search for the window that displaces the fewest regs */
      unsigned best_pos = lb;
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned reg_lo = lb;
      unsigned reg_hi = lb + size - 1;
      for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
         if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
                                  (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_lo;
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      reg_lo = best_pos;
      reg_hi = best_pos + size - 1;

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);

      /* mark the area as blocked */
      reg_file.block(PhysReg{reg_lo}, size * 4);

      /* recursively place the displaced variables */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
         return false;

      adjust_max_used_regs(ctx, var.rc, reg_lo);

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
609
610
/* Find a location for a definition of class `rc` by splitting live ranges:
 * scan [lb, ub) with a sliding window, pick the window that requires the
 * fewest moves, evict its occupants via get_regs_for_copies() and return
 * the window start. Returns {_, false} if no placement needing an
 * acceptable number of moves exists (the reg_file is then restored). */
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
                                      RegisterFile& reg_file,
                                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                                      uint32_t lb, uint32_t ub,
                                      uint32_t size, uint32_t stride,
                                      RegClass rc,
                                      aco_ptr<Instruction>& instr)
{
   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      if (instr->operands[j].isTemp() &&
          instr->operands[j].isFirstKillBeforeDef() &&
          instr->operands[j].physReg() >= lb &&
          instr->operands[j].physReg() < ub) {
         assert(instr->operands[j].isFixed());
         assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
         reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
         killed_ops += instr->operands[j].getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   unsigned best_pos = lb;
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
      /* first check the edges: this is what we have to fix to allow for num_moves > size */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* dead operands effectively reduce the number of estimated moves */
         if (reg_file.is_blocked(PhysReg{j})) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_pos = reg_lo;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF) {
      /* remove killed operands from reg_file once again */
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
            reg_file.clear(instr->operands[i]);
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   /* snapshot to restore if the eviction below fails */
   RegisterFile register_file = reg_file;

   /* now, we figured the placement for our definition */
   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);

   if (instr->opcode == aco_opcode::p_create_vector) {
      /* move killed operands which aren't yet at the correct position */
      for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
             instr->operands[i].getTemp().type() == rc.type()) {

            if (instr->operands[i].physReg() != best_pos + offset) {
               vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
               reg_file.clear(instr->operands[i]);
            } else {
               reg_file.fill(instr->operands[i]);
            }
         }
      }
   } else {
      /* re-enable the killed operands */
      for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
         if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
            reg_file.fill(instr->operands[j]);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
      /* eviction failed: roll back to the snapshot */
      reg_file = std::move(register_file);
      /* remove killed operands from reg_file once again */
      if (!is_phi(instr)) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               reg_file.clear(op);
         }
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition& def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   /* we set the definition regs == 0. the actual caller is responsible for correct setting */
   reg_file.clear(PhysReg{best_pos}, rc);

   update_renames(ctx, reg_file, parallelcopies, instr);

   /* remove killed operands from reg_file once again */
   for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
         continue;
      assert(!instr->operands[i].isUndefined());
      if (instr->operands[i].isFirstKillBeforeDef())
         reg_file.clear(instr->operands[i]);
   }
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition def = instr->definitions[i];
      if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
         reg_file.fill(def);
   }

   adjust_max_used_regs(ctx, rc, best_pos);
   return {PhysReg{best_pos}, true};
}
798
/* Main entry point for allocating a register for a definition of class `rc`.
 * Tries, in order: a free spot without moves (get_reg_simple), live-range
 * splitting (get_reg_impl), and finally raising the program's register
 * demand by one and retrying recursively. Never returns without a register
 * (aborts via unreachable() if even that fails). */
PhysReg get_reg(ra_ctx& ctx,
                RegisterFile& reg_file,
                RegClass rc,
                std::vector<std::pair<Operand, Definition>>& parallelcopies,
                aco_ptr<Instruction>& instr)
{
   uint32_t size = rc.size();
   uint32_t stride = 1;
   uint32_t lb, ub;
   if (rc.type() == RegType::vgpr) {
      lb = 256;
      ub = 256 + ctx.program->max_reg_demand.vgpr;
   } else {
      lb = 0;
      ub = ctx.program->max_reg_demand.sgpr;
      if (size == 2)
         stride = 2;
      else if (size >= 4)
         stride = 4;
   }

   if (rc.is_subdword()) {
      /* stride in bytes */
      if(!instr_can_access_subdword(instr))
         stride = 4;
      else if (rc.bytes() % 4 == 0)
         stride = 4;
      else if (rc.bytes() % 2 == 0)
         stride = 2;
   }

   std::pair<PhysReg, bool> res = {{}, false};
   /* try to find space without live-range splits */
   if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
      res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
   if (!res.second)
      res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
   if (res.second)
      return res.first;

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(PhysReg{lb}, ub-lb) >= size);

   uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
   uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
   if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
      return get_reg(ctx, reg_file, rc, parallelcopies, instr);
   } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
      update_vgpr_sgpr_demand(ctx.program,  RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
      return get_reg(ctx, reg_file, rc, parallelcopies, instr);
   }

   //FIXME: if nothing helps, shift-rotate the registers to make space

   unreachable("did not find a register");
}
865
866
867 std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
868 RegisterFile& reg_file,
869 RegClass rc)
870 {
871 uint32_t size = rc.size();
872 uint32_t stride = 1;
873 uint32_t lb, ub;
874 if (rc.type() == RegType::vgpr) {
875 lb = 256;
876 ub = 256 + ctx.program->max_reg_demand.vgpr;
877 } else {
878 lb = 0;
879 ub = ctx.program->max_reg_demand.sgpr;
880 if (size == 2)
881 stride = 2;
882 else if (size >= 4)
883 stride = 4;
884 }
885 return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
886 }
887
888
889 PhysReg get_reg_create_vector(ra_ctx& ctx,
890 RegisterFile& reg_file,
891 RegClass rc,
892 std::vector<std::pair<Operand, Definition>>& parallelcopies,
893 aco_ptr<Instruction>& instr)
894 {
895 /* create_vector instructions have different costs w.r.t. register coalescing */
896 uint32_t size = rc.size();
897 uint32_t bytes = rc.bytes();
898 uint32_t stride = 1;
899 uint32_t lb, ub;
900 if (rc.type() == RegType::vgpr) {
901 lb = 256;
902 ub = 256 + ctx.program->max_reg_demand.vgpr;
903 } else {
904 lb = 0;
905 ub = ctx.program->max_reg_demand.sgpr;
906 if (size == 2)
907 stride = 2;
908 else if (size >= 4)
909 stride = 4;
910 }
911
912 //TODO: improve p_create_vector for sub-dword vectors
913
914 unsigned best_pos = -1;
915 unsigned num_moves = 0xFF;
916 bool best_war_hint = true;
917
918 /* test for each operand which definition placement causes the least shuffle instructions */
919 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
920 // TODO: think about, if we can alias live operands on the same register
921 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
922 continue;
923
924 if (offset > instr->operands[i].physReg().reg_b)
925 continue;
926
927 unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
928 if (reg_lo % 4)
929 continue;
930 reg_lo /= 4;
931 unsigned reg_hi = reg_lo + size - 1;
932 unsigned k = 0;
933
934 /* no need to check multiple times */
935 if (reg_lo == best_pos)
936 continue;
937
938 /* check borders */
939 // TODO: this can be improved */
940 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
941 continue;
942 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
943 continue;
944 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
945 continue;
946
947 /* count variables to be moved and check war_hint */
948 bool war_hint = false;
949 bool linear_vgpr = false;
950 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
951 if (reg_file[j] != 0) {
952 if (reg_file[j] == 0xF0000000) {
953 PhysReg reg;
954 reg.reg_b = j * 4;
955 unsigned bytes_left = bytes - (j - reg_lo) * 4;
956 for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
957 k += reg_file.test(reg, 1);
958 } else {
959 k += 4;
960 /* we cannot split live ranges of linear vgprs */
961 if (ctx.assignments[reg_file[j]].rc & (1 << 6))
962 linear_vgpr = true;
963 }
964 }
965 war_hint |= ctx.war_hint[j];
966 }
967 if (linear_vgpr || (war_hint && !best_war_hint))
968 continue;
969
970 /* count operands in wrong positions */
971 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
972 if (j == i ||
973 !instr->operands[j].isTemp() ||
974 instr->operands[j].getTemp().type() != rc.type())
975 continue;
976 if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
977 k += instr->operands[j].bytes();
978 }
979 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
980 if (k > num_moves || (!aligned && k == num_moves))
981 continue;
982
983 best_pos = reg_lo;
984 num_moves = k;
985 best_war_hint = war_hint;
986 }
987
988 if (num_moves >= bytes)
989 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
990
991 /* collect variables to be moved */
992 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
993
994 /* move killed operands which aren't yet at the correct position */
995 uint64_t moved_operand_mask = 0;
996 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
997 if (instr->operands[i].isTemp() &&
998 instr->operands[i].isFirstKillBeforeDef() &&
999 instr->operands[i].getTemp().type() == rc.type() &&
1000 instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
1001 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1002 moved_operand_mask |= (uint64_t)1 << i;
1003 }
1004 }
1005
1006 ASSERTED bool success = false;
1007 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1008 assert(success);
1009
1010 update_renames(ctx, reg_file, parallelcopies, instr);
1011 adjust_max_used_regs(ctx, rc, best_pos);
1012
1013 while (moved_operand_mask) {
1014 unsigned i = u_bit_scan64(&moved_operand_mask);
1015 assert(instr->operands[i].isFirstKillBeforeDef());
1016 reg_file.clear(instr->operands[i]);
1017 }
1018
1019 return PhysReg{best_pos};
1020 }
1021
1022 bool get_reg_specified(ra_ctx& ctx,
1023 RegisterFile& reg_file,
1024 RegClass rc,
1025 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1026 aco_ptr<Instruction>& instr,
1027 PhysReg reg)
1028 {
1029 uint32_t size = rc.size();
1030 uint32_t stride = 1;
1031 uint32_t lb, ub;
1032
1033 if (rc.type() == RegType::vgpr) {
1034 lb = 256;
1035 ub = 256 + ctx.program->max_reg_demand.vgpr;
1036 } else {
1037 if (size == 2)
1038 stride = 2;
1039 else if (size >= 4)
1040 stride = 4;
1041 if (reg % stride != 0)
1042 return false;
1043 lb = 0;
1044 ub = ctx.program->max_reg_demand.sgpr;
1045 }
1046
1047 if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
1048 return false;
1049
1050 uint32_t reg_lo = reg.reg();
1051 uint32_t reg_hi = reg + (size - 1);
1052
1053 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
1054 return false;
1055
1056 if (reg_file.test(reg, rc.bytes()))
1057 return false;
1058
1059 adjust_max_used_regs(ctx, rc, reg_lo);
1060 return true;
1061 }
1062
1063 void handle_pseudo(ra_ctx& ctx,
1064 const RegisterFile& reg_file,
1065 Instruction* instr)
1066 {
1067 if (instr->format != Format::PSEUDO)
1068 return;
1069
1070 /* all instructions which use handle_operands() need this information */
1071 switch (instr->opcode) {
1072 case aco_opcode::p_extract_vector:
1073 case aco_opcode::p_create_vector:
1074 case aco_opcode::p_split_vector:
1075 case aco_opcode::p_parallelcopy:
1076 case aco_opcode::p_wqm:
1077 break;
1078 default:
1079 return;
1080 }
1081
1082 /* if all definitions are vgpr, no need to care for SCC */
1083 bool writes_sgpr = false;
1084 for (Definition& def : instr->definitions) {
1085 if (def.getTemp().type() == RegType::sgpr) {
1086 writes_sgpr = true;
1087 break;
1088 }
1089 }
1090 /* if all operands are constant, no need to care either */
1091 bool reads_sgpr = false;
1092 for (Operand& op : instr->operands) {
1093 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1094 reads_sgpr = true;
1095 break;
1096 }
1097 }
1098 if (!(writes_sgpr && reads_sgpr))
1099 return;
1100
1101 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1102 if (reg_file[scc.reg()]) {
1103 pi->tmp_in_scc = true;
1104
1105 int reg = ctx.max_used_sgpr;
1106 for (; reg >= 0 && reg_file[reg]; reg--)
1107 ;
1108 if (reg < 0) {
1109 reg = ctx.max_used_sgpr + 1;
1110 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1111 ;
1112 assert(reg < ctx.program->max_reg_demand.sgpr);
1113 }
1114
1115 adjust_max_used_regs(ctx, s1, reg);
1116 pi->scratch_sgpr = PhysReg{(unsigned)reg};
1117 } else {
1118 pi->tmp_in_scc = false;
1119 }
1120 }
1121
1122 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1123 {
1124 if (instr->operands[idx].isFixed())
1125 return instr->operands[idx].physReg() == reg;
1126
1127 if (!instr_can_access_subdword(instr) && reg.byte())
1128 return false;
1129
1130 switch (instr->format) {
1131 case Format::SMEM:
1132 return reg != scc &&
1133 reg != exec &&
1134 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1135 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1136 default:
1137 // TODO: there are more instructions with restrictions on registers
1138 return true;
1139 }
1140 }
1141
1142 void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
1143 std::vector<std::pair<Operand, Definition>>& parallelcopy,
1144 aco_ptr<Instruction>& instr, Operand& operand)
1145 {
1146 /* check if the operand is fixed */
1147 PhysReg dst;
1148 if (operand.isFixed()) {
1149 assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);
1150
1151 /* check if target reg is blocked, and move away the blocking var */
1152 if (register_file[operand.physReg().reg()]) {
1153 assert(register_file[operand.physReg()] != 0xF0000000);
1154 uint32_t blocking_id = register_file[operand.physReg().reg()];
1155 RegClass rc = ctx.assignments[blocking_id].rc;
1156 Operand pc_op = Operand(Temp{blocking_id, rc});
1157 pc_op.setFixed(operand.physReg());
1158
1159 /* find free reg */
1160 PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr);
1161 Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
1162 register_file.clear(pc_op);
1163 parallelcopy.emplace_back(pc_op, pc_def);
1164 }
1165 dst = operand.physReg();
1166
1167 } else {
1168 dst = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr);
1169 }
1170
1171 Operand pc_op = operand;
1172 pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
1173 Definition pc_def = Definition(dst, pc_op.regClass());
1174 register_file.clear(pc_op);
1175 parallelcopy.emplace_back(pc_op, pc_def);
1176 update_renames(ctx, register_file, parallelcopy, instr);
1177 }
1178
1179 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1180 {
1181 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1182 if (it == ctx.renames[block_idx].end())
1183 return val;
1184 else
1185 return it->second;
1186 }
1187
1188 Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
1189 {
1190 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1191 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
1192 return val;
1193
1194 assert(preds.size() > 0);
1195
1196 Temp new_val;
1197 if (!ctx.sealed[block->index]) {
1198 /* consider rename from already processed predecessor */
1199 Temp tmp = read_variable(ctx, val, preds[0]);
1200
1201 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1202 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1203 ctx.assignments.emplace_back();
1204 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1205 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1206 phi->definitions[0] = Definition(new_val);
1207 for (unsigned i = 0; i < preds.size(); i++)
1208 phi->operands[i] = Operand(val);
1209 if (tmp.regClass() == new_val.regClass())
1210 ctx.affinities[new_val.id()] = tmp.id();
1211
1212 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1213 ctx.incomplete_phis[block->index].emplace_back(phi.get());
1214 block->instructions.insert(block->instructions.begin(), std::move(phi));
1215
1216 } else if (preds.size() == 1) {
1217 /* if the block has only one predecessor, just look there for the name */
1218 new_val = read_variable(ctx, val, preds[0]);
1219 } else {
1220 /* there are multiple predecessors and the block is sealed */
1221 Temp ops[preds.size()];
1222
1223 /* get the rename from each predecessor and check if they are the same */
1224 bool needs_phi = false;
1225 for (unsigned i = 0; i < preds.size(); i++) {
1226 ops[i] = read_variable(ctx, val, preds[i]);
1227 if (i == 0)
1228 new_val = ops[i];
1229 else
1230 needs_phi |= !(new_val == ops[i]);
1231 }
1232
1233 if (needs_phi) {
1234 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1235 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1236 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1237 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1238 phi->definitions[0] = Definition(new_val);
1239 for (unsigned i = 0; i < preds.size(); i++) {
1240 phi->operands[i] = Operand(ops[i]);
1241 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
1242 if (ops[i].regClass() == new_val.regClass())
1243 ctx.affinities[new_val.id()] = ops[i].id();
1244 }
1245 ctx.assignments.emplace_back();
1246 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1247 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1248 block->instructions.insert(block->instructions.begin(), std::move(phi));
1249 }
1250 }
1251
1252 if (new_val != val) {
1253 ctx.renames[block->index][val.id()] = new_val;
1254 ctx.orig_names[new_val.id()] = val;
1255 }
1256 return new_val;
1257 }
1258
/* Removes the phi defining temp if it is trivial, i.e. every operand is
 * either the phi's own definition or one single other value ("same").
 * All uses of the phi are rerouted to that value and the per-block renames
 * are updated. Since removing a phi can make phis that used it trivial in
 * turn, those user-phis are re-checked recursively. */
void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
{
   std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());

   /* only phis in sealed blocks have their final operand set and can be
    * judged trivial */
   if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
      return;

   assert(info->second.block_idx != 0);
   Instruction* phi = info->second.phi;
   Temp same = Temp();
   Definition def = phi->definitions[0];

   /* a phi node is trivial if all operands are the same as the definition of the phi */
   for (const Operand& op : phi->operands) {
      const Temp t = op.getTemp();
      if (t == same || t == def.getTemp()) {
         assert(t == same || op.physReg() == def.physReg());
         continue;
      }
      /* a second distinct operand value means the phi is not trivial */
      if (same != Temp())
         return;

      same = t;
   }
   assert(same != Temp() || same == def.getTemp());

   /* reroute all uses to same and remove phi */
   std::vector<Temp> phi_users;
   std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
   for (Instruction* instr : info->second.uses) {
      assert(phi != instr);
      /* recursively try to remove trivial phis */
      if (is_phi(instr)) {
         /* ignore if the phi was already flagged trivial */
         if (instr->definitions.empty())
            continue;

         /* remember user-phis: they are re-checked after this phi is gone */
         if (instr->definitions[0].getTemp() != temp)
            phi_users.emplace_back(instr->definitions[0].getTemp());
      }
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.tempId() == def.tempId()) {
            op.setTemp(same);
            /* the rerouted use now counts as a use of same's phi (if any) */
            if (same_phi_info != ctx.phi_map.end())
               same_phi_info->second.uses.emplace(instr);
         }
      }
   }

   /* redirect all block-level renames that pointed at the removed phi's
    * definition so future lookups resolve to same */
   auto it = ctx.orig_names.find(same.id());
   unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
   for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
      auto it = ctx.renames[i].find(orig_var);
      if (it != ctx.renames[i].end() && it->second == def.getTemp())
         ctx.renames[i][orig_var] = same;
   }

   phi->definitions.clear(); /* this indicates that the phi can be removed */
   ctx.phi_map.erase(info);
   /* removing this phi may have made the collected user-phis trivial too */
   for (Temp t : phi_users)
      try_remove_trivial_phi(ctx, t);

   return;
}
1323
1324 } /* end namespace */
1325
1326
1327 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
1328 {
1329 ra_ctx ctx(program);
1330
1331 std::unordered_map<unsigned, Instruction*> vectors;
1332 std::vector<std::vector<Temp>> phi_ressources;
1333 std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1334
1335 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1336 Block& block = *it;
1337
1338 /* first, compute the death points of all live vars within the block */
1339 TempSet& live = live_out_per_block[block.index];
1340
1341 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1342 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1343 aco_ptr<Instruction>& instr = *rit;
1344 if (is_phi(instr)) {
1345 live.erase(instr->definitions[0].getTemp());
1346 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
1347 continue;
1348 /* collect information about affinity-related temporaries */
1349 std::vector<Temp> affinity_related;
1350 /* affinity_related[0] is the last seen affinity-related temp */
1351 affinity_related.emplace_back(instr->definitions[0].getTemp());
1352 affinity_related.emplace_back(instr->definitions[0].getTemp());
1353 for (const Operand& op : instr->operands) {
1354 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1355 affinity_related.emplace_back(op.getTemp());
1356 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1357 }
1358 }
1359 phi_ressources.emplace_back(std::move(affinity_related));
1360 continue;
1361 }
1362
1363 /* add vector affinities */
1364 if (instr->opcode == aco_opcode::p_create_vector) {
1365 for (const Operand& op : instr->operands) {
1366 if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1367 vectors[op.tempId()] = instr.get();
1368 }
1369 }
1370
1371 /* add operands to live variables */
1372 for (const Operand& op : instr->operands) {
1373 if (op.isTemp())
1374 live.emplace(op.getTemp());
1375 }
1376
1377 /* erase definitions from live */
1378 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1379 const Definition& def = instr->definitions[i];
1380 if (!def.isTemp())
1381 continue;
1382 live.erase(def.getTemp());
1383 /* mark last-seen phi operand */
1384 std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1385 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1386 phi_ressources[it->second][0] = def.getTemp();
1387 /* try to coalesce phi affinities with parallelcopies */
1388 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
1389 Operand op = instr->operands[i];
1390 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1391 phi_ressources[it->second].emplace_back(op.getTemp());
1392 temp_to_phi_ressources[op.tempId()] = it->second;
1393 }
1394 }
1395 }
1396 }
1397 }
1398 }
1399 /* create affinities */
1400 for (std::vector<Temp>& vec : phi_ressources) {
1401 assert(vec.size() > 1);
1402 for (unsigned i = 1; i < vec.size(); i++)
1403 if (vec[i].id() != vec[0].id())
1404 ctx.affinities[vec[i].id()] = vec[0].id();
1405 }
1406
1407 /* state of register file after phis */
1408 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1409
1410 for (Block& block : program->blocks) {
1411 TempSet& live = live_out_per_block[block.index];
1412 /* initialize register file */
1413 assert(block.index != 0 || live.empty());
1414 RegisterFile register_file;
1415 ctx.war_hint.reset();
1416
1417 for (Temp t : live) {
1418 Temp renamed = handle_live_in(ctx, t, &block);
1419 assignment& var = ctx.assignments[renamed.id()];
1420 /* due to live-range splits, the live-in might be a phi, now */
1421 if (var.assigned)
1422 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1423 }
1424
1425 std::vector<aco_ptr<Instruction>> instructions;
1426 std::vector<aco_ptr<Instruction>>::iterator it;
1427
1428 /* this is a slight adjustment from the paper as we already have phi nodes:
1429 * We consider them incomplete phis and only handle the definition. */
1430
1431 /* handle fixed phi definitions */
1432 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1433 aco_ptr<Instruction>& phi = *it;
1434 if (!is_phi(phi))
1435 break;
1436 Definition& definition = phi->definitions[0];
1437 if (!definition.isFixed())
1438 continue;
1439
1440 /* check if a dead exec mask phi is needed */
1441 if (definition.isKill()) {
1442 for (Operand& op : phi->operands) {
1443 assert(op.isTemp());
1444 if (!ctx.assignments[op.tempId()].assigned ||
1445 ctx.assignments[op.tempId()].reg != exec) {
1446 definition.setKill(false);
1447 break;
1448 }
1449 }
1450 }
1451
1452 if (definition.isKill())
1453 continue;
1454
1455 assert(definition.physReg() == exec);
1456 assert(!register_file.test(definition.physReg(), definition.bytes()));
1457 register_file.fill(definition);
1458 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1459 }
1460
1461 /* look up the affinities */
1462 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1463 aco_ptr<Instruction>& phi = *it;
1464 if (!is_phi(phi))
1465 break;
1466 Definition& definition = phi->definitions[0];
1467 if (definition.isKill() || definition.isFixed())
1468 continue;
1469
1470 if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1471 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1472 assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1473 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1474 bool try_use_special_reg = reg == scc || reg == exec;
1475 if (try_use_special_reg) {
1476 for (const Operand& op : phi->operands) {
1477 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1478 ctx.assignments[op.tempId()].reg == reg)) {
1479 try_use_special_reg = false;
1480 break;
1481 }
1482 }
1483 if (!try_use_special_reg)
1484 continue;
1485 }
1486 /* only assign if register is still free */
1487 if (!register_file.test(reg, definition.bytes())) {
1488 definition.setFixed(reg);
1489 register_file.fill(definition);
1490 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1491 }
1492 }
1493 }
1494
1495 /* find registers for phis without affinity or where the register was blocked */
1496 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1497 aco_ptr<Instruction>& phi = *it;
1498 if (!is_phi(phi))
1499 break;
1500
1501 Definition& definition = phi->definitions[0];
1502 if (definition.isKill())
1503 continue;
1504
1505 if (!definition.isFixed()) {
1506 std::vector<std::pair<Operand, Definition>> parallelcopy;
1507 /* try to find a register that is used by at least one operand */
1508 for (const Operand& op : phi->operands) {
1509 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1510 continue;
1511 PhysReg reg = ctx.assignments[op.tempId()].reg;
1512 /* we tried this already on the previous loop */
1513 if (reg == scc || reg == exec)
1514 continue;
1515 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1516 definition.setFixed(reg);
1517 break;
1518 }
1519 }
1520 if (!definition.isFixed())
1521 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi));
1522
1523 /* process parallelcopy */
1524 for (std::pair<Operand, Definition> pc : parallelcopy) {
1525 /* see if it's a copy from a different phi */
1526 //TODO: prefer moving some previous phis over live-ins
1527 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1528 Instruction *prev_phi = NULL;
1529 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1530 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1531 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1532 prev_phi = phi_it->get();
1533 }
1534 phi_it = it;
1535 while (!prev_phi && is_phi(*++phi_it)) {
1536 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1537 prev_phi = phi_it->get();
1538 }
1539 if (prev_phi) {
1540 /* if so, just update that phi's register */
1541 register_file.clear(prev_phi->definitions[0]);
1542 prev_phi->definitions[0].setFixed(pc.second.physReg());
1543 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1544 register_file.fill(prev_phi->definitions[0]);
1545 continue;
1546 }
1547
1548 /* rename */
1549 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1550 Temp orig = pc.first.getTemp();
1551 if (orig_it != ctx.orig_names.end())
1552 orig = orig_it->second;
1553 else
1554 ctx.orig_names[pc.second.tempId()] = orig;
1555 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1556
1557 /* otherwise, this is a live-in and we need to create a new phi
1558 * to move it in this block's predecessors */
1559 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1560 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1561 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1562 new_phi->definitions[0] = pc.second;
1563 for (unsigned i = 0; i < preds.size(); i++)
1564 new_phi->operands[i] = Operand(pc.first);
1565 instructions.emplace_back(std::move(new_phi));
1566 }
1567
1568 register_file.fill(definition);
1569 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1570 }
1571 live.emplace(definition.getTemp());
1572
1573 /* update phi affinities */
1574 for (const Operand& op : phi->operands) {
1575 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1576 ctx.affinities[op.tempId()] = definition.tempId();
1577 }
1578
1579 instructions.emplace_back(std::move(*it));
1580 }
1581
1582 /* fill in sgpr_live_in */
1583 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1584 sgpr_live_in[block.index][i] = register_file[i];
1585 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1586
1587 /* Handle all other instructions of the block */
1588 for (; it != block.instructions.end(); ++it) {
1589 aco_ptr<Instruction>& instr = *it;
1590
1591 /* parallelcopies from p_phi are inserted here which means
1592 * live ranges of killed operands end here as well */
1593 if (instr->opcode == aco_opcode::p_logical_end) {
1594 /* no need to process this instruction any further */
1595 if (block.logical_succs.size() != 1) {
1596 instructions.emplace_back(std::move(instr));
1597 continue;
1598 }
1599
1600 Block& succ = program->blocks[block.logical_succs[0]];
1601 unsigned idx = 0;
1602 for (; idx < succ.logical_preds.size(); idx++) {
1603 if (succ.logical_preds[idx] == block.index)
1604 break;
1605 }
1606 for (aco_ptr<Instruction>& phi : succ.instructions) {
1607 if (phi->opcode == aco_opcode::p_phi) {
1608 if (phi->operands[idx].isTemp() &&
1609 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1610 phi->operands[idx].isFirstKillBeforeDef()) {
1611 Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
1612 PhysReg reg = ctx.assignments[phi_op.id()].reg;
1613 assert(register_file[reg] == phi_op.id());
1614 register_file[reg] = 0;
1615 }
1616 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1617 break;
1618 }
1619 }
1620 instructions.emplace_back(std::move(instr));
1621 continue;
1622 }
1623
1624 std::vector<std::pair<Operand, Definition>> parallelcopy;
1625
1626 assert(!is_phi(instr));
1627
1628 /* handle operands */
1629 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1630 auto& operand = instr->operands[i];
1631 if (!operand.isTemp())
1632 continue;
1633
1634 /* rename operands */
1635 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
1636 assert(ctx.assignments[operand.tempId()].assigned);
1637
1638 PhysReg reg = ctx.assignments[operand.tempId()].reg;
1639 if (operand_can_use_reg(instr, i, reg))
1640 operand.setFixed(reg);
1641 else
1642 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
1643
1644 if (instr->format == Format::EXP ||
1645 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
1646 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1647 for (unsigned j = 0; j < operand.size(); j++)
1648 ctx.war_hint.set(operand.physReg().reg() + j);
1649 }
1650
1651 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
1652 if (phi != ctx.phi_map.end())
1653 phi->second.uses.emplace(instr.get());
1654 }
1655
1656 /* remove dead vars from register file */
1657 for (const Operand& op : instr->operands) {
1658 if (op.isTemp() && op.isFirstKillBeforeDef())
1659 register_file.clear(op);
1660 }
1661
1662 /* try to optimize v_mad_f32 -> v_mac_f32 */
1663 if (instr->opcode == aco_opcode::v_mad_f32 &&
1664 instr->operands[2].isTemp() &&
1665 instr->operands[2].isKillBeforeDef() &&
1666 instr->operands[2].getTemp().type() == RegType::vgpr &&
1667 instr->operands[1].isTemp() &&
1668 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1669 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1670 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1671 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1672 vop3->clamp || vop3->omod || vop3->opsel);
1673 if (can_use_mac) {
1674 instr->format = Format::VOP2;
1675 instr->opcode = aco_opcode::v_mac_f32;
1676 }
1677 }
1678
1679 /* handle definitions which must have the same register as an operand */
1680 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1681 instr->opcode == aco_opcode::v_mac_f32 ||
1682 instr->opcode == aco_opcode::v_writelane_b32 ||
1683 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1684 instr->definitions[0].setFixed(instr->operands[2].physReg());
1685 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1686 instr->opcode == aco_opcode::s_mulk_i32) {
1687 instr->definitions[0].setFixed(instr->operands[0].physReg());
1688 } else if (instr->format == Format::MUBUF &&
1689 instr->definitions.size() == 1 &&
1690 instr->operands.size() == 4) {
1691 instr->definitions[0].setFixed(instr->operands[3].physReg());
1692 } else if (instr->format == Format::MIMG &&
1693 instr->definitions.size() == 1 &&
1694 instr->operands[1].regClass().type() == RegType::vgpr) {
1695 instr->definitions[0].setFixed(instr->operands[1].physReg());
1696 }
1697
1698 ctx.defs_done.reset();
1699
1700 /* handle fixed definitions first */
1701 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1702 auto& definition = instr->definitions[i];
1703 if (!definition.isFixed())
1704 continue;
1705
1706 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1707 /* check if the target register is blocked */
1708 if (register_file[definition.physReg().reg()] != 0) {
1709 /* create parallelcopy pair to move blocking var */
1710 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
1711 Operand pc_op = Operand(tmp);
1712 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
1713 RegClass rc = pc_op.regClass();
1714 tmp = Temp{program->allocateId(), rc};
1715 Definition pc_def = Definition(tmp);
1716
1717 /* re-enable the killed operands, so that we don't move the blocking var there */
1718 for (const Operand& op : instr->operands) {
1719 if (op.isTemp() && op.isFirstKillBeforeDef())
1720 register_file.fill(op);
1721 }
1722
1723 /* find a new register for the blocking variable */
1724 PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr);
1725 /* once again, disable killed operands */
1726 for (const Operand& op : instr->operands) {
1727 if (op.isTemp() && op.isFirstKillBeforeDef())
1728 register_file.clear(op);
1729 }
1730 for (unsigned k = 0; k < i; k++) {
1731 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1732 register_file.fill(instr->definitions[k]);
1733 }
1734 pc_def.setFixed(reg);
1735
1736 /* finish assignment of parallelcopy */
1737 ctx.assignments.emplace_back(reg, pc_def.regClass());
1738 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1739 parallelcopy.emplace_back(pc_op, pc_def);
1740
1741 /* add changes to reg_file */
1742 register_file.clear(pc_op);
1743 register_file.fill(pc_def);
1744 }
1745 ctx.defs_done.set(i);
1746
1747 if (!definition.isTemp())
1748 continue;
1749
1750 /* set live if it has a kill point */
1751 if (!definition.isKill())
1752 live.emplace(definition.getTemp());
1753
1754 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1755 register_file.fill(definition);
1756 }
1757
1758 /* handle all other definitions */
1759 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1760 auto& definition = instr->definitions[i];
1761
1762 if (definition.isFixed() || !definition.isTemp())
1763 continue;
1764
1765 /* find free reg */
1766 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1767 definition.setFixed(definition.physReg());
1768 else if (instr->opcode == aco_opcode::p_split_vector) {
1769 PhysReg reg = instr->operands[0].physReg();
1770 reg.reg_b += i * definition.bytes();
1771 if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1772 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1773 definition.setFixed(reg);
1774 } else if (instr->opcode == aco_opcode::p_wqm) {
1775 PhysReg reg;
1776 if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1777 reg = instr->operands[0].physReg();
1778 assert(register_file[reg.reg()] == 0);
1779 } else {
1780 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1781 }
1782 definition.setFixed(reg);
1783 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1784 PhysReg reg;
1785 if (instr->operands[0].isKillBeforeDef() &&
1786 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1787 reg = instr->operands[0].physReg();
1788 reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
1789 assert(!register_file.test(reg, definition.bytes()));
1790 } else {
1791 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1792 }
1793 definition.setFixed(reg);
1794 } else if (instr->opcode == aco_opcode::p_create_vector) {
1795 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(),
1796 parallelcopy, instr);
1797 definition.setFixed(reg);
1798 } else if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1799 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1800 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1801 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1802 definition.setFixed(reg);
1803 else
1804 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1805
1806 } else if (vectors.find(definition.tempId()) != vectors.end()) {
1807 Instruction* vec = vectors[definition.tempId()];
1808 unsigned byte_offset = 0;
1809 for (const Operand& op : vec->operands) {
1810 if (op.isTemp() && op.tempId() == definition.tempId())
1811 break;
1812 else
1813 byte_offset += op.bytes();
1814 }
1815 unsigned k = 0;
1816 for (const Operand& op : vec->operands) {
1817 if (op.isTemp() &&
1818 op.tempId() != definition.tempId() &&
1819 op.getTemp().type() == definition.getTemp().type() &&
1820 ctx.assignments[op.tempId()].assigned) {
1821 PhysReg reg = ctx.assignments[op.tempId()].reg;
1822 reg.reg_b += (byte_offset - k);
1823 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
1824 definition.setFixed(reg);
1825 break;
1826 }
1827 }
1828 k += op.bytes();
1829 }
1830 if (!definition.isFixed()) {
1831 std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
1832 PhysReg reg = res.first;
1833 if (res.second) {
1834 reg.reg_b += byte_offset;
1835 /* make sure to only use byte offset if the instruction supports it */
1836 if (vec->definitions[0].regClass().is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
1837 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1838 } else {
1839 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1840 }
1841 definition.setFixed(reg);
1842 }
1843 } else
1844 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1845
1846 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1847 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1848 ctx.defs_done.set(i);
1849
1850 /* set live if it has a kill point */
1851 if (!definition.isKill())
1852 live.emplace(definition.getTemp());
1853
1854 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1855 register_file.fill(definition);
1856 }
1857
1858 handle_pseudo(ctx, register_file, instr.get());
1859
1860 /* kill definitions and late-kill operands */
1861 for (const Definition& def : instr->definitions) {
1862 if (def.isTemp() && def.isKill())
1863 register_file.clear(def);
1864 }
1865 for (const Operand& op : instr->operands) {
1866 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1867 register_file.clear(op);
1868 }
1869
1870 /* emit parallelcopy */
1871 if (!parallelcopy.empty()) {
1872 aco_ptr<Pseudo_instruction> pc;
1873 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1874 bool temp_in_scc = register_file[scc.reg()];
1875 bool sgpr_operands_alias_defs = false;
1876 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1877 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1878 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1879 if (!sgpr_operands_alias_defs) {
1880 unsigned reg = parallelcopy[i].first.physReg().reg();
1881 unsigned size = parallelcopy[i].first.getTemp().size();
1882 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1883
1884 reg = parallelcopy[i].second.physReg().reg();
1885 size = parallelcopy[i].second.getTemp().size();
1886 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1887 sgpr_operands_alias_defs = true;
1888 }
1889 }
1890
1891 pc->operands[i] = parallelcopy[i].first;
1892 pc->definitions[i] = parallelcopy[i].second;
1893 assert(pc->operands[i].size() == pc->definitions[i].size());
1894
1895 /* it might happen that the operand is already renamed. we have to restore the original name. */
1896 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1897 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1898 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1899 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1900
1901 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
1902 if (phi != ctx.phi_map.end())
1903 phi->second.uses.emplace(pc.get());
1904 }
1905
1906 if (temp_in_scc && sgpr_operands_alias_defs) {
1907 /* disable definitions and re-enable operands */
1908 for (const Definition& def : instr->definitions) {
1909 if (def.isTemp() && !def.isKill())
1910 register_file.clear(def);
1911 }
1912 for (const Operand& op : instr->operands) {
1913 if (op.isTemp() && op.isFirstKill())
1914 register_file.block(op.physReg(), op.bytes());
1915 }
1916
1917 handle_pseudo(ctx, register_file, pc.get());
1918
1919 /* re-enable live vars */
1920 for (const Operand& op : instr->operands) {
1921 if (op.isTemp() && op.isFirstKill())
1922 register_file.clear(op);
1923 }
1924 for (const Definition& def : instr->definitions) {
1925 if (def.isTemp() && !def.isKill())
1926 register_file.fill(def);
1927 }
1928 } else {
1929 pc->tmp_in_scc = false;
1930 }
1931
1932 instructions.emplace_back(std::move(pc));
1933 }
1934
1935 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1936 bool instr_needs_vop3 = !instr->isVOP3() &&
1937 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1938 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1939 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1940 instr->opcode == aco_opcode::v_addc_co_u32 ||
1941 instr->opcode == aco_opcode::v_sub_co_u32 ||
1942 instr->opcode == aco_opcode::v_subb_co_u32 ||
1943 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1944 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1945 !(instr->definitions[1].physReg() == vcc)) ||
1946 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1947 instr->opcode == aco_opcode::v_subb_co_u32 ||
1948 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1949 !(instr->operands[2].physReg() == vcc)));
1950 if (instr_needs_vop3) {
1951
1952 /* if the first operand is a literal, we have to move it to a reg */
1953 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
1954 bool can_sgpr = true;
1955 /* check, if we have to move to vgpr */
1956 for (const Operand& op : instr->operands) {
1957 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1958 can_sgpr = false;
1959 break;
1960 }
1961 }
1962 /* disable definitions and re-enable operands */
1963 for (const Definition& def : instr->definitions)
1964 register_file.clear(def);
1965 for (const Operand& op : instr->operands) {
1966 if (op.isTemp() && op.isFirstKill())
1967 register_file.block(op.physReg(), op.bytes());
1968 }
1969 RegClass rc = can_sgpr ? s1 : v1;
1970 PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr);
1971 Temp tmp = {program->allocateId(), rc};
1972 ctx.assignments.emplace_back(reg, rc);
1973
1974 aco_ptr<Instruction> mov;
1975 if (can_sgpr)
1976 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1977 else
1978 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1979 mov->operands[0] = instr->operands[0];
1980 mov->definitions[0] = Definition(tmp);
1981 mov->definitions[0].setFixed(reg);
1982
1983 instr->operands[0] = Operand(tmp);
1984 instr->operands[0].setFixed(reg);
1985 instructions.emplace_back(std::move(mov));
1986 /* re-enable live vars */
1987 for (const Operand& op : instr->operands) {
1988 if (op.isTemp() && op.isFirstKill())
1989 register_file.clear(op);
1990 }
1991 for (const Definition& def : instr->definitions) {
1992 if (def.isTemp() && !def.isKill())
1993 register_file.fill(def);
1994 }
1995 }
1996
1997 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1998 aco_ptr<Instruction> tmp = std::move(instr);
1999 Format format = asVOP3(tmp->format);
2000 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
2001 for (unsigned i = 0; i < instr->operands.size(); i++) {
2002 Operand& operand = tmp->operands[i];
2003 instr->operands[i] = operand;
2004 /* keep phi_map up to date */
2005 if (operand.isTemp()) {
2006 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.tempId());
2007 if (phi != ctx.phi_map.end()) {
2008 phi->second.uses.erase(tmp.get());
2009 phi->second.uses.emplace(instr.get());
2010 }
2011 }
2012 }
2013 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2014 }
2015 instructions.emplace_back(std::move(*it));
2016
2017 } /* end for Instr */
2018
2019 block.instructions = std::move(instructions);
2020
2021 ctx.filled[block.index] = true;
2022 for (unsigned succ_idx : block.linear_succs) {
2023 Block& succ = program->blocks[succ_idx];
2024 /* seal block if all predecessors are filled */
2025 bool all_filled = true;
2026 for (unsigned pred_idx : succ.linear_preds) {
2027 if (!ctx.filled[pred_idx]) {
2028 all_filled = false;
2029 break;
2030 }
2031 }
2032 if (all_filled) {
2033 ctx.sealed[succ_idx] = true;
2034
2035 /* finish incomplete phis and check if they became trivial */
2036 for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2037 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2038 for (unsigned i = 0; i < phi->operands.size(); i++) {
2039 phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2040 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2041 }
2042 try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2043 }
2044 /* complete the original phi nodes, but no need to check triviality */
2045 for (aco_ptr<Instruction>& instr : succ.instructions) {
2046 if (!is_phi(instr))
2047 break;
2048 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2049
2050 for (unsigned i = 0; i < instr->operands.size(); i++) {
2051 auto& operand = instr->operands[i];
2052 if (!operand.isTemp())
2053 continue;
2054 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2055 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2056 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2057 if (phi != ctx.phi_map.end())
2058 phi->second.uses.emplace(instr.get());
2059 }
2060 }
2061 }
2062 }
2063 } /* end for BB */
2064
2065 /* remove trivial phis */
2066 for (Block& block : program->blocks) {
2067 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2068 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2069 auto middle = std::remove_if(block.instructions.begin(), end,
2070 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2071 block.instructions.erase(middle, end);
2072 }
2073
2074 /* find scc spill registers which may be needed for parallelcopies created by phis */
2075 for (Block& block : program->blocks) {
2076 if (block.linear_preds.size() <= 1)
2077 continue;
2078
2079 std::bitset<128> regs = sgpr_live_in[block.index];
2080 if (!regs[127])
2081 continue;
2082
2083 /* choose a register */
2084 int16_t reg = 0;
2085 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2086 ;
2087 assert(reg < ctx.program->max_reg_demand.sgpr);
2088 adjust_max_used_regs(ctx, s1, reg);
2089
2090 /* update predecessors */
2091 for (unsigned& pred_index : block.linear_preds) {
2092 Block& pred = program->blocks[pred_index];
2093 pred.scc_live_out = true;
2094 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2095 }
2096 }
2097
2098 /* num_gpr = rnd_up(max_used_gpr + 1) */
2099 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2100 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2101 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2102 else
2103 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2104 }
2105
2106 }