f00001285a06cb8e0d8d3e36284e071a6e6c160f
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37
38 namespace aco {
39 namespace {
40
41 struct assignment {
42 PhysReg reg;
43 RegClass rc;
44 uint8_t assigned = 0;
45 assignment() = default;
46 assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
47 };
48
/* Bookkeeping for one phi instruction: the phi itself, the index of the block
 * containing it, and the set of instructions that use its result (so uses can
 * be renamed if the phi's temporary changes). */
struct phi_info {
   Instruction* phi;
   unsigned block_idx;
   std::set<Instruction*> uses;
};
54
/* Global state of the register allocator for one program. */
struct ra_ctx {
   std::bitset<512> war_hint;                                 /* registers to avoid (write-after-read hints) */
   Program* program;
   std::vector<assignment> assignments;                       /* indexed by temp id */
   std::vector<std::unordered_map<unsigned, Temp>> renames;   /* per-block: original temp id -> renamed temp */
   std::vector<std::vector<Instruction*>> incomplete_phis;    /* per-block phis awaiting operands (SSA repair) */
   std::vector<bool> filled;                                  /* per-block: instructions processed */
   std::vector<bool> sealed;                                  /* per-block: all predecessors known/processed */
   std::unordered_map<unsigned, Temp> orig_names;             /* renamed temp id -> original temp (for debugging/lookup) */
   std::unordered_map<unsigned, phi_info> phi_map;            /* temp id -> phi bookkeeping */
   std::unordered_map<unsigned, unsigned> affinities;         /* temp id -> temp id it should share a register with */
   std::unordered_map<unsigned, Instruction*> vectors;        /* temp id -> p_create_vector using it (placement hint) */
   std::unordered_map<unsigned, Instruction*> split_vectors;  /* temp id -> split instruction consuming it */
   aco_ptr<Instruction> pseudo_dummy;                         /* placeholder instr for queries without a real instruction */
   unsigned max_used_sgpr = 0;
   unsigned max_used_vgpr = 0;
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_ctx(Program* program) : program(program),
                              assignments(program->peekAllocationId()),
                              renames(program->blocks.size()),
                              incomplete_phis(program->blocks.size()),
                              filled(program->blocks.size()),
                              sealed(program->blocks.size())
   {
      pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
   }
};
83
84 bool instr_can_access_subdword(ra_ctx& ctx, aco_ptr<Instruction>& instr)
85 {
86 if (ctx.program->chip_class < GFX8)
87 return false;
88 return instr->isSDWA() || instr->format == Format::PSEUDO;
89 }
90
91 struct DefInfo {
92 uint16_t lb;
93 uint16_t ub;
94 uint8_t size;
95 uint8_t stride;
96 RegClass rc;
97
98 DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
99 size = rc.size();
100 stride = 1;
101
102 if (rc.type() == RegType::vgpr) {
103 lb = 256;
104 ub = 256 + ctx.program->max_reg_demand.vgpr;
105 } else {
106 lb = 0;
107 ub = ctx.program->max_reg_demand.sgpr;
108 if (size == 2)
109 stride = 2;
110 else if (size >= 4)
111 stride = 4;
112 }
113
114 if (rc.is_subdword()) {
115 /* stride in bytes */
116 if(!instr_can_access_subdword(ctx, instr))
117 stride = 4;
118 else if (rc.bytes() % 4 == 0)
119 stride = 4;
120 else if (rc.bytes() % 2 == 0)
121 stride = 2;
122 }
123 }
124 };
125
/* Models the physical register file during allocation.
 *
 * regs[i] holds the id of the temporary occupying register i, with two
 * sentinel values: 0xFFFFFFFF marks a blocked register and 0xF0000000 marks a
 * register whose bytes are tracked individually in subdword_regs[i] (one
 * entry per byte, same sentinel convention). Registers 256..511 are VGPRs. */
class RegisterFile {
public:
   RegisterFile() {regs.fill(0);}

   std::array<uint32_t, 512> regs;
   /* per-register byte occupancy; an entry exists only while regs[reg] == 0xF0000000 */
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator [] (unsigned index) const {
      return regs[index];
   }

   uint32_t& operator [] (unsigned index) {
      return regs[index];
   }

   /* number of completely free registers in [start, start + size) */
   unsigned count_zero(PhysReg start, unsigned size) {
      unsigned res = 0;
      for (unsigned i = 0; i < size; i++)
         res += !regs[start + i];
      return res;
   }

   /* returns true if any byte in [start, start + num_bytes) is occupied or blocked */
   bool test(PhysReg start, unsigned num_bytes) {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* 0x0FFFFFFF masks out the high nibble so the subdword marker itself
          * doesn't count as occupied; blocked (0xFFFFFFFF) still matches */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* mark a whole register class worth of space as blocked (0xFFFFFFFF) */
   void block(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   /* returns true if the register (or any of its bytes) is blocked */
   bool is_blocked(PhysReg start) {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   void clear(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0)
;
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op) {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) {
      clear(op.physReg(), op.regClass());
   }

   void fill(Definition def) {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) {
      clear(def.physReg(), def.regClass());
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val) {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   /* write 'val' into num_bytes byte slots starting at 'start', switching the
    * touched registers to subdword tracking; registers that become fully
    * empty revert to plain free (entry erased, regs[i] = 0) */
   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
231
232
/* helper function for debugging: dumps register-file occupancy and the
 * assignments within it to stdout; compiled out by default */
#if 0
void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf("       ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage: '~' = blocked, '#'/'@' alternate per variable, '.' = free */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
#endif
299
300
301 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
302 {
303 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
304 unsigned size = rc.size();
305 if (rc.type() == RegType::vgpr) {
306 assert(reg >= 256);
307 unsigned hi = reg - 256 + size - 1;
308 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
309 } else if (reg + rc.size() <= max_addressible_sgpr) {
310 unsigned hi = reg + size - 1;
311 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
312 }
313 }
314
315
/* Assigns fresh temp ids to the definitions of the given parallelcopies and
 * rewrites operands of 'instr' (and earlier copies) that referred to the
 * copied temporaries, so everything points at the copy's new location. */
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    aco_ptr<Instruction>& instr)
{
   /* allocate id's and rename operands: this is done transparently here */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;

      /* check if we we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (copy.first.getTemp() == other.second.getTemp()) {
            /* chain the copies: read from the original location instead */
            copy.first.setTemp(other.first.getTemp());
            copy.first.setFixed(other.first.physReg());
         }
      }
      // FIXME: if a definition got moved, change the target location and remove the parallelcopy
      copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      reg_file.fill(copy.second);

      /* check if we moved an operand */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* for p_create_vector, a live (non-killed) operand may keep its old
             * name as long as no copy destination overlaps its registers */
            bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg() ?
                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming)
               continue;
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());
         }
      }
   }
}
361
/* Tries to find free space for a definition without moving anything.
 * For sub-dword classes it first searches existing partially-used registers;
 * failing that, it falls back to allocating whole registers. For stride-1
 * requests a best-fit search over gaps is used (preferring well-aligned
 * strides first). Returns {reg, true} on success, {_, false} otherwise. */
std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
                                        RegisterFile& reg_file,
                                        DefInfo info)
{
   uint32_t lb = info.lb;
   uint32_t ub = info.ub;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   if (rc.is_subdword()) {
      /* try to squeeze into a register that is already byte-tracked */
      for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
         assert(reg_file[entry.first] == 0xF0000000);
         if (lb > entry.first || entry.first >= ub)
            continue;

         for (unsigned i = 0; i < 4; i+= stride) {
            if (entry.second[i] != 0)
               continue;

            bool reg_found = true;
            for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
               reg_found &= entry.second[i + j] == 0;

            /* check neighboring reg if needed */
            reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return {res, true};
            }
         }
      }

      /* fall back to a fresh full register for the sub-dword temp */
      stride = 1; /* stride in full registers */
      rc = info.rc = RegClass(RegType::vgpr, size);
   }

   if (stride == 1) {

      /* first try larger power-of-two strides: keeps space aligned for others */
      for (unsigned stride = 8; stride > 1; stride /= 2) {
         if (size % stride)
            continue;
         info.stride = stride;
         std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
         if (res.second)
            return res;
      }

      /* best fit algorithm: find the smallest gap to fit in the variable */
      unsigned best_pos = 0xFFFF;
      unsigned gap_size = 0xFFFF;
      unsigned last_pos = 0xFFFF;  /* start of the current run of free regs, or 0xFFFF */

      for (unsigned current_reg = lb; current_reg < ub; current_reg++) {

         if (reg_file[current_reg] == 0 && !ctx.war_hint[current_reg]) {
            if (last_pos == 0xFFFF)
               last_pos = current_reg;

            /* stop searching after max_used_gpr */
            if (current_reg == ctx.max_used_sgpr + 1 || current_reg == 256 + ctx.max_used_vgpr + 1)
               break;
            else
               continue;
         }

         if (last_pos == 0xFFFF)
            continue;

         /* early return on exact matches */
         if (last_pos + size == current_reg) {
            adjust_max_used_regs(ctx, rc, last_pos);
            return {PhysReg{last_pos}, true};
         }

         /* check if it fits and the gap size is smaller */
         if (last_pos + size < current_reg && current_reg - last_pos < gap_size) {
            best_pos = last_pos;
            gap_size = current_reg - last_pos;
         }
         last_pos = 0xFFFF;
      }

      /* final check: the last run may extend to the end of the range */
      if (last_pos + size <= ub && ub - last_pos < gap_size) {
         best_pos = last_pos;
         gap_size = ub - last_pos;
      }

      if (best_pos == 0xFFFF)
         return {{}, false};

      /* find best position within gap by leaving a good stride for other variables*/
      unsigned buffer = gap_size - size;
      if (buffer > 1) {
         if (((best_pos + size) % 8 != 0 && (best_pos + buffer) % 8 == 0) ||
             ((best_pos + size) % 4 != 0 && (best_pos + buffer) % 4 == 0) ||
             ((best_pos + size) % 2 != 0 && (best_pos + buffer) % 2 == 0))
            best_pos = best_pos + buffer;
      }

      adjust_max_used_regs(ctx, rc, best_pos);
      return {PhysReg{best_pos}, true};
   }

   /* strided search: scan windows of 'size' registers at multiples of stride */
   bool found = false;
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   while (!found && reg_lo + size <= ub) {
      if (reg_file[reg_lo] != 0) {
         reg_lo += stride;
         continue;
      }
      reg_hi = reg_lo + size - 1;
      found = true;
      for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
         if (reg_file[reg] != 0 || ctx.war_hint[reg])
            found = false;
      }
      if (found) {
         adjust_max_used_regs(ctx, rc, reg_lo);
         return {PhysReg{reg_lo}, true};
      }

      reg_lo += stride;
   }

   return {{}, false};
}
493
494 /* collect variables from a register area and clear reg_file */
495 std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
496 PhysReg reg, unsigned size)
497 {
498 std::set<std::pair<unsigned, unsigned>> vars;
499 for (unsigned j = reg; j < reg + size; j++) {
500 if (reg_file.is_blocked(PhysReg{j}))
501 continue;
502 if (reg_file[j] == 0xF0000000) {
503 for (unsigned k = 0; k < 4; k++) {
504 unsigned id = reg_file.subdword_regs[j][k];
505 if (id) {
506 assignment& var = ctx.assignments[id];
507 vars.emplace(var.rc.bytes(), id);
508 reg_file.clear(var.reg, var.rc);
509 if (!reg_file[j])
510 break;
511 }
512 }
513 } else if (reg_file[j] != 0) {
514 unsigned id = reg_file[j];
515 assignment& var = ctx.assignments[id];
516 vars.emplace(var.rc.bytes(), id);
517 reg_file.clear(var.reg, var.rc);
518 }
519 }
520 return vars;
521 }
522
/* Finds destination registers for all evicted variables in 'vars' and emits
 * the corresponding parallelcopies. Variables are processed largest-first.
 * For each variable it first tries free space (re-using the definition area
 * [def_reg_lo, def_reg_hi] for dead operands), and otherwise evicts further
 * variables recursively. Returns false if no valid arrangement was found. */
bool get_regs_for_copies(ra_ctx& ctx,
                         RegisterFile& reg_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopies,
                         const std::set<std::pair<unsigned, unsigned>> &vars,
                         uint32_t lb, uint32_t ub,
                         aco_ptr<Instruction>& instr,
                         uint32_t def_reg_lo,
                         uint32_t def_reg_hi)
{

   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
      unsigned id = it->second;
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
            is_dead_operand = true;
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* a create_vector operand must land at its exact slot within the
             * definition, found by summing the preceding operands' bytes */
            PhysReg reg(def_reg_lo);
            for (unsigned i = 0; i < instr->operands.size(); i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  assert(!reg_file.test(reg, var.rc.bytes()));
                  res = {reg, reg.byte() == 0 || instr_can_access_subdword(ctx, instr)};
                  break;
               }
               reg.reg_b += instr->operands[i].bytes();
            }
         } else {
            info.lb = def_reg_lo;
            info.ub = def_reg_hi + 1;
            res = get_reg_simple(ctx, reg_file, info);
         }
      } else {
         /* live variable: search below, then above, the definition area */
         info.lb = lb;
         info.ub = def_reg_lo;
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second) {
            info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
            info.ub = ub;
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free space: evict other variables via a sliding-window search */
      unsigned best_pos = lb;
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned reg_lo = lb;
      unsigned reg_hi = lb + size - 1;
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
         if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
                                  (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* only evict variables strictly smaller than the current one
             * (unless they die here anyway) so the recursion terminates */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_lo;
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      reg_lo = best_pos;
      reg_hi = best_pos + size - 1;

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);

      /* mark the area as blocked */
      reg_file.block(PhysReg{reg_lo}, var.rc);

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
         return false;

      adjust_max_used_regs(ctx, var.rc, reg_lo);

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
682
683
/* Finds a register for a definition by splitting live ranges: chooses the
 * window in [info.lb, info.ub) whose occupants cost the fewest moves, evicts
 * them via get_regs_for_copies and appends the needed parallelcopies.
 * Killed operands may be overwritten to make space. On failure the register
 * file is restored and {_, false} is returned. */
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
                                      RegisterFile& reg_file,
                                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                                      DefInfo info,
                                      aco_ptr<Instruction>& instr)
{
   uint32_t lb = info.lb;
   uint32_t ub = info.ub;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      if (instr->operands[j].isTemp() &&
          instr->operands[j].isFirstKillBeforeDef() &&
          instr->operands[j].physReg() >= lb &&
          instr->operands[j].physReg() < ub &&
          !reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes())) {
         assert(instr->operands[j].isFixed());
         reg_file.block(instr->operands[j].physReg(), instr->operands[j].regClass());
         killed_ops += instr->operands[j].getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   unsigned best_pos = lb;
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
      /* first check the edges: this is what we have to fix to allow for num_moves > size */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* dead operands effectively reduce the number of estimated moves */
         if (reg_file.is_blocked(PhysReg{j})) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_pos = reg_lo;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF) {
      /* remove killed operands from reg_file once again */
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
            reg_file.clear(instr->operands[i]);
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   /* snapshot for rollback in case the copies cannot be placed */
   RegisterFile register_file = reg_file;

   /* now, we figured the placement for our definition */
   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);

   if (instr->opcode == aco_opcode::p_create_vector) {
      /* move killed operands which aren't yet at the correct position (GFX9+)
       * or which are in the definition space */
      PhysReg reg = PhysReg{best_pos};
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef() &&
             op.getTemp().type() == rc.type()) {
            if (op.physReg() != reg &&
                (ctx.program->chip_class >= GFX9 ||
                 (op.physReg().advance(op.bytes()) > PhysReg{best_pos} &&
                  op.physReg() < PhysReg{best_pos + size}))) {
               vars.emplace(op.bytes(), op.tempId());
               reg_file.clear(op);
            } else {
               reg_file.fill(op);
            }
         }
         reg.reg_b += op.bytes();
      }
   } else if (!is_phi(instr)) {
      /* re-enable killed operands */
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            reg_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
      /* roll back: restore the file and undo the temporary blocking above */
      reg_file = std::move(register_file);
      /* remove killed operands from reg_file once again */
      if (!is_phi(instr)) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               reg_file.clear(op);
         }
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition& def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   /* we set the definition regs == 0. the actual caller is responsible for correct setting */
   reg_file.clear(PhysReg{best_pos}, rc);

   update_renames(ctx, reg_file, parallelcopies, instr);

   /* remove killed operands from reg_file once again */
   for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
         continue;
      assert(!instr->operands[i].isUndefined());
      if (instr->operands[i].isFirstKillBeforeDef())
         reg_file.clear(instr->operands[i]);
   }
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition def = instr->definitions[i];
      if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
         reg_file.fill(def);
   }

   adjust_max_used_regs(ctx, rc, best_pos);
   return {PhysReg{best_pos}, true};
}
880
881 bool get_reg_specified(ra_ctx& ctx,
882 RegisterFile& reg_file,
883 RegClass rc,
884 std::vector<std::pair<Operand, Definition>>& parallelcopies,
885 aco_ptr<Instruction>& instr,
886 PhysReg reg)
887 {
888 if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(ctx, instr))
889 return false;
890 if (!rc.is_subdword() && reg.byte())
891 return false;
892
893 uint32_t size = rc.size();
894 uint32_t stride = 1;
895 uint32_t lb, ub;
896
897 if (rc.type() == RegType::vgpr) {
898 lb = 256;
899 ub = 256 + ctx.program->max_reg_demand.vgpr;
900 } else {
901 if (size == 2)
902 stride = 2;
903 else if (size >= 4)
904 stride = 4;
905 if (reg % stride != 0)
906 return false;
907 lb = 0;
908 ub = ctx.program->max_reg_demand.sgpr;
909 }
910
911 uint32_t reg_lo = reg.reg();
912 uint32_t reg_hi = reg + (size - 1);
913
914 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
915 return false;
916
917 if (reg_file.test(reg, rc.bytes()))
918 return false;
919
920 adjust_max_used_regs(ctx, rc, reg_lo);
921 return true;
922 }
923
/* Top-level register selection for one temporary. Placement hints are tried
 * in order: split-vector affinities, direct affinities, and vector-operand
 * positions; then a free-space search; then live-range splitting. If all of
 * that fails, the register demand is raised and the search restarts; running
 * out of registers entirely aborts compilation. */
PhysReg get_reg(ra_ctx& ctx,
                RegisterFile& reg_file,
                Temp temp,
                std::vector<std::pair<Operand, Definition>>& parallelcopies,
                aco_ptr<Instruction>& instr)
{
   /* hint 1: if this temp feeds a split, try to line up with the parts'
    * affinities so the split becomes a no-op */
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         auto affinity_it = ctx.affinities.find(def.tempId());
         if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {
            PhysReg reg = ctx.assignments[affinity_it->second].reg;
            reg.reg_b -= offset;
            if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
               return reg;
         }
         offset += def.bytes();
      }
   }

   /* hint 2: direct affinity with an already-assigned temp */
   if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
       ctx.assignments[ctx.affinities[temp.id()]].assigned) {
      PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
      if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
         return reg;
   }

   /* hint 3: if this temp is an operand of a p_create_vector, try to place it
    * at its final slot relative to the other (already placed) operands */
   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      Instruction* vec = ctx.vectors[temp.id()];
      unsigned byte_offset = 0;
      for (const Operand& op : vec->operands) {
         if (op.isTemp() && op.tempId() == temp.id())
            break;
         else
            byte_offset += op.bytes();
      }
      unsigned k = 0;
      for (const Operand& op : vec->operands) {
         if (op.isTemp() &&
             op.tempId() != temp.id() &&
             op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (byte_offset - k);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
               return reg;
         }
         k += op.bytes();
      }

      /* otherwise reserve space for the whole future vector and take our slot */
      DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      PhysReg reg = res.first;
      if (res.second) {
         reg.reg_b += byte_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
            return reg;
      }
   }

   DefInfo info(ctx, instr, temp.regClass());

   /* try to find space without live-range splits */
   std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);

   if (res.second)
      return res.first;

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);

   uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
   uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
   if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
   } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
   }

   //FIXME: if nothing helps, shift-rotate the registers to make space

   fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n");
   abort();
}
1021
1022 PhysReg get_reg_create_vector(ra_ctx& ctx,
1023 RegisterFile& reg_file,
1024 Temp temp,
1025 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1026 aco_ptr<Instruction>& instr)
1027 {
1028 RegClass rc = temp.regClass();
1029 /* create_vector instructions have different costs w.r.t. register coalescing */
1030 uint32_t size = rc.size();
1031 uint32_t bytes = rc.bytes();
1032 uint32_t stride = 1;
1033 uint32_t lb, ub;
1034 if (rc.type() == RegType::vgpr) {
1035 lb = 256;
1036 ub = 256 + ctx.program->max_reg_demand.vgpr;
1037 } else {
1038 lb = 0;
1039 ub = ctx.program->max_reg_demand.sgpr;
1040 if (size == 2)
1041 stride = 2;
1042 else if (size >= 4)
1043 stride = 4;
1044 }
1045
1046 //TODO: improve p_create_vector for sub-dword vectors
1047
1048 unsigned best_pos = -1;
1049 unsigned num_moves = 0xFF;
1050 bool best_war_hint = true;
1051
1052 /* test for each operand which definition placement causes the least shuffle instructions */
1053 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1054 // TODO: think about, if we can alias live operands on the same register
1055 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
1056 continue;
1057
1058 if (offset > instr->operands[i].physReg().reg_b)
1059 continue;
1060
1061 unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
1062 if (reg_lo % 4)
1063 continue;
1064 reg_lo /= 4;
1065 unsigned reg_hi = reg_lo + size - 1;
1066 unsigned k = 0;
1067
1068 /* no need to check multiple times */
1069 if (reg_lo == best_pos)
1070 continue;
1071
1072 /* check borders */
1073 // TODO: this can be improved */
1074 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
1075 continue;
1076 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
1077 continue;
1078 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
1079 continue;
1080
1081 /* count variables to be moved and check war_hint */
1082 bool war_hint = false;
1083 bool linear_vgpr = false;
1084 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
1085 if (reg_file[j] != 0) {
1086 if (reg_file[j] == 0xF0000000) {
1087 PhysReg reg;
1088 reg.reg_b = j * 4;
1089 unsigned bytes_left = bytes - (j - reg_lo) * 4;
1090 for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
1091 k += reg_file.test(reg, 1);
1092 } else {
1093 k += 4;
1094 /* we cannot split live ranges of linear vgprs */
1095 if (ctx.assignments[reg_file[j]].rc & (1 << 6))
1096 linear_vgpr = true;
1097 }
1098 }
1099 war_hint |= ctx.war_hint[j];
1100 }
1101 if (linear_vgpr || (war_hint && !best_war_hint))
1102 continue;
1103
1104 /* count operands in wrong positions */
1105 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
1106 if (j == i ||
1107 !instr->operands[j].isTemp() ||
1108 instr->operands[j].getTemp().type() != rc.type())
1109 continue;
1110 if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
1111 k += instr->operands[j].bytes();
1112 }
1113 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
1114 if (k > num_moves || (!aligned && k == num_moves))
1115 continue;
1116
1117 best_pos = reg_lo;
1118 num_moves = k;
1119 best_war_hint = war_hint;
1120 }
1121
1122 if (num_moves >= bytes)
1123 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1124
1125 /* re-enable killed operands which are in the wrong position */
1126 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1127 if (instr->operands[i].isTemp() &&
1128 instr->operands[i].isFirstKillBeforeDef() &&
1129 instr->operands[i].physReg().reg_b != best_pos * 4 + offset)
1130 reg_file.fill(instr->operands[i]);
1131 }
1132
1133 /* collect variables to be moved */
1134 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1135
1136 /* GFX9+: move killed operands which aren't yet at the correct position
1137 * Moving all killed operands generally leads to more register swaps.
1138 * This is only done on GFX9+ because of the cheap v_swap instruction.
1139 */
1140 if (ctx.program->chip_class >= GFX9) {
1141 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1142 if (instr->operands[i].isTemp() &&
1143 instr->operands[i].isFirstKillBeforeDef() &&
1144 instr->operands[i].getTemp().type() == rc.type() &&
1145 instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
1146 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1147 reg_file.clear(instr->operands[i]);
1148 }
1149 }
1150 }
1151 ASSERTED bool success = false;
1152 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1153 assert(success);
1154
1155 update_renames(ctx, reg_file, parallelcopies, instr);
1156 adjust_max_used_regs(ctx, rc, best_pos);
1157
1158 /* remove killed operands from reg_file once again */
1159 for (unsigned i = 0; i < instr->operands.size(); i++) {
1160 if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
1161 continue;
1162 assert(!instr->operands[i].isUndefined());
1163 if (instr->operands[i].isFirstKillBeforeDef())
1164 reg_file.clear(instr->operands[i]);
1165 }
1166
1167 return PhysReg{best_pos};
1168 }
1169
1170 void handle_pseudo(ra_ctx& ctx,
1171 const RegisterFile& reg_file,
1172 Instruction* instr)
1173 {
1174 if (instr->format != Format::PSEUDO)
1175 return;
1176
1177 /* all instructions which use handle_operands() need this information */
1178 switch (instr->opcode) {
1179 case aco_opcode::p_extract_vector:
1180 case aco_opcode::p_create_vector:
1181 case aco_opcode::p_split_vector:
1182 case aco_opcode::p_parallelcopy:
1183 case aco_opcode::p_wqm:
1184 break;
1185 default:
1186 return;
1187 }
1188
1189 /* if all definitions are vgpr, no need to care for SCC */
1190 bool writes_sgpr = false;
1191 for (Definition& def : instr->definitions) {
1192 if (def.getTemp().type() == RegType::sgpr) {
1193 writes_sgpr = true;
1194 break;
1195 }
1196 }
1197 /* if all operands are constant, no need to care either */
1198 bool reads_sgpr = false;
1199 for (Operand& op : instr->operands) {
1200 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1201 reads_sgpr = true;
1202 break;
1203 }
1204 }
1205 if (!(writes_sgpr && reads_sgpr))
1206 return;
1207
1208 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1209 if (reg_file[scc.reg()]) {
1210 pi->tmp_in_scc = true;
1211
1212 int reg = ctx.max_used_sgpr;
1213 for (; reg >= 0 && reg_file[reg]; reg--)
1214 ;
1215 if (reg < 0) {
1216 reg = ctx.max_used_sgpr + 1;
1217 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1218 ;
1219 assert(reg < ctx.program->max_reg_demand.sgpr);
1220 }
1221
1222 adjust_max_used_regs(ctx, s1, reg);
1223 pi->scratch_sgpr = PhysReg{(unsigned)reg};
1224 } else {
1225 pi->tmp_in_scc = false;
1226 }
1227 }
1228
1229 bool operand_can_use_reg(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1230 {
1231 if (instr->operands[idx].isFixed())
1232 return instr->operands[idx].physReg() == reg;
1233
1234 if (reg.byte() && !instr_can_access_subdword(ctx, instr))
1235 return false;
1236
1237 switch (instr->format) {
1238 case Format::SMEM:
1239 return reg != scc &&
1240 reg != exec &&
1241 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1242 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1243 default:
1244 // TODO: there are more instructions with restrictions on registers
1245 return true;
1246 }
1247 }
1248
1249 void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
1250 std::vector<std::pair<Operand, Definition>>& parallelcopy,
1251 aco_ptr<Instruction>& instr, Operand& operand)
1252 {
1253 /* check if the operand is fixed */
1254 PhysReg dst;
1255 if (operand.isFixed()) {
1256 assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);
1257
1258 /* check if target reg is blocked, and move away the blocking var */
1259 if (register_file[operand.physReg().reg()]) {
1260 assert(register_file[operand.physReg()] != 0xF0000000);
1261 uint32_t blocking_id = register_file[operand.physReg().reg()];
1262 RegClass rc = ctx.assignments[blocking_id].rc;
1263 Operand pc_op = Operand(Temp{blocking_id, rc});
1264 pc_op.setFixed(operand.physReg());
1265
1266 /* find free reg */
1267 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
1268 Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
1269 register_file.clear(pc_op);
1270 parallelcopy.emplace_back(pc_op, pc_def);
1271 }
1272 dst = operand.physReg();
1273
1274 } else {
1275 dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr);
1276 }
1277
1278 Operand pc_op = operand;
1279 pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
1280 Definition pc_def = Definition(dst, pc_op.regClass());
1281 register_file.clear(pc_op);
1282 parallelcopy.emplace_back(pc_op, pc_def);
1283 update_renames(ctx, register_file, parallelcopy, instr);
1284 }
1285
1286 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1287 {
1288 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1289 if (it == ctx.renames[block_idx].end())
1290 return val;
1291 else
1292 return it->second;
1293 }
1294
1295 Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
1296 {
1297 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1298 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
1299 return val;
1300
1301 assert(preds.size() > 0);
1302
1303 Temp new_val;
1304 if (!ctx.sealed[block->index]) {
1305 /* consider rename from already processed predecessor */
1306 Temp tmp = read_variable(ctx, val, preds[0]);
1307
1308 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1309 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1310 ctx.assignments.emplace_back();
1311 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1312 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1313 phi->definitions[0] = Definition(new_val);
1314 for (unsigned i = 0; i < preds.size(); i++)
1315 phi->operands[i] = Operand(val);
1316 if (tmp.regClass() == new_val.regClass())
1317 ctx.affinities[new_val.id()] = tmp.id();
1318
1319 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1320 ctx.incomplete_phis[block->index].emplace_back(phi.get());
1321 block->instructions.insert(block->instructions.begin(), std::move(phi));
1322
1323 } else if (preds.size() == 1) {
1324 /* if the block has only one predecessor, just look there for the name */
1325 new_val = read_variable(ctx, val, preds[0]);
1326 } else {
1327 /* there are multiple predecessors and the block is sealed */
1328 Temp ops[preds.size()];
1329
1330 /* get the rename from each predecessor and check if they are the same */
1331 bool needs_phi = false;
1332 for (unsigned i = 0; i < preds.size(); i++) {
1333 ops[i] = read_variable(ctx, val, preds[i]);
1334 if (i == 0)
1335 new_val = ops[i];
1336 else
1337 needs_phi |= !(new_val == ops[i]);
1338 }
1339
1340 if (needs_phi) {
1341 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1342 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1343 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1344 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1345 phi->definitions[0] = Definition(new_val);
1346 for (unsigned i = 0; i < preds.size(); i++) {
1347 phi->operands[i] = Operand(ops[i]);
1348 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
1349 if (ops[i].regClass() == new_val.regClass())
1350 ctx.affinities[new_val.id()] = ops[i].id();
1351 }
1352 ctx.assignments.emplace_back();
1353 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1354 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1355 block->instructions.insert(block->instructions.begin(), std::move(phi));
1356 }
1357 }
1358
1359 if (new_val != val) {
1360 ctx.renames[block->index][val.id()] = new_val;
1361 ctx.orig_names[new_val.id()] = val;
1362 }
1363 return new_val;
1364 }
1365
/* If the phi defining `temp` is trivial (every operand names the same value
 * or the phi's own definition), reroutes all of its uses to that value,
 * removes the phi and recursively retries phis that used it. Only phis in
 * already-sealed blocks are considered. */
void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
{
   std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());

   /* nothing to do if `temp` isn't a tracked phi or its block isn't sealed yet */
   if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
      return;

   assert(info->second.block_idx != 0);
   Instruction* phi = info->second.phi;
   Temp same = Temp(); /* the single distinct source value, if any */
   Definition def = phi->definitions[0];

   /* a phi node is trivial if all operands are the same as the definition of the phi */
   for (const Operand& op : phi->operands) {
      const Temp t = op.getTemp();
      if (t == same || t == def.getTemp()) {
         assert(t == same || op.physReg() == def.physReg());
         continue;
      }
      if (same != Temp()) /* a second distinct source: the phi is not trivial */
         return;

      same = t;
   }
   assert(same != Temp() || same == def.getTemp());

   /* reroute all uses to same and remove phi */
   std::vector<Temp> phi_users;
   std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
   for (Instruction* instr : info->second.uses) {
      assert(phi != instr);
      /* recursively try to remove trivial phis */
      if (is_phi(instr)) {
         /* ignore if the phi was already flagged trivial */
         if (instr->definitions.empty())
            continue;

         /* remember the user phi; it is retried only after this phi is erased */
         if (instr->definitions[0].getTemp() != temp)
            phi_users.emplace_back(instr->definitions[0].getTemp());
      }
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.tempId() == def.tempId()) {
            op.setTemp(same);
            /* the surviving value inherits this use */
            if (same_phi_info != ctx.phi_map.end())
               same_phi_info->second.uses.emplace(instr);
         }
      }
   }

   /* redirect block-level renames that still point at the removed phi's def */
   auto it = ctx.orig_names.find(same.id());
   unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
   for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
      auto it = ctx.renames[i].find(orig_var);
      if (it != ctx.renames[i].end() && it->second == def.getTemp())
         ctx.renames[i][orig_var] = same;
   }

   phi->definitions.clear(); /* this indicates that the phi can be removed */
   ctx.phi_map.erase(info); /* `info` is invalid after this point */
   for (Temp t : phi_users)
      try_remove_trivial_phi(ctx, t);

   return;
}
1430
1431 } /* end namespace */
1432
1433
1434 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
1435 {
1436 ra_ctx ctx(program);
1437 std::vector<std::vector<Temp>> phi_ressources;
1438 std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1439
1440 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1441 Block& block = *it;
1442
1443 /* first, compute the death points of all live vars within the block */
1444 TempSet& live = live_out_per_block[block.index];
1445
1446 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1447 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1448 aco_ptr<Instruction>& instr = *rit;
1449 if (is_phi(instr)) {
1450 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
1451 live.erase(instr->definitions[0].getTemp());
1452 continue;
1453 }
1454 /* collect information about affinity-related temporaries */
1455 std::vector<Temp> affinity_related;
1456 /* affinity_related[0] is the last seen affinity-related temp */
1457 affinity_related.emplace_back(instr->definitions[0].getTemp());
1458 affinity_related.emplace_back(instr->definitions[0].getTemp());
1459 for (const Operand& op : instr->operands) {
1460 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1461 affinity_related.emplace_back(op.getTemp());
1462 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1463 }
1464 }
1465 phi_ressources.emplace_back(std::move(affinity_related));
1466 } else {
1467 /* add vector affinities */
1468 if (instr->opcode == aco_opcode::p_create_vector) {
1469 for (const Operand& op : instr->operands) {
1470 if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1471 ctx.vectors[op.tempId()] = instr.get();
1472 }
1473 }
1474
1475 if (instr->opcode == aco_opcode::p_split_vector && instr->operands[0].isFirstKillBeforeDef())
1476 ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
1477
1478 /* add operands to live variables */
1479 for (const Operand& op : instr->operands) {
1480 if (op.isTemp())
1481 live.emplace(op.getTemp());
1482 }
1483 }
1484
1485 /* erase definitions from live */
1486 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1487 const Definition& def = instr->definitions[i];
1488 if (!def.isTemp())
1489 continue;
1490 live.erase(def.getTemp());
1491 /* mark last-seen phi operand */
1492 std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1493 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1494 phi_ressources[it->second][0] = def.getTemp();
1495 /* try to coalesce phi affinities with parallelcopies */
1496 Operand op = Operand();
1497 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
1498 op = instr->operands[i];
1499 else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
1500 op = instr->operands[2];
1501
1502 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1503 phi_ressources[it->second].emplace_back(op.getTemp());
1504 temp_to_phi_ressources[op.tempId()] = it->second;
1505 }
1506 }
1507 }
1508 }
1509 }
1510 /* create affinities */
1511 for (std::vector<Temp>& vec : phi_ressources) {
1512 assert(vec.size() > 1);
1513 for (unsigned i = 1; i < vec.size(); i++)
1514 if (vec[i].id() != vec[0].id())
1515 ctx.affinities[vec[i].id()] = vec[0].id();
1516 }
1517
1518 /* state of register file after phis */
1519 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1520
1521 for (Block& block : program->blocks) {
1522 TempSet& live = live_out_per_block[block.index];
1523 /* initialize register file */
1524 assert(block.index != 0 || live.empty());
1525 RegisterFile register_file;
1526 ctx.war_hint.reset();
1527
1528 for (Temp t : live) {
1529 Temp renamed = handle_live_in(ctx, t, &block);
1530 assignment& var = ctx.assignments[renamed.id()];
1531 /* due to live-range splits, the live-in might be a phi, now */
1532 if (var.assigned)
1533 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1534 }
1535
1536 std::vector<aco_ptr<Instruction>> instructions;
1537 std::vector<aco_ptr<Instruction>>::iterator it;
1538
1539 /* this is a slight adjustment from the paper as we already have phi nodes:
1540 * We consider them incomplete phis and only handle the definition. */
1541
1542 /* handle fixed phi definitions */
1543 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1544 aco_ptr<Instruction>& phi = *it;
1545 if (!is_phi(phi))
1546 break;
1547 Definition& definition = phi->definitions[0];
1548 if (!definition.isFixed())
1549 continue;
1550
1551 /* check if a dead exec mask phi is needed */
1552 if (definition.isKill()) {
1553 for (Operand& op : phi->operands) {
1554 assert(op.isTemp());
1555 if (!ctx.assignments[op.tempId()].assigned ||
1556 ctx.assignments[op.tempId()].reg != exec) {
1557 definition.setKill(false);
1558 break;
1559 }
1560 }
1561 }
1562
1563 if (definition.isKill())
1564 continue;
1565
1566 assert(definition.physReg() == exec);
1567 assert(!register_file.test(definition.physReg(), definition.bytes()));
1568 register_file.fill(definition);
1569 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1570 }
1571
1572 /* look up the affinities */
1573 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1574 aco_ptr<Instruction>& phi = *it;
1575 if (!is_phi(phi))
1576 break;
1577 Definition& definition = phi->definitions[0];
1578 if (definition.isKill() || definition.isFixed())
1579 continue;
1580
1581 if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1582 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1583 assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1584 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1585 bool try_use_special_reg = reg == scc || reg == exec;
1586 if (try_use_special_reg) {
1587 for (const Operand& op : phi->operands) {
1588 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1589 ctx.assignments[op.tempId()].reg == reg)) {
1590 try_use_special_reg = false;
1591 break;
1592 }
1593 }
1594 if (!try_use_special_reg)
1595 continue;
1596 }
1597 /* only assign if register is still free */
1598 if (!register_file.test(reg, definition.bytes())) {
1599 definition.setFixed(reg);
1600 register_file.fill(definition);
1601 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1602 }
1603 }
1604 }
1605
1606 /* find registers for phis without affinity or where the register was blocked */
1607 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1608 aco_ptr<Instruction>& phi = *it;
1609 if (!is_phi(phi))
1610 break;
1611
1612 Definition& definition = phi->definitions[0];
1613 if (definition.isKill())
1614 continue;
1615
1616 if (!definition.isFixed()) {
1617 std::vector<std::pair<Operand, Definition>> parallelcopy;
1618 /* try to find a register that is used by at least one operand */
1619 for (const Operand& op : phi->operands) {
1620 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1621 continue;
1622 PhysReg reg = ctx.assignments[op.tempId()].reg;
1623 /* we tried this already on the previous loop */
1624 if (reg == scc || reg == exec)
1625 continue;
1626 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1627 definition.setFixed(reg);
1628 break;
1629 }
1630 }
1631 if (!definition.isFixed())
1632 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
1633
1634 /* process parallelcopy */
1635 for (std::pair<Operand, Definition> pc : parallelcopy) {
1636 /* see if it's a copy from a different phi */
1637 //TODO: prefer moving some previous phis over live-ins
1638 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1639 Instruction *prev_phi = NULL;
1640 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1641 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1642 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1643 prev_phi = phi_it->get();
1644 }
1645 phi_it = it;
1646 while (!prev_phi && is_phi(*++phi_it)) {
1647 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1648 prev_phi = phi_it->get();
1649 }
1650 if (prev_phi) {
1651 /* if so, just update that phi's register */
1652 register_file.clear(prev_phi->definitions[0]);
1653 prev_phi->definitions[0].setFixed(pc.second.physReg());
1654 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1655 register_file.fill(prev_phi->definitions[0]);
1656 continue;
1657 }
1658
1659 /* rename */
1660 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1661 Temp orig = pc.first.getTemp();
1662 if (orig_it != ctx.orig_names.end())
1663 orig = orig_it->second;
1664 else
1665 ctx.orig_names[pc.second.tempId()] = orig;
1666 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1667
1668 /* otherwise, this is a live-in and we need to create a new phi
1669 * to move it in this block's predecessors */
1670 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1671 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1672 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1673 new_phi->definitions[0] = pc.second;
1674 for (unsigned i = 0; i < preds.size(); i++)
1675 new_phi->operands[i] = Operand(pc.first);
1676 instructions.emplace_back(std::move(new_phi));
1677 }
1678
1679 register_file.fill(definition);
1680 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1681 }
1682 live.emplace(definition.getTemp());
1683
1684 /* update phi affinities */
1685 for (const Operand& op : phi->operands) {
1686 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1687 ctx.affinities[op.tempId()] = definition.tempId();
1688 }
1689
1690 instructions.emplace_back(std::move(*it));
1691 }
1692
1693 /* fill in sgpr_live_in */
1694 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1695 sgpr_live_in[block.index][i] = register_file[i];
1696 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1697
1698 /* Handle all other instructions of the block */
1699 for (; it != block.instructions.end(); ++it) {
1700 aco_ptr<Instruction>& instr = *it;
1701
1702 /* parallelcopies from p_phi are inserted here which means
1703 * live ranges of killed operands end here as well */
1704 if (instr->opcode == aco_opcode::p_logical_end) {
1705 /* no need to process this instruction any further */
1706 if (block.logical_succs.size() != 1) {
1707 instructions.emplace_back(std::move(instr));
1708 continue;
1709 }
1710
1711 Block& succ = program->blocks[block.logical_succs[0]];
1712 unsigned idx = 0;
1713 for (; idx < succ.logical_preds.size(); idx++) {
1714 if (succ.logical_preds[idx] == block.index)
1715 break;
1716 }
1717 for (aco_ptr<Instruction>& phi : succ.instructions) {
1718 if (phi->opcode == aco_opcode::p_phi) {
1719 if (phi->operands[idx].isTemp() &&
1720 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1721 phi->operands[idx].isFirstKillBeforeDef()) {
1722 Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
1723 PhysReg reg = ctx.assignments[phi_op.id()].reg;
1724 assert(register_file[reg] == phi_op.id());
1725 register_file[reg] = 0;
1726 }
1727 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1728 break;
1729 }
1730 }
1731 instructions.emplace_back(std::move(instr));
1732 continue;
1733 }
1734
1735 std::vector<std::pair<Operand, Definition>> parallelcopy;
1736
1737 assert(!is_phi(instr));
1738
1739 /* handle operands */
1740 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1741 auto& operand = instr->operands[i];
1742 if (!operand.isTemp())
1743 continue;
1744
1745 /* rename operands */
1746 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
1747 assert(ctx.assignments[operand.tempId()].assigned);
1748
1749 PhysReg reg = ctx.assignments[operand.tempId()].reg;
1750 if (operand_can_use_reg(ctx, instr, i, reg))
1751 operand.setFixed(reg);
1752 else
1753 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
1754
1755 if (instr->format == Format::EXP ||
1756 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
1757 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1758 for (unsigned j = 0; j < operand.size(); j++)
1759 ctx.war_hint.set(operand.physReg().reg() + j);
1760 }
1761
1762 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
1763 if (phi != ctx.phi_map.end())
1764 phi->second.uses.emplace(instr.get());
1765 }
1766
1767 /* remove dead vars from register file */
1768 for (const Operand& op : instr->operands) {
1769 if (op.isTemp() && op.isFirstKillBeforeDef())
1770 register_file.clear(op);
1771 }
1772
1773 /* try to optimize v_mad_f32 -> v_mac_f32 */
1774 if (instr->opcode == aco_opcode::v_mad_f32 &&
1775 instr->operands[2].isTemp() &&
1776 instr->operands[2].isKillBeforeDef() &&
1777 instr->operands[2].getTemp().type() == RegType::vgpr &&
1778 instr->operands[1].isTemp() &&
1779 instr->operands[1].getTemp().type() == RegType::vgpr &&
1780 !instr->usesModifiers()) {
1781 unsigned def_id = instr->definitions[0].tempId();
1782 auto it = ctx.affinities.find(def_id);
1783 if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
1784 instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
1785 register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
1786 instr->format = Format::VOP2;
1787 instr->opcode = aco_opcode::v_mac_f32;
1788 }
1789 }
1790
1791 /* handle definitions which must have the same register as an operand */
1792 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1793 instr->opcode == aco_opcode::v_mac_f32 ||
1794 instr->opcode == aco_opcode::v_writelane_b32 ||
1795 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1796 instr->definitions[0].setFixed(instr->operands[2].physReg());
1797 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1798 instr->opcode == aco_opcode::s_mulk_i32) {
1799 instr->definitions[0].setFixed(instr->operands[0].physReg());
1800 } else if (instr->format == Format::MUBUF &&
1801 instr->definitions.size() == 1 &&
1802 instr->operands.size() == 4) {
1803 instr->definitions[0].setFixed(instr->operands[3].physReg());
1804 } else if (instr->format == Format::MIMG &&
1805 instr->definitions.size() == 1 &&
1806 instr->operands[1].regClass().type() == RegType::vgpr) {
1807 instr->definitions[0].setFixed(instr->operands[1].physReg());
1808 }
1809
1810 ctx.defs_done.reset();
1811
1812 /* handle fixed definitions first */
1813 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1814 auto& definition = instr->definitions[i];
1815 if (!definition.isFixed())
1816 continue;
1817
1818 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1819 /* check if the target register is blocked */
1820 if (register_file[definition.physReg().reg()] != 0) {
1821 /* create parallelcopy pair to move blocking var */
1822 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
1823 Operand pc_op = Operand(tmp);
1824 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
1825 RegClass rc = pc_op.regClass();
1826 tmp = Temp{program->allocateId(), rc};
1827 Definition pc_def = Definition(tmp);
1828
1829 /* re-enable the killed operands, so that we don't move the blocking var there */
1830 for (const Operand& op : instr->operands) {
1831 if (op.isTemp() && op.isFirstKillBeforeDef())
1832 register_file.fill(op);
1833 }
1834
1835 /* find a new register for the blocking variable */
1836 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
1837 /* once again, disable killed operands */
1838 for (const Operand& op : instr->operands) {
1839 if (op.isTemp() && op.isFirstKillBeforeDef())
1840 register_file.clear(op);
1841 }
1842 for (unsigned k = 0; k < i; k++) {
1843 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1844 register_file.fill(instr->definitions[k]);
1845 }
1846 pc_def.setFixed(reg);
1847
1848 /* finish assignment of parallelcopy */
1849 ctx.assignments.emplace_back(reg, pc_def.regClass());
1850 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1851 parallelcopy.emplace_back(pc_op, pc_def);
1852
1853 /* add changes to reg_file */
1854 register_file.clear(pc_op);
1855 register_file.fill(pc_def);
1856 }
1857 ctx.defs_done.set(i);
1858
1859 if (!definition.isTemp())
1860 continue;
1861
1862 /* set live if it has a kill point */
1863 if (!definition.isKill())
1864 live.emplace(definition.getTemp());
1865
1866 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1867 register_file.fill(definition);
1868 }
1869
1870 /* handle all other definitions */
1871 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1872 auto& definition = instr->definitions[i];
1873
1874 if (definition.isFixed() || !definition.isTemp())
1875 continue;
1876
1877 /* find free reg */
1878 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1879 definition.setFixed(definition.physReg());
1880 else if (instr->opcode == aco_opcode::p_split_vector) {
1881 PhysReg reg = instr->operands[0].physReg();
1882 for (unsigned j = 0; j < i; j++)
1883 reg.reg_b += instr->definitions[j].bytes();
1884 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1885 definition.setFixed(reg);
1886 } else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) {
1887 PhysReg reg = instr->operands[i].physReg();
1888 if (instr->operands[i].isTemp() &&
1889 instr->operands[i].getTemp().type() == definition.getTemp().type() &&
1890 !register_file.test(reg, definition.bytes()))
1891 definition.setFixed(reg);
1892 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1893 PhysReg reg;
1894 if (instr->operands[0].isKillBeforeDef() &&
1895 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1896 reg = instr->operands[0].physReg();
1897 reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
1898 assert(!register_file.test(reg, definition.bytes()));
1899 definition.setFixed(reg);
1900 }
1901 } else if (instr->opcode == aco_opcode::p_create_vector) {
1902 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
1903 parallelcopy, instr);
1904 definition.setFixed(reg);
1905 }
1906
1907 if (!definition.isFixed()) {
1908 Temp tmp = definition.getTemp();
1909 if (tmp.regClass().is_subdword() &&
1910 !instr_can_access_subdword(ctx, instr)) {
1911 assert(tmp.bytes() <= 4);
1912 tmp = Temp(definition.tempId(), v1);
1913 }
1914 definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
1915 }
1916
1917 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1918 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1919 ctx.defs_done.set(i);
1920
1921 /* set live if it has a kill point */
1922 if (!definition.isKill())
1923 live.emplace(definition.getTemp());
1924
1925 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1926 register_file.fill(definition);
1927 }
1928
1929 handle_pseudo(ctx, register_file, instr.get());
1930
1931 /* kill definitions and late-kill operands */
1932 for (const Definition& def : instr->definitions) {
1933 if (def.isTemp() && def.isKill())
1934 register_file.clear(def);
1935 }
1936 for (const Operand& op : instr->operands) {
1937 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1938 register_file.clear(op);
1939 }
1940
1941 /* emit parallelcopy */
1942 if (!parallelcopy.empty()) {
1943 aco_ptr<Pseudo_instruction> pc;
1944 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1945 bool temp_in_scc = register_file[scc.reg()];
1946 bool sgpr_operands_alias_defs = false;
1947 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1948 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1949 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1950 if (!sgpr_operands_alias_defs) {
1951 unsigned reg = parallelcopy[i].first.physReg().reg();
1952 unsigned size = parallelcopy[i].first.getTemp().size();
1953 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1954
1955 reg = parallelcopy[i].second.physReg().reg();
1956 size = parallelcopy[i].second.getTemp().size();
1957 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1958 sgpr_operands_alias_defs = true;
1959 }
1960 }
1961
1962 pc->operands[i] = parallelcopy[i].first;
1963 pc->definitions[i] = parallelcopy[i].second;
1964 assert(pc->operands[i].size() == pc->definitions[i].size());
1965
1966 /* it might happen that the operand is already renamed. we have to restore the original name. */
1967 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1968 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1969 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1970 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1971
1972 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
1973 if (phi != ctx.phi_map.end())
1974 phi->second.uses.emplace(pc.get());
1975 }
1976
1977 if (temp_in_scc && sgpr_operands_alias_defs) {
1978 /* disable definitions and re-enable operands */
1979 for (const Definition& def : instr->definitions) {
1980 if (def.isTemp() && !def.isKill())
1981 register_file.clear(def);
1982 }
1983 for (const Operand& op : instr->operands) {
1984 if (op.isTemp() && op.isFirstKill())
1985 register_file.block(op.physReg(), op.regClass());
1986 }
1987
1988 handle_pseudo(ctx, register_file, pc.get());
1989
1990 /* re-enable live vars */
1991 for (const Operand& op : instr->operands) {
1992 if (op.isTemp() && op.isFirstKill())
1993 register_file.clear(op);
1994 }
1995 for (const Definition& def : instr->definitions) {
1996 if (def.isTemp() && !def.isKill())
1997 register_file.fill(def);
1998 }
1999 } else {
2000 pc->tmp_in_scc = false;
2001 }
2002
2003 instructions.emplace_back(std::move(pc));
2004 }
2005
2006 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
2007 bool instr_needs_vop3 = !instr->isVOP3() &&
2008 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
2009 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
2010 ((instr->opcode == aco_opcode::v_add_co_u32 ||
2011 instr->opcode == aco_opcode::v_addc_co_u32 ||
2012 instr->opcode == aco_opcode::v_sub_co_u32 ||
2013 instr->opcode == aco_opcode::v_subb_co_u32 ||
2014 instr->opcode == aco_opcode::v_subrev_co_u32 ||
2015 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2016 !(instr->definitions[1].physReg() == vcc)) ||
2017 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
2018 instr->opcode == aco_opcode::v_subb_co_u32 ||
2019 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2020 !(instr->operands[2].physReg() == vcc)));
2021 if (instr_needs_vop3) {
2022
2023 /* if the first operand is a literal, we have to move it to a reg */
2024 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
2025 bool can_sgpr = true;
2026 /* check, if we have to move to vgpr */
2027 for (const Operand& op : instr->operands) {
2028 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
2029 can_sgpr = false;
2030 break;
2031 }
2032 }
2033 /* disable definitions and re-enable operands */
2034 for (const Definition& def : instr->definitions)
2035 register_file.clear(def);
2036 for (const Operand& op : instr->operands) {
2037 if (op.isTemp() && op.isFirstKill())
2038 register_file.block(op.physReg(), op.regClass());
2039 }
2040 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
2041 ctx.assignments.emplace_back();
2042 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2043
2044 aco_ptr<Instruction> mov;
2045 if (can_sgpr)
2046 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
2047 else
2048 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
2049 mov->operands[0] = instr->operands[0];
2050 mov->definitions[0] = Definition(tmp);
2051 mov->definitions[0].setFixed(reg);
2052
2053 instr->operands[0] = Operand(tmp);
2054 instr->operands[0].setFixed(reg);
2055 instructions.emplace_back(std::move(mov));
2056 /* re-enable live vars */
2057 for (const Operand& op : instr->operands) {
2058 if (op.isTemp() && op.isFirstKill())
2059 register_file.clear(op);
2060 }
2061 for (const Definition& def : instr->definitions) {
2062 if (def.isTemp() && !def.isKill())
2063 register_file.fill(def);
2064 }
2065 }
2066
2067 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
2068 aco_ptr<Instruction> tmp = std::move(instr);
2069 Format format = asVOP3(tmp->format);
2070 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
2071 for (unsigned i = 0; i < instr->operands.size(); i++) {
2072 Operand& operand = tmp->operands[i];
2073 instr->operands[i] = operand;
2074 /* keep phi_map up to date */
2075 if (operand.isTemp()) {
2076 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.tempId());
2077 if (phi != ctx.phi_map.end()) {
2078 phi->second.uses.erase(tmp.get());
2079 phi->second.uses.emplace(instr.get());
2080 }
2081 }
2082 }
2083 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2084 }
2085 instructions.emplace_back(std::move(*it));
2086
2087 } /* end for Instr */
2088
2089 block.instructions = std::move(instructions);
2090
2091 ctx.filled[block.index] = true;
2092 for (unsigned succ_idx : block.linear_succs) {
2093 Block& succ = program->blocks[succ_idx];
2094 /* seal block if all predecessors are filled */
2095 bool all_filled = true;
2096 for (unsigned pred_idx : succ.linear_preds) {
2097 if (!ctx.filled[pred_idx]) {
2098 all_filled = false;
2099 break;
2100 }
2101 }
2102 if (all_filled) {
2103 ctx.sealed[succ_idx] = true;
2104
2105 /* finish incomplete phis and check if they became trivial */
2106 for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2107 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2108 for (unsigned i = 0; i < phi->operands.size(); i++) {
2109 phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2110 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2111 }
2112 try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2113 }
2114 /* complete the original phi nodes, but no need to check triviality */
2115 for (aco_ptr<Instruction>& instr : succ.instructions) {
2116 if (!is_phi(instr))
2117 break;
2118 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2119
2120 for (unsigned i = 0; i < instr->operands.size(); i++) {
2121 auto& operand = instr->operands[i];
2122 if (!operand.isTemp())
2123 continue;
2124 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2125 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2126 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2127 if (phi != ctx.phi_map.end())
2128 phi->second.uses.emplace(instr.get());
2129 }
2130 }
2131 }
2132 }
2133 } /* end for BB */
2134
2135 /* remove trivial phis */
2136 for (Block& block : program->blocks) {
2137 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2138 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2139 auto middle = std::remove_if(block.instructions.begin(), end,
2140 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2141 block.instructions.erase(middle, end);
2142 }
2143
2144 /* find scc spill registers which may be needed for parallelcopies created by phis */
2145 for (Block& block : program->blocks) {
2146 if (block.linear_preds.size() <= 1)
2147 continue;
2148
2149 std::bitset<128> regs = sgpr_live_in[block.index];
2150 if (!regs[127])
2151 continue;
2152
2153 /* choose a register */
2154 int16_t reg = 0;
2155 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2156 ;
2157 assert(reg < ctx.program->max_reg_demand.sgpr);
2158 adjust_max_used_regs(ctx, s1, reg);
2159
2160 /* update predecessors */
2161 for (unsigned& pred_index : block.linear_preds) {
2162 Block& pred = program->blocks[pred_index];
2163 pred.scc_live_out = true;
2164 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2165 }
2166 }
2167
2168 /* num_gpr = rnd_up(max_used_gpr + 1) */
2169 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2170 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2171 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2172 else
2173 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2174 }
2175
2176 }