aco: Use correct reference type in for-range-loop.
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33
34 #include "aco_ir.h"
35 #include "sid.h"
36 #include "util/u_math.h"
37
38 namespace aco {
39 namespace {
40
/* Tracks the register assigned to one temporary. */
struct assignment {
   PhysReg reg;
   RegClass rc;
   uint8_t assigned = 0; /* non-zero once a register has been assigned */
   assignment() = default;
   /* NOTE: the -1 narrows to 0xFF on the uint8_t; any non-zero value means "assigned" */
   assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
};
48
/* Bookkeeping for a phi instruction: the phi itself, the index of the block
 * containing it, and (presumably) the instructions consuming its result —
 * uses are populated elsewhere in this file. */
struct phi_info {
   Instruction* phi;
   unsigned block_idx;
   std::set<Instruction*> uses;
};
54
/* Global state of the register allocator for one program. */
struct ra_ctx {
   std::bitset<512> war_hint;         /* registers to avoid (consulted as a hint in the search loops below) */
   Program* program;
   std::vector<assignment> assignments;                     /* indexed by temp id */
   std::vector<std::unordered_map<unsigned, Temp>> renames; /* per-block rename maps — used by SSA repair outside this chunk */
   std::vector<std::vector<Instruction*>> incomplete_phis;  /* per-block phis awaiting operands — TODO confirm against SSA-repair code */
   std::vector<bool> filled;          /* per-block processing flag (set outside this chunk) */
   std::vector<bool> sealed;          /* per-block sealing flag (set outside this chunk) */
   std::unordered_map<unsigned, Temp> orig_names;           /* renamed temp id -> original temp (used by print_regs) */
   std::unordered_map<unsigned, phi_info> phi_map;
   std::unordered_map<unsigned, unsigned> affinities;       /* temp id -> temp id whose register it should try to share */
   std::unordered_map<unsigned, Instruction*> vectors;      /* temp id -> p_create_vector instruction using it as operand */
   std::unordered_map<unsigned, Instruction*> split_vectors;/* temp id -> instruction splitting it into components */
   aco_ptr<Instruction> pseudo_dummy; /* placeholder instruction for register queries without a real instruction */
   unsigned max_used_sgpr = 0;
   unsigned max_used_vgpr = 0;
   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

   ra_ctx(Program* program) : program(program),
                              assignments(program->peekAllocationId()),
                              renames(program->blocks.size()),
                              incomplete_phis(program->blocks.size()),
                              filled(program->blocks.size()),
                              sealed(program->blocks.size())
   {
      pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
   }
};
83
84 bool instr_can_access_subdword(ra_ctx& ctx, aco_ptr<Instruction>& instr)
85 {
86 if (ctx.program->chip_class < GFX8)
87 return false;
88 return instr->isSDWA() || instr->format == Format::PSEUDO;
89 }
90
91 struct DefInfo {
92 uint16_t lb;
93 uint16_t ub;
94 uint8_t size;
95 uint8_t stride;
96 RegClass rc;
97
98 DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
99 size = rc.size();
100 stride = 1;
101
102 if (rc.type() == RegType::vgpr) {
103 lb = 256;
104 ub = 256 + ctx.program->max_reg_demand.vgpr;
105 } else {
106 lb = 0;
107 ub = ctx.program->max_reg_demand.sgpr;
108 if (size == 2)
109 stride = 2;
110 else if (size >= 4)
111 stride = 4;
112 }
113
114 if (rc.is_subdword()) {
115 /* stride in bytes */
116 if(!instr_can_access_subdword(ctx, instr))
117 stride = 4;
118 else if (rc.bytes() % 4 == 0)
119 stride = 4;
120 else if (rc.bytes() % 2 == 0)
121 stride = 2;
122 }
123 }
124 };
125
/* Models the contents of the physical register file during allocation.
 *
 * regs[i] holds the temp id occupying register i, with three sentinels:
 *   0          — register is free
 *   0xFFFFFFFF — register is blocked (reserved during copy placement)
 *   0xF0000000 — register holds subdword temps; per-byte occupancy is
 *                recorded in subdword_regs[i].
 */
class RegisterFile {
public:
   RegisterFile() {regs.fill(0);}

   /* 512 slots: SGPRs at [0, 256), VGPRs at [256, 512) */
   std::array<uint32_t, 512> regs;
   /* per-register byte slots for subdword allocations: reg -> 4 byte entries */
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator [] (unsigned index) const {
      return regs[index];
   }

   uint32_t& operator [] (unsigned index) {
      return regs[index];
   }

   /* number of completely free registers in [start, start + size) */
   unsigned count_zero(PhysReg start, unsigned size) {
      unsigned res = 0;
      for (unsigned i = 0; i < size; i++)
         res += !regs[start + i];
      return res;
   }

   /* returns true if any byte in [start.reg_b, start.reg_b + num_bytes) is occupied */
   bool test(PhysReg start, unsigned num_bytes) {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* a real temp id (and the blocked sentinel) has low bits set;
          * the subdword sentinel 0xF0000000 does not */
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            assert(subdword_regs.find(i) != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* marks the registers (or bytes) covered by rc at start as blocked */
   void block(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   /* returns true if the register at start (from its byte offset onwards) is blocked */
   bool is_blocked(PhysReg start) {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   void clear(PhysReg start, RegClass rc) {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   /* records an operand's temp as occupying its fixed register */
   void fill(Operand op) {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) {
      clear(op.physReg(), op.regClass());
   }

   /* records a definition's temp as occupying its fixed register */
   void fill(Definition def) {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) {
      clear(def.physReg(), def.regClass());
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val) {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   /* writes val into the byte slots covered by [start.reg_b, start.reg_b + num_bytes);
    * with val == 0 this clears, and fully-emptied registers are freed again */
   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub = subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         /* all four bytes free again: drop the map entry and free the register */
         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
231
232
/* helper function for debugging */
/* Compiled out; dumps a textual map of the register file ('.' free, '~'
 * blocked, '#'/'@' alternating occupied runs) followed by the assignments. */
#if 0
void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf(" ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
#endif
299
300
301 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
302 {
303 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
304 unsigned size = rc.size();
305 if (rc.type() == RegType::vgpr) {
306 assert(reg >= 256);
307 unsigned hi = reg - 256 + size - 1;
308 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
309 } else if (reg + rc.size() <= max_addressible_sgpr) {
310 unsigned hi = reg + size - 1;
311 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
312 }
313 }
314
315
/* Allocates fresh temp ids for the parallelcopy definitions produced during
 * register allocation, records their assignments, and renames the operands
 * of 'instr' that were moved by a copy. */
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    aco_ptr<Instruction>& instr)
{
   /* allocate id's and rename operands: this is done transparently here */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;

      /* check if we moved another parallelcopy definition: if so, this copy
       * must read from that copy's source instead */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (copy.first.getTemp() == other.second.getTemp()) {
            copy.first.setTemp(other.first.getTemp());
            copy.first.setFixed(other.first.physReg());
         }
      }
      // FIXME: if a definition got moved, change the target location and remove the parallelcopy
      copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      reg_file.fill(copy.second);

      /* check if we moved an operand */
      for (Operand& op : instr->operands) {
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* for p_create_vector a still-live operand may stay in place —
             * but only if no parallelcopy definition overlaps its register range */
            bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               /* non-overlap test between the operand's range and this copy's destination */
               omit_renaming &= def_reg > copy.first.physReg() ?
                                (copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
                                (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming)
               continue;
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());
         }
      }
   }
}
361
362 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
363 RegisterFile& reg_file,
364 DefInfo info)
365 {
366 uint32_t lb = info.lb;
367 uint32_t ub = info.ub;
368 uint32_t size = info.size;
369 uint32_t stride = info.stride;
370 RegClass rc = info.rc;
371
372 if (rc.is_subdword()) {
373 for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
374 assert(reg_file[entry.first] == 0xF0000000);
375 if (lb > entry.first || entry.first >= ub)
376 continue;
377
378 for (unsigned i = 0; i < 4; i+= stride) {
379 if (entry.second[i] != 0)
380 continue;
381
382 bool reg_found = true;
383 for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
384 reg_found &= entry.second[i + j] == 0;
385
386 /* check neighboring reg if needed */
387 reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
388 if (reg_found) {
389 PhysReg res{entry.first};
390 res.reg_b += i;
391 adjust_max_used_regs(ctx, rc, entry.first);
392 return {res, true};
393 }
394 }
395 }
396
397 stride = 1; /* stride in full registers */
398 rc = info.rc = RegClass(RegType::vgpr, size);
399 }
400
401 if (stride == 1) {
402
403 for (unsigned stride = 8; stride > 1; stride /= 2) {
404 if (size % stride)
405 continue;
406 info.stride = stride;
407 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
408 if (res.second)
409 return res;
410 }
411
412 /* best fit algorithm: find the smallest gap to fit in the variable */
413 unsigned best_pos = 0xFFFF;
414 unsigned gap_size = 0xFFFF;
415 unsigned last_pos = 0xFFFF;
416
417 for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
418
419 if (reg_file[current_reg] == 0 && !ctx.war_hint[current_reg]) {
420 if (last_pos == 0xFFFF)
421 last_pos = current_reg;
422
423 /* stop searching after max_used_gpr */
424 if (current_reg == ctx.max_used_sgpr + 1 || current_reg == 256 + ctx.max_used_vgpr + 1)
425 break;
426 else
427 continue;
428 }
429
430 if (last_pos == 0xFFFF)
431 continue;
432
433 /* early return on exact matches */
434 if (last_pos + size == current_reg) {
435 adjust_max_used_regs(ctx, rc, last_pos);
436 return {PhysReg{last_pos}, true};
437 }
438
439 /* check if it fits and the gap size is smaller */
440 if (last_pos + size < current_reg && current_reg - last_pos < gap_size) {
441 best_pos = last_pos;
442 gap_size = current_reg - last_pos;
443 }
444 last_pos = 0xFFFF;
445 }
446
447 /* final check */
448 if (last_pos + size <= ub && ub - last_pos < gap_size) {
449 best_pos = last_pos;
450 gap_size = ub - last_pos;
451 }
452
453 if (best_pos == 0xFFFF)
454 return {{}, false};
455
456 /* find best position within gap by leaving a good stride for other variables*/
457 unsigned buffer = gap_size - size;
458 if (buffer > 1) {
459 if (((best_pos + size) % 8 != 0 && (best_pos + buffer) % 8 == 0) ||
460 ((best_pos + size) % 4 != 0 && (best_pos + buffer) % 4 == 0) ||
461 ((best_pos + size) % 2 != 0 && (best_pos + buffer) % 2 == 0))
462 best_pos = best_pos + buffer;
463 }
464
465 adjust_max_used_regs(ctx, rc, best_pos);
466 return {PhysReg{best_pos}, true};
467 }
468
469 bool found = false;
470 unsigned reg_lo = lb;
471 unsigned reg_hi = lb + size - 1;
472 while (!found && reg_lo + size <= ub) {
473 if (reg_file[reg_lo] != 0) {
474 reg_lo += stride;
475 continue;
476 }
477 reg_hi = reg_lo + size - 1;
478 found = true;
479 for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
480 if (reg_file[reg] != 0 || ctx.war_hint[reg])
481 found = false;
482 }
483 if (found) {
484 adjust_max_used_regs(ctx, rc, reg_lo);
485 return {PhysReg{reg_lo}, true};
486 }
487
488 reg_lo += stride;
489 }
490
491 return {{}, false};
492 }
493
494 /* collect variables from a register area and clear reg_file */
495 std::set<std::pair<unsigned, unsigned>> collect_vars(ra_ctx& ctx, RegisterFile& reg_file,
496 PhysReg reg, unsigned size)
497 {
498 std::set<std::pair<unsigned, unsigned>> vars;
499 for (unsigned j = reg; j < reg + size; j++) {
500 if (reg_file.is_blocked(PhysReg{j}))
501 continue;
502 if (reg_file[j] == 0xF0000000) {
503 for (unsigned k = 0; k < 4; k++) {
504 unsigned id = reg_file.subdword_regs[j][k];
505 if (id) {
506 assignment& var = ctx.assignments[id];
507 vars.emplace(var.rc.bytes(), id);
508 reg_file.clear(var.reg, var.rc);
509 if (!reg_file[j])
510 break;
511 }
512 }
513 } else if (reg_file[j] != 0) {
514 unsigned id = reg_file[j];
515 assignment& var = ctx.assignments[id];
516 vars.emplace(var.rc.bytes(), id);
517 reg_file.clear(var.reg, var.rc);
518 }
519 }
520 return vars;
521 }
522
/* Finds new registers for all variables in 'vars' (which were displaced by a
 * definition placed at [def_reg_lo, def_reg_hi]) and records the required
 * moves in 'parallelcopies'. May recursively displace further variables.
 * Returns false if no valid placement could be found. */
bool get_regs_for_copies(ra_ctx& ctx,
                         RegisterFile& reg_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopies,
                         const std::set<std::pair<unsigned, unsigned>> &vars,
                         uint32_t lb, uint32_t ub,
                         aco_ptr<Instruction>& instr,
                         uint32_t def_reg_lo,
                         uint32_t def_reg_hi)
{

   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
      unsigned id = it->second;
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
            is_dead_operand = true;
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* move the operand straight to its slot inside the vector definition */
            for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  PhysReg reg(def_reg_lo);
                  reg.reg_b += offset;
                  assert(!reg_file.test(reg, var.rc.bytes()));
                  res = {reg, reg.byte() == 0 || instr_can_access_subdword(ctx, instr)};
                  break;
               }
            }
         } else {
            /* a dead operand may live anywhere inside the definition's space */
            info.lb = def_reg_lo;
            info.ub = def_reg_hi + 1;
            res = get_reg_simple(ctx, reg_file, info);
         }
      } else {
         /* a live variable must go outside the definition's space:
          * first try below it, then (stride-aligned) above it */
         info.lb = lb;
         info.ub = def_reg_lo;
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second) {
            info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
            info.ub = ub;
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free space found: displace other (smaller) variables instead */
      unsigned best_pos = lb;
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned reg_lo = lb;
      unsigned reg_hi = lb + size - 1;
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
         /* the window must not touch the definition unless the operand is dead anyway */
         if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
                                  (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* only displace variables strictly smaller than the current one
             * (unless they die here anyway) to guarantee termination */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_lo;
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      reg_lo = best_pos;
      reg_hi = best_pos + size - 1;

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);

      /* mark the area as blocked */
      reg_file.block(PhysReg{reg_lo}, var.rc);

      /* recursively find homes for the variables we just displaced */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
         return false;

      adjust_max_used_regs(ctx, var.rc, reg_lo);

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
682
683
/* Finds a register range for the definition described by 'info', splitting
 * live ranges if necessary: other variables in the chosen window are moved
 * away via 'parallelcopies'.
 * Returns (reg, true) on success, ({}, false) if no placement was found. */
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
                                      RegisterFile& reg_file,
                                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                                      DefInfo info,
                                      aco_ptr<Instruction>& instr)
{
   uint32_t lb = info.lb;
   uint32_t ub = info.ub;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);

   /* mark and count killed operands: their registers become available after
    * this instruction, so they are temporarily blocked (not free, not live) */
   unsigned killed_ops = 0;
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      if (instr->operands[j].isTemp() &&
          instr->operands[j].isFirstKillBeforeDef() &&
          instr->operands[j].physReg() >= lb &&
          instr->operands[j].physReg() < ub &&
          !reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes())) {
         assert(instr->operands[j].isFixed());
         reg_file.block(instr->operands[j].physReg(), instr->operands[j].regClass());
         killed_ops += instr->operands[j].getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   unsigned best_pos = lb;
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   unsigned reg_lo = lb;
   unsigned reg_hi = lb + size - 1;
   for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
      /* first check the edges: this is what we have to fix to allow for num_moves > size */
      if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
         continue;
      if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
      for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         /* dead operands effectively reduce the number of estimated moves */
         if (reg_file.is_blocked(PhysReg{j})) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         /* never displace a variable at least as large as the definition */
         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      /* keep the window with the fewest moves; on ties, with fewer variables;
       * on full ties, prefer 4-register-aligned positions for v4 */
      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_pos = reg_lo;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF) {
      /* remove killed operands from reg_file once again */
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef())
            reg_file.clear(instr->operands[i]);
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   /* keep a copy so we can roll back if the copies cannot be placed */
   RegisterFile register_file = reg_file;

   /* now, we figured the placement for our definition */
   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);

   if (instr->opcode == aco_opcode::p_create_vector && ctx.program->chip_class >= GFX9) {
      /* move killed operands which aren't yet at the correct position */
      for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
             instr->operands[i].getTemp().type() == rc.type()) {

            if (instr->operands[i].physReg() != best_pos + offset) {
               vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
               reg_file.clear(instr->operands[i]);
            } else {
               reg_file.fill(instr->operands[i]);
            }
         }
      }
   } else {
      /* re-enable the killed operands */
      for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
         if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
            reg_file.fill(instr->operands[j]);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
      /* roll back to the pre-collect state */
      reg_file = std::move(register_file);
      /* remove killed operands from reg_file once again */
      if (!is_phi(instr)) {
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               reg_file.clear(op);
         }
      }
      for (unsigned i = 0; i < instr->definitions.size(); i++) {
         Definition& def = instr->definitions[i];
         if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
            reg_file.fill(def);
      }
      return {{}, false};
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   /* we set the definition regs == 0. the actual caller is responsible for correct setting */
   reg_file.clear(PhysReg{best_pos}, rc);

   update_renames(ctx, reg_file, parallelcopies, instr);

   /* remove killed operands from reg_file once again */
   for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
         continue;
      assert(!instr->operands[i].isUndefined());
      if (instr->operands[i].isFirstKillBeforeDef())
         reg_file.clear(instr->operands[i]);
   }
   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition def = instr->definitions[i];
      if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i))
         reg_file.fill(def);
   }

   adjust_max_used_regs(ctx, rc, best_pos);
   return {PhysReg{best_pos}, true};
}
875
876 bool get_reg_specified(ra_ctx& ctx,
877 RegisterFile& reg_file,
878 RegClass rc,
879 std::vector<std::pair<Operand, Definition>>& parallelcopies,
880 aco_ptr<Instruction>& instr,
881 PhysReg reg)
882 {
883 if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(ctx, instr))
884 return false;
885 if (!rc.is_subdword() && reg.byte())
886 return false;
887
888 uint32_t size = rc.size();
889 uint32_t stride = 1;
890 uint32_t lb, ub;
891
892 if (rc.type() == RegType::vgpr) {
893 lb = 256;
894 ub = 256 + ctx.program->max_reg_demand.vgpr;
895 } else {
896 if (size == 2)
897 stride = 2;
898 else if (size >= 4)
899 stride = 4;
900 if (reg % stride != 0)
901 return false;
902 lb = 0;
903 ub = ctx.program->max_reg_demand.sgpr;
904 }
905
906 uint32_t reg_lo = reg.reg();
907 uint32_t reg_hi = reg + (size - 1);
908
909 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
910 return false;
911
912 if (reg_file.test(reg, rc.bytes()))
913 return false;
914
915 adjust_max_used_regs(ctx, rc, reg_lo);
916 return true;
917 }
918
919 PhysReg get_reg(ra_ctx& ctx,
920 RegisterFile& reg_file,
921 Temp temp,
922 std::vector<std::pair<Operand, Definition>>& parallelcopies,
923 aco_ptr<Instruction>& instr)
924 {
925 auto split_vec = ctx.split_vectors.find(temp.id());
926 if (split_vec != ctx.split_vectors.end()) {
927 unsigned offset = 0;
928 for (Definition def : split_vec->second->definitions) {
929 auto affinity_it = ctx.affinities.find(def.tempId());
930 if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {
931 PhysReg reg = ctx.assignments[affinity_it->second].reg;
932 reg.reg_b -= offset;
933 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
934 return reg;
935 }
936 offset += def.bytes();
937 }
938 }
939
940 if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
941 ctx.assignments[ctx.affinities[temp.id()]].assigned) {
942 PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
943 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
944 return reg;
945 }
946
947 if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
948 Instruction* vec = ctx.vectors[temp.id()];
949 unsigned byte_offset = 0;
950 for (const Operand& op : vec->operands) {
951 if (op.isTemp() && op.tempId() == temp.id())
952 break;
953 else
954 byte_offset += op.bytes();
955 }
956 unsigned k = 0;
957 for (const Operand& op : vec->operands) {
958 if (op.isTemp() &&
959 op.tempId() != temp.id() &&
960 op.getTemp().type() == temp.type() &&
961 ctx.assignments[op.tempId()].assigned) {
962 PhysReg reg = ctx.assignments[op.tempId()].reg;
963 reg.reg_b += (byte_offset - k);
964 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
965 return reg;
966 }
967 k += op.bytes();
968 }
969
970 DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
971 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
972 PhysReg reg = res.first;
973 if (res.second) {
974 reg.reg_b += byte_offset;
975 /* make sure to only use byte offset if the instruction supports it */
976 if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
977 return reg;
978 }
979 }
980
981 DefInfo info(ctx, instr, temp.regClass());
982
983 /* try to find space without live-range splits */
984 std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
985
986 if (res.second)
987 return res.first;
988
989 /* try to find space with live-range splits */
990 res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
991
992 if (res.second)
993 return res.first;
994
995 /* try using more registers */
996
997 /* We should only fail here because keeping under the limit would require
998 * too many moves. */
999 assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);
1000
1001 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
1002 uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
1003 if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
1004 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
1005 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1006 } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
1007 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
1008 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1009 }
1010
1011 //FIXME: if nothing helps, shift-rotate the registers to make space
1012
1013 fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n");
1014 abort();
1015 }
1016
1017 PhysReg get_reg_create_vector(ra_ctx& ctx,
1018 RegisterFile& reg_file,
1019 Temp temp,
1020 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1021 aco_ptr<Instruction>& instr)
1022 {
1023 RegClass rc = temp.regClass();
1024 /* create_vector instructions have different costs w.r.t. register coalescing */
1025 uint32_t size = rc.size();
1026 uint32_t bytes = rc.bytes();
1027 uint32_t stride = 1;
1028 uint32_t lb, ub;
1029 if (rc.type() == RegType::vgpr) {
1030 lb = 256;
1031 ub = 256 + ctx.program->max_reg_demand.vgpr;
1032 } else {
1033 lb = 0;
1034 ub = ctx.program->max_reg_demand.sgpr;
1035 if (size == 2)
1036 stride = 2;
1037 else if (size >= 4)
1038 stride = 4;
1039 }
1040
1041 //TODO: improve p_create_vector for sub-dword vectors
1042
1043 unsigned best_pos = -1;
1044 unsigned num_moves = 0xFF;
1045 bool best_war_hint = true;
1046
1047 /* test for each operand which definition placement causes the least shuffle instructions */
1048 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1049 // TODO: think about, if we can alias live operands on the same register
1050 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
1051 continue;
1052
1053 if (offset > instr->operands[i].physReg().reg_b)
1054 continue;
1055
1056 unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
1057 if (reg_lo % 4)
1058 continue;
1059 reg_lo /= 4;
1060 unsigned reg_hi = reg_lo + size - 1;
1061 unsigned k = 0;
1062
1063 /* no need to check multiple times */
1064 if (reg_lo == best_pos)
1065 continue;
1066
1067 /* check borders */
1068 // TODO: this can be improved */
1069 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
1070 continue;
1071 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
1072 continue;
1073 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
1074 continue;
1075
1076 /* count variables to be moved and check war_hint */
1077 bool war_hint = false;
1078 bool linear_vgpr = false;
1079 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
1080 if (reg_file[j] != 0) {
1081 if (reg_file[j] == 0xF0000000) {
1082 PhysReg reg;
1083 reg.reg_b = j * 4;
1084 unsigned bytes_left = bytes - (j - reg_lo) * 4;
1085 for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
1086 k += reg_file.test(reg, 1);
1087 } else {
1088 k += 4;
1089 /* we cannot split live ranges of linear vgprs */
1090 if (ctx.assignments[reg_file[j]].rc & (1 << 6))
1091 linear_vgpr = true;
1092 }
1093 }
1094 war_hint |= ctx.war_hint[j];
1095 }
1096 if (linear_vgpr || (war_hint && !best_war_hint))
1097 continue;
1098
1099 /* count operands in wrong positions */
1100 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
1101 if (j == i ||
1102 !instr->operands[j].isTemp() ||
1103 instr->operands[j].getTemp().type() != rc.type())
1104 continue;
1105 if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
1106 k += instr->operands[j].bytes();
1107 }
1108 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
1109 if (k > num_moves || (!aligned && k == num_moves))
1110 continue;
1111
1112 best_pos = reg_lo;
1113 num_moves = k;
1114 best_war_hint = war_hint;
1115 }
1116
1117 if (num_moves >= bytes)
1118 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1119
1120 /* collect variables to be moved */
1121 std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
1122
1123 /* GFX9+: move killed operands which aren't yet at the correct position
1124 * Moving all killed operands generally leads to more register swaps.
1125 * This is only done on GFX9+ because of the cheap v_swap instruction.
1126 */
1127 if (ctx.program->chip_class >= GFX9) {
1128 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
1129 if (instr->operands[i].isTemp() &&
1130 instr->operands[i].isFirstKillBeforeDef() &&
1131 instr->operands[i].getTemp().type() == rc.type() &&
1132 instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
1133 vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
1134 }
1135 }
1136 } else {
1137 /* re-enable the killed operands */
1138 for (unsigned j = 0; j < instr->operands.size(); j++) {
1139 if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
1140 reg_file.fill(instr->operands[j]);
1141 }
1142 }
1143 ASSERTED bool success = false;
1144 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
1145 assert(success);
1146
1147 update_renames(ctx, reg_file, parallelcopies, instr);
1148 adjust_max_used_regs(ctx, rc, best_pos);
1149
1150 /* remove killed operands from reg_file once again */
1151 for (unsigned i = 0; i < instr->operands.size(); i++) {
1152 if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
1153 continue;
1154 assert(!instr->operands[i].isUndefined());
1155 if (instr->operands[i].isFirstKillBeforeDef())
1156 reg_file.clear(instr->operands[i]);
1157 }
1158
1159 return PhysReg{best_pos};
1160 }
1161
1162 void handle_pseudo(ra_ctx& ctx,
1163 const RegisterFile& reg_file,
1164 Instruction* instr)
1165 {
1166 if (instr->format != Format::PSEUDO)
1167 return;
1168
1169 /* all instructions which use handle_operands() need this information */
1170 switch (instr->opcode) {
1171 case aco_opcode::p_extract_vector:
1172 case aco_opcode::p_create_vector:
1173 case aco_opcode::p_split_vector:
1174 case aco_opcode::p_parallelcopy:
1175 case aco_opcode::p_wqm:
1176 break;
1177 default:
1178 return;
1179 }
1180
1181 /* if all definitions are vgpr, no need to care for SCC */
1182 bool writes_sgpr = false;
1183 for (Definition& def : instr->definitions) {
1184 if (def.getTemp().type() == RegType::sgpr) {
1185 writes_sgpr = true;
1186 break;
1187 }
1188 }
1189 /* if all operands are constant, no need to care either */
1190 bool reads_sgpr = false;
1191 for (Operand& op : instr->operands) {
1192 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1193 reads_sgpr = true;
1194 break;
1195 }
1196 }
1197 if (!(writes_sgpr && reads_sgpr))
1198 return;
1199
1200 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
1201 if (reg_file[scc.reg()]) {
1202 pi->tmp_in_scc = true;
1203
1204 int reg = ctx.max_used_sgpr;
1205 for (; reg >= 0 && reg_file[reg]; reg--)
1206 ;
1207 if (reg < 0) {
1208 reg = ctx.max_used_sgpr + 1;
1209 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
1210 ;
1211 assert(reg < ctx.program->max_reg_demand.sgpr);
1212 }
1213
1214 adjust_max_used_regs(ctx, s1, reg);
1215 pi->scratch_sgpr = PhysReg{(unsigned)reg};
1216 } else {
1217 pi->tmp_in_scc = false;
1218 }
1219 }
1220
1221 bool operand_can_use_reg(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
1222 {
1223 if (instr->operands[idx].isFixed())
1224 return instr->operands[idx].physReg() == reg;
1225
1226 if (reg.byte() && !instr_can_access_subdword(ctx, instr))
1227 return false;
1228
1229 switch (instr->format) {
1230 case Format::SMEM:
1231 return reg != scc &&
1232 reg != exec &&
1233 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
1234 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
1235 default:
1236 // TODO: there are more instructions with restrictions on registers
1237 return true;
1238 }
1239 }
1240
/* Assigns a register to an operand whose current location is unusable for
 * this instruction, emitting the parallelcopy that moves the value there.
 * If the operand is pre-fixed to a register that is occupied, the blocking
 * variable is first moved elsewhere (via another parallelcopy). */
void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
                         std::vector<std::pair<Operand, Definition>>& parallelcopy,
                         aco_ptr<Instruction>& instr, Operand& operand)
{
   /* check if the operand is fixed */
   PhysReg dst;
   if (operand.isFixed()) {
      assert(operand.physReg() != ctx.assignments[operand.tempId()].reg);

      /* check if target reg is blocked, and move away the blocking var */
      if (register_file[operand.physReg().reg()]) {
         /* 0xF0000000 marks sub-dword allocations, which we don't expect here */
         assert(register_file[operand.physReg()] != 0xF0000000);
         uint32_t blocking_id = register_file[operand.physReg().reg()];
         RegClass rc = ctx.assignments[blocking_id].rc;
         Operand pc_op = Operand(Temp{blocking_id, rc});
         pc_op.setFixed(operand.physReg());

         /* find free reg */
         /* use the dummy instruction so get_reg applies no instruction-specific
          * restrictions to the blocking variable's new location */
         PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
         Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
         register_file.clear(pc_op);
         parallelcopy.emplace_back(pc_op, pc_def);
      }
      dst = operand.physReg();

   } else {
      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr);
   }

   /* emit the copy that moves the operand's value from its current assignment
    * into 'dst'; the old location is freed in the register file */
   Operand pc_op = operand;
   pc_op.setFixed(ctx.assignments[operand.tempId()].reg);
   Definition pc_def = Definition(dst, pc_op.regClass());
   register_file.clear(pc_op);
   parallelcopy.emplace_back(pc_op, pc_def);
   update_renames(ctx, register_file, parallelcopy, instr);
}
1277
1278 Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
1279 {
1280 std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
1281 if (it == ctx.renames[block_idx].end())
1282 return val;
1283 else
1284 return it->second;
1285 }
1286
1287 Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block)
1288 {
1289 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
1290 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())
1291 return val;
1292
1293 assert(preds.size() > 0);
1294
1295 Temp new_val;
1296 if (!ctx.sealed[block->index]) {
1297 /* consider rename from already processed predecessor */
1298 Temp tmp = read_variable(ctx, val, preds[0]);
1299
1300 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1301 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1302 ctx.assignments.emplace_back();
1303 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1304 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1305 phi->definitions[0] = Definition(new_val);
1306 for (unsigned i = 0; i < preds.size(); i++)
1307 phi->operands[i] = Operand(val);
1308 if (tmp.regClass() == new_val.regClass())
1309 ctx.affinities[new_val.id()] = tmp.id();
1310
1311 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1312 ctx.incomplete_phis[block->index].emplace_back(phi.get());
1313 block->instructions.insert(block->instructions.begin(), std::move(phi));
1314
1315 } else if (preds.size() == 1) {
1316 /* if the block has only one predecessor, just look there for the name */
1317 new_val = read_variable(ctx, val, preds[0]);
1318 } else {
1319 /* there are multiple predecessors and the block is sealed */
1320 Temp ops[preds.size()];
1321
1322 /* get the rename from each predecessor and check if they are the same */
1323 bool needs_phi = false;
1324 for (unsigned i = 0; i < preds.size(); i++) {
1325 ops[i] = read_variable(ctx, val, preds[i]);
1326 if (i == 0)
1327 new_val = ops[i];
1328 else
1329 needs_phi |= !(new_val == ops[i]);
1330 }
1331
1332 if (needs_phi) {
1333 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1334 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1335 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1336 new_val = Temp{ctx.program->allocateId(), val.regClass()};
1337 phi->definitions[0] = Definition(new_val);
1338 for (unsigned i = 0; i < preds.size(); i++) {
1339 phi->operands[i] = Operand(ops[i]);
1340 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
1341 if (ops[i].regClass() == new_val.regClass())
1342 ctx.affinities[new_val.id()] = ops[i].id();
1343 }
1344 ctx.assignments.emplace_back();
1345 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1346 ctx.phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1347 block->instructions.insert(block->instructions.begin(), std::move(phi));
1348 }
1349 }
1350
1351 if (new_val != val) {
1352 ctx.renames[block->index][val.id()] = new_val;
1353 ctx.orig_names[new_val.id()] = val;
1354 }
1355 return new_val;
1356 }
1357
/* Removes the phi defining 'temp' if it is trivial, i.e. all operands are
 * either the same value or the phi's own definition. All uses are rerouted
 * to that single value, renames are updated, and phis that used this one are
 * recursively re-checked for triviality. Removal is indicated by clearing
 * the phi's definitions rather than erasing the instruction. */
void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
{
   std::unordered_map<unsigned, phi_info>::iterator info = ctx.phi_map.find(temp.id());

   /* only phis in already-sealed blocks can be judged */
   if (info == ctx.phi_map.end() || !ctx.sealed[info->second.block_idx])
      return;

   assert(info->second.block_idx != 0);
   Instruction* phi = info->second.phi;
   Temp same = Temp();
   Definition def = phi->definitions[0];

   /* a phi node is trivial if all operands are the same as the definition of the phi */
   for (const Operand& op : phi->operands) {
      const Temp t = op.getTemp();
      if (t == same || t == def.getTemp()) {
         assert(t == same || op.physReg() == def.physReg());
         continue;
      }
      /* a second distinct value: the phi is not trivial */
      if (same != Temp())
         return;

      same = t;
   }
   assert(same != Temp() || same == def.getTemp());

   /* reroute all uses to same and remove phi */
   std::vector<Temp> phi_users;
   std::unordered_map<unsigned, phi_info>::iterator same_phi_info = ctx.phi_map.find(same.id());
   for (Instruction* instr : info->second.uses) {
      assert(phi != instr);
      /* recursively try to remove trivial phis */
      if (is_phi(instr)) {
         /* ignore if the phi was already flagged trivial */
         if (instr->definitions.empty())
            continue;

         if (instr->definitions[0].getTemp() != temp)
            phi_users.emplace_back(instr->definitions[0].getTemp());
      }
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.tempId() == def.tempId()) {
            op.setTemp(same);
            /* 'same' gains this use; keep its phi's use set consistent */
            if (same_phi_info != ctx.phi_map.end())
               same_phi_info->second.uses.emplace(instr);
         }
      }
   }

   /* rename the original variable to 'same' in every block where it still
    * maps to the removed phi's definition */
   auto it = ctx.orig_names.find(same.id());
   unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
   for (unsigned i = 0; i < ctx.program->blocks.size(); i++) {
      auto it = ctx.renames[i].find(orig_var);
      if (it != ctx.renames[i].end() && it->second == def.getTemp())
         ctx.renames[i][orig_var] = same;
   }

   phi->definitions.clear(); /* this indicates that the phi can be removed */
   ctx.phi_map.erase(info);
   /* users of this phi may have become trivial themselves */
   for (Temp t : phi_users)
      try_remove_trivial_phi(ctx, t);

   return;
}
1422
1423 } /* end namespace */
1424
1425
1426 void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
1427 {
1428 ra_ctx ctx(program);
1429 std::vector<std::vector<Temp>> phi_ressources;
1430 std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
1431
1432 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1433 Block& block = *it;
1434
1435 /* first, compute the death points of all live vars within the block */
1436 TempSet& live = live_out_per_block[block.index];
1437
1438 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1439 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1440 aco_ptr<Instruction>& instr = *rit;
1441 if (is_phi(instr)) {
1442 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
1443 live.erase(instr->definitions[0].getTemp());
1444 continue;
1445 }
1446 /* collect information about affinity-related temporaries */
1447 std::vector<Temp> affinity_related;
1448 /* affinity_related[0] is the last seen affinity-related temp */
1449 affinity_related.emplace_back(instr->definitions[0].getTemp());
1450 affinity_related.emplace_back(instr->definitions[0].getTemp());
1451 for (const Operand& op : instr->operands) {
1452 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1453 affinity_related.emplace_back(op.getTemp());
1454 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1455 }
1456 }
1457 phi_ressources.emplace_back(std::move(affinity_related));
1458 } else {
1459 /* add vector affinities */
1460 if (instr->opcode == aco_opcode::p_create_vector) {
1461 for (const Operand& op : instr->operands) {
1462 if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1463 ctx.vectors[op.tempId()] = instr.get();
1464 }
1465 }
1466
1467 if (instr->opcode == aco_opcode::p_split_vector && instr->operands[0].isFirstKillBeforeDef())
1468 ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
1469
1470 /* add operands to live variables */
1471 for (const Operand& op : instr->operands) {
1472 if (op.isTemp())
1473 live.emplace(op.getTemp());
1474 }
1475 }
1476
1477 /* erase definitions from live */
1478 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1479 const Definition& def = instr->definitions[i];
1480 if (!def.isTemp())
1481 continue;
1482 live.erase(def.getTemp());
1483 /* mark last-seen phi operand */
1484 std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1485 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1486 phi_ressources[it->second][0] = def.getTemp();
1487 /* try to coalesce phi affinities with parallelcopies */
1488 Operand op = Operand();
1489 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
1490 op = instr->operands[i];
1491 else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
1492 op = instr->operands[2];
1493
1494 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
1495 phi_ressources[it->second].emplace_back(op.getTemp());
1496 temp_to_phi_ressources[op.tempId()] = it->second;
1497 }
1498 }
1499 }
1500 }
1501 }
1502 /* create affinities */
1503 for (std::vector<Temp>& vec : phi_ressources) {
1504 assert(vec.size() > 1);
1505 for (unsigned i = 1; i < vec.size(); i++)
1506 if (vec[i].id() != vec[0].id())
1507 ctx.affinities[vec[i].id()] = vec[0].id();
1508 }
1509
1510 /* state of register file after phis */
1511 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1512
1513 for (Block& block : program->blocks) {
1514 TempSet& live = live_out_per_block[block.index];
1515 /* initialize register file */
1516 assert(block.index != 0 || live.empty());
1517 RegisterFile register_file;
1518 ctx.war_hint.reset();
1519
1520 for (Temp t : live) {
1521 Temp renamed = handle_live_in(ctx, t, &block);
1522 assignment& var = ctx.assignments[renamed.id()];
1523 /* due to live-range splits, the live-in might be a phi, now */
1524 if (var.assigned)
1525 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
1526 }
1527
1528 std::vector<aco_ptr<Instruction>> instructions;
1529 std::vector<aco_ptr<Instruction>>::iterator it;
1530
1531 /* this is a slight adjustment from the paper as we already have phi nodes:
1532 * We consider them incomplete phis and only handle the definition. */
1533
1534 /* handle fixed phi definitions */
1535 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1536 aco_ptr<Instruction>& phi = *it;
1537 if (!is_phi(phi))
1538 break;
1539 Definition& definition = phi->definitions[0];
1540 if (!definition.isFixed())
1541 continue;
1542
1543 /* check if a dead exec mask phi is needed */
1544 if (definition.isKill()) {
1545 for (Operand& op : phi->operands) {
1546 assert(op.isTemp());
1547 if (!ctx.assignments[op.tempId()].assigned ||
1548 ctx.assignments[op.tempId()].reg != exec) {
1549 definition.setKill(false);
1550 break;
1551 }
1552 }
1553 }
1554
1555 if (definition.isKill())
1556 continue;
1557
1558 assert(definition.physReg() == exec);
1559 assert(!register_file.test(definition.physReg(), definition.bytes()));
1560 register_file.fill(definition);
1561 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1562 }
1563
1564 /* look up the affinities */
1565 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1566 aco_ptr<Instruction>& phi = *it;
1567 if (!is_phi(phi))
1568 break;
1569 Definition& definition = phi->definitions[0];
1570 if (definition.isKill() || definition.isFixed())
1571 continue;
1572
1573 if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
1574 ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
1575 assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass());
1576 PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
1577 bool try_use_special_reg = reg == scc || reg == exec;
1578 if (try_use_special_reg) {
1579 for (const Operand& op : phi->operands) {
1580 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned &&
1581 ctx.assignments[op.tempId()].reg == reg)) {
1582 try_use_special_reg = false;
1583 break;
1584 }
1585 }
1586 if (!try_use_special_reg)
1587 continue;
1588 }
1589 /* only assign if register is still free */
1590 if (!register_file.test(reg, definition.bytes())) {
1591 definition.setFixed(reg);
1592 register_file.fill(definition);
1593 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1594 }
1595 }
1596 }
1597
1598 /* find registers for phis without affinity or where the register was blocked */
1599 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1600 aco_ptr<Instruction>& phi = *it;
1601 if (!is_phi(phi))
1602 break;
1603
1604 Definition& definition = phi->definitions[0];
1605 if (definition.isKill())
1606 continue;
1607
1608 if (!definition.isFixed()) {
1609 std::vector<std::pair<Operand, Definition>> parallelcopy;
1610 /* try to find a register that is used by at least one operand */
1611 for (const Operand& op : phi->operands) {
1612 if (!(op.isTemp() && ctx.assignments[op.tempId()].assigned))
1613 continue;
1614 PhysReg reg = ctx.assignments[op.tempId()].reg;
1615 /* we tried this already on the previous loop */
1616 if (reg == scc || reg == exec)
1617 continue;
1618 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1619 definition.setFixed(reg);
1620 break;
1621 }
1622 }
1623 if (!definition.isFixed())
1624 definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
1625
1626 /* process parallelcopy */
1627 for (std::pair<Operand, Definition> pc : parallelcopy) {
1628 /* see if it's a copy from a different phi */
1629 //TODO: prefer moving some previous phis over live-ins
1630 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1631 Instruction *prev_phi = NULL;
1632 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1633 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1634 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1635 prev_phi = phi_it->get();
1636 }
1637 phi_it = it;
1638 while (!prev_phi && is_phi(*++phi_it)) {
1639 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1640 prev_phi = phi_it->get();
1641 }
1642 if (prev_phi) {
1643 /* if so, just update that phi's register */
1644 register_file.clear(prev_phi->definitions[0]);
1645 prev_phi->definitions[0].setFixed(pc.second.physReg());
1646 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1647 register_file.fill(prev_phi->definitions[0]);
1648 continue;
1649 }
1650
1651 /* rename */
1652 std::unordered_map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1653 Temp orig = pc.first.getTemp();
1654 if (orig_it != ctx.orig_names.end())
1655 orig = orig_it->second;
1656 else
1657 ctx.orig_names[pc.second.tempId()] = orig;
1658 ctx.renames[block.index][orig.id()] = pc.second.getTemp();
1659
1660 /* otherwise, this is a live-in and we need to create a new phi
1661 * to move it in this block's predecessors */
1662 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1663 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1664 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1665 new_phi->definitions[0] = pc.second;
1666 for (unsigned i = 0; i < preds.size(); i++)
1667 new_phi->operands[i] = Operand(pc.first);
1668 instructions.emplace_back(std::move(new_phi));
1669 }
1670
1671 register_file.fill(definition);
1672 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1673 }
1674 live.emplace(definition.getTemp());
1675
1676 /* update phi affinities */
1677 for (const Operand& op : phi->operands) {
1678 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1679 ctx.affinities[op.tempId()] = definition.tempId();
1680 }
1681
1682 instructions.emplace_back(std::move(*it));
1683 }
1684
1685 /* fill in sgpr_live_in */
1686 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1687 sgpr_live_in[block.index][i] = register_file[i];
1688 sgpr_live_in[block.index][127] = register_file[scc.reg()];
1689
1690 /* Handle all other instructions of the block */
1691 for (; it != block.instructions.end(); ++it) {
1692 aco_ptr<Instruction>& instr = *it;
1693
1694 /* parallelcopies from p_phi are inserted here which means
1695 * live ranges of killed operands end here as well */
1696 if (instr->opcode == aco_opcode::p_logical_end) {
1697 /* no need to process this instruction any further */
1698 if (block.logical_succs.size() != 1) {
1699 instructions.emplace_back(std::move(instr));
1700 continue;
1701 }
1702
1703 Block& succ = program->blocks[block.logical_succs[0]];
1704 unsigned idx = 0;
1705 for (; idx < succ.logical_preds.size(); idx++) {
1706 if (succ.logical_preds[idx] == block.index)
1707 break;
1708 }
1709 for (aco_ptr<Instruction>& phi : succ.instructions) {
1710 if (phi->opcode == aco_opcode::p_phi) {
1711 if (phi->operands[idx].isTemp() &&
1712 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1713 phi->operands[idx].isFirstKillBeforeDef()) {
1714 Temp phi_op = read_variable(ctx, phi->operands[idx].getTemp(), block.index);
1715 PhysReg reg = ctx.assignments[phi_op.id()].reg;
1716 assert(register_file[reg] == phi_op.id());
1717 register_file[reg] = 0;
1718 }
1719 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1720 break;
1721 }
1722 }
1723 instructions.emplace_back(std::move(instr));
1724 continue;
1725 }
1726
1727 std::vector<std::pair<Operand, Definition>> parallelcopy;
1728
1729 assert(!is_phi(instr));
1730
1731 /* handle operands */
1732 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1733 auto& operand = instr->operands[i];
1734 if (!operand.isTemp())
1735 continue;
1736
1737 /* rename operands */
1738 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
1739 assert(ctx.assignments[operand.tempId()].assigned);
1740
1741 PhysReg reg = ctx.assignments[operand.tempId()].reg;
1742 if (operand_can_use_reg(ctx, instr, i, reg))
1743 operand.setFixed(reg);
1744 else
1745 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
1746
1747 if (instr->format == Format::EXP ||
1748 (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
1749 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1750 for (unsigned j = 0; j < operand.size(); j++)
1751 ctx.war_hint.set(operand.physReg().reg() + j);
1752 }
1753
1754 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
1755 if (phi != ctx.phi_map.end())
1756 phi->second.uses.emplace(instr.get());
1757 }
1758
1759 /* remove dead vars from register file */
1760 for (const Operand& op : instr->operands) {
1761 if (op.isTemp() && op.isFirstKillBeforeDef())
1762 register_file.clear(op);
1763 }
1764
1765 /* try to optimize v_mad_f32 -> v_mac_f32 */
1766 if (instr->opcode == aco_opcode::v_mad_f32 &&
1767 instr->operands[2].isTemp() &&
1768 instr->operands[2].isKillBeforeDef() &&
1769 instr->operands[2].getTemp().type() == RegType::vgpr &&
1770 instr->operands[1].isTemp() &&
1771 instr->operands[1].getTemp().type() == RegType::vgpr &&
1772 !instr->usesModifiers()) {
1773 unsigned def_id = instr->definitions[0].tempId();
1774 auto it = ctx.affinities.find(def_id);
1775 if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
1776 instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
1777 register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
1778 instr->format = Format::VOP2;
1779 instr->opcode = aco_opcode::v_mac_f32;
1780 }
1781 }
1782
1783 /* handle definitions which must have the same register as an operand */
1784 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1785 instr->opcode == aco_opcode::v_mac_f32 ||
1786 instr->opcode == aco_opcode::v_writelane_b32 ||
1787 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1788 instr->definitions[0].setFixed(instr->operands[2].physReg());
1789 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1790 instr->opcode == aco_opcode::s_mulk_i32) {
1791 instr->definitions[0].setFixed(instr->operands[0].physReg());
1792 } else if (instr->format == Format::MUBUF &&
1793 instr->definitions.size() == 1 &&
1794 instr->operands.size() == 4) {
1795 instr->definitions[0].setFixed(instr->operands[3].physReg());
1796 } else if (instr->format == Format::MIMG &&
1797 instr->definitions.size() == 1 &&
1798 instr->operands[1].regClass().type() == RegType::vgpr) {
1799 instr->definitions[0].setFixed(instr->operands[1].physReg());
1800 }
1801
1802 ctx.defs_done.reset();
1803
1804 /* handle fixed definitions first */
1805 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
/* Pass over definitions that are pre-assigned (isFixed) to a specific
 * physical register.  If the target register currently holds another
 * variable, that variable is evicted through a parallelcopy.
 * NOTE(review): this is the interior of the per-instruction loop of the
 * main allocation routine; the loop/function header is above this chunk. */
1806 auto& definition = instr->definitions[i];
1807 if (!definition.isFixed())
1808 continue;
1809
1810 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1811 /* check if the target register is blocked */
1812 if (register_file[definition.physReg().reg()] != 0) {
/* register_file maps a register to the temp-id occupying it; a non-zero
 * entry means another live variable sits in the fixed target register. */
1813 /* create parallelcopy pair to move blocking var */
1814 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
1815 Operand pc_op = Operand(tmp);
1816 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
1817 RegClass rc = pc_op.regClass();
/* the copy's destination is a fresh temp of the same register class */
1818 tmp = Temp{program->allocateId(), rc};
1819 Definition pc_def = Definition(tmp);
1820
1821 /* re-enable the killed operands, so that we don't move the blocking var there */
1822 for (const Operand& op : instr->operands) {
1823 if (op.isTemp() && op.isFirstKillBeforeDef())
1824 register_file.fill(op);
1825 }
1826
1827 /* find a new register for the blocking variable */
1828 PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
1829 /* once again, disable killed operands */
1830 for (const Operand& op : instr->operands) {
1831 if (op.isTemp() && op.isFirstKillBeforeDef())
1832 register_file.clear(op);
1833 }
/* definitions already processed in this loop (defs_done) that stay live
 * must remain marked in the register file so we don't overlap them */
1834 for (unsigned k = 0; k < i; k++) {
1835 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1836 register_file.fill(instr->definitions[k]);
1837 }
1838 pc_def.setFixed(reg);
1839
1840 /* finish assignment of parallelcopy */
/* the fresh temp's id must equal its index in ctx.assignments — the
 * assert below checks this against the program's allocation counter */
1841 ctx.assignments.emplace_back(reg, pc_def.regClass());
1842 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
1843 parallelcopy.emplace_back(pc_op, pc_def);
1844
1845 /* add changes to reg_file */
1846 register_file.clear(pc_op);
1847 register_file.fill(pc_def);
1848 }
1849 ctx.defs_done.set(i);
1850
1851 if (!definition.isTemp())
1852 continue;
1853
1854 /* set live if it has a kill point */
1855 if (!definition.isKill())
1856 live.emplace(definition.getTemp());
1857
/* record the final register assignment and occupy the register file */
1858 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1859 register_file.fill(definition);
1860 }
1861
1862 /* handle all other definitions */
/* Second definition pass: everything not pre-fixed gets a register here.
 * Opcode-specific heuristics try to place the definition so that the
 * pseudo-instruction becomes a no-op (e.g. split/extract reusing the
 * source registers) before falling back to the generic allocator. */
1863 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1864 auto& definition = instr->definitions[i];
1865
1866 if (definition.isFixed() || !definition.isTemp())
1867 continue;
1868
1869 /* find free reg */
1870 if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
1871 definition.setFixed(definition.physReg());
1872 else if (instr->opcode == aco_opcode::p_split_vector) {
/* place the i-th split result right after the preceding results,
 * starting at the source vector's register (byte-accurate offset) */
1873 PhysReg reg = instr->operands[0].physReg();
1874 for (unsigned j = 0; j < i; j++)
1875 reg.reg_b += instr->definitions[j].bytes();
1876 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1877 definition.setFixed(reg);
1878 } else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) {
/* prefer the same register as the corresponding operand, if it is free
 * and the register types (sgpr/vgpr) match */
1879 PhysReg reg = instr->operands[i].physReg();
1880 if (instr->operands[i].isTemp() &&
1881 instr->operands[i].getTemp().type() == definition.getTemp().type() &&
1882 !register_file.test(reg, definition.bytes()))
1883 definition.setFixed(reg);
1884 } else if (instr->opcode == aco_opcode::p_extract_vector) {
/* if the source vector dies here, the extracted element can stay in
 * place: its register is source + index * element-size */
1885 PhysReg reg;
1886 if (instr->operands[0].isKillBeforeDef() &&
1887 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1888 reg = instr->operands[0].physReg();
1889 reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
1890 assert(!register_file.test(reg, definition.bytes()));
1891 definition.setFixed(reg);
1892 }
1893 } else if (instr->opcode == aco_opcode::p_create_vector) {
1894 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
1895 parallelcopy, instr);
1896 definition.setFixed(reg);
1897 }
1898
1899 if (!definition.isFixed()) {
1900 Temp tmp = definition.getTemp();
/* sub-dword temps are widened to a full dword (v1) when the instruction
 * cannot address sub-dword registers on this hardware */
1901 if (tmp.regClass().is_subdword() &&
1902 !instr_can_access_subdword(ctx, instr)) {
1903 assert(tmp.bytes() <= 4);
1904 tmp = Temp(definition.tempId(), v1);
1905 }
1906 definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
1907 }
1908
/* sanity check: vgprs live in the physical range >= 256, sgprs below it */
1909 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1910 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1911 ctx.defs_done.set(i);
1912
1913 /* set live if it has a kill point */
1914 if (!definition.isKill())
1915 live.emplace(definition.getTemp());
1916
1917 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1918 register_file.fill(definition);
1919 }
1920
1921 handle_pseudo(ctx, register_file, instr.get());
1922
1923 /* kill definitions and late-kill operands */
/* free registers of values that die at this instruction */
1924 for (const Definition& def : instr->definitions) {
1925 if (def.isTemp() && def.isKill())
1926 register_file.clear(def);
1927 }
1928 for (const Operand& op : instr->operands) {
1929 if (op.isTemp() && op.isFirstKill() && op.isLateKill())
1930 register_file.clear(op);
1931 }
1932
1933 /* emit parallelcopy */
/* All copies collected while placing operands/definitions are emitted as
 * one p_parallelcopy inserted before the current instruction. */
1934 if (!parallelcopy.empty()) {
1935 aco_ptr<Pseudo_instruction> pc;
1936 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
/* if scc currently holds a live value and any sgpr copy's destination
 * overlaps another sgpr copy's source, lowering the parallelcopy will
 * need a scratch to preserve scc — track that with two bitmasks over
 * the sgpr file (4 x 64 bits). */
1937 bool temp_in_scc = register_file[scc.reg()];
1938 bool sgpr_operands_alias_defs = false;
1939 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1940 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1941 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1942 if (!sgpr_operands_alias_defs) {
1943 unsigned reg = parallelcopy[i].first.physReg().reg();
1944 unsigned size = parallelcopy[i].first.getTemp().size();
1945 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1946
1947 reg = parallelcopy[i].second.physReg().reg();
1948 size = parallelcopy[i].second.getTemp().size();
1949 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1950 sgpr_operands_alias_defs = true;
1951 }
1952 }
1953
1954 pc->operands[i] = parallelcopy[i].first;
1955 pc->definitions[i] = parallelcopy[i].second;
1956 assert(pc->operands[i].size() == pc->definitions[i].size());
1957
1958 /* it might happen that the operand is already renamed. we have to restore the original name. */
1959 std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1960 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1961 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1962 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1963
/* if the copied temp feeds a phi, register this copy as a phi use */
1964 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(pc->operands[i].tempId());
1965 if (phi != ctx.phi_map.end())
1966 phi->second.uses.emplace(pc.get());
1967 }
1968
1969 if (temp_in_scc && sgpr_operands_alias_defs) {
1970 /* disable definitions and re-enable operands */
/* temporarily restore the register file to the pre-instruction state so
 * handle_pseudo() can pick a valid scratch sgpr for the copy lowering */
1971 for (const Definition& def : instr->definitions) {
1972 if (def.isTemp() && !def.isKill())
1973 register_file.clear(def);
1974 }
1975 for (const Operand& op : instr->operands) {
1976 if (op.isTemp() && op.isFirstKill())
1977 register_file.block(op.physReg(), op.regClass());
1978 }
1979
1980 handle_pseudo(ctx, register_file, pc.get());
1981
1982 /* re-enable live vars */
1983 for (const Operand& op : instr->operands) {
1984 if (op.isTemp() && op.isFirstKill())
1985 register_file.clear(op);
1986 }
1987 for (const Definition& def : instr->definitions) {
1988 if (def.isTemp() && !def.isKill())
1989 register_file.fill(def);
1990 }
1991 } else {
/* no overlap (or scc dead): the lowered copies won't clobber scc */
1992 pc->tmp_in_scc = false;
1993 }
1994
1995 instructions.emplace_back(std::move(pc));
1996 }
1997
1998 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
/* VOPC compares and the carry-in/carry-out VOP2 adds/subs implicitly use
 * vcc in their compact encoding; if allocation put the carry/compare
 * result (or carry-in operand) in an arbitrary sgpr pair instead, the
 * instruction must be re-encoded as VOP3. */
1999 bool instr_needs_vop3 = !instr->isVOP3() &&
2000 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
2001 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
2002 ((instr->opcode == aco_opcode::v_add_co_u32 ||
2003 instr->opcode == aco_opcode::v_addc_co_u32 ||
2004 instr->opcode == aco_opcode::v_sub_co_u32 ||
2005 instr->opcode == aco_opcode::v_subb_co_u32 ||
2006 instr->opcode == aco_opcode::v_subrev_co_u32 ||
2007 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2008 !(instr->definitions[1].physReg() == vcc)) ||
2009 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
2010 instr->opcode == aco_opcode::v_subb_co_u32 ||
2011 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
2012 !(instr->operands[2].physReg() == vcc)));
2013 if (instr_needs_vop3) {
2014
2015 /* if the first operand is a literal, we have to move it to a reg */
/* pre-GFX10, VOP3 encodings cannot take a literal constant, so the
 * literal is materialized with s_mov_b32/v_mov_b32 first */
2016 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
2017 bool can_sgpr = true;
2018 /* check, if we have to move to vgpr */
/* an sgpr operand already present means the literal must go to a vgpr
 * (NOTE(review): presumably to respect the constant-bus/sgpr-operand
 * limit — confirm against the ISA docs) */
2019 for (const Operand& op : instr->operands) {
2020 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
2021 can_sgpr = false;
2022 break;
2023 }
2024 }
2025 /* disable definitions and re-enable operands */
2026 for (const Definition& def : instr->definitions)
2027 register_file.clear(def);
2028 for (const Operand& op : instr->operands) {
2029 if (op.isTemp() && op.isFirstKill())
2030 register_file.block(op.physReg(), op.regClass());
2031 }
2032 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
2033 ctx.assignments.emplace_back();
2034 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
2035
2036 aco_ptr<Instruction> mov;
2037 if (can_sgpr)
2038 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
2039 else
2040 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
2041 mov->operands[0] = instr->operands[0];
2042 mov->definitions[0] = Definition(tmp);
2043 mov->definitions[0].setFixed(reg);
2044
/* the instruction now reads the moved literal from the new register */
2045 instr->operands[0] = Operand(tmp);
2046 instr->operands[0].setFixed(reg);
2047 instructions.emplace_back(std::move(mov));
2048 /* re-enable live vars */
2049 for (const Operand& op : instr->operands) {
2050 if (op.isTemp() && op.isFirstKill())
2051 register_file.clear(op);
2052 }
2053 for (const Definition& def : instr->definitions) {
2054 if (def.isTemp() && !def.isKill())
2055 register_file.fill(def);
2056 }
2057 }
2058
2059 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
/* rebuild the instruction as VOP3A, copying operands/definitions over and
 * repointing any phi uses from the old instruction object to the new one */
2060 aco_ptr<Instruction> tmp = std::move(instr);
2061 Format format = asVOP3(tmp->format);
2062 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
2063 for (unsigned i = 0; i < instr->operands.size(); i++) {
2064 Operand& operand = tmp->operands[i];
2065 instr->operands[i] = operand;
2066 /* keep phi_map up to date */
2067 if (operand.isTemp()) {
2068 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.tempId());
2069 if (phi != ctx.phi_map.end()) {
2070 phi->second.uses.erase(tmp.get());
2071 phi->second.uses.emplace(instr.get());
2072 }
2073 }
2074 }
2075 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
2076 }
/* append the (possibly re-encoded) instruction to the block's new list */
2077 instructions.emplace_back(std::move(*it));
2078
2079 } /* end for Instr */
2080
2081 block.instructions = std::move(instructions);
2082
/* SSA-construction bookkeeping: mark this block filled, then seal every
 * linear successor whose predecessors are now all filled.  Sealing a
 * block lets its incomplete phis be finished and possibly removed. */
2083 ctx.filled[block.index] = true;
2084 for (unsigned succ_idx : block.linear_succs) {
2085 Block& succ = program->blocks[succ_idx];
2086 /* seal block if all predecessors are filled */
2087 bool all_filled = true;
2088 for (unsigned pred_idx : succ.linear_preds) {
2089 if (!ctx.filled[pred_idx]) {
2090 all_filled = false;
2091 break;
2092 }
2093 }
2094 if (all_filled) {
2095 ctx.sealed[succ_idx] = true;
2096
2097 /* finish incomplete phis and check if they became trivial */
/* resolve each phi operand to its per-predecessor renamed temp and its
 * assigned register; linear temps use linear preds, others logical */
2098 for (Instruction* phi : ctx.incomplete_phis[succ_idx]) {
2099 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
2100 for (unsigned i = 0; i < phi->operands.size(); i++) {
2101 phi->operands[i].setTemp(read_variable(ctx, phi->operands[i].getTemp(), preds[i]));
2102 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].reg);
2103 }
2104 try_remove_trivial_phi(ctx, phi->definitions[0].getTemp());
2105 }
2106 /* complete the original phi nodes, but no need to check triviality */
2107 for (aco_ptr<Instruction>& instr : succ.instructions) {
2108 if (!is_phi(instr))
2109 break;
2110 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
2111
2112 for (unsigned i = 0; i < instr->operands.size(); i++) {
2113 auto& operand = instr->operands[i];
2114 if (!operand.isTemp())
2115 continue;
2116 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2117 operand.setFixed(ctx.assignments[operand.tempId()].reg);
/* register this phi as a user of the operand's temp */
2118 std::unordered_map<unsigned, phi_info>::iterator phi = ctx.phi_map.find(operand.getTemp().id());
2119 if (phi != ctx.phi_map.end())
2120 phi->second.uses.emplace(instr.get());
2121 }
2122 }
2123 }
2124 }
2125 } /* end for BB */
2126
2127 /* remove trivial phis */
/* Phis proven trivial earlier had their definitions cleared; erase those
 * empty-definition phis from the head of each block's instruction list. */
2128 for (Block& block : program->blocks) {
2129 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
2130 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
2131 auto middle = std::remove_if(block.instructions.begin(), end,
2132 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
2133 block.instructions.erase(middle, end);
2134 }
2135
2136 /* find scc spill registers which may be needed for parallelcopies created by phis */
2137 for (Block& block : program->blocks) {
2138 if (block.linear_preds.size() <= 1)
2139 continue;
2140
/* NOTE(review): bit 127 appears to flag "scc live-in" for the block —
 * confirm against where sgpr_live_in is populated (outside this chunk) */
2141 std::bitset<128> regs = sgpr_live_in[block.index];
2142 if (!regs[127])
2143 continue;
2144
2145 /* choose a register */
/* pick the lowest sgpr not live-in; the assert guarantees one exists
 * within the program's sgpr demand */
2146 int16_t reg = 0;
2147 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
2148 ;
2149 assert(reg < ctx.program->max_reg_demand.sgpr);
2150 adjust_max_used_regs(ctx, s1, reg);
2151
2152 /* update predecessors */
/* NOTE(review): pred_index is only read — `unsigned` (by value) would be
 * the correct loop variable type here instead of `unsigned&` */
2153 for (unsigned& pred_index : block.linear_preds) {
2154 Block& pred = program->blocks[pred_index];
2155 pred.scc_live_out = true;
2156 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
2157 }
2158 }
2159
2160 /* num_gpr = rnd_up(max_used_gpr + 1) */
/* translate the highest used register indices into the allocation
 * granularity the hardware config expects */
2161 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
2162 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
2163 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
2164 else
2165 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
2166 }
2167
2168 }