/* [mesa.git] src/amd/compiler/aco_register_allocation.cpp */
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33 #include <functional>
34
35 #include "aco_ir.h"
36 #include "sid.h"
37 #include "util/u_math.h"
38
39 namespace aco {
40 namespace {
41
42 struct ra_ctx {
43 std::bitset<512> war_hint;
44 Program* program;
45 std::unordered_map<unsigned, std::pair<PhysReg, RegClass>> assignments;
46 std::map<unsigned, Temp> orig_names;
47 unsigned max_used_sgpr = 0;
48 unsigned max_used_vgpr = 0;
49 std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
50
51 ra_ctx(Program* program) : program(program) {}
52 };
53
54
55 /* helper function for debugging */
56 #if 0
57 void print_regs(ra_ctx& ctx, bool vgprs, std::array<uint32_t, 512>& reg_file)
58 {
59 unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
60 unsigned lb = vgprs ? 256 : 0;
61 unsigned ub = lb + max;
62 char reg_char = vgprs ? 'v' : 's';
63
64 /* print markers */
65 printf(" ");
66 for (unsigned i = lb; i < ub; i += 3) {
67 printf("%.2u ", i - lb);
68 }
69 printf("\n");
70
71 /* print usage */
72 printf("%cgprs: ", reg_char);
73 unsigned free_regs = 0;
74 unsigned prev = 0;
75 bool char_select = false;
76 for (unsigned i = lb; i < ub; i++) {
77 if (reg_file[i] == 0xFFFF) {
78 printf("~");
79 } else if (reg_file[i]) {
80 if (reg_file[i] != prev) {
81 prev = reg_file[i];
82 char_select = !char_select;
83 }
84 printf(char_select ? "#" : "@");
85 } else {
86 free_regs++;
87 printf(".");
88 }
89 }
90 printf("\n");
91
92 printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);
93
94 /* print assignments */
95 prev = 0;
96 unsigned size = 0;
97 for (unsigned i = lb; i < ub; i++) {
98 if (reg_file[i] != prev) {
99 if (prev && size > 1)
100 printf("-%d]\n", i - 1 - lb);
101 else if (prev)
102 printf("]\n");
103 prev = reg_file[i];
104 if (prev && prev != 0xFFFF) {
105 if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
106 printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
107 else
108 printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
109 }
110 size = 1;
111 } else {
112 size++;
113 }
114 }
115 if (prev && size > 1)
116 printf("-%d]\n", ub - lb - 1);
117 else if (prev)
118 printf("]\n");
119 }
120 #endif
121
122
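/* bump the tracked maximum SGPR/VGPR index to cover an allocation of class rc
 * starting at physical register reg; SGPR allocations beyond the addressable
 * SGPR limit are not counted */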
123 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
124 {
125 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
126 unsigned size = rc.size();
127 if (rc.type() == RegType::vgpr) {
128 assert(reg >= 256);
129 unsigned hi = reg - 256 + size - 1;
130 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
131 } else if (reg + rc.size() <= max_addressible_sgpr) {
132 unsigned hi = reg + size - 1;
133 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
134 }
135 }
136
137
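/* assign new temp ids to the parallelcopy definitions created so far, record
 * their register assignments in reg_file and rename the affected operands of
 * instr to the copied temporaries */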
138 void update_renames(ra_ctx& ctx, std::array<uint32_t, 512>& reg_file,
139 std::vector<std::pair<Operand, Definition>>& parallelcopies,
140 aco_ptr<Instruction>& instr)
141 {
142 /* allocate ids and rename operands: this is done transparently here */
143 for (std::pair<Operand, Definition>& copy : parallelcopies) {
144 /* definitions that already have an id are not from this function and have already been handled */
145 if (copy.second.isTemp())
146 continue;
147
148 /* check if we moved another parallelcopy definition */
149 for (std::pair<Operand, Definition>& other : parallelcopies) {
150 if (!other.second.isTemp())
151 continue;
152 if (copy.first.getTemp() == other.second.getTemp()) {
153 copy.first.setTemp(other.first.getTemp());
154 copy.first.setFixed(other.first.physReg());
155 }
156 }
157 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
158 copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
159 ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()};
160 for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++)
161 reg_file[i] = copy.second.tempId();
162 /* check if we moved an operand */
163 for (Operand& op : instr->operands) {
164 if (!op.isTemp())
165 continue;
166 if (op.tempId() == copy.first.tempId()) {
167 bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKill();
168 for (std::pair<Operand, Definition>& pc : parallelcopies) {
169 PhysReg def_reg = pc.second.physReg();
170 omit_renaming &= def_reg > copy.first.physReg() ?
171 (copy.first.physReg() + copy.first.size() <= def_reg.reg) :
172 (def_reg + pc.second.size() <= copy.first.physReg().reg);
173 }
174 if (omit_renaming)
175 continue;
176 op.setTemp(copy.second.getTemp());
177 op.setFixed(copy.second.physReg());
178 }
179 }
180 }
181 }
182
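/* search [lb, ub) for a free, stride-aligned slot of 'size' registers without
 * displacing any live value; with stride == 1 a best-fit scan picks the
 * smallest sufficient gap, otherwise the first fitting slot is taken */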
183 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
184 std::array<uint32_t, 512>& reg_file,
185 uint32_t lb, uint32_t ub,
186 uint32_t size, uint32_t stride,
187 RegClass rc)
188 {
189 /* best-fit algorithm: find the smallest gap into which the variable fits */
190 if (stride == 1) {
191 unsigned best_pos = 0xFFFF;
192 unsigned gap_size = 0xFFFF;
193 unsigned next_pos = 0xFFFF;
194
195 for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
196 if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
197 if (next_pos == 0xFFFF)
198 continue;
199
200 /* check if the variable fits */
201 if (next_pos + size > current_reg) {
202 next_pos = 0xFFFF;
203 continue;
204 }
205
206 /* check if the tested gap is smaller */
207 if (current_reg - next_pos < gap_size) {
208 best_pos = next_pos;
209 gap_size = current_reg - next_pos;
210 }
211 next_pos = 0xFFFF;
212 continue;
213 }
214
215 if (next_pos == 0xFFFF)
216 next_pos = current_reg;
217 }
218
219 /* final check */
220 if (next_pos != 0xFFFF &&
221 next_pos + size <= ub &&
222 ub - next_pos < gap_size) {
223 best_pos = next_pos;
224 gap_size = ub - next_pos;
225 }
226 if (best_pos != 0xFFFF) {
227 adjust_max_used_regs(ctx, rc, best_pos);
228 return {PhysReg{best_pos}, true};
229 }
230 return {{}, false};
231 }
232
233 bool found = false;
234 unsigned reg_lo = lb;
235 unsigned reg_hi = lb + size - 1;
236 while (!found && reg_lo + size <= ub) {
237 if (reg_file[reg_lo] != 0) {
238 reg_lo += stride;
239 continue;
240 }
241 reg_hi = reg_lo + size - 1;
242 found = true;
243 for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
244 if (reg_file[reg] != 0 || ctx.war_hint[reg])
245 found = false;
246 }
247 if (found) {
248 adjust_max_used_regs(ctx, rc, reg_lo);
249 return {PhysReg{reg_lo}, true};
250 }
251
252 reg_lo += stride;
253 }
254
255 return {{}, false};
256 }
257
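/* find new homes for the variables in 'vars' which are being displaced from
 * the definition's target area [def_reg_lo, def_reg_hi]: emits parallelcopies
 * and may recursively displace further variables; returns false on failure */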
258 bool get_regs_for_copies(ra_ctx& ctx,
259 std::array<uint32_t, 512>& reg_file,
260 std::vector<std::pair<Operand, Definition>>& parallelcopies,
261 std::set<std::pair<unsigned, unsigned>> vars,
262 uint32_t lb, uint32_t ub,
263 aco_ptr<Instruction>& instr,
264 uint32_t def_reg_lo,
265 uint32_t def_reg_hi)
266 {
267
268 /* variables are sorted from small to large */
269 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
270 for (std::set<std::pair<unsigned, unsigned>>::reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
271 unsigned id = it->second;
272 std::pair<PhysReg, RegClass> var = ctx.assignments[id];
273 uint32_t size = it->first;
274 uint32_t stride = 1;
275 if (var.second.type() == RegType::sgpr) {
276 if (size == 2)
277 stride = 2;
278 if (size > 3)
279 stride = 4;
280 }
281
282 /* if this is a dead operand, we can re-use the space from the definition */
283 bool is_dead_operand = false;
284 for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && i < instr->operands.size(); i++) {
285 if (instr->operands[i].isTemp() && instr->operands[i].isKill() && instr->operands[i].tempId() == id)
286 is_dead_operand = true;
287 }
288
289 std::pair<PhysReg, bool> res;
290 if (is_dead_operand) {
291 if (instr->opcode == aco_opcode::p_create_vector) {
292 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
293 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
294 for (unsigned j = 0; j < size; j++)
295 assert(reg_file[def_reg_lo + offset + j] == 0);
296 res = {PhysReg{def_reg_lo + offset}, true};
297 break;
298 }
299 }
300 } else {
301 res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second);
302 }
303 } else {
304 res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second);
305 if (!res.second) {
306 unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
307 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second);
308 }
309 }
310
311 if (res.second) {
312 /* mark the area as blocked */
313 for (unsigned i = res.first.reg; i < res.first + size; i++)
314 reg_file[i] = 0xFFFFFFFF;
315 /* create parallelcopy pair (without definition id) */
316 Temp tmp = Temp(id, var.second);
317 Operand pc_op = Operand(tmp);
318 pc_op.setFixed(var.first);
319 Definition pc_def = Definition(res.first, pc_op.regClass());
320 parallelcopies.emplace_back(pc_op, pc_def);
321 continue;
322 }
323
324 unsigned best_pos = lb;
325 unsigned num_moves = 0xFF;
326 unsigned num_vars = 0;
327
328 /* we use a sliding window to find potential positions */
329 unsigned reg_lo = lb;
330 unsigned reg_hi = lb + size - 1;
331 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
332 if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
333 (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
334 continue;
335
336 /* second, check that we have at most k=num_moves elements in the window
337 * and no element is larger than the currently processed one */
338 unsigned k = 0;
339 unsigned n = 0;
340 unsigned last_var = 0;
341 bool found = true;
342 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
343 if (reg_file[j] == 0 || reg_file[j] == last_var)
344 continue;
345
346 /* 0xFFFFFFFF signals that this area is already blocked! */
347 if (reg_file[j] == 0xFFFFFFFF || k > num_moves) {
348 found = false;
349 break;
350 }
351 /* we cannot split live ranges of linear vgprs */
352 if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
353 found = false;
354 break;
355 }
356 bool is_kill = false;
357 for (const Operand& op : instr->operands) {
358 if (op.isTemp() && op.isKill() && op.tempId() == reg_file[j]) {
359 is_kill = true;
360 break;
361 }
362 }
363 if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) {
364 found = false;
365 break;
366 }
367
368 k += ctx.assignments[reg_file[j]].second.size();
369 last_var = reg_file[j];
370 n++;
371 if (k > num_moves || (k == num_moves && n <= num_vars)) {
372 found = false;
373 break;
374 }
375 }
376
377 if (found) {
378 best_pos = reg_lo;
379 num_moves = k;
380 num_vars = n;
381 }
382 }
383
384 /* FIXME: we messed up and couldn't find space for the variables to be copied */
385 if (num_moves == 0xFF)
386 return false;
387
388 reg_lo = best_pos;
389 reg_hi = best_pos + size - 1;
390
391 /* collect variables and block reg file */
392 std::set<std::pair<unsigned, unsigned>> new_vars;
393 for (unsigned j = reg_lo; j <= reg_hi; j++) {
394 if (reg_file[j] != 0) {
395 unsigned size = ctx.assignments[reg_file[j]].second.size();
396 unsigned id = reg_file[j];
397 new_vars.emplace(size, id);
398 for (unsigned k = 0; k < size; k++)
399 reg_file[ctx.assignments[id].first + k] = 0;
400 }
401 }
402
403 /* mark the area as blocked */
404 for (unsigned i = reg_lo; i <= reg_hi; i++)
405 reg_file[i] = 0xFFFFFFFF;
406
407 if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
408 return false;
409
410 adjust_max_used_regs(ctx, var.second, reg_lo);
411
412 /* create parallelcopy pair (without definition id) */
413 Temp tmp = Temp(id, var.second);
414 Operand pc_op = Operand(tmp);
415 pc_op.setFixed(var.first);
416 Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
417 parallelcopies.emplace_back(pc_op, pc_def);
418 }
419
420 return true;
421 }
422
423
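/* allocate 'size' registers in [lb, ub) when no free slot exists: pick the
 * stride-aligned window needing the fewest moves, displace the variables
 * living there via get_regs_for_copies() and report the chosen position */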
424 std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
425 std::array<uint32_t, 512>& reg_file,
426 std::vector<std::pair<Operand, Definition>>& parallelcopies,
427 uint32_t lb, uint32_t ub,
428 uint32_t size, uint32_t stride,
429 RegClass rc,
430 aco_ptr<Instruction>& instr)
431 {
432 unsigned regs_free = 0;
433 /* check how many free regs we have */
434 for (unsigned j = lb; j < ub; j++) {
435 if (reg_file[j] == 0)
436 regs_free++;
437 }
438
439 /* mark and count killed operands */
440 unsigned killed_ops = 0;
441 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
442 if (instr->operands[j].isTemp() &&
443 instr->operands[j].isFirstKill() &&
444 instr->operands[j].physReg() >= lb &&
445 instr->operands[j].physReg() < ub) {
446 assert(instr->operands[j].isFixed());
447 assert(reg_file[instr->operands[j].physReg().reg] == 0);
448 for (unsigned k = 0; k < instr->operands[j].size(); k++)
449 reg_file[instr->operands[j].physReg() + k] = 0xFFFFFFFF;
450 killed_ops += instr->operands[j].getTemp().size();
451 }
452 }
453
454 assert(regs_free >= size);
455 /* we might have to move dead operands to dst in order to make space */
456 unsigned op_moves = 0;
457
458 if (size > (regs_free - killed_ops))
459 op_moves = size - (regs_free - killed_ops);
460
461 /* find the best position to place the definition */
462 unsigned best_pos = lb;
463 unsigned num_moves = 0xFF;
464 unsigned num_vars = 0;
465
466 /* we use a sliding window to check potential positions */
467 unsigned reg_lo = lb;
468 unsigned reg_hi = lb + size - 1;
469 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
470 /* first check the edges: this is what we have to fix to allow for num_moves > size */
471 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
472 continue;
473 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
474 continue;
475
476 /* second, check that we have at most k=num_moves elements in the window
477 * and no element is larger than the currently processed one */
478 unsigned k = op_moves;
479 unsigned n = 0;
480 unsigned remaining_op_moves = op_moves;
481 unsigned last_var = 0;
482 bool found = true;
483 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
484 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
485 if (reg_file[j] == 0 || reg_file[j] == last_var)
486 continue;
487
488 /* dead operands effectively reduce the number of estimated moves */
489 if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) {
490 k--;
491 remaining_op_moves--;
492 continue;
493 }
494
495 if (ctx.assignments[reg_file[j]].second.size() >= size) {
496 found = false;
497 break;
498 }
499
500
501 /* we cannot split live ranges of linear vgprs */
502 if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
503 found = false;
504 break;
505 }
506
507 k += ctx.assignments[reg_file[j]].second.size();
508 n++;
509 last_var = reg_file[j];
510 }
511
512 if (!found || k > num_moves)
513 continue;
514 if (k == num_moves && n < num_vars)
515 continue;
516 if (!aligned && k == num_moves && n == num_vars)
517 continue;
518
519 if (found) {
520 best_pos = reg_lo;
521 num_moves = k;
522 num_vars = n;
523 }
524 }
525
526 if (num_moves == 0xFF) {
527 /* remove killed operands from reg_file once again */
528 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
529 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill()) {
530 for (unsigned k = 0; k < instr->operands[i].getTemp().size(); k++)
531 reg_file[instr->operands[i].physReg() + k] = 0;
532 }
533 }
534 for (unsigned i = 0; i < instr->definitions.size(); i++) {
535 Definition def = instr->definitions[i];
536 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
537 for (unsigned k = 0; k < def.getTemp().size(); k++)
538 reg_file[def.physReg() + k] = def.tempId();
539 }
540 }
541 return {{}, false};
542 }
543
544 std::array<uint32_t, 512> register_file = reg_file;
545
546 /* now we have figured out the placement for our definition */
547 std::set<std::pair<unsigned, unsigned>> vars;
548 for (unsigned j = best_pos; j < best_pos + size; j++) {
549 if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0)
550 vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]);
551 reg_file[j] = 0;
552 }
553
554 if (instr->opcode == aco_opcode::p_create_vector) {
555 /* move killed operands which aren't yet at the correct position */
556 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
557 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
558 instr->operands[i].getTemp().type() == rc.type()) {
559
560 if (instr->operands[i].physReg() != best_pos + offset) {
561 vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
562 for (unsigned j = 0; j < instr->operands[i].size(); j++)
563 reg_file[instr->operands[i].physReg() + j] = 0;
564 } else {
565 for (unsigned j = 0; j < instr->operands[i].size(); j++)
566 reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId();
567 }
568 }
569 }
570 } else {
571 /* re-enable the killed operands */
572 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
573 if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill()) {
574 for (unsigned k = 0; k < instr->operands[j].getTemp().size(); k++)
575 reg_file[instr->operands[j].physReg() + k] = instr->operands[j].tempId();
576 }
577 }
578 }
579
580 std::vector<std::pair<Operand, Definition>> pc;
581 if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
582 reg_file = std::move(register_file);
583 /* remove killed operands from reg_file once again */
584 if (!is_phi(instr)) {
585 for (const Operand& op : instr->operands) {
586 if (op.isTemp() && op.isFirstKill()) {
587 for (unsigned k = 0; k < op.getTemp().size(); k++)
588 reg_file[op.physReg() + k] = 0;
589 }
590 }
591 }
592 for (unsigned i = 0; i < instr->definitions.size(); i++) {
593 Definition& def = instr->definitions[i];
594 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
595 for (unsigned k = 0; k < def.getTemp().size(); k++)
596 reg_file[def.physReg() + k] = def.tempId();
597 }
598 }
599 return {{}, false};
600 }
601
602 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
603
604 /* we set the definition regs to 0: the actual caller is responsible for setting them correctly */
605 for (unsigned i = 0; i < size; i++)
606 reg_file[best_pos + i] = 0;
607
608 update_renames(ctx, reg_file, parallelcopies, instr);
609
610 /* remove killed operands from reg_file once again */
611 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
612 if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
613 continue;
614 assert(!instr->operands[i].isUndefined());
615 if (instr->operands[i].isFirstKill()) {
616 for (unsigned j = 0; j < instr->operands[i].getTemp().size(); j++)
617 reg_file[instr->operands[i].physReg() + j] = 0;
618 }
619 }
620 for (unsigned i = 0; i < instr->definitions.size(); i++) {
621 Definition def = instr->definitions[i];
622 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
623 for (unsigned k = 0; k < def.getTemp().size(); k++)
624 reg_file[def.physReg() + k] = def.tempId();
625 }
626 }
627
628 adjust_max_used_regs(ctx, rc, best_pos);
629 return {PhysReg{best_pos}, true};
630 }
631
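/* main register allocation entry point for a definition of class rc: first try
 * to find a free slot, then allow live-range splitting, and finally increase
 * the program's register demand if the hardware limits permit it */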
632 PhysReg get_reg(ra_ctx& ctx,
633 std::array<uint32_t, 512>& reg_file,
634 RegClass rc,
635 std::vector<std::pair<Operand, Definition>>& parallelcopies,
636 aco_ptr<Instruction>& instr)
637 {
638 uint32_t size = rc.size();
639 uint32_t stride = 1;
640 uint32_t lb, ub;
641 if (rc.type() == RegType::vgpr) {
642 lb = 256;
643 ub = 256 + ctx.program->max_reg_demand.vgpr;
644 } else {
645 lb = 0;
646 ub = ctx.program->max_reg_demand.sgpr;
647 if (size == 2)
648 stride = 2;
649 else if (size >= 4)
650 stride = 4;
651 }
652
653 std::pair<PhysReg, bool> res = {{}, false};
654 /* try to find space without live-range splits */
655 if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
656 res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
657 if (!res.second)
658 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
659 if (res.second)
660 return res.first;
661
662 /* try to find space with live-range splits */
663 res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
664
665 if (res.second)
666 return res.first;
667
668 unsigned regs_free = 0;
669 for (unsigned i = lb; i < ub; i++) {
670 if (!reg_file[i])
671 regs_free++;
672 }
673
674 /* We should only fail here because keeping under the limit would require
675 * too many moves. */
676 assert(regs_free >= size);
677
678 /* try using more registers */
679 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
680 uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
681 if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
682 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
683 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
684 } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
685 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
686 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
687 }
688
689 //FIXME: if nothing helps, shift-rotate the registers to make space
690
691 unreachable("did not find a register");
692 }
693
694
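/* lightweight variant of get_reg(): only looks for an already free slot and
 * reports failure instead of inserting parallelcopies */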
695 std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
696 std::array<uint32_t, 512>& reg_file,
697 RegClass rc)
698 {
699 uint32_t size = rc.size();
700 uint32_t stride = 1;
701 uint32_t lb, ub;
702 if (rc.type() == RegType::vgpr) {
703 lb = 256;
704 ub = 256 + ctx.program->max_reg_demand.vgpr;
705 } else {
706 lb = 0;
707 ub = ctx.program->max_reg_demand.sgpr;
708 if (size == 2)
709 stride = 2;
710 else if (size >= 4)
711 stride = 4;
712 }
713 return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
714 }
715
716
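/* choose the destination of a p_create_vector such that as many killed
 * operands as possible are already in place; falls back to get_reg() when no
 * candidate position would save enough moves */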
717 PhysReg get_reg_create_vector(ra_ctx& ctx,
718 std::array<uint32_t, 512>& reg_file,
719 RegClass rc,
720 std::vector<std::pair<Operand, Definition>>& parallelcopies,
721 aco_ptr<Instruction>& instr)
722 {
723 /* create_vector instructions have different costs w.r.t. register coalescing */
724 uint32_t size = rc.size();
725 uint32_t stride = 1;
726 uint32_t lb, ub;
727 if (rc.type() == RegType::vgpr) {
728 lb = 256;
729 ub = 256 + ctx.program->max_reg_demand.vgpr;
730 } else {
731 lb = 0;
732 ub = ctx.program->max_reg_demand.sgpr;
733 if (size == 2)
734 stride = 2;
735 else if (size >= 4)
736 stride = 4;
737 }
738
739 unsigned best_pos = -1;
740 unsigned num_moves = 0xFF;
741 bool best_war_hint = true;
742
743 /* for each operand, test which definition placement causes the fewest shuffle instructions */
744 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
745 // TODO: think about whether we can alias live operands on the same register
746 if (!instr->operands[i].isTemp() || !instr->operands[i].isKill() || instr->operands[i].getTemp().type() != rc.type())
747 continue;
748
749 if (offset > instr->operands[i].physReg())
750 continue;
751
752 unsigned reg_lo = instr->operands[i].physReg() - offset;
753 unsigned reg_hi = reg_lo + size - 1;
754 unsigned k = 0;
755
756 /* no need to check multiple times */
757 if (reg_lo == best_pos)
758 continue;
759
760 /* check borders */
761 // TODO: this can be improved
762 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
763 continue;
764 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
765 continue;
766 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
767 continue;
768
769 /* count variables to be moved and check war_hint */
770 bool war_hint = false;
771 bool linear_vgpr = false;
772 for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
773 if (reg_file[j] != 0) {
774 k++;
775 /* we cannot split live ranges of linear vgprs */
776 if (ctx.assignments[reg_file[j]].second & (1 << 6))
777 linear_vgpr = true;
778 }
779 war_hint |= ctx.war_hint[j];
780 }
781 if (linear_vgpr || (war_hint && !best_war_hint))
782 continue;
783
784 /* count operands in wrong positions */
785 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
786 if (j == i ||
787 !instr->operands[j].isTemp() ||
788 instr->operands[j].getTemp().type() != rc.type())
789 continue;
790 if (instr->operands[j].physReg() != reg_lo + offset)
791 k += instr->operands[j].size();
792 }
793 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
794 if (k > num_moves || (!aligned && k == num_moves))
795 continue;
796
797 best_pos = reg_lo;
798 num_moves = k;
799 best_war_hint = war_hint;
800 }
801
802 if (num_moves >= size)
803 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
804
805 /* collect variables to be moved */
806 std::set<std::pair<unsigned, unsigned>> vars;
807 for (unsigned i = best_pos; i < best_pos + size; i++) {
808 if (reg_file[i] != 0)
809 vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]);
810 reg_file[i] = 0;
811 }
812
813 /* move killed operands which aren't yet at the correct position */
814 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
815 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) {
816 if (instr->operands[i].physReg() != best_pos + offset) {
817 vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
818 } else {
819 for (unsigned j = 0; j < instr->operands[i].size(); j++)
820 reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId();
821 }
822 }
823 }
824
825 ASSERTED bool success = false;
826 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
827 assert(success);
828
829 update_renames(ctx, reg_file, parallelcopies, instr);
830 adjust_max_used_regs(ctx, rc, best_pos);
831 return PhysReg{best_pos};
832 }
833
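/* check whether a definition of class rc can be placed at the requested
 * register: verifies alignment, bounds and that the area is currently free,
 * without creating any copies */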
834 bool get_reg_specified(ra_ctx& ctx,
835 std::array<uint32_t, 512>& reg_file,
836 RegClass rc,
837 std::vector<std::pair<Operand, Definition>>& parallelcopies,
838 aco_ptr<Instruction>& instr,
839 PhysReg reg)
840 {
841 uint32_t size = rc.size();
842 uint32_t stride = 1;
843 uint32_t lb, ub;
844
845 if (rc.type() == RegType::vgpr) {
846 lb = 256;
847 ub = 256 + ctx.program->max_reg_demand.vgpr;
848 } else {
849 if (size == 2)
850 stride = 2;
851 else if (size >= 4)
852 stride = 4;
853 if (reg % stride != 0)
854 return false;
855 lb = 0;
856 ub = ctx.program->max_reg_demand.sgpr;
857 }
858
859 uint32_t reg_lo = reg.reg;
860 uint32_t reg_hi = reg + (size - 1);
861
862 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
863 return false;
864
865 for (unsigned i = reg_lo; i <= reg_hi; i++) {
866 if (reg_file[i] != 0)
867 return false;
868 }
869 adjust_max_used_regs(ctx, rc, reg_lo);
870 return true;
871 }
872
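/* pseudo instructions which both read and write SGPRs may need SCC while being
 * lowered to hardware instructions: record whether SCC currently holds a live
 * value and, if so, pick a free scratch SGPR for the lowering code to use */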
873 void handle_pseudo(ra_ctx& ctx,
874 const std::array<uint32_t, 512>& reg_file,
875 Instruction* instr)
876 {
877 if (instr->format != Format::PSEUDO)
878 return;
879
880 /* all instructions which use handle_operands() need this information */
881 switch (instr->opcode) {
882 case aco_opcode::p_extract_vector:
883 case aco_opcode::p_create_vector:
884 case aco_opcode::p_split_vector:
885 case aco_opcode::p_parallelcopy:
886 case aco_opcode::p_wqm:
887 break;
888 default:
889 return;
890 }
891
892 /* if all definitions are vgpr, no need to care for SCC */
893 bool writes_sgpr = false;
894 for (Definition& def : instr->definitions) {
895 if (def.getTemp().type() == RegType::sgpr) {
896 writes_sgpr = true;
897 break;
898 }
899 }
900 /* if all operands are constant, no need to care either */
901 bool reads_sgpr = false;
902 for (Operand& op : instr->operands) {
903 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
904 reads_sgpr = true;
905 break;
906 }
907 }
908 if (!(writes_sgpr && reads_sgpr))
909 return;
910
911 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
912 if (reg_file[scc.reg]) {
913 pi->tmp_in_scc = true;
914
915 int reg = ctx.max_used_sgpr;
916 for (; reg >= 0 && reg_file[reg]; reg--)
917 ;
918 if (reg < 0) {
919 reg = ctx.max_used_sgpr + 1;
920 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
921 ;
922 assert(reg < ctx.program->max_reg_demand.sgpr);
923 }
924
925 adjust_max_used_regs(ctx, s1, reg);
926 pi->scratch_sgpr = PhysReg{(unsigned)reg};
927 } else {
928 pi->tmp_in_scc = false;
929 }
930 }
931
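/* encoding restrictions: some operand slots cannot be assigned certain
 * physical registers; currently only the SMEM rules for scc/exec/m0/vcc are
 * modelled here */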
932 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
933 {
934 switch (instr->format) {
935 case Format::SMEM:
936 return reg != scc &&
937 reg != exec &&
938 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
939 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
940 default:
941 // TODO: there are more instructions with restrictions on registers
942 return true;
943 }
944 }
945
946 } /* end namespace */
947
948
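/* top-level register allocator: processes the blocks in program order, assigns
 * physical registers to all temporaries, inserts parallelcopies where values
 * have to move, and repairs SSA by renaming values and inserting phis for
 * live-ins */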
949 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block)
950 {
951 ra_ctx ctx(program);
952
953 std::vector<std::unordered_map<unsigned, Temp>> renames(program->blocks.size());
954
955 struct phi_info {
956 Instruction* phi;
957 unsigned block_idx;
958 std::set<Instruction*> uses;
959 };
960
961 bool filled[program->blocks.size()];
962 bool sealed[program->blocks.size()];
963 memset(filled, 0, sizeof filled);
964 memset(sealed, 0, sizeof sealed);
965 std::vector<std::vector<Instruction*>> incomplete_phis(program->blocks.size());
966 std::map<unsigned, phi_info> phi_map;
967 std::map<unsigned, unsigned> affinities;
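/* helpers for SSA repair: read_variable() returns the current name of a value
 * in a block, handle_live_in() creates (possibly incomplete) phis for live-in
 * values, and try_remove_trivial_phi() folds phis whose operands all agree */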
968 std::function<Temp(Temp,unsigned)> read_variable;
969 std::function<Temp(Temp,Block*)> handle_live_in;
970 std::function<Temp(std::map<unsigned, phi_info>::iterator)> try_remove_trivial_phi;
971
972 read_variable = [&](Temp val, unsigned block_idx) -> Temp {
973 std::unordered_map<unsigned, Temp>::iterator it = renames[block_idx].find(val.id());
974 assert(it != renames[block_idx].end());
975 return it->second;
976 };
977
978 handle_live_in = [&](Temp val, Block *block) -> Temp {
979 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
980 if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) {
981 renames[block->index][val.id()] = val;
982 return val;
983 }
984 assert(preds.size() > 0);
985
986 Temp new_val;
987 if (!sealed[block->index]) {
988 /* consider rename from already processed predecessor */
989 Temp tmp = read_variable(val, preds[0]);
990
991 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
992 new_val = Temp{program->allocateId(), val.regClass()};
993 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
994 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
995 phi->definitions[0] = Definition(new_val);
996 for (unsigned i = 0; i < preds.size(); i++)
997 phi->operands[i] = Operand(val);
998 if (tmp.regClass() == new_val.regClass())
999 affinities[new_val.id()] = tmp.id();
1000
1001 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1002 incomplete_phis[block->index].emplace_back(phi.get());
1003 block->instructions.insert(block->instructions.begin(), std::move(phi));
1004
1005 } else if (preds.size() == 1) {
1006 /* if the block has only one predecessor, just look there for the name */
1007 new_val = read_variable(val, preds[0]);
1008 } else {
1009 /* there are multiple predecessors and the block is sealed */
1010 Temp ops[preds.size()];
1011
1012 /* we start assuming that the name is the same from all predecessors */
1013 renames[block->index][val.id()] = val;
1014 bool needs_phi = false;
1015
1016 /* get the rename from each predecessor and check if they are the same */
1017 for (unsigned i = 0; i < preds.size(); i++) {
1018 ops[i] = read_variable(val, preds[i]);
1019 if (i == 0)
1020 new_val = ops[i];
1021 else
1022 needs_phi |= !(new_val == ops[i]);
1023 }
1024
1025 if (needs_phi) {
1026 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1027 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1028 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1029 new_val = Temp{program->allocateId(), val.regClass()};
1030 phi->definitions[0] = Definition(new_val);
1031 for (unsigned i = 0; i < preds.size(); i++) {
1032 phi->operands[i] = Operand(ops[i]);
1033 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first);
1034 if (ops[i].regClass() == new_val.regClass())
1035 affinities[new_val.id()] = ops[i].id();
1036 }
1037 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1038 block->instructions.insert(block->instructions.begin(), std::move(phi));
1039 }
1040 }
1041
1042 renames[block->index][val.id()] = new_val;
1043 renames[block->index][new_val.id()] = new_val;
1044 ctx.orig_names[new_val.id()] = val;
1045 return new_val;
1046 };
1047
1048 try_remove_trivial_phi = [&] (std::map<unsigned, phi_info>::iterator info) -> Temp {
1049 assert(info->second.block_idx != 0);
1050 Instruction* phi = info->second.phi;
1051 Temp same = Temp();
1052
1053 Definition def = phi->definitions[0];
1054 /* a phi node is trivial if all operands are the same as the definition of the phi */
1055 for (const Operand& op : phi->operands) {
1056 const Temp t = op.getTemp();
1057 if (t == same || t == def.getTemp())
1058 continue;
1059 if (!(same == Temp()) || !(op.physReg() == def.physReg())) {
1060 /* phi is not trivial */
1061 return def.getTemp();
1062 }
1063 same = t;
1064 }
1065 assert(!(same == Temp() || same == def.getTemp()));
1066
1067 /* reroute all uses to same and remove phi */
1068 std::vector<std::map<unsigned, phi_info>::iterator> phi_users;
1069 std::map<unsigned, phi_info>::iterator same_phi_info = phi_map.find(same.id());
1070 for (Instruction* instr : info->second.uses) {
1071 assert(phi != instr);
1072 /* recursively try to remove trivial phis */
1073 if (is_phi(instr)) {
1074 /* ignore if the phi was already flagged trivial */
1075 if (instr->definitions.empty())
1076 continue;
1077
1078 std::map<unsigned, phi_info>::iterator it = phi_map.find(instr->definitions[0].tempId());
1079 if (it != phi_map.end() && it != info)
1080 phi_users.emplace_back(it);
1081 }
1082 for (Operand& op : instr->operands) {
1083 if (op.isTemp() && op.tempId() == def.tempId()) {
1084 op.setTemp(same);
1085 if (same_phi_info != phi_map.end())
1086 same_phi_info->second.uses.emplace(instr);
1087 }
1088 }
1089 }
1090
1091 auto it = ctx.orig_names.find(same.id());
1092 unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
1093 for (unsigned i = 0; i < program->blocks.size(); i++) {
1094 auto it = renames[i].find(orig_var);
1095 if (it != renames[i].end() && it->second == def.getTemp())
1096 renames[i][orig_var] = same;
1097 }
1098
1099 unsigned block_idx = info->second.block_idx;
1100 phi->definitions.clear(); /* this indicates that the phi can be removed */
1101 phi_map.erase(info);
1102 for (auto it : phi_users) {
1103 if (sealed[it->second.block_idx])
1104 try_remove_trivial_phi(it);
1105 }
1106
1107 /* due to the removal of other phis, the name might have changed once again! */
1108 return renames[block_idx][orig_var];
1109 };
1110
1111 std::map<unsigned, Instruction*> vectors;
1112 std::vector<std::vector<Temp>> phi_ressources;
1113 std::map<unsigned, unsigned> temp_to_phi_ressources;
1114
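/* backward pass over all blocks to collect affinity information: phi operands,
 * parallelcopy sources and p_create_vector operands should preferably end up
 * in the same registers as their results */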
1115 for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
1116 Block& block = *it;
1117
1118 /* first, compute the death points of all live vars within the block */
1119 std::set<Temp>& live = live_out_per_block[block.index];
1120
1121 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
1122 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
1123 aco_ptr<Instruction>& instr = *rit;
1124 if (is_phi(instr)) {
1125 live.erase(instr->definitions[0].getTemp());
1126 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
1127 continue;
1128 /* collect information about affinity-related temporaries */
1129 std::vector<Temp> affinity_related;
1130 /* affinity_related[0] is the last seen affinity-related temp */
1131 affinity_related.emplace_back(instr->definitions[0].getTemp());
1132 affinity_related.emplace_back(instr->definitions[0].getTemp());
1133 for (const Operand& op : instr->operands) {
1134 if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
1135 affinity_related.emplace_back(op.getTemp());
1136 temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
1137 }
1138 }
1139 phi_ressources.emplace_back(std::move(affinity_related));
1140 continue;
1141 }
1142
1143 /* add vector affinities */
1144 if (instr->opcode == aco_opcode::p_create_vector) {
1145 for (const Operand& op : instr->operands) {
1146 if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
1147 vectors[op.tempId()] = instr.get();
1148 }
1149 }
1150
1151 /* add operands to live variables */
1152 for (const Operand& op : instr->operands) {
1153 if (op.isTemp())
1154 live.emplace(op.getTemp());
1155 }
1156
1157 /* erase definitions from live */
1158 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1159 const Definition& def = instr->definitions[i];
1160 if (!def.isTemp())
1161 continue;
1162 live.erase(def.getTemp());
1163 /* mark last-seen phi operand */
1164 std::map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
1165 if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
1166 phi_ressources[it->second][0] = def.getTemp();
1167 /* try to coalesce phi affinities with parallelcopies */
1168 if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
1169 Operand op = instr->operands[i];
1170 if (op.isTemp() && op.isFirstKill() && def.regClass() == op.regClass()) {
1171 phi_ressources[it->second].emplace_back(op.getTemp());
1172 temp_to_phi_ressources[op.tempId()] = it->second;
1173 }
1174 }
1175 }
1176 }
1177 }
1178 }
1179 /* create affinities */
1180 for (std::vector<Temp>& vec : phi_ressources) {
1181 assert(vec.size() > 1);
1182 for (unsigned i = 1; i < vec.size(); i++)
1183 if (vec[i].id() != vec[0].id())
1184 affinities[vec[i].id()] = vec[0].id();
1185 }
1186
1187 /* state of register file after phis */
1188 std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1189
1190 for (Block& block : program->blocks) {
1191 std::set<Temp>& live = live_out_per_block[block.index];
1192 /* initialize register file */
1193 assert(block.index != 0 || live.empty());
1194 std::array<uint32_t, 512> register_file = {0};
1195 ctx.war_hint.reset();
1196
1197 for (Temp t : live) {
1198 Temp renamed = handle_live_in(t, &block);
1199 if (ctx.assignments.find(renamed.id()) != ctx.assignments.end()) {
1200 for (unsigned i = 0; i < t.size(); i++)
1201 register_file[ctx.assignments[renamed.id()].first + i] = renamed.id();
1202 }
1203 }
1204
1205 std::vector<aco_ptr<Instruction>> instructions;
1206 std::vector<aco_ptr<Instruction>>::iterator it;
1207
1208 /* this is a slight adjustment from the paper as we already have phi nodes:
1209 * We consider them incomplete phis and only handle the definition. */
1210
1211 /* handle fixed phi definitions */
1212 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1213 aco_ptr<Instruction>& phi = *it;
1214 if (!is_phi(phi))
1215 break;
1216 Definition& definition = phi->definitions[0];
1217 if (!definition.isFixed())
1218 continue;
1219
1220 /* check if a dead exec mask phi is needed */
1221 if (definition.isKill()) {
1222 for (Operand& op : phi->operands) {
1223 assert(op.isTemp());
1224 if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1225 ctx.assignments[op.tempId()].first != exec) {
1226 definition.setKill(false);
1227 break;
1228 }
1229 }
1230 }
1231
1232 if (definition.isKill())
1233 continue;
1234
1235 assert(definition.physReg() == exec);
1236 for (unsigned i = 0; i < definition.size(); i++) {
1237 assert(register_file[definition.physReg() + i] == 0);
1238 register_file[definition.physReg() + i] = definition.tempId();
1239 }
1240 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1241 }
1242
1243 /* look up the affinities */
1244 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1245 aco_ptr<Instruction>& phi = *it;
1246 if (!is_phi(phi))
1247 break;
1248 Definition& definition = phi->definitions[0];
1249 if (definition.isKill() || definition.isFixed())
1250 continue;
1251
1252 if (affinities.find(definition.tempId()) != affinities.end() &&
1253 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1254 assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass());
1255 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1256 bool try_use_special_reg = reg == scc || reg == exec;
1257 if (try_use_special_reg) {
1258 for (const Operand& op : phi->operands) {
1259 if (!op.isTemp() ||
1260 ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1261 !(ctx.assignments[op.tempId()].first == reg)) {
1262 try_use_special_reg = false;
1263 break;
1264 }
1265 }
1266 if (!try_use_special_reg)
1267 continue;
1268 }
1269 bool reg_free = true;
1270 for (unsigned i = reg.reg; reg_free && i < reg + definition.size(); i++) {
1271 if (register_file[i] != 0)
1272 reg_free = false;
1273 }
1274 /* only assign if register is still free */
1275 if (reg_free) {
1276 definition.setFixed(reg);
1277 for (unsigned i = 0; i < definition.size(); i++)
1278 register_file[definition.physReg() + i] = definition.tempId();
1279 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1280 }
1281 }
1282 }
1283
1284 /* find registers for phis without affinity or where the register was blocked */
1285 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1286 aco_ptr<Instruction>& phi = *it;
1287 if (!is_phi(phi))
1288 break;
1289
1290 Definition& definition = phi->definitions[0];
1291 if (definition.isKill())
1292 continue;
1293
1294 renames[block.index][definition.tempId()] = definition.getTemp();
1295
1296 if (!definition.isFixed()) {
1297 std::vector<std::pair<Operand, Definition>> parallelcopy;
1298 /* try to find a register that is used by at least one operand */
1299 for (const Operand& op : phi->operands) {
1300 if (!op.isTemp() ||
1301 ctx.assignments.find(op.tempId()) == ctx.assignments.end())
1302 continue;
1303 PhysReg reg = ctx.assignments[op.tempId()].first;
1304 /* we already tried this in the previous loop */
1305 if (reg == scc || reg == exec)
1306 continue;
1307 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1308 definition.setFixed(reg);
1309 break;
1310 }
1311 }
1312 if (!definition.isFixed())
1313 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi));
1314
1315 /* process parallelcopy */
1316 for (std::pair<Operand, Definition> pc : parallelcopy) {
1317 /* see if it's a copy from a different phi */
1318 //TODO: prefer moving some previous phis over live-ins
1319 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1320 Instruction *prev_phi = NULL;
1321 std::vector<aco_ptr<Instruction>>::iterator phi_it;
1322 for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
1323 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1324 prev_phi = phi_it->get();
1325 }
1326 phi_it = it;
1327 while (!prev_phi && is_phi(*++phi_it)) {
1328 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
1329 prev_phi = phi_it->get();
1330 }
1331 if (prev_phi) {
1332 /* if so, just update that phi's register */
1333 prev_phi->definitions[0].setFixed(pc.second.physReg());
1334 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()};
1335 for (unsigned reg = pc.second.physReg(); reg < pc.second.physReg() + pc.second.size(); reg++)
1336 register_file[reg] = prev_phi->definitions[0].tempId();
1337 continue;
1338 }
1339
1340 /* rename */
1341 std::map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1342 Temp orig = pc.first.getTemp();
1343 if (orig_it != ctx.orig_names.end())
1344 orig = orig_it->second;
1345 else
1346 ctx.orig_names[pc.second.tempId()] = orig;
1347 renames[block.index][orig.id()] = pc.second.getTemp();
1348 renames[block.index][pc.second.tempId()] = pc.second.getTemp();
1349
1350 /* otherwise, this is a live-in and we need to create a new phi
1351 * to move it in this block's predecessors */
1352 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1353 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1354 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1355 new_phi->definitions[0] = pc.second;
1356 for (unsigned i = 0; i < preds.size(); i++)
1357 new_phi->operands[i] = Operand(pc.first);
1358 instructions.emplace_back(std::move(new_phi));
1359 }
1360
1361 for (unsigned i = 0; i < definition.size(); i++)
1362 register_file[definition.physReg() + i] = definition.tempId();
1363 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1364 }
1365 live.emplace(definition.getTemp());
1366
1367 /* update phi affinities */
1368 for (const Operand& op : phi->operands) {
1369 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1370 affinities[op.tempId()] = definition.tempId();
1371 }
1372
1373 instructions.emplace_back(std::move(*it));
1374 }
1375
1376 /* fill in sgpr_live_in */
1377 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
1378 sgpr_live_in[block.index][i] = register_file[i];
1379 sgpr_live_in[block.index][127] = register_file[scc.reg];
1380
1381 /* Handle all other instructions of the block */
1382 for (; it != block.instructions.end(); ++it) {
1383 aco_ptr<Instruction>& instr = *it;
1384
1385 /* parallelcopies from p_phi are inserted here which means
1386 * live ranges of killed operands end here as well */
1387 if (instr->opcode == aco_opcode::p_logical_end) {
1388 /* no need to process this instruction any further */
1389 if (block.logical_succs.size() != 1) {
1390 instructions.emplace_back(std::move(instr));
1391 continue;
1392 }
1393
1394 Block& succ = program->blocks[block.logical_succs[0]];
1395 unsigned idx = 0;
1396 for (; idx < succ.logical_preds.size(); idx++) {
1397 if (succ.logical_preds[idx] == block.index)
1398 break;
1399 }
1400 for (aco_ptr<Instruction>& phi : succ.instructions) {
1401 if (phi->opcode == aco_opcode::p_phi) {
1402 if (phi->operands[idx].isTemp() &&
1403 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1404 phi->operands[idx].isFirstKill()) {
1405 Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index);
1406 PhysReg reg = ctx.assignments[phi_op.id()].first;
1407 assert(register_file[reg] == phi_op.id());
1408 register_file[reg] = 0;
1409 }
1410 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1411 break;
1412 }
1413 }
1414 instructions.emplace_back(std::move(instr));
1415 continue;
1416 }
1417
1418 std::vector<std::pair<Operand, Definition>> parallelcopy;
1419
1420 assert(!is_phi(instr));
1421
1422 /* handle operands */
1423 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1424 auto& operand = instr->operands[i];
1425 if (!operand.isTemp())
1426 continue;
1427
1428 /* rename operands */
1429 operand.setTemp(read_variable(operand.getTemp(), block.index));
1430
1431 /* check if the operand is fixed */
1432 if (operand.isFixed()) {
1433
1434 if (operand.physReg() == ctx.assignments[operand.tempId()].first) {
1435 /* we are fine: the operand is already assigned the correct reg */
1436
1437 } else {
1438 /* check if target reg is blocked, and move away the blocking var */
1439 if (register_file[operand.physReg().reg]) {
1440 uint32_t blocking_id = register_file[operand.physReg().reg];
1441 RegClass rc = ctx.assignments[blocking_id].second;
1442 Operand pc_op = Operand(Temp{blocking_id, rc});
1443 pc_op.setFixed(operand.physReg());
1444 Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()});
1445 /* find free reg */
1446 PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr);
1447 pc_def.setFixed(reg);
1448 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1449 for (unsigned i = 0; i < operand.size(); i++) {
1450 register_file[pc_op.physReg() + i] = 0;
1451 register_file[pc_def.physReg() + i] = pc_def.tempId();
1452 }
1453 parallelcopy.emplace_back(pc_op, pc_def);
1454
1455 /* handle renames of previous operands */
1456 for (unsigned j = 0; j < i; j++) {
1457 Operand& op = instr->operands[j];
1458 if (op.isTemp() && op.tempId() == blocking_id) {
1459 op.setTemp(pc_def.getTemp());
1460 op.setFixed(reg);
1461 }
1462 }
1463 }
1464 /* move operand to fixed reg and create parallelcopy pair */
1465 Operand pc_op = operand;
1466 Temp tmp = Temp{program->allocateId(), operand.regClass()};
1467 Definition pc_def = Definition(tmp);
1468 pc_def.setFixed(operand.physReg());
1469 pc_op.setFixed(ctx.assignments[operand.tempId()].first);
1470 operand.setTemp(tmp);
1471 ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()};
1472 operand.setFixed(pc_def.physReg());
1473 for (unsigned i = 0; i < operand.size(); i++) {
1474 register_file[pc_op.physReg() + i] = 0;
1475 register_file[pc_def.physReg() + i] = tmp.id();
1476 }
1477 parallelcopy.emplace_back(pc_op, pc_def);
1478 }
1479 } else {
1480 assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end());
1481 PhysReg reg = ctx.assignments[operand.tempId()].first;
1482
1483 if (operand_can_use_reg(instr, i, reg)) {
1484 operand.setFixed(ctx.assignments[operand.tempId()].first);
1485 } else {
1486 Operand pc_op = operand;
1487 pc_op.setFixed(reg);
1488 PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr);
1489 Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass());
1490 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1491 for (unsigned i = 0; i < operand.size(); i++) {
1492 register_file[pc_op.physReg() + i] = 0;
1493 register_file[pc_def.physReg() + i] = pc_def.tempId();
1494 }
1495 parallelcopy.emplace_back(pc_op, pc_def);
1496 operand.setFixed(new_reg);
1497 }
1498
1499 if (instr->format == Format::EXP ||
1500 (instr->isVMEM() && i == 3 && program->chip_class == GFX6) ||
1501 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1502 for (unsigned j = 0; j < operand.size(); j++)
1503 ctx.war_hint.set(operand.physReg().reg + j);
1504 }
1505 }
1506 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1507 if (phi != phi_map.end())
1508 phi->second.uses.emplace(instr.get());
1509
1510 }
1511 /* remove dead vars from register file */
1512 for (const Operand& op : instr->operands) {
1513 if (op.isTemp() && op.isFirstKill())
1514 for (unsigned j = 0; j < op.size(); j++)
1515 register_file[op.physReg() + j] = 0;
1516 }
1517
1518 /* try to optimize v_mad_f32 -> v_mac_f32 */
1519 if (instr->opcode == aco_opcode::v_mad_f32 &&
1520 instr->operands[2].isTemp() &&
1521 instr->operands[2].isKill() &&
1522 instr->operands[2].getTemp().type() == RegType::vgpr &&
1523 instr->operands[1].isTemp() &&
1524 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1525 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1526 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1527 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1528 vop3->clamp || vop3->omod || vop3->opsel);
1529 if (can_use_mac) {
1530 instr->format = Format::VOP2;
1531 instr->opcode = aco_opcode::v_mac_f32;
1532 }
1533 }
1534
1535 /* handle definitions which must have the same register as an operand */
1536 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1537 instr->opcode == aco_opcode::v_mac_f32 ||
1538 instr->opcode == aco_opcode::v_writelane_b32 ||
1539 instr->opcode == aco_opcode::v_writelane_b32_e64) {
1540 instr->definitions[0].setFixed(instr->operands[2].physReg());
1541 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1542 instr->opcode == aco_opcode::s_mulk_i32) {
1543 instr->definitions[0].setFixed(instr->operands[0].physReg());
1544 } else if (instr->format == Format::MUBUF &&
1545 instr->definitions.size() == 1 &&
1546 instr->operands.size() == 4) {
1547 instr->definitions[0].setFixed(instr->operands[3].physReg());
1548 } else if (instr->format == Format::MIMG &&
1549 instr->definitions.size() == 1 &&
1550 instr->operands[1].regClass().type() == RegType::vgpr) {
1551 instr->definitions[0].setFixed(instr->operands[1].physReg());
1552 }
1553
1554 ctx.defs_done.reset();
1555
1556 /* handle fixed definitions first */
1557 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1558 auto& definition = instr->definitions[i];
1559 if (!definition.isFixed())
1560 continue;
1561
1562 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1563 /* check if the target register is blocked */
1564 if (register_file[definition.physReg().reg] != 0) {
1565 /* create parallelcopy pair to move blocking var */
1566 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second};
1567 Operand pc_op = Operand(tmp);
1568 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first);
1569 RegClass rc = pc_op.regClass();
1570 tmp = Temp{program->allocateId(), rc};
1571 Definition pc_def = Definition(tmp);
1572
1573 /* re-enable the killed operands, so that we don't move the blocking var there */
1574 for (const Operand& op : instr->operands) {
1575 if (op.isTemp() && op.isFirstKill())
1576 for (unsigned j = 0; j < op.size(); j++)
1577 register_file[op.physReg() + j] = 0xFFFF;
1578 }
1579
1580 /* find a new register for the blocking variable */
1581 PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr);
1582 /* once again, disable killed operands */
1583 for (const Operand& op : instr->operands) {
1584 if (op.isTemp() && op.isFirstKill())
1585 for (unsigned j = 0; j < op.size(); j++)
1586 register_file[op.physReg() + j] = 0;
1587 }
1588 for (unsigned k = 0; k < i; k++) {
1589 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1590 for (unsigned j = 0; j < instr->definitions[k].size(); j++)
1591 register_file[instr->definitions[k].physReg() + j] = instr->definitions[k].tempId();
1592 }
1593 pc_def.setFixed(reg);
1594
1595 /* finish assignment of parallelcopy */
1596 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1597 parallelcopy.emplace_back(pc_op, pc_def);
1598
1599 /* add changes to reg_file */
1600 for (unsigned j = 0; j < pc_op.size(); j++) {
1601 register_file[pc_op.physReg() + j] = 0x0;
1602 register_file[pc_def.physReg() + j] = pc_def.tempId();
1603 }
1604 }
1605 ctx.defs_done.set(i);
1606
1607 if (!definition.isTemp())
1608 continue;
1609
1610 /* set live if it has a kill point */
1611 if (!definition.isKill())
1612 live.emplace(definition.getTemp());
1613
1614 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1615 renames[block.index][definition.tempId()] = definition.getTemp();
1616 for (unsigned j = 0; j < definition.size(); j++)
1617 register_file[definition.physReg() + j] = definition.tempId();
1618 }
1619
1620 /* handle all other definitions */
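/* each branch below tries to pick a register that avoids later copies:
 * split/extract results next to their source lanes, p_wqm reusing its killed
 * operand, p_create_vector via its own helper, affinity partners (e.g. phi
 * affinities collected earlier) sharing a register, and operands of a future
 * p_create_vector placed at their final lane offset. */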
1621 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1622 auto& definition = instr->definitions[i];
1623
1624 if (definition.isFixed() || !definition.isTemp())
1625 continue;
1626
1627 /* find free reg */
1628 if (definition.hasHint() && register_file[definition.physReg().reg] == 0)
1629 definition.setFixed(definition.physReg());
1630 else if (instr->opcode == aco_opcode::p_split_vector) {
1631 PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()};
1632 if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1633 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1634 definition.setFixed(reg);
1635 } else if (instr->opcode == aco_opcode::p_wqm) {
1636 PhysReg reg;
1637 if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1638 reg = instr->operands[0].physReg();
1639 assert(register_file[reg.reg] == 0);
1640 } else {
1641 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1642 }
1643 definition.setFixed(reg);
1644 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1645 PhysReg reg;
1646 if (instr->operands[0].isKill() &&
1647 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1648 reg = instr->operands[0].physReg();
1649 reg.reg += definition.size() * instr->operands[1].constantValue();
1650 assert(register_file[reg.reg] == 0);
1651 } else {
1652 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1653 }
1654 definition.setFixed(reg);
1655 } else if (instr->opcode == aco_opcode::p_create_vector) {
1656 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(),
1657 parallelcopy, instr);
1658 definition.setFixed(reg);
1659 } else if (affinities.find(definition.tempId()) != affinities.end() &&
1660 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1661 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1662 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1663 definition.setFixed(reg);
1664 else
1665 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1666
1667 } else if (vectors.find(definition.tempId()) != vectors.end()) {
1668 Instruction* vec = vectors[definition.tempId()];
1669 unsigned offset = 0;
1670 for (const Operand& op : vec->operands) {
1671 if (op.isTemp() && op.tempId() == definition.tempId())
1672 break;
1673 else
1674 offset += op.size();
1675 }
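/* offset is now the lane offset of this definition inside the vector it feeds */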
1676 unsigned k = 0;
1677 for (const Operand& op : vec->operands) {
1678 if (op.isTemp() &&
1679 op.tempId() != definition.tempId() &&
1680 op.getTemp().type() == definition.getTemp().type() &&
1681 ctx.assignments.find(op.tempId()) != ctx.assignments.end()) {
1682 PhysReg reg = ctx.assignments[op.tempId()].first;
1683 reg.reg = reg - k + offset;
1684 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
1685 definition.setFixed(reg);
1686 break;
1687 }
1688 }
1689 k += op.size();
1690 }
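/* no sibling operand gave us a position: look for a free range large enough
 * for the whole vector and place this def at its offset, otherwise fall back
 * to any fitting register */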
1691 if (!definition.isFixed()) {
1692 std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
1693 PhysReg reg = res.first;
1694 if (res.second) {
1695 reg.reg += offset;
1696 } else {
1697 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1698 }
1699 definition.setFixed(reg);
1700 }
1701 } else
1702 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1703
1704 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1705 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1706 ctx.defs_done.set(i);
1707
1708 /* set live if it has a kill point */
1709 if (!definition.isKill())
1710 live.emplace(definition.getTemp());
1711
1712 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1713 renames[block.index][definition.tempId()] = definition.getTemp();
1714 for (unsigned j = 0; j < definition.size(); j++)
1715 register_file[definition.physReg() + j] = definition.tempId();
1716 }
1717
1718 handle_pseudo(ctx, register_file, instr.get());
1719
1720 /* kill definitions */
1721 for (const Definition& def : instr->definitions) {
1722 if (def.isTemp() && def.isKill()) {
1723 for (unsigned j = 0; j < def.size(); j++) {
1724 register_file[def.physReg() + j] = 0;
1725 }
1726 }
1727 }
1728
1729 /* emit parallelcopy */
1730 if (!parallelcopy.empty()) {
1731 aco_ptr<Pseudo_instruction> pc;
1732 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1733 bool temp_in_scc = register_file[scc.reg];
1734 bool sgpr_operands_alias_defs = false;
1735 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
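/* per-64-sgpr operand bitmask: if a copy's destination overlaps a source of
 * the same parallelcopy while scc holds a live value, the copy lowering may
 * need scc (or a scratch sgpr) and must preserve it */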
1736 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1737 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1738 if (!sgpr_operands_alias_defs) {
1739 unsigned reg = parallelcopy[i].first.physReg().reg;
1740 unsigned size = parallelcopy[i].first.getTemp().size();
1741 sgpr_operands[reg / 64u] |= ((uint64_t(1) << size) - 1) << (reg % 64u);
1742
1743 reg = parallelcopy[i].second.physReg().reg;
1744 size = parallelcopy[i].second.getTemp().size();
1745 if (sgpr_operands[reg / 64u] & ((uint64_t(1) << size) - 1) << (reg % 64u))
1746 sgpr_operands_alias_defs = true;
1747 }
1748 }
1749
1750 pc->operands[i] = parallelcopy[i].first;
1751 pc->definitions[i] = parallelcopy[i].second;
1752 assert(pc->operands[i].size() == pc->definitions[i].size());
1753
1754 /* it might happen that the operand is already renamed. we have to restore the original name. */
1755 std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1756 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
1757 ctx.orig_names[pc->definitions[i].tempId()] = orig;
1758 renames[block.index][orig.id()] = pc->definitions[i].getTemp();
1759 renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp();
1760
1761 std::map<unsigned, phi_info>::iterator phi = phi_map.find(pc->operands[i].tempId());
1762 if (phi != phi_map.end())
1763 phi->second.uses.emplace(pc.get());
1764 }
1765
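/* present handle_pseudo with the register state from before this instruction
 * (operands live again, definitions not yet written), so it can decide how the
 * copy lowering should deal with the live scc; otherwise tmp_in_scc is simply
 * cleared below */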
1766 if (temp_in_scc && sgpr_operands_alias_defs) {
1767 /* disable definitions and re-enable operands */
1768 for (const Definition& def : instr->definitions) {
1769 if (def.isTemp() && !def.isKill()) {
1770 for (unsigned j = 0; j < def.size(); j++) {
1771 register_file[def.physReg() + j] = 0x0;
1772 }
1773 }
1774 }
1775 for (const Operand& op : instr->operands) {
1776 if (op.isTemp() && op.isFirstKill()) {
1777 for (unsigned j = 0; j < op.size(); j++)
1778 register_file[op.physReg() + j] = 0xFFFF;
1779 }
1780 }
1781
1782 handle_pseudo(ctx, register_file, pc.get());
1783
1784 /* re-enable live vars */
1785 for (const Operand& op : instr->operands) {
1786 if (op.isTemp() && op.isFirstKill())
1787 for (unsigned j = 0; j < op.size(); j++)
1788 register_file[op.physReg() + j] = 0x0;
1789 }
1790 for (const Definition& def : instr->definitions) {
1791 if (def.isTemp() && !def.isKill()) {
1792 for (unsigned j = 0; j < def.size(); j++) {
1793 register_file[def.physReg() + j] = def.tempId();
1794 }
1795 }
1796 }
1797 } else {
1798 pc->tmp_in_scc = false;
1799 }
1800
1801 instructions.emplace_back(std::move(pc));
1802 }
1803
1804 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1805 bool instr_needs_vop3 = !instr->isVOP3() &&
1806 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1807 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1808 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1809 instr->opcode == aco_opcode::v_addc_co_u32 ||
1810 instr->opcode == aco_opcode::v_sub_co_u32 ||
1811 instr->opcode == aco_opcode::v_subb_co_u32 ||
1812 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1813 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1814 !(instr->definitions[1].physReg() == vcc)) ||
1815 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1816 instr->opcode == aco_opcode::v_subb_co_u32 ||
1817 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1818 !(instr->operands[2].physReg() == vcc)));
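/* in the VOP2/VOPC encodings the compare result, carry-in/out and cndmask
 * selector are hard-wired to vcc; VOP3 is needed to address any other sgpr pair */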
1819 if (instr_needs_vop3) {
1820
1821 /* if the first operand is a literal, we have to move it to a reg */
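/* VOP3 cannot encode a literal constant before GFX10, so the literal is first
 * materialized in a register (an sgpr when no other sgpr operand is present,
 * otherwise a vgpr) */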
1822 if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) {
1823 bool can_sgpr = true;
1824 /* check if we have to move to vgpr */
1825 for (const Operand& op : instr->operands) {
1826 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1827 can_sgpr = false;
1828 break;
1829 }
1830 }
1831 aco_ptr<Instruction> mov;
1832 if (can_sgpr)
1833 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1834 else
1835 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1836 mov->operands[0] = instr->operands[0];
1837 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
1838 mov->definitions[0] = Definition(tmp);
1839 /* disable definitions and re-enable operands */
1840 for (const Definition& def : instr->definitions) {
1841 for (unsigned j = 0; j < def.size(); j++) {
1842 register_file[def.physReg() + j] = 0x0;
1843 }
1844 }
1845 for (const Operand& op : instr->operands) {
1846 if (op.isTemp() && op.isFirstKill()) {
1847 for (unsigned j = 0; j < op.size(); j++)
1848 register_file[op.physReg() + j] = 0xFFFF;
1849 }
1850 }
1851 mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov));
1852 instr->operands[0] = Operand(tmp);
1853 instr->operands[0].setFixed(mov->definitions[0].physReg());
1854 instructions.emplace_back(std::move(mov));
1855 /* re-enable live vars */
1856 for (const Operand& op : instr->operands) {
1857 if (op.isTemp() && op.isFirstKill())
1858 for (unsigned j = 0; j < op.size(); j++)
1859 register_file[op.physReg() + j] = 0x0;
1860 }
1861 for (const Definition& def : instr->definitions) {
1862 if (def.isTemp() && !def.isKill()) {
1863 for (unsigned j = 0; j < def.size(); j++) {
1864 register_file[def.physReg() + j] = def.tempId();
1865 }
1866 }
1867 }
1868 }
1869
1870 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1871 aco_ptr<Instruction> tmp = std::move(instr);
1872 Format format = asVOP3(tmp->format);
1873 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
1874 for (unsigned i = 0; i < instr->operands.size(); i++) {
1875 Operand& operand = tmp->operands[i];
1876 instr->operands[i] = operand;
1877 /* keep phi_map up to date */
1878 if (operand.isTemp()) {
1879 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.tempId());
1880 if (phi != phi_map.end()) {
1881 phi->second.uses.erase(tmp.get());
1882 phi->second.uses.emplace(instr.get());
1883 }
1884 }
1885 }
1886 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
1887 }
1888 instructions.emplace_back(std::move(*it));
1889
1890 } /* end for Instr */
1891
1892 block.instructions = std::move(instructions);
1893
1894 filled[block.index] = true;
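/* on-the-fly SSA repair: a successor can be sealed once all of its linear
 * predecessors are filled; only then can its incomplete phis be completed */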
1895 for (unsigned succ_idx : block.linear_succs) {
1896 Block& succ = program->blocks[succ_idx];
1897 /* seal block if all predecessors are filled */
1898 bool all_filled = true;
1899 for (unsigned pred_idx : succ.linear_preds) {
1900 if (!filled[pred_idx]) {
1901 all_filled = false;
1902 break;
1903 }
1904 }
1905 if (all_filled) {
1906 /* finish incomplete phis and check if they became trivial */
1907 for (Instruction* phi : incomplete_phis[succ_idx]) {
1908 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
1909 for (unsigned i = 0; i < phi->operands.size(); i++) {
1910 phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i]));
1911 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first);
1912 }
1913 try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId()));
1914 }
1915 /* complete the original phi nodes, but no need to check triviality */
1916 for (aco_ptr<Instruction>& instr : succ.instructions) {
1917 if (!is_phi(instr))
1918 break;
1919 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
1920
1921 for (unsigned i = 0; i < instr->operands.size(); i++) {
1922 auto& operand = instr->operands[i];
1923 if (!operand.isTemp())
1924 continue;
1925 operand.setTemp(read_variable(operand.getTemp(), preds[i]));
1926 operand.setFixed(ctx.assignments[operand.tempId()].first);
1927 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1928 if (phi != phi_map.end())
1929 phi->second.uses.emplace(instr.get());
1930 }
1931 }
1932 sealed[succ_idx] = true;
1933 }
1934 }
1935 } /* end for BB */
1936
1937 /* remove trivial phis */
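/* phis that were folded earlier (see try_remove_trivial_phi) end up with no
 * definitions; drop those empty phi instructions here */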
1938 for (Block& block : program->blocks) {
1939 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
1940 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
1941 auto middle = std::remove_if(block.instructions.begin(), end,
1942 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
1943 block.instructions.erase(middle, end);
1944 }
1945
1946 /* find scc spill registers which may be needed for parallelcopies created by phis */
1947 for (Block& block : program->blocks) {
1948 if (block.linear_preds.size() <= 1)
1949 continue;
1950
1951 std::bitset<128> regs = sgpr_live_in[block.index];
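/* bit 127 of sgpr_live_in flags scc being live into the block */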
1952 if (!regs[127])
1953 continue;
1954
1955 /* choose a register */
1956 int16_t reg = 0;
1957 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
1958 ;
1959 assert(reg < ctx.program->max_reg_demand.sgpr);
1960 adjust_max_used_regs(ctx, s1, reg);
1961
1962 /* update predecessors */
1963 for (unsigned& pred_index : block.linear_preds) {
1964 Block& pred = program->blocks[pred_index];
1965 pred.scc_live_out = true;
1966 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
1967 }
1968 }
1969
1970 /* num_gpr = rnd_up(max_used_gpr + 1) */
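/* the config stores allocated register counts, not the highest index: round up
 * to the granularity used here (4 vgprs, 8 sgprs) and include the extra sgprs
 * (e.g. vcc / flat_scratch / xnack) accounted for by get_extra_sgprs() */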
1971 program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
1972 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
1973 program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
1974 else
1975 program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
1976 }
1977
1978 }