android: aco: fix undefined template 'std::__1::array' build errors
[mesa.git] / src / amd / compiler / aco_register_allocation.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
26 *
27 */
28
29 #include <algorithm>
30 #include <array>
31 #include <map>
32 #include <unordered_map>
33 #include <functional>
34
35 #include "aco_ir.h"
36 #include "sid.h"
37
38 namespace aco {
39 namespace {
40
41 struct ra_ctx {
42 std::bitset<512> war_hint;
43 Program* program;
44 std::unordered_map<unsigned, std::pair<PhysReg, RegClass>> assignments;
45 std::map<unsigned, Temp> orig_names;
46 unsigned max_used_sgpr = 0;
47 unsigned max_used_vgpr = 0;
48 std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
49
50 ra_ctx(Program* program) : program(program) {}
51 };
52
53
/* helper function for debugging: dumps one register file ('.' = free,
 * '~' = blocked, '#'/'@' = alternating occupied ranges) followed by the
 * per-temporary assignments */
#if 0
void print_regs(ra_ctx& ctx, bool vgprs, std::array<uint32_t, 512>& reg_file)
{
   unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr;
   unsigned lb = vgprs ? 256 : 0;
   unsigned ub = lb + max;
   char reg_char = vgprs ? 'v' : 's';

   /* print markers */
   printf(" ");
   for (unsigned i = lb; i < ub; i += 3) {
      printf("%.2u ", i - lb);
   }
   printf("\n");

   /* print usage */
   printf("%cgprs: ", reg_char);
   unsigned free_regs = 0;
   unsigned prev = 0;
   bool char_select = false;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] == 0xFFFF) {
         printf("~");
      } else if (reg_file[i]) {
         /* flip the marker character whenever a new variable starts */
         if (reg_file[i] != prev) {
            prev = reg_file[i];
            char_select = !char_select;
         }
         printf(char_select ? "#" : "@");
      } else {
         free_regs++;
         printf(".");
      }
   }
   printf("\n");

   printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max);

   /* print assignments */
   prev = 0;
   unsigned size = 0;
   for (unsigned i = lb; i < ub; i++) {
      if (reg_file[i] != prev) {
         /* close the previous range before starting a new one */
         if (prev && size > 1)
            printf("-%d]\n", i - 1 - lb);
         else if (prev)
            printf("]\n");
         prev = reg_file[i];
         if (prev && prev != 0xFFFF) {
            if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i])
               printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb);
            else
               printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb);
         }
         size = 1;
      } else {
         size++;
      }
   }
   if (prev && size > 1)
      printf("-%d]\n", ub - lb - 1);
   else if (prev)
      printf("]\n");
}
#endif
120
121
122 void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
123 {
124 unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
125 unsigned size = rc.size();
126 if (rc.type() == RegType::vgpr) {
127 assert(reg >= 256);
128 unsigned hi = reg - 256 + size - 1;
129 ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
130 } else if (reg + rc.size() <= max_addressible_sgpr) {
131 unsigned hi = reg + size - 1;
132 ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
133 }
134 }
135
136
137 void update_renames(ra_ctx& ctx, std::array<uint32_t, 512>& reg_file,
138 std::vector<std::pair<Operand, Definition>>& parallelcopies,
139 aco_ptr<Instruction>& instr)
140 {
141 /* allocate id's and rename operands: this is done transparently here */
142 for (std::pair<Operand, Definition>& copy : parallelcopies) {
143 /* the definitions with id are not from this function and already handled */
144 if (copy.second.isTemp())
145 continue;
146
147 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
148 copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
149 ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()};
150 for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++)
151 reg_file[i] = copy.second.tempId();
152 /* check if we moved an operand */
153 for (Operand& op : instr->operands) {
154 if (!op.isTemp())
155 continue;
156 if (op.tempId() == copy.first.tempId()) {
157 bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKill();
158 for (std::pair<Operand, Definition>& pc : parallelcopies) {
159 PhysReg def_reg = pc.second.physReg();
160 omit_renaming &= def_reg > copy.first.physReg() ?
161 (copy.first.physReg() + copy.first.size() <= def_reg.reg) :
162 (def_reg + pc.second.size() <= copy.first.physReg().reg);
163 }
164 if (omit_renaming)
165 continue;
166 op.setTemp(copy.second.getTemp());
167 op.setFixed(copy.second.physReg());
168 }
169 }
170 }
171 }
172
173 std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
174 std::array<uint32_t, 512>& reg_file,
175 uint32_t lb, uint32_t ub,
176 uint32_t size, uint32_t stride,
177 RegClass rc)
178 {
179 /* best fit algorithm: find the smallest gap to fit in the variable */
180 if (stride == 1) {
181 unsigned best_pos = 0xFFFF;
182 unsigned gap_size = 0xFFFF;
183 unsigned next_pos = 0xFFFF;
184
185 for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
186 if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
187 if (next_pos == 0xFFFF)
188 continue;
189
190 /* check if the variable fits */
191 if (next_pos + size > current_reg) {
192 next_pos = 0xFFFF;
193 continue;
194 }
195
196 /* check if the tested gap is smaller */
197 if (current_reg - next_pos < gap_size) {
198 best_pos = next_pos;
199 gap_size = current_reg - next_pos;
200 }
201 next_pos = 0xFFFF;
202 continue;
203 }
204
205 if (next_pos == 0xFFFF)
206 next_pos = current_reg;
207 }
208
209 /* final check */
210 if (next_pos != 0xFFFF &&
211 next_pos + size <= ub &&
212 ub - next_pos < gap_size) {
213 best_pos = next_pos;
214 gap_size = ub - next_pos;
215 }
216 if (best_pos != 0xFFFF) {
217 adjust_max_used_regs(ctx, rc, best_pos);
218 return {PhysReg{best_pos}, true};
219 }
220 return {{}, false};
221 }
222
223 bool found = false;
224 unsigned reg_lo = lb;
225 unsigned reg_hi = lb + size - 1;
226 while (!found && reg_lo + size <= ub) {
227 if (reg_file[reg_lo] != 0) {
228 reg_lo += stride;
229 continue;
230 }
231 reg_hi = reg_lo + size - 1;
232 found = true;
233 for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) {
234 if (reg_file[reg] != 0 || ctx.war_hint[reg])
235 found = false;
236 }
237 if (found) {
238 adjust_max_used_regs(ctx, rc, reg_lo);
239 return {PhysReg{reg_lo}, true};
240 }
241
242 reg_lo += stride;
243 }
244
245 return {{}, false};
246 }
247
248 bool get_regs_for_copies(ra_ctx& ctx,
249 std::array<uint32_t, 512>& reg_file,
250 std::vector<std::pair<Operand, Definition>>& parallelcopies,
251 std::set<std::pair<unsigned, unsigned>> vars,
252 uint32_t lb, uint32_t ub,
253 aco_ptr<Instruction>& instr,
254 uint32_t def_reg_lo,
255 uint32_t def_reg_hi)
256 {
257
258 /* variables are sorted from small sized to large */
259 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
260 for (std::set<std::pair<unsigned, unsigned>>::reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
261 unsigned id = it->second;
262 std::pair<PhysReg, RegClass> var = ctx.assignments[id];
263 uint32_t size = it->first;
264 uint32_t stride = 1;
265 if (var.second.type() == RegType::sgpr) {
266 if (size == 2)
267 stride = 2;
268 if (size > 3)
269 stride = 4;
270 }
271
272 /* check if this is a dead operand, then we can re-use the space from the definition */
273 bool is_dead_operand = false;
274 for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && i < instr->operands.size(); i++) {
275 if (instr->operands[i].isTemp() && instr->operands[i].isKill() && instr->operands[i].tempId() == id)
276 is_dead_operand = true;
277 }
278
279 std::pair<PhysReg, bool> res;
280 if (is_dead_operand) {
281 if (instr->opcode == aco_opcode::p_create_vector) {
282 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
283 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
284 for (unsigned j = 0; j < size; j++)
285 assert(reg_file[def_reg_lo + offset + j] == 0);
286 res = {PhysReg{def_reg_lo + offset}, true};
287 break;
288 }
289 }
290 } else {
291 res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second);
292 }
293 } else {
294 res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second);
295 if (!res.second) {
296 unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
297 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second);
298 }
299 }
300
301 if (res.second) {
302 /* mark the area as blocked */
303 for (unsigned i = res.first.reg; i < res.first + size; i++)
304 reg_file[i] = 0xFFFFFFFF;
305 /* create parallelcopy pair (without definition id) */
306 Temp tmp = Temp(id, var.second);
307 Operand pc_op = Operand(tmp);
308 pc_op.setFixed(var.first);
309 Definition pc_def = Definition(res.first, pc_op.regClass());
310 parallelcopies.emplace_back(pc_op, pc_def);
311 continue;
312 }
313
314 unsigned best_pos = lb;
315 unsigned num_moves = 0xFF;
316 unsigned num_vars = 0;
317
318 /* we use a sliding window to find potential positions */
319 unsigned reg_lo = lb;
320 unsigned reg_hi = lb + size - 1;
321 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
322 if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
323 (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
324 continue;
325
326 /* second, check that we have at most k=num_moves elements in the window
327 * and no element is larger than the currently processed one */
328 unsigned k = 0;
329 unsigned n = 0;
330 unsigned last_var = 0;
331 bool found = true;
332 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
333 if (reg_file[j] == 0 || reg_file[j] == last_var)
334 continue;
335
336 /* 0xFFFF signals that this area is already blocked! */
337 if (reg_file[j] == 0xFFFFFFFF || k > num_moves) {
338 found = false;
339 break;
340 }
341 /* we cannot split live ranges of linear vgprs */
342 if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
343 found = false;
344 break;
345 }
346 bool is_kill = false;
347 for (const Operand& op : instr->operands) {
348 if (op.isTemp() && op.isKill() && op.tempId() == reg_file[j]) {
349 is_kill = true;
350 break;
351 }
352 }
353 if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) {
354 found = false;
355 break;
356 }
357
358 k += ctx.assignments[reg_file[j]].second.size();
359 last_var = reg_file[j];
360 n++;
361 if (k > num_moves || (k == num_moves && n <= num_vars)) {
362 found = false;
363 break;
364 }
365 }
366
367 if (found) {
368 best_pos = reg_lo;
369 num_moves = k;
370 num_vars = n;
371 }
372 }
373
374 /* FIXME: we messed up and couldn't find space for the variables to be copied */
375 if (num_moves == 0xFF)
376 return false;
377
378 reg_lo = best_pos;
379 reg_hi = best_pos + size - 1;
380
381 /* collect variables and block reg file */
382 std::set<std::pair<unsigned, unsigned>> new_vars;
383 for (unsigned j = reg_lo; j <= reg_hi; j++) {
384 if (reg_file[j] != 0) {
385 unsigned size = ctx.assignments[reg_file[j]].second.size();
386 unsigned id = reg_file[j];
387 new_vars.emplace(size, id);
388 for (unsigned k = 0; k < size; k++)
389 reg_file[ctx.assignments[id].first + k] = 0;
390 }
391 }
392
393 /* mark the area as blocked */
394 for (unsigned i = reg_lo; i <= reg_hi; i++)
395 reg_file[i] = 0xFFFFFFFF;
396
397 if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
398 return false;
399
400 adjust_max_used_regs(ctx, var.second, reg_lo);
401
402 /* create parallelcopy pair (without definition id) */
403 Temp tmp = Temp(id, var.second);
404 Operand pc_op = Operand(tmp);
405 pc_op.setFixed(var.first);
406 Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass());
407 parallelcopies.emplace_back(pc_op, pc_def);
408 }
409
410 return true;
411 }
412
413
414 std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
415 std::array<uint32_t, 512>& reg_file,
416 std::vector<std::pair<Operand, Definition>>& parallelcopies,
417 uint32_t lb, uint32_t ub,
418 uint32_t size, uint32_t stride,
419 RegClass rc,
420 aco_ptr<Instruction>& instr)
421 {
422 unsigned regs_free = 0;
423 /* check how many free regs we have */
424 for (unsigned j = lb; j < ub; j++) {
425 if (reg_file[j] == 0)
426 regs_free++;
427 }
428
429 /* mark and count killed operands */
430 unsigned killed_ops = 0;
431 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
432 if (instr->operands[j].isTemp() &&
433 instr->operands[j].isFirstKill() &&
434 instr->operands[j].physReg() >= lb &&
435 instr->operands[j].physReg() < ub) {
436 assert(instr->operands[j].isFixed());
437 assert(reg_file[instr->operands[j].physReg().reg] == 0);
438 for (unsigned k = 0; k < instr->operands[j].size(); k++)
439 reg_file[instr->operands[j].physReg() + k] = 0xFFFFFFFF;
440 killed_ops += instr->operands[j].getTemp().size();
441 }
442 }
443
444 assert(regs_free >= size);
445 /* we might have to move dead operands to dst in order to make space */
446 unsigned op_moves = 0;
447
448 if (size > (regs_free - killed_ops))
449 op_moves = size - (regs_free - killed_ops);
450
451 /* find the best position to place the definition */
452 unsigned best_pos = lb;
453 unsigned num_moves = 0xFF;
454 unsigned num_vars = 0;
455
456 /* we use a sliding window to check potential positions */
457 unsigned reg_lo = lb;
458 unsigned reg_hi = lb + size - 1;
459 for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
460 /* first check the edges: this is what we have to fix to allow for num_moves > size */
461 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
462 continue;
463 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
464 continue;
465
466 /* second, check that we have at most k=num_moves elements in the window
467 * and no element is larger than the currently processed one */
468 unsigned k = op_moves;
469 unsigned n = 0;
470 unsigned remaining_op_moves = op_moves;
471 unsigned last_var = 0;
472 bool found = true;
473 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
474 for (unsigned j = reg_lo; found && j <= reg_hi; j++) {
475 if (reg_file[j] == 0 || reg_file[j] == last_var)
476 continue;
477
478 /* dead operands effectively reduce the number of estimated moves */
479 if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) {
480 k--;
481 remaining_op_moves--;
482 continue;
483 }
484
485 if (ctx.assignments[reg_file[j]].second.size() >= size) {
486 found = false;
487 break;
488 }
489
490
491 /* we cannot split live ranges of linear vgprs */
492 if (ctx.assignments[reg_file[j]].second & (1 << 6)) {
493 found = false;
494 break;
495 }
496
497 k += ctx.assignments[reg_file[j]].second.size();
498 n++;
499 last_var = reg_file[j];
500 }
501
502 if (!found || k > num_moves)
503 continue;
504 if (k == num_moves && n < num_vars)
505 continue;
506 if (!aligned && k == num_moves && n == num_vars)
507 continue;
508
509 if (found) {
510 best_pos = reg_lo;
511 num_moves = k;
512 num_vars = n;
513 }
514 }
515
516 if (num_moves == 0xFF) {
517 /* remove killed operands from reg_file once again */
518 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
519 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill()) {
520 for (unsigned k = 0; k < instr->operands[i].getTemp().size(); k++)
521 reg_file[instr->operands[i].physReg() + k] = 0;
522 }
523 }
524 for (unsigned i = 0; i < instr->definitions.size(); i++) {
525 Definition def = instr->definitions[i];
526 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
527 for (unsigned k = 0; k < def.getTemp().size(); k++)
528 reg_file[def.physReg() + k] = def.tempId();
529 }
530 }
531 return {{}, false};
532 }
533
534 std::array<uint32_t, 512> register_file = reg_file;
535
536 /* now, we figured the placement for our definition */
537 std::set<std::pair<unsigned, unsigned>> vars;
538 for (unsigned j = best_pos; j < best_pos + size; j++) {
539 if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0)
540 vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]);
541 reg_file[j] = 0;
542 }
543
544 if (instr->opcode == aco_opcode::p_create_vector) {
545 /* move killed operands which aren't yet at the correct position */
546 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
547 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
548 instr->operands[i].getTemp().type() == rc.type()) {
549
550 if (instr->operands[i].physReg() != best_pos + offset) {
551 vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
552 for (unsigned j = 0; j < instr->operands[i].size(); j++)
553 reg_file[instr->operands[i].physReg() + j] = 0;
554 } else {
555 for (unsigned j = 0; j < instr->operands[i].size(); j++)
556 reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId();
557 }
558 }
559 }
560 } else {
561 /* re-enable the killed operands */
562 for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
563 if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill()) {
564 for (unsigned k = 0; k < instr->operands[j].getTemp().size(); k++)
565 reg_file[instr->operands[j].physReg() + k] = instr->operands[j].tempId();
566 }
567 }
568 }
569
570 std::vector<std::pair<Operand, Definition>> pc;
571 if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) {
572 reg_file = std::move(register_file);
573 /* remove killed operands from reg_file once again */
574 if (!is_phi(instr)) {
575 for (const Operand& op : instr->operands) {
576 if (op.isTemp() && op.isFirstKill()) {
577 for (unsigned k = 0; k < op.getTemp().size(); k++)
578 reg_file[op.physReg() + k] = 0;
579 }
580 }
581 }
582 for (unsigned i = 0; i < instr->definitions.size(); i++) {
583 Definition& def = instr->definitions[i];
584 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
585 for (unsigned k = 0; k < def.getTemp().size(); k++)
586 reg_file[def.physReg() + k] = def.tempId();
587 }
588 }
589 return {{}, false};
590 }
591
592 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
593
594 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
595 for (unsigned i = 0; i < size; i++)
596 reg_file[best_pos + i] = 0;
597
598 update_renames(ctx, reg_file, parallelcopies, instr);
599
600 /* remove killed operands from reg_file once again */
601 for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
602 if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed())
603 continue;
604 assert(!instr->operands[i].isUndefined());
605 if (instr->operands[i].isFirstKill()) {
606 for (unsigned j = 0; j < instr->operands[i].getTemp().size(); j++)
607 reg_file[instr->operands[i].physReg() + j] = 0;
608 }
609 }
610 for (unsigned i = 0; i < instr->definitions.size(); i++) {
611 Definition def = instr->definitions[i];
612 if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) {
613 for (unsigned k = 0; k < def.getTemp().size(); k++)
614 reg_file[def.physReg() + k] = def.tempId();
615 }
616 }
617
618 adjust_max_used_regs(ctx, rc, best_pos);
619 return {PhysReg{best_pos}, true};
620 }
621
622 PhysReg get_reg(ra_ctx& ctx,
623 std::array<uint32_t, 512>& reg_file,
624 RegClass rc,
625 std::vector<std::pair<Operand, Definition>>& parallelcopies,
626 aco_ptr<Instruction>& instr)
627 {
628 uint32_t size = rc.size();
629 uint32_t stride = 1;
630 uint32_t lb, ub;
631 if (rc.type() == RegType::vgpr) {
632 lb = 256;
633 ub = 256 + ctx.program->max_reg_demand.vgpr;
634 } else {
635 lb = 0;
636 ub = ctx.program->max_reg_demand.sgpr;
637 if (size == 2)
638 stride = 2;
639 else if (size >= 4)
640 stride = 4;
641 }
642
643 std::pair<PhysReg, bool> res = {{}, false};
644 /* try to find space without live-range splits */
645 if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
646 res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
647 if (!res.second)
648 res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
649 if (res.second)
650 return res.first;
651
652 /* try to find space with live-range splits */
653 res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
654
655 if (res.second)
656 return res.first;
657
658 unsigned regs_free = 0;
659 for (unsigned i = lb; i < ub; i++) {
660 if (!reg_file[i])
661 regs_free++;
662 }
663
664 /* We should only fail here because keeping under the limit would require
665 * too many moves. */
666 assert(regs_free >= size);
667
668 /* try using more registers */
669 uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
670 if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) {
671 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
672 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
673 } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
674 update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
675 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
676 }
677
678 //FIXME: if nothing helps, shift-rotate the registers to make space
679
680 unreachable("did not find a register");
681 }
682
683
684 std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
685 std::array<uint32_t, 512>& reg_file,
686 RegClass rc)
687 {
688 uint32_t size = rc.size();
689 uint32_t stride = 1;
690 uint32_t lb, ub;
691 if (rc.type() == RegType::vgpr) {
692 lb = 256;
693 ub = 256 + ctx.program->max_reg_demand.vgpr;
694 } else {
695 lb = 0;
696 ub = ctx.program->max_reg_demand.sgpr;
697 if (size == 2)
698 stride = 2;
699 else if (size >= 4)
700 stride = 4;
701 }
702 return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
703 }
704
705
706 PhysReg get_reg_create_vector(ra_ctx& ctx,
707 std::array<uint32_t, 512>& reg_file,
708 RegClass rc,
709 std::vector<std::pair<Operand, Definition>>& parallelcopies,
710 aco_ptr<Instruction>& instr)
711 {
712 /* create_vector instructions have different costs w.r.t. register coalescing */
713 uint32_t size = rc.size();
714 uint32_t stride = 1;
715 uint32_t lb, ub;
716 if (rc.type() == RegType::vgpr) {
717 lb = 256;
718 ub = 256 + ctx.program->max_reg_demand.vgpr;
719 } else {
720 lb = 0;
721 ub = ctx.program->max_reg_demand.sgpr;
722 if (size == 2)
723 stride = 2;
724 else if (size >= 4)
725 stride = 4;
726 }
727
728 unsigned best_pos = -1;
729 unsigned num_moves = 0xFF;
730 bool best_war_hint = true;
731
732 /* test for each operand which definition placement causes the least shuffle instructions */
733 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
734 // TODO: think about, if we can alias live operands on the same register
735 if (!instr->operands[i].isTemp() || !instr->operands[i].isKill() || instr->operands[i].getTemp().type() != rc.type())
736 continue;
737
738 if (offset > instr->operands[i].physReg())
739 continue;
740
741 unsigned reg_lo = instr->operands[i].physReg() - offset;
742 unsigned reg_hi = reg_lo + size - 1;
743 unsigned k = 0;
744
745 /* no need to check multiple times */
746 if (reg_lo == best_pos)
747 continue;
748
749 /* check borders */
750 // TODO: this can be improved */
751 if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
752 continue;
753 if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
754 continue;
755 if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
756 continue;
757
758 /* count variables to be moved and check war_hint */
759 bool war_hint = false;
760 for (unsigned j = reg_lo; j <= reg_hi; j++) {
761 if (reg_file[j] != 0)
762 k++;
763 war_hint |= ctx.war_hint[j];
764 }
765
766 /* count operands in wrong positions */
767 for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
768 if (j == i ||
769 !instr->operands[j].isTemp() ||
770 instr->operands[j].getTemp().type() != rc.type())
771 continue;
772 if (instr->operands[j].physReg() != reg_lo + offset)
773 k += instr->operands[j].size();
774 }
775 bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
776 if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint))
777 continue;
778
779 best_pos = reg_lo;
780 num_moves = k;
781 best_war_hint = war_hint;
782 }
783
784 if (num_moves >= size)
785 return get_reg(ctx, reg_file, rc, parallelcopies, instr);
786
787 /* collect variables to be moved */
788 std::set<std::pair<unsigned, unsigned>> vars;
789 for (unsigned i = best_pos; i < best_pos + size; i++) {
790 if (reg_file[i] != 0)
791 vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]);
792 reg_file[i] = 0;
793 }
794
795 /* move killed operands which aren't yet at the correct position */
796 for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
797 if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) {
798 if (instr->operands[i].physReg() != best_pos + offset) {
799 vars.emplace(instr->operands[i].size(), instr->operands[i].tempId());
800 } else {
801 for (unsigned j = 0; j < instr->operands[i].size(); j++)
802 reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId();
803 }
804 }
805 }
806
807 ASSERTED bool success = false;
808 success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
809 assert(success);
810
811 update_renames(ctx, reg_file, parallelcopies, instr);
812 adjust_max_used_regs(ctx, rc, best_pos);
813 return PhysReg{best_pos};
814 }
815
816 bool get_reg_specified(ra_ctx& ctx,
817 std::array<uint32_t, 512>& reg_file,
818 RegClass rc,
819 std::vector<std::pair<Operand, Definition>>& parallelcopies,
820 aco_ptr<Instruction>& instr,
821 PhysReg reg)
822 {
823 uint32_t size = rc.size();
824 uint32_t stride = 1;
825 uint32_t lb, ub;
826
827 if (rc.type() == RegType::vgpr) {
828 lb = 256;
829 ub = 256 + ctx.program->max_reg_demand.vgpr;
830 } else {
831 if (size == 2)
832 stride = 2;
833 else if (size >= 4)
834 stride = 4;
835 if (reg % stride != 0)
836 return false;
837 lb = 0;
838 ub = ctx.program->max_reg_demand.sgpr;
839 }
840
841 uint32_t reg_lo = reg.reg;
842 uint32_t reg_hi = reg + (size - 1);
843
844 if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
845 return false;
846
847 for (unsigned i = reg_lo; i <= reg_hi; i++) {
848 if (reg_file[i] != 0)
849 return false;
850 }
851 adjust_max_used_regs(ctx, rc, reg_lo);
852 return true;
853 }
854
855 void handle_pseudo(ra_ctx& ctx,
856 const std::array<uint32_t, 512>& reg_file,
857 Instruction* instr)
858 {
859 if (instr->format != Format::PSEUDO)
860 return;
861
862 /* all instructions which use handle_operands() need this information */
863 switch (instr->opcode) {
864 case aco_opcode::p_extract_vector:
865 case aco_opcode::p_create_vector:
866 case aco_opcode::p_split_vector:
867 case aco_opcode::p_parallelcopy:
868 case aco_opcode::p_wqm:
869 break;
870 default:
871 return;
872 }
873
874 /* if all definitions are vgpr, no need to care for SCC */
875 bool writes_sgpr = false;
876 for (Definition& def : instr->definitions) {
877 if (def.getTemp().type() == RegType::sgpr) {
878 writes_sgpr = true;
879 break;
880 }
881 }
882 if (!writes_sgpr)
883 return;
884
885 Pseudo_instruction *pi = (Pseudo_instruction *)instr;
886 if (reg_file[scc.reg]) {
887 pi->tmp_in_scc = true;
888
889 int reg = ctx.max_used_sgpr;
890 for (; reg >= 0 && reg_file[reg]; reg--)
891 ;
892 if (reg < 0) {
893 reg = ctx.max_used_sgpr + 1;
894 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
895 ;
896 assert(reg < ctx.program->max_reg_demand.sgpr);
897 }
898
899 adjust_max_used_regs(ctx, s1, reg);
900 pi->scratch_sgpr = PhysReg{(unsigned)reg};
901 } else {
902 pi->tmp_in_scc = false;
903 }
904 }
905
906 bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
907 {
908 switch (instr->format) {
909 case Format::SMEM:
910 return reg != scc &&
911 reg != exec &&
912 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
913 (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */
914 default:
915 // TODO: there are more instructions with restrictions on registers
916 return true;
917 }
918 }
919
920 } /* end namespace */
921
922
923 void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block)
924 {
925 ra_ctx ctx(program);
926
927 std::vector<std::unordered_map<unsigned, Temp>> renames(program->blocks.size());
928
929 struct phi_info {
930 Instruction* phi;
931 unsigned block_idx;
932 std::set<Instruction*> uses;
933 };
934
935 bool filled[program->blocks.size()];
936 bool sealed[program->blocks.size()];
937 memset(filled, 0, sizeof filled);
938 memset(sealed, 0, sizeof sealed);
939 std::vector<std::vector<Instruction*>> incomplete_phis(program->blocks.size());
940 std::map<unsigned, phi_info> phi_map;
941 std::map<unsigned, unsigned> affinities;
942 std::function<Temp(Temp,unsigned)> read_variable;
943 std::function<Temp(Temp,Block*)> handle_live_in;
944 std::function<Temp(std::map<unsigned, phi_info>::iterator)> try_remove_trivial_phi;
945
946 read_variable = [&](Temp val, unsigned block_idx) -> Temp {
947 std::unordered_map<unsigned, Temp>::iterator it = renames[block_idx].find(val.id());
948 assert(it != renames[block_idx].end());
949 return it->second;
950 };
951
952 handle_live_in = [&](Temp val, Block *block) -> Temp {
953 std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
954 if (preds.size() == 0 && block->index != 0) {
955 renames[block->index][val.id()] = val;
956 return val;
957 }
958 assert(preds.size() > 0);
959
960 Temp new_val;
961 if (!sealed[block->index]) {
962 /* consider rename from already processed predecessor */
963 Temp tmp = read_variable(val, preds[0]);
964
965 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
966 new_val = Temp{program->allocateId(), val.regClass()};
967 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
968 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
969 phi->definitions[0] = Definition(new_val);
970 for (unsigned i = 0; i < preds.size(); i++)
971 phi->operands[i] = Operand(val);
972 if (tmp.regClass() == new_val.regClass())
973 affinities[new_val.id()] = tmp.id();
974
975 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
976 incomplete_phis[block->index].emplace_back(phi.get());
977 block->instructions.insert(block->instructions.begin(), std::move(phi));
978
979 } else if (preds.size() == 1) {
980 /* if the block has only one predecessor, just look there for the name */
981 new_val = read_variable(val, preds[0]);
982 } else {
983 /* there are multiple predecessors and the block is sealed */
984 Temp ops[preds.size()];
985
986 /* we start assuming that the name is the same from all predecessors */
987 renames[block->index][val.id()] = val;
988 bool needs_phi = false;
989
990 /* get the rename from each predecessor and check if they are the same */
991 for (unsigned i = 0; i < preds.size(); i++) {
992 ops[i] = read_variable(val, preds[i]);
993 if (i == 0)
994 new_val = ops[i];
995 else
996 needs_phi |= !(new_val == ops[i]);
997 }
998
999 if (needs_phi) {
1000 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1001 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1002 aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1003 new_val = Temp{program->allocateId(), val.regClass()};
1004 phi->definitions[0] = Definition(new_val);
1005 for (unsigned i = 0; i < preds.size(); i++) {
1006 phi->operands[i] = Operand(ops[i]);
1007 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first);
1008 if (ops[i].regClass() == new_val.regClass())
1009 affinities[new_val.id()] = ops[i].id();
1010 }
1011 phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index});
1012 block->instructions.insert(block->instructions.begin(), std::move(phi));
1013 }
1014 }
1015
1016 renames[block->index][val.id()] = new_val;
1017 renames[block->index][new_val.id()] = new_val;
1018 ctx.orig_names[new_val.id()] = val;
1019 return new_val;
1020 };
1021
   /* Check whether the phi referenced by 'info' is trivial — i.e. all operands
    * resolve to the same value as the phi's definition — and if so remove it,
    * rerouting every use to that value. Removing one phi can make phis that
    * used it trivial in turn, so those are processed recursively. Returns the
    * name that now stands for the phi's original variable. */
   try_remove_trivial_phi = [&] (std::map<unsigned, phi_info>::iterator info) -> Temp {
      assert(info->second.block_idx != 0);
      Instruction* phi = info->second.phi;
      Temp same = Temp();

      Definition def = phi->definitions[0];
      /* a phi node is trivial if all operands are the same as the definition of the phi */
      for (const Operand& op : phi->operands) {
         const Temp t = op.getTemp();
         if (t == same || t == def.getTemp())
            continue;
         if (!(same == Temp()) || !(op.physReg() == def.physReg())) {
            /* phi is not trivial */
            return def.getTemp();
         }
         same = t;
      }
      assert(!(same == Temp() || same == def.getTemp()));

      /* reroute all uses to same and remove phi */
      std::vector<std::map<unsigned, phi_info>::iterator> phi_users;
      std::map<unsigned, phi_info>::iterator same_phi_info = phi_map.find(same.id());
      for (Instruction* instr : info->second.uses) {
         assert(phi != instr);
         /* recursively try to remove trivial phis */
         if (is_phi(instr)) {
            /* ignore if the phi was already flagged trivial */
            if (instr->definitions.empty())
               continue;

            std::map<unsigned, phi_info>::iterator it = phi_map.find(instr->definitions[0].tempId());
            if (it != phi_map.end() && it != info)
               phi_users.emplace_back(it);
         }
         /* replace uses of the phi's definition by 'same', and make those
          * instructions users of 'same''s phi (if 'same' is itself a phi) */
         for (Operand& op : instr->operands) {
            if (op.isTemp() && op.tempId() == def.tempId()) {
               op.setTemp(same);
               if (same_phi_info != phi_map.end())
                  same_phi_info->second.uses.emplace(instr);
            }
         }
      }

      /* update the rename tables: any block that currently maps the original
       * variable to the removed phi's definition now maps it to 'same' */
      auto it = ctx.orig_names.find(same.id());
      unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id();
      for (unsigned i = 0; i < program->blocks.size(); i++) {
         auto it = renames[i].find(orig_var);
         if (it != renames[i].end() && it->second == def.getTemp())
            renames[i][orig_var] = same;
      }

      unsigned block_idx = info->second.block_idx;
      phi->definitions.clear(); /* this indicates that the phi can be removed */
      phi_map.erase(info);
      /* NOTE(review): a recursive call can erase phi_map entries that are still
       * queued later in phi_users; this relies on those iterators remaining
       * valid (std::map erase only invalidates the erased iterator) — verify
       * that no queued entry can be erased by a deeper recursion. */
      for (auto it : phi_users) {
         if (sealed[it->second.block_idx])
            try_remove_trivial_phi(it);
      }

      /* due to the removal of other phis, the name might have changed once again! */
      return renames[block_idx][orig_var];
   };
1084
   /* temp-id -> the p_create_vector instruction that consumes it (used later to
    * try to place such temps at their position inside the vector) */
   std::map<unsigned, Instruction*> vectors;
   /* groups of temps that should preferably share a register; element 0 of each
    * group is the representative the others get an affinity towards */
   std::vector<std::vector<Temp>> phi_ressources;
   /* temp-id -> index of its group in phi_ressources */
   std::map<unsigned, unsigned> temp_to_phi_ressources;

   /* Walk the program bottom-up to extend live_out_per_block into full
    * per-block live sets and to collect phi-affinity groups. */
   for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) {
      Block& block = *it;

      /* first, compute the death points of all live vars within the block */
      std::set<Temp>& live = live_out_per_block[block.index];

      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         if (!is_phi(instr)) {
            /* operands become live above this instruction */
            for (const Operand& op : instr->operands) {
               if (op.isTemp())
                  live.emplace(op.getTemp());
            }
            /* remember which vector each same-type operand feeds into */
            if (instr->opcode == aco_opcode::p_create_vector) {
               for (const Operand& op : instr->operands) {
                  if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
                     vectors[op.tempId()] = instr.get();
               }
            }
         } else if (!instr->definitions[0].isKill() && !instr->definitions[0].isFixed()) {
            /* collect information about affinity-related temporaries */
            std::vector<Temp> affinity_related;
            /* affinity_related[0] is the last seen affinity-related temp */
            affinity_related.emplace_back(instr->definitions[0].getTemp());
            affinity_related.emplace_back(instr->definitions[0].getTemp());
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) {
                  affinity_related.emplace_back(op.getTemp());
                  temp_to_phi_ressources[op.tempId()] = phi_ressources.size();
               }
            }
            phi_ressources.emplace_back(std::move(affinity_related));
         }

         /* erase from live */
         for (const Definition& def : instr->definitions) {
            if (def.isTemp()) {
               live.erase(def.getTemp());
               /* if this def belongs to an affinity group of matching reg
                * class, make it the group's representative */
               std::map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
               if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass())
                  phi_ressources[it->second][0] = def.getTemp();
            }
         }
      }
   }
   /* create affinities */
   for (std::vector<Temp>& vec : phi_ressources) {
      assert(vec.size() > 1);
      for (unsigned i = 1; i < vec.size(); i++)
         if (vec[i].id() != vec[0].id())
            affinities[vec[i].id()] = vec[0].id();
   }

   /* state of register file after phis */
   std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
1145
1146 for (Block& block : program->blocks) {
1147 std::set<Temp>& live = live_out_per_block[block.index];
1148 /* initialize register file */
1149 assert(block.index != 0 || live.empty());
1150 std::array<uint32_t, 512> register_file = {0};
1151 ctx.war_hint.reset();
1152
1153 for (Temp t : live) {
1154 Temp renamed = handle_live_in(t, &block);
1155 if (ctx.assignments.find(renamed.id()) != ctx.assignments.end()) {
1156 for (unsigned i = 0; i < t.size(); i++)
1157 register_file[ctx.assignments[renamed.id()].first + i] = renamed.id();
1158 }
1159 }
1160
1161 std::vector<aco_ptr<Instruction>> instructions;
1162 std::vector<aco_ptr<Instruction>>::iterator it;
1163
1164 /* this is a slight adjustment from the paper as we already have phi nodes:
1165 * We consider them incomplete phis and only handle the definition. */
1166
1167 /* handle fixed phi definitions */
1168 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1169 aco_ptr<Instruction>& phi = *it;
1170 if (!is_phi(phi))
1171 break;
1172 Definition& definition = phi->definitions[0];
1173 if (!definition.isFixed())
1174 continue;
1175
1176 /* check if a dead exec mask phi is needed */
1177 if (definition.isKill()) {
1178 for (Operand& op : phi->operands) {
1179 assert(op.isTemp());
1180 if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1181 ctx.assignments[op.tempId()].first != exec) {
1182 definition.setKill(false);
1183 break;
1184 }
1185 }
1186 }
1187
1188 if (definition.isKill())
1189 continue;
1190
1191 assert(definition.physReg() == exec);
1192 for (unsigned i = 0; i < definition.size(); i++) {
1193 assert(register_file[definition.physReg() + i] == 0);
1194 register_file[definition.physReg() + i] = definition.tempId();
1195 }
1196 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1197 }
1198
1199 /* look up the affinities */
1200 for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
1201 aco_ptr<Instruction>& phi = *it;
1202 if (!is_phi(phi))
1203 break;
1204 Definition& definition = phi->definitions[0];
1205 if (definition.isKill() || definition.isFixed())
1206 continue;
1207
1208 if (affinities.find(definition.tempId()) != affinities.end() &&
1209 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1210 assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass());
1211 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1212 bool try_use_special_reg = reg == scc || reg == exec;
1213 if (try_use_special_reg) {
1214 for (const Operand& op : phi->operands) {
1215 if (!op.isTemp() ||
1216 ctx.assignments.find(op.tempId()) == ctx.assignments.end() ||
1217 !(ctx.assignments[op.tempId()].first == reg)) {
1218 try_use_special_reg = false;
1219 break;
1220 }
1221 }
1222 if (!try_use_special_reg)
1223 continue;
1224 }
1225 bool reg_free = true;
1226 for (unsigned i = reg.reg; reg_free && i < reg + definition.size(); i++) {
1227 if (register_file[i] != 0)
1228 reg_free = false;
1229 }
1230 /* only assign if register is still free */
1231 if (reg_free) {
1232 definition.setFixed(reg);
1233 for (unsigned i = 0; i < definition.size(); i++)
1234 register_file[definition.physReg() + i] = definition.tempId();
1235 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1236 }
1237 }
1238 }
1239
1240 /* find registers for phis without affinity or where the register was blocked */
1241 for (it = block.instructions.begin();it != block.instructions.end(); ++it) {
1242 aco_ptr<Instruction>& phi = *it;
1243 if (!is_phi(phi))
1244 break;
1245
1246 Definition& definition = phi->definitions[0];
1247 if (definition.isKill())
1248 continue;
1249
1250 renames[block.index][definition.tempId()] = definition.getTemp();
1251
1252 if (!definition.isFixed()) {
1253 std::vector<std::pair<Operand, Definition>> parallelcopy;
1254 /* try to find a register that is used by at least one operand */
1255 for (const Operand& op : phi->operands) {
1256 if (!op.isTemp() ||
1257 ctx.assignments.find(op.tempId()) == ctx.assignments.end())
1258 continue;
1259 PhysReg reg = ctx.assignments[op.tempId()].first;
1260 /* we tried this already on the previous loop */
1261 if (reg == scc || reg == exec)
1262 continue;
1263 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) {
1264 definition.setFixed(reg);
1265 break;
1266 }
1267 }
1268 if (!definition.isFixed())
1269 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi));
1270
1271 /* process parallelcopy */
1272 for (std::pair<Operand, Definition> pc : parallelcopy) {
1273 /* rename */
1274 std::map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId());
1275 Temp orig = pc.first.getTemp();
1276 if (orig_it != ctx.orig_names.end())
1277 orig = orig_it->second;
1278 else
1279 ctx.orig_names[pc.second.tempId()] = orig;
1280 renames[block.index][orig.id()] = pc.second.getTemp();
1281 renames[block.index][pc.second.tempId()] = pc.second.getTemp();
1282
1283 /* see if it's a copy from a previous phi */
1284 //TODO: prefer moving some previous phis over live-ins
1285 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1286 Instruction *prev_phi = NULL;
1287 for (auto it2 = instructions.begin(); it2 != instructions.end(); ++it2) {
1288 if ((*it2)->definitions[0].tempId() == pc.first.tempId())
1289 prev_phi = it2->get();
1290 }
1291 if (prev_phi) {
1292 /* if so, just update that phi */
1293 prev_phi->definitions[0] = pc.second;
1294 continue;
1295 }
1296
1297 /* otherwise, this is a live-in and we need to create a new phi
1298 * to move it in this block's predecessors */
1299 aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
1300 std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
1301 aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
1302 new_phi->definitions[0] = pc.second;
1303 for (unsigned i = 0; i < preds.size(); i++)
1304 new_phi->operands[i] = Operand(pc.first);
1305 instructions.emplace_back(std::move(new_phi));
1306 }
1307
1308 for (unsigned i = 0; i < definition.size(); i++)
1309 register_file[definition.physReg() + i] = definition.tempId();
1310 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1311 }
1312 live.emplace(definition.getTemp());
1313
1314 /* update phi affinities */
1315 for (const Operand& op : phi->operands) {
1316 if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())
1317 affinities[op.tempId()] = definition.tempId();
1318 }
1319
1320 instructions.emplace_back(std::move(*it));
1321 }
1322
1323 /* fill in sgpr_live_in */
1324 for (unsigned i = 0; i < ctx.max_used_sgpr; i++)
1325 sgpr_live_in[block.index][i] = register_file[i];
1326 sgpr_live_in[block.index][127] = register_file[scc.reg];
1327
1328 /* Handle all other instructions of the block */
1329 for (; it != block.instructions.end(); ++it) {
1330 aco_ptr<Instruction>& instr = *it;
1331
1332 /* parallelcopies from p_phi are inserted here which means
1333 * live ranges of killed operands end here as well */
1334 if (instr->opcode == aco_opcode::p_logical_end) {
1335 /* no need to process this instruction any further */
1336 if (block.logical_succs.size() != 1) {
1337 instructions.emplace_back(std::move(instr));
1338 continue;
1339 }
1340
1341 Block& succ = program->blocks[block.logical_succs[0]];
1342 unsigned idx = 0;
1343 for (; idx < succ.logical_preds.size(); idx++) {
1344 if (succ.logical_preds[idx] == block.index)
1345 break;
1346 }
1347 for (aco_ptr<Instruction>& phi : succ.instructions) {
1348 if (phi->opcode == aco_opcode::p_phi) {
1349 if (phi->operands[idx].isTemp() &&
1350 phi->operands[idx].getTemp().type() == RegType::sgpr &&
1351 phi->operands[idx].isFirstKill()) {
1352 Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index);
1353 PhysReg reg = ctx.assignments[phi_op.id()].first;
1354 assert(register_file[reg] == phi_op.id());
1355 register_file[reg] = 0;
1356 }
1357 } else if (phi->opcode != aco_opcode::p_linear_phi) {
1358 break;
1359 }
1360 }
1361 instructions.emplace_back(std::move(instr));
1362 continue;
1363 }
1364
1365 std::vector<std::pair<Operand, Definition>> parallelcopy;
1366
1367 assert(!is_phi(instr));
1368
1369 /* handle operands */
1370 for (unsigned i = 0; i < instr->operands.size(); ++i) {
1371 auto& operand = instr->operands[i];
1372 if (!operand.isTemp())
1373 continue;
1374
1375 /* rename operands */
1376 operand.setTemp(read_variable(operand.getTemp(), block.index));
1377
1378 /* check if the operand is fixed */
1379 if (operand.isFixed()) {
1380
1381 if (operand.physReg() == ctx.assignments[operand.tempId()].first) {
1382 /* we are fine: the operand is already assigned the correct reg */
1383
1384 } else {
1385 /* check if target reg is blocked, and move away the blocking var */
1386 if (register_file[operand.physReg().reg]) {
1387 uint32_t blocking_id = register_file[operand.physReg().reg];
1388 RegClass rc = ctx.assignments[blocking_id].second;
1389 Operand pc_op = Operand(Temp{blocking_id, rc});
1390 pc_op.setFixed(operand.physReg());
1391 Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()});
1392 /* find free reg */
1393 PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr);
1394 pc_def.setFixed(reg);
1395 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1396 for (unsigned i = 0; i < operand.size(); i++) {
1397 register_file[pc_op.physReg() + i] = 0;
1398 register_file[pc_def.physReg() + i] = pc_def.tempId();
1399 }
1400 parallelcopy.emplace_back(pc_op, pc_def);
1401
1402 /* handle renames of previous operands */
1403 for (unsigned j = 0; j < i; j++) {
1404 Operand& op = instr->operands[j];
1405 if (op.isTemp() && op.tempId() == blocking_id) {
1406 op = Operand(pc_def.getTemp());
1407 op.setFixed(reg);
1408 }
1409 }
1410 }
1411 /* move operand to fixed reg and create parallelcopy pair */
1412 Operand pc_op = operand;
1413 Temp tmp = Temp{program->allocateId(), operand.regClass()};
1414 Definition pc_def = Definition(tmp);
1415 pc_def.setFixed(operand.physReg());
1416 pc_op.setFixed(ctx.assignments[operand.tempId()].first);
1417 operand.setTemp(tmp);
1418 ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()};
1419 operand.setFixed(pc_def.physReg());
1420 for (unsigned i = 0; i < operand.size(); i++) {
1421 register_file[pc_op.physReg() + i] = 0;
1422 register_file[pc_def.physReg() + i] = tmp.id();
1423 }
1424 parallelcopy.emplace_back(pc_op, pc_def);
1425 }
1426 } else {
1427 assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end());
1428 PhysReg reg = ctx.assignments[operand.tempId()].first;
1429
1430 if (operand_can_use_reg(instr, i, reg)) {
1431 operand.setFixed(ctx.assignments[operand.tempId()].first);
1432 } else {
1433 Operand pc_op = operand;
1434 pc_op.setFixed(reg);
1435 PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr);
1436 Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass());
1437 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1438 for (unsigned i = 0; i < operand.size(); i++) {
1439 register_file[pc_op.physReg() + i] = 0;
1440 register_file[pc_def.physReg() + i] = pc_def.tempId();
1441 }
1442 parallelcopy.emplace_back(pc_op, pc_def);
1443 operand.setFixed(new_reg);
1444 }
1445
1446 if (instr->format == Format::EXP ||
1447 (instr->isVMEM() && i == 3 && program->chip_class == GFX6) ||
1448 (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) {
1449 for (unsigned j = 0; j < operand.size(); j++)
1450 ctx.war_hint.set(operand.physReg().reg + j);
1451 }
1452 }
1453 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1454 if (phi != phi_map.end())
1455 phi->second.uses.emplace(instr.get());
1456
1457 }
1458 /* remove dead vars from register file */
1459 for (const Operand& op : instr->operands) {
1460 if (op.isTemp() && op.isFirstKill())
1461 for (unsigned j = 0; j < op.size(); j++)
1462 register_file[op.physReg() + j] = 0;
1463 }
1464
1465 /* try to optimize v_mad_f32 -> v_mac_f32 */
1466 if (instr->opcode == aco_opcode::v_mad_f32 &&
1467 instr->operands[2].isTemp() &&
1468 instr->operands[2].isKill() &&
1469 instr->operands[2].getTemp().type() == RegType::vgpr &&
1470 instr->operands[1].isTemp() &&
1471 instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
1472 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
1473 bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
1474 vop3->opsel[0] || vop3->opsel[1] || vop3->opsel[2] ||
1475 vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
1476 vop3->clamp || vop3->omod);
1477 if (can_use_mac) {
1478 instr->format = Format::VOP2;
1479 instr->opcode = aco_opcode::v_mac_f32;
1480 }
1481 }
1482
1483 /* handle definitions which must have the same register as an operand */
1484 if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
1485 instr->opcode == aco_opcode::v_mac_f32 ||
1486 instr->opcode == aco_opcode::v_writelane_b32) {
1487 instr->definitions[0].setFixed(instr->operands[2].physReg());
1488 } else if (instr->opcode == aco_opcode::s_addk_i32 ||
1489 instr->opcode == aco_opcode::s_mulk_i32) {
1490 instr->definitions[0].setFixed(instr->operands[0].physReg());
1491 } else if ((instr->format == Format::MUBUF ||
1492 instr->format == Format::MIMG) &&
1493 instr->definitions.size() == 1 &&
1494 instr->operands.size() == 4) {
1495 instr->definitions[0].setFixed(instr->operands[3].physReg());
1496 }
1497
1498 ctx.defs_done.reset();
1499
1500 /* handle fixed definitions first */
1501 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1502 auto& definition = instr->definitions[i];
1503 if (!definition.isFixed())
1504 continue;
1505
1506 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
1507 /* check if the target register is blocked */
1508 if (register_file[definition.physReg().reg] != 0) {
1509 /* create parallelcopy pair to move blocking var */
1510 Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second};
1511 Operand pc_op = Operand(tmp);
1512 pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first);
1513 RegClass rc = pc_op.regClass();
1514 tmp = Temp{program->allocateId(), rc};
1515 Definition pc_def = Definition(tmp);
1516
1517 /* re-enable the killed operands, so that we don't move the blocking var there */
1518 for (const Operand& op : instr->operands) {
1519 if (op.isTemp() && op.isFirstKill())
1520 for (unsigned j = 0; j < op.size(); j++)
1521 register_file[op.physReg() + j] = 0xFFFF;
1522 }
1523
1524 /* find a new register for the blocking variable */
1525 PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr);
1526 /* once again, disable killed operands */
1527 for (const Operand& op : instr->operands) {
1528 if (op.isTemp() && op.isFirstKill())
1529 for (unsigned j = 0; j < op.size(); j++)
1530 register_file[op.physReg() + j] = 0;
1531 }
1532 for (unsigned k = 0; k < i; k++) {
1533 if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
1534 for (unsigned j = 0; j < instr->definitions[k].size(); j++)
1535 register_file[instr->definitions[k].physReg() + j] = instr->definitions[k].tempId();
1536 }
1537 pc_def.setFixed(reg);
1538
1539 /* finish assignment of parallelcopy */
1540 ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()};
1541 parallelcopy.emplace_back(pc_op, pc_def);
1542
1543 /* add changes to reg_file */
1544 for (unsigned i = 0; i < pc_op.size(); i++) {
1545 register_file[pc_op.physReg() + i] = 0x0;
1546 register_file[pc_def.physReg() + i] = pc_def.tempId();
1547 }
1548 }
1549 ctx.defs_done.set(i);
1550
1551 if (!definition.isTemp())
1552 continue;
1553
1554 /* set live if it has a kill point */
1555 if (!definition.isKill())
1556 live.emplace(definition.getTemp());
1557
1558 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1559 renames[block.index][definition.tempId()] = definition.getTemp();
1560 for (unsigned j = 0; j < definition.size(); j++)
1561 register_file[definition.physReg() + j] = definition.tempId();
1562 }
1563
1564 /* handle all other definitions */
1565 for (unsigned i = 0; i < instr->definitions.size(); ++i) {
1566 auto& definition = instr->definitions[i];
1567
1568 if (definition.isFixed() || !definition.isTemp())
1569 continue;
1570
1571 /* find free reg */
1572 if (definition.hasHint() && register_file[definition.physReg().reg] == 0)
1573 definition.setFixed(definition.physReg());
1574 else if (instr->opcode == aco_opcode::p_split_vector) {
1575 PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()};
1576 if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1577 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1578 definition.setFixed(reg);
1579 } else if (instr->opcode == aco_opcode::p_wqm) {
1580 PhysReg reg;
1581 if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1582 reg = instr->operands[0].physReg();
1583 assert(register_file[reg.reg] == 0);
1584 } else {
1585 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1586 }
1587 definition.setFixed(reg);
1588 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1589 PhysReg reg;
1590 if (instr->operands[0].isKill() &&
1591 instr->operands[0].getTemp().type() == definition.getTemp().type()) {
1592 reg = instr->operands[0].physReg();
1593 reg.reg += definition.size() * instr->operands[1].constantValue();
1594 assert(register_file[reg.reg] == 0);
1595 } else {
1596 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1597 }
1598 definition.setFixed(reg);
1599 } else if (instr->opcode == aco_opcode::p_create_vector) {
1600 PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(),
1601 parallelcopy, instr);
1602 definition.setFixed(reg);
1603 } else if (affinities.find(definition.tempId()) != affinities.end() &&
1604 ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) {
1605 PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first;
1606 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
1607 definition.setFixed(reg);
1608 else
1609 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1610
1611 } else if (vectors.find(definition.tempId()) != vectors.end()) {
1612 Instruction* vec = vectors[definition.tempId()];
1613 unsigned offset = 0;
1614 for (const Operand& op : vec->operands) {
1615 if (op.isTemp() && op.tempId() == definition.tempId())
1616 break;
1617 else
1618 offset += op.size();
1619 }
1620 unsigned k = 0;
1621 for (const Operand& op : vec->operands) {
1622 if (op.isTemp() &&
1623 op.tempId() != definition.tempId() &&
1624 op.getTemp().type() == definition.getTemp().type() &&
1625 ctx.assignments.find(op.tempId()) != ctx.assignments.end()) {
1626 PhysReg reg = ctx.assignments[op.tempId()].first;
1627 reg.reg = reg - k + offset;
1628 if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
1629 definition.setFixed(reg);
1630 break;
1631 }
1632 }
1633 k += op.size();
1634 }
1635 if (!definition.isFixed()) {
1636 std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
1637 PhysReg reg = res.first;
1638 if (res.second) {
1639 reg.reg += offset;
1640 } else {
1641 reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr);
1642 }
1643 definition.setFixed(reg);
1644 }
1645 } else
1646 definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr));
1647
1648 assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
1649 (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
1650 ctx.defs_done.set(i);
1651
1652 /* set live if it has a kill point */
1653 if (!definition.isKill())
1654 live.emplace(definition.getTemp());
1655
1656 ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
1657 renames[block.index][definition.tempId()] = definition.getTemp();
1658 for (unsigned j = 0; j < definition.size(); j++)
1659 register_file[definition.physReg() + j] = definition.tempId();
1660 }
1661
1662 handle_pseudo(ctx, register_file, instr.get());
1663
1664 /* kill definitions */
1665 for (const Definition& def : instr->definitions) {
1666 if (def.isTemp() && def.isKill()) {
1667 for (unsigned j = 0; j < def.size(); j++) {
1668 register_file[def.physReg() + j] = 0;
1669 }
1670 }
1671 }
1672
1673 /* emit parallelcopy */
1674 if (!parallelcopy.empty()) {
1675 aco_ptr<Pseudo_instruction> pc;
1676 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size()));
1677 bool temp_in_scc = register_file[scc.reg];
1678 bool sgpr_operands_alias_defs = false;
1679 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
1680 for (unsigned i = 0; i < parallelcopy.size(); i++) {
1681 if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
1682 if (!sgpr_operands_alias_defs) {
1683 unsigned reg = parallelcopy[i].first.physReg().reg;
1684 unsigned size = parallelcopy[i].first.getTemp().size();
1685 sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
1686
1687 reg = parallelcopy[i].second.physReg().reg;
1688 size = parallelcopy[i].second.getTemp().size();
1689 if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
1690 sgpr_operands_alias_defs = true;
1691 }
1692 }
1693
1694 pc->operands[i] = parallelcopy[i].first;
1695 pc->definitions[i] = parallelcopy[i].second;
1696
1697 /* it might happen that the operand is already renamed. we have to restore the original name. */
1698 std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
1699 if (it != ctx.orig_names.end())
1700 pc->operands[i].setTemp(it->second);
1701 unsigned orig_id = pc->operands[i].tempId();
1702 ctx.orig_names[pc->definitions[i].tempId()] = pc->operands[i].getTemp();
1703
1704 pc->operands[i].setTemp(read_variable(pc->operands[i].getTemp(), block.index));
1705 renames[block.index][orig_id] = pc->definitions[i].getTemp();
1706 renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp();
1707 std::map<unsigned, phi_info>::iterator phi = phi_map.find(pc->operands[i].tempId());
1708 if (phi != phi_map.end())
1709 phi->second.uses.emplace(pc.get());
1710 }
1711
1712 if (temp_in_scc && sgpr_operands_alias_defs) {
1713 /* disable definitions and re-enable operands */
1714 for (const Definition& def : instr->definitions) {
1715 if (def.isTemp() && !def.isKill()) {
1716 for (unsigned j = 0; j < def.size(); j++) {
1717 register_file[def.physReg() + j] = 0x0;
1718 }
1719 }
1720 }
1721 for (const Operand& op : instr->operands) {
1722 if (op.isTemp() && op.isFirstKill()) {
1723 for (unsigned j = 0; j < op.size(); j++)
1724 register_file[op.physReg() + j] = 0xFFFF;
1725 }
1726 }
1727
1728 handle_pseudo(ctx, register_file, pc.get());
1729
1730 /* re-enable live vars */
1731 for (const Operand& op : instr->operands) {
1732 if (op.isTemp() && op.isFirstKill())
1733 for (unsigned j = 0; j < op.size(); j++)
1734 register_file[op.physReg() + j] = 0x0;
1735 }
1736 for (const Definition& def : instr->definitions) {
1737 if (def.isTemp() && !def.isKill()) {
1738 for (unsigned j = 0; j < def.size(); j++) {
1739 register_file[def.physReg() + j] = def.tempId();
1740 }
1741 }
1742 }
1743 } else {
1744 pc->tmp_in_scc = false;
1745 }
1746
1747 instructions.emplace_back(std::move(pc));
1748 }
1749
1750 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1751 bool instr_needs_vop3 = !instr->isVOP3() &&
1752 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
1753 (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) ||
1754 ((instr->opcode == aco_opcode::v_add_co_u32 ||
1755 instr->opcode == aco_opcode::v_addc_co_u32 ||
1756 instr->opcode == aco_opcode::v_sub_co_u32 ||
1757 instr->opcode == aco_opcode::v_subb_co_u32 ||
1758 instr->opcode == aco_opcode::v_subrev_co_u32 ||
1759 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1760 !(instr->definitions[1].physReg() == vcc)) ||
1761 ((instr->opcode == aco_opcode::v_addc_co_u32 ||
1762 instr->opcode == aco_opcode::v_subb_co_u32 ||
1763 instr->opcode == aco_opcode::v_subbrev_co_u32) &&
1764 !(instr->operands[2].physReg() == vcc)));
1765 if (instr_needs_vop3) {
1766
1767 /* if the first operand is a literal, we have to move it to a reg */
1768 if (instr->operands.size() && instr->operands[0].isLiteral()) {
1769 bool can_sgpr = true;
1770 /* check, if we have to move to vgpr */
1771 for (const Operand& op : instr->operands) {
1772 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
1773 can_sgpr = false;
1774 break;
1775 }
1776 }
1777 aco_ptr<Instruction> mov;
1778 if (can_sgpr)
1779 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1));
1780 else
1781 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1));
1782 mov->operands[0] = instr->operands[0];
1783 Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
1784 mov->definitions[0] = Definition(tmp);
1785 /* disable definitions and re-enable operands */
1786 for (const Definition& def : instr->definitions) {
1787 for (unsigned j = 0; j < def.size(); j++) {
1788 register_file[def.physReg() + j] = 0x0;
1789 }
1790 }
1791 for (const Operand& op : instr->operands) {
1792 if (op.isTemp() && op.isFirstKill()) {
1793 for (unsigned j = 0; j < op.size(); j++)
1794 register_file[op.physReg() + j] = 0xFFFF;
1795 }
1796 }
1797 mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov));
1798 instr->operands[0] = Operand(tmp);
1799 instr->operands[0].setFixed(mov->definitions[0].physReg());
1800 instructions.emplace_back(std::move(mov));
1801 /* re-enable live vars */
1802 for (const Operand& op : instr->operands) {
1803 if (op.isTemp() && op.isFirstKill())
1804 for (unsigned j = 0; j < op.size(); j++)
1805 register_file[op.physReg() + j] = 0x0;
1806 }
1807 for (const Definition& def : instr->definitions) {
1808 if (def.isTemp() && !def.isKill()) {
1809 for (unsigned j = 0; j < def.size(); j++) {
1810 register_file[def.physReg() + j] = def.tempId();
1811 }
1812 }
1813 }
1814 }
1815
1816 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1817 aco_ptr<Instruction> tmp = std::move(instr);
1818 Format format = asVOP3(tmp->format);
1819 instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
1820 for (unsigned i = 0; i < instr->operands.size(); i++) {
1821 Operand& operand = tmp->operands[i];
1822 instr->operands[i] = operand;
1823 /* keep phi_map up to date */
1824 if (operand.isTemp()) {
1825 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.tempId());
1826 if (phi != phi_map.end()) {
1827 phi->second.uses.erase(tmp.get());
1828 phi->second.uses.emplace(instr.get());
1829 }
1830 }
1831 }
1832 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
1833 }
1834 instructions.emplace_back(std::move(*it));
1835
1836 } /* end for Instr */
1837
1838 block.instructions = std::move(instructions);
1839
1840 filled[block.index] = true;
1841 for (unsigned succ_idx : block.linear_succs) {
1842 Block& succ = program->blocks[succ_idx];
1843 /* seal block if all predecessors are filled */
1844 bool all_filled = true;
1845 for (unsigned pred_idx : succ.linear_preds) {
1846 if (!filled[pred_idx]) {
1847 all_filled = false;
1848 break;
1849 }
1850 }
1851 if (all_filled) {
1852 /* finish incomplete phis and check if they became trivial */
1853 for (Instruction* phi : incomplete_phis[succ_idx]) {
1854 std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds;
1855 for (unsigned i = 0; i < phi->operands.size(); i++) {
1856 phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i]));
1857 phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first);
1858 }
1859 try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId()));
1860 }
1861 /* complete the original phi nodes, but no need to check triviality */
1862 for (aco_ptr<Instruction>& instr : succ.instructions) {
1863 if (!is_phi(instr))
1864 break;
1865 std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds;
1866
1867 for (unsigned i = 0; i < instr->operands.size(); i++) {
1868 auto& operand = instr->operands[i];
1869 if (!operand.isTemp())
1870 continue;
1871 operand.setTemp(read_variable(operand.getTemp(), preds[i]));
1872 operand.setFixed(ctx.assignments[operand.tempId()].first);
1873 std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id());
1874 if (phi != phi_map.end())
1875 phi->second.uses.emplace(instr.get());
1876 }
1877 }
1878 sealed[succ_idx] = true;
1879 }
1880 }
1881 } /* end for BB */
1882
1883 /* remove trivial phis */
1884 for (Block& block : program->blocks) {
1885 auto end = std::find_if(block.instructions.begin(), block.instructions.end(),
1886 [](aco_ptr<Instruction>& instr) { return !is_phi(instr);});
1887 auto middle = std::remove_if(block.instructions.begin(), end,
1888 [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();});
1889 block.instructions.erase(middle, end);
1890 }
1891
1892 /* find scc spill registers which may be needed for parallelcopies created by phis */
1893 for (Block& block : program->blocks) {
1894 if (block.linear_preds.size() <= 1)
1895 continue;
1896
1897 std::bitset<128> regs = sgpr_live_in[block.index];
1898 if (!regs[127])
1899 continue;
1900
1901 /* choose a register */
1902 int16_t reg = 0;
1903 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
1904 ;
1905 assert(reg < ctx.program->max_reg_demand.sgpr);
1906 adjust_max_used_regs(ctx, s1, reg);
1907
1908 /* update predecessors */
1909 for (unsigned& pred_index : block.linear_preds) {
1910 Block& pred = program->blocks[pred_index];
1911 pred.scc_live_out = true;
1912 pred.scratch_sgpr = PhysReg{(uint16_t)reg};
1913 }
1914 }
1915
1916 /* num_gpr = rnd_up(max_used_gpr + 1) */
1917 program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3;
1918 if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) {
1919 assert(ctx.max_used_sgpr <= 93);
1920 ctx.max_used_sgpr = 93; /* workaround hardware bug */
1921 }
1922 program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */
1923 }
1924
1925 }