2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
31 #include <unordered_map>
41 std::bitset
<512> war_hint
;
43 std::unordered_map
<unsigned, std::pair
<PhysReg
, RegClass
>> assignments
;
44 std::map
<unsigned, Temp
> orig_names
;
45 unsigned max_used_sgpr
= 0;
46 unsigned max_used_vgpr
= 0;
47 std::bitset
<64> defs_done
; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
49 ra_ctx(Program
* program
) : program(program
) {}
53 /* helper function for debugging */
55 void print_regs(ra_ctx
& ctx
, bool vgprs
, std::array
<uint32_t, 512>& reg_file
)
57 unsigned max
= vgprs
? ctx
.program
->max_reg_demand
.vgpr
: ctx
.program
->max_reg_demand
.sgpr
;
58 unsigned lb
= vgprs
? 256 : 0;
59 unsigned ub
= lb
+ max
;
60 char reg_char
= vgprs
? 'v' : 's';
64 for (unsigned i
= lb
; i
< ub
; i
+= 3) {
65 printf("%.2u ", i
- lb
);
70 printf("%cgprs: ", reg_char
);
71 unsigned free_regs
= 0;
73 bool char_select
= false;
74 for (unsigned i
= lb
; i
< ub
; i
++) {
75 if (reg_file
[i
] == 0xFFFF) {
77 } else if (reg_file
[i
]) {
78 if (reg_file
[i
] != prev
) {
80 char_select
= !char_select
;
82 printf(char_select
? "#" : "@");
90 printf("%u/%u used, %u/%u free\n", max
- free_regs
, max
, free_regs
, max
);
92 /* print assignments */
95 for (unsigned i
= lb
; i
< ub
; i
++) {
96 if (reg_file
[i
] != prev
) {
98 printf("-%d]\n", i
- 1 - lb
);
102 if (prev
&& prev
!= 0xFFFF) {
103 if (ctx
.orig_names
.count(reg_file
[i
]) && ctx
.orig_names
[reg_file
[i
]].id() != reg_file
[i
])
104 printf("%%%u (was %%%d) = %c[%d", reg_file
[i
], ctx
.orig_names
[reg_file
[i
]].id(), reg_char
, i
- lb
);
106 printf("%%%u = %c[%d", reg_file
[i
], reg_char
, i
- lb
);
113 if (prev
&& size
> 1)
114 printf("-%d]\n", ub
- lb
- 1);
121 void adjust_max_used_regs(ra_ctx
& ctx
, RegClass rc
, unsigned reg
)
123 unsigned max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
124 unsigned size
= rc
.size();
125 if (rc
.type() == RegType::vgpr
) {
127 unsigned hi
= reg
- 256 + size
- 1;
128 ctx
.max_used_vgpr
= std::max(ctx
.max_used_vgpr
, hi
);
129 } else if (reg
+ rc
.size() <= max_addressible_sgpr
) {
130 unsigned hi
= reg
+ size
- 1;
131 ctx
.max_used_sgpr
= std::max(ctx
.max_used_sgpr
, std::min(hi
, max_addressible_sgpr
));
136 void update_renames(ra_ctx
& ctx
, std::array
<uint32_t, 512>& reg_file
,
137 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
138 aco_ptr
<Instruction
>& instr
)
140 /* allocate id's and rename operands: this is done transparently here */
141 for (std::pair
<Operand
, Definition
>& copy
: parallelcopies
) {
142 /* the definitions with id are not from this function and already handled */
143 if (copy
.second
.isTemp())
146 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
147 copy
.second
.setTemp(Temp(ctx
.program
->allocateId(), copy
.second
.regClass()));
148 ctx
.assignments
[copy
.second
.tempId()] = {copy
.second
.physReg(), copy
.second
.regClass()};
149 for (unsigned i
= copy
.second
.physReg().reg
; i
< copy
.second
.physReg() + copy
.second
.size(); i
++)
150 reg_file
[i
] = copy
.second
.tempId();
151 /* check if we moved an operand */
152 for (Operand
& op
: instr
->operands
) {
155 if (op
.tempId() == copy
.first
.tempId()) {
156 bool omit_renaming
= instr
->opcode
== aco_opcode::p_create_vector
&& !op
.isKill();
157 for (std::pair
<Operand
, Definition
>& pc
: parallelcopies
) {
158 PhysReg def_reg
= pc
.second
.physReg();
159 omit_renaming
&= def_reg
> copy
.first
.physReg() ?
160 (copy
.first
.physReg() + copy
.first
.size() <= def_reg
.reg
) :
161 (def_reg
+ pc
.second
.size() <= copy
.first
.physReg().reg
);
165 op
.setTemp(copy
.second
.getTemp());
166 op
.setFixed(copy
.second
.physReg());
172 std::pair
<PhysReg
, bool> get_reg_simple(ra_ctx
& ctx
,
173 std::array
<uint32_t, 512>& reg_file
,
174 uint32_t lb
, uint32_t ub
,
175 uint32_t size
, uint32_t stride
,
178 /* best fit algorithm: find the smallest gap to fit in the variable */
180 unsigned best_pos
= 0xFFFF;
181 unsigned gap_size
= 0xFFFF;
182 unsigned next_pos
= 0xFFFF;
184 for (unsigned current_reg
= lb
; current_reg
< ub
; current_reg
++) {
185 if (reg_file
[current_reg
] != 0 || ctx
.war_hint
[current_reg
]) {
186 if (next_pos
== 0xFFFF)
189 /* check if the variable fits */
190 if (next_pos
+ size
> current_reg
) {
195 /* check if the tested gap is smaller */
196 if (current_reg
- next_pos
< gap_size
) {
198 gap_size
= current_reg
- next_pos
;
204 if (next_pos
== 0xFFFF)
205 next_pos
= current_reg
;
209 if (next_pos
!= 0xFFFF &&
210 next_pos
+ size
<= ub
&&
211 ub
- next_pos
< gap_size
) {
213 gap_size
= ub
- next_pos
;
215 if (best_pos
!= 0xFFFF) {
216 adjust_max_used_regs(ctx
, rc
, best_pos
);
217 return {PhysReg
{best_pos
}, true};
223 unsigned reg_lo
= lb
;
224 unsigned reg_hi
= lb
+ size
- 1;
225 while (!found
&& reg_lo
+ size
<= ub
) {
226 if (reg_file
[reg_lo
] != 0) {
230 reg_hi
= reg_lo
+ size
- 1;
232 for (unsigned reg
= reg_lo
+ 1; found
&& reg
<= reg_hi
; reg
++) {
233 if (reg_file
[reg
] != 0 || ctx
.war_hint
[reg
])
237 adjust_max_used_regs(ctx
, rc
, reg_lo
);
238 return {PhysReg
{reg_lo
}, true};
247 bool get_regs_for_copies(ra_ctx
& ctx
,
248 std::array
<uint32_t, 512>& reg_file
,
249 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
250 std::set
<std::pair
<unsigned, unsigned>> vars
,
251 uint32_t lb
, uint32_t ub
,
252 aco_ptr
<Instruction
>& instr
,
257 /* variables are sorted from small sized to large */
258 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
259 for (std::set
<std::pair
<unsigned, unsigned>>::reverse_iterator it
= vars
.rbegin(); it
!= vars
.rend(); ++it
) {
260 unsigned id
= it
->second
;
261 std::pair
<PhysReg
, RegClass
> var
= ctx
.assignments
[id
];
262 uint32_t size
= it
->first
;
264 if (var
.second
.type() == RegType::sgpr
) {
271 /* check if this is a dead operand, then we can re-use the space from the definition */
272 bool is_dead_operand
= false;
273 for (unsigned i
= 0; !is_phi(instr
) && !is_dead_operand
&& i
< instr
->operands
.size(); i
++) {
274 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isKill() && instr
->operands
[i
].tempId() == id
)
275 is_dead_operand
= true;
278 std::pair
<PhysReg
, bool> res
;
279 if (is_dead_operand
) {
280 if (instr
->opcode
== aco_opcode::p_create_vector
) {
281 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
282 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
283 for (unsigned j
= 0; j
< size
; j
++)
284 assert(reg_file
[def_reg_lo
+ offset
+ j
] == 0);
285 res
= {PhysReg
{def_reg_lo
+ offset
}, true};
290 res
= get_reg_simple(ctx
, reg_file
, def_reg_lo
, def_reg_hi
+ 1, size
, stride
, var
.second
);
293 res
= get_reg_simple(ctx
, reg_file
, lb
, def_reg_lo
, size
, stride
, var
.second
);
295 unsigned lb
= (def_reg_hi
+ stride
) & ~(stride
- 1);
296 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, var
.second
);
301 /* mark the area as blocked */
302 for (unsigned i
= res
.first
.reg
; i
< res
.first
+ size
; i
++)
303 reg_file
[i
] = 0xFFFFFFFF;
304 /* create parallelcopy pair (without definition id) */
305 Temp tmp
= Temp(id
, var
.second
);
306 Operand pc_op
= Operand(tmp
);
307 pc_op
.setFixed(var
.first
);
308 Definition pc_def
= Definition(res
.first
, pc_op
.regClass());
309 parallelcopies
.emplace_back(pc_op
, pc_def
);
313 unsigned best_pos
= lb
;
314 unsigned num_moves
= 0xFF;
315 unsigned num_vars
= 0;
317 /* we use a sliding window to find potential positions */
318 unsigned reg_lo
= lb
;
319 unsigned reg_hi
= lb
+ size
- 1;
320 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
321 if (!is_dead_operand
&& ((reg_lo
>= def_reg_lo
&& reg_lo
<= def_reg_hi
) ||
322 (reg_hi
>= def_reg_lo
&& reg_hi
<= def_reg_hi
)))
325 /* second, check that we have at most k=num_moves elements in the window
326 * and no element is larger than the currently processed one */
329 unsigned last_var
= 0;
331 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
332 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
335 /* 0xFFFF signals that this area is already blocked! */
336 if (reg_file
[j
] == 0xFFFFFFFF || k
> num_moves
) {
340 /* we cannot split live ranges of linear vgprs */
341 if (ctx
.assignments
[reg_file
[j
]].second
& (1 << 6)) {
345 bool is_kill
= false;
346 for (const Operand
& op
: instr
->operands
) {
347 if (op
.isTemp() && op
.isKill() && op
.tempId() == reg_file
[j
]) {
352 if (!is_kill
&& ctx
.assignments
[reg_file
[j
]].second
.size() >= size
) {
357 k
+= ctx
.assignments
[reg_file
[j
]].second
.size();
358 last_var
= reg_file
[j
];
360 if (k
> num_moves
|| (k
== num_moves
&& n
<= num_vars
)) {
373 /* FIXME: we messed up and couldn't find space for the variables to be copied */
374 if (num_moves
== 0xFF)
378 reg_hi
= best_pos
+ size
- 1;
380 /* collect variables and block reg file */
381 std::set
<std::pair
<unsigned, unsigned>> new_vars
;
382 for (unsigned j
= reg_lo
; j
<= reg_hi
; j
++) {
383 if (reg_file
[j
] != 0) {
384 unsigned size
= ctx
.assignments
[reg_file
[j
]].second
.size();
385 unsigned id
= reg_file
[j
];
386 new_vars
.emplace(size
, id
);
387 for (unsigned k
= 0; k
< size
; k
++)
388 reg_file
[ctx
.assignments
[id
].first
+ k
] = 0;
392 /* mark the area as blocked */
393 for (unsigned i
= reg_lo
; i
<= reg_hi
; i
++)
394 reg_file
[i
] = 0xFFFFFFFF;
396 if (!get_regs_for_copies(ctx
, reg_file
, parallelcopies
, new_vars
, lb
, ub
, instr
, def_reg_lo
, def_reg_hi
))
399 adjust_max_used_regs(ctx
, var
.second
, reg_lo
);
401 /* create parallelcopy pair (without definition id) */
402 Temp tmp
= Temp(id
, var
.second
);
403 Operand pc_op
= Operand(tmp
);
404 pc_op
.setFixed(var
.first
);
405 Definition pc_def
= Definition(PhysReg
{reg_lo
}, pc_op
.regClass());
406 parallelcopies
.emplace_back(pc_op
, pc_def
);
413 std::pair
<PhysReg
, bool> get_reg_impl(ra_ctx
& ctx
,
414 std::array
<uint32_t, 512>& reg_file
,
415 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
416 uint32_t lb
, uint32_t ub
,
417 uint32_t size
, uint32_t stride
,
419 aco_ptr
<Instruction
>& instr
)
421 unsigned regs_free
= 0;
422 /* check how many free regs we have */
423 for (unsigned j
= lb
; j
< ub
; j
++) {
424 if (reg_file
[j
] == 0)
428 /* mark and count killed operands */
429 unsigned killed_ops
= 0;
430 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
431 if (instr
->operands
[j
].isTemp() &&
432 instr
->operands
[j
].isFirstKill() &&
433 instr
->operands
[j
].physReg() >= lb
&&
434 instr
->operands
[j
].physReg() < ub
) {
435 assert(instr
->operands
[j
].isFixed());
436 assert(reg_file
[instr
->operands
[j
].physReg().reg
] == 0);
437 for (unsigned k
= 0; k
< instr
->operands
[j
].size(); k
++)
438 reg_file
[instr
->operands
[j
].physReg() + k
] = 0xFFFFFFFF;
439 killed_ops
+= instr
->operands
[j
].getTemp().size();
443 assert(regs_free
>= size
);
444 /* we might have to move dead operands to dst in order to make space */
445 unsigned op_moves
= 0;
447 if (size
> (regs_free
- killed_ops
))
448 op_moves
= size
- (regs_free
- killed_ops
);
450 /* find the best position to place the definition */
451 unsigned best_pos
= lb
;
452 unsigned num_moves
= 0xFF;
453 unsigned num_vars
= 0;
455 /* we use a sliding window to check potential positions */
456 unsigned reg_lo
= lb
;
457 unsigned reg_hi
= lb
+ size
- 1;
458 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
459 /* first check the edges: this is what we have to fix to allow for num_moves > size */
460 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
462 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
465 /* second, check that we have at most k=num_moves elements in the window
466 * and no element is larger than the currently processed one */
467 unsigned k
= op_moves
;
469 unsigned remaining_op_moves
= op_moves
;
470 unsigned last_var
= 0;
472 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
473 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
474 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
477 /* dead operands effectively reduce the number of estimated moves */
478 if (remaining_op_moves
&& reg_file
[j
] == 0xFFFFFFFF) {
480 remaining_op_moves
--;
484 if (ctx
.assignments
[reg_file
[j
]].second
.size() >= size
) {
490 /* we cannot split live ranges of linear vgprs */
491 if (ctx
.assignments
[reg_file
[j
]].second
& (1 << 6)) {
496 k
+= ctx
.assignments
[reg_file
[j
]].second
.size();
498 last_var
= reg_file
[j
];
501 if (!found
|| k
> num_moves
)
503 if (k
== num_moves
&& n
< num_vars
)
505 if (!aligned
&& k
== num_moves
&& n
== num_vars
)
515 if (num_moves
== 0xFF) {
516 /* remove killed operands from reg_file once again */
517 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
518 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKill()) {
519 for (unsigned k
= 0; k
< instr
->operands
[i
].getTemp().size(); k
++)
520 reg_file
[instr
->operands
[i
].physReg() + k
] = 0;
523 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
524 Definition def
= instr
->definitions
[i
];
525 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
)) {
526 for (unsigned k
= 0; k
< def
.getTemp().size(); k
++)
527 reg_file
[def
.physReg() + k
] = def
.tempId();
533 std::array
<uint32_t, 512> register_file
= reg_file
;
535 /* now, we figured the placement for our definition */
536 std::set
<std::pair
<unsigned, unsigned>> vars
;
537 for (unsigned j
= best_pos
; j
< best_pos
+ size
; j
++) {
538 if (reg_file
[j
] != 0xFFFFFFFF && reg_file
[j
] != 0)
539 vars
.emplace(ctx
.assignments
[reg_file
[j
]].second
.size(), reg_file
[j
]);
543 if (instr
->opcode
== aco_opcode::p_create_vector
) {
544 /* move killed operands which aren't yet at the correct position */
545 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
546 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKill() &&
547 instr
->operands
[i
].getTemp().type() == rc
.type()) {
549 if (instr
->operands
[i
].physReg() != best_pos
+ offset
) {
550 vars
.emplace(instr
->operands
[i
].size(), instr
->operands
[i
].tempId());
551 for (unsigned j
= 0; j
< instr
->operands
[i
].size(); j
++)
552 reg_file
[instr
->operands
[i
].physReg() + j
] = 0;
554 for (unsigned j
= 0; j
< instr
->operands
[i
].size(); j
++)
555 reg_file
[instr
->operands
[i
].physReg() + j
] = instr
->operands
[i
].tempId();
560 /* re-enable the killed operands */
561 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
562 if (instr
->operands
[j
].isTemp() && instr
->operands
[j
].isFirstKill()) {
563 for (unsigned k
= 0; k
< instr
->operands
[j
].getTemp().size(); k
++)
564 reg_file
[instr
->operands
[j
].physReg() + k
] = instr
->operands
[j
].tempId();
569 std::vector
<std::pair
<Operand
, Definition
>> pc
;
570 if (!get_regs_for_copies(ctx
, reg_file
, pc
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1)) {
571 reg_file
= std::move(register_file
);
572 /* remove killed operands from reg_file once again */
573 if (!is_phi(instr
)) {
574 for (const Operand
& op
: instr
->operands
) {
575 if (op
.isTemp() && op
.isFirstKill()) {
576 for (unsigned k
= 0; k
< op
.getTemp().size(); k
++)
577 reg_file
[op
.physReg() + k
] = 0;
581 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
582 Definition
& def
= instr
->definitions
[i
];
583 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
)) {
584 for (unsigned k
= 0; k
< def
.getTemp().size(); k
++)
585 reg_file
[def
.physReg() + k
] = def
.tempId();
591 parallelcopies
.insert(parallelcopies
.end(), pc
.begin(), pc
.end());
593 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
594 for (unsigned i
= 0; i
< size
; i
++)
595 reg_file
[best_pos
+ i
] = 0;
597 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
599 /* remove killed operands from reg_file once again */
600 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
601 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
603 assert(!instr
->operands
[i
].isUndefined());
604 if (instr
->operands
[i
].isFirstKill()) {
605 for (unsigned j
= 0; j
< instr
->operands
[i
].getTemp().size(); j
++)
606 reg_file
[instr
->operands
[i
].physReg() + j
] = 0;
609 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
610 Definition def
= instr
->definitions
[i
];
611 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
)) {
612 for (unsigned k
= 0; k
< def
.getTemp().size(); k
++)
613 reg_file
[def
.physReg() + k
] = def
.tempId();
617 adjust_max_used_regs(ctx
, rc
, best_pos
);
618 return {PhysReg
{best_pos
}, true};
621 PhysReg
get_reg(ra_ctx
& ctx
,
622 std::array
<uint32_t, 512>& reg_file
,
624 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
625 aco_ptr
<Instruction
>& instr
)
627 uint32_t size
= rc
.size();
630 if (rc
.type() == RegType::vgpr
) {
632 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
635 ub
= ctx
.program
->max_reg_demand
.sgpr
;
642 std::pair
<PhysReg
, bool> res
= {{}, false};
643 /* try to find space without live-range splits */
644 if (rc
.type() == RegType::vgpr
&& (size
== 4 || size
== 8))
645 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, 4, rc
);
647 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
651 /* try to find space with live-range splits */
652 res
= get_reg_impl(ctx
, reg_file
, parallelcopies
, lb
, ub
, size
, stride
, rc
, instr
);
657 unsigned regs_free
= 0;
658 for (unsigned i
= lb
; i
< ub
; i
++) {
663 /* We should only fail here because keeping under the limit would require
665 assert(regs_free
>= size
);
667 /* try using more registers */
668 uint16_t max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
669 if (rc
.type() == RegType::vgpr
&& ctx
.program
->max_reg_demand
.vgpr
< 256) {
670 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
+ 1, ctx
.program
->max_reg_demand
.sgpr
));
671 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
672 } else if (rc
.type() == RegType::sgpr
&& ctx
.program
->max_reg_demand
.sgpr
< max_addressible_sgpr
) {
673 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
, ctx
.program
->max_reg_demand
.sgpr
+ 1));
674 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
677 //FIXME: if nothing helps, shift-rotate the registers to make space
679 unreachable("did not find a register");
683 std::pair
<PhysReg
, bool> get_reg_vec(ra_ctx
& ctx
,
684 std::array
<uint32_t, 512>& reg_file
,
687 uint32_t size
= rc
.size();
690 if (rc
.type() == RegType::vgpr
) {
692 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
695 ub
= ctx
.program
->max_reg_demand
.sgpr
;
701 return get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
705 PhysReg
get_reg_create_vector(ra_ctx
& ctx
,
706 std::array
<uint32_t, 512>& reg_file
,
708 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
709 aco_ptr
<Instruction
>& instr
)
711 /* create_vector instructions have different costs w.r.t. register coalescing */
712 uint32_t size
= rc
.size();
715 if (rc
.type() == RegType::vgpr
) {
717 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
720 ub
= ctx
.program
->max_reg_demand
.sgpr
;
727 unsigned best_pos
= -1;
728 unsigned num_moves
= 0xFF;
729 bool best_war_hint
= true;
731 /* test for each operand which definition placement causes the least shuffle instructions */
732 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
733 // TODO: think about, if we can alias live operands on the same register
734 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isKill() || instr
->operands
[i
].getTemp().type() != rc
.type())
737 if (offset
> instr
->operands
[i
].physReg())
740 unsigned reg_lo
= instr
->operands
[i
].physReg() - offset
;
741 unsigned reg_hi
= reg_lo
+ size
- 1;
744 /* no need to check multiple times */
745 if (reg_lo
== best_pos
)
749 // TODO: this can be improved */
750 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
% stride
!= 0)
752 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
754 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
757 /* count variables to be moved and check war_hint */
758 bool war_hint
= false;
759 for (unsigned j
= reg_lo
; j
<= reg_hi
; j
++) {
760 if (reg_file
[j
] != 0)
762 war_hint
|= ctx
.war_hint
[j
];
765 /* count operands in wrong positions */
766 for (unsigned j
= 0, offset
= 0; j
< instr
->operands
.size(); offset
+= instr
->operands
[j
].size(), j
++) {
768 !instr
->operands
[j
].isTemp() ||
769 instr
->operands
[j
].getTemp().type() != rc
.type())
771 if (instr
->operands
[j
].physReg() != reg_lo
+ offset
)
772 k
+= instr
->operands
[j
].size();
774 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
775 if (k
> num_moves
|| (!aligned
&& k
== num_moves
) || (war_hint
&& !best_war_hint
))
780 best_war_hint
= war_hint
;
783 if (num_moves
>= size
)
784 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
786 /* collect variables to be moved */
787 std::set
<std::pair
<unsigned, unsigned>> vars
;
788 for (unsigned i
= best_pos
; i
< best_pos
+ size
; i
++) {
789 if (reg_file
[i
] != 0)
790 vars
.emplace(ctx
.assignments
[reg_file
[i
]].second
.size(), reg_file
[i
]);
794 /* move killed operands which aren't yet at the correct position */
795 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
796 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKill() && instr
->operands
[i
].getTemp().type() == rc
.type()) {
797 if (instr
->operands
[i
].physReg() != best_pos
+ offset
) {
798 vars
.emplace(instr
->operands
[i
].size(), instr
->operands
[i
].tempId());
800 for (unsigned j
= 0; j
< instr
->operands
[i
].size(); j
++)
801 reg_file
[instr
->operands
[i
].physReg() + j
] = instr
->operands
[i
].tempId();
806 ASSERTED
bool success
= false;
807 success
= get_regs_for_copies(ctx
, reg_file
, parallelcopies
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1);
810 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
811 adjust_max_used_regs(ctx
, rc
, best_pos
);
812 return PhysReg
{best_pos
};
815 bool get_reg_specified(ra_ctx
& ctx
,
816 std::array
<uint32_t, 512>& reg_file
,
818 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
819 aco_ptr
<Instruction
>& instr
,
822 uint32_t size
= rc
.size();
826 if (rc
.type() == RegType::vgpr
) {
828 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
834 if (reg
% stride
!= 0)
837 ub
= ctx
.program
->max_reg_demand
.sgpr
;
840 uint32_t reg_lo
= reg
.reg
;
841 uint32_t reg_hi
= reg
+ (size
- 1);
843 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
> reg_hi
)
846 for (unsigned i
= reg_lo
; i
<= reg_hi
; i
++) {
847 if (reg_file
[i
] != 0)
850 adjust_max_used_regs(ctx
, rc
, reg_lo
);
854 void handle_pseudo(ra_ctx
& ctx
,
855 const std::array
<uint32_t, 512>& reg_file
,
858 if (instr
->format
!= Format::PSEUDO
)
861 /* all instructions which use handle_operands() need this information */
862 switch (instr
->opcode
) {
863 case aco_opcode::p_extract_vector
:
864 case aco_opcode::p_create_vector
:
865 case aco_opcode::p_split_vector
:
866 case aco_opcode::p_parallelcopy
:
867 case aco_opcode::p_wqm
:
873 /* if all definitions are vgpr, no need to care for SCC */
874 bool writes_sgpr
= false;
875 for (Definition
& def
: instr
->definitions
) {
876 if (def
.getTemp().type() == RegType::sgpr
) {
884 Pseudo_instruction
*pi
= (Pseudo_instruction
*)instr
;
885 if (reg_file
[scc
.reg
]) {
886 pi
->tmp_in_scc
= true;
888 int reg
= ctx
.max_used_sgpr
;
889 for (; reg
>= 0 && reg_file
[reg
]; reg
--)
892 reg
= ctx
.max_used_sgpr
+ 1;
893 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& reg_file
[reg
]; reg
++)
895 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
898 adjust_max_used_regs(ctx
, s1
, reg
);
899 pi
->scratch_sgpr
= PhysReg
{(unsigned)reg
};
901 pi
->tmp_in_scc
= false;
905 bool operand_can_use_reg(aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
)
907 switch (instr
->format
) {
911 (reg
!= m0
|| idx
== 1 || idx
== 3) && /* offset can be m0 */
912 (reg
!= vcc
|| (instr
->definitions
.empty() && idx
== 2)); /* sdata can be vcc */
914 // TODO: there are more instructions with restrictions on registers
919 } /* end namespace */
922 void register_allocation(Program
*program
, std::vector
<std::set
<Temp
>> live_out_per_block
)
926 std::vector
<std::unordered_map
<unsigned, Temp
>> renames(program
->blocks
.size());
931 std::set
<Instruction
*> uses
;
934 bool filled
[program
->blocks
.size()];
935 bool sealed
[program
->blocks
.size()];
936 memset(filled
, 0, sizeof filled
);
937 memset(sealed
, 0, sizeof sealed
);
938 std::vector
<std::vector
<Instruction
*>> incomplete_phis(program
->blocks
.size());
939 std::map
<unsigned, phi_info
> phi_map
;
940 std::map
<unsigned, unsigned> affinities
;
941 std::function
<Temp(Temp
,unsigned)> read_variable
;
942 std::function
<Temp(Temp
,Block
*)> handle_live_in
;
943 std::function
<Temp(std::map
<unsigned, phi_info
>::iterator
)> try_remove_trivial_phi
;
945 read_variable
= [&](Temp val
, unsigned block_idx
) -> Temp
{
946 std::unordered_map
<unsigned, Temp
>::iterator it
= renames
[block_idx
].find(val
.id());
947 assert(it
!= renames
[block_idx
].end());
951 handle_live_in
= [&](Temp val
, Block
*block
) -> Temp
{
952 std::vector
<unsigned>& preds
= val
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
953 if (preds
.size() == 0 && block
->index
!= 0) {
954 renames
[block
->index
][val
.id()] = val
;
957 assert(preds
.size() > 0);
960 if (!sealed
[block
->index
]) {
961 /* consider rename from already processed predecessor */
962 Temp tmp
= read_variable(val
, preds
[0]);
964 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
965 new_val
= Temp
{program
->allocateId(), val
.regClass()};
966 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
967 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
968 phi
->definitions
[0] = Definition(new_val
);
969 for (unsigned i
= 0; i
< preds
.size(); i
++)
970 phi
->operands
[i
] = Operand(val
);
971 if (tmp
.regClass() == new_val
.regClass())
972 affinities
[new_val
.id()] = tmp
.id();
974 phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
975 incomplete_phis
[block
->index
].emplace_back(phi
.get());
976 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
978 } else if (preds
.size() == 1) {
979 /* if the block has only one predecessor, just look there for the name */
980 new_val
= read_variable(val
, preds
[0]);
982 /* there are multiple predecessors and the block is sealed */
983 Temp ops
[preds
.size()];
985 /* we start assuming that the name is the same from all predecessors */
986 renames
[block
->index
][val
.id()] = val
;
987 bool needs_phi
= false;
989 /* get the rename from each predecessor and check if they are the same */
990 for (unsigned i
= 0; i
< preds
.size(); i
++) {
991 ops
[i
] = read_variable(val
, preds
[i
]);
995 needs_phi
|= !(new_val
== ops
[i
]);
999 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1000 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1001 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1002 new_val
= Temp
{program
->allocateId(), val
.regClass()};
1003 phi
->definitions
[0] = Definition(new_val
);
1004 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1005 phi
->operands
[i
] = Operand(ops
[i
]);
1006 phi
->operands
[i
].setFixed(ctx
.assignments
[ops
[i
].id()].first
);
1007 if (ops
[i
].regClass() == new_val
.regClass())
1008 affinities
[new_val
.id()] = ops
[i
].id();
1010 phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1011 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
1015 renames
[block
->index
][val
.id()] = new_val
;
1016 renames
[block
->index
][new_val
.id()] = new_val
;
1017 ctx
.orig_names
[new_val
.id()] = val
;
1021 try_remove_trivial_phi
= [&] (std::map
<unsigned, phi_info
>::iterator info
) -> Temp
{
1022 assert(info
->second
.block_idx
!= 0);
1023 Instruction
* phi
= info
->second
.phi
;
1026 Definition def
= phi
->definitions
[0];
1027 /* a phi node is trivial if all operands are the same as the definition of the phi */
1028 for (const Operand
& op
: phi
->operands
) {
1029 const Temp t
= op
.getTemp();
1030 if (t
== same
|| t
== def
.getTemp())
1032 if (!(same
== Temp()) || !(op
.physReg() == def
.physReg())) {
1033 /* phi is not trivial */
1034 return def
.getTemp();
1038 assert(!(same
== Temp() || same
== def
.getTemp()));
1040 /* reroute all uses to same and remove phi */
1041 std::vector
<std::map
<unsigned, phi_info
>::iterator
> phi_users
;
1042 std::map
<unsigned, phi_info
>::iterator same_phi_info
= phi_map
.find(same
.id());
1043 for (Instruction
* instr
: info
->second
.uses
) {
1044 assert(phi
!= instr
);
1045 /* recursively try to remove trivial phis */
1046 if (is_phi(instr
)) {
1047 /* ignore if the phi was already flagged trivial */
1048 if (instr
->definitions
.empty())
1051 std::map
<unsigned, phi_info
>::iterator it
= phi_map
.find(instr
->definitions
[0].tempId());
1052 if (it
!= phi_map
.end() && it
!= info
)
1053 phi_users
.emplace_back(it
);
1055 for (Operand
& op
: instr
->operands
) {
1056 if (op
.isTemp() && op
.tempId() == def
.tempId()) {
1058 if (same_phi_info
!= phi_map
.end())
1059 same_phi_info
->second
.uses
.emplace(instr
);
1064 auto it
= ctx
.orig_names
.find(same
.id());
1065 unsigned orig_var
= it
!= ctx
.orig_names
.end() ? it
->second
.id() : same
.id();
1066 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
1067 auto it
= renames
[i
].find(orig_var
);
1068 if (it
!= renames
[i
].end() && it
->second
== def
.getTemp())
1069 renames
[i
][orig_var
] = same
;
1072 unsigned block_idx
= info
->second
.block_idx
;
1073 phi
->definitions
.clear(); /* this indicates that the phi can be removed */
1074 phi_map
.erase(info
);
1075 for (auto it
: phi_users
) {
1076 if (sealed
[it
->second
.block_idx
])
1077 try_remove_trivial_phi(it
);
1080 /* due to the removal of other phis, the name might have changed once again! */
1081 return renames
[block_idx
][orig_var
];
1084 std::map
<unsigned, Instruction
*> vectors
;
1085 std::vector
<std::vector
<Temp
>> phi_ressources
;
1086 std::map
<unsigned, unsigned> temp_to_phi_ressources
;
1088 for (std::vector
<Block
>::reverse_iterator it
= program
->blocks
.rbegin(); it
!= program
->blocks
.rend(); it
++) {
1091 /* first, compute the death points of all live vars within the block */
1092 std::set
<Temp
>& live
= live_out_per_block
[block
.index
];
1094 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator rit
;
1095 for (rit
= block
.instructions
.rbegin(); rit
!= block
.instructions
.rend(); ++rit
) {
1096 aco_ptr
<Instruction
>& instr
= *rit
;
1097 if (!is_phi(instr
)) {
1098 for (const Operand
& op
: instr
->operands
) {
1100 live
.emplace(op
.getTemp());
1102 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1103 for (const Operand
& op
: instr
->operands
) {
1104 if (op
.isTemp() && op
.getTemp().type() == instr
->definitions
[0].getTemp().type())
1105 vectors
[op
.tempId()] = instr
.get();
1108 } else if (!instr
->definitions
[0].isKill() && !instr
->definitions
[0].isFixed()) {
1109 /* collect information about affinity-related temporaries */
1110 std::vector
<Temp
> affinity_related
;
1111 /* affinity_related[0] is the last seen affinity-related temp */
1112 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1113 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1114 for (const Operand
& op
: instr
->operands
) {
1115 if (op
.isTemp() && op
.regClass() == instr
->definitions
[0].regClass()) {
1116 affinity_related
.emplace_back(op
.getTemp());
1117 temp_to_phi_ressources
[op
.tempId()] = phi_ressources
.size();
1120 phi_ressources
.emplace_back(std::move(affinity_related
));
1123 /* erase from live */
1124 for (const Definition
& def
: instr
->definitions
) {
1126 live
.erase(def
.getTemp());
1127 std::map
<unsigned, unsigned>::iterator it
= temp_to_phi_ressources
.find(def
.tempId());
1128 if (it
!= temp_to_phi_ressources
.end() && def
.regClass() == phi_ressources
[it
->second
][0].regClass())
1129 phi_ressources
[it
->second
][0] = def
.getTemp();
1134 /* create affinities */
1135 for (std::vector
<Temp
>& vec
: phi_ressources
) {
1136 assert(vec
.size() > 1);
1137 for (unsigned i
= 1; i
< vec
.size(); i
++)
1138 if (vec
[i
].id() != vec
[0].id())
1139 affinities
[vec
[i
].id()] = vec
[0].id();
1142 /* state of register file after phis */
1143 std::vector
<std::bitset
<128>> sgpr_live_in(program
->blocks
.size());
1145 for (Block
& block
: program
->blocks
) {
1146 std::set
<Temp
>& live
= live_out_per_block
[block
.index
];
1147 /* initialize register file */
1148 assert(block
.index
!= 0 || live
.empty());
1149 std::array
<uint32_t, 512> register_file
= {0};
1150 ctx
.war_hint
.reset();
1152 for (Temp t
: live
) {
1153 Temp renamed
= handle_live_in(t
, &block
);
1154 if (ctx
.assignments
.find(renamed
.id()) != ctx
.assignments
.end()) {
1155 for (unsigned i
= 0; i
< t
.size(); i
++)
1156 register_file
[ctx
.assignments
[renamed
.id()].first
+ i
] = renamed
.id();
1160 std::vector
<aco_ptr
<Instruction
>> instructions
;
1161 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1163 /* this is a slight adjustment from the paper as we already have phi nodes:
1164 * We consider them incomplete phis and only handle the definition. */
1166 /* handle fixed phi definitions */
1167 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1168 aco_ptr
<Instruction
>& phi
= *it
;
1171 Definition
& definition
= phi
->definitions
[0];
1172 if (!definition
.isFixed())
1175 /* check if a dead exec mask phi is needed */
1176 if (definition
.isKill()) {
1177 for (Operand
& op
: phi
->operands
) {
1178 assert(op
.isTemp());
1179 if (ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end() ||
1180 ctx
.assignments
[op
.tempId()].first
!= exec
) {
1181 definition
.setKill(false);
1187 if (definition
.isKill())
1190 assert(definition
.physReg() == exec
);
1191 for (unsigned i
= 0; i
< definition
.size(); i
++) {
1192 assert(register_file
[definition
.physReg() + i
] == 0);
1193 register_file
[definition
.physReg() + i
] = definition
.tempId();
1195 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1198 /* look up the affinities */
1199 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1200 aco_ptr
<Instruction
>& phi
= *it
;
1203 Definition
& definition
= phi
->definitions
[0];
1204 if (definition
.isKill() || definition
.isFixed())
1207 if (affinities
.find(definition
.tempId()) != affinities
.end() &&
1208 ctx
.assignments
.find(affinities
[definition
.tempId()]) != ctx
.assignments
.end()) {
1209 assert(ctx
.assignments
[affinities
[definition
.tempId()]].second
== definition
.regClass());
1210 PhysReg reg
= ctx
.assignments
[affinities
[definition
.tempId()]].first
;
1211 bool try_use_special_reg
= reg
== scc
|| reg
== exec
;
1212 if (try_use_special_reg
) {
1213 for (const Operand
& op
: phi
->operands
) {
1215 ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end() ||
1216 !(ctx
.assignments
[op
.tempId()].first
== reg
)) {
1217 try_use_special_reg
= false;
1221 if (!try_use_special_reg
)
1224 bool reg_free
= true;
1225 for (unsigned i
= reg
.reg
; reg_free
&& i
< reg
+ definition
.size(); i
++) {
1226 if (register_file
[i
] != 0)
1229 /* only assign if register is still free */
1231 definition
.setFixed(reg
);
1232 for (unsigned i
= 0; i
< definition
.size(); i
++)
1233 register_file
[definition
.physReg() + i
] = definition
.tempId();
1234 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1239 /* find registers for phis without affinity or where the register was blocked */
1240 for (it
= block
.instructions
.begin();it
!= block
.instructions
.end(); ++it
) {
1241 aco_ptr
<Instruction
>& phi
= *it
;
1245 Definition
& definition
= phi
->definitions
[0];
1246 if (definition
.isKill())
1249 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1251 if (!definition
.isFixed()) {
1252 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1253 /* try to find a register that is used by at least one operand */
1254 for (const Operand
& op
: phi
->operands
) {
1256 ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end())
1258 PhysReg reg
= ctx
.assignments
[op
.tempId()].first
;
1259 /* we tried this already on the previous loop */
1260 if (reg
== scc
|| reg
== exec
)
1262 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
, reg
)) {
1263 definition
.setFixed(reg
);
1267 if (!definition
.isFixed())
1268 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
));
1270 /* process parallelcopy */
1271 for (std::pair
<Operand
, Definition
> pc
: parallelcopy
) {
1273 std::map
<unsigned, Temp
>::iterator orig_it
= ctx
.orig_names
.find(pc
.first
.tempId());
1274 Temp orig
= pc
.first
.getTemp();
1275 if (orig_it
!= ctx
.orig_names
.end())
1276 orig
= orig_it
->second
;
1278 ctx
.orig_names
[pc
.second
.tempId()] = orig
;
1279 renames
[block
.index
][orig
.id()] = pc
.second
.getTemp();
1280 renames
[block
.index
][pc
.second
.tempId()] = pc
.second
.getTemp();
1282 /* see if it's a copy from a previous phi */
1283 //TODO: prefer moving some previous phis over live-ins
1284 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1285 Instruction
*prev_phi
= NULL
;
1286 for (auto it2
= instructions
.begin(); it2
!= instructions
.end(); ++it2
) {
1287 if ((*it2
)->definitions
[0].tempId() == pc
.first
.tempId())
1288 prev_phi
= it2
->get();
1291 /* if so, just update that phi */
1292 prev_phi
->definitions
[0] = pc
.second
;
1296 /* otherwise, this is a live-in and we need to create a new phi
1297 * to move it in this block's predecessors */
1298 aco_opcode opcode
= pc
.first
.getTemp().is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1299 std::vector
<unsigned>& preds
= pc
.first
.getTemp().is_linear() ? block
.linear_preds
: block
.logical_preds
;
1300 aco_ptr
<Instruction
> new_phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1301 new_phi
->definitions
[0] = pc
.second
;
1302 for (unsigned i
= 0; i
< preds
.size(); i
++)
1303 new_phi
->operands
[i
] = Operand(pc
.first
);
1304 instructions
.emplace_back(std::move(new_phi
));
1307 for (unsigned i
= 0; i
< definition
.size(); i
++)
1308 register_file
[definition
.physReg() + i
] = definition
.tempId();
1309 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1311 live
.emplace(definition
.getTemp());
1313 /* update phi affinities */
1314 for (const Operand
& op
: phi
->operands
) {
1315 if (op
.isTemp() && op
.regClass() == phi
->definitions
[0].regClass())
1316 affinities
[op
.tempId()] = definition
.tempId();
1319 instructions
.emplace_back(std::move(*it
));
1322 /* fill in sgpr_live_in */
1323 for (unsigned i
= 0; i
< ctx
.max_used_sgpr
; i
++)
1324 sgpr_live_in
[block
.index
][i
] = register_file
[i
];
1325 sgpr_live_in
[block
.index
][127] = register_file
[scc
.reg
];
1327 /* Handle all other instructions of the block */
1328 for (; it
!= block
.instructions
.end(); ++it
) {
1329 aco_ptr
<Instruction
>& instr
= *it
;
1331 /* parallelcopies from p_phi are inserted here which means
1332 * live ranges of killed operands end here as well */
1333 if (instr
->opcode
== aco_opcode::p_logical_end
) {
1334 /* no need to process this instruction any further */
1335 if (block
.logical_succs
.size() != 1) {
1336 instructions
.emplace_back(std::move(instr
));
1340 Block
& succ
= program
->blocks
[block
.logical_succs
[0]];
1342 for (; idx
< succ
.logical_preds
.size(); idx
++) {
1343 if (succ
.logical_preds
[idx
] == block
.index
)
1346 for (aco_ptr
<Instruction
>& phi
: succ
.instructions
) {
1347 if (phi
->opcode
== aco_opcode::p_phi
) {
1348 if (phi
->operands
[idx
].isTemp() &&
1349 phi
->operands
[idx
].getTemp().type() == RegType::sgpr
&&
1350 phi
->operands
[idx
].isFirstKill()) {
1351 Temp phi_op
= read_variable(phi
->operands
[idx
].getTemp(), block
.index
);
1352 PhysReg reg
= ctx
.assignments
[phi_op
.id()].first
;
1353 assert(register_file
[reg
] == phi_op
.id());
1354 register_file
[reg
] = 0;
1356 } else if (phi
->opcode
!= aco_opcode::p_linear_phi
) {
1360 instructions
.emplace_back(std::move(instr
));
1364 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1366 assert(!is_phi(instr
));
1368 /* handle operands */
1369 for (unsigned i
= 0; i
< instr
->operands
.size(); ++i
) {
1370 auto& operand
= instr
->operands
[i
];
1371 if (!operand
.isTemp())
1374 /* rename operands */
1375 operand
.setTemp(read_variable(operand
.getTemp(), block
.index
));
1377 /* check if the operand is fixed */
1378 if (operand
.isFixed()) {
1380 if (operand
.physReg() == ctx
.assignments
[operand
.tempId()].first
) {
1381 /* we are fine: the operand is already assigned the correct reg */
1384 /* check if target reg is blocked, and move away the blocking var */
1385 if (register_file
[operand
.physReg().reg
]) {
1386 uint32_t blocking_id
= register_file
[operand
.physReg().reg
];
1387 RegClass rc
= ctx
.assignments
[blocking_id
].second
;
1388 Operand pc_op
= Operand(Temp
{blocking_id
, rc
});
1389 pc_op
.setFixed(operand
.physReg());
1390 Definition pc_def
= Definition(Temp
{program
->allocateId(), pc_op
.regClass()});
1392 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.regClass(), parallelcopy
, instr
);
1393 pc_def
.setFixed(reg
);
1394 ctx
.assignments
[pc_def
.tempId()] = {reg
, pc_def
.regClass()};
1395 for (unsigned i
= 0; i
< operand
.size(); i
++) {
1396 register_file
[pc_op
.physReg() + i
] = 0;
1397 register_file
[pc_def
.physReg() + i
] = pc_def
.tempId();
1399 parallelcopy
.emplace_back(pc_op
, pc_def
);
1401 /* handle renames of previous operands */
1402 for (unsigned j
= 0; j
< i
; j
++) {
1403 Operand
& op
= instr
->operands
[j
];
1404 if (op
.isTemp() && op
.tempId() == blocking_id
) {
1405 op
= Operand(pc_def
.getTemp());
1410 /* move operand to fixed reg and create parallelcopy pair */
1411 Operand pc_op
= operand
;
1412 Temp tmp
= Temp
{program
->allocateId(), operand
.regClass()};
1413 Definition pc_def
= Definition(tmp
);
1414 pc_def
.setFixed(operand
.physReg());
1415 pc_op
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1416 operand
.setTemp(tmp
);
1417 ctx
.assignments
[tmp
.id()] = {pc_def
.physReg(), pc_def
.regClass()};
1418 operand
.setFixed(pc_def
.physReg());
1419 for (unsigned i
= 0; i
< operand
.size(); i
++) {
1420 register_file
[pc_op
.physReg() + i
] = 0;
1421 register_file
[pc_def
.physReg() + i
] = tmp
.id();
1423 parallelcopy
.emplace_back(pc_op
, pc_def
);
1426 assert(ctx
.assignments
.find(operand
.tempId()) != ctx
.assignments
.end());
1427 PhysReg reg
= ctx
.assignments
[operand
.tempId()].first
;
1429 if (operand_can_use_reg(instr
, i
, reg
)) {
1430 operand
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1432 Operand pc_op
= operand
;
1433 pc_op
.setFixed(reg
);
1434 PhysReg new_reg
= get_reg(ctx
, register_file
, operand
.regClass(), parallelcopy
, instr
);
1435 Definition pc_def
= Definition(program
->allocateId(), new_reg
, pc_op
.regClass());
1436 ctx
.assignments
[pc_def
.tempId()] = {reg
, pc_def
.regClass()};
1437 for (unsigned i
= 0; i
< operand
.size(); i
++) {
1438 register_file
[pc_op
.physReg() + i
] = 0;
1439 register_file
[pc_def
.physReg() + i
] = pc_def
.tempId();
1441 parallelcopy
.emplace_back(pc_op
, pc_def
);
1442 operand
.setFixed(new_reg
);
1445 if (instr
->format
== Format::EXP
||
1446 (instr
->isVMEM() && i
== 3 && program
->chip_class
== GFX6
) ||
1447 (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
)) {
1448 for (unsigned j
= 0; j
< operand
.size(); j
++)
1449 ctx
.war_hint
.set(operand
.physReg().reg
+ j
);
1452 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.getTemp().id());
1453 if (phi
!= phi_map
.end())
1454 phi
->second
.uses
.emplace(instr
.get());
1457 /* remove dead vars from register file */
1458 for (const Operand
& op
: instr
->operands
) {
1459 if (op
.isTemp() && op
.isFirstKill())
1460 for (unsigned j
= 0; j
< op
.size(); j
++)
1461 register_file
[op
.physReg() + j
] = 0;
1464 /* try to optimize v_mad_f32 -> v_mac_f32 */
1465 if (instr
->opcode
== aco_opcode::v_mad_f32
&&
1466 instr
->operands
[2].isTemp() &&
1467 instr
->operands
[2].isKill() &&
1468 instr
->operands
[2].getTemp().type() == RegType::vgpr
&&
1469 instr
->operands
[1].isTemp() &&
1470 instr
->operands
[1].getTemp().type() == RegType::vgpr
) { /* TODO: swap src0 and src1 in this case */
1471 VOP3A_instruction
* vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
1472 bool can_use_mac
= !(vop3
->abs
[0] || vop3
->abs
[1] || vop3
->abs
[2] ||
1473 vop3
->opsel
[0] || vop3
->opsel
[1] || vop3
->opsel
[2] ||
1474 vop3
->neg
[0] || vop3
->neg
[1] || vop3
->neg
[2] ||
1475 vop3
->clamp
|| vop3
->omod
);
1477 instr
->format
= Format::VOP2
;
1478 instr
->opcode
= aco_opcode::v_mac_f32
;
1482 /* handle definitions which must have the same register as an operand */
1483 if (instr
->opcode
== aco_opcode::v_interp_p2_f32
||
1484 instr
->opcode
== aco_opcode::v_mac_f32
||
1485 instr
->opcode
== aco_opcode::v_writelane_b32
) {
1486 instr
->definitions
[0].setFixed(instr
->operands
[2].physReg());
1487 } else if (instr
->opcode
== aco_opcode::s_addk_i32
||
1488 instr
->opcode
== aco_opcode::s_mulk_i32
) {
1489 instr
->definitions
[0].setFixed(instr
->operands
[0].physReg());
1490 } else if ((instr
->format
== Format::MUBUF
||
1491 instr
->format
== Format::MIMG
) &&
1492 instr
->definitions
.size() == 1 &&
1493 instr
->operands
.size() == 4) {
1494 instr
->definitions
[0].setFixed(instr
->operands
[3].physReg());
1497 ctx
.defs_done
.reset();
1499 /* handle fixed definitions first */
1500 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1501 auto& definition
= instr
->definitions
[i
];
1502 if (!definition
.isFixed())
1505 adjust_max_used_regs(ctx
, definition
.regClass(), definition
.physReg());
1506 /* check if the target register is blocked */
1507 if (register_file
[definition
.physReg().reg
] != 0) {
1508 /* create parallelcopy pair to move blocking var */
1509 Temp tmp
= {register_file
[definition
.physReg()], ctx
.assignments
[register_file
[definition
.physReg()]].second
};
1510 Operand pc_op
= Operand(tmp
);
1511 pc_op
.setFixed(ctx
.assignments
[register_file
[definition
.physReg().reg
]].first
);
1512 RegClass rc
= pc_op
.regClass();
1513 tmp
= Temp
{program
->allocateId(), rc
};
1514 Definition pc_def
= Definition(tmp
);
1516 /* re-enable the killed operands, so that we don't move the blocking var there */
1517 for (const Operand
& op
: instr
->operands
) {
1518 if (op
.isTemp() && op
.isFirstKill())
1519 for (unsigned j
= 0; j
< op
.size(); j
++)
1520 register_file
[op
.physReg() + j
] = 0xFFFF;
1523 /* find a new register for the blocking variable */
1524 PhysReg reg
= get_reg(ctx
, register_file
, rc
, parallelcopy
, instr
);
1525 /* once again, disable killed operands */
1526 for (const Operand
& op
: instr
->operands
) {
1527 if (op
.isTemp() && op
.isFirstKill())
1528 for (unsigned j
= 0; j
< op
.size(); j
++)
1529 register_file
[op
.physReg() + j
] = 0;
1531 for (unsigned k
= 0; k
< i
; k
++) {
1532 if (instr
->definitions
[k
].isTemp() && ctx
.defs_done
.test(k
) && !instr
->definitions
[k
].isKill())
1533 for (unsigned j
= 0; j
< instr
->definitions
[k
].size(); j
++)
1534 register_file
[instr
->definitions
[k
].physReg() + j
] = instr
->definitions
[k
].tempId();
1536 pc_def
.setFixed(reg
);
1538 /* finish assignment of parallelcopy */
1539 ctx
.assignments
[pc_def
.tempId()] = {reg
, pc_def
.regClass()};
1540 parallelcopy
.emplace_back(pc_op
, pc_def
);
1542 /* add changes to reg_file */
1543 for (unsigned i
= 0; i
< pc_op
.size(); i
++) {
1544 register_file
[pc_op
.physReg() + i
] = 0x0;
1545 register_file
[pc_def
.physReg() + i
] = pc_def
.tempId();
1548 ctx
.defs_done
.set(i
);
1550 if (!definition
.isTemp())
1553 /* set live if it has a kill point */
1554 if (!definition
.isKill())
1555 live
.emplace(definition
.getTemp());
1557 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1558 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1559 for (unsigned j
= 0; j
< definition
.size(); j
++)
1560 register_file
[definition
.physReg() + j
] = definition
.tempId();
1563 /* handle all other definitions */
1564 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1565 auto& definition
= instr
->definitions
[i
];
1567 if (definition
.isFixed() || !definition
.isTemp())
1571 if (definition
.hasHint() && register_file
[definition
.physReg().reg
] == 0)
1572 definition
.setFixed(definition
.physReg());
1573 else if (instr
->opcode
== aco_opcode::p_split_vector
) {
1574 PhysReg reg
= PhysReg
{instr
->operands
[0].physReg() + i
* definition
.size()};
1575 if (!get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1576 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1577 definition
.setFixed(reg
);
1578 } else if (instr
->opcode
== aco_opcode::p_wqm
) {
1580 if (instr
->operands
[0].isKill() && instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1581 reg
= instr
->operands
[0].physReg();
1582 assert(register_file
[reg
.reg
] == 0);
1584 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1586 definition
.setFixed(reg
);
1587 } else if (instr
->opcode
== aco_opcode::p_extract_vector
) {
1589 if (instr
->operands
[0].isKill() &&
1590 instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1591 reg
= instr
->operands
[0].physReg();
1592 reg
.reg
+= definition
.size() * instr
->operands
[1].constantValue();
1593 assert(register_file
[reg
.reg
] == 0);
1595 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1597 definition
.setFixed(reg
);
1598 } else if (instr
->opcode
== aco_opcode::p_create_vector
) {
1599 PhysReg reg
= get_reg_create_vector(ctx
, register_file
, definition
.regClass(),
1600 parallelcopy
, instr
);
1601 definition
.setFixed(reg
);
1602 } else if (affinities
.find(definition
.tempId()) != affinities
.end() &&
1603 ctx
.assignments
.find(affinities
[definition
.tempId()]) != ctx
.assignments
.end()) {
1604 PhysReg reg
= ctx
.assignments
[affinities
[definition
.tempId()]].first
;
1605 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1606 definition
.setFixed(reg
);
1608 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1610 } else if (vectors
.find(definition
.tempId()) != vectors
.end()) {
1611 Instruction
* vec
= vectors
[definition
.tempId()];
1612 unsigned offset
= 0;
1613 for (const Operand
& op
: vec
->operands
) {
1614 if (op
.isTemp() && op
.tempId() == definition
.tempId())
1617 offset
+= op
.size();
1620 for (const Operand
& op
: vec
->operands
) {
1622 op
.tempId() != definition
.tempId() &&
1623 op
.getTemp().type() == definition
.getTemp().type() &&
1624 ctx
.assignments
.find(op
.tempId()) != ctx
.assignments
.end()) {
1625 PhysReg reg
= ctx
.assignments
[op
.tempId()].first
;
1626 reg
.reg
= reg
- k
+ offset
;
1627 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
)) {
1628 definition
.setFixed(reg
);
1634 if (!definition
.isFixed()) {
1635 std::pair
<PhysReg
, bool> res
= get_reg_vec(ctx
, register_file
, vec
->definitions
[0].regClass());
1636 PhysReg reg
= res
.first
;
1640 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1642 definition
.setFixed(reg
);
1645 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1647 assert(definition
.isFixed() && ((definition
.getTemp().type() == RegType::vgpr
&& definition
.physReg() >= 256) ||
1648 (definition
.getTemp().type() != RegType::vgpr
&& definition
.physReg() < 256)));
1649 ctx
.defs_done
.set(i
);
1651 /* set live if it has a kill point */
1652 if (!definition
.isKill())
1653 live
.emplace(definition
.getTemp());
1655 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1656 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1657 for (unsigned j
= 0; j
< definition
.size(); j
++)
1658 register_file
[definition
.physReg() + j
] = definition
.tempId();
1661 handle_pseudo(ctx
, register_file
, instr
.get());
1663 /* kill definitions */
1664 for (const Definition
& def
: instr
->definitions
) {
1665 if (def
.isTemp() && def
.isKill()) {
1666 for (unsigned j
= 0; j
< def
.size(); j
++) {
1667 register_file
[def
.physReg() + j
] = 0;
1672 /* emit parallelcopy */
1673 if (!parallelcopy
.empty()) {
1674 aco_ptr
<Pseudo_instruction
> pc
;
1675 pc
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, parallelcopy
.size(), parallelcopy
.size()));
1676 bool temp_in_scc
= register_file
[scc
.reg
];
1677 bool sgpr_operands_alias_defs
= false;
1678 uint64_t sgpr_operands
[4] = {0, 0, 0, 0};
1679 for (unsigned i
= 0; i
< parallelcopy
.size(); i
++) {
1680 if (temp_in_scc
&& parallelcopy
[i
].first
.isTemp() && parallelcopy
[i
].first
.getTemp().type() == RegType::sgpr
) {
1681 if (!sgpr_operands_alias_defs
) {
1682 unsigned reg
= parallelcopy
[i
].first
.physReg().reg
;
1683 unsigned size
= parallelcopy
[i
].first
.getTemp().size();
1684 sgpr_operands
[reg
/ 64u] |= ((1u << size
) - 1) << (reg
% 64u);
1686 reg
= parallelcopy
[i
].second
.physReg().reg
;
1687 size
= parallelcopy
[i
].second
.getTemp().size();
1688 if (sgpr_operands
[reg
/ 64u] & ((1u << size
) - 1) << (reg
% 64u))
1689 sgpr_operands_alias_defs
= true;
1693 pc
->operands
[i
] = parallelcopy
[i
].first
;
1694 pc
->definitions
[i
] = parallelcopy
[i
].second
;
1696 /* it might happen that the operand is already renamed. we have to restore the original name. */
1697 std::map
<unsigned, Temp
>::iterator it
= ctx
.orig_names
.find(pc
->operands
[i
].tempId());
1698 if (it
!= ctx
.orig_names
.end())
1699 pc
->operands
[i
].setTemp(it
->second
);
1700 unsigned orig_id
= pc
->operands
[i
].tempId();
1701 ctx
.orig_names
[pc
->definitions
[i
].tempId()] = pc
->operands
[i
].getTemp();
1703 pc
->operands
[i
].setTemp(read_variable(pc
->operands
[i
].getTemp(), block
.index
));
1704 renames
[block
.index
][orig_id
] = pc
->definitions
[i
].getTemp();
1705 renames
[block
.index
][pc
->definitions
[i
].tempId()] = pc
->definitions
[i
].getTemp();
1706 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(pc
->operands
[i
].tempId());
1707 if (phi
!= phi_map
.end())
1708 phi
->second
.uses
.emplace(pc
.get());
1711 if (temp_in_scc
&& sgpr_operands_alias_defs
) {
1712 /* disable definitions and re-enable operands */
1713 for (const Definition
& def
: instr
->definitions
) {
1714 if (def
.isTemp() && !def
.isKill()) {
1715 for (unsigned j
= 0; j
< def
.size(); j
++) {
1716 register_file
[def
.physReg() + j
] = 0x0;
1720 for (const Operand
& op
: instr
->operands
) {
1721 if (op
.isTemp() && op
.isFirstKill()) {
1722 for (unsigned j
= 0; j
< op
.size(); j
++)
1723 register_file
[op
.physReg() + j
] = 0xFFFF;
1727 handle_pseudo(ctx
, register_file
, pc
.get());
1729 /* re-enable live vars */
1730 for (const Operand
& op
: instr
->operands
) {
1731 if (op
.isTemp() && op
.isFirstKill())
1732 for (unsigned j
= 0; j
< op
.size(); j
++)
1733 register_file
[op
.physReg() + j
] = 0x0;
1735 for (const Definition
& def
: instr
->definitions
) {
1736 if (def
.isTemp() && !def
.isKill()) {
1737 for (unsigned j
= 0; j
< def
.size(); j
++) {
1738 register_file
[def
.physReg() + j
] = def
.tempId();
1743 pc
->tmp_in_scc
= false;
1746 instructions
.emplace_back(std::move(pc
));
1749 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1750 bool instr_needs_vop3
= !instr
->isVOP3() &&
1751 ((instr
->format
== Format::VOPC
&& !(instr
->definitions
[0].physReg() == vcc
)) ||
1752 (instr
->opcode
== aco_opcode::v_cndmask_b32
&& !(instr
->operands
[2].physReg() == vcc
)) ||
1753 ((instr
->opcode
== aco_opcode::v_add_co_u32
||
1754 instr
->opcode
== aco_opcode::v_addc_co_u32
||
1755 instr
->opcode
== aco_opcode::v_sub_co_u32
||
1756 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1757 instr
->opcode
== aco_opcode::v_subrev_co_u32
||
1758 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1759 !(instr
->definitions
[1].physReg() == vcc
)) ||
1760 ((instr
->opcode
== aco_opcode::v_addc_co_u32
||
1761 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1762 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1763 !(instr
->operands
[2].physReg() == vcc
)));
1764 if (instr_needs_vop3
) {
1766 /* if the first operand is a literal, we have to move it to a reg */
1767 if (instr
->operands
.size() && instr
->operands
[0].isLiteral()) {
1768 bool can_sgpr
= true;
1769 /* check, if we have to move to vgpr */
1770 for (const Operand
& op
: instr
->operands
) {
1771 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
1776 aco_ptr
<Instruction
> mov
;
1778 mov
.reset(create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1));
1780 mov
.reset(create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1));
1781 mov
->operands
[0] = instr
->operands
[0];
1782 Temp tmp
= {program
->allocateId(), can_sgpr
? s1
: v1
};
1783 mov
->definitions
[0] = Definition(tmp
);
1784 /* disable definitions and re-enable operands */
1785 for (const Definition
& def
: instr
->definitions
) {
1786 for (unsigned j
= 0; j
< def
.size(); j
++) {
1787 register_file
[def
.physReg() + j
] = 0x0;
1790 for (const Operand
& op
: instr
->operands
) {
1791 if (op
.isTemp() && op
.isFirstKill()) {
1792 for (unsigned j
= 0; j
< op
.size(); j
++)
1793 register_file
[op
.physReg() + j
] = 0xFFFF;
1796 mov
->definitions
[0].setFixed(get_reg(ctx
, register_file
, tmp
.regClass(), parallelcopy
, mov
));
1797 instr
->operands
[0] = Operand(tmp
);
1798 instr
->operands
[0].setFixed(mov
->definitions
[0].physReg());
1799 instructions
.emplace_back(std::move(mov
));
1800 /* re-enable live vars */
1801 for (const Operand
& op
: instr
->operands
) {
1802 if (op
.isTemp() && op
.isFirstKill())
1803 for (unsigned j
= 0; j
< op
.size(); j
++)
1804 register_file
[op
.physReg() + j
] = 0x0;
1806 for (const Definition
& def
: instr
->definitions
) {
1807 if (def
.isTemp() && !def
.isKill()) {
1808 for (unsigned j
= 0; j
< def
.size(); j
++) {
1809 register_file
[def
.physReg() + j
] = def
.tempId();
1815 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1816 aco_ptr
<Instruction
> tmp
= std::move(instr
);
1817 Format format
= asVOP3(tmp
->format
);
1818 instr
.reset(create_instruction
<VOP3A_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
1819 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1820 Operand
& operand
= tmp
->operands
[i
];
1821 instr
->operands
[i
] = operand
;
1822 /* keep phi_map up to date */
1823 if (operand
.isTemp()) {
1824 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.tempId());
1825 if (phi
!= phi_map
.end()) {
1826 phi
->second
.uses
.erase(tmp
.get());
1827 phi
->second
.uses
.emplace(instr
.get());
1831 std::copy(tmp
->definitions
.begin(), tmp
->definitions
.end(), instr
->definitions
.begin());
1833 instructions
.emplace_back(std::move(*it
));
1835 } /* end for Instr */
1837 block
.instructions
= std::move(instructions
);
1839 filled
[block
.index
] = true;
1840 for (unsigned succ_idx
: block
.linear_succs
) {
1841 Block
& succ
= program
->blocks
[succ_idx
];
1842 /* seal block if all predecessors are filled */
1843 bool all_filled
= true;
1844 for (unsigned pred_idx
: succ
.linear_preds
) {
1845 if (!filled
[pred_idx
]) {
1851 /* finish incomplete phis and check if they became trivial */
1852 for (Instruction
* phi
: incomplete_phis
[succ_idx
]) {
1853 std::vector
<unsigned> preds
= phi
->definitions
[0].getTemp().is_linear() ? succ
.linear_preds
: succ
.logical_preds
;
1854 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
1855 phi
->operands
[i
].setTemp(read_variable(phi
->operands
[i
].getTemp(), preds
[i
]));
1856 phi
->operands
[i
].setFixed(ctx
.assignments
[phi
->operands
[i
].tempId()].first
);
1858 try_remove_trivial_phi(phi_map
.find(phi
->definitions
[0].tempId()));
1860 /* complete the original phi nodes, but no need to check triviality */
1861 for (aco_ptr
<Instruction
>& instr
: succ
.instructions
) {
1864 std::vector
<unsigned> preds
= instr
->opcode
== aco_opcode::p_phi
? succ
.logical_preds
: succ
.linear_preds
;
1866 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1867 auto& operand
= instr
->operands
[i
];
1868 if (!operand
.isTemp())
1870 operand
.setTemp(read_variable(operand
.getTemp(), preds
[i
]));
1871 operand
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1872 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.getTemp().id());
1873 if (phi
!= phi_map
.end())
1874 phi
->second
.uses
.emplace(instr
.get());
1877 sealed
[succ_idx
] = true;
1882 /* remove trivial phis */
1883 for (Block
& block
: program
->blocks
) {
1884 auto end
= std::find_if(block
.instructions
.begin(), block
.instructions
.end(),
1885 [](aco_ptr
<Instruction
>& instr
) { return !is_phi(instr
);});
1886 auto middle
= std::remove_if(block
.instructions
.begin(), end
,
1887 [](const aco_ptr
<Instruction
>& instr
) { return instr
->definitions
.empty();});
1888 block
.instructions
.erase(middle
, end
);
1891 /* find scc spill registers which may be needed for parallelcopies created by phis */
1892 for (Block
& block
: program
->blocks
) {
1893 if (block
.linear_preds
.size() <= 1)
1896 std::bitset
<128> regs
= sgpr_live_in
[block
.index
];
1900 /* choose a register */
1902 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& regs
[reg
]; reg
++)
1904 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
1905 adjust_max_used_regs(ctx
, s1
, reg
);
1907 /* update predecessors */
1908 for (unsigned& pred_index
: block
.linear_preds
) {
1909 Block
& pred
= program
->blocks
[pred_index
];
1910 pred
.scc_live_out
= true;
1911 pred
.scratch_sgpr
= PhysReg
{(uint16_t)reg
};
1915 /* num_gpr = rnd_up(max_used_gpr + 1) */
1916 program
->config
->num_vgprs
= (ctx
.max_used_vgpr
+ 1 + 3) & ~3;
1917 if (program
->family
== CHIP_TONGA
|| program
->family
== CHIP_ICELAND
) {
1918 assert(ctx
.max_used_sgpr
<= 93);
1919 ctx
.max_used_sgpr
= 93; /* workaround hardware bug */
1921 program
->config
->num_sgprs
= (ctx
.max_used_sgpr
+ 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */