2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
32 #include <unordered_map>
37 #include "util/u_math.h"
/* ra_ctx — mutable state for one register-allocation run.
 * NOTE(review): this view is fragmentary — the struct's opening lines (and the
 * `program` member that the constructor initializes) were dropped by extraction;
 * the stray leading integers are original-line-number artifacts, not code. */
/* one hint bit per register slot; get_reg_simple skips slots whose bit is set
 * (presumably a write-after-read avoidance hint — confirm against full file) */
43 std::bitset
<512> war_hint
;
/* temp id -> (assigned physical register, its register class) */
45 std::unordered_map
<unsigned, std::pair
<PhysReg
, RegClass
>> assignments
;
/* renamed temp id -> the temp it was renamed from (used for debug printing
 * and trivial-phi cleanup below) */
46 std::map
<unsigned, Temp
> orig_names
;
/* highest sgpr/vgpr index handed out so far (see adjust_max_used_regs) */
47 unsigned max_used_sgpr
= 0;
48 unsigned max_used_vgpr
= 0;
/* bit i is tested for definition i of the current instruction
 * (see the instr->definitions loops using ctx.defs_done.test(i)) */
49 std::bitset
<64> defs_done
; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
/* keeps a non-owning pointer to the program being allocated */
51 ra_ctx(Program
* program
) : program(program
) {}
/* RegisterFile fragment: backing storage of the allocator's register map.
 * NOTE(review): the enclosing struct's header lines are missing from this view;
 * the stray leading integers are original-line-number artifacts. */
/* every dword slot starts out free (0) */
56 RegisterFile() {regs
.fill(0);}
/* one uint32_t per dword register slot: 0 = free, 0xFFFFFFFF = blocked,
 * 0xF0000000 = occupied at subdword granularity (see fill_subdword/test),
 * any other value = the occupying temp's id (see fill(Operand)/fill(Definition)) */
58 std::array
<uint32_t, 512> regs
;
/* per-dword byte occupancy for subdword allocations, keyed by register index */
59 std::map
<uint32_t, std::array
<uint32_t, 4>> subdword_regs
;
/* Read-only and mutable access to one dword slot.
 * NOTE(review): the bodies (original lines 62-64 and 66-68) were dropped by
 * extraction — presumably they index `regs` (possibly after an assert on
 * `index`); confirm against the full file. */
61 const uint32_t& operator [] (unsigned index
) const {
65 uint32_t& operator [] (unsigned index
) {
69 unsigned count_zero(PhysReg start
, unsigned size
) {
71 for (unsigned i
= 0; i
< size
; i
++)
72 res
+= !regs
[start
+ i
];
76 bool test(PhysReg start
, unsigned num_bytes
) {
77 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
78 if (regs
[i
] & 0x0FFFFFFF)
80 if (regs
[i
] == 0xF0000000) {
81 assert(subdword_regs
.find(i
) != subdword_regs
.end());
82 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++) {
83 if (subdword_regs
[i
][j
])
91 void fill(PhysReg start
, unsigned size
, uint32_t val
) {
92 for (unsigned i
= 0; i
< size
; i
++)
93 regs
[start
+ i
] = val
;
96 void fill_subdword(PhysReg start
, unsigned num_bytes
, uint32_t val
) {
97 fill(start
, DIV_ROUND_UP(num_bytes
, 4), 0xF0000000);
98 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
100 std::array
<uint32_t, 4>& sub
= subdword_regs
.emplace(i
, std::array
<uint32_t, 4>{0, 0, 0, 0}).first
->second
;
101 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++)
104 if (sub
== std::array
<uint32_t, 4>{0, 0, 0, 0}) {
105 subdword_regs
.erase(i
);
111 void block(PhysReg start
, unsigned num_bytes
) {
112 if (start
.byte() || num_bytes
% 4)
113 fill_subdword(start
, num_bytes
, 0xFFFFFFFF);
115 fill(start
, num_bytes
/ 4, 0xFFFFFFFF);
118 bool is_blocked(PhysReg start
) {
119 if (regs
[start
] == 0xFFFFFFFF)
121 if (regs
[start
] == 0xF0000000) {
122 for (unsigned i
= start
.byte(); i
< 4; i
++)
123 if (subdword_regs
[start
][i
] == 0xFFFFFFFF)
129 void clear(PhysReg start
, RegClass rc
) {
130 if (rc
.is_subdword())
131 fill_subdword(start
, rc
.bytes(), 0);
133 fill(start
, rc
.size(), 0);
136 void fill(Operand op
) {
137 if (op
.regClass().is_subdword())
138 fill_subdword(op
.physReg(), op
.bytes(), op
.tempId());
140 fill(op
.physReg(), op
.size(), op
.tempId());
143 void clear(Operand op
) {
144 clear(op
.physReg(), op
.regClass());
147 void fill(Definition def
) {
148 if (def
.regClass().is_subdword())
149 fill_subdword(def
.physReg(), def
.bytes(), def
.tempId());
151 fill(def
.physReg(), def
.size(), def
.tempId());
154 void clear(Definition def
) {
155 clear(def
.physReg(), def
.regClass());
160 /* helper function for debugging */
162 void print_regs(ra_ctx
& ctx
, bool vgprs
, RegisterFile
& reg_file
)
164 unsigned max
= vgprs
? ctx
.program
->max_reg_demand
.vgpr
: ctx
.program
->max_reg_demand
.sgpr
;
165 unsigned lb
= vgprs
? 256 : 0;
166 unsigned ub
= lb
+ max
;
167 char reg_char
= vgprs
? 'v' : 's';
171 for (unsigned i
= lb
; i
< ub
; i
+= 3) {
172 printf("%.2u ", i
- lb
);
177 printf("%cgprs: ", reg_char
);
178 unsigned free_regs
= 0;
180 bool char_select
= false;
181 for (unsigned i
= lb
; i
< ub
; i
++) {
182 if (reg_file
[i
] == 0xFFFF) {
184 } else if (reg_file
[i
]) {
185 if (reg_file
[i
] != prev
) {
187 char_select
= !char_select
;
189 printf(char_select
? "#" : "@");
197 printf("%u/%u used, %u/%u free\n", max
- free_regs
, max
, free_regs
, max
);
199 /* print assignments */
202 for (unsigned i
= lb
; i
< ub
; i
++) {
203 if (reg_file
[i
] != prev
) {
204 if (prev
&& size
> 1)
205 printf("-%d]\n", i
- 1 - lb
);
209 if (prev
&& prev
!= 0xFFFF) {
210 if (ctx
.orig_names
.count(reg_file
[i
]) && ctx
.orig_names
[reg_file
[i
]].id() != reg_file
[i
])
211 printf("%%%u (was %%%d) = %c[%d", reg_file
[i
], ctx
.orig_names
[reg_file
[i
]].id(), reg_char
, i
- lb
);
213 printf("%%%u = %c[%d", reg_file
[i
], reg_char
, i
- lb
);
220 if (prev
&& size
> 1)
221 printf("-%d]\n", ub
- lb
- 1);
228 void adjust_max_used_regs(ra_ctx
& ctx
, RegClass rc
, unsigned reg
)
230 unsigned max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
231 unsigned size
= rc
.size();
232 if (rc
.type() == RegType::vgpr
) {
234 unsigned hi
= reg
- 256 + size
- 1;
235 ctx
.max_used_vgpr
= std::max(ctx
.max_used_vgpr
, hi
);
236 } else if (reg
+ rc
.size() <= max_addressible_sgpr
) {
237 unsigned hi
= reg
+ size
- 1;
238 ctx
.max_used_sgpr
= std::max(ctx
.max_used_sgpr
, std::min(hi
, max_addressible_sgpr
));
243 void update_renames(ra_ctx
& ctx
, RegisterFile
& reg_file
,
244 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
245 aco_ptr
<Instruction
>& instr
)
247 /* allocate id's and rename operands: this is done transparently here */
248 for (std::pair
<Operand
, Definition
>& copy
: parallelcopies
) {
249 /* the definitions with id are not from this function and already handled */
250 if (copy
.second
.isTemp())
253 /* check if we we moved another parallelcopy definition */
254 for (std::pair
<Operand
, Definition
>& other
: parallelcopies
) {
255 if (!other
.second
.isTemp())
257 if (copy
.first
.getTemp() == other
.second
.getTemp()) {
258 copy
.first
.setTemp(other
.first
.getTemp());
259 copy
.first
.setFixed(other
.first
.physReg());
262 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
263 copy
.second
.setTemp(Temp(ctx
.program
->allocateId(), copy
.second
.regClass()));
264 ctx
.assignments
[copy
.second
.tempId()] = {copy
.second
.physReg(), copy
.second
.regClass()};
265 reg_file
.fill(copy
.second
);
267 /* check if we moved an operand */
268 for (Operand
& op
: instr
->operands
) {
271 if (op
.tempId() == copy
.first
.tempId()) {
272 bool omit_renaming
= instr
->opcode
== aco_opcode::p_create_vector
&& !op
.isKillBeforeDef();
273 for (std::pair
<Operand
, Definition
>& pc
: parallelcopies
) {
274 PhysReg def_reg
= pc
.second
.physReg();
275 omit_renaming
&= def_reg
> copy
.first
.physReg() ?
276 (copy
.first
.physReg() + copy
.first
.size() <= def_reg
.reg()) :
277 (def_reg
+ pc
.second
.size() <= copy
.first
.physReg().reg());
281 op
.setTemp(copy
.second
.getTemp());
282 op
.setFixed(copy
.second
.physReg());
288 std::pair
<PhysReg
, bool> get_reg_simple(ra_ctx
& ctx
,
289 RegisterFile
& reg_file
,
290 uint32_t lb
, uint32_t ub
,
291 uint32_t size
, uint32_t stride
,
294 /* best fit algorithm: find the smallest gap to fit in the variable */
296 unsigned best_pos
= 0xFFFF;
297 unsigned gap_size
= 0xFFFF;
298 unsigned next_pos
= 0xFFFF;
300 for (unsigned current_reg
= lb
; current_reg
< ub
; current_reg
++) {
301 if (reg_file
[current_reg
] != 0 || ctx
.war_hint
[current_reg
]) {
302 if (next_pos
== 0xFFFF)
305 /* check if the variable fits */
306 if (next_pos
+ size
> current_reg
) {
311 /* check if the tested gap is smaller */
312 if (current_reg
- next_pos
< gap_size
) {
314 gap_size
= current_reg
- next_pos
;
320 if (next_pos
== 0xFFFF)
321 next_pos
= current_reg
;
325 if (next_pos
!= 0xFFFF &&
326 next_pos
+ size
<= ub
&&
327 ub
- next_pos
< gap_size
) {
329 gap_size
= ub
- next_pos
;
331 if (best_pos
!= 0xFFFF) {
332 adjust_max_used_regs(ctx
, rc
, best_pos
);
333 return {PhysReg
{best_pos
}, true};
339 unsigned reg_lo
= lb
;
340 unsigned reg_hi
= lb
+ size
- 1;
341 while (!found
&& reg_lo
+ size
<= ub
) {
342 if (reg_file
[reg_lo
] != 0) {
346 reg_hi
= reg_lo
+ size
- 1;
348 for (unsigned reg
= reg_lo
+ 1; found
&& reg
<= reg_hi
; reg
++) {
349 if (reg_file
[reg
] != 0 || ctx
.war_hint
[reg
])
353 adjust_max_used_regs(ctx
, rc
, reg_lo
);
354 return {PhysReg
{reg_lo
}, true};
363 bool get_regs_for_copies(ra_ctx
& ctx
,
364 RegisterFile
& reg_file
,
365 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
366 const std::set
<std::pair
<unsigned, unsigned>> &vars
,
367 uint32_t lb
, uint32_t ub
,
368 aco_ptr
<Instruction
>& instr
,
373 /* variables are sorted from small sized to large */
374 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
375 for (std::set
<std::pair
<unsigned, unsigned>>::const_reverse_iterator it
= vars
.rbegin(); it
!= vars
.rend(); ++it
) {
376 unsigned id
= it
->second
;
377 std::pair
<PhysReg
, RegClass
> var
= ctx
.assignments
[id
];
378 uint32_t size
= it
->first
;
380 if (var
.second
.type() == RegType::sgpr
) {
387 /* check if this is a dead operand, then we can re-use the space from the definition */
388 bool is_dead_operand
= false;
389 for (unsigned i
= 0; !is_phi(instr
) && !is_dead_operand
&& (i
< instr
->operands
.size()); i
++) {
390 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isKillBeforeDef() && instr
->operands
[i
].tempId() == id
)
391 is_dead_operand
= true;
394 std::pair
<PhysReg
, bool> res
;
395 if (is_dead_operand
) {
396 if (instr
->opcode
== aco_opcode::p_create_vector
) {
397 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
398 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
399 for (unsigned j
= 0; j
< size
; j
++)
400 assert(reg_file
[def_reg_lo
+ offset
+ j
] == 0);
401 res
= {PhysReg
{def_reg_lo
+ offset
}, true};
406 res
= get_reg_simple(ctx
, reg_file
, def_reg_lo
, def_reg_hi
+ 1, size
, stride
, var
.second
);
409 res
= get_reg_simple(ctx
, reg_file
, lb
, def_reg_lo
, size
, stride
, var
.second
);
411 unsigned lb
= (def_reg_hi
+ stride
) & ~(stride
- 1);
412 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, var
.second
);
417 /* mark the area as blocked */
418 reg_file
.block(res
.first
, var
.second
.bytes());
420 /* create parallelcopy pair (without definition id) */
421 Temp tmp
= Temp(id
, var
.second
);
422 Operand pc_op
= Operand(tmp
);
423 pc_op
.setFixed(var
.first
);
424 Definition pc_def
= Definition(res
.first
, pc_op
.regClass());
425 parallelcopies
.emplace_back(pc_op
, pc_def
);
429 unsigned best_pos
= lb
;
430 unsigned num_moves
= 0xFF;
431 unsigned num_vars
= 0;
433 /* we use a sliding window to find potential positions */
434 unsigned reg_lo
= lb
;
435 unsigned reg_hi
= lb
+ size
- 1;
436 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
437 if (!is_dead_operand
&& ((reg_lo
>= def_reg_lo
&& reg_lo
<= def_reg_hi
) ||
438 (reg_hi
>= def_reg_lo
&& reg_hi
<= def_reg_hi
)))
441 /* second, check that we have at most k=num_moves elements in the window
442 * and no element is larger than the currently processed one */
445 unsigned last_var
= 0;
447 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
448 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
451 if (reg_file
.is_blocked(PhysReg
{j
}) || k
> num_moves
) {
455 /* we cannot split live ranges of linear vgprs */
456 if (ctx
.assignments
[reg_file
[j
]].second
& (1 << 6)) {
460 bool is_kill
= false;
461 for (const Operand
& op
: instr
->operands
) {
462 if (op
.isTemp() && op
.isKillBeforeDef() && op
.tempId() == reg_file
[j
]) {
467 if (!is_kill
&& ctx
.assignments
[reg_file
[j
]].second
.size() >= size
) {
472 k
+= ctx
.assignments
[reg_file
[j
]].second
.size();
473 last_var
= reg_file
[j
];
475 if (k
> num_moves
|| (k
== num_moves
&& n
<= num_vars
)) {
488 /* FIXME: we messed up and couldn't find space for the variables to be copied */
489 if (num_moves
== 0xFF)
493 reg_hi
= best_pos
+ size
- 1;
495 /* collect variables and block reg file */
496 std::set
<std::pair
<unsigned, unsigned>> new_vars
;
497 for (unsigned j
= reg_lo
; j
<= reg_hi
; j
++) {
498 if (reg_file
[j
] != 0) {
499 unsigned size
= ctx
.assignments
[reg_file
[j
]].second
.size();
500 unsigned id
= reg_file
[j
];
501 new_vars
.emplace(size
, id
);
502 reg_file
.clear(ctx
.assignments
[id
].first
, ctx
.assignments
[id
].second
);
506 /* mark the area as blocked */
507 reg_file
.block(PhysReg
{reg_lo
}, size
* 4);
509 if (!get_regs_for_copies(ctx
, reg_file
, parallelcopies
, new_vars
, lb
, ub
, instr
, def_reg_lo
, def_reg_hi
))
512 adjust_max_used_regs(ctx
, var
.second
, reg_lo
);
514 /* create parallelcopy pair (without definition id) */
515 Temp tmp
= Temp(id
, var
.second
);
516 Operand pc_op
= Operand(tmp
);
517 pc_op
.setFixed(var
.first
);
518 Definition pc_def
= Definition(PhysReg
{reg_lo
}, pc_op
.regClass());
519 parallelcopies
.emplace_back(pc_op
, pc_def
);
526 std::pair
<PhysReg
, bool> get_reg_impl(ra_ctx
& ctx
,
527 RegisterFile
& reg_file
,
528 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
529 uint32_t lb
, uint32_t ub
,
530 uint32_t size
, uint32_t stride
,
532 aco_ptr
<Instruction
>& instr
)
534 /* check how many free regs we have */
535 unsigned regs_free
= reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
);
537 /* mark and count killed operands */
538 unsigned killed_ops
= 0;
539 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
540 if (instr
->operands
[j
].isTemp() &&
541 instr
->operands
[j
].isFirstKillBeforeDef() &&
542 instr
->operands
[j
].physReg() >= lb
&&
543 instr
->operands
[j
].physReg() < ub
) {
544 assert(instr
->operands
[j
].isFixed());
545 assert(!reg_file
.test(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes()));
546 reg_file
.block(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes());
547 killed_ops
+= instr
->operands
[j
].getTemp().size();
551 assert(regs_free
>= size
);
552 /* we might have to move dead operands to dst in order to make space */
553 unsigned op_moves
= 0;
555 if (size
> (regs_free
- killed_ops
))
556 op_moves
= size
- (regs_free
- killed_ops
);
558 /* find the best position to place the definition */
559 unsigned best_pos
= lb
;
560 unsigned num_moves
= 0xFF;
561 unsigned num_vars
= 0;
563 /* we use a sliding window to check potential positions */
564 unsigned reg_lo
= lb
;
565 unsigned reg_hi
= lb
+ size
- 1;
566 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
567 /* first check the edges: this is what we have to fix to allow for num_moves > size */
568 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
570 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
573 /* second, check that we have at most k=num_moves elements in the window
574 * and no element is larger than the currently processed one */
575 unsigned k
= op_moves
;
577 unsigned remaining_op_moves
= op_moves
;
578 unsigned last_var
= 0;
580 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
581 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
582 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
585 /* dead operands effectively reduce the number of estimated moves */
586 if (remaining_op_moves
&& reg_file
.is_blocked(PhysReg
{j
})) {
588 remaining_op_moves
--;
592 if (ctx
.assignments
[reg_file
[j
]].second
.size() >= size
) {
598 /* we cannot split live ranges of linear vgprs */
599 if (ctx
.assignments
[reg_file
[j
]].second
& (1 << 6)) {
604 k
+= ctx
.assignments
[reg_file
[j
]].second
.size();
606 last_var
= reg_file
[j
];
609 if (!found
|| k
> num_moves
)
611 if (k
== num_moves
&& n
< num_vars
)
613 if (!aligned
&& k
== num_moves
&& n
== num_vars
)
623 if (num_moves
== 0xFF) {
624 /* remove killed operands from reg_file once again */
625 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
626 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef())
627 reg_file
.clear(instr
->operands
[i
]);
629 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
630 Definition def
= instr
->definitions
[i
];
631 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
637 RegisterFile register_file
= reg_file
;
639 /* now, we figured the placement for our definition */
640 std::set
<std::pair
<unsigned, unsigned>> vars
;
641 for (unsigned j
= best_pos
; j
< best_pos
+ size
; j
++) {
642 if (reg_file
[j
] != 0xFFFFFFFF && reg_file
[j
] != 0)
643 vars
.emplace(ctx
.assignments
[reg_file
[j
]].second
.size(), reg_file
[j
]);
644 reg_file
.clear(ctx
.assignments
[reg_file
[j
]].first
, ctx
.assignments
[reg_file
[j
]].second
);
647 if (instr
->opcode
== aco_opcode::p_create_vector
) {
648 /* move killed operands which aren't yet at the correct position */
649 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
650 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef() &&
651 instr
->operands
[i
].getTemp().type() == rc
.type()) {
653 if (instr
->operands
[i
].physReg() != best_pos
+ offset
) {
654 vars
.emplace(instr
->operands
[i
].size(), instr
->operands
[i
].tempId());
655 reg_file
.clear(instr
->operands
[i
]);
657 reg_file
.fill(instr
->operands
[i
]);
662 /* re-enable the killed operands */
663 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
664 if (instr
->operands
[j
].isTemp() && instr
->operands
[j
].isFirstKill())
665 reg_file
.fill(instr
->operands
[j
]);
669 std::vector
<std::pair
<Operand
, Definition
>> pc
;
670 if (!get_regs_for_copies(ctx
, reg_file
, pc
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1)) {
671 reg_file
= std::move(register_file
);
672 /* remove killed operands from reg_file once again */
673 if (!is_phi(instr
)) {
674 for (const Operand
& op
: instr
->operands
) {
675 if (op
.isTemp() && op
.isFirstKillBeforeDef())
679 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
680 Definition
& def
= instr
->definitions
[i
];
681 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
687 parallelcopies
.insert(parallelcopies
.end(), pc
.begin(), pc
.end());
689 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
690 reg_file
.clear(PhysReg
{best_pos
}, rc
);
692 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
694 /* remove killed operands from reg_file once again */
695 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
696 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
698 assert(!instr
->operands
[i
].isUndefined());
699 if (instr
->operands
[i
].isFirstKillBeforeDef())
700 reg_file
.clear(instr
->operands
[i
]);
702 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
703 Definition def
= instr
->definitions
[i
];
704 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
708 adjust_max_used_regs(ctx
, rc
, best_pos
);
709 return {PhysReg
{best_pos
}, true};
712 PhysReg
get_reg(ra_ctx
& ctx
,
713 RegisterFile
& reg_file
,
715 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
716 aco_ptr
<Instruction
>& instr
)
718 uint32_t size
= rc
.size();
721 if (rc
.type() == RegType::vgpr
) {
723 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
726 ub
= ctx
.program
->max_reg_demand
.sgpr
;
733 std::pair
<PhysReg
, bool> res
= {{}, false};
734 /* try to find space without live-range splits */
735 if (rc
.type() == RegType::vgpr
&& (size
== 4 || size
== 8))
736 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, 4, rc
);
738 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
742 /* try to find space with live-range splits */
743 res
= get_reg_impl(ctx
, reg_file
, parallelcopies
, lb
, ub
, size
, stride
, rc
, instr
);
748 /* try using more registers */
750 /* We should only fail here because keeping under the limit would require
752 assert(reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
) >= size
);
754 uint16_t max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
755 uint16_t max_addressible_vgpr
= ctx
.program
->vgpr_limit
;
756 if (rc
.type() == RegType::vgpr
&& ctx
.program
->max_reg_demand
.vgpr
< max_addressible_vgpr
) {
757 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
+ 1, ctx
.program
->max_reg_demand
.sgpr
));
758 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
759 } else if (rc
.type() == RegType::sgpr
&& ctx
.program
->max_reg_demand
.sgpr
< max_addressible_sgpr
) {
760 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
, ctx
.program
->max_reg_demand
.sgpr
+ 1));
761 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
764 //FIXME: if nothing helps, shift-rotate the registers to make space
766 unreachable("did not find a register");
770 std::pair
<PhysReg
, bool> get_reg_vec(ra_ctx
& ctx
,
771 RegisterFile
& reg_file
,
774 uint32_t size
= rc
.size();
777 if (rc
.type() == RegType::vgpr
) {
779 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
782 ub
= ctx
.program
->max_reg_demand
.sgpr
;
788 return get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
792 PhysReg
get_reg_create_vector(ra_ctx
& ctx
,
793 RegisterFile
& reg_file
,
795 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
796 aco_ptr
<Instruction
>& instr
)
798 /* create_vector instructions have different costs w.r.t. register coalescing */
799 uint32_t size
= rc
.size();
802 if (rc
.type() == RegType::vgpr
) {
804 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
807 ub
= ctx
.program
->max_reg_demand
.sgpr
;
814 unsigned best_pos
= -1;
815 unsigned num_moves
= 0xFF;
816 bool best_war_hint
= true;
818 /* test for each operand which definition placement causes the least shuffle instructions */
819 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
820 // TODO: think about, if we can alias live operands on the same register
821 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isKillBeforeDef() || instr
->operands
[i
].getTemp().type() != rc
.type())
824 if (offset
> instr
->operands
[i
].physReg())
827 unsigned reg_lo
= instr
->operands
[i
].physReg() - offset
;
828 unsigned reg_hi
= reg_lo
+ size
- 1;
831 /* no need to check multiple times */
832 if (reg_lo
== best_pos
)
836 // TODO: this can be improved */
837 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
% stride
!= 0)
839 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
841 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
844 /* count variables to be moved and check war_hint */
845 bool war_hint
= false;
846 bool linear_vgpr
= false;
847 for (unsigned j
= reg_lo
; j
<= reg_hi
&& !linear_vgpr
; j
++) {
848 if (reg_file
[j
] != 0) {
850 /* we cannot split live ranges of linear vgprs */
851 if (ctx
.assignments
[reg_file
[j
]].second
& (1 << 6))
854 war_hint
|= ctx
.war_hint
[j
];
856 if (linear_vgpr
|| (war_hint
&& !best_war_hint
))
859 /* count operands in wrong positions */
860 for (unsigned j
= 0, offset
= 0; j
< instr
->operands
.size(); offset
+= instr
->operands
[j
].size(), j
++) {
862 !instr
->operands
[j
].isTemp() ||
863 instr
->operands
[j
].getTemp().type() != rc
.type())
865 if (instr
->operands
[j
].physReg() != reg_lo
+ offset
)
866 k
+= instr
->operands
[j
].size();
868 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
869 if (k
> num_moves
|| (!aligned
&& k
== num_moves
))
874 best_war_hint
= war_hint
;
877 if (num_moves
>= size
)
878 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
880 /* collect variables to be moved */
881 std::set
<std::pair
<unsigned, unsigned>> vars
;
882 for (unsigned i
= best_pos
; i
< best_pos
+ size
; i
++) {
883 if (reg_file
[i
] != 0)
884 vars
.emplace(ctx
.assignments
[reg_file
[i
]].second
.size(), reg_file
[i
]);
888 /* move killed operands which aren't yet at the correct position */
889 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
890 if (instr
->operands
[i
].isTemp() &&
891 instr
->operands
[i
].isFirstKillBeforeDef() &&
892 instr
->operands
[i
].getTemp().type() == rc
.type() &&
893 instr
->operands
[i
].physReg() != best_pos
+ offset
)
894 vars
.emplace(instr
->operands
[i
].size(), instr
->operands
[i
].tempId());
897 ASSERTED
bool success
= false;
898 success
= get_regs_for_copies(ctx
, reg_file
, parallelcopies
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1);
901 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
902 adjust_max_used_regs(ctx
, rc
, best_pos
);
903 return PhysReg
{best_pos
};
906 bool get_reg_specified(ra_ctx
& ctx
,
907 RegisterFile
& reg_file
,
909 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
910 aco_ptr
<Instruction
>& instr
,
913 uint32_t size
= rc
.size();
917 if (rc
.type() == RegType::vgpr
) {
919 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
925 if (reg
% stride
!= 0)
928 ub
= ctx
.program
->max_reg_demand
.sgpr
;
931 uint32_t reg_lo
= reg
.reg();
932 uint32_t reg_hi
= reg
+ (size
- 1);
934 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
> reg_hi
)
937 if (reg_file
.test(reg
, rc
.bytes()))
940 adjust_max_used_regs(ctx
, rc
, reg_lo
);
944 void handle_pseudo(ra_ctx
& ctx
,
945 const RegisterFile
& reg_file
,
948 if (instr
->format
!= Format::PSEUDO
)
951 /* all instructions which use handle_operands() need this information */
952 switch (instr
->opcode
) {
953 case aco_opcode::p_extract_vector
:
954 case aco_opcode::p_create_vector
:
955 case aco_opcode::p_split_vector
:
956 case aco_opcode::p_parallelcopy
:
957 case aco_opcode::p_wqm
:
963 /* if all definitions are vgpr, no need to care for SCC */
964 bool writes_sgpr
= false;
965 for (Definition
& def
: instr
->definitions
) {
966 if (def
.getTemp().type() == RegType::sgpr
) {
971 /* if all operands are constant, no need to care either */
972 bool reads_sgpr
= false;
973 for (Operand
& op
: instr
->operands
) {
974 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
979 if (!(writes_sgpr
&& reads_sgpr
))
982 Pseudo_instruction
*pi
= (Pseudo_instruction
*)instr
;
983 if (reg_file
[scc
.reg()]) {
984 pi
->tmp_in_scc
= true;
986 int reg
= ctx
.max_used_sgpr
;
987 for (; reg
>= 0 && reg_file
[reg
]; reg
--)
990 reg
= ctx
.max_used_sgpr
+ 1;
991 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& reg_file
[reg
]; reg
++)
993 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
996 adjust_max_used_regs(ctx
, s1
, reg
);
997 pi
->scratch_sgpr
= PhysReg
{(unsigned)reg
};
999 pi
->tmp_in_scc
= false;
1003 bool operand_can_use_reg(aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
)
1005 switch (instr
->format
) {
1007 return reg
!= scc
&&
1009 (reg
!= m0
|| idx
== 1 || idx
== 3) && /* offset can be m0 */
1010 (reg
!= vcc
|| (instr
->definitions
.empty() && idx
== 2)); /* sdata can be vcc */
1012 // TODO: there are more instructions with restrictions on registers
1017 } /* end namespace */
1020 void register_allocation(Program
*program
, std::vector
<std::set
<Temp
>> live_out_per_block
)
1022 ra_ctx
ctx(program
);
1024 std::vector
<std::unordered_map
<unsigned, Temp
>> renames(program
->blocks
.size());
1029 std::set
<Instruction
*> uses
;
1032 bool filled
[program
->blocks
.size()];
1033 bool sealed
[program
->blocks
.size()];
1034 memset(filled
, 0, sizeof filled
);
1035 memset(sealed
, 0, sizeof sealed
);
1036 std::vector
<std::vector
<Instruction
*>> incomplete_phis(program
->blocks
.size());
1037 std::map
<unsigned, phi_info
> phi_map
;
1038 std::map
<unsigned, unsigned> affinities
;
1039 std::function
<Temp(Temp
,unsigned)> read_variable
;
1040 std::function
<Temp(Temp
,Block
*)> handle_live_in
;
1041 std::function
<Temp(std::map
<unsigned, phi_info
>::iterator
)> try_remove_trivial_phi
;
1043 read_variable
= [&](Temp val
, unsigned block_idx
) -> Temp
{
1044 std::unordered_map
<unsigned, Temp
>::iterator it
= renames
[block_idx
].find(val
.id());
1045 assert(it
!= renames
[block_idx
].end());
1049 handle_live_in
= [&](Temp val
, Block
*block
) -> Temp
{
1050 std::vector
<unsigned>& preds
= val
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
1051 if (preds
.size() == 0 || val
.regClass() == val
.regClass().as_linear()) {
1052 renames
[block
->index
][val
.id()] = val
;
1055 assert(preds
.size() > 0);
1058 if (!sealed
[block
->index
]) {
1059 /* consider rename from already processed predecessor */
1060 Temp tmp
= read_variable(val
, preds
[0]);
1062 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1063 new_val
= Temp
{program
->allocateId(), val
.regClass()};
1064 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1065 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1066 phi
->definitions
[0] = Definition(new_val
);
1067 for (unsigned i
= 0; i
< preds
.size(); i
++)
1068 phi
->operands
[i
] = Operand(val
);
1069 if (tmp
.regClass() == new_val
.regClass())
1070 affinities
[new_val
.id()] = tmp
.id();
1072 phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1073 incomplete_phis
[block
->index
].emplace_back(phi
.get());
1074 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
1076 } else if (preds
.size() == 1) {
1077 /* if the block has only one predecessor, just look there for the name */
1078 new_val
= read_variable(val
, preds
[0]);
1080 /* there are multiple predecessors and the block is sealed */
1081 Temp ops
[preds
.size()];
1083 /* we start assuming that the name is the same from all predecessors */
1084 renames
[block
->index
][val
.id()] = val
;
1085 bool needs_phi
= false;
1087 /* get the rename from each predecessor and check if they are the same */
1088 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1089 ops
[i
] = read_variable(val
, preds
[i
]);
1093 needs_phi
|= !(new_val
== ops
[i
]);
1097 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1098 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1099 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1100 new_val
= Temp
{program
->allocateId(), val
.regClass()};
1101 phi
->definitions
[0] = Definition(new_val
);
1102 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1103 phi
->operands
[i
] = Operand(ops
[i
]);
1104 phi
->operands
[i
].setFixed(ctx
.assignments
[ops
[i
].id()].first
);
1105 if (ops
[i
].regClass() == new_val
.regClass())
1106 affinities
[new_val
.id()] = ops
[i
].id();
1108 phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1109 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
1113 renames
[block
->index
][val
.id()] = new_val
;
1114 renames
[block
->index
][new_val
.id()] = new_val
;
1115 ctx
.orig_names
[new_val
.id()] = val
;
1119 try_remove_trivial_phi
= [&] (std::map
<unsigned, phi_info
>::iterator info
) -> Temp
{
1120 assert(info
->second
.block_idx
!= 0);
1121 Instruction
* phi
= info
->second
.phi
;
1124 Definition def
= phi
->definitions
[0];
1125 /* a phi node is trivial if all operands are the same as the definition of the phi */
1126 for (const Operand
& op
: phi
->operands
) {
1127 const Temp t
= op
.getTemp();
1128 if (t
== same
|| t
== def
.getTemp())
1130 if (!(same
== Temp()) || !(op
.physReg() == def
.physReg())) {
1131 /* phi is not trivial */
1132 return def
.getTemp();
1136 assert(!(same
== Temp() || same
== def
.getTemp()));
1138 /* reroute all uses to same and remove phi */
1139 std::vector
<std::map
<unsigned, phi_info
>::iterator
> phi_users
;
1140 std::map
<unsigned, phi_info
>::iterator same_phi_info
= phi_map
.find(same
.id());
1141 for (Instruction
* instr
: info
->second
.uses
) {
1142 assert(phi
!= instr
);
1143 /* recursively try to remove trivial phis */
1144 if (is_phi(instr
)) {
1145 /* ignore if the phi was already flagged trivial */
1146 if (instr
->definitions
.empty())
1149 std::map
<unsigned, phi_info
>::iterator it
= phi_map
.find(instr
->definitions
[0].tempId());
1150 if (it
!= phi_map
.end() && it
!= info
)
1151 phi_users
.emplace_back(it
);
1153 for (Operand
& op
: instr
->operands
) {
1154 if (op
.isTemp() && op
.tempId() == def
.tempId()) {
1156 if (same_phi_info
!= phi_map
.end())
1157 same_phi_info
->second
.uses
.emplace(instr
);
1162 auto it
= ctx
.orig_names
.find(same
.id());
1163 unsigned orig_var
= it
!= ctx
.orig_names
.end() ? it
->second
.id() : same
.id();
1164 for (unsigned i
= 0; i
< program
->blocks
.size(); i
++) {
1165 auto it
= renames
[i
].find(orig_var
);
1166 if (it
!= renames
[i
].end() && it
->second
== def
.getTemp())
1167 renames
[i
][orig_var
] = same
;
1170 unsigned block_idx
= info
->second
.block_idx
;
1171 phi
->definitions
.clear(); /* this indicates that the phi can be removed */
1172 phi_map
.erase(info
);
1173 for (auto it
: phi_users
) {
1174 if (sealed
[it
->second
.block_idx
])
1175 try_remove_trivial_phi(it
);
1178 /* due to the removal of other phis, the name might have changed once again! */
1179 return renames
[block_idx
][orig_var
];
1182 std::map
<unsigned, Instruction
*> vectors
;
1183 std::vector
<std::vector
<Temp
>> phi_ressources
;
1184 std::map
<unsigned, unsigned> temp_to_phi_ressources
;
1186 for (std::vector
<Block
>::reverse_iterator it
= program
->blocks
.rbegin(); it
!= program
->blocks
.rend(); it
++) {
1189 /* first, compute the death points of all live vars within the block */
1190 std::set
<Temp
>& live
= live_out_per_block
[block
.index
];
1192 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator rit
;
1193 for (rit
= block
.instructions
.rbegin(); rit
!= block
.instructions
.rend(); ++rit
) {
1194 aco_ptr
<Instruction
>& instr
= *rit
;
1195 if (is_phi(instr
)) {
1196 live
.erase(instr
->definitions
[0].getTemp());
1197 if (instr
->definitions
[0].isKill() || instr
->definitions
[0].isFixed())
1199 /* collect information about affinity-related temporaries */
1200 std::vector
<Temp
> affinity_related
;
1201 /* affinity_related[0] is the last seen affinity-related temp */
1202 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1203 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1204 for (const Operand
& op
: instr
->operands
) {
1205 if (op
.isTemp() && op
.regClass() == instr
->definitions
[0].regClass()) {
1206 affinity_related
.emplace_back(op
.getTemp());
1207 temp_to_phi_ressources
[op
.tempId()] = phi_ressources
.size();
1210 phi_ressources
.emplace_back(std::move(affinity_related
));
1214 /* add vector affinities */
1215 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1216 for (const Operand
& op
: instr
->operands
) {
1217 if (op
.isTemp() && op
.getTemp().type() == instr
->definitions
[0].getTemp().type())
1218 vectors
[op
.tempId()] = instr
.get();
1222 /* add operands to live variables */
1223 for (const Operand
& op
: instr
->operands
) {
1225 live
.emplace(op
.getTemp());
1228 /* erase definitions from live */
1229 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1230 const Definition
& def
= instr
->definitions
[i
];
1233 live
.erase(def
.getTemp());
1234 /* mark last-seen phi operand */
1235 std::map
<unsigned, unsigned>::iterator it
= temp_to_phi_ressources
.find(def
.tempId());
1236 if (it
!= temp_to_phi_ressources
.end() && def
.regClass() == phi_ressources
[it
->second
][0].regClass()) {
1237 phi_ressources
[it
->second
][0] = def
.getTemp();
1238 /* try to coalesce phi affinities with parallelcopies */
1239 if (!def
.isFixed() && instr
->opcode
== aco_opcode::p_parallelcopy
) {
1240 Operand op
= instr
->operands
[i
];
1241 if (op
.isTemp() && op
.isFirstKillBeforeDef() && def
.regClass() == op
.regClass()) {
1242 phi_ressources
[it
->second
].emplace_back(op
.getTemp());
1243 temp_to_phi_ressources
[op
.tempId()] = it
->second
;
1250 /* create affinities */
1251 for (std::vector
<Temp
>& vec
: phi_ressources
) {
1252 assert(vec
.size() > 1);
1253 for (unsigned i
= 1; i
< vec
.size(); i
++)
1254 if (vec
[i
].id() != vec
[0].id())
1255 affinities
[vec
[i
].id()] = vec
[0].id();
1258 /* state of register file after phis */
1259 std::vector
<std::bitset
<128>> sgpr_live_in(program
->blocks
.size());
1261 for (Block
& block
: program
->blocks
) {
1262 std::set
<Temp
>& live
= live_out_per_block
[block
.index
];
1263 /* initialize register file */
1264 assert(block
.index
!= 0 || live
.empty());
1265 RegisterFile register_file
;
1266 ctx
.war_hint
.reset();
1268 for (Temp t
: live
) {
1269 Temp renamed
= handle_live_in(t
, &block
);
1270 if (ctx
.assignments
.find(renamed
.id()) != ctx
.assignments
.end())
1271 register_file
.fill(ctx
.assignments
[renamed
.id()].first
, t
.size(), renamed
.id());
1274 std::vector
<aco_ptr
<Instruction
>> instructions
;
1275 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1277 /* this is a slight adjustment from the paper as we already have phi nodes:
1278 * We consider them incomplete phis and only handle the definition. */
1280 /* handle fixed phi definitions */
1281 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1282 aco_ptr
<Instruction
>& phi
= *it
;
1285 Definition
& definition
= phi
->definitions
[0];
1286 if (!definition
.isFixed())
1289 /* check if a dead exec mask phi is needed */
1290 if (definition
.isKill()) {
1291 for (Operand
& op
: phi
->operands
) {
1292 assert(op
.isTemp());
1293 if (ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end() ||
1294 ctx
.assignments
[op
.tempId()].first
!= exec
) {
1295 definition
.setKill(false);
1301 if (definition
.isKill())
1304 assert(definition
.physReg() == exec
);
1305 assert(!register_file
.test(definition
.physReg(), definition
.bytes()));
1306 register_file
.fill(definition
);
1307 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1310 /* look up the affinities */
1311 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1312 aco_ptr
<Instruction
>& phi
= *it
;
1315 Definition
& definition
= phi
->definitions
[0];
1316 if (definition
.isKill() || definition
.isFixed())
1319 if (affinities
.find(definition
.tempId()) != affinities
.end() &&
1320 ctx
.assignments
.find(affinities
[definition
.tempId()]) != ctx
.assignments
.end()) {
1321 assert(ctx
.assignments
[affinities
[definition
.tempId()]].second
== definition
.regClass());
1322 PhysReg reg
= ctx
.assignments
[affinities
[definition
.tempId()]].first
;
1323 bool try_use_special_reg
= reg
== scc
|| reg
== exec
;
1324 if (try_use_special_reg
) {
1325 for (const Operand
& op
: phi
->operands
) {
1327 ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end() ||
1328 !(ctx
.assignments
[op
.tempId()].first
== reg
)) {
1329 try_use_special_reg
= false;
1333 if (!try_use_special_reg
)
1336 /* only assign if register is still free */
1337 if (!register_file
.test(reg
, definition
.bytes())) {
1338 definition
.setFixed(reg
);
1339 register_file
.fill(definition
);
1340 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1345 /* find registers for phis without affinity or where the register was blocked */
1346 for (it
= block
.instructions
.begin();it
!= block
.instructions
.end(); ++it
) {
1347 aco_ptr
<Instruction
>& phi
= *it
;
1351 Definition
& definition
= phi
->definitions
[0];
1352 if (definition
.isKill())
1355 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1357 if (!definition
.isFixed()) {
1358 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1359 /* try to find a register that is used by at least one operand */
1360 for (const Operand
& op
: phi
->operands
) {
1362 ctx
.assignments
.find(op
.tempId()) == ctx
.assignments
.end())
1364 PhysReg reg
= ctx
.assignments
[op
.tempId()].first
;
1365 /* we tried this already on the previous loop */
1366 if (reg
== scc
|| reg
== exec
)
1368 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
, reg
)) {
1369 definition
.setFixed(reg
);
1373 if (!definition
.isFixed())
1374 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
));
1376 /* process parallelcopy */
1377 for (std::pair
<Operand
, Definition
> pc
: parallelcopy
) {
1378 /* see if it's a copy from a different phi */
1379 //TODO: prefer moving some previous phis over live-ins
1380 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1381 Instruction
*prev_phi
= NULL
;
1382 std::vector
<aco_ptr
<Instruction
>>::iterator phi_it
;
1383 for (phi_it
= instructions
.begin(); phi_it
!= instructions
.end(); ++phi_it
) {
1384 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1385 prev_phi
= phi_it
->get();
1388 while (!prev_phi
&& is_phi(*++phi_it
)) {
1389 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1390 prev_phi
= phi_it
->get();
1393 /* if so, just update that phi's register */
1394 prev_phi
->definitions
[0].setFixed(pc
.second
.physReg());
1395 ctx
.assignments
[prev_phi
->definitions
[0].tempId()] = {pc
.second
.physReg(), pc
.second
.regClass()};
1396 register_file
.fill(pc
.second
.physReg(), pc
.second
.size(), prev_phi
->definitions
[0].tempId());
1401 std::map
<unsigned, Temp
>::iterator orig_it
= ctx
.orig_names
.find(pc
.first
.tempId());
1402 Temp orig
= pc
.first
.getTemp();
1403 if (orig_it
!= ctx
.orig_names
.end())
1404 orig
= orig_it
->second
;
1406 ctx
.orig_names
[pc
.second
.tempId()] = orig
;
1407 renames
[block
.index
][orig
.id()] = pc
.second
.getTemp();
1408 renames
[block
.index
][pc
.second
.tempId()] = pc
.second
.getTemp();
1410 /* otherwise, this is a live-in and we need to create a new phi
1411 * to move it in this block's predecessors */
1412 aco_opcode opcode
= pc
.first
.getTemp().is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1413 std::vector
<unsigned>& preds
= pc
.first
.getTemp().is_linear() ? block
.linear_preds
: block
.logical_preds
;
1414 aco_ptr
<Instruction
> new_phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1415 new_phi
->definitions
[0] = pc
.second
;
1416 for (unsigned i
= 0; i
< preds
.size(); i
++)
1417 new_phi
->operands
[i
] = Operand(pc
.first
);
1418 instructions
.emplace_back(std::move(new_phi
));
1421 register_file
.fill(definition
);
1422 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1424 live
.emplace(definition
.getTemp());
1426 /* update phi affinities */
1427 for (const Operand
& op
: phi
->operands
) {
1428 if (op
.isTemp() && op
.regClass() == phi
->definitions
[0].regClass())
1429 affinities
[op
.tempId()] = definition
.tempId();
1432 instructions
.emplace_back(std::move(*it
));
1435 /* fill in sgpr_live_in */
1436 for (unsigned i
= 0; i
<= ctx
.max_used_sgpr
; i
++)
1437 sgpr_live_in
[block
.index
][i
] = register_file
[i
];
1438 sgpr_live_in
[block
.index
][127] = register_file
[scc
.reg()];
1440 /* Handle all other instructions of the block */
1441 for (; it
!= block
.instructions
.end(); ++it
) {
1442 aco_ptr
<Instruction
>& instr
= *it
;
1444 /* parallelcopies from p_phi are inserted here which means
1445 * live ranges of killed operands end here as well */
1446 if (instr
->opcode
== aco_opcode::p_logical_end
) {
1447 /* no need to process this instruction any further */
1448 if (block
.logical_succs
.size() != 1) {
1449 instructions
.emplace_back(std::move(instr
));
1453 Block
& succ
= program
->blocks
[block
.logical_succs
[0]];
1455 for (; idx
< succ
.logical_preds
.size(); idx
++) {
1456 if (succ
.logical_preds
[idx
] == block
.index
)
1459 for (aco_ptr
<Instruction
>& phi
: succ
.instructions
) {
1460 if (phi
->opcode
== aco_opcode::p_phi
) {
1461 if (phi
->operands
[idx
].isTemp() &&
1462 phi
->operands
[idx
].getTemp().type() == RegType::sgpr
&&
1463 phi
->operands
[idx
].isFirstKillBeforeDef()) {
1464 Temp phi_op
= read_variable(phi
->operands
[idx
].getTemp(), block
.index
);
1465 PhysReg reg
= ctx
.assignments
[phi_op
.id()].first
;
1466 assert(register_file
[reg
] == phi_op
.id());
1467 register_file
[reg
] = 0;
1469 } else if (phi
->opcode
!= aco_opcode::p_linear_phi
) {
1473 instructions
.emplace_back(std::move(instr
));
1477 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1479 assert(!is_phi(instr
));
1481 /* handle operands */
1482 for (unsigned i
= 0; i
< instr
->operands
.size(); ++i
) {
1483 auto& operand
= instr
->operands
[i
];
1484 if (!operand
.isTemp())
1487 /* rename operands */
1488 operand
.setTemp(read_variable(operand
.getTemp(), block
.index
));
1490 /* check if the operand is fixed */
1491 if (operand
.isFixed()) {
1493 if (operand
.physReg() == ctx
.assignments
[operand
.tempId()].first
) {
1494 /* we are fine: the operand is already assigned the correct reg */
1497 /* check if target reg is blocked, and move away the blocking var */
1498 if (register_file
[operand
.physReg().reg()]) {
1499 uint32_t blocking_id
= register_file
[operand
.physReg().reg()];
1500 RegClass rc
= ctx
.assignments
[blocking_id
].second
;
1501 Operand pc_op
= Operand(Temp
{blocking_id
, rc
});
1502 pc_op
.setFixed(operand
.physReg());
1503 Definition pc_def
= Definition(Temp
{program
->allocateId(), pc_op
.regClass()});
1505 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.regClass(), parallelcopy
, instr
);
1506 pc_def
.setFixed(reg
);
1507 ctx
.assignments
[pc_def
.tempId()] = {reg
, pc_def
.regClass()};
1508 register_file
.clear(pc_op
);
1509 register_file
.fill(pc_def
);
1510 parallelcopy
.emplace_back(pc_op
, pc_def
);
1512 /* handle renames of previous operands */
1513 for (unsigned j
= 0; j
< i
; j
++) {
1514 Operand
& op
= instr
->operands
[j
];
1515 if (op
.isTemp() && op
.tempId() == blocking_id
) {
1516 op
.setTemp(pc_def
.getTemp());
1521 /* move operand to fixed reg and create parallelcopy pair */
1522 Operand pc_op
= operand
;
1523 Temp tmp
= Temp
{program
->allocateId(), operand
.regClass()};
1524 Definition pc_def
= Definition(tmp
);
1525 pc_def
.setFixed(operand
.physReg());
1526 pc_op
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1527 operand
.setTemp(tmp
);
1528 ctx
.assignments
[tmp
.id()] = {pc_def
.physReg(), pc_def
.regClass()};
1529 operand
.setFixed(pc_def
.physReg());
1530 register_file
.clear(pc_op
);
1531 register_file
.fill(pc_def
);
1532 parallelcopy
.emplace_back(pc_op
, pc_def
);
1535 assert(ctx
.assignments
.find(operand
.tempId()) != ctx
.assignments
.end());
1536 PhysReg reg
= ctx
.assignments
[operand
.tempId()].first
;
1538 if (operand_can_use_reg(instr
, i
, reg
)) {
1539 operand
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1541 Operand pc_op
= operand
;
1542 pc_op
.setFixed(reg
);
1543 PhysReg new_reg
= get_reg(ctx
, register_file
, operand
.regClass(), parallelcopy
, instr
);
1544 Definition pc_def
= Definition(program
->allocateId(), new_reg
, pc_op
.regClass());
1545 ctx
.assignments
[pc_def
.tempId()] = {new_reg
, pc_def
.regClass()};
1546 register_file
.clear(pc_op
);
1547 register_file
.fill(pc_def
);
1548 parallelcopy
.emplace_back(pc_op
, pc_def
);
1549 operand
.setTemp(pc_def
.getTemp());
1550 operand
.setFixed(new_reg
);
1553 if (instr
->format
== Format::EXP
||
1554 (instr
->isVMEM() && i
== 3 && program
->chip_class
== GFX6
) ||
1555 (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
)) {
1556 for (unsigned j
= 0; j
< operand
.size(); j
++)
1557 ctx
.war_hint
.set(operand
.physReg().reg() + j
);
1560 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.getTemp().id());
1561 if (phi
!= phi_map
.end())
1562 phi
->second
.uses
.emplace(instr
.get());
1565 /* remove dead vars from register file */
1566 for (const Operand
& op
: instr
->operands
) {
1567 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1568 register_file
.clear(op
);
1571 /* try to optimize v_mad_f32 -> v_mac_f32 */
1572 if (instr
->opcode
== aco_opcode::v_mad_f32
&&
1573 instr
->operands
[2].isTemp() &&
1574 instr
->operands
[2].isKillBeforeDef() &&
1575 instr
->operands
[2].getTemp().type() == RegType::vgpr
&&
1576 instr
->operands
[1].isTemp() &&
1577 instr
->operands
[1].getTemp().type() == RegType::vgpr
) { /* TODO: swap src0 and src1 in this case */
1578 VOP3A_instruction
* vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
1579 bool can_use_mac
= !(vop3
->abs
[0] || vop3
->abs
[1] || vop3
->abs
[2] ||
1580 vop3
->neg
[0] || vop3
->neg
[1] || vop3
->neg
[2] ||
1581 vop3
->clamp
|| vop3
->omod
|| vop3
->opsel
);
1583 instr
->format
= Format::VOP2
;
1584 instr
->opcode
= aco_opcode::v_mac_f32
;
1588 /* handle definitions which must have the same register as an operand */
1589 if (instr
->opcode
== aco_opcode::v_interp_p2_f32
||
1590 instr
->opcode
== aco_opcode::v_mac_f32
||
1591 instr
->opcode
== aco_opcode::v_writelane_b32
||
1592 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) {
1593 instr
->definitions
[0].setFixed(instr
->operands
[2].physReg());
1594 } else if (instr
->opcode
== aco_opcode::s_addk_i32
||
1595 instr
->opcode
== aco_opcode::s_mulk_i32
) {
1596 instr
->definitions
[0].setFixed(instr
->operands
[0].physReg());
1597 } else if (instr
->format
== Format::MUBUF
&&
1598 instr
->definitions
.size() == 1 &&
1599 instr
->operands
.size() == 4) {
1600 instr
->definitions
[0].setFixed(instr
->operands
[3].physReg());
1601 } else if (instr
->format
== Format::MIMG
&&
1602 instr
->definitions
.size() == 1 &&
1603 instr
->operands
[1].regClass().type() == RegType::vgpr
) {
1604 instr
->definitions
[0].setFixed(instr
->operands
[1].physReg());
1607 ctx
.defs_done
.reset();
1609 /* handle fixed definitions first */
1610 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1611 auto& definition
= instr
->definitions
[i
];
1612 if (!definition
.isFixed())
1615 adjust_max_used_regs(ctx
, definition
.regClass(), definition
.physReg());
1616 /* check if the target register is blocked */
1617 if (register_file
[definition
.physReg().reg()] != 0) {
1618 /* create parallelcopy pair to move blocking var */
1619 Temp tmp
= {register_file
[definition
.physReg()], ctx
.assignments
[register_file
[definition
.physReg()]].second
};
1620 Operand pc_op
= Operand(tmp
);
1621 pc_op
.setFixed(ctx
.assignments
[register_file
[definition
.physReg().reg()]].first
);
1622 RegClass rc
= pc_op
.regClass();
1623 tmp
= Temp
{program
->allocateId(), rc
};
1624 Definition pc_def
= Definition(tmp
);
1626 /* re-enable the killed operands, so that we don't move the blocking var there */
1627 for (const Operand
& op
: instr
->operands
) {
1628 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1629 register_file
.fill(op
.physReg(), op
.size(), 0xFFFF);
1632 /* find a new register for the blocking variable */
1633 PhysReg reg
= get_reg(ctx
, register_file
, rc
, parallelcopy
, instr
);
1634 /* once again, disable killed operands */
1635 for (const Operand
& op
: instr
->operands
) {
1636 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1637 register_file
.clear(op
);
1639 for (unsigned k
= 0; k
< i
; k
++) {
1640 if (instr
->definitions
[k
].isTemp() && ctx
.defs_done
.test(k
) && !instr
->definitions
[k
].isKill())
1641 register_file
.fill(instr
->definitions
[k
]);
1643 pc_def
.setFixed(reg
);
1645 /* finish assignment of parallelcopy */
1646 ctx
.assignments
[pc_def
.tempId()] = {reg
, pc_def
.regClass()};
1647 parallelcopy
.emplace_back(pc_op
, pc_def
);
1649 /* add changes to reg_file */
1650 register_file
.clear(pc_op
);
1651 register_file
.fill(pc_def
);
1653 ctx
.defs_done
.set(i
);
1655 if (!definition
.isTemp())
1658 /* set live if it has a kill point */
1659 if (!definition
.isKill())
1660 live
.emplace(definition
.getTemp());
1662 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1663 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1664 register_file
.fill(definition
);
1667 /* handle all other definitions */
1668 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1669 auto& definition
= instr
->definitions
[i
];
1671 if (definition
.isFixed() || !definition
.isTemp())
1675 if (definition
.hasHint() && register_file
[definition
.physReg().reg()] == 0)
1676 definition
.setFixed(definition
.physReg());
1677 else if (instr
->opcode
== aco_opcode::p_split_vector
) {
1678 PhysReg reg
= PhysReg
{instr
->operands
[0].physReg() + i
* definition
.size()};
1679 if (!get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1680 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1681 definition
.setFixed(reg
);
1682 } else if (instr
->opcode
== aco_opcode::p_wqm
) {
1684 if (instr
->operands
[0].isKillBeforeDef() && instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1685 reg
= instr
->operands
[0].physReg();
1686 assert(register_file
[reg
.reg()] == 0);
1688 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1690 definition
.setFixed(reg
);
1691 } else if (instr
->opcode
== aco_opcode::p_extract_vector
) {
1693 if (instr
->operands
[0].isKillBeforeDef() &&
1694 instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1695 reg
= instr
->operands
[0].physReg();
1696 reg
= PhysReg(reg
.reg() + definition
.size() * instr
->operands
[1].constantValue());
1697 assert(register_file
[reg
.reg()] == 0);
1699 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1701 definition
.setFixed(reg
);
1702 } else if (instr
->opcode
== aco_opcode::p_create_vector
) {
1703 PhysReg reg
= get_reg_create_vector(ctx
, register_file
, definition
.regClass(),
1704 parallelcopy
, instr
);
1705 definition
.setFixed(reg
);
1706 } else if (affinities
.find(definition
.tempId()) != affinities
.end() &&
1707 ctx
.assignments
.find(affinities
[definition
.tempId()]) != ctx
.assignments
.end()) {
1708 PhysReg reg
= ctx
.assignments
[affinities
[definition
.tempId()]].first
;
1709 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1710 definition
.setFixed(reg
);
1712 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1714 } else if (vectors
.find(definition
.tempId()) != vectors
.end()) {
1715 Instruction
* vec
= vectors
[definition
.tempId()];
1716 unsigned offset
= 0;
1717 for (const Operand
& op
: vec
->operands
) {
1718 if (op
.isTemp() && op
.tempId() == definition
.tempId())
1721 offset
+= op
.size();
1724 for (const Operand
& op
: vec
->operands
) {
1726 op
.tempId() != definition
.tempId() &&
1727 op
.getTemp().type() == definition
.getTemp().type() &&
1728 ctx
.assignments
.find(op
.tempId()) != ctx
.assignments
.end()) {
1729 PhysReg reg
= ctx
.assignments
[op
.tempId()].first
;
1730 reg
= PhysReg(reg
.reg() - k
+ offset
);
1731 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
)) {
1732 definition
.setFixed(reg
);
1738 if (!definition
.isFixed()) {
1739 std::pair
<PhysReg
, bool> res
= get_reg_vec(ctx
, register_file
, vec
->definitions
[0].regClass());
1740 PhysReg reg
= res
.first
;
1742 reg
= PhysReg(reg
.reg() + offset
);
1744 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1746 definition
.setFixed(reg
);
1749 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1751 assert(definition
.isFixed() && ((definition
.getTemp().type() == RegType::vgpr
&& definition
.physReg() >= 256) ||
1752 (definition
.getTemp().type() != RegType::vgpr
&& definition
.physReg() < 256)));
1753 ctx
.defs_done
.set(i
);
1755 /* set live if it has a kill point */
1756 if (!definition
.isKill())
1757 live
.emplace(definition
.getTemp());
1759 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1760 renames
[block
.index
][definition
.tempId()] = definition
.getTemp();
1761 register_file
.fill(definition
);
1764 handle_pseudo(ctx
, register_file
, instr
.get());
1766 /* kill definitions and late-kill operands */
1767 for (const Definition
& def
: instr
->definitions
) {
1768 if (def
.isTemp() && def
.isKill())
1769 register_file
.clear(def
);
1771 for (const Operand
& op
: instr
->operands
) {
1772 if (op
.isTemp() && op
.isFirstKill() && op
.isLateKill())
1773 register_file
.clear(op
);
1776 /* emit parallelcopy */
1777 if (!parallelcopy
.empty()) {
1778 aco_ptr
<Pseudo_instruction
> pc
;
1779 pc
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, parallelcopy
.size(), parallelcopy
.size()));
1780 bool temp_in_scc
= register_file
[scc
.reg()];
1781 bool sgpr_operands_alias_defs
= false;
1782 uint64_t sgpr_operands
[4] = {0, 0, 0, 0};
1783 for (unsigned i
= 0; i
< parallelcopy
.size(); i
++) {
1784 if (temp_in_scc
&& parallelcopy
[i
].first
.isTemp() && parallelcopy
[i
].first
.getTemp().type() == RegType::sgpr
) {
1785 if (!sgpr_operands_alias_defs
) {
1786 unsigned reg
= parallelcopy
[i
].first
.physReg().reg();
1787 unsigned size
= parallelcopy
[i
].first
.getTemp().size();
1788 sgpr_operands
[reg
/ 64u] |= ((1u << size
) - 1) << (reg
% 64u);
1790 reg
= parallelcopy
[i
].second
.physReg().reg();
1791 size
= parallelcopy
[i
].second
.getTemp().size();
1792 if (sgpr_operands
[reg
/ 64u] & ((1u << size
) - 1) << (reg
% 64u))
1793 sgpr_operands_alias_defs
= true;
1797 pc
->operands
[i
] = parallelcopy
[i
].first
;
1798 pc
->definitions
[i
] = parallelcopy
[i
].second
;
1799 assert(pc
->operands
[i
].size() == pc
->definitions
[i
].size());
1801 /* it might happen that the operand is already renamed. we have to restore the original name. */
1802 std::map
<unsigned, Temp
>::iterator it
= ctx
.orig_names
.find(pc
->operands
[i
].tempId());
1803 Temp orig
= it
!= ctx
.orig_names
.end() ? it
->second
: pc
->operands
[i
].getTemp();
1804 ctx
.orig_names
[pc
->definitions
[i
].tempId()] = orig
;
1805 renames
[block
.index
][orig
.id()] = pc
->definitions
[i
].getTemp();
1806 renames
[block
.index
][pc
->definitions
[i
].tempId()] = pc
->definitions
[i
].getTemp();
1808 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(pc
->operands
[i
].tempId());
1809 if (phi
!= phi_map
.end())
1810 phi
->second
.uses
.emplace(pc
.get());
1813 if (temp_in_scc
&& sgpr_operands_alias_defs
) {
1814 /* disable definitions and re-enable operands */
1815 for (const Definition
& def
: instr
->definitions
) {
1816 if (def
.isTemp() && !def
.isKill())
1817 register_file
.clear(def
);
1819 for (const Operand
& op
: instr
->operands
) {
1820 if (op
.isTemp() && op
.isFirstKill())
1821 register_file
.fill(op
.physReg(), op
.size(), 0xFFFF);
1824 handle_pseudo(ctx
, register_file
, pc
.get());
1826 /* re-enable live vars */
1827 for (const Operand
& op
: instr
->operands
) {
1828 if (op
.isTemp() && op
.isFirstKill())
1829 register_file
.clear(op
);
1831 for (const Definition
& def
: instr
->definitions
) {
1832 if (def
.isTemp() && !def
.isKill())
1833 register_file
.fill(def
);
1836 pc
->tmp_in_scc
= false;
1839 instructions
.emplace_back(std::move(pc
));
1842 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1843 bool instr_needs_vop3
= !instr
->isVOP3() &&
1844 ((instr
->format
== Format::VOPC
&& !(instr
->definitions
[0].physReg() == vcc
)) ||
1845 (instr
->opcode
== aco_opcode::v_cndmask_b32
&& !(instr
->operands
[2].physReg() == vcc
)) ||
1846 ((instr
->opcode
== aco_opcode::v_add_co_u32
||
1847 instr
->opcode
== aco_opcode::v_addc_co_u32
||
1848 instr
->opcode
== aco_opcode::v_sub_co_u32
||
1849 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1850 instr
->opcode
== aco_opcode::v_subrev_co_u32
||
1851 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1852 !(instr
->definitions
[1].physReg() == vcc
)) ||
1853 ((instr
->opcode
== aco_opcode::v_addc_co_u32
||
1854 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1855 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1856 !(instr
->operands
[2].physReg() == vcc
)));
1857 if (instr_needs_vop3
) {
1859 /* if the first operand is a literal, we have to move it to a reg */
1860 if (instr
->operands
.size() && instr
->operands
[0].isLiteral() && program
->chip_class
< GFX10
) {
1861 bool can_sgpr
= true;
1862 /* check, if we have to move to vgpr */
1863 for (const Operand
& op
: instr
->operands
) {
1864 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
1869 aco_ptr
<Instruction
> mov
;
1871 mov
.reset(create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1));
1873 mov
.reset(create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1));
1874 mov
->operands
[0] = instr
->operands
[0];
1875 Temp tmp
= {program
->allocateId(), can_sgpr
? s1
: v1
};
1876 mov
->definitions
[0] = Definition(tmp
);
1877 /* disable definitions and re-enable operands */
1878 for (const Definition
& def
: instr
->definitions
)
1879 register_file
.clear(def
);
1880 for (const Operand
& op
: instr
->operands
) {
1881 if (op
.isTemp() && op
.isFirstKill())
1882 register_file
.fill(op
.physReg(), op
.size(), 0xFFFF);
1884 mov
->definitions
[0].setFixed(get_reg(ctx
, register_file
, tmp
.regClass(), parallelcopy
, mov
));
1885 instr
->operands
[0] = Operand(tmp
);
1886 instr
->operands
[0].setFixed(mov
->definitions
[0].physReg());
1887 instructions
.emplace_back(std::move(mov
));
1888 /* re-enable live vars */
1889 for (const Operand
& op
: instr
->operands
) {
1890 if (op
.isTemp() && op
.isFirstKill())
1891 register_file
.clear(op
);
1893 for (const Definition
& def
: instr
->definitions
) {
1894 if (def
.isTemp() && !def
.isKill())
1895 register_file
.fill(def
);
1899 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1900 aco_ptr
<Instruction
> tmp
= std::move(instr
);
1901 Format format
= asVOP3(tmp
->format
);
1902 instr
.reset(create_instruction
<VOP3A_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
1903 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1904 Operand
& operand
= tmp
->operands
[i
];
1905 instr
->operands
[i
] = operand
;
1906 /* keep phi_map up to date */
1907 if (operand
.isTemp()) {
1908 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.tempId());
1909 if (phi
!= phi_map
.end()) {
1910 phi
->second
.uses
.erase(tmp
.get());
1911 phi
->second
.uses
.emplace(instr
.get());
1915 std::copy(tmp
->definitions
.begin(), tmp
->definitions
.end(), instr
->definitions
.begin());
1917 instructions
.emplace_back(std::move(*it
));
1919 } /* end for Instr */
1921 block
.instructions
= std::move(instructions
);
1923 filled
[block
.index
] = true;
1924 for (unsigned succ_idx
: block
.linear_succs
) {
1925 Block
& succ
= program
->blocks
[succ_idx
];
1926 /* seal block if all predecessors are filled */
1927 bool all_filled
= true;
1928 for (unsigned pred_idx
: succ
.linear_preds
) {
1929 if (!filled
[pred_idx
]) {
1935 /* finish incomplete phis and check if they became trivial */
1936 for (Instruction
* phi
: incomplete_phis
[succ_idx
]) {
1937 std::vector
<unsigned> preds
= phi
->definitions
[0].getTemp().is_linear() ? succ
.linear_preds
: succ
.logical_preds
;
1938 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
1939 phi
->operands
[i
].setTemp(read_variable(phi
->operands
[i
].getTemp(), preds
[i
]));
1940 phi
->operands
[i
].setFixed(ctx
.assignments
[phi
->operands
[i
].tempId()].first
);
1942 try_remove_trivial_phi(phi_map
.find(phi
->definitions
[0].tempId()));
1944 /* complete the original phi nodes, but no need to check triviality */
1945 for (aco_ptr
<Instruction
>& instr
: succ
.instructions
) {
1948 std::vector
<unsigned> preds
= instr
->opcode
== aco_opcode::p_phi
? succ
.logical_preds
: succ
.linear_preds
;
1950 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1951 auto& operand
= instr
->operands
[i
];
1952 if (!operand
.isTemp())
1954 operand
.setTemp(read_variable(operand
.getTemp(), preds
[i
]));
1955 operand
.setFixed(ctx
.assignments
[operand
.tempId()].first
);
1956 std::map
<unsigned, phi_info
>::iterator phi
= phi_map
.find(operand
.getTemp().id());
1957 if (phi
!= phi_map
.end())
1958 phi
->second
.uses
.emplace(instr
.get());
1961 sealed
[succ_idx
] = true;
1966 /* remove trivial phis */
1967 for (Block
& block
: program
->blocks
) {
1968 auto end
= std::find_if(block
.instructions
.begin(), block
.instructions
.end(),
1969 [](aco_ptr
<Instruction
>& instr
) { return !is_phi(instr
);});
1970 auto middle
= std::remove_if(block
.instructions
.begin(), end
,
1971 [](const aco_ptr
<Instruction
>& instr
) { return instr
->definitions
.empty();});
1972 block
.instructions
.erase(middle
, end
);
1975 /* find scc spill registers which may be needed for parallelcopies created by phis */
1976 for (Block
& block
: program
->blocks
) {
1977 if (block
.linear_preds
.size() <= 1)
1980 std::bitset
<128> regs
= sgpr_live_in
[block
.index
];
1984 /* choose a register */
1986 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& regs
[reg
]; reg
++)
1988 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
1989 adjust_max_used_regs(ctx
, s1
, reg
);
1991 /* update predecessors */
1992 for (unsigned& pred_index
: block
.linear_preds
) {
1993 Block
& pred
= program
->blocks
[pred_index
];
1994 pred
.scc_live_out
= true;
1995 pred
.scratch_sgpr
= PhysReg
{(uint16_t)reg
};
1999 /* num_gpr = rnd_up(max_used_gpr + 1) */
2000 program
->config
->num_vgprs
= align(ctx
.max_used_vgpr
+ 1, 4);
2001 if (program
->family
== CHIP_TONGA
|| program
->family
== CHIP_ICELAND
) /* workaround hardware bug */
2002 program
->config
->num_sgprs
= get_sgpr_alloc(program
, program
->sgpr_limit
);
2004 program
->config
->num_sgprs
= align(ctx
.max_used_sgpr
+ 1 + get_extra_sgprs(program
), 8);