2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
32 #include <unordered_map>
36 #include "util/u_math.h"
45 assignment() = default;
46 assignment(PhysReg reg
, RegClass rc
) : reg(reg
), rc(rc
), assigned(-1) {}
52 std::set
<Instruction
*> uses
;
56 std::bitset
<512> war_hint
;
58 std::vector
<assignment
> assignments
;
59 std::vector
<std::unordered_map
<unsigned, Temp
>> renames
;
60 std::vector
<std::vector
<Instruction
*>> incomplete_phis
;
61 std::vector
<bool> filled
;
62 std::vector
<bool> sealed
;
63 std::unordered_map
<unsigned, Temp
> orig_names
;
64 std::unordered_map
<unsigned, phi_info
> phi_map
;
65 std::unordered_map
<unsigned, unsigned> affinities
;
66 std::unordered_map
<unsigned, Instruction
*> vectors
;
67 std::unordered_map
<unsigned, Instruction
*> split_vectors
;
68 aco_ptr
<Instruction
> pseudo_dummy
;
69 unsigned max_used_sgpr
= 0;
70 unsigned max_used_vgpr
= 0;
71 std::bitset
<64> defs_done
; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
73 ra_ctx(Program
* program
) : program(program
),
74 assignments(program
->peekAllocationId()),
75 renames(program
->blocks
.size()),
76 incomplete_phis(program
->blocks
.size()),
77 filled(program
->blocks
.size()),
78 sealed(program
->blocks
.size())
80 pseudo_dummy
.reset(create_instruction
<Instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, 0, 0));
84 bool instr_can_access_subdword(ra_ctx
& ctx
, aco_ptr
<Instruction
>& instr
)
86 if (ctx
.program
->chip_class
< GFX8
)
88 return instr
->isSDWA() || instr
->format
== Format::PSEUDO
;
98 DefInfo(ra_ctx
& ctx
, aco_ptr
<Instruction
>& instr
, RegClass rc
) : rc(rc
) {
102 if (rc
.type() == RegType::vgpr
) {
104 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
107 ub
= ctx
.program
->max_reg_demand
.sgpr
;
114 if (rc
.is_subdword()) {
115 /* stride in bytes */
116 if(!instr_can_access_subdword(ctx
, instr
))
118 else if (rc
.bytes() % 4 == 0)
120 else if (rc
.bytes() % 2 == 0)
128 RegisterFile() {regs
.fill(0);}
130 std::array
<uint32_t, 512> regs
;
131 std::map
<uint32_t, std::array
<uint32_t, 4>> subdword_regs
;
133 const uint32_t& operator [] (unsigned index
) const {
137 uint32_t& operator [] (unsigned index
) {
141 unsigned count_zero(PhysReg start
, unsigned size
) {
143 for (unsigned i
= 0; i
< size
; i
++)
144 res
+= !regs
[start
+ i
];
148 bool test(PhysReg start
, unsigned num_bytes
) {
149 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
150 if (regs
[i
] & 0x0FFFFFFF)
152 if (regs
[i
] == 0xF0000000) {
153 assert(subdword_regs
.find(i
) != subdword_regs
.end());
154 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++) {
155 if (subdword_regs
[i
][j
])
163 void block(PhysReg start
, RegClass rc
) {
164 if (rc
.is_subdword())
165 fill_subdword(start
, rc
.bytes(), 0xFFFFFFFF);
167 fill(start
, rc
.size(), 0xFFFFFFFF);
170 bool is_blocked(PhysReg start
) {
171 if (regs
[start
] == 0xFFFFFFFF)
173 if (regs
[start
] == 0xF0000000) {
174 for (unsigned i
= start
.byte(); i
< 4; i
++)
175 if (subdword_regs
[start
][i
] == 0xFFFFFFFF)
181 void clear(PhysReg start
, RegClass rc
) {
182 if (rc
.is_subdword())
183 fill_subdword(start
, rc
.bytes(), 0);
185 fill(start
, rc
.size(), 0);
188 void fill(Operand op
) {
189 if (op
.regClass().is_subdword())
190 fill_subdword(op
.physReg(), op
.bytes(), op
.tempId());
192 fill(op
.physReg(), op
.size(), op
.tempId());
195 void clear(Operand op
) {
196 clear(op
.physReg(), op
.regClass());
199 void fill(Definition def
) {
200 if (def
.regClass().is_subdword())
201 fill_subdword(def
.physReg(), def
.bytes(), def
.tempId());
203 fill(def
.physReg(), def
.size(), def
.tempId());
206 void clear(Definition def
) {
207 clear(def
.physReg(), def
.regClass());
211 void fill(PhysReg start
, unsigned size
, uint32_t val
) {
212 for (unsigned i
= 0; i
< size
; i
++)
213 regs
[start
+ i
] = val
;
216 void fill_subdword(PhysReg start
, unsigned num_bytes
, uint32_t val
) {
217 fill(start
, DIV_ROUND_UP(num_bytes
, 4), 0xF0000000);
218 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
220 std::array
<uint32_t, 4>& sub
= subdword_regs
.emplace(i
, std::array
<uint32_t, 4>{0, 0, 0, 0}).first
->second
;
221 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++)
224 if (sub
== std::array
<uint32_t, 4>{0, 0, 0, 0}) {
225 subdword_regs
.erase(i
);
233 /* helper function for debugging */
235 void print_regs(ra_ctx
& ctx
, bool vgprs
, RegisterFile
& reg_file
)
237 unsigned max
= vgprs
? ctx
.program
->max_reg_demand
.vgpr
: ctx
.program
->max_reg_demand
.sgpr
;
238 unsigned lb
= vgprs
? 256 : 0;
239 unsigned ub
= lb
+ max
;
240 char reg_char
= vgprs
? 'v' : 's';
244 for (unsigned i
= lb
; i
< ub
; i
+= 3) {
245 printf("%.2u ", i
- lb
);
250 printf("%cgprs: ", reg_char
);
251 unsigned free_regs
= 0;
253 bool char_select
= false;
254 for (unsigned i
= lb
; i
< ub
; i
++) {
255 if (reg_file
[i
] == 0xFFFF) {
257 } else if (reg_file
[i
]) {
258 if (reg_file
[i
] != prev
) {
260 char_select
= !char_select
;
262 printf(char_select
? "#" : "@");
270 printf("%u/%u used, %u/%u free\n", max
- free_regs
, max
, free_regs
, max
);
272 /* print assignments */
275 for (unsigned i
= lb
; i
< ub
; i
++) {
276 if (reg_file
[i
] != prev
) {
277 if (prev
&& size
> 1)
278 printf("-%d]\n", i
- 1 - lb
);
282 if (prev
&& prev
!= 0xFFFF) {
283 if (ctx
.orig_names
.count(reg_file
[i
]) && ctx
.orig_names
[reg_file
[i
]].id() != reg_file
[i
])
284 printf("%%%u (was %%%d) = %c[%d", reg_file
[i
], ctx
.orig_names
[reg_file
[i
]].id(), reg_char
, i
- lb
);
286 printf("%%%u = %c[%d", reg_file
[i
], reg_char
, i
- lb
);
293 if (prev
&& size
> 1)
294 printf("-%d]\n", ub
- lb
- 1);
301 void adjust_max_used_regs(ra_ctx
& ctx
, RegClass rc
, unsigned reg
)
303 unsigned max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
304 unsigned size
= rc
.size();
305 if (rc
.type() == RegType::vgpr
) {
307 unsigned hi
= reg
- 256 + size
- 1;
308 ctx
.max_used_vgpr
= std::max(ctx
.max_used_vgpr
, hi
);
309 } else if (reg
+ rc
.size() <= max_addressible_sgpr
) {
310 unsigned hi
= reg
+ size
- 1;
311 ctx
.max_used_sgpr
= std::max(ctx
.max_used_sgpr
, std::min(hi
, max_addressible_sgpr
));
316 void update_renames(ra_ctx
& ctx
, RegisterFile
& reg_file
,
317 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
318 aco_ptr
<Instruction
>& instr
)
320 /* allocate id's and rename operands: this is done transparently here */
321 for (std::pair
<Operand
, Definition
>& copy
: parallelcopies
) {
322 /* the definitions with id are not from this function and already handled */
323 if (copy
.second
.isTemp())
326 /* check if we we moved another parallelcopy definition */
327 for (std::pair
<Operand
, Definition
>& other
: parallelcopies
) {
328 if (!other
.second
.isTemp())
330 if (copy
.first
.getTemp() == other
.second
.getTemp()) {
331 copy
.first
.setTemp(other
.first
.getTemp());
332 copy
.first
.setFixed(other
.first
.physReg());
335 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
336 copy
.second
.setTemp(Temp(ctx
.program
->allocateId(), copy
.second
.regClass()));
337 ctx
.assignments
.emplace_back(copy
.second
.physReg(), copy
.second
.regClass());
338 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
339 reg_file
.fill(copy
.second
);
341 /* check if we moved an operand */
342 for (Operand
& op
: instr
->operands
) {
345 if (op
.tempId() == copy
.first
.tempId()) {
346 bool omit_renaming
= instr
->opcode
== aco_opcode::p_create_vector
&& !op
.isKillBeforeDef();
347 for (std::pair
<Operand
, Definition
>& pc
: parallelcopies
) {
348 PhysReg def_reg
= pc
.second
.physReg();
349 omit_renaming
&= def_reg
> copy
.first
.physReg() ?
350 (copy
.first
.physReg() + copy
.first
.size() <= def_reg
.reg()) :
351 (def_reg
+ pc
.second
.size() <= copy
.first
.physReg().reg());
355 op
.setTemp(copy
.second
.getTemp());
356 op
.setFixed(copy
.second
.physReg());
362 std::pair
<PhysReg
, bool> get_reg_simple(ra_ctx
& ctx
,
363 RegisterFile
& reg_file
,
366 uint32_t lb
= info
.lb
;
367 uint32_t ub
= info
.ub
;
368 uint32_t size
= info
.size
;
369 uint32_t stride
= info
.stride
;
370 RegClass rc
= info
.rc
;
372 if (rc
.is_subdword()) {
373 for (std::pair
<uint32_t, std::array
<uint32_t, 4>> entry
: reg_file
.subdword_regs
) {
374 assert(reg_file
[entry
.first
] == 0xF0000000);
375 if (lb
> entry
.first
|| entry
.first
>= ub
)
378 for (unsigned i
= 0; i
< 4; i
+= stride
) {
379 if (entry
.second
[i
] != 0)
382 bool reg_found
= true;
383 for (unsigned j
= 1; reg_found
&& i
+ j
< 4 && j
< rc
.bytes(); j
++)
384 reg_found
&= entry
.second
[i
+ j
] == 0;
386 /* check neighboring reg if needed */
387 reg_found
&= ((int)i
<= 4 - (int)rc
.bytes() || reg_file
[entry
.first
+ 1] == 0);
389 PhysReg res
{entry
.first
};
391 adjust_max_used_regs(ctx
, rc
, entry
.first
);
397 stride
= 1; /* stride in full registers */
398 rc
= info
.rc
= RegClass(RegType::vgpr
, size
);
403 for (unsigned stride
= 8; stride
> 1; stride
/= 2) {
406 info
.stride
= stride
;
407 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
412 /* best fit algorithm: find the smallest gap to fit in the variable */
413 unsigned best_pos
= 0xFFFF;
414 unsigned gap_size
= 0xFFFF;
415 unsigned last_pos
= 0xFFFF;
417 for (unsigned current_reg
= lb
; current_reg
< ub
; current_reg
++) {
419 if (reg_file
[current_reg
] == 0 && !ctx
.war_hint
[current_reg
]) {
420 if (last_pos
== 0xFFFF)
421 last_pos
= current_reg
;
423 /* stop searching after max_used_gpr */
424 if (current_reg
== ctx
.max_used_sgpr
+ 1 || current_reg
== 256 + ctx
.max_used_vgpr
+ 1)
430 if (last_pos
== 0xFFFF)
433 /* early return on exact matches */
434 if (last_pos
+ size
== current_reg
) {
435 adjust_max_used_regs(ctx
, rc
, last_pos
);
436 return {PhysReg
{last_pos
}, true};
439 /* check if it fits and the gap size is smaller */
440 if (last_pos
+ size
< current_reg
&& current_reg
- last_pos
< gap_size
) {
442 gap_size
= current_reg
- last_pos
;
448 if (last_pos
+ size
<= ub
&& ub
- last_pos
< gap_size
) {
450 gap_size
= ub
- last_pos
;
453 if (best_pos
== 0xFFFF)
456 /* find best position within gap by leaving a good stride for other variables*/
457 unsigned buffer
= gap_size
- size
;
459 if (((best_pos
+ size
) % 8 != 0 && (best_pos
+ buffer
) % 8 == 0) ||
460 ((best_pos
+ size
) % 4 != 0 && (best_pos
+ buffer
) % 4 == 0) ||
461 ((best_pos
+ size
) % 2 != 0 && (best_pos
+ buffer
) % 2 == 0))
462 best_pos
= best_pos
+ buffer
;
465 adjust_max_used_regs(ctx
, rc
, best_pos
);
466 return {PhysReg
{best_pos
}, true};
470 unsigned reg_lo
= lb
;
471 unsigned reg_hi
= lb
+ size
- 1;
472 while (!found
&& reg_lo
+ size
<= ub
) {
473 if (reg_file
[reg_lo
] != 0) {
477 reg_hi
= reg_lo
+ size
- 1;
479 for (unsigned reg
= reg_lo
+ 1; found
&& reg
<= reg_hi
; reg
++) {
480 if (reg_file
[reg
] != 0 || ctx
.war_hint
[reg
])
484 adjust_max_used_regs(ctx
, rc
, reg_lo
);
485 return {PhysReg
{reg_lo
}, true};
494 /* collect variables from a register area and clear reg_file */
495 std::set
<std::pair
<unsigned, unsigned>> collect_vars(ra_ctx
& ctx
, RegisterFile
& reg_file
,
496 PhysReg reg
, unsigned size
)
498 std::set
<std::pair
<unsigned, unsigned>> vars
;
499 for (unsigned j
= reg
; j
< reg
+ size
; j
++) {
500 if (reg_file
.is_blocked(PhysReg
{j
}))
502 if (reg_file
[j
] == 0xF0000000) {
503 for (unsigned k
= 0; k
< 4; k
++) {
504 unsigned id
= reg_file
.subdword_regs
[j
][k
];
506 assignment
& var
= ctx
.assignments
[id
];
507 vars
.emplace(var
.rc
.bytes(), id
);
508 reg_file
.clear(var
.reg
, var
.rc
);
513 } else if (reg_file
[j
] != 0) {
514 unsigned id
= reg_file
[j
];
515 assignment
& var
= ctx
.assignments
[id
];
516 vars
.emplace(var
.rc
.bytes(), id
);
517 reg_file
.clear(var
.reg
, var
.rc
);
523 bool get_regs_for_copies(ra_ctx
& ctx
,
524 RegisterFile
& reg_file
,
525 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
526 const std::set
<std::pair
<unsigned, unsigned>> &vars
,
527 uint32_t lb
, uint32_t ub
,
528 aco_ptr
<Instruction
>& instr
,
533 /* variables are sorted from small sized to large */
534 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
535 for (std::set
<std::pair
<unsigned, unsigned>>::const_reverse_iterator it
= vars
.rbegin(); it
!= vars
.rend(); ++it
) {
536 unsigned id
= it
->second
;
537 assignment
& var
= ctx
.assignments
[id
];
538 DefInfo info
= DefInfo(ctx
, ctx
.pseudo_dummy
, var
.rc
);
539 uint32_t size
= info
.size
;
541 /* check if this is a dead operand, then we can re-use the space from the definition */
542 bool is_dead_operand
= false;
543 for (unsigned i
= 0; !is_phi(instr
) && !is_dead_operand
&& (i
< instr
->operands
.size()); i
++) {
544 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isKillBeforeDef() && instr
->operands
[i
].tempId() == id
)
545 is_dead_operand
= true;
548 std::pair
<PhysReg
, bool> res
;
549 if (is_dead_operand
) {
550 if (instr
->opcode
== aco_opcode::p_create_vector
) {
551 PhysReg
reg(def_reg_lo
);
552 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
553 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
554 assert(!reg_file
.test(reg
, var
.rc
.bytes()));
555 res
= {reg
, reg
.byte() == 0 || instr_can_access_subdword(ctx
, instr
)};
558 reg
.reg_b
+= instr
->operands
[i
].bytes();
561 info
.lb
= def_reg_lo
;
562 info
.ub
= def_reg_hi
+ 1;
563 res
= get_reg_simple(ctx
, reg_file
, info
);
567 info
.ub
= def_reg_lo
;
568 res
= get_reg_simple(ctx
, reg_file
, info
);
570 info
.lb
= (def_reg_hi
+ info
.stride
) & ~(info
.stride
- 1);
572 res
= get_reg_simple(ctx
, reg_file
, info
);
577 /* mark the area as blocked */
578 reg_file
.block(res
.first
, var
.rc
);
580 /* create parallelcopy pair (without definition id) */
581 Temp tmp
= Temp(id
, var
.rc
);
582 Operand pc_op
= Operand(tmp
);
583 pc_op
.setFixed(var
.reg
);
584 Definition pc_def
= Definition(res
.first
, pc_op
.regClass());
585 parallelcopies
.emplace_back(pc_op
, pc_def
);
589 unsigned best_pos
= lb
;
590 unsigned num_moves
= 0xFF;
591 unsigned num_vars
= 0;
593 /* we use a sliding window to find potential positions */
594 unsigned reg_lo
= lb
;
595 unsigned reg_hi
= lb
+ size
- 1;
596 unsigned stride
= var
.rc
.is_subdword() ? 1 : info
.stride
;
597 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
598 if (!is_dead_operand
&& ((reg_lo
>= def_reg_lo
&& reg_lo
<= def_reg_hi
) ||
599 (reg_hi
>= def_reg_lo
&& reg_hi
<= def_reg_hi
)))
602 /* second, check that we have at most k=num_moves elements in the window
603 * and no element is larger than the currently processed one */
606 unsigned last_var
= 0;
608 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
609 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
612 if (reg_file
.is_blocked(PhysReg
{j
}) || k
> num_moves
) {
616 if (reg_file
[j
] == 0xF0000000) {
621 /* we cannot split live ranges of linear vgprs */
622 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
626 bool is_kill
= false;
627 for (const Operand
& op
: instr
->operands
) {
628 if (op
.isTemp() && op
.isKillBeforeDef() && op
.tempId() == reg_file
[j
]) {
633 if (!is_kill
&& ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
638 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
639 last_var
= reg_file
[j
];
641 if (k
> num_moves
|| (k
== num_moves
&& n
<= num_vars
)) {
654 /* FIXME: we messed up and couldn't find space for the variables to be copied */
655 if (num_moves
== 0xFF)
659 reg_hi
= best_pos
+ size
- 1;
661 /* collect variables and block reg file */
662 std::set
<std::pair
<unsigned, unsigned>> new_vars
= collect_vars(ctx
, reg_file
, PhysReg
{reg_lo
}, size
);
664 /* mark the area as blocked */
665 reg_file
.block(PhysReg
{reg_lo
}, var
.rc
);
667 if (!get_regs_for_copies(ctx
, reg_file
, parallelcopies
, new_vars
, lb
, ub
, instr
, def_reg_lo
, def_reg_hi
))
670 adjust_max_used_regs(ctx
, var
.rc
, reg_lo
);
672 /* create parallelcopy pair (without definition id) */
673 Temp tmp
= Temp(id
, var
.rc
);
674 Operand pc_op
= Operand(tmp
);
675 pc_op
.setFixed(var
.reg
);
676 Definition pc_def
= Definition(PhysReg
{reg_lo
}, pc_op
.regClass());
677 parallelcopies
.emplace_back(pc_op
, pc_def
);
684 std::pair
<PhysReg
, bool> get_reg_impl(ra_ctx
& ctx
,
685 RegisterFile
& reg_file
,
686 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
688 aco_ptr
<Instruction
>& instr
)
690 uint32_t lb
= info
.lb
;
691 uint32_t ub
= info
.ub
;
692 uint32_t size
= info
.size
;
693 uint32_t stride
= info
.stride
;
694 RegClass rc
= info
.rc
;
696 /* check how many free regs we have */
697 unsigned regs_free
= reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
);
699 /* mark and count killed operands */
700 unsigned killed_ops
= 0;
701 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
702 if (instr
->operands
[j
].isTemp() &&
703 instr
->operands
[j
].isFirstKillBeforeDef() &&
704 instr
->operands
[j
].physReg() >= lb
&&
705 instr
->operands
[j
].physReg() < ub
&&
706 !reg_file
.test(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes())) {
707 assert(instr
->operands
[j
].isFixed());
708 reg_file
.block(instr
->operands
[j
].physReg(), instr
->operands
[j
].regClass());
709 killed_ops
+= instr
->operands
[j
].getTemp().size();
713 assert(regs_free
>= size
);
714 /* we might have to move dead operands to dst in order to make space */
715 unsigned op_moves
= 0;
717 if (size
> (regs_free
- killed_ops
))
718 op_moves
= size
- (regs_free
- killed_ops
);
720 /* find the best position to place the definition */
721 unsigned best_pos
= lb
;
722 unsigned num_moves
= 0xFF;
723 unsigned num_vars
= 0;
725 /* we use a sliding window to check potential positions */
726 unsigned reg_lo
= lb
;
727 unsigned reg_hi
= lb
+ size
- 1;
728 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
729 /* first check the edges: this is what we have to fix to allow for num_moves > size */
730 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
732 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
735 /* second, check that we have at most k=num_moves elements in the window
736 * and no element is larger than the currently processed one */
737 unsigned k
= op_moves
;
739 unsigned remaining_op_moves
= op_moves
;
740 unsigned last_var
= 0;
742 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
743 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
744 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
747 /* dead operands effectively reduce the number of estimated moves */
748 if (reg_file
.is_blocked(PhysReg
{j
})) {
749 if (remaining_op_moves
) {
751 remaining_op_moves
--;
756 if (reg_file
[j
] == 0xF0000000) {
762 if (ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
767 /* we cannot split live ranges of linear vgprs */
768 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
773 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
775 last_var
= reg_file
[j
];
778 if (!found
|| k
> num_moves
)
780 if (k
== num_moves
&& n
< num_vars
)
782 if (!aligned
&& k
== num_moves
&& n
== num_vars
)
792 if (num_moves
== 0xFF) {
793 /* remove killed operands from reg_file once again */
794 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
795 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef())
796 reg_file
.clear(instr
->operands
[i
]);
798 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
799 Definition def
= instr
->definitions
[i
];
800 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
806 RegisterFile register_file
= reg_file
;
808 /* now, we figured the placement for our definition */
809 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
811 if (instr
->opcode
== aco_opcode::p_create_vector
) {
812 /* move killed operands which aren't yet at the correct position (GFX9+)
813 * or which are in the definition space */
814 PhysReg reg
= PhysReg
{best_pos
};
815 for (Operand
& op
: instr
->operands
) {
816 if (op
.isTemp() && op
.isFirstKillBeforeDef() &&
817 op
.getTemp().type() == rc
.type()) {
818 if (op
.physReg() != reg
&&
819 (ctx
.program
->chip_class
>= GFX9
||
820 (op
.physReg().advance(op
.bytes()) > PhysReg
{best_pos
} &&
821 op
.physReg() < PhysReg
{best_pos
+ size
}))) {
822 vars
.emplace(op
.bytes(), op
.tempId());
828 reg
.reg_b
+= op
.bytes();
830 } else if (!is_phi(instr
)) {
831 /* re-enable killed operands */
832 for (Operand
& op
: instr
->operands
) {
833 if (op
.isTemp() && op
.isFirstKillBeforeDef())
838 std::vector
<std::pair
<Operand
, Definition
>> pc
;
839 if (!get_regs_for_copies(ctx
, reg_file
, pc
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1)) {
840 reg_file
= std::move(register_file
);
841 /* remove killed operands from reg_file once again */
842 if (!is_phi(instr
)) {
843 for (const Operand
& op
: instr
->operands
) {
844 if (op
.isTemp() && op
.isFirstKillBeforeDef())
848 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
849 Definition
& def
= instr
->definitions
[i
];
850 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
856 parallelcopies
.insert(parallelcopies
.end(), pc
.begin(), pc
.end());
858 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
859 reg_file
.clear(PhysReg
{best_pos
}, rc
);
861 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
863 /* remove killed operands from reg_file once again */
864 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
865 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
867 assert(!instr
->operands
[i
].isUndefined());
868 if (instr
->operands
[i
].isFirstKillBeforeDef())
869 reg_file
.clear(instr
->operands
[i
]);
871 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
872 Definition def
= instr
->definitions
[i
];
873 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
877 adjust_max_used_regs(ctx
, rc
, best_pos
);
878 return {PhysReg
{best_pos
}, true};
881 bool get_reg_specified(ra_ctx
& ctx
,
882 RegisterFile
& reg_file
,
884 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
885 aco_ptr
<Instruction
>& instr
,
888 if (rc
.is_subdword() && reg
.byte() && !instr_can_access_subdword(ctx
, instr
))
890 if (!rc
.is_subdword() && reg
.byte())
893 uint32_t size
= rc
.size();
897 if (rc
.type() == RegType::vgpr
) {
899 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
905 if (reg
% stride
!= 0)
908 ub
= ctx
.program
->max_reg_demand
.sgpr
;
911 uint32_t reg_lo
= reg
.reg();
912 uint32_t reg_hi
= reg
+ (size
- 1);
914 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
> reg_hi
)
917 if (reg_file
.test(reg
, rc
.bytes()))
920 adjust_max_used_regs(ctx
, rc
, reg_lo
);
924 PhysReg
get_reg(ra_ctx
& ctx
,
925 RegisterFile
& reg_file
,
927 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
928 aco_ptr
<Instruction
>& instr
)
930 auto split_vec
= ctx
.split_vectors
.find(temp
.id());
931 if (split_vec
!= ctx
.split_vectors
.end()) {
933 for (Definition def
: split_vec
->second
->definitions
) {
934 auto affinity_it
= ctx
.affinities
.find(def
.tempId());
935 if (affinity_it
!= ctx
.affinities
.end() && ctx
.assignments
[affinity_it
->second
].assigned
) {
936 PhysReg reg
= ctx
.assignments
[affinity_it
->second
].reg
;
938 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
941 offset
+= def
.bytes();
945 if (ctx
.affinities
.find(temp
.id()) != ctx
.affinities
.end() &&
946 ctx
.assignments
[ctx
.affinities
[temp
.id()]].assigned
) {
947 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[temp
.id()]].reg
;
948 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
952 if (ctx
.vectors
.find(temp
.id()) != ctx
.vectors
.end()) {
953 Instruction
* vec
= ctx
.vectors
[temp
.id()];
954 unsigned byte_offset
= 0;
955 for (const Operand
& op
: vec
->operands
) {
956 if (op
.isTemp() && op
.tempId() == temp
.id())
959 byte_offset
+= op
.bytes();
962 for (const Operand
& op
: vec
->operands
) {
964 op
.tempId() != temp
.id() &&
965 op
.getTemp().type() == temp
.type() &&
966 ctx
.assignments
[op
.tempId()].assigned
) {
967 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
968 reg
.reg_b
+= (byte_offset
- k
);
969 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
975 DefInfo
info(ctx
, ctx
.pseudo_dummy
, vec
->definitions
[0].regClass());
976 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
977 PhysReg reg
= res
.first
;
979 reg
.reg_b
+= byte_offset
;
980 /* make sure to only use byte offset if the instruction supports it */
981 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
986 DefInfo
info(ctx
, instr
, temp
.regClass());
988 /* try to find space without live-range splits */
989 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
994 /* try to find space with live-range splits */
995 res
= get_reg_impl(ctx
, reg_file
, parallelcopies
, info
, instr
);
1000 /* try using more registers */
1002 /* We should only fail here because keeping under the limit would require
1003 * too many moves. */
1004 assert(reg_file
.count_zero(PhysReg
{info
.lb
}, info
.ub
-info
.lb
) >= info
.size
);
1006 uint16_t max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
1007 uint16_t max_addressible_vgpr
= ctx
.program
->vgpr_limit
;
1008 if (info
.rc
.type() == RegType::vgpr
&& ctx
.program
->max_reg_demand
.vgpr
< max_addressible_vgpr
) {
1009 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
+ 1, ctx
.program
->max_reg_demand
.sgpr
));
1010 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
);
1011 } else if (info
.rc
.type() == RegType::sgpr
&& ctx
.program
->max_reg_demand
.sgpr
< max_addressible_sgpr
) {
1012 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
, ctx
.program
->max_reg_demand
.sgpr
+ 1));
1013 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
);
1016 //FIXME: if nothing helps, shift-rotate the registers to make space
1018 fprintf(stderr
, "ACO: failed to allocate registers during shader compilation\n");
1022 PhysReg
get_reg_create_vector(ra_ctx
& ctx
,
1023 RegisterFile
& reg_file
,
1025 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
1026 aco_ptr
<Instruction
>& instr
)
1028 RegClass rc
= temp
.regClass();
1029 /* create_vector instructions have different costs w.r.t. register coalescing */
1030 uint32_t size
= rc
.size();
1031 uint32_t bytes
= rc
.bytes();
1032 uint32_t stride
= 1;
1034 if (rc
.type() == RegType::vgpr
) {
1036 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
1039 ub
= ctx
.program
->max_reg_demand
.sgpr
;
1046 //TODO: improve p_create_vector for sub-dword vectors
1048 unsigned best_pos
= -1;
1049 unsigned num_moves
= 0xFF;
1050 bool best_war_hint
= true;
1052 /* test for each operand which definition placement causes the least shuffle instructions */
1053 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1054 // TODO: think about, if we can alias live operands on the same register
1055 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isKillBeforeDef() || instr
->operands
[i
].getTemp().type() != rc
.type())
1058 if (offset
> instr
->operands
[i
].physReg().reg_b
)
1061 unsigned reg_lo
= instr
->operands
[i
].physReg().reg_b
- offset
;
1065 unsigned reg_hi
= reg_lo
+ size
- 1;
1068 /* no need to check multiple times */
1069 if (reg_lo
== best_pos
)
1073 // TODO: this can be improved */
1074 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
% stride
!= 0)
1076 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
1078 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
1081 /* count variables to be moved and check war_hint */
1082 bool war_hint
= false;
1083 bool linear_vgpr
= false;
1084 for (unsigned j
= reg_lo
; j
<= reg_hi
&& !linear_vgpr
; j
++) {
1085 if (reg_file
[j
] != 0) {
1086 if (reg_file
[j
] == 0xF0000000) {
1089 unsigned bytes_left
= bytes
- (j
- reg_lo
) * 4;
1090 for (unsigned k
= 0; k
< MIN2(bytes_left
, 4); k
++, reg
.reg_b
++)
1091 k
+= reg_file
.test(reg
, 1);
1094 /* we cannot split live ranges of linear vgprs */
1095 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6))
1099 war_hint
|= ctx
.war_hint
[j
];
1101 if (linear_vgpr
|| (war_hint
&& !best_war_hint
))
1104 /* count operands in wrong positions */
1105 for (unsigned j
= 0, offset
= 0; j
< instr
->operands
.size(); offset
+= instr
->operands
[j
].bytes(), j
++) {
1107 !instr
->operands
[j
].isTemp() ||
1108 instr
->operands
[j
].getTemp().type() != rc
.type())
1110 if (instr
->operands
[j
].physReg().reg_b
!= reg_lo
* 4 + offset
)
1111 k
+= instr
->operands
[j
].bytes();
1113 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
1114 if (k
> num_moves
|| (!aligned
&& k
== num_moves
))
1119 best_war_hint
= war_hint
;
1122 if (num_moves
>= bytes
)
1123 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
);
1125 /* re-enable killed operands which are in the wrong position */
1126 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1127 if (instr
->operands
[i
].isTemp() &&
1128 instr
->operands
[i
].isFirstKillBeforeDef() &&
1129 instr
->operands
[i
].physReg().reg_b
!= best_pos
* 4 + offset
)
1130 reg_file
.fill(instr
->operands
[i
]);
1133 /* collect variables to be moved */
1134 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
1136 /* GFX9+: move killed operands which aren't yet at the correct position
1137 * Moving all killed operands generally leads to more register swaps.
1138 * This is only done on GFX9+ because of the cheap v_swap instruction.
1140 if (ctx
.program
->chip_class
>= GFX9
) {
1141 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1142 if (instr
->operands
[i
].isTemp() &&
1143 instr
->operands
[i
].isFirstKillBeforeDef() &&
1144 instr
->operands
[i
].getTemp().type() == rc
.type() &&
1145 instr
->operands
[i
].physReg().reg_b
!= best_pos
* 4 + offset
) {
1146 vars
.emplace(instr
->operands
[i
].bytes(), instr
->operands
[i
].tempId());
1147 reg_file
.clear(instr
->operands
[i
]);
1151 ASSERTED
bool success
= false;
1152 success
= get_regs_for_copies(ctx
, reg_file
, parallelcopies
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1);
1155 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
1156 adjust_max_used_regs(ctx
, rc
, best_pos
);
1158 /* remove killed operands from reg_file once again */
1159 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1160 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
1162 assert(!instr
->operands
[i
].isUndefined());
1163 if (instr
->operands
[i
].isFirstKillBeforeDef())
1164 reg_file
.clear(instr
->operands
[i
]);
1167 return PhysReg
{best_pos
};
1170 void handle_pseudo(ra_ctx
& ctx
,
1171 const RegisterFile
& reg_file
,
1174 if (instr
->format
!= Format::PSEUDO
)
1177 /* all instructions which use handle_operands() need this information */
1178 switch (instr
->opcode
) {
1179 case aco_opcode::p_extract_vector
:
1180 case aco_opcode::p_create_vector
:
1181 case aco_opcode::p_split_vector
:
1182 case aco_opcode::p_parallelcopy
:
1183 case aco_opcode::p_wqm
:
1189 /* if all definitions are vgpr, no need to care for SCC */
1190 bool writes_sgpr
= false;
1191 for (Definition
& def
: instr
->definitions
) {
1192 if (def
.getTemp().type() == RegType::sgpr
) {
1197 /* if all operands are constant, no need to care either */
1198 bool reads_sgpr
= false;
1199 for (Operand
& op
: instr
->operands
) {
1200 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
1205 if (!(writes_sgpr
&& reads_sgpr
))
1208 Pseudo_instruction
*pi
= (Pseudo_instruction
*)instr
;
1209 if (reg_file
[scc
.reg()]) {
1210 pi
->tmp_in_scc
= true;
1212 int reg
= ctx
.max_used_sgpr
;
1213 for (; reg
>= 0 && reg_file
[reg
]; reg
--)
1216 reg
= ctx
.max_used_sgpr
+ 1;
1217 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& reg_file
[reg
]; reg
++)
1219 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
1222 adjust_max_used_regs(ctx
, s1
, reg
);
1223 pi
->scratch_sgpr
= PhysReg
{(unsigned)reg
};
1225 pi
->tmp_in_scc
= false;
/* operand_can_use_reg(): returns whether operand `idx` of `instr` may be
 * assigned physical register `reg`. Fixed operands only accept their fixed
 * register; misaligned (subdword) registers require subdword-capable
 * instructions; some formats additionally restrict special registers
 * (SCC, M0, VCC). NOTE(review): the switch's case labels and default return
 * are missing from this extraction — verify against the complete file. */
1229 bool operand_can_use_reg(ra_ctx
& ctx
, aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
)
/* a pre-fixed operand can only use exactly its fixed register */
1231 if (instr
->operands
[idx
].isFixed())
1232 return instr
->operands
[idx
].physReg() == reg
;
/* a register with a byte offset needs subdword-capable instructions */
1234 if (reg
.byte() && !instr_can_access_subdword(ctx
, instr
))
/* per-format restrictions on special registers */
1237 switch (instr
->format
) {
1239 return reg
!= scc
&&
1241 (reg
!= m0
|| idx
== 1 || idx
== 3) && /* offset can be m0 */
1242 (reg
!= vcc
|| (instr
->definitions
.empty() && idx
== 2)); /* sdata can be vcc */
1244 // TODO: there are more instructions with restrictions on registers
/* get_reg_for_operand(): moves `operand` into a usable physical register via
 * a parallelcopy. If the operand is fixed and its target register is occupied,
 * the blocking variable is first evicted to a new register; otherwise a fresh
 * register is chosen with get_reg(). The copy (old reg -> dst) is appended to
 * `parallelcopy` and the SSA renames are updated.
 * NOTE(review): some interior lines (braces, the else branch header, the
 * operand.setFixed call) are missing from this extraction — verify against
 * the complete file. */
1249 void get_reg_for_operand(ra_ctx
& ctx
, RegisterFile
& register_file
,
1250 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopy
,
1251 aco_ptr
<Instruction
>& instr
, Operand
& operand
)
1253 /* check if the operand is fixed */
1255 if (operand
.isFixed()) {
/* the operand is not already in its required register */
1256 assert(operand
.physReg() != ctx
.assignments
[operand
.tempId()].reg
);
1258 /* check if target reg is blocked, and move away the blocking var */
1259 if (register_file
[operand
.physReg().reg()]) {
/* 0xF0000000 marks a blocked slot, not a real variable id */
1260 assert(register_file
[operand
.physReg()] != 0xF0000000);
1261 uint32_t blocking_id
= register_file
[operand
.physReg().reg()];
1262 RegClass rc
= ctx
.assignments
[blocking_id
].rc
;
1263 Operand pc_op
= Operand(Temp
{blocking_id
, rc
});
1264 pc_op
.setFixed(operand
.physReg());
/* evict the blocking variable; pseudo_dummy avoids instr-specific limits */
1267 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.getTemp(), parallelcopy
, ctx
.pseudo_dummy
);
1268 Definition pc_def
= Definition(PhysReg
{reg
}, pc_op
.regClass());
1269 register_file
.clear(pc_op
);
1270 parallelcopy
.emplace_back(pc_op
, pc_def
);
/* fixed operand: the destination is the required register */
1272 dst
= operand
.physReg();
/* unfixed operand: pick any suitable register */
1275 dst
= get_reg(ctx
, register_file
, operand
.getTemp(), parallelcopy
, instr
);
/* record the copy from the operand's current register into dst */
1278 Operand pc_op
= operand
;
1279 pc_op
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
1280 Definition pc_def
= Definition(dst
, pc_op
.regClass());
1281 register_file
.clear(pc_op
);
1282 parallelcopy
.emplace_back(pc_op
, pc_def
);
1283 update_renames(ctx
, register_file
, parallelcopy
, instr
);
/* read_variable(): returns the current SSA rename of `val` in block
 * `block_idx`, looked up in ctx.renames; falls through when no rename exists.
 * NOTE(review): the two return statements are missing from this extraction —
 * verify against the complete file. */
1286 Temp
read_variable(ra_ctx
& ctx
, Temp val
, unsigned block_idx
)
1288 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.renames
[block_idx
].find(val
.id());
/* no per-block rename recorded for this temp */
1289 if (it
== ctx
.renames
[block_idx
].end())
/* handle_live_in(): SSA-construction handling of a value live into `block`
 * (in the style of Braun et al.'s on-the-fly SSA construction):
 *  - unsealed block: create an incomplete phi that may be removed later;
 *  - single predecessor: simply read the rename from that predecessor;
 *  - sealed block with multiple predecessors: read the rename from every
 *    predecessor and insert a phi only if they disagree.
 * Finally records the (possibly new) rename and the original name mapping.
 * NOTE(review): several interior lines (braces, returns, the needs_phi
 * early-out) are missing from this extraction — verify against the
 * complete file. */
1295 Temp
handle_live_in(ra_ctx
& ctx
, Temp val
, Block
* block
)
/* linear temps follow linear CFG edges, others the logical CFG */
1297 std::vector
<unsigned>& preds
= val
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
/* entry block or all-linear regclass: nothing to merge */
1298 if (preds
.size() == 0 || val
.regClass() == val
.regClass().as_linear())
1301 assert(preds
.size() > 0);
1304 if (!ctx
.sealed
[block
->index
]) {
1305 /* consider rename from already processed predecessor */
1306 Temp tmp
= read_variable(ctx
, val
, preds
[0]);
1308 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1309 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
1310 ctx
.assignments
.emplace_back();
1311 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1312 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1313 phi
->definitions
[0] = Definition(new_val
);
/* operands start as the un-renamed value; filled in when preds are known */
1314 for (unsigned i
= 0; i
< preds
.size(); i
++)
1315 phi
->operands
[i
] = Operand(val
);
/* prefer placing the phi result where the first predecessor's copy lives */
1316 if (tmp
.regClass() == new_val
.regClass())
1317 ctx
.affinities
[new_val
.id()] = tmp
.id();
/* remember the incomplete phi so sealing the block can complete it */
1319 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1320 ctx
.incomplete_phis
[block
->index
].emplace_back(phi
.get());
1321 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
1323 } else if (preds
.size() == 1) {
1324 /* if the block has only one predecessor, just look there for the name */
1325 new_val
= read_variable(ctx
, val
, preds
[0]);
1327 /* there are multiple predecessors and the block is sealed */
1328 Temp ops
[preds
.size()];
1330 /* get the rename from each predecessor and check if they are the same */
1331 bool needs_phi
= false;
1332 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1333 ops
[i
] = read_variable(ctx
, val
, preds
[i
]);
1337 needs_phi
|= !(new_val
== ops
[i
]);
1341 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1342 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1343 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1344 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
1345 phi
->definitions
[0] = Definition(new_val
);
/* phi operands are the per-predecessor renames, fixed to their registers */
1346 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1347 phi
->operands
[i
] = Operand(ops
[i
]);
1348 phi
->operands
[i
].setFixed(ctx
.assignments
[ops
[i
].id()].reg
);
1349 if (ops
[i
].regClass() == new_val
.regClass())
1350 ctx
.affinities
[new_val
.id()] = ops
[i
].id();
1352 ctx
.assignments
.emplace_back();
/* one assignment slot per allocated temp id */
1353 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1354 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1355 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
/* record the rename and remember the original name for later lookups */
1359 if (new_val
!= val
) {
1360 ctx
.renames
[block
->index
][val
.id()] = new_val
;
1361 ctx
.orig_names
[new_val
.id()] = val
;
/* try_remove_trivial_phi(): if the phi defining `temp` is trivial (every
 * operand is either the phi's own definition or one single other value
 * `same`), reroute all its uses to `same`, patch the per-block rename maps,
 * mark the phi removable by clearing its definitions, and recurse into phi
 * users that may have become trivial in turn.
 * NOTE(review): several interior lines (the declaration of `same`, continues,
 * braces, the non-trivial early return) are missing from this extraction —
 * verify against the complete file. */
1366 void try_remove_trivial_phi(ra_ctx
& ctx
, Temp temp
)
1368 std::unordered_map
<unsigned, phi_info
>::iterator info
= ctx
.phi_map
.find(temp
.id());
/* only phis in sealed blocks can be proven trivial */
1370 if (info
== ctx
.phi_map
.end() || !ctx
.sealed
[info
->second
.block_idx
])
1373 assert(info
->second
.block_idx
!= 0);
1374 Instruction
* phi
= info
->second
.phi
;
1376 Definition def
= phi
->definitions
[0];
1378 /* a phi node is trivial if all operands are the same as the definition of the phi */
1379 for (const Operand
& op
: phi
->operands
) {
1380 const Temp t
= op
.getTemp();
/* self-references and repeats of `same` keep the phi trivial */
1381 if (t
== same
|| t
== def
.getTemp()) {
1382 assert(t
== same
|| op
.physReg() == def
.physReg());
/* trivial phi must have found a replacement value */
1390 assert(same
!= Temp() || same
== def
.getTemp());
1392 /* reroute all uses to same and remove phi */
1393 std::vector
<Temp
> phi_users
;
1394 std::unordered_map
<unsigned, phi_info
>::iterator same_phi_info
= ctx
.phi_map
.find(same
.id());
1395 for (Instruction
* instr
: info
->second
.uses
) {
1396 assert(phi
!= instr
);
1397 /* recursively try to remove trivial phis */
1398 if (is_phi(instr
)) {
1399 /* ignore if the phi was already flagged trivial */
1400 if (instr
->definitions
.empty())
/* queue other phi users; recursing later avoids iterator invalidation */
1403 if (instr
->definitions
[0].getTemp() != temp
)
1404 phi_users
.emplace_back(instr
->definitions
[0].getTemp());
/* rewrite operands referencing the removed phi to use `same` */
1406 for (Operand
& op
: instr
->operands
) {
1407 if (op
.isTemp() && op
.tempId() == def
.tempId()) {
/* the rerouted use becomes a use of same's phi, if any */
1409 if (same_phi_info
!= ctx
.phi_map
.end())
1410 same_phi_info
->second
.uses
.emplace(instr
);
/* patch rename maps: entries pointing at the phi now point at `same` */
1415 auto it
= ctx
.orig_names
.find(same
.id());
1416 unsigned orig_var
= it
!= ctx
.orig_names
.end() ? it
->second
.id() : same
.id();
1417 for (unsigned i
= 0; i
< ctx
.program
->blocks
.size(); i
++) {
1418 auto it
= ctx
.renames
[i
].find(orig_var
);
1419 if (it
!= ctx
.renames
[i
].end() && it
->second
== def
.getTemp())
1420 ctx
.renames
[i
][orig_var
] = same
;
1423 phi
->definitions
.clear(); /* this indicates that the phi can be removed */
1424 ctx
.phi_map
.erase(info
);
/* users of this phi may now be trivial themselves */
1425 for (Temp t
: phi_users
)
1426 try_remove_trivial_phi(ctx
, t
);
1431 } /* end namespace */
1434 void register_allocation(Program
*program
, std::vector
<TempSet
>& live_out_per_block
)
1436 ra_ctx
ctx(program
);
1437 std::vector
<std::vector
<Temp
>> phi_ressources
;
1438 std::unordered_map
<unsigned, unsigned> temp_to_phi_ressources
;
1440 for (std::vector
<Block
>::reverse_iterator it
= program
->blocks
.rbegin(); it
!= program
->blocks
.rend(); it
++) {
1443 /* first, compute the death points of all live vars within the block */
1444 TempSet
& live
= live_out_per_block
[block
.index
];
1446 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator rit
;
1447 for (rit
= block
.instructions
.rbegin(); rit
!= block
.instructions
.rend(); ++rit
) {
1448 aco_ptr
<Instruction
>& instr
= *rit
;
1449 if (is_phi(instr
)) {
1450 if (instr
->definitions
[0].isKill() || instr
->definitions
[0].isFixed()) {
1451 live
.erase(instr
->definitions
[0].getTemp());
1454 /* collect information about affinity-related temporaries */
1455 std::vector
<Temp
> affinity_related
;
1456 /* affinity_related[0] is the last seen affinity-related temp */
1457 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1458 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1459 for (const Operand
& op
: instr
->operands
) {
1460 if (op
.isTemp() && op
.regClass() == instr
->definitions
[0].regClass()) {
1461 affinity_related
.emplace_back(op
.getTemp());
1462 temp_to_phi_ressources
[op
.tempId()] = phi_ressources
.size();
1465 phi_ressources
.emplace_back(std::move(affinity_related
));
1467 /* add vector affinities */
1468 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1469 for (const Operand
& op
: instr
->operands
) {
1470 if (op
.isTemp() && op
.isFirstKill() && op
.getTemp().type() == instr
->definitions
[0].getTemp().type())
1471 ctx
.vectors
[op
.tempId()] = instr
.get();
1475 if (instr
->opcode
== aco_opcode::p_split_vector
&& instr
->operands
[0].isFirstKillBeforeDef())
1476 ctx
.split_vectors
[instr
->operands
[0].tempId()] = instr
.get();
1478 /* add operands to live variables */
1479 for (const Operand
& op
: instr
->operands
) {
1481 live
.emplace(op
.getTemp());
1485 /* erase definitions from live */
1486 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1487 const Definition
& def
= instr
->definitions
[i
];
1490 live
.erase(def
.getTemp());
1491 /* mark last-seen phi operand */
1492 std::unordered_map
<unsigned, unsigned>::iterator it
= temp_to_phi_ressources
.find(def
.tempId());
1493 if (it
!= temp_to_phi_ressources
.end() && def
.regClass() == phi_ressources
[it
->second
][0].regClass()) {
1494 phi_ressources
[it
->second
][0] = def
.getTemp();
1495 /* try to coalesce phi affinities with parallelcopies */
1496 Operand op
= Operand();
1497 if (!def
.isFixed() && instr
->opcode
== aco_opcode::p_parallelcopy
)
1498 op
= instr
->operands
[i
];
1499 else if (instr
->opcode
== aco_opcode::v_mad_f32
&& !instr
->usesModifiers())
1500 op
= instr
->operands
[2];
1502 if (op
.isTemp() && op
.isFirstKillBeforeDef() && def
.regClass() == op
.regClass()) {
1503 phi_ressources
[it
->second
].emplace_back(op
.getTemp());
1504 temp_to_phi_ressources
[op
.tempId()] = it
->second
;
1510 /* create affinities */
1511 for (std::vector
<Temp
>& vec
: phi_ressources
) {
1512 assert(vec
.size() > 1);
1513 for (unsigned i
= 1; i
< vec
.size(); i
++)
1514 if (vec
[i
].id() != vec
[0].id())
1515 ctx
.affinities
[vec
[i
].id()] = vec
[0].id();
1518 /* state of register file after phis */
1519 std::vector
<std::bitset
<128>> sgpr_live_in(program
->blocks
.size());
1521 for (Block
& block
: program
->blocks
) {
1522 TempSet
& live
= live_out_per_block
[block
.index
];
1523 /* initialize register file */
1524 assert(block
.index
!= 0 || live
.empty());
1525 RegisterFile register_file
;
1526 ctx
.war_hint
.reset();
1528 for (Temp t
: live
) {
1529 Temp renamed
= handle_live_in(ctx
, t
, &block
);
1530 assignment
& var
= ctx
.assignments
[renamed
.id()];
1531 /* due to live-range splits, the live-in might be a phi, now */
1533 register_file
.fill(Definition(renamed
.id(), var
.reg
, var
.rc
));
1536 std::vector
<aco_ptr
<Instruction
>> instructions
;
1537 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1539 /* this is a slight adjustment from the paper as we already have phi nodes:
1540 * We consider them incomplete phis and only handle the definition. */
1542 /* handle fixed phi definitions */
1543 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1544 aco_ptr
<Instruction
>& phi
= *it
;
1547 Definition
& definition
= phi
->definitions
[0];
1548 if (!definition
.isFixed())
1551 /* check if a dead exec mask phi is needed */
1552 if (definition
.isKill()) {
1553 for (Operand
& op
: phi
->operands
) {
1554 assert(op
.isTemp());
1555 if (!ctx
.assignments
[op
.tempId()].assigned
||
1556 ctx
.assignments
[op
.tempId()].reg
!= exec
) {
1557 definition
.setKill(false);
1563 if (definition
.isKill())
1566 assert(definition
.physReg() == exec
);
1567 assert(!register_file
.test(definition
.physReg(), definition
.bytes()));
1568 register_file
.fill(definition
);
1569 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1572 /* look up the affinities */
1573 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1574 aco_ptr
<Instruction
>& phi
= *it
;
1577 Definition
& definition
= phi
->definitions
[0];
1578 if (definition
.isKill() || definition
.isFixed())
1581 if (ctx
.affinities
.find(definition
.tempId()) != ctx
.affinities
.end() &&
1582 ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].assigned
) {
1583 assert(ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].rc
== definition
.regClass());
1584 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].reg
;
1585 bool try_use_special_reg
= reg
== scc
|| reg
== exec
;
1586 if (try_use_special_reg
) {
1587 for (const Operand
& op
: phi
->operands
) {
1588 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
&&
1589 ctx
.assignments
[op
.tempId()].reg
== reg
)) {
1590 try_use_special_reg
= false;
1594 if (!try_use_special_reg
)
1597 /* only assign if register is still free */
1598 if (!register_file
.test(reg
, definition
.bytes())) {
1599 definition
.setFixed(reg
);
1600 register_file
.fill(definition
);
1601 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1606 /* find registers for phis without affinity or where the register was blocked */
1607 for (it
= block
.instructions
.begin();it
!= block
.instructions
.end(); ++it
) {
1608 aco_ptr
<Instruction
>& phi
= *it
;
1612 Definition
& definition
= phi
->definitions
[0];
1613 if (definition
.isKill())
1616 if (!definition
.isFixed()) {
1617 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1618 /* try to find a register that is used by at least one operand */
1619 for (const Operand
& op
: phi
->operands
) {
1620 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
))
1622 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
1623 /* we tried this already on the previous loop */
1624 if (reg
== scc
|| reg
== exec
)
1626 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
, reg
)) {
1627 definition
.setFixed(reg
);
1631 if (!definition
.isFixed())
1632 definition
.setFixed(get_reg(ctx
, register_file
, definition
.getTemp(), parallelcopy
, phi
));
1634 /* process parallelcopy */
1635 for (std::pair
<Operand
, Definition
> pc
: parallelcopy
) {
1636 /* see if it's a copy from a different phi */
1637 //TODO: prefer moving some previous phis over live-ins
1638 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1639 Instruction
*prev_phi
= NULL
;
1640 std::vector
<aco_ptr
<Instruction
>>::iterator phi_it
;
1641 for (phi_it
= instructions
.begin(); phi_it
!= instructions
.end(); ++phi_it
) {
1642 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1643 prev_phi
= phi_it
->get();
1646 while (!prev_phi
&& is_phi(*++phi_it
)) {
1647 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1648 prev_phi
= phi_it
->get();
1651 /* if so, just update that phi's register */
1652 register_file
.clear(prev_phi
->definitions
[0]);
1653 prev_phi
->definitions
[0].setFixed(pc
.second
.physReg());
1654 ctx
.assignments
[prev_phi
->definitions
[0].tempId()] = {pc
.second
.physReg(), pc
.second
.regClass()};
1655 register_file
.fill(prev_phi
->definitions
[0]);
1660 std::unordered_map
<unsigned, Temp
>::iterator orig_it
= ctx
.orig_names
.find(pc
.first
.tempId());
1661 Temp orig
= pc
.first
.getTemp();
1662 if (orig_it
!= ctx
.orig_names
.end())
1663 orig
= orig_it
->second
;
1665 ctx
.orig_names
[pc
.second
.tempId()] = orig
;
1666 ctx
.renames
[block
.index
][orig
.id()] = pc
.second
.getTemp();
1668 /* otherwise, this is a live-in and we need to create a new phi
1669 * to move it in this block's predecessors */
1670 aco_opcode opcode
= pc
.first
.getTemp().is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1671 std::vector
<unsigned>& preds
= pc
.first
.getTemp().is_linear() ? block
.linear_preds
: block
.logical_preds
;
1672 aco_ptr
<Instruction
> new_phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1673 new_phi
->definitions
[0] = pc
.second
;
1674 for (unsigned i
= 0; i
< preds
.size(); i
++)
1675 new_phi
->operands
[i
] = Operand(pc
.first
);
1676 instructions
.emplace_back(std::move(new_phi
));
1679 register_file
.fill(definition
);
1680 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1682 live
.emplace(definition
.getTemp());
1684 /* update phi affinities */
1685 for (const Operand
& op
: phi
->operands
) {
1686 if (op
.isTemp() && op
.regClass() == phi
->definitions
[0].regClass())
1687 ctx
.affinities
[op
.tempId()] = definition
.tempId();
1690 instructions
.emplace_back(std::move(*it
));
1693 /* fill in sgpr_live_in */
1694 for (unsigned i
= 0; i
<= ctx
.max_used_sgpr
; i
++)
1695 sgpr_live_in
[block
.index
][i
] = register_file
[i
];
1696 sgpr_live_in
[block
.index
][127] = register_file
[scc
.reg()];
1698 /* Handle all other instructions of the block */
1699 for (; it
!= block
.instructions
.end(); ++it
) {
1700 aco_ptr
<Instruction
>& instr
= *it
;
1702 /* parallelcopies from p_phi are inserted here which means
1703 * live ranges of killed operands end here as well */
1704 if (instr
->opcode
== aco_opcode::p_logical_end
) {
1705 /* no need to process this instruction any further */
1706 if (block
.logical_succs
.size() != 1) {
1707 instructions
.emplace_back(std::move(instr
));
1711 Block
& succ
= program
->blocks
[block
.logical_succs
[0]];
1713 for (; idx
< succ
.logical_preds
.size(); idx
++) {
1714 if (succ
.logical_preds
[idx
] == block
.index
)
1717 for (aco_ptr
<Instruction
>& phi
: succ
.instructions
) {
1718 if (phi
->opcode
== aco_opcode::p_phi
) {
1719 if (phi
->operands
[idx
].isTemp() &&
1720 phi
->operands
[idx
].getTemp().type() == RegType::sgpr
&&
1721 phi
->operands
[idx
].isFirstKillBeforeDef()) {
1722 Temp phi_op
= read_variable(ctx
, phi
->operands
[idx
].getTemp(), block
.index
);
1723 PhysReg reg
= ctx
.assignments
[phi_op
.id()].reg
;
1724 assert(register_file
[reg
] == phi_op
.id());
1725 register_file
[reg
] = 0;
1727 } else if (phi
->opcode
!= aco_opcode::p_linear_phi
) {
1731 instructions
.emplace_back(std::move(instr
));
1735 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1737 assert(!is_phi(instr
));
1739 /* handle operands */
1740 for (unsigned i
= 0; i
< instr
->operands
.size(); ++i
) {
1741 auto& operand
= instr
->operands
[i
];
1742 if (!operand
.isTemp())
1745 /* rename operands */
1746 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), block
.index
));
1747 assert(ctx
.assignments
[operand
.tempId()].assigned
);
1749 PhysReg reg
= ctx
.assignments
[operand
.tempId()].reg
;
1750 if (operand_can_use_reg(ctx
, instr
, i
, reg
))
1751 operand
.setFixed(reg
);
1753 get_reg_for_operand(ctx
, register_file
, parallelcopy
, instr
, operand
);
1755 if (instr
->format
== Format::EXP
||
1756 (instr
->isVMEM() && i
== 3 && ctx
.program
->chip_class
== GFX6
) ||
1757 (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
)) {
1758 for (unsigned j
= 0; j
< operand
.size(); j
++)
1759 ctx
.war_hint
.set(operand
.physReg().reg() + j
);
1762 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
1763 if (phi
!= ctx
.phi_map
.end())
1764 phi
->second
.uses
.emplace(instr
.get());
1767 /* remove dead vars from register file */
1768 for (const Operand
& op
: instr
->operands
) {
1769 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1770 register_file
.clear(op
);
1773 /* try to optimize v_mad_f32 -> v_mac_f32 */
1774 if (instr
->opcode
== aco_opcode::v_mad_f32
&&
1775 instr
->operands
[2].isTemp() &&
1776 instr
->operands
[2].isKillBeforeDef() &&
1777 instr
->operands
[2].getTemp().type() == RegType::vgpr
&&
1778 instr
->operands
[1].isTemp() &&
1779 instr
->operands
[1].getTemp().type() == RegType::vgpr
&&
1780 !instr
->usesModifiers()) {
1781 unsigned def_id
= instr
->definitions
[0].tempId();
1782 auto it
= ctx
.affinities
.find(def_id
);
1783 if (it
== ctx
.affinities
.end() || !ctx
.assignments
[it
->second
].assigned
||
1784 instr
->operands
[2].physReg() == ctx
.assignments
[it
->second
].reg
||
1785 register_file
.test(ctx
.assignments
[it
->second
].reg
, instr
->operands
[2].bytes())) {
1786 instr
->format
= Format::VOP2
;
1787 instr
->opcode
= aco_opcode::v_mac_f32
;
1791 /* handle definitions which must have the same register as an operand */
1792 if (instr
->opcode
== aco_opcode::v_interp_p2_f32
||
1793 instr
->opcode
== aco_opcode::v_mac_f32
||
1794 instr
->opcode
== aco_opcode::v_writelane_b32
||
1795 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) {
1796 instr
->definitions
[0].setFixed(instr
->operands
[2].physReg());
1797 } else if (instr
->opcode
== aco_opcode::s_addk_i32
||
1798 instr
->opcode
== aco_opcode::s_mulk_i32
) {
1799 instr
->definitions
[0].setFixed(instr
->operands
[0].physReg());
1800 } else if (instr
->format
== Format::MUBUF
&&
1801 instr
->definitions
.size() == 1 &&
1802 instr
->operands
.size() == 4) {
1803 instr
->definitions
[0].setFixed(instr
->operands
[3].physReg());
1804 } else if (instr
->format
== Format::MIMG
&&
1805 instr
->definitions
.size() == 1 &&
1806 instr
->operands
[1].regClass().type() == RegType::vgpr
) {
1807 instr
->definitions
[0].setFixed(instr
->operands
[1].physReg());
1810 ctx
.defs_done
.reset();
1812 /* handle fixed definitions first */
1813 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1814 auto& definition
= instr
->definitions
[i
];
1815 if (!definition
.isFixed())
1818 adjust_max_used_regs(ctx
, definition
.regClass(), definition
.physReg());
1819 /* check if the target register is blocked */
1820 if (register_file
[definition
.physReg().reg()] != 0) {
1821 /* create parallelcopy pair to move blocking var */
1822 Temp tmp
= {register_file
[definition
.physReg()], ctx
.assignments
[register_file
[definition
.physReg()]].rc
};
1823 Operand pc_op
= Operand(tmp
);
1824 pc_op
.setFixed(ctx
.assignments
[register_file
[definition
.physReg().reg()]].reg
);
1825 RegClass rc
= pc_op
.regClass();
1826 tmp
= Temp
{program
->allocateId(), rc
};
1827 Definition pc_def
= Definition(tmp
);
1829 /* re-enable the killed operands, so that we don't move the blocking var there */
1830 for (const Operand
& op
: instr
->operands
) {
1831 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1832 register_file
.fill(op
);
1835 /* find a new register for the blocking variable */
1836 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.getTemp(), parallelcopy
, instr
);
1837 /* once again, disable killed operands */
1838 for (const Operand
& op
: instr
->operands
) {
1839 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1840 register_file
.clear(op
);
1842 for (unsigned k
= 0; k
< i
; k
++) {
1843 if (instr
->definitions
[k
].isTemp() && ctx
.defs_done
.test(k
) && !instr
->definitions
[k
].isKill())
1844 register_file
.fill(instr
->definitions
[k
]);
1846 pc_def
.setFixed(reg
);
1848 /* finish assignment of parallelcopy */
1849 ctx
.assignments
.emplace_back(reg
, pc_def
.regClass());
1850 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1851 parallelcopy
.emplace_back(pc_op
, pc_def
);
1853 /* add changes to reg_file */
1854 register_file
.clear(pc_op
);
1855 register_file
.fill(pc_def
);
1857 ctx
.defs_done
.set(i
);
1859 if (!definition
.isTemp())
1862 /* set live if it has a kill point */
1863 if (!definition
.isKill())
1864 live
.emplace(definition
.getTemp());
1866 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1867 register_file
.fill(definition
);
1870 /* handle all other definitions */
1871 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1872 auto& definition
= instr
->definitions
[i
];
1874 if (definition
.isFixed() || !definition
.isTemp())
1878 if (definition
.hasHint() && register_file
[definition
.physReg().reg()] == 0)
1879 definition
.setFixed(definition
.physReg());
1880 else if (instr
->opcode
== aco_opcode::p_split_vector
) {
1881 PhysReg reg
= instr
->operands
[0].physReg();
1882 for (unsigned j
= 0; j
< i
; j
++)
1883 reg
.reg_b
+= instr
->definitions
[j
].bytes();
1884 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1885 definition
.setFixed(reg
);
1886 } else if (instr
->opcode
== aco_opcode::p_wqm
|| instr
->opcode
== aco_opcode::p_parallelcopy
) {
1887 PhysReg reg
= instr
->operands
[i
].physReg();
1888 if (instr
->operands
[i
].isTemp() &&
1889 instr
->operands
[i
].getTemp().type() == definition
.getTemp().type() &&
1890 !register_file
.test(reg
, definition
.bytes()))
1891 definition
.setFixed(reg
);
1892 } else if (instr
->opcode
== aco_opcode::p_extract_vector
) {
1894 if (instr
->operands
[0].isKillBeforeDef() &&
1895 instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1896 reg
= instr
->operands
[0].physReg();
1897 reg
.reg_b
+= definition
.bytes() * instr
->operands
[1].constantValue();
1898 assert(!register_file
.test(reg
, definition
.bytes()));
1899 definition
.setFixed(reg
);
1901 } else if (instr
->opcode
== aco_opcode::p_create_vector
) {
1902 PhysReg reg
= get_reg_create_vector(ctx
, register_file
, definition
.getTemp(),
1903 parallelcopy
, instr
);
1904 definition
.setFixed(reg
);
1907 if (!definition
.isFixed()) {
1908 Temp tmp
= definition
.getTemp();
1909 if (tmp
.regClass().is_subdword() &&
1910 !instr_can_access_subdword(ctx
, instr
)) {
1911 assert(tmp
.bytes() <= 4);
1912 tmp
= Temp(definition
.tempId(), v1
);
1914 definition
.setFixed(get_reg(ctx
, register_file
, tmp
, parallelcopy
, instr
));
1917 assert(definition
.isFixed() && ((definition
.getTemp().type() == RegType::vgpr
&& definition
.physReg() >= 256) ||
1918 (definition
.getTemp().type() != RegType::vgpr
&& definition
.physReg() < 256)));
1919 ctx
.defs_done
.set(i
);
1921 /* set live if it has a kill point */
1922 if (!definition
.isKill())
1923 live
.emplace(definition
.getTemp());
1925 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1926 register_file
.fill(definition
);
1929 handle_pseudo(ctx
, register_file
, instr
.get());
1931 /* kill definitions and late-kill operands */
1932 for (const Definition
& def
: instr
->definitions
) {
1933 if (def
.isTemp() && def
.isKill())
1934 register_file
.clear(def
);
1936 for (const Operand
& op
: instr
->operands
) {
1937 if (op
.isTemp() && op
.isFirstKill() && op
.isLateKill())
1938 register_file
.clear(op
);
1941 /* emit parallelcopy */
1942 if (!parallelcopy
.empty()) {
1943 aco_ptr
<Pseudo_instruction
> pc
;
1944 pc
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, parallelcopy
.size(), parallelcopy
.size()));
1945 bool temp_in_scc
= register_file
[scc
.reg()];
1946 bool sgpr_operands_alias_defs
= false;
1947 uint64_t sgpr_operands
[4] = {0, 0, 0, 0};
1948 for (unsigned i
= 0; i
< parallelcopy
.size(); i
++) {
1949 if (temp_in_scc
&& parallelcopy
[i
].first
.isTemp() && parallelcopy
[i
].first
.getTemp().type() == RegType::sgpr
) {
1950 if (!sgpr_operands_alias_defs
) {
1951 unsigned reg
= parallelcopy
[i
].first
.physReg().reg();
1952 unsigned size
= parallelcopy
[i
].first
.getTemp().size();
1953 sgpr_operands
[reg
/ 64u] |= ((1u << size
) - 1) << (reg
% 64u);
1955 reg
= parallelcopy
[i
].second
.physReg().reg();
1956 size
= parallelcopy
[i
].second
.getTemp().size();
1957 if (sgpr_operands
[reg
/ 64u] & ((1u << size
) - 1) << (reg
% 64u))
1958 sgpr_operands_alias_defs
= true;
1962 pc
->operands
[i
] = parallelcopy
[i
].first
;
1963 pc
->definitions
[i
] = parallelcopy
[i
].second
;
1964 assert(pc
->operands
[i
].size() == pc
->definitions
[i
].size());
1966 /* it might happen that the operand is already renamed. we have to restore the original name. */
1967 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.orig_names
.find(pc
->operands
[i
].tempId());
1968 Temp orig
= it
!= ctx
.orig_names
.end() ? it
->second
: pc
->operands
[i
].getTemp();
1969 ctx
.orig_names
[pc
->definitions
[i
].tempId()] = orig
;
1970 ctx
.renames
[block
.index
][orig
.id()] = pc
->definitions
[i
].getTemp();
1972 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(pc
->operands
[i
].tempId());
1973 if (phi
!= ctx
.phi_map
.end())
1974 phi
->second
.uses
.emplace(pc
.get());
1977 if (temp_in_scc
&& sgpr_operands_alias_defs
) {
1978 /* disable definitions and re-enable operands */
1979 for (const Definition
& def
: instr
->definitions
) {
1980 if (def
.isTemp() && !def
.isKill())
1981 register_file
.clear(def
);
1983 for (const Operand
& op
: instr
->operands
) {
1984 if (op
.isTemp() && op
.isFirstKill())
1985 register_file
.block(op
.physReg(), op
.regClass());
1988 handle_pseudo(ctx
, register_file
, pc
.get());
1990 /* re-enable live vars */
1991 for (const Operand
& op
: instr
->operands
) {
1992 if (op
.isTemp() && op
.isFirstKill())
1993 register_file
.clear(op
);
1995 for (const Definition
& def
: instr
->definitions
) {
1996 if (def
.isTemp() && !def
.isKill())
1997 register_file
.fill(def
);
2000 pc
->tmp_in_scc
= false;
2003 instructions
.emplace_back(std::move(pc
));
2006 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
2007 bool instr_needs_vop3
= !instr
->isVOP3() &&
2008 ((instr
->format
== Format::VOPC
&& !(instr
->definitions
[0].physReg() == vcc
)) ||
2009 (instr
->opcode
== aco_opcode::v_cndmask_b32
&& !(instr
->operands
[2].physReg() == vcc
)) ||
2010 ((instr
->opcode
== aco_opcode::v_add_co_u32
||
2011 instr
->opcode
== aco_opcode::v_addc_co_u32
||
2012 instr
->opcode
== aco_opcode::v_sub_co_u32
||
2013 instr
->opcode
== aco_opcode::v_subb_co_u32
||
2014 instr
->opcode
== aco_opcode::v_subrev_co_u32
||
2015 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
2016 !(instr
->definitions
[1].physReg() == vcc
)) ||
2017 ((instr
->opcode
== aco_opcode::v_addc_co_u32
||
2018 instr
->opcode
== aco_opcode::v_subb_co_u32
||
2019 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
2020 !(instr
->operands
[2].physReg() == vcc
)));
2021 if (instr_needs_vop3
) {
2023 /* if the first operand is a literal, we have to move it to a reg */
2024 if (instr
->operands
.size() && instr
->operands
[0].isLiteral() && program
->chip_class
< GFX10
) {
2025 bool can_sgpr
= true;
2026 /* check, if we have to move to vgpr */
2027 for (const Operand
& op
: instr
->operands
) {
2028 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
2033 /* disable definitions and re-enable operands */
2034 for (const Definition
& def
: instr
->definitions
)
2035 register_file
.clear(def
);
2036 for (const Operand
& op
: instr
->operands
) {
2037 if (op
.isTemp() && op
.isFirstKill())
2038 register_file
.block(op
.physReg(), op
.regClass());
2040 Temp tmp
= {program
->allocateId(), can_sgpr
? s1
: v1
};
2041 ctx
.assignments
.emplace_back();
2042 PhysReg reg
= get_reg(ctx
, register_file
, tmp
, parallelcopy
, instr
);
2044 aco_ptr
<Instruction
> mov
;
2046 mov
.reset(create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1));
2048 mov
.reset(create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1));
2049 mov
->operands
[0] = instr
->operands
[0];
2050 mov
->definitions
[0] = Definition(tmp
);
2051 mov
->definitions
[0].setFixed(reg
);
2053 instr
->operands
[0] = Operand(tmp
);
2054 instr
->operands
[0].setFixed(reg
);
2055 instructions
.emplace_back(std::move(mov
));
2056 /* re-enable live vars */
2057 for (const Operand
& op
: instr
->operands
) {
2058 if (op
.isTemp() && op
.isFirstKill())
2059 register_file
.clear(op
);
2061 for (const Definition
& def
: instr
->definitions
) {
2062 if (def
.isTemp() && !def
.isKill())
2063 register_file
.fill(def
);
2067 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
2068 aco_ptr
<Instruction
> tmp
= std::move(instr
);
2069 Format format
= asVOP3(tmp
->format
);
2070 instr
.reset(create_instruction
<VOP3A_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
2071 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2072 Operand
& operand
= tmp
->operands
[i
];
2073 instr
->operands
[i
] = operand
;
2074 /* keep phi_map up to date */
2075 if (operand
.isTemp()) {
2076 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.tempId());
2077 if (phi
!= ctx
.phi_map
.end()) {
2078 phi
->second
.uses
.erase(tmp
.get());
2079 phi
->second
.uses
.emplace(instr
.get());
2083 std::copy(tmp
->definitions
.begin(), tmp
->definitions
.end(), instr
->definitions
.begin());
2085 instructions
.emplace_back(std::move(*it
));
2087 } /* end for Instr */
2089 block
.instructions
= std::move(instructions
);
2091 ctx
.filled
[block
.index
] = true;
2092 for (unsigned succ_idx
: block
.linear_succs
) {
2093 Block
& succ
= program
->blocks
[succ_idx
];
2094 /* seal block if all predecessors are filled */
2095 bool all_filled
= true;
2096 for (unsigned pred_idx
: succ
.linear_preds
) {
2097 if (!ctx
.filled
[pred_idx
]) {
2103 ctx
.sealed
[succ_idx
] = true;
2105 /* finish incomplete phis and check if they became trivial */
2106 for (Instruction
* phi
: ctx
.incomplete_phis
[succ_idx
]) {
2107 std::vector
<unsigned> preds
= phi
->definitions
[0].getTemp().is_linear() ? succ
.linear_preds
: succ
.logical_preds
;
2108 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
2109 phi
->operands
[i
].setTemp(read_variable(ctx
, phi
->operands
[i
].getTemp(), preds
[i
]));
2110 phi
->operands
[i
].setFixed(ctx
.assignments
[phi
->operands
[i
].tempId()].reg
);
2112 try_remove_trivial_phi(ctx
, phi
->definitions
[0].getTemp());
2114 /* complete the original phi nodes, but no need to check triviality */
2115 for (aco_ptr
<Instruction
>& instr
: succ
.instructions
) {
2118 std::vector
<unsigned> preds
= instr
->opcode
== aco_opcode::p_phi
? succ
.logical_preds
: succ
.linear_preds
;
2120 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2121 auto& operand
= instr
->operands
[i
];
2122 if (!operand
.isTemp())
2124 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), preds
[i
]));
2125 operand
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
2126 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
2127 if (phi
!= ctx
.phi_map
.end())
2128 phi
->second
.uses
.emplace(instr
.get());
2135 /* remove trivial phis */
2136 for (Block
& block
: program
->blocks
) {
2137 auto end
= std::find_if(block
.instructions
.begin(), block
.instructions
.end(),
2138 [](aco_ptr
<Instruction
>& instr
) { return !is_phi(instr
);});
2139 auto middle
= std::remove_if(block
.instructions
.begin(), end
,
2140 [](const aco_ptr
<Instruction
>& instr
) { return instr
->definitions
.empty();});
2141 block
.instructions
.erase(middle
, end
);
2144 /* find scc spill registers which may be needed for parallelcopies created by phis */
2145 for (Block
& block
: program
->blocks
) {
2146 if (block
.linear_preds
.size() <= 1)
2149 std::bitset
<128> regs
= sgpr_live_in
[block
.index
];
2153 /* choose a register */
2155 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& regs
[reg
]; reg
++)
2157 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
2158 adjust_max_used_regs(ctx
, s1
, reg
);
2160 /* update predecessors */
2161 for (unsigned& pred_index
: block
.linear_preds
) {
2162 Block
& pred
= program
->blocks
[pred_index
];
2163 pred
.scc_live_out
= true;
2164 pred
.scratch_sgpr
= PhysReg
{(uint16_t)reg
};
2168 /* num_gpr = rnd_up(max_used_gpr + 1) */
2169 program
->config
->num_vgprs
= align(ctx
.max_used_vgpr
+ 1, 4);
2170 if (program
->family
== CHIP_TONGA
|| program
->family
== CHIP_ICELAND
) /* workaround hardware bug */
2171 program
->config
->num_sgprs
= get_sgpr_alloc(program
, program
->sgpr_limit
);
2173 program
->config
->num_sgprs
= align(ctx
.max_used_sgpr
+ 1 + get_extra_sgprs(program
), 8);