2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
32 #include <unordered_map>
36 #include "util/u_math.h"
/* Constructors of the per-temporary assignment record (fields declared above,
 * outside this view). Default construction leaves the entry unassigned; the
 * two-argument form records the physical register and register class and
 * marks the entry assigned (`assigned(-1)` sets the flag member to all-ones,
 * i.e. "true" — presumably the member is a small unsigned integer; confirm
 * against the struct declaration). */
assignment() = default;
assignment(PhysReg reg, RegClass rc) : reg(reg), rc(rc), assigned(-1) {}
/* Instructions referenced during allocation — semantics depend on code outside
 * this view; TODO confirm against the full struct. */
std::set<Instruction*> uses;
/* One bit per physical register (0-511): avoid placing a definition here to
 * prevent a write-after-read hazard. */
std::bitset<512> war_hint;
/* Indexed by temp id: current physical register + class of every temporary. */
std::vector<assignment> assignments;
/* Per-block map temp-id -> renamed Temp (SSA repair across blocks). */
std::vector<std::unordered_map<unsigned, Temp>> renames;
/* Per-block list of phis whose operands are not yet known (block not sealed). */
std::vector<std::vector<Instruction*>> incomplete_phis;
/* Per-block flags used by the SSA-repair algorithm. */
std::vector<bool> filled;
std::vector<bool> sealed;
/* Maps a renamed temp id back to its original Temp, for debug printing. */
std::unordered_map<unsigned, Temp> orig_names;
std::unordered_map<unsigned, phi_info> phi_map;
/* temp id -> temp id: pairs that should preferably share a register. */
std::unordered_map<unsigned, unsigned> affinities;
/* High-water marks of registers actually used, for final program info. */
unsigned max_used_sgpr = 0;
unsigned max_used_vgpr = 0;
std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */

/* Sizes all per-block/per-temp tables up front from the program being
 * allocated (assumes a `Program* program` member declared outside this view —
 * TODO confirm). */
ra_ctx(Program* program) : program(program),
                           assignments(program->peekAllocationId()),
                           renames(program->blocks.size()),
                           incomplete_phis(program->blocks.size()),
                           filled(program->blocks.size()),
                           sealed(program->blocks.size()) {}
/* Interior of RegisterFile (class header outside this view): tracks, for each
 * physical dword register, the id of the temp occupying it (0 = free). */
RegisterFile() {regs.fill(0);}

/* regs[i] holds the temp id occupying dword register i; special values used
 * elsewhere in this file: 0xFFFFFFFF = blocked, 0xF0000000 = register holds
 * subdword temps (details live in subdword_regs). */
std::array<uint32_t, 512> regs;
/* For registers marked 0xF0000000: per-byte temp ids (0 = byte free). */
std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
85 const uint32_t& operator [] (unsigned index
) const {
89 uint32_t& operator [] (unsigned index
) {
93 unsigned count_zero(PhysReg start
, unsigned size
) {
95 for (unsigned i
= 0; i
< size
; i
++)
96 res
+= !regs
[start
+ i
];
100 bool test(PhysReg start
, unsigned num_bytes
) {
101 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
102 if (regs
[i
] & 0x0FFFFFFF)
104 if (regs
[i
] == 0xF0000000) {
105 assert(subdword_regs
.find(i
) != subdword_regs
.end());
106 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++) {
107 if (subdword_regs
[i
][j
])
115 void fill(PhysReg start
, unsigned size
, uint32_t val
) {
116 for (unsigned i
= 0; i
< size
; i
++)
117 regs
[start
+ i
] = val
;
120 void fill_subdword(PhysReg start
, unsigned num_bytes
, uint32_t val
) {
121 fill(start
, DIV_ROUND_UP(num_bytes
, 4), 0xF0000000);
122 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
124 std::array
<uint32_t, 4>& sub
= subdword_regs
.emplace(i
, std::array
<uint32_t, 4>{0, 0, 0, 0}).first
->second
;
125 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++)
128 if (sub
== std::array
<uint32_t, 4>{0, 0, 0, 0}) {
129 subdword_regs
.erase(i
);
135 void block(PhysReg start
, unsigned num_bytes
) {
136 if (start
.byte() || num_bytes
% 4)
137 fill_subdword(start
, num_bytes
, 0xFFFFFFFF);
139 fill(start
, num_bytes
/ 4, 0xFFFFFFFF);
142 bool is_blocked(PhysReg start
) {
143 if (regs
[start
] == 0xFFFFFFFF)
145 if (regs
[start
] == 0xF0000000) {
146 for (unsigned i
= start
.byte(); i
< 4; i
++)
147 if (subdword_regs
[start
][i
] == 0xFFFFFFFF)
153 void clear(PhysReg start
, RegClass rc
) {
154 if (rc
.is_subdword())
155 fill_subdword(start
, rc
.bytes(), 0);
157 fill(start
, rc
.size(), 0);
160 void fill(Operand op
) {
161 if (op
.regClass().is_subdword())
162 fill_subdword(op
.physReg(), op
.bytes(), op
.tempId());
164 fill(op
.physReg(), op
.size(), op
.tempId());
167 void clear(Operand op
) {
168 clear(op
.physReg(), op
.regClass());
171 void fill(Definition def
) {
172 if (def
.regClass().is_subdword())
173 fill_subdword(def
.physReg(), def
.bytes(), def
.tempId());
175 fill(def
.physReg(), def
.size(), def
.tempId());
178 void clear(Definition def
) {
179 clear(def
.physReg(), def
.regClass());
184 /* helper function for debugging */
186 void print_regs(ra_ctx
& ctx
, bool vgprs
, RegisterFile
& reg_file
)
188 unsigned max
= vgprs
? ctx
.program
->max_reg_demand
.vgpr
: ctx
.program
->max_reg_demand
.sgpr
;
189 unsigned lb
= vgprs
? 256 : 0;
190 unsigned ub
= lb
+ max
;
191 char reg_char
= vgprs
? 'v' : 's';
195 for (unsigned i
= lb
; i
< ub
; i
+= 3) {
196 printf("%.2u ", i
- lb
);
201 printf("%cgprs: ", reg_char
);
202 unsigned free_regs
= 0;
204 bool char_select
= false;
205 for (unsigned i
= lb
; i
< ub
; i
++) {
206 if (reg_file
[i
] == 0xFFFF) {
208 } else if (reg_file
[i
]) {
209 if (reg_file
[i
] != prev
) {
211 char_select
= !char_select
;
213 printf(char_select
? "#" : "@");
221 printf("%u/%u used, %u/%u free\n", max
- free_regs
, max
, free_regs
, max
);
223 /* print assignments */
226 for (unsigned i
= lb
; i
< ub
; i
++) {
227 if (reg_file
[i
] != prev
) {
228 if (prev
&& size
> 1)
229 printf("-%d]\n", i
- 1 - lb
);
233 if (prev
&& prev
!= 0xFFFF) {
234 if (ctx
.orig_names
.count(reg_file
[i
]) && ctx
.orig_names
[reg_file
[i
]].id() != reg_file
[i
])
235 printf("%%%u (was %%%d) = %c[%d", reg_file
[i
], ctx
.orig_names
[reg_file
[i
]].id(), reg_char
, i
- lb
);
237 printf("%%%u = %c[%d", reg_file
[i
], reg_char
, i
- lb
);
244 if (prev
&& size
> 1)
245 printf("-%d]\n", ub
- lb
- 1);
252 void adjust_max_used_regs(ra_ctx
& ctx
, RegClass rc
, unsigned reg
)
254 unsigned max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
255 unsigned size
= rc
.size();
256 if (rc
.type() == RegType::vgpr
) {
258 unsigned hi
= reg
- 256 + size
- 1;
259 ctx
.max_used_vgpr
= std::max(ctx
.max_used_vgpr
, hi
);
260 } else if (reg
+ rc
.size() <= max_addressible_sgpr
) {
261 unsigned hi
= reg
+ size
- 1;
262 ctx
.max_used_sgpr
= std::max(ctx
.max_used_sgpr
, std::min(hi
, max_addressible_sgpr
));
267 void update_renames(ra_ctx
& ctx
, RegisterFile
& reg_file
,
268 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
269 aco_ptr
<Instruction
>& instr
)
271 /* allocate id's and rename operands: this is done transparently here */
272 for (std::pair
<Operand
, Definition
>& copy
: parallelcopies
) {
273 /* the definitions with id are not from this function and already handled */
274 if (copy
.second
.isTemp())
277 /* check if we we moved another parallelcopy definition */
278 for (std::pair
<Operand
, Definition
>& other
: parallelcopies
) {
279 if (!other
.second
.isTemp())
281 if (copy
.first
.getTemp() == other
.second
.getTemp()) {
282 copy
.first
.setTemp(other
.first
.getTemp());
283 copy
.first
.setFixed(other
.first
.physReg());
286 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
287 copy
.second
.setTemp(Temp(ctx
.program
->allocateId(), copy
.second
.regClass()));
288 ctx
.assignments
.emplace_back(copy
.second
.physReg(), copy
.second
.regClass());
289 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
290 reg_file
.fill(copy
.second
);
292 /* check if we moved an operand */
293 for (Operand
& op
: instr
->operands
) {
296 if (op
.tempId() == copy
.first
.tempId()) {
297 bool omit_renaming
= instr
->opcode
== aco_opcode::p_create_vector
&& !op
.isKillBeforeDef();
298 for (std::pair
<Operand
, Definition
>& pc
: parallelcopies
) {
299 PhysReg def_reg
= pc
.second
.physReg();
300 omit_renaming
&= def_reg
> copy
.first
.physReg() ?
301 (copy
.first
.physReg() + copy
.first
.size() <= def_reg
.reg()) :
302 (def_reg
+ pc
.second
.size() <= copy
.first
.physReg().reg());
306 op
.setTemp(copy
.second
.getTemp());
307 op
.setFixed(copy
.second
.physReg());
313 bool instr_can_access_subdword(aco_ptr
<Instruction
>& instr
)
315 return instr
->isSDWA() || instr
->format
== Format::PSEUDO
;
318 std::pair
<PhysReg
, bool> get_reg_simple(ra_ctx
& ctx
,
319 RegisterFile
& reg_file
,
320 uint32_t lb
, uint32_t ub
,
321 uint32_t size
, uint32_t stride
,
324 if (rc
.is_subdword()) {
325 for (std::pair
<uint32_t, std::array
<uint32_t, 4>> entry
: reg_file
.subdword_regs
) {
326 assert(reg_file
[entry
.first
] == 0xF0000000);
327 if (lb
> entry
.first
|| entry
.first
>= ub
)
330 for (unsigned i
= 0; i
< 4; i
+= stride
) {
331 if (entry
.second
[i
] != 0)
334 bool reg_found
= true;
335 for (unsigned j
= 1; reg_found
&& i
+ j
< 4 && j
< rc
.bytes(); j
++)
336 reg_found
&= entry
.second
[i
+ j
] == 0;
338 /* check neighboring reg if needed */
339 reg_found
&= (i
<= 4 - rc
.bytes() || reg_file
[entry
.first
+ 1] == 0);
341 PhysReg res
{entry
.first
};
348 stride
= 1; /* stride in full registers */
351 /* best fit algorithm: find the smallest gap to fit in the variable */
353 unsigned best_pos
= 0xFFFF;
354 unsigned gap_size
= 0xFFFF;
355 unsigned next_pos
= 0xFFFF;
357 for (unsigned current_reg
= lb
; current_reg
< ub
; current_reg
++) {
358 if (reg_file
[current_reg
] != 0 || ctx
.war_hint
[current_reg
]) {
359 if (next_pos
== 0xFFFF)
362 /* check if the variable fits */
363 if (next_pos
+ size
> current_reg
) {
368 /* check if the tested gap is smaller */
369 if (current_reg
- next_pos
< gap_size
) {
371 gap_size
= current_reg
- next_pos
;
377 if (next_pos
== 0xFFFF)
378 next_pos
= current_reg
;
382 if (next_pos
!= 0xFFFF &&
383 next_pos
+ size
<= ub
&&
384 ub
- next_pos
< gap_size
) {
386 gap_size
= ub
- next_pos
;
388 if (best_pos
!= 0xFFFF) {
389 adjust_max_used_regs(ctx
, rc
, best_pos
);
390 return {PhysReg
{best_pos
}, true};
396 unsigned reg_lo
= lb
;
397 unsigned reg_hi
= lb
+ size
- 1;
398 while (!found
&& reg_lo
+ size
<= ub
) {
399 if (reg_file
[reg_lo
] != 0) {
403 reg_hi
= reg_lo
+ size
- 1;
405 for (unsigned reg
= reg_lo
+ 1; found
&& reg
<= reg_hi
; reg
++) {
406 if (reg_file
[reg
] != 0 || ctx
.war_hint
[reg
])
410 adjust_max_used_regs(ctx
, rc
, reg_lo
);
411 return {PhysReg
{reg_lo
}, true};
420 /* collect variables from a register area and clear reg_file */
421 std::set
<std::pair
<unsigned, unsigned>> collect_vars(ra_ctx
& ctx
, RegisterFile
& reg_file
,
422 PhysReg reg
, unsigned size
)
424 std::set
<std::pair
<unsigned, unsigned>> vars
;
425 for (unsigned j
= reg
; j
< reg
+ size
; j
++) {
426 if (reg_file
.is_blocked(PhysReg
{j
}))
428 if (reg_file
[j
] == 0xF0000000) {
429 for (unsigned k
= 0; k
< 4; k
++) {
430 unsigned id
= reg_file
.subdword_regs
[j
][k
];
432 assignment
& var
= ctx
.assignments
[id
];
433 vars
.emplace(var
.rc
.bytes(), id
);
434 reg_file
.clear(var
.reg
, var
.rc
);
439 } else if (reg_file
[j
] != 0) {
440 unsigned id
= reg_file
[j
];
441 assignment
& var
= ctx
.assignments
[id
];
442 vars
.emplace(var
.rc
.bytes(), id
);
443 reg_file
.clear(var
.reg
, var
.rc
);
449 bool get_regs_for_copies(ra_ctx
& ctx
,
450 RegisterFile
& reg_file
,
451 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
452 const std::set
<std::pair
<unsigned, unsigned>> &vars
,
453 uint32_t lb
, uint32_t ub
,
454 aco_ptr
<Instruction
>& instr
,
459 /* variables are sorted from small sized to large */
460 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
461 for (std::set
<std::pair
<unsigned, unsigned>>::const_reverse_iterator it
= vars
.rbegin(); it
!= vars
.rend(); ++it
) {
462 unsigned id
= it
->second
;
463 assignment
& var
= ctx
.assignments
[id
];
464 uint32_t size
= var
.rc
.size();
466 if (var
.rc
.type() == RegType::sgpr
) {
473 /* check if this is a dead operand, then we can re-use the space from the definition */
474 bool is_dead_operand
= false;
475 for (unsigned i
= 0; !is_phi(instr
) && !is_dead_operand
&& (i
< instr
->operands
.size()); i
++) {
476 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isKillBeforeDef() && instr
->operands
[i
].tempId() == id
)
477 is_dead_operand
= true;
480 std::pair
<PhysReg
, bool> res
;
481 if (is_dead_operand
) {
482 if (instr
->opcode
== aco_opcode::p_create_vector
) {
483 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
484 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
485 for (unsigned j
= 0; j
< size
; j
++)
486 assert(reg_file
[def_reg_lo
+ offset
+ j
] == 0);
487 res
= {PhysReg
{def_reg_lo
+ offset
}, true};
492 res
= get_reg_simple(ctx
, reg_file
, def_reg_lo
, def_reg_hi
+ 1, size
, stride
, var
.rc
);
495 res
= get_reg_simple(ctx
, reg_file
, lb
, def_reg_lo
, size
, stride
, var
.rc
);
497 unsigned lb
= (def_reg_hi
+ stride
) & ~(stride
- 1);
498 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, var
.rc
);
503 /* mark the area as blocked */
504 reg_file
.block(res
.first
, var
.rc
.bytes());
506 /* create parallelcopy pair (without definition id) */
507 Temp tmp
= Temp(id
, var
.rc
);
508 Operand pc_op
= Operand(tmp
);
509 pc_op
.setFixed(var
.reg
);
510 Definition pc_def
= Definition(res
.first
, pc_op
.regClass());
511 parallelcopies
.emplace_back(pc_op
, pc_def
);
515 unsigned best_pos
= lb
;
516 unsigned num_moves
= 0xFF;
517 unsigned num_vars
= 0;
519 /* we use a sliding window to find potential positions */
520 unsigned reg_lo
= lb
;
521 unsigned reg_hi
= lb
+ size
- 1;
522 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
523 if (!is_dead_operand
&& ((reg_lo
>= def_reg_lo
&& reg_lo
<= def_reg_hi
) ||
524 (reg_hi
>= def_reg_lo
&& reg_hi
<= def_reg_hi
)))
527 /* second, check that we have at most k=num_moves elements in the window
528 * and no element is larger than the currently processed one */
531 unsigned last_var
= 0;
533 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
534 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
537 if (reg_file
.is_blocked(PhysReg
{j
}) || k
> num_moves
) {
541 if (reg_file
[j
] == 0xF0000000) {
546 /* we cannot split live ranges of linear vgprs */
547 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
551 bool is_kill
= false;
552 for (const Operand
& op
: instr
->operands
) {
553 if (op
.isTemp() && op
.isKillBeforeDef() && op
.tempId() == reg_file
[j
]) {
558 if (!is_kill
&& ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
563 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
564 last_var
= reg_file
[j
];
566 if (k
> num_moves
|| (k
== num_moves
&& n
<= num_vars
)) {
579 /* FIXME: we messed up and couldn't find space for the variables to be copied */
580 if (num_moves
== 0xFF)
584 reg_hi
= best_pos
+ size
- 1;
586 /* collect variables and block reg file */
587 std::set
<std::pair
<unsigned, unsigned>> new_vars
= collect_vars(ctx
, reg_file
, PhysReg
{reg_lo
}, size
);
589 /* mark the area as blocked */
590 reg_file
.block(PhysReg
{reg_lo
}, size
* 4);
592 if (!get_regs_for_copies(ctx
, reg_file
, parallelcopies
, new_vars
, lb
, ub
, instr
, def_reg_lo
, def_reg_hi
))
595 adjust_max_used_regs(ctx
, var
.rc
, reg_lo
);
597 /* create parallelcopy pair (without definition id) */
598 Temp tmp
= Temp(id
, var
.rc
);
599 Operand pc_op
= Operand(tmp
);
600 pc_op
.setFixed(var
.reg
);
601 Definition pc_def
= Definition(PhysReg
{reg_lo
}, pc_op
.regClass());
602 parallelcopies
.emplace_back(pc_op
, pc_def
);
609 std::pair
<PhysReg
, bool> get_reg_impl(ra_ctx
& ctx
,
610 RegisterFile
& reg_file
,
611 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
612 uint32_t lb
, uint32_t ub
,
613 uint32_t size
, uint32_t stride
,
615 aco_ptr
<Instruction
>& instr
)
617 /* check how many free regs we have */
618 unsigned regs_free
= reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
);
620 /* mark and count killed operands */
621 unsigned killed_ops
= 0;
622 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
623 if (instr
->operands
[j
].isTemp() &&
624 instr
->operands
[j
].isFirstKillBeforeDef() &&
625 instr
->operands
[j
].physReg() >= lb
&&
626 instr
->operands
[j
].physReg() < ub
) {
627 assert(instr
->operands
[j
].isFixed());
628 assert(!reg_file
.test(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes()));
629 reg_file
.block(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes());
630 killed_ops
+= instr
->operands
[j
].getTemp().size();
634 assert(regs_free
>= size
);
635 /* we might have to move dead operands to dst in order to make space */
636 unsigned op_moves
= 0;
638 if (size
> (regs_free
- killed_ops
))
639 op_moves
= size
- (regs_free
- killed_ops
);
641 /* find the best position to place the definition */
642 unsigned best_pos
= lb
;
643 unsigned num_moves
= 0xFF;
644 unsigned num_vars
= 0;
646 /* we use a sliding window to check potential positions */
647 unsigned reg_lo
= lb
;
648 unsigned reg_hi
= lb
+ size
- 1;
649 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
650 /* first check the edges: this is what we have to fix to allow for num_moves > size */
651 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
653 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
656 /* second, check that we have at most k=num_moves elements in the window
657 * and no element is larger than the currently processed one */
658 unsigned k
= op_moves
;
660 unsigned remaining_op_moves
= op_moves
;
661 unsigned last_var
= 0;
663 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
664 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
665 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
668 /* dead operands effectively reduce the number of estimated moves */
669 if (reg_file
.is_blocked(PhysReg
{j
})) {
670 if (remaining_op_moves
) {
672 remaining_op_moves
--;
677 if (reg_file
[j
] == 0xF0000000) {
683 if (ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
688 /* we cannot split live ranges of linear vgprs */
689 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
694 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
696 last_var
= reg_file
[j
];
699 if (!found
|| k
> num_moves
)
701 if (k
== num_moves
&& n
< num_vars
)
703 if (!aligned
&& k
== num_moves
&& n
== num_vars
)
713 if (num_moves
== 0xFF) {
714 /* remove killed operands from reg_file once again */
715 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
716 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef())
717 reg_file
.clear(instr
->operands
[i
]);
719 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
720 Definition def
= instr
->definitions
[i
];
721 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
727 RegisterFile register_file
= reg_file
;
729 /* now, we figured the placement for our definition */
730 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
732 if (instr
->opcode
== aco_opcode::p_create_vector
) {
733 /* move killed operands which aren't yet at the correct position */
734 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
735 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef() &&
736 instr
->operands
[i
].getTemp().type() == rc
.type()) {
738 if (instr
->operands
[i
].physReg() != best_pos
+ offset
) {
739 vars
.emplace(instr
->operands
[i
].bytes(), instr
->operands
[i
].tempId());
740 reg_file
.clear(instr
->operands
[i
]);
742 reg_file
.fill(instr
->operands
[i
]);
747 /* re-enable the killed operands */
748 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
749 if (instr
->operands
[j
].isTemp() && instr
->operands
[j
].isFirstKill())
750 reg_file
.fill(instr
->operands
[j
]);
754 std::vector
<std::pair
<Operand
, Definition
>> pc
;
755 if (!get_regs_for_copies(ctx
, reg_file
, pc
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1)) {
756 reg_file
= std::move(register_file
);
757 /* remove killed operands from reg_file once again */
758 if (!is_phi(instr
)) {
759 for (const Operand
& op
: instr
->operands
) {
760 if (op
.isTemp() && op
.isFirstKillBeforeDef())
764 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
765 Definition
& def
= instr
->definitions
[i
];
766 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
772 parallelcopies
.insert(parallelcopies
.end(), pc
.begin(), pc
.end());
774 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
775 reg_file
.clear(PhysReg
{best_pos
}, rc
);
777 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
779 /* remove killed operands from reg_file once again */
780 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
781 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
783 assert(!instr
->operands
[i
].isUndefined());
784 if (instr
->operands
[i
].isFirstKillBeforeDef())
785 reg_file
.clear(instr
->operands
[i
]);
787 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
788 Definition def
= instr
->definitions
[i
];
789 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
793 adjust_max_used_regs(ctx
, rc
, best_pos
);
794 return {PhysReg
{best_pos
}, true};
797 PhysReg
get_reg(ra_ctx
& ctx
,
798 RegisterFile
& reg_file
,
800 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
801 aco_ptr
<Instruction
>& instr
)
803 uint32_t size
= rc
.size();
806 if (rc
.type() == RegType::vgpr
) {
808 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
811 ub
= ctx
.program
->max_reg_demand
.sgpr
;
818 if (rc
.is_subdword()) {
819 /* stride in bytes */
820 if(!instr_can_access_subdword(instr
))
822 else if (rc
.bytes() % 4 == 0)
824 else if (rc
.bytes() % 2 == 0)
828 std::pair
<PhysReg
, bool> res
= {{}, false};
829 /* try to find space without live-range splits */
830 if (rc
.type() == RegType::vgpr
&& (size
== 4 || size
== 8))
831 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, 4, rc
);
833 res
= get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
837 /* try to find space with live-range splits */
838 res
= get_reg_impl(ctx
, reg_file
, parallelcopies
, lb
, ub
, size
, stride
, rc
, instr
);
843 /* try using more registers */
845 /* We should only fail here because keeping under the limit would require
847 assert(reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
) >= size
);
849 uint16_t max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
850 uint16_t max_addressible_vgpr
= ctx
.program
->vgpr_limit
;
851 if (rc
.type() == RegType::vgpr
&& ctx
.program
->max_reg_demand
.vgpr
< max_addressible_vgpr
) {
852 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
+ 1, ctx
.program
->max_reg_demand
.sgpr
));
853 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
854 } else if (rc
.type() == RegType::sgpr
&& ctx
.program
->max_reg_demand
.sgpr
< max_addressible_sgpr
) {
855 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
, ctx
.program
->max_reg_demand
.sgpr
+ 1));
856 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
859 //FIXME: if nothing helps, shift-rotate the registers to make space
861 unreachable("did not find a register");
865 std::pair
<PhysReg
, bool> get_reg_vec(ra_ctx
& ctx
,
866 RegisterFile
& reg_file
,
869 uint32_t size
= rc
.size();
872 if (rc
.type() == RegType::vgpr
) {
874 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
877 ub
= ctx
.program
->max_reg_demand
.sgpr
;
883 return get_reg_simple(ctx
, reg_file
, lb
, ub
, size
, stride
, rc
);
887 PhysReg
get_reg_create_vector(ra_ctx
& ctx
,
888 RegisterFile
& reg_file
,
890 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
891 aco_ptr
<Instruction
>& instr
)
893 /* create_vector instructions have different costs w.r.t. register coalescing */
894 uint32_t size
= rc
.size();
897 if (rc
.type() == RegType::vgpr
) {
899 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
902 ub
= ctx
.program
->max_reg_demand
.sgpr
;
909 unsigned best_pos
= -1;
910 unsigned num_moves
= 0xFF;
911 bool best_war_hint
= true;
913 /* test for each operand which definition placement causes the least shuffle instructions */
914 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
915 // TODO: think about, if we can alias live operands on the same register
916 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isKillBeforeDef() || instr
->operands
[i
].getTemp().type() != rc
.type())
919 if (offset
> instr
->operands
[i
].physReg())
922 unsigned reg_lo
= instr
->operands
[i
].physReg() - offset
;
923 unsigned reg_hi
= reg_lo
+ size
- 1;
926 /* no need to check multiple times */
927 if (reg_lo
== best_pos
)
931 // TODO: this can be improved */
932 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
% stride
!= 0)
934 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
[reg_lo
] == reg_file
[reg_lo
- 1])
936 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
[reg_hi
] == reg_file
[reg_hi
+ 1])
939 /* count variables to be moved and check war_hint */
940 bool war_hint
= false;
941 bool linear_vgpr
= false;
942 for (unsigned j
= reg_lo
; j
<= reg_hi
&& !linear_vgpr
; j
++) {
943 if (reg_file
[j
] != 0) {
945 /* we cannot split live ranges of linear vgprs */
946 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6))
949 war_hint
|= ctx
.war_hint
[j
];
951 if (linear_vgpr
|| (war_hint
&& !best_war_hint
))
954 /* count operands in wrong positions */
955 for (unsigned j
= 0, offset
= 0; j
< instr
->operands
.size(); offset
+= instr
->operands
[j
].size(), j
++) {
957 !instr
->operands
[j
].isTemp() ||
958 instr
->operands
[j
].getTemp().type() != rc
.type())
960 if (instr
->operands
[j
].physReg() != reg_lo
+ offset
)
961 k
+= instr
->operands
[j
].size();
963 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
964 if (k
> num_moves
|| (!aligned
&& k
== num_moves
))
969 best_war_hint
= war_hint
;
972 if (num_moves
>= size
)
973 return get_reg(ctx
, reg_file
, rc
, parallelcopies
, instr
);
975 /* collect variables to be moved */
976 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
978 /* move killed operands which aren't yet at the correct position */
979 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].size(), i
++) {
980 if (instr
->operands
[i
].isTemp() &&
981 instr
->operands
[i
].isFirstKillBeforeDef() &&
982 instr
->operands
[i
].getTemp().type() == rc
.type() &&
983 instr
->operands
[i
].physReg() != best_pos
+ offset
)
984 vars
.emplace(instr
->operands
[i
].bytes(), instr
->operands
[i
].tempId());
987 ASSERTED
bool success
= false;
988 success
= get_regs_for_copies(ctx
, reg_file
, parallelcopies
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1);
991 update_renames(ctx
, reg_file
, parallelcopies
, instr
);
992 adjust_max_used_regs(ctx
, rc
, best_pos
);
993 return PhysReg
{best_pos
};
/* get_reg_specified: check whether the caller-requested physical register
 * `reg` can hold a value of class `rc` for `instr` (bounds, alignment,
 * sub-dword addressability, occupancy); on success it updates the
 * max-used-register bookkeeping via adjust_max_used_regs.
 * NOTE(review): this extraction dropped several original lines (e.g. the
 * `RegClass rc` parameter line 998, the `lb`/`stride` setup of the
 * else-branch, and the early `return false;` statements) — confirm against
 * the complete file before relying on this text. */
996 bool get_reg_specified(ra_ctx
& ctx
,
997 RegisterFile
& reg_file
,
999 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
1000 aco_ptr
<Instruction
>& instr
,
/* size = number of registers the class occupies; stride = required
 * alignment (defaults to 1; presumably adjusted for wide SGPR classes on
 * lines dropped by the extraction — TODO confirm). */
1003 uint32_t size
= rc
.size();
1004 uint32_t stride
= 1;
/* VGPRs occupy the physical range starting at 256, bounded by the
 * program's maximum VGPR demand. */
1007 if (rc
.type() == RegType::vgpr
) {
1009 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
/* non-VGPR path: the requested register must respect the class alignment,
 * and the upper bound is the maximum SGPR demand. */
1015 if (reg
% stride
!= 0)
1018 ub
= ctx
.program
->max_reg_demand
.sgpr
;
/* a sub-dword value at a non-zero byte offset is only permitted when the
 * instruction can address sub-dword registers. */
1021 if (rc
.is_subdword() && reg
.byte() && !instr_can_access_subdword(instr
))
1024 uint32_t reg_lo
= reg
.reg();
1025 uint32_t reg_hi
= reg
+ (size
- 1);
/* reject ranges that fall outside [lb, ub) or that wrap around */
1027 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
> reg_hi
)
/* reject if any byte of the requested range is already occupied */
1030 if (reg_file
.test(reg
, rc
.bytes()))
/* all checks passed: record the new register high-water mark */
1033 adjust_max_used_regs(ctx
, rc
, reg_lo
);
/* handle_pseudo: for PSEUDO instructions whose lowering may clobber SCC,
 * record whether SCC currently holds a live value (tmp_in_scc) and, if so,
 * find a free SGPR to use as scratch (scratch_sgpr). Instructions of other
 * formats are ignored. */
1037 void handle_pseudo(ra_ctx
& ctx
,
1038 const RegisterFile
& reg_file
,
/* only PSEUDO-format instructions need this treatment */
1041 if (instr
->format
!= Format::PSEUDO
)
1044 /* all instructions which use handle_operands() need this information */
1045 switch (instr
->opcode
) {
1046 case aco_opcode::p_extract_vector
:
1047 case aco_opcode::p_create_vector
:
1048 case aco_opcode::p_split_vector
:
1049 case aco_opcode::p_parallelcopy
:
1050 case aco_opcode::p_wqm
:
1056 /* if all definitions are vgpr, no need to care for SCC */
1057 bool writes_sgpr
= false;
1058 for (Definition
& def
: instr
->definitions
) {
1059 if (def
.getTemp().type() == RegType::sgpr
) {
1064 /* if all operands are constant, no need to care either */
1065 bool reads_sgpr
= false;
1066 for (Operand
& op
: instr
->operands
) {
1067 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
/* SCC handling is only needed when the instruction both reads and writes
 * SGPRs */
1072 if (!(writes_sgpr
&& reads_sgpr
))
1075 Pseudo_instruction
*pi
= (Pseudo_instruction
*)instr
;
/* a non-zero entry at scc means SCC currently holds a live temporary */
1076 if (reg_file
[scc
.reg()]) {
1077 pi
->tmp_in_scc
= true;
/* search for an empty SGPR to use as scratch: first scan downward from the
 * highest SGPR used so far... */
1079 int reg
= ctx
.max_used_sgpr
;
1080 for (; reg
>= 0 && reg_file
[reg
]; reg
--)
/* ...then upward from just above the high-water mark, staying within the
 * program's SGPR demand limit */
1083 reg
= ctx
.max_used_sgpr
+ 1;
1084 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& reg_file
[reg
]; reg
++)
1086 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
/* the chosen scratch register counts toward register usage */
1089 adjust_max_used_regs(ctx
, s1
, reg
);
1090 pi
->scratch_sgpr
= PhysReg
{(unsigned)reg
};
/* SCC is free: no scratch register is required */
1092 pi
->tmp_in_scc
= false;
/* operand_can_use_reg: whether operand `idx` of `instr` may legally be
 * assigned physical register `reg`, enforcing per-format special-register
 * restrictions. */
1096 bool operand_can_use_reg(aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
)
/* byte-offset (sub-dword) registers require an instruction that can access
 * sub-dword operands */
1098 if (!instr_can_access_subdword(instr
) && reg
.byte())
/* NOTE(review): the case label on original line 1102 was dropped by the
 * extraction; given the m0-offset / vcc-sdata comments below it is
 * presumably Format::SMEM — confirm against the complete file. */
1101 switch (instr
->format
) {
/* scc is never usable; m0 only as the offset operand; vcc only as sdata on
 * a store (no definitions) */
1103 return reg
!= scc
&&
1105 (reg
!= m0
|| idx
== 1 || idx
== 3) && /* offset can be m0 */
1106 (reg
!= vcc
|| (instr
->definitions
.empty() && idx
== 2)); /* sdata can be vcc */
1108 // TODO: there are more instructions with restrictions on registers
/* read_variable: return the current SSA name of `val` within block
 * `block_idx` by consulting that block's rename map.
 * NOTE(review): the return statements (original lines after 1116) were
 * dropped by the extraction; presumably `val` itself when no rename exists,
 * otherwise the mapped temporary — confirm against the complete file. */
1113 Temp
read_variable(ra_ctx
& ctx
, Temp val
, unsigned block_idx
)
/* look up a per-block rename recorded by earlier processing */
1115 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.renames
[block_idx
].find(val
.id());
1116 if (it
== ctx
.renames
[block_idx
].end())
/* handle_live_in: compute the current name of live-in temporary `val` at
 * the start of `block`, inserting phi instructions where the predecessors
 * disagree (on-the-fly SSA repair over linear or logical predecessors).
 * Three cases are handled: the block is not yet sealed (create an
 * incomplete phi that may later be removed), exactly one predecessor
 * (recurse into it), or multiple predecessors in a sealed block (read each
 * predecessor and insert a phi only if the renames differ).
 * NOTE(review): the declaration of `new_val` and several short lines were
 * dropped by the extraction — confirm against the complete file. */
1122 Temp
handle_live_in(ra_ctx
& ctx
, Temp val
, Block
* block
)
/* linear temporaries flow along linear predecessors, others along logical
 * predecessors */
1124 std::vector
<unsigned>& preds
= val
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
/* entry blocks and values whose class is already linear need no repair */
1125 if (preds
.size() == 0 || val
.regClass() == val
.regClass().as_linear())
1128 assert(preds
.size() > 0);
/* case 1: block not sealed — not all predecessors are known yet */
1131 if (!ctx
.sealed
[block
->index
]) {
1132 /* consider rename from already processed predecessor */
1133 Temp tmp
= read_variable(ctx
, val
, preds
[0]);
1135 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1136 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
/* reserve an assignment slot for the freshly allocated id */
1137 ctx
.assignments
.emplace_back();
1138 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1139 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1140 phi
->definitions
[0] = Definition(new_val
);
/* operands are provisional: all set to the unrenamed value for now */
1141 for (unsigned i
= 0; i
< preds
.size(); i
++)
1142 phi
->operands
[i
] = Operand(val
);
/* prefer placing the new value where the predecessor's rename lives */
1143 if (tmp
.regClass() == new_val
.regClass())
1144 ctx
.affinities
[new_val
.id()] = tmp
.id();
/* register the incomplete phi so it can be completed (or removed) when the
 * block is sealed */
1146 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1147 ctx
.incomplete_phis
[block
->index
].emplace_back(phi
.get());
1148 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
/* case 2: single predecessor — no phi needed, just recurse */
1150 } else if (preds
.size() == 1) {
1151 /* if the block has only one predecessor, just look there for the name */
1152 new_val
= read_variable(ctx
, val
, preds
[0]);
1154 /* there are multiple predecessors and the block is sealed */
/* case 3: gather each predecessor's rename (VLA sized by pred count) */
1155 Temp ops
[preds
.size()];
1157 /* get the rename from each predecessor and check if they are the same */
1158 bool needs_phi
= false;
1159 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1160 ops
[i
] = read_variable(ctx
, val
, preds
[i
]);
/* any disagreement between predecessors forces a phi */
1164 needs_phi
|= !(new_val
== ops
[i
]);
1168 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1169 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1170 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1171 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
1172 phi
->definitions
[0] = Definition(new_val
);
/* phi operands are the per-predecessor renames, pinned to the registers
 * those renames were already assigned */
1173 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1174 phi
->operands
[i
] = Operand(ops
[i
]);
1175 phi
->operands
[i
].setFixed(ctx
.assignments
[ops
[i
].id()].reg
);
/* bias the phi result toward one of its operands' locations */
1176 if (ops
[i
].regClass() == new_val
.regClass())
1177 ctx
.affinities
[new_val
.id()] = ops
[i
].id();
/* reserve the assignment slot for the new id and sanity-check the mapping
 * stays in sync with the program's id allocator */
1179 ctx
.assignments
.emplace_back();
1180 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1181 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1182 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
/* record the rename for this block and remember the original name so later
 * parallelcopies can restore it */
1186 if (new_val
!= val
) {
1187 ctx
.renames
[block
->index
][val
.id()] = new_val
;
1188 ctx
.orig_names
[new_val
.id()] = val
;
/* try_remove_trivial_phi: if the phi defining `temp` is trivial (every
 * operand is either the phi's own definition or one single other value
 * `same`), reroute all uses of the phi to `same`, fix up the per-block
 * rename maps, flag the phi for removal, and recursively retry the phis
 * that used it (removing one trivial phi can make others trivial).
 * NOTE(review): the declaration/initialization of `same` and several short
 * control lines were dropped by the extraction — confirm against the
 * complete file. */
1193 void try_remove_trivial_phi(ra_ctx
& ctx
, Temp temp
)
/* only phis in sealed blocks can be judged: unsealed blocks may still gain
 * operands */
1195 std::unordered_map
<unsigned, phi_info
>::iterator info
= ctx
.phi_map
.find(temp
.id());
1197 if (info
== ctx
.phi_map
.end() || !ctx
.sealed
[info
->second
.block_idx
])
1200 assert(info
->second
.block_idx
!= 0);
1201 Instruction
* phi
= info
->second
.phi
;
1203 Definition def
= phi
->definitions
[0];
1205 /* a phi node is trivial if all operands are the same as the definition of the phi */
1206 for (const Operand
& op
: phi
->operands
) {
1207 const Temp t
= op
.getTemp();
1208 if (t
== same
|| t
== def
.getTemp()) {
/* operands equal to the phi's own def must also share its register */
1209 assert(t
== same
|| op
.physReg() == def
.physReg());
1217 assert(same
!= Temp() || same
== def
.getTemp());
1219 /* reroute all uses to same and remove phi */
1220 std::vector
<Temp
> phi_users
;
/* if `same` is itself a phi result, its use-set must absorb the rerouted
 * uses */
1221 std::unordered_map
<unsigned, phi_info
>::iterator same_phi_info
= ctx
.phi_map
.find(same
.id());
1222 for (Instruction
* instr
: info
->second
.uses
) {
1223 assert(phi
!= instr
);
1224 /* recursively try to remove trivial phis */
1225 if (is_phi(instr
)) {
1226 /* ignore if the phi was already flagged trivial */
1227 if (instr
->definitions
.empty())
/* collect other phis that used this one; they are retried below after the
 * use-set has been rewritten */
1230 if (instr
->definitions
[0].getTemp() != temp
)
1231 phi_users
.emplace_back(instr
->definitions
[0].getTemp());
/* rewrite every operand that referenced the removed phi's definition */
1233 for (Operand
& op
: instr
->operands
) {
1234 if (op
.isTemp() && op
.tempId() == def
.tempId()) {
1236 if (same_phi_info
!= ctx
.phi_map
.end())
1237 same_phi_info
->second
.uses
.emplace(instr
);
/* renames of the original variable that pointed at the phi's definition
 * must now point at `same`; the original name is recovered via orig_names */
1242 auto it
= ctx
.orig_names
.find(same
.id());
1243 unsigned orig_var
= it
!= ctx
.orig_names
.end() ? it
->second
.id() : same
.id();
1244 for (unsigned i
= 0; i
< ctx
.program
->blocks
.size(); i
++) {
1245 auto it
= ctx
.renames
[i
].find(orig_var
);
1246 if (it
!= ctx
.renames
[i
].end() && it
->second
== def
.getTemp())
1247 ctx
.renames
[i
][orig_var
] = same
;
1250 phi
->definitions
.clear(); /* this indicates that the phi can be removed */
1251 ctx
.phi_map
.erase(info
);
/* removing this phi may have made its phi users trivial in turn */
1252 for (Temp t
: phi_users
)
1253 try_remove_trivial_phi(ctx
, t
);
1258 } /* end namespace */
1261 void register_allocation(Program
*program
, std::vector
<TempSet
>& live_out_per_block
)
1263 ra_ctx
ctx(program
);
1265 std::unordered_map
<unsigned, Instruction
*> vectors
;
1266 std::vector
<std::vector
<Temp
>> phi_ressources
;
1267 std::unordered_map
<unsigned, unsigned> temp_to_phi_ressources
;
1269 for (std::vector
<Block
>::reverse_iterator it
= program
->blocks
.rbegin(); it
!= program
->blocks
.rend(); it
++) {
1272 /* first, compute the death points of all live vars within the block */
1273 TempSet
& live
= live_out_per_block
[block
.index
];
1275 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator rit
;
1276 for (rit
= block
.instructions
.rbegin(); rit
!= block
.instructions
.rend(); ++rit
) {
1277 aco_ptr
<Instruction
>& instr
= *rit
;
1278 if (is_phi(instr
)) {
1279 live
.erase(instr
->definitions
[0].getTemp());
1280 if (instr
->definitions
[0].isKill() || instr
->definitions
[0].isFixed())
1282 /* collect information about affinity-related temporaries */
1283 std::vector
<Temp
> affinity_related
;
1284 /* affinity_related[0] is the last seen affinity-related temp */
1285 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1286 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1287 for (const Operand
& op
: instr
->operands
) {
1288 if (op
.isTemp() && op
.regClass() == instr
->definitions
[0].regClass()) {
1289 affinity_related
.emplace_back(op
.getTemp());
1290 temp_to_phi_ressources
[op
.tempId()] = phi_ressources
.size();
1293 phi_ressources
.emplace_back(std::move(affinity_related
));
1297 /* add vector affinities */
1298 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1299 for (const Operand
& op
: instr
->operands
) {
1300 if (op
.isTemp() && op
.getTemp().type() == instr
->definitions
[0].getTemp().type())
1301 vectors
[op
.tempId()] = instr
.get();
1305 /* add operands to live variables */
1306 for (const Operand
& op
: instr
->operands
) {
1308 live
.emplace(op
.getTemp());
1311 /* erase definitions from live */
1312 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1313 const Definition
& def
= instr
->definitions
[i
];
1316 live
.erase(def
.getTemp());
1317 /* mark last-seen phi operand */
1318 std::unordered_map
<unsigned, unsigned>::iterator it
= temp_to_phi_ressources
.find(def
.tempId());
1319 if (it
!= temp_to_phi_ressources
.end() && def
.regClass() == phi_ressources
[it
->second
][0].regClass()) {
1320 phi_ressources
[it
->second
][0] = def
.getTemp();
1321 /* try to coalesce phi affinities with parallelcopies */
1322 if (!def
.isFixed() && instr
->opcode
== aco_opcode::p_parallelcopy
) {
1323 Operand op
= instr
->operands
[i
];
1324 if (op
.isTemp() && op
.isFirstKillBeforeDef() && def
.regClass() == op
.regClass()) {
1325 phi_ressources
[it
->second
].emplace_back(op
.getTemp());
1326 temp_to_phi_ressources
[op
.tempId()] = it
->second
;
1333 /* create affinities */
1334 for (std::vector
<Temp
>& vec
: phi_ressources
) {
1335 assert(vec
.size() > 1);
1336 for (unsigned i
= 1; i
< vec
.size(); i
++)
1337 if (vec
[i
].id() != vec
[0].id())
1338 ctx
.affinities
[vec
[i
].id()] = vec
[0].id();
1341 /* state of register file after phis */
1342 std::vector
<std::bitset
<128>> sgpr_live_in(program
->blocks
.size());
1344 for (Block
& block
: program
->blocks
) {
1345 TempSet
& live
= live_out_per_block
[block
.index
];
1346 /* initialize register file */
1347 assert(block
.index
!= 0 || live
.empty());
1348 RegisterFile register_file
;
1349 ctx
.war_hint
.reset();
1351 for (Temp t
: live
) {
1352 Temp renamed
= handle_live_in(ctx
, t
, &block
);
1353 if (ctx
.assignments
[renamed
.id()].assigned
)
1354 register_file
.fill(ctx
.assignments
[renamed
.id()].reg
, t
.size(), renamed
.id());
1357 std::vector
<aco_ptr
<Instruction
>> instructions
;
1358 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1360 /* this is a slight adjustment from the paper as we already have phi nodes:
1361 * We consider them incomplete phis and only handle the definition. */
1363 /* handle fixed phi definitions */
1364 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1365 aco_ptr
<Instruction
>& phi
= *it
;
1368 Definition
& definition
= phi
->definitions
[0];
1369 if (!definition
.isFixed())
1372 /* check if a dead exec mask phi is needed */
1373 if (definition
.isKill()) {
1374 for (Operand
& op
: phi
->operands
) {
1375 assert(op
.isTemp());
1376 if (!ctx
.assignments
[op
.tempId()].assigned
||
1377 ctx
.assignments
[op
.tempId()].reg
!= exec
) {
1378 definition
.setKill(false);
1384 if (definition
.isKill())
1387 assert(definition
.physReg() == exec
);
1388 assert(!register_file
.test(definition
.physReg(), definition
.bytes()));
1389 register_file
.fill(definition
);
1390 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1393 /* look up the affinities */
1394 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1395 aco_ptr
<Instruction
>& phi
= *it
;
1398 Definition
& definition
= phi
->definitions
[0];
1399 if (definition
.isKill() || definition
.isFixed())
1402 if (ctx
.affinities
.find(definition
.tempId()) != ctx
.affinities
.end() &&
1403 ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].assigned
) {
1404 assert(ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].rc
== definition
.regClass());
1405 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].reg
;
1406 bool try_use_special_reg
= reg
== scc
|| reg
== exec
;
1407 if (try_use_special_reg
) {
1408 for (const Operand
& op
: phi
->operands
) {
1409 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
&&
1410 ctx
.assignments
[op
.tempId()].reg
== reg
)) {
1411 try_use_special_reg
= false;
1415 if (!try_use_special_reg
)
1418 /* only assign if register is still free */
1419 if (!register_file
.test(reg
, definition
.bytes())) {
1420 definition
.setFixed(reg
);
1421 register_file
.fill(definition
);
1422 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1427 /* find registers for phis without affinity or where the register was blocked */
1428 for (it
= block
.instructions
.begin();it
!= block
.instructions
.end(); ++it
) {
1429 aco_ptr
<Instruction
>& phi
= *it
;
1433 Definition
& definition
= phi
->definitions
[0];
1434 if (definition
.isKill())
1437 if (!definition
.isFixed()) {
1438 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1439 /* try to find a register that is used by at least one operand */
1440 for (const Operand
& op
: phi
->operands
) {
1441 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
))
1443 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
1444 /* we tried this already on the previous loop */
1445 if (reg
== scc
|| reg
== exec
)
1447 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
, reg
)) {
1448 definition
.setFixed(reg
);
1452 if (!definition
.isFixed())
1453 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
));
1455 /* process parallelcopy */
1456 for (std::pair
<Operand
, Definition
> pc
: parallelcopy
) {
1457 /* see if it's a copy from a different phi */
1458 //TODO: prefer moving some previous phis over live-ins
1459 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1460 Instruction
*prev_phi
= NULL
;
1461 std::vector
<aco_ptr
<Instruction
>>::iterator phi_it
;
1462 for (phi_it
= instructions
.begin(); phi_it
!= instructions
.end(); ++phi_it
) {
1463 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1464 prev_phi
= phi_it
->get();
1467 while (!prev_phi
&& is_phi(*++phi_it
)) {
1468 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1469 prev_phi
= phi_it
->get();
1472 /* if so, just update that phi's register */
1473 prev_phi
->definitions
[0].setFixed(pc
.second
.physReg());
1474 ctx
.assignments
[prev_phi
->definitions
[0].tempId()] = {pc
.second
.physReg(), pc
.second
.regClass()};
1475 register_file
.fill(pc
.second
.physReg(), pc
.second
.size(), prev_phi
->definitions
[0].tempId());
1480 std::unordered_map
<unsigned, Temp
>::iterator orig_it
= ctx
.orig_names
.find(pc
.first
.tempId());
1481 Temp orig
= pc
.first
.getTemp();
1482 if (orig_it
!= ctx
.orig_names
.end())
1483 orig
= orig_it
->second
;
1485 ctx
.orig_names
[pc
.second
.tempId()] = orig
;
1486 ctx
.renames
[block
.index
][orig
.id()] = pc
.second
.getTemp();
1488 /* otherwise, this is a live-in and we need to create a new phi
1489 * to move it in this block's predecessors */
1490 aco_opcode opcode
= pc
.first
.getTemp().is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1491 std::vector
<unsigned>& preds
= pc
.first
.getTemp().is_linear() ? block
.linear_preds
: block
.logical_preds
;
1492 aco_ptr
<Instruction
> new_phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1493 new_phi
->definitions
[0] = pc
.second
;
1494 for (unsigned i
= 0; i
< preds
.size(); i
++)
1495 new_phi
->operands
[i
] = Operand(pc
.first
);
1496 instructions
.emplace_back(std::move(new_phi
));
1499 register_file
.fill(definition
);
1500 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1502 live
.emplace(definition
.getTemp());
1504 /* update phi affinities */
1505 for (const Operand
& op
: phi
->operands
) {
1506 if (op
.isTemp() && op
.regClass() == phi
->definitions
[0].regClass())
1507 ctx
.affinities
[op
.tempId()] = definition
.tempId();
1510 instructions
.emplace_back(std::move(*it
));
1513 /* fill in sgpr_live_in */
1514 for (unsigned i
= 0; i
<= ctx
.max_used_sgpr
; i
++)
1515 sgpr_live_in
[block
.index
][i
] = register_file
[i
];
1516 sgpr_live_in
[block
.index
][127] = register_file
[scc
.reg()];
1518 /* Handle all other instructions of the block */
1519 for (; it
!= block
.instructions
.end(); ++it
) {
1520 aco_ptr
<Instruction
>& instr
= *it
;
1522 /* parallelcopies from p_phi are inserted here which means
1523 * live ranges of killed operands end here as well */
1524 if (instr
->opcode
== aco_opcode::p_logical_end
) {
1525 /* no need to process this instruction any further */
1526 if (block
.logical_succs
.size() != 1) {
1527 instructions
.emplace_back(std::move(instr
));
1531 Block
& succ
= program
->blocks
[block
.logical_succs
[0]];
1533 for (; idx
< succ
.logical_preds
.size(); idx
++) {
1534 if (succ
.logical_preds
[idx
] == block
.index
)
1537 for (aco_ptr
<Instruction
>& phi
: succ
.instructions
) {
1538 if (phi
->opcode
== aco_opcode::p_phi
) {
1539 if (phi
->operands
[idx
].isTemp() &&
1540 phi
->operands
[idx
].getTemp().type() == RegType::sgpr
&&
1541 phi
->operands
[idx
].isFirstKillBeforeDef()) {
1542 Temp phi_op
= read_variable(ctx
, phi
->operands
[idx
].getTemp(), block
.index
);
1543 PhysReg reg
= ctx
.assignments
[phi_op
.id()].reg
;
1544 assert(register_file
[reg
] == phi_op
.id());
1545 register_file
[reg
] = 0;
1547 } else if (phi
->opcode
!= aco_opcode::p_linear_phi
) {
1551 instructions
.emplace_back(std::move(instr
));
1555 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1557 assert(!is_phi(instr
));
1559 /* handle operands */
1560 for (unsigned i
= 0; i
< instr
->operands
.size(); ++i
) {
1561 auto& operand
= instr
->operands
[i
];
1562 if (!operand
.isTemp())
1565 /* rename operands */
1566 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), block
.index
));
1568 /* check if the operand is fixed */
1569 if (operand
.isFixed()) {
1571 if (operand
.physReg() == ctx
.assignments
[operand
.tempId()].reg
) {
1572 /* we are fine: the operand is already assigned the correct reg */
1575 /* check if target reg is blocked, and move away the blocking var */
1576 if (register_file
[operand
.physReg().reg()]) {
1577 uint32_t blocking_id
= register_file
[operand
.physReg().reg()];
1578 RegClass rc
= ctx
.assignments
[blocking_id
].rc
;
1579 Operand pc_op
= Operand(Temp
{blocking_id
, rc
});
1580 pc_op
.setFixed(operand
.physReg());
1581 Definition pc_def
= Definition(Temp
{program
->allocateId(), pc_op
.regClass()});
1583 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.regClass(), parallelcopy
, instr
);
1584 pc_def
.setFixed(reg
);
1585 ctx
.assignments
.emplace_back(reg
, pc_def
.regClass());
1586 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1587 register_file
.clear(pc_op
);
1588 register_file
.fill(pc_def
);
1589 parallelcopy
.emplace_back(pc_op
, pc_def
);
1591 /* handle renames of previous operands */
1592 for (unsigned j
= 0; j
< i
; j
++) {
1593 Operand
& op
= instr
->operands
[j
];
1594 if (op
.isTemp() && op
.tempId() == blocking_id
) {
1595 op
.setTemp(pc_def
.getTemp());
1600 /* move operand to fixed reg and create parallelcopy pair */
1601 Operand pc_op
= operand
;
1602 Temp tmp
= Temp
{program
->allocateId(), operand
.regClass()};
1603 Definition pc_def
= Definition(tmp
);
1604 pc_def
.setFixed(operand
.physReg());
1605 pc_op
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
1606 operand
.setTemp(tmp
);
1607 ctx
.assignments
.emplace_back(pc_def
.physReg(), pc_def
.regClass());
1608 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1609 operand
.setFixed(pc_def
.physReg());
1610 register_file
.clear(pc_op
);
1611 register_file
.fill(pc_def
);
1612 parallelcopy
.emplace_back(pc_op
, pc_def
);
1615 assert(ctx
.assignments
[operand
.tempId()].assigned
);
1616 PhysReg reg
= ctx
.assignments
[operand
.tempId()].reg
;
1618 if (operand_can_use_reg(instr
, i
, reg
)) {
1619 operand
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
1621 Operand pc_op
= operand
;
1622 pc_op
.setFixed(reg
);
1623 PhysReg new_reg
= get_reg(ctx
, register_file
, operand
.regClass(), parallelcopy
, instr
);
1624 Definition pc_def
= Definition(program
->allocateId(), new_reg
, pc_op
.regClass());
1625 ctx
.assignments
.emplace_back(new_reg
, pc_def
.regClass());
1626 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1627 register_file
.clear(pc_op
);
1628 register_file
.fill(pc_def
);
1629 parallelcopy
.emplace_back(pc_op
, pc_def
);
1630 operand
.setTemp(pc_def
.getTemp());
1631 operand
.setFixed(new_reg
);
1634 if (instr
->format
== Format::EXP
||
1635 (instr
->isVMEM() && i
== 3 && program
->chip_class
== GFX6
) ||
1636 (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
)) {
1637 for (unsigned j
= 0; j
< operand
.size(); j
++)
1638 ctx
.war_hint
.set(operand
.physReg().reg() + j
);
1641 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
1642 if (phi
!= ctx
.phi_map
.end())
1643 phi
->second
.uses
.emplace(instr
.get());
1646 /* remove dead vars from register file */
1647 for (const Operand
& op
: instr
->operands
) {
1648 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1649 register_file
.clear(op
);
1652 /* try to optimize v_mad_f32 -> v_mac_f32 */
1653 if (instr
->opcode
== aco_opcode::v_mad_f32
&&
1654 instr
->operands
[2].isTemp() &&
1655 instr
->operands
[2].isKillBeforeDef() &&
1656 instr
->operands
[2].getTemp().type() == RegType::vgpr
&&
1657 instr
->operands
[1].isTemp() &&
1658 instr
->operands
[1].getTemp().type() == RegType::vgpr
) { /* TODO: swap src0 and src1 in this case */
1659 VOP3A_instruction
* vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
1660 bool can_use_mac
= !(vop3
->abs
[0] || vop3
->abs
[1] || vop3
->abs
[2] ||
1661 vop3
->neg
[0] || vop3
->neg
[1] || vop3
->neg
[2] ||
1662 vop3
->clamp
|| vop3
->omod
|| vop3
->opsel
);
1664 instr
->format
= Format::VOP2
;
1665 instr
->opcode
= aco_opcode::v_mac_f32
;
1669 /* handle definitions which must have the same register as an operand */
1670 if (instr
->opcode
== aco_opcode::v_interp_p2_f32
||
1671 instr
->opcode
== aco_opcode::v_mac_f32
||
1672 instr
->opcode
== aco_opcode::v_writelane_b32
||
1673 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) {
1674 instr
->definitions
[0].setFixed(instr
->operands
[2].physReg());
1675 } else if (instr
->opcode
== aco_opcode::s_addk_i32
||
1676 instr
->opcode
== aco_opcode::s_mulk_i32
) {
1677 instr
->definitions
[0].setFixed(instr
->operands
[0].physReg());
1678 } else if (instr
->format
== Format::MUBUF
&&
1679 instr
->definitions
.size() == 1 &&
1680 instr
->operands
.size() == 4) {
1681 instr
->definitions
[0].setFixed(instr
->operands
[3].physReg());
1682 } else if (instr
->format
== Format::MIMG
&&
1683 instr
->definitions
.size() == 1 &&
1684 instr
->operands
[1].regClass().type() == RegType::vgpr
) {
1685 instr
->definitions
[0].setFixed(instr
->operands
[1].physReg());
1688 ctx
.defs_done
.reset();
1690 /* handle fixed definitions first */
1691 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1692 auto& definition
= instr
->definitions
[i
];
1693 if (!definition
.isFixed())
1696 adjust_max_used_regs(ctx
, definition
.regClass(), definition
.physReg());
1697 /* check if the target register is blocked */
1698 if (register_file
[definition
.physReg().reg()] != 0) {
1699 /* create parallelcopy pair to move blocking var */
1700 Temp tmp
= {register_file
[definition
.physReg()], ctx
.assignments
[register_file
[definition
.physReg()]].rc
};
1701 Operand pc_op
= Operand(tmp
);
1702 pc_op
.setFixed(ctx
.assignments
[register_file
[definition
.physReg().reg()]].reg
);
1703 RegClass rc
= pc_op
.regClass();
1704 tmp
= Temp
{program
->allocateId(), rc
};
1705 Definition pc_def
= Definition(tmp
);
1707 /* re-enable the killed operands, so that we don't move the blocking var there */
1708 for (const Operand
& op
: instr
->operands
) {
1709 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1710 register_file
.fill(op
);
1713 /* find a new register for the blocking variable */
1714 PhysReg reg
= get_reg(ctx
, register_file
, rc
, parallelcopy
, instr
);
1715 /* once again, disable killed operands */
1716 for (const Operand
& op
: instr
->operands
) {
1717 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1718 register_file
.clear(op
);
1720 for (unsigned k
= 0; k
< i
; k
++) {
1721 if (instr
->definitions
[k
].isTemp() && ctx
.defs_done
.test(k
) && !instr
->definitions
[k
].isKill())
1722 register_file
.fill(instr
->definitions
[k
]);
1724 pc_def
.setFixed(reg
);
1726 /* finish assignment of parallelcopy */
1727 ctx
.assignments
.emplace_back(reg
, pc_def
.regClass());
1728 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1729 parallelcopy
.emplace_back(pc_op
, pc_def
);
1731 /* add changes to reg_file */
1732 register_file
.clear(pc_op
);
1733 register_file
.fill(pc_def
);
1735 ctx
.defs_done
.set(i
);
1737 if (!definition
.isTemp())
1740 /* set live if it has a kill point */
1741 if (!definition
.isKill())
1742 live
.emplace(definition
.getTemp());
1744 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1745 register_file
.fill(definition
);
1748 /* handle all other definitions */
1749 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
1750 auto& definition
= instr
->definitions
[i
];
1752 if (definition
.isFixed() || !definition
.isTemp())
1756 if (definition
.hasHint() && register_file
[definition
.physReg().reg()] == 0)
1757 definition
.setFixed(definition
.physReg());
1758 else if (instr
->opcode
== aco_opcode::p_split_vector
) {
1759 PhysReg reg
= instr
->operands
[0].physReg();
1760 reg
.reg_b
+= i
* definition
.bytes();
1761 if (!get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1762 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1763 definition
.setFixed(reg
);
1764 } else if (instr
->opcode
== aco_opcode::p_wqm
) {
1766 if (instr
->operands
[0].isKillBeforeDef() && instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1767 reg
= instr
->operands
[0].physReg();
1768 assert(register_file
[reg
.reg()] == 0);
1770 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1772 definition
.setFixed(reg
);
1773 } else if (instr
->opcode
== aco_opcode::p_extract_vector
) {
1775 if (instr
->operands
[0].isKillBeforeDef() &&
1776 instr
->operands
[0].getTemp().type() == definition
.getTemp().type()) {
1777 reg
= instr
->operands
[0].physReg();
1778 reg
.reg_b
+= definition
.bytes() * instr
->operands
[1].constantValue();
1779 assert(!register_file
.test(reg
, definition
.bytes()));
1781 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1783 definition
.setFixed(reg
);
1784 } else if (instr
->opcode
== aco_opcode::p_create_vector
) {
1785 PhysReg reg
= get_reg_create_vector(ctx
, register_file
, definition
.regClass(),
1786 parallelcopy
, instr
);
1787 definition
.setFixed(reg
);
1788 } else if (ctx
.affinities
.find(definition
.tempId()) != ctx
.affinities
.end() &&
1789 ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].assigned
) {
1790 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].reg
;
1791 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
))
1792 definition
.setFixed(reg
);
1794 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1796 } else if (vectors
.find(definition
.tempId()) != vectors
.end()) {
1797 Instruction
* vec
= vectors
[definition
.tempId()];
1798 unsigned byte_offset
= 0;
1799 for (const Operand
& op
: vec
->operands
) {
1800 if (op
.isTemp() && op
.tempId() == definition
.tempId())
1803 byte_offset
+= op
.bytes();
1806 for (const Operand
& op
: vec
->operands
) {
1808 op
.tempId() != definition
.tempId() &&
1809 op
.getTemp().type() == definition
.getTemp().type() &&
1810 ctx
.assignments
[op
.tempId()].assigned
) {
1811 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
1812 reg
.reg_b
+= (byte_offset
- k
);
1813 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
, reg
)) {
1814 definition
.setFixed(reg
);
1820 if (!definition
.isFixed()) {
1821 std::pair
<PhysReg
, bool> res
= get_reg_vec(ctx
, register_file
, vec
->definitions
[0].regClass());
1822 PhysReg reg
= res
.first
;
1824 reg
.reg_b
+= byte_offset
;
1825 /* make sure to only use byte offset if the instruction supports it */
1826 if (vec
->definitions
[0].regClass().is_subdword() && reg
.byte() && !instr_can_access_subdword(instr
))
1827 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1829 reg
= get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
);
1831 definition
.setFixed(reg
);
1834 definition
.setFixed(get_reg(ctx
, register_file
, definition
.regClass(), parallelcopy
, instr
));
1836 assert(definition
.isFixed() && ((definition
.getTemp().type() == RegType::vgpr
&& definition
.physReg() >= 256) ||
1837 (definition
.getTemp().type() != RegType::vgpr
&& definition
.physReg() < 256)));
1838 ctx
.defs_done
.set(i
);
1840 /* set live if it has a kill point */
1841 if (!definition
.isKill())
1842 live
.emplace(definition
.getTemp());
1844 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1845 register_file
.fill(definition
);
1848 handle_pseudo(ctx
, register_file
, instr
.get());
1850 /* kill definitions and late-kill operands */
1851 for (const Definition
& def
: instr
->definitions
) {
1852 if (def
.isTemp() && def
.isKill())
1853 register_file
.clear(def
);
1855 for (const Operand
& op
: instr
->operands
) {
1856 if (op
.isTemp() && op
.isFirstKill() && op
.isLateKill())
1857 register_file
.clear(op
);
1860 /* emit parallelcopy */
1861 if (!parallelcopy
.empty()) {
1862 aco_ptr
<Pseudo_instruction
> pc
;
1863 pc
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, parallelcopy
.size(), parallelcopy
.size()));
1864 bool temp_in_scc
= register_file
[scc
.reg()];
1865 bool sgpr_operands_alias_defs
= false;
1866 uint64_t sgpr_operands
[4] = {0, 0, 0, 0};
1867 for (unsigned i
= 0; i
< parallelcopy
.size(); i
++) {
1868 if (temp_in_scc
&& parallelcopy
[i
].first
.isTemp() && parallelcopy
[i
].first
.getTemp().type() == RegType::sgpr
) {
1869 if (!sgpr_operands_alias_defs
) {
1870 unsigned reg
= parallelcopy
[i
].first
.physReg().reg();
1871 unsigned size
= parallelcopy
[i
].first
.getTemp().size();
1872 sgpr_operands
[reg
/ 64u] |= ((1u << size
) - 1) << (reg
% 64u);
1874 reg
= parallelcopy
[i
].second
.physReg().reg();
1875 size
= parallelcopy
[i
].second
.getTemp().size();
1876 if (sgpr_operands
[reg
/ 64u] & ((1u << size
) - 1) << (reg
% 64u))
1877 sgpr_operands_alias_defs
= true;
1881 pc
->operands
[i
] = parallelcopy
[i
].first
;
1882 pc
->definitions
[i
] = parallelcopy
[i
].second
;
1883 assert(pc
->operands
[i
].size() == pc
->definitions
[i
].size());
1885 /* it might happen that the operand is already renamed. we have to restore the original name. */
1886 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.orig_names
.find(pc
->operands
[i
].tempId());
1887 Temp orig
= it
!= ctx
.orig_names
.end() ? it
->second
: pc
->operands
[i
].getTemp();
1888 ctx
.orig_names
[pc
->definitions
[i
].tempId()] = orig
;
1889 ctx
.renames
[block
.index
][orig
.id()] = pc
->definitions
[i
].getTemp();
1891 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(pc
->operands
[i
].tempId());
1892 if (phi
!= ctx
.phi_map
.end())
1893 phi
->second
.uses
.emplace(pc
.get());
1896 if (temp_in_scc
&& sgpr_operands_alias_defs
) {
1897 /* disable definitions and re-enable operands */
1898 for (const Definition
& def
: instr
->definitions
) {
1899 if (def
.isTemp() && !def
.isKill())
1900 register_file
.clear(def
);
1902 for (const Operand
& op
: instr
->operands
) {
1903 if (op
.isTemp() && op
.isFirstKill())
1904 register_file
.fill(op
.physReg(), op
.size(), 0xFFFF);
1907 handle_pseudo(ctx
, register_file
, pc
.get());
1909 /* re-enable live vars */
1910 for (const Operand
& op
: instr
->operands
) {
1911 if (op
.isTemp() && op
.isFirstKill())
1912 register_file
.clear(op
);
1914 for (const Definition
& def
: instr
->definitions
) {
1915 if (def
.isTemp() && !def
.isKill())
1916 register_file
.fill(def
);
1919 pc
->tmp_in_scc
= false;
1922 instructions
.emplace_back(std::move(pc
));
1925 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
1926 bool instr_needs_vop3
= !instr
->isVOP3() &&
1927 ((instr
->format
== Format::VOPC
&& !(instr
->definitions
[0].physReg() == vcc
)) ||
1928 (instr
->opcode
== aco_opcode::v_cndmask_b32
&& !(instr
->operands
[2].physReg() == vcc
)) ||
1929 ((instr
->opcode
== aco_opcode::v_add_co_u32
||
1930 instr
->opcode
== aco_opcode::v_addc_co_u32
||
1931 instr
->opcode
== aco_opcode::v_sub_co_u32
||
1932 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1933 instr
->opcode
== aco_opcode::v_subrev_co_u32
||
1934 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1935 !(instr
->definitions
[1].physReg() == vcc
)) ||
1936 ((instr
->opcode
== aco_opcode::v_addc_co_u32
||
1937 instr
->opcode
== aco_opcode::v_subb_co_u32
||
1938 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
1939 !(instr
->operands
[2].physReg() == vcc
)));
1940 if (instr_needs_vop3
) {
1942 /* if the first operand is a literal, we have to move it to a reg */
1943 if (instr
->operands
.size() && instr
->operands
[0].isLiteral() && program
->chip_class
< GFX10
) {
1944 bool can_sgpr
= true;
1945 /* check, if we have to move to vgpr */
1946 for (const Operand
& op
: instr
->operands
) {
1947 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
1952 /* disable definitions and re-enable operands */
1953 for (const Definition
& def
: instr
->definitions
)
1954 register_file
.clear(def
);
1955 for (const Operand
& op
: instr
->operands
) {
1956 if (op
.isTemp() && op
.isFirstKill())
1957 register_file
.fill(op
.physReg(), op
.size(), 0xFFFF);
1959 RegClass rc
= can_sgpr
? s1
: v1
;
1960 PhysReg reg
= get_reg(ctx
, register_file
, rc
, parallelcopy
, instr
);
1961 Temp tmp
= {program
->allocateId(), rc
};
1962 ctx
.assignments
.emplace_back(reg
, rc
);
1964 aco_ptr
<Instruction
> mov
;
1966 mov
.reset(create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1));
1968 mov
.reset(create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1));
1969 mov
->operands
[0] = instr
->operands
[0];
1970 mov
->definitions
[0] = Definition(tmp
);
1971 mov
->definitions
[0].setFixed(reg
);
1973 instr
->operands
[0] = Operand(tmp
);
1974 instr
->operands
[0].setFixed(reg
);
1975 instructions
.emplace_back(std::move(mov
));
1976 /* re-enable live vars */
1977 for (const Operand
& op
: instr
->operands
) {
1978 if (op
.isTemp() && op
.isFirstKill())
1979 register_file
.clear(op
);
1981 for (const Definition
& def
: instr
->definitions
) {
1982 if (def
.isTemp() && !def
.isKill())
1983 register_file
.fill(def
);
1987 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
1988 aco_ptr
<Instruction
> tmp
= std::move(instr
);
1989 Format format
= asVOP3(tmp
->format
);
1990 instr
.reset(create_instruction
<VOP3A_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
1991 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1992 Operand
& operand
= tmp
->operands
[i
];
1993 instr
->operands
[i
] = operand
;
1994 /* keep phi_map up to date */
1995 if (operand
.isTemp()) {
1996 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.tempId());
1997 if (phi
!= ctx
.phi_map
.end()) {
1998 phi
->second
.uses
.erase(tmp
.get());
1999 phi
->second
.uses
.emplace(instr
.get());
2003 std::copy(tmp
->definitions
.begin(), tmp
->definitions
.end(), instr
->definitions
.begin());
2005 instructions
.emplace_back(std::move(*it
));
2007 } /* end for Instr */
2009 block
.instructions
= std::move(instructions
);
2011 ctx
.filled
[block
.index
] = true;
2012 for (unsigned succ_idx
: block
.linear_succs
) {
2013 Block
& succ
= program
->blocks
[succ_idx
];
2014 /* seal block if all predecessors are filled */
2015 bool all_filled
= true;
2016 for (unsigned pred_idx
: succ
.linear_preds
) {
2017 if (!ctx
.filled
[pred_idx
]) {
2023 ctx
.sealed
[succ_idx
] = true;
2025 /* finish incomplete phis and check if they became trivial */
2026 for (Instruction
* phi
: ctx
.incomplete_phis
[succ_idx
]) {
2027 std::vector
<unsigned> preds
= phi
->definitions
[0].getTemp().is_linear() ? succ
.linear_preds
: succ
.logical_preds
;
2028 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
2029 phi
->operands
[i
].setTemp(read_variable(ctx
, phi
->operands
[i
].getTemp(), preds
[i
]));
2030 phi
->operands
[i
].setFixed(ctx
.assignments
[phi
->operands
[i
].tempId()].reg
);
2032 try_remove_trivial_phi(ctx
, phi
->definitions
[0].getTemp());
2034 /* complete the original phi nodes, but no need to check triviality */
2035 for (aco_ptr
<Instruction
>& instr
: succ
.instructions
) {
2038 std::vector
<unsigned> preds
= instr
->opcode
== aco_opcode::p_phi
? succ
.logical_preds
: succ
.linear_preds
;
2040 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2041 auto& operand
= instr
->operands
[i
];
2042 if (!operand
.isTemp())
2044 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), preds
[i
]));
2045 operand
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
2046 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
2047 if (phi
!= ctx
.phi_map
.end())
2048 phi
->second
.uses
.emplace(instr
.get());
2055 /* remove trivial phis */
2056 for (Block
& block
: program
->blocks
) {
2057 auto end
= std::find_if(block
.instructions
.begin(), block
.instructions
.end(),
2058 [](aco_ptr
<Instruction
>& instr
) { return !is_phi(instr
);});
2059 auto middle
= std::remove_if(block
.instructions
.begin(), end
,
2060 [](const aco_ptr
<Instruction
>& instr
) { return instr
->definitions
.empty();});
2061 block
.instructions
.erase(middle
, end
);
2064 /* find scc spill registers which may be needed for parallelcopies created by phis */
2065 for (Block
& block
: program
->blocks
) {
2066 if (block
.linear_preds
.size() <= 1)
2069 std::bitset
<128> regs
= sgpr_live_in
[block
.index
];
2073 /* choose a register */
2075 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& regs
[reg
]; reg
++)
2077 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
2078 adjust_max_used_regs(ctx
, s1
, reg
);
2080 /* update predecessors */
2081 for (unsigned& pred_index
: block
.linear_preds
) {
2082 Block
& pred
= program
->blocks
[pred_index
];
2083 pred
.scc_live_out
= true;
2084 pred
.scratch_sgpr
= PhysReg
{(uint16_t)reg
};
2088 /* num_gpr = rnd_up(max_used_gpr + 1) */
2089 program
->config
->num_vgprs
= align(ctx
.max_used_vgpr
+ 1, 4);
2090 if (program
->family
== CHIP_TONGA
|| program
->family
== CHIP_ICELAND
) /* workaround hardware bug */
2091 program
->config
->num_sgprs
= get_sgpr_alloc(program
, program
->sgpr_limit
);
2093 program
->config
->num_sgprs
= align(ctx
.max_used_sgpr
+ 1 + get_extra_sgprs(program
), 8);