2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
25 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
32 #include <unordered_map>
36 #include "util/u_math.h"
// Forward declarations of the sub-dword helpers that are defined further
// down in this file; the DefInfo constructor calls two of them before
// their definitions appear.
//
// Stride required for sub-dword operand `idx` of `instr` (the call site in
// DefInfo labels the result "stride in bytes").
41 unsigned get_subdword_operand_stride(chip_class chip
, const aco_ptr
<Instruction
>& instr
, unsigned idx
, RegClass rc
);
// Adjusts `instr` so that operand `idx` is read starting at byte offset
// `byte` of its register.
42 void add_subdword_operand(chip_class chip
, aco_ptr
<Instruction
>& instr
, unsigned idx
, unsigned byte
, RegClass rc
);
// Returns a pair; the comment on the definition names the two members
// "minimum_stride, bytes_written".
43 std::pair
<unsigned, unsigned> get_subdword_definition_info(Program
*program
, const aco_ptr
<Instruction
>& instr
, RegClass rc
);
// Fixes definition `idx` of `instr` to `reg` and adjusts the instruction
// when the register has a non-zero byte offset.
44 void add_subdword_definition(Program
*program
, aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
, bool is_partial
);
// Default constructor: members keep whatever default initialisation the
// (not visible here) member declarations give them.
50 assignment() = default;
// Records the physical register and register class of a value and
// initialises `assigned` to -1.
51 assignment(PhysReg reg
, RegClass rc
) : reg(reg
), rc(rc
), assigned(-1) {}
57 std::set
<Instruction
*> uses
;
// Per-register hint bits; get_reg_simple() skips registers whose bit is set.
61 std::bitset
<512> war_hint
;
// Indexed by temporary id (see ctx.assignments[id] users); initially sized
// to program->peekAllocationId() by the constructor below.
63 std::vector
<assignment
> assignments
;
// The next four vectors are sized to program->blocks.size() by the
// constructor, i.e. they hold one entry per program block.
64 std::vector
<std::unordered_map
<unsigned, Temp
>> renames
;
65 std::vector
<std::vector
<Instruction
*>> incomplete_phis
;
66 std::vector
<bool> filled
;
67 std::vector
<bool> sealed
;
// Looked up by temporary id in print_regs() to report a temporary's
// previous name ("was %N").
68 std::unordered_map
<unsigned, Temp
> orig_names
;
69 std::unordered_map
<unsigned, phi_info
> phi_map
;
// Keyed by temporary id (get_reg() looks up def.tempId()).
70 std::unordered_map
<unsigned, unsigned> affinities
;
71 std::unordered_map
<unsigned, Instruction
*> vectors
;
// Keyed by temporary id (get_reg() looks up temp.id()).
72 std::unordered_map
<unsigned, Instruction
*> split_vectors
;
// Placeholder p_parallelcopy instruction; get_regs_for_copies() passes it
// to DefInfo when no real instruction applies.
73 aco_ptr
<Instruction
> pseudo_dummy
;
// Highest register indices used so far; raised by adjust_max_used_regs().
74 unsigned max_used_sgpr
= 0;
75 unsigned max_used_vgpr
= 0;
// One bit per definition index of the current instruction; tested as
// ctx.defs_done.test(i) in get_reg_impl().
76 std::bitset
<64> defs_done
; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
// Constructor: stores the program pointer, sizes the per-temporary and
// per-block containers, and creates the dummy parallelcopy instruction.
78 ra_ctx(Program
* program
) : program(program
),
79 assignments(program
->peekAllocationId()),
80 renames(program
->blocks
.size()),
81 incomplete_phis(program
->blocks
.size()),
82 filled(program
->blocks
.size()),
83 sealed(program
->blocks
.size())
// presumably the two trailing zeros are the operand and definition counts
// -- TODO confirm against create_instruction().
85 pseudo_dummy
.reset(create_instruction
<Instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, 0, 0));
// Builds the placement constraints for a value of register class `rc_`.
// `operand` >= 0 means the value is operand number `operand` of `instr`;
// a negative `operand` means the value is a definition of `instr`.
96 DefInfo(ra_ctx
& ctx
, aco_ptr
<Instruction
>& instr
, RegClass rc_
, int operand
) : rc(rc_
) {
// Upper bound of the search range: VGPR indices start at 256, SGPR
// indices at 0; both are limited by the program's max register demand.
100 if (rc
.type() == RegType::vgpr
) {
102 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
105 ub
= ctx
.program
->max_reg_demand
.sgpr
;
// Sub-dword operand: the stride comes from what the instruction can read.
112 if (rc
.is_subdword() && operand
>= 0) {
113 /* stride in bytes */
114 stride
= get_subdword_operand_stride(ctx
.program
->chip_class
, instr
, operand
, rc
);
// Sub-dword definition: info.second is the number of bytes written.
115 } else if (rc
.is_subdword()) {
116 std::pair
<unsigned, unsigned> info
= get_subdword_definition_info(ctx
.program
, instr
, rc
);
// If more bytes are written than the class holds, widen the class so the
// whole written range is reserved.
118 if (info
.second
> rc
.bytes()) {
119 rc
= RegClass::get(rc
.type(), info
.second
);
121 /* we might still be able to put the definition in the high half,
122 * but that's only useful for affinities and this information isn't
// NOTE(review): the block comment above is cut off in this copy of the
// file (its closing line is missing).
124 stride
= align(stride
, info
.second
);
// Once the widened class is no longer sub-dword the stride is converted
// from bytes to whole dwords.
125 if (!rc
.is_subdword())
126 stride
= DIV_ROUND_UP(stride
, 4);
// Starts with every register entry set to 0.
135 RegisterFile() {regs
.fill(0);}
// One 32-bit entry per register index. Values used by the methods below:
//   0           - free (count_zero() counts these)
//   0xFFFFFFFF  - blocked (written by block())
//   0xF0000000  - register holds sub-dword data; see subdword_regs
//   other       - value passed to fill(), e.g. a temporary id
137 std::array
<uint32_t, 512> regs
;
// Per-byte contents for registers marked 0xF0000000: four entries per
// register, indexed by byte offset.
138 std::map
<uint32_t, std::array
<uint32_t, 4>> subdword_regs
;
// Index operators (bodies are not visible in this copy of the file).
140 const uint32_t& operator [] (unsigned index
) const {
144 uint32_t& operator [] (unsigned index
) {
// Counts how many of the `size` register entries starting at `start`
// are zero.
148 unsigned count_zero(PhysReg start
, unsigned size
) {
150 for (unsigned i
= 0; i
< size
; i
++)
// !regs[...] is 1 exactly when the entry is 0.
151 res
+= !regs
[start
+ i
];
// Examines the byte range [start.reg_b, start.reg_b + num_bytes) register
// by register. Callers use `!test(...)` as "this range is available".
155 bool test(PhysReg start
, unsigned num_bytes
) {
156 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
// Any of the low 28 bits set: not free and not the 0xF0000000 marker.
157 if (regs
[i
] & 0x0FFFFFFF)
// Sub-dword marker: inspect the individual bytes that fall in the range.
159 if (regs
[i
] == 0xF0000000) {
// The assert guards the operator[] below, which would otherwise insert
// a new map entry for a missing key.
160 assert(subdword_regs
.find(i
) != subdword_regs
.end());
161 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++) {
162 if (subdword_regs
[i
][j
])
// Marks the area occupied by a value of class `rc` at `start` with the
// blocked marker 0xFFFFFFFF, byte-wise for sub-dword classes and
// register-wise otherwise.
170 void block(PhysReg start
, RegClass rc
) {
171 if (rc
.is_subdword())
172 fill_subdword(start
, rc
.bytes(), 0xFFFFFFFF);
174 fill(start
, rc
.size(), 0xFFFFFFFF);
// Checks for the blocked marker 0xFFFFFFFF at `start`: either on the
// whole register, or on any byte from start.byte() upwards when the
// register holds sub-dword data.
177 bool is_blocked(PhysReg start
) {
178 if (regs
[start
] == 0xFFFFFFFF)
180 if (regs
[start
] == 0xF0000000) {
181 for (unsigned i
= start
.byte(); i
< 4; i
++)
182 if (subdword_regs
[start
][i
] == 0xFFFFFFFF)
// True when the entry at `start` is 0 or 0xFFFFFFFF. The test `x + 1 <= 1`
// relies on unsigned wrap-around: 0xFFFFFFFF + 1 becomes 0.
188 bool is_empty_or_blocked(PhysReg start
) {
// Sub-dword register: look at the byte addressed by `start`.
189 if (regs
[start
] == 0xF0000000) {
190 return subdword_regs
[start
][start
.byte()] + 1 <= 1;
192 return regs
[start
] + 1 <= 1;
// Writes 0 over the area occupied by a value of class `rc` at `start`,
// byte-wise for sub-dword classes and register-wise otherwise.
195 void clear(PhysReg start
, RegClass rc
) {
196 if (rc
.is_subdword())
197 fill_subdword(start
, rc
.bytes(), 0);
199 fill(start
, rc
.size(), 0);
// Records the operand's temporary id over the registers (or bytes, for
// sub-dword classes) starting at the operand's physical register.
202 void fill(Operand op
) {
203 if (op
.regClass().is_subdword())
204 fill_subdword(op
.physReg(), op
.bytes(), op
.tempId());
206 fill(op
.physReg(), op
.size(), op
.tempId());
// Clears the area occupied by the operand; forwards to
// clear(PhysReg, RegClass).
209 void clear(Operand op
) {
210 clear(op
.physReg(), op
.regClass());
// Records the definition's temporary id over the registers (or bytes, for
// sub-dword classes) starting at the definition's physical register.
213 void fill(Definition def
) {
214 if (def
.regClass().is_subdword())
215 fill_subdword(def
.physReg(), def
.bytes(), def
.tempId());
217 fill(def
.physReg(), def
.size(), def
.tempId());
// Clears the area occupied by the definition; forwards to
// clear(PhysReg, RegClass).
220 void clear(Definition def
) {
221 clear(def
.physReg(), def
.regClass());
// Returns the value stored for `reg`: the per-byte entry at reg.byte()
// when the register carries the sub-dword marker, otherwise the
// whole-register entry.
224 unsigned get_id(PhysReg reg
) {
225 return regs
[reg
] == 0xF0000000 ? subdword_regs
[reg
][reg
.byte()] : regs
[reg
];
// Writes `val` into `size` consecutive whole-register entries beginning
// at `start`.
229 void fill(PhysReg start
, unsigned size
, uint32_t val
) {
230 for (unsigned i
= 0; i
< size
; i
++)
231 regs
[start
+ i
] = val
;
// Byte-granular counterpart of fill(): handles the byte range
// [start.reg_b, start.reg_b + num_bytes) for value `val`.
234 void fill_subdword(PhysReg start
, unsigned num_bytes
, uint32_t val
) {
// First tag the covered registers with the sub-dword marker.
// NOTE(review): the number of registers tagged is derived from num_bytes
// alone and ignores start.byte(); presumably callers never pass a range
// that spills into one more register than that -- confirm.
235 fill(start
, DIV_ROUND_UP(num_bytes
, 4), 0xF0000000);
236 for (PhysReg i
= start
; i
.reg_b
< start
.reg_b
+ num_bytes
; i
= PhysReg(i
+ 1)) {
// emplace() returns the existing per-byte array or inserts an all-zero one.
238 std::array
<uint32_t, 4>& sub
= subdword_regs
.emplace(i
, std::array
<uint32_t, 4>{0, 0, 0, 0}).first
->second
;
// Visits only the bytes of this register that lie inside the range.
239 for (unsigned j
= i
.byte(); i
* 4 + j
< start
.reg_b
+ num_bytes
&& j
< 4; j
++)
// When all four bytes are zero the map entry is dropped again.
242 if (sub
== std::array
<uint32_t, 4>{0, 0, 0, 0}) {
243 subdword_regs
.erase(i
);
// Debug dump of one register bank (VGPRs when `vgprs` is true, SGPRs
// otherwise) to stdout: a ruler, a one-character-per-register usage row,
// a used/free summary and a list of temporaries with their register ranges.
251 /* helper function for debugging */
253 void print_regs(ra_ctx
& ctx
, bool vgprs
, RegisterFile
& reg_file
)
// Range [lb, ub) of register-file indices to print; VGPRs start at 256.
255 unsigned max
= vgprs
? ctx
.program
->max_reg_demand
.vgpr
: ctx
.program
->max_reg_demand
.sgpr
;
256 unsigned lb
= vgprs
? 256 : 0;
257 unsigned ub
= lb
+ max
;
258 char reg_char
= vgprs
? 'v' : 's';
// Ruler: one label every three registers; each label ("%.2u ") is three
// characters wide, matching one character per register in the usage row.
262 for (unsigned i
= lb
; i
< ub
; i
+= 3) {
263 printf("%.2u ", i
- lb
);
268 printf("%cgprs: ", reg_char
);
269 unsigned free_regs
= 0;
// char_select alternates so that neighbouring values print as '#' and '@'.
271 bool char_select
= false;
272 for (unsigned i
= lb
; i
< ub
; i
++) {
// NOTE(review): this compares against 0xFFFF, but RegisterFile::block()
// writes 0xFFFFFFFF and is_blocked() tests 0xFFFFFFFF -- the constant here
// looks inconsistent with the blocked marker; confirm.
273 if (reg_file
[i
] == 0xFFFF) {
275 } else if (reg_file
[i
]) {
276 if (reg_file
[i
] != prev
) {
278 char_select
= !char_select
;
280 printf(char_select
? "#" : "@");
288 printf("%u/%u used, %u/%u free\n", max
- free_regs
, max
, free_regs
, max
);
290 /* print assignments */
293 for (unsigned i
= lb
; i
< ub
; i
++) {
// A change of value ends the previous run of registers.
294 if (reg_file
[i
] != prev
) {
295 if (prev
&& size
> 1)
296 printf("-%d]\n", i
- 1 - lb
);
// NOTE(review): same 0xFFFF constant as above.
300 if (prev
&& prev
!= 0xFFFF) {
// Temporaries with a different recorded original name also print it.
301 if (ctx
.orig_names
.count(reg_file
[i
]) && ctx
.orig_names
[reg_file
[i
]].id() != reg_file
[i
])
302 printf("%%%u (was %%%d) = %c[%d", reg_file
[i
], ctx
.orig_names
[reg_file
[i
]].id(), reg_char
, i
- lb
);
304 printf("%%%u = %c[%d", reg_file
[i
], reg_char
, i
- lb
);
// Close a run that extends to the end of the range.
311 if (prev
&& size
> 1)
312 printf("-%d]\n", ub
- lb
- 1);
// Returns the register stride required for sub-dword operand `idx` of
// `instr` on hardware generation `chip`; the DefInfo caller treats the
// result as a stride in bytes.
319 unsigned get_subdword_operand_stride(chip_class chip
, const aco_ptr
<Instruction
>& instr
, unsigned idx
, RegClass rc
)
321 /* v_readfirstlane_b32 cannot use SDWA */
322 if (instr
->opcode
== aco_opcode::p_as_uniform
)
// Pseudo instructions on GFX8+: 2 for an even byte size, otherwise 1.
324 if (instr
->format
== Format::PSEUDO
&& chip
>= GFX8
)
325 return rc
.bytes() % 2 == 0 ? 2 : 1;
327 if (instr
->opcode
== aco_opcode::v_cvt_f32_ubyte0
) {
// Instructions that can use SDWA get the same 2-or-1 rule.
329 } else if (can_use_SDWA(chip
, instr
)) {
330 return rc
.bytes() % 2 == 0 ? 2 : 1;
// 16-bit operands when can_use_opsel() accepts this operand with value 1.
331 } else if (rc
.bytes() == 2 && can_use_opsel(chip
, instr
->opcode
, idx
, 1)) {
// Store instructions: stride 2 from a given generation onwards, else 4.
335 switch (instr
->opcode
) {
336 case aco_opcode::ds_write_b8
:
337 case aco_opcode::ds_write_b16
:
338 return chip
>= GFX8
? 2 : 4;
339 case aco_opcode::buffer_store_byte
:
340 case aco_opcode::buffer_store_short
:
341 case aco_opcode::flat_store_byte
:
342 case aco_opcode::flat_store_short
:
343 case aco_opcode::scratch_store_byte
:
344 case aco_opcode::scratch_store_short
:
345 case aco_opcode::global_store_byte
:
346 case aco_opcode::global_store_short
:
347 return chip
>= GFX9
? 2 : 4;
// Rewrites `instr` so that operand `idx` is read starting at byte offset
// `byte` of its register. The mechanism depends on the instruction:
// a different v_cvt_f32_ubyteN opcode, SDWA, an opsel bit, or a *_d16_hi
// store opcode. Hits unreachable() when none of these applies.
355 void add_subdword_operand(chip_class chip
, aco_ptr
<Instruction
>& instr
, unsigned idx
, unsigned byte
, RegClass rc
)
// Pseudo instructions and byte offset 0 are checked first.
357 if (instr
->format
== Format::PSEUDO
|| byte
== 0)
360 assert(rc
.bytes() <= 2);
// v_cvt_f32_ubyte0 without modifiers: one of the four ubyteN opcodes is
// selected (the selecting statement is not visible in this copy).
362 if (!instr
->usesModifiers() && instr
->opcode
== aco_opcode::v_cvt_f32_ubyte0
) {
365 instr
->opcode
= aco_opcode::v_cvt_f32_ubyte0
;
368 instr
->opcode
= aco_opcode::v_cvt_f32_ubyte1
;
371 instr
->opcode
= aco_opcode::v_cvt_f32_ubyte2
;
374 instr
->opcode
= aco_opcode::v_cvt_f32_ubyte3
;
378 } else if (can_use_SDWA(chip
, instr
)) {
379 convert_to_SDWA(chip
, instr
);
// 16-bit operand via opsel: bit `idx` of opsel receives byte / 2.
381 } else if (rc
.bytes() == 2 && can_use_opsel(chip
, instr
->opcode
, idx
, byte
/ 2)) {
382 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
383 vop3
->opsel
|= (byte
/ 2) << idx
;
// DS stores from byte 2 switch to the *_d16_hi opcode.
// NOTE(review): these are gated on GFX8 while the buffer/flat/scratch/
// global variants below are gated on GFX9 -- confirm the asymmetry is
// intended.
387 if (chip
>= GFX8
&& instr
->opcode
== aco_opcode::ds_write_b8
&& byte
== 2) {
388 instr
->opcode
= aco_opcode::ds_write_b8_d16_hi
;
391 if (chip
>= GFX8
&& instr
->opcode
== aco_opcode::ds_write_b16
&& byte
== 2) {
392 instr
->opcode
= aco_opcode::ds_write_b16_d16_hi
;
// Memory stores from byte 2 on GFX9+ switch to the *_d16_hi opcode.
396 if (chip
>= GFX9
&& byte
== 2) {
397 if (instr
->opcode
== aco_opcode::buffer_store_byte
)
398 instr
->opcode
= aco_opcode::buffer_store_byte_d16_hi
;
399 else if (instr
->opcode
== aco_opcode::buffer_store_short
)
400 instr
->opcode
= aco_opcode::buffer_store_short_d16_hi
;
401 else if (instr
->opcode
== aco_opcode::flat_store_byte
)
402 instr
->opcode
= aco_opcode::flat_store_byte_d16_hi
;
403 else if (instr
->opcode
== aco_opcode::flat_store_short
)
404 instr
->opcode
= aco_opcode::flat_store_short_d16_hi
;
405 else if (instr
->opcode
== aco_opcode::scratch_store_byte
)
406 instr
->opcode
= aco_opcode::scratch_store_byte_d16_hi
;
407 else if (instr
->opcode
== aco_opcode::scratch_store_short
)
408 instr
->opcode
= aco_opcode::scratch_store_short_d16_hi
;
409 else if (instr
->opcode
== aco_opcode::global_store_byte
)
410 instr
->opcode
= aco_opcode::global_store_byte_d16_hi
;
411 else if (instr
->opcode
== aco_opcode::global_store_short
)
412 instr
->opcode
= aco_opcode::global_store_short_d16_hi
;
414 unreachable("Something went wrong: Impossible register assignment.");
// Describes where a sub-dword definition of `instr` with class `rc` may be
// placed. Result: first = minimum stride, second = number of bytes the
// instruction writes.
418 /* minimum_stride, bytes_written */
419 std::pair
<unsigned, unsigned> get_subdword_definition_info(Program
*program
, const aco_ptr
<Instruction
>& instr
, RegClass rc
)
421 chip_class chip
= program
->chip_class
;
// Pseudo instructions: on GFX8+ stride 2 for even sizes (else 1) and
// exactly rc.bytes() written; before that, whole dwords.
423 if (instr
->format
== Format::PSEUDO
&& chip
>= GFX8
)
424 return std::make_pair(rc
.bytes() % 2 == 0 ? 2 : 1, rc
.bytes());
425 else if (instr
->format
== Format::PSEUDO
)
426 return std::make_pair(4, rc
.size() * 4u);
// Default: a full dword is written before GFX10.
428 unsigned bytes_written
= chip
>= GFX10
? rc
.bytes() : 4u;
// For these opcodes the threshold is GFX9 instead.
429 switch (instr
->opcode
) {
430 case aco_opcode::v_mad_f16
:
431 case aco_opcode::v_mad_u16
:
432 case aco_opcode::v_mad_i16
:
433 case aco_opcode::v_fma_f16
:
434 case aco_opcode::v_div_fixup_f16
:
435 case aco_opcode::v_interp_p2_f16
:
436 bytes_written
= chip
>= GFX9
? rc
.bytes() : 4u;
// Never report less than the opcode's own definition size
// (presumably stored in bits, hence the division by 8 -- TODO confirm).
441 bytes_written
= MAX2(bytes_written
, instr_info
.definition_size
[(int)instr
->opcode
] / 8u);
// SDWA-capable instructions: stride and bytes written both equal rc.bytes().
443 if (can_use_SDWA(chip
, instr
)) {
444 return std::make_pair(rc
.bytes(), rc
.bytes());
// 16-bit results where can_use_opsel() accepts index -1 with value 1.
445 } else if (rc
.bytes() == 2 && can_use_opsel(chip
, instr
->opcode
, -1, 1)) {
446 return std::make_pair(2u, bytes_written
);
// D16 loads: stride 2; only 2 bytes are written on GFX9+ when SRAM ECC
// is disabled.
449 switch (instr
->opcode
) {
450 case aco_opcode::buffer_load_ubyte_d16
:
451 case aco_opcode::buffer_load_short_d16
:
452 case aco_opcode::flat_load_ubyte_d16
:
453 case aco_opcode::flat_load_short_d16
:
454 case aco_opcode::scratch_load_ubyte_d16
:
455 case aco_opcode::scratch_load_short_d16
:
456 case aco_opcode::global_load_ubyte_d16
:
457 case aco_opcode::global_load_short_d16
:
458 case aco_opcode::ds_read_u8_d16
:
459 case aco_opcode::ds_read_u16_d16
:
460 if (chip
>= GFX9
&& !program
->sram_ecc_enabled
)
461 return std::make_pair(2u, 2u);
463 return std::make_pair(2u, 4u);
// Everything else: dword stride.
468 return std::make_pair(4u, bytes_written
);
// Fixes definition `idx` of `instr` to `reg` and, when needed, rewrites the
// instruction so the result lands at reg.byte(): SDWA conversion, the
// destination opsel bit, or a *_d16_hi load opcode. Hits unreachable()
// when none of these applies.
471 void add_subdword_definition(Program
*program
, aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
, bool is_partial
)
473 RegClass rc
= instr
->definitions
[idx
].regClass();
474 chip_class chip
= program
->chip_class
;
476 instr
->definitions
[idx
].setFixed(reg
);
478 if (instr
->format
== Format::PSEUDO
) {
// SDWA is only applied for a non-zero byte offset, or for partial
// definitions before GFX10.
480 } else if (can_use_SDWA(chip
, instr
)) {
481 if (reg
.byte() || (is_partial
&& chip
< GFX10
))
482 convert_to_SDWA(chip
, instr
);
484 } else if (reg
.byte() && rc
.bytes() == 2 && can_use_opsel(chip
, instr
->opcode
, -1, reg
.byte() / 2)) {
485 VOP3A_instruction
*vop3
= static_cast<VOP3A_instruction
*>(instr
.get());
487 vop3
->opsel
|= (1 << 3); /* dst in high half */
// D16 loads into byte 2 switch to the matching *_d16_hi opcode.
491 if (reg
.byte() == 2) {
492 if (instr
->opcode
== aco_opcode::buffer_load_ubyte_d16
)
493 instr
->opcode
= aco_opcode::buffer_load_ubyte_d16_hi
;
494 else if (instr
->opcode
== aco_opcode::buffer_load_short_d16
)
495 instr
->opcode
= aco_opcode::buffer_load_short_d16_hi
;
496 else if (instr
->opcode
== aco_opcode::flat_load_ubyte_d16
)
497 instr
->opcode
= aco_opcode::flat_load_ubyte_d16_hi
;
498 else if (instr
->opcode
== aco_opcode::flat_load_short_d16
)
499 instr
->opcode
= aco_opcode::flat_load_short_d16_hi
;
500 else if (instr
->opcode
== aco_opcode::scratch_load_ubyte_d16
)
501 instr
->opcode
= aco_opcode::scratch_load_ubyte_d16_hi
;
502 else if (instr
->opcode
== aco_opcode::scratch_load_short_d16
)
503 instr
->opcode
= aco_opcode::scratch_load_short_d16_hi
;
504 else if (instr
->opcode
== aco_opcode::global_load_ubyte_d16
)
505 instr
->opcode
= aco_opcode::global_load_ubyte_d16_hi
;
506 else if (instr
->opcode
== aco_opcode::global_load_short_d16
)
507 instr
->opcode
= aco_opcode::global_load_short_d16_hi
;
508 else if (instr
->opcode
== aco_opcode::ds_read_u8_d16
)
509 instr
->opcode
= aco_opcode::ds_read_u8_d16_hi
;
510 else if (instr
->opcode
== aco_opcode::ds_read_u16_d16
)
511 instr
->opcode
= aco_opcode::ds_read_u16_d16_hi
;
513 unreachable("Something went wrong: Impossible register assignment.");
// Raises ctx.max_used_vgpr / ctx.max_used_sgpr so they cover a value of
// class `rc` placed at register-file index `reg`.
517 void adjust_max_used_regs(ra_ctx
& ctx
, RegClass rc
, unsigned reg
)
519 unsigned max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
520 unsigned size
= rc
.size();
521 if (rc
.type() == RegType::vgpr
) {
// VGPR indices are offset by 256 in the register file.
523 unsigned hi
= reg
- 256 + size
- 1;
524 ctx
.max_used_vgpr
= std::max(ctx
.max_used_vgpr
, hi
);
// SGPRs are only tracked when the value ends within the addressable range.
525 } else if (reg
+ rc
.size() <= max_addressible_sgpr
) {
526 unsigned hi
= reg
+ size
- 1;
// NOTE(review): given the condition above, hi is already below
// max_addressible_sgpr, so the std::min() looks redundant.
527 ctx
.max_used_sgpr
= std::max(ctx
.max_used_sgpr
, std::min(hi
, max_addressible_sgpr
));
// Finalises the parallelcopies whose definitions have no temporary yet:
// each gets a freshly allocated temporary id, an entry in ctx.assignments
// and its registers marked in `reg_file`; operands of `instr` that refer
// to the copied value are then renamed to the new temporary/register.
// `rename_not_killed_ops` controls whether operands that are not killed
// before the definition may be left un-renamed.
532 void update_renames(ra_ctx
& ctx
, RegisterFile
& reg_file
,
533 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
534 aco_ptr
<Instruction
>& instr
, bool rename_not_killed_ops
)
536 /* allocate id's and rename operands: this is done transparently here */
537 for (std::pair
<Operand
, Definition
>& copy
: parallelcopies
) {
538 /* the definitions with id are not from this function and already handled */
539 if (copy
.second
.isTemp())
// If this copy's source is itself the result of another, already
// finalised copy, read from that copy's source instead.
542 /* check if we moved another parallelcopy definition */
543 for (std::pair
<Operand
, Definition
>& other
: parallelcopies
) {
544 if (!other
.second
.isTemp())
546 if (copy
.first
.getTemp() == other
.second
.getTemp()) {
547 copy
.first
.setTemp(other
.first
.getTemp());
548 copy
.first
.setFixed(other
.first
.physReg());
551 // FIXME: if a definition got moved, change the target location and remove the parallelcopy
552 copy
.second
.setTemp(Temp(ctx
.program
->allocateId(), copy
.second
.regClass()));
553 ctx
.assignments
.emplace_back(copy
.second
.physReg(), copy
.second
.regClass());
// assignments is indexed by temporary id, so it must grow in lock-step
// with the id allocator.
554 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
555 reg_file
.fill(copy
.second
);
557 /* check if we moved an operand */
559 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
560 Operand
& op
= instr
->operands
[i
];
563 if (op
.tempId() == copy
.first
.tempId()) {
// Renaming may be skipped only for operands not killed before the
// definition, and only while no copy destination overlaps the
// operand's current registers.
564 bool omit_renaming
= !rename_not_killed_ops
&& !op
.isKillBeforeDef();
565 for (std::pair
<Operand
, Definition
>& pc
: parallelcopies
) {
566 PhysReg def_reg
= pc
.second
.physReg();
567 omit_renaming
&= def_reg
> copy
.first
.physReg() ?
568 (copy
.first
.physReg() + copy
.first
.size() <= def_reg
.reg()) :
569 (def_reg
+ pc
.second
.size() <= copy
.first
.physReg().reg());
573 op
.setFirstKill(true);
// Point the operand at the copy's destination.
579 op
.setTemp(copy
.second
.getTemp());
580 op
.setFixed(copy
.second
.physReg());
// Looks for free space for the value described by `info` without moving
// any other value. Returns {register, true} on success; the first visible
// strategy is a best-fit search over gaps, followed by a strided scan and
// finally a search inside partially used sub-dword registers.
586 std::pair
<PhysReg
, bool> get_reg_simple(ra_ctx
& ctx
,
587 RegisterFile
& reg_file
,
590 uint32_t lb
= info
.lb
;
591 uint32_t ub
= info
.ub
;
592 uint32_t size
= info
.size
;
// Sub-dword strides are in bytes; convert to whole registers here.
593 uint32_t stride
= info
.rc
.is_subdword() ? DIV_ROUND_UP(info
.stride
, 4) : info
.stride
;
594 RegClass rc
= info
.rc
;
597 info
.rc
= RegClass(rc
.type(), size
);
// Retry with strides 8, 4, 2 by calling this function recursively.
// NOTE(review): this loop variable shadows the `stride` local declared
// above.
598 for (unsigned stride
= 8; stride
> 1; stride
/= 2) {
601 info
.stride
= stride
;
602 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
607 /* best fit algorithm: find the smallest gap to fit in the variable */
// 0xFFFF serves as the "not set" value for the three trackers below.
608 unsigned best_pos
= 0xFFFF;
609 unsigned gap_size
= 0xFFFF;
610 unsigned last_pos
= 0xFFFF;
612 for (unsigned current_reg
= lb
; current_reg
< ub
; current_reg
++) {
// A register counts as free only if it is empty and has no war_hint bit.
614 if (reg_file
[current_reg
] == 0 && !ctx
.war_hint
[current_reg
]) {
615 if (last_pos
== 0xFFFF)
616 last_pos
= current_reg
;
618 /* stop searching after max_used_gpr */
619 if (current_reg
== ctx
.max_used_sgpr
+ 1 || current_reg
== 256 + ctx
.max_used_vgpr
+ 1)
625 if (last_pos
== 0xFFFF)
628 /* early return on exact matches */
629 if (last_pos
+ size
== current_reg
) {
630 adjust_max_used_regs(ctx
, rc
, last_pos
);
631 return {PhysReg
{last_pos
}, true};
634 /* check if it fits and the gap size is smaller */
635 if (last_pos
+ size
< current_reg
&& current_reg
- last_pos
< gap_size
) {
637 gap_size
= current_reg
- last_pos
;
// The gap that is still open at the end of the range is considered too.
643 if (last_pos
+ size
<= ub
&& ub
- last_pos
< gap_size
) {
645 gap_size
= ub
- last_pos
;
648 if (best_pos
== 0xFFFF)
651 /* find best position within gap by leaving a good stride for other variables*/
652 unsigned buffer
= gap_size
- size
;
// Place the value at the end of the gap instead of the start when that
// leaves the remaining space better aligned (to 8, 4 or 2 registers).
654 if (((best_pos
+ size
) % 8 != 0 && (best_pos
+ buffer
) % 8 == 0) ||
655 ((best_pos
+ size
) % 4 != 0 && (best_pos
+ buffer
) % 4 == 0) ||
656 ((best_pos
+ size
) % 2 != 0 && (best_pos
+ buffer
) % 2 == 0))
657 best_pos
= best_pos
+ buffer
;
660 adjust_max_used_regs(ctx
, rc
, best_pos
);
661 return {PhysReg
{best_pos
}, true};
// Linear scan over candidate windows [reg_lo, reg_hi].
665 unsigned reg_lo
= lb
;
666 unsigned reg_hi
= lb
+ size
- 1;
667 while (!found
&& reg_lo
+ size
<= ub
) {
668 if (reg_file
[reg_lo
] != 0) {
672 reg_hi
= reg_lo
+ size
- 1;
674 for (unsigned reg
= reg_lo
+ 1; found
&& reg
<= reg_hi
; reg
++) {
675 if (reg_file
[reg
] != 0 || ctx
.war_hint
[reg
])
679 adjust_max_used_regs(ctx
, rc
, reg_lo
);
680 return {PhysReg
{reg_lo
}, true};
686 /* do this late because using the upper bytes of a register can require
687 * larger instruction encodings or copies
688 * TODO: don't do this in situations where it doesn't benefit */
689 if (rc
.is_subdword()) {
// NOTE(review): `entry` is declared by value, so every map element
// (including its 4-entry array) is copied on each iteration.
690 for (std::pair
<uint32_t, std::array
<uint32_t, 4>> entry
: reg_file
.subdword_regs
) {
691 assert(reg_file
[entry
.first
] == 0xF0000000);
692 if (lb
> entry
.first
|| entry
.first
>= ub
)
// Try each byte offset allowed by the byte stride.
695 for (unsigned i
= 0; i
< 4; i
+= info
.stride
) {
696 if (entry
.second
[i
] != 0)
699 bool reg_found
= true;
700 for (unsigned j
= 1; reg_found
&& i
+ j
< 4 && j
< rc
.bytes(); j
++)
701 reg_found
&= entry
.second
[i
+ j
] == 0;
703 /* check neighboring reg if needed */
704 reg_found
&= ((int)i
<= 4 - (int)rc
.bytes() || reg_file
[entry
.first
+ 1] == 0);
706 PhysReg res
{entry
.first
};
708 adjust_max_used_regs(ctx
, rc
, entry
.first
);
// Gathers the values currently stored in the `size` registers starting at
// `reg` and clears them from `reg_file`. Each collected value is recorded
// as a (size in bytes, temporary id) pair.
718 /* collect variables from a register area and clear reg_file */
719 std::set
<std::pair
<unsigned, unsigned>> collect_vars(ra_ctx
& ctx
, RegisterFile
& reg_file
,
720 PhysReg reg
, unsigned size
)
722 std::set
<std::pair
<unsigned, unsigned>> vars
;
723 for (unsigned j
= reg
; j
< reg
+ size
; j
++) {
724 if (reg_file
.is_blocked(PhysReg
{j
}))
// Sub-dword register: look at each of its four bytes.
726 if (reg_file
[j
] == 0xF0000000) {
727 for (unsigned k
= 0; k
< 4; k
++) {
728 unsigned id
= reg_file
.subdword_regs
[j
][k
];
730 assignment
& var
= ctx
.assignments
[id
];
731 vars
.emplace(var
.rc
.bytes(), id
);
// Clears the value's whole area, which may extend beyond this byte.
732 reg_file
.clear(var
.reg
, var
.rc
);
// Whole register holding a value.
737 } else if (reg_file
[j
] != 0) {
738 unsigned id
= reg_file
[j
];
739 assignment
& var
= ctx
.assignments
[id
];
740 vars
.emplace(var
.rc
.bytes(), id
);
741 reg_file
.clear(var
.reg
, var
.rc
);
// Finds new registers for every value in `vars` (pairs of size in bytes and
// temporary id) and appends one parallelcopy per value to `parallelcopies`.
// [def_reg_lo, def_reg_hi] is the area being reserved for the definition
// of `instr`. Values are first placed with get_reg_simple(); if that
// fails, a window is chosen, its occupants are collected and this function
// calls itself for them. The bool result is checked by callers with `!`.
747 bool get_regs_for_copies(ra_ctx
& ctx
,
748 RegisterFile
& reg_file
,
749 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
750 const std::set
<std::pair
<unsigned, unsigned>> &vars
,
751 uint32_t lb
, uint32_t ub
,
752 aco_ptr
<Instruction
>& instr
,
757 /* variables are sorted from small sized to large */
758 /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */
// Reverse iteration: the set is ordered ascending, so the largest values
// are handled first.
759 for (std::set
<std::pair
<unsigned, unsigned>>::const_reverse_iterator it
= vars
.rbegin(); it
!= vars
.rend(); ++it
) {
760 unsigned id
= it
->second
;
761 assignment
& var
= ctx
.assignments
[id
];
// Default constraints, computed against the dummy parallelcopy.
762 DefInfo info
= DefInfo(ctx
, ctx
.pseudo_dummy
, var
.rc
, -1);
763 uint32_t size
= info
.size
;
765 /* check if this is a dead operand, then we can re-use the space from the definition
766 * also use the correct stride for sub-dword operands */
767 bool is_dead_operand
= false;
768 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
769 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
770 if (instr
->operands
[i
].isKillBeforeDef())
771 is_dead_operand
= true;
772 info
= DefInfo(ctx
, instr
, var
.rc
, i
);
777 std::pair
<PhysReg
, bool> res
;
778 if (is_dead_operand
) {
// For p_create_vector, a dead operand is tried at the byte offset it
// has inside the vector being defined.
779 if (instr
->opcode
== aco_opcode::p_create_vector
) {
780 PhysReg
reg(def_reg_lo
);
781 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
782 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].tempId() == id
) {
783 res
= {reg
, (!var
.rc
.is_subdword() || (reg
.byte() % info
.stride
== 0)) && !reg_file
.test(reg
, var
.rc
.bytes())};
786 reg
.reg_b
+= instr
->operands
[i
].bytes();
// Try keeping the value where it already is.
789 res
= {var
.reg
, !reg_file
.test(var
.reg
, var
.rc
.bytes())};
// Search inside the definition area.
791 info
.lb
= def_reg_lo
;
792 info
.ub
= def_reg_hi
+ 1;
793 res
= get_reg_simple(ctx
, reg_file
, info
);
// Search below the definition area first, then above it.
797 info
.ub
= MIN2(def_reg_lo
, ub
);
798 res
= get_reg_simple(ctx
, reg_file
, info
);
799 if (!res
.second
&& def_reg_hi
< ub
) {
// Rounds up to the next multiple of info.stride past def_reg_hi; the mask
// form assumes info.stride is a power of two.
800 info
.lb
= (def_reg_hi
+ info
.stride
) & ~(info
.stride
- 1);
802 res
= get_reg_simple(ctx
, reg_file
, info
);
807 /* mark the area as blocked */
808 reg_file
.block(res
.first
, var
.rc
);
810 /* create parallelcopy pair (without definition id) */
811 Temp tmp
= Temp(id
, var
.rc
);
812 Operand pc_op
= Operand(tmp
);
813 pc_op
.setFixed(var
.reg
);
814 Definition pc_def
= Definition(res
.first
, pc_op
.regClass());
815 parallelcopies
.emplace_back(pc_op
, pc_def
);
// No free space found: pick the window whose occupants need the fewest
// moves. 0xFF is the "no candidate yet" value for num_moves.
819 unsigned best_pos
= lb
;
820 unsigned num_moves
= 0xFF;
821 unsigned num_vars
= 0;
823 /* we use a sliding window to find potential positions */
824 unsigned reg_lo
= lb
;
825 unsigned reg_hi
= lb
+ size
- 1;
826 unsigned stride
= var
.rc
.is_subdword() ? 1 : info
.stride
;
827 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
// Windows whose first or last register lies in the definition area are
// only acceptable for dead operands.
828 if (!is_dead_operand
&& ((reg_lo
>= def_reg_lo
&& reg_lo
<= def_reg_hi
) ||
829 (reg_hi
>= def_reg_lo
&& reg_hi
<= def_reg_hi
)))
832 /* second, check that we have at most k=num_moves elements in the window
833 * and no element is larger than the currently processed one */
836 unsigned last_var
= 0;
838 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
// Free registers and further registers of the value just counted are
// checked first.
839 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
842 if (reg_file
.is_blocked(PhysReg
{j
}) || k
> num_moves
) {
846 if (reg_file
[j
] == 0xF0000000) {
851 /* we cannot split live ranges of linear vgprs */
852 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
856 bool is_kill
= false;
857 for (const Operand
& op
: instr
->operands
) {
858 if (op
.isTemp() && op
.isKillBeforeDef() && op
.tempId() == reg_file
[j
]) {
863 if (!is_kill
&& ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
// k accumulates the total size of the values that would have to move.
868 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
869 last_var
= reg_file
[j
];
871 if (k
> num_moves
|| (k
== num_moves
&& n
<= num_vars
)) {
884 /* FIXME: we messed up and couldn't find space for the variables to be copied */
885 if (num_moves
== 0xFF)
889 reg_hi
= best_pos
+ size
- 1;
891 /* collect variables and block reg file */
892 std::set
<std::pair
<unsigned, unsigned>> new_vars
= collect_vars(ctx
, reg_file
, PhysReg
{reg_lo
}, size
);
894 /* mark the area as blocked */
895 reg_file
.block(PhysReg
{reg_lo
}, var
.rc
);
// Recursively place the values that were displaced from the window.
897 if (!get_regs_for_copies(ctx
, reg_file
, parallelcopies
, new_vars
, lb
, ub
, instr
, def_reg_lo
, def_reg_hi
))
900 adjust_max_used_regs(ctx
, var
.rc
, reg_lo
);
902 /* create parallelcopy pair (without definition id) */
903 Temp tmp
= Temp(id
, var
.rc
);
904 Operand pc_op
= Operand(tmp
);
905 pc_op
.setFixed(var
.reg
);
906 Definition pc_def
= Definition(PhysReg
{reg_lo
}, pc_op
.regClass());
907 parallelcopies
.emplace_back(pc_op
, pc_def
);
// Finds a register for the definition described by `info` when moving
// other values is allowed. A sliding window over [lb, ub) selects the
// position that needs the fewest moves; the values in that window are
// collected, re-placed through get_regs_for_copies(), and the resulting
// copies are appended to `parallelcopies`. Returns {register, true} on
// the visible success path.
914 std::pair
<PhysReg
, bool> get_reg_impl(ra_ctx
& ctx
,
915 RegisterFile
& reg_file
,
916 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
918 aco_ptr
<Instruction
>& instr
)
920 uint32_t lb
= info
.lb
;
921 uint32_t ub
= info
.ub
;
922 uint32_t size
= info
.size
;
923 uint32_t stride
= info
.stride
;
924 RegClass rc
= info
.rc
;
926 /* check how many free regs we have */
927 unsigned regs_free
= reg_file
.count_zero(PhysReg
{lb
}, ub
-lb
);
929 /* mark and count killed operands */
// Operands killed by this instruction whose registers are currently free
// in reg_file are temporarily blocked so the window search sees them.
930 unsigned killed_ops
= 0;
931 for (unsigned j
= 0; !is_phi(instr
) && j
< instr
->operands
.size(); j
++) {
932 if (instr
->operands
[j
].isTemp() &&
933 instr
->operands
[j
].isFirstKillBeforeDef() &&
934 instr
->operands
[j
].physReg() >= lb
&&
935 instr
->operands
[j
].physReg() < ub
&&
936 !reg_file
.test(instr
->operands
[j
].physReg(), instr
->operands
[j
].bytes())) {
937 assert(instr
->operands
[j
].isFixed());
938 reg_file
.block(instr
->operands
[j
].physReg(), instr
->operands
[j
].regClass());
939 killed_ops
+= instr
->operands
[j
].getTemp().size();
943 assert(regs_free
>= size
);
944 /* we might have to move dead operands to dst in order to make space */
945 unsigned op_moves
= 0;
947 if (size
> (regs_free
- killed_ops
))
948 op_moves
= size
- (regs_free
- killed_ops
);
950 /* find the best position to place the definition */
// 0xFF is the "no candidate yet" value for num_moves.
951 unsigned best_pos
= lb
;
952 unsigned num_moves
= 0xFF;
953 unsigned num_vars
= 0;
955 /* we use a sliding window to check potential positions */
956 unsigned reg_lo
= lb
;
957 unsigned reg_hi
= lb
+ size
- 1;
958 for (reg_lo
= lb
, reg_hi
= lb
+ size
- 1; reg_hi
< ub
; reg_lo
+= stride
, reg_hi
+= stride
) {
959 /* first check the edges: this is what we have to fix to allow for num_moves > size */
// Lower edge: same id at reg_lo and at the position advance(-1) before it
// means a value straddles the window start.
960 if (reg_lo
> lb
&& !reg_file
.is_empty_or_blocked(PhysReg(reg_lo
)) &&
961 reg_file
.get_id(PhysReg(reg_lo
)) == reg_file
.get_id(PhysReg(reg_lo
).advance(-1)))
// Upper edge: compares advance(3) and advance(4) from reg_hi
// (presumably byte offsets, i.e. the last byte of reg_hi and the first
// byte of the next register -- TODO confirm against PhysReg::advance).
963 if (reg_hi
< ub
- 1 && !reg_file
.is_empty_or_blocked(PhysReg(reg_hi
).advance(3)) &&
964 reg_file
.get_id(PhysReg(reg_hi
).advance(3)) == reg_file
.get_id(PhysReg(reg_hi
).advance(4)))
967 /* second, check that we have at most k=num_moves elements in the window
968 * and no element is larger than the currently processed one */
969 unsigned k
= op_moves
;
971 unsigned remaining_op_moves
= op_moves
;
972 unsigned last_var
= 0;
// `aligned` is used below as a tie-breaker for v4 values on 4-aligned
// positions.
974 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
975 for (unsigned j
= reg_lo
; found
&& j
<= reg_hi
; j
++) {
976 if (reg_file
[j
] == 0 || reg_file
[j
] == last_var
)
979 /* dead operands effectively reduce the number of estimated moves */
980 if (reg_file
.is_blocked(PhysReg
{j
})) {
981 if (remaining_op_moves
) {
983 remaining_op_moves
--;
988 if (reg_file
[j
] == 0xF0000000) {
994 if (ctx
.assignments
[reg_file
[j
]].rc
.size() >= size
) {
999 /* we cannot split live ranges of linear vgprs */
1000 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6)) {
// k accumulates the total size of the values that would have to move.
1005 k
+= ctx
.assignments
[reg_file
[j
]].rc
.size();
1007 last_var
= reg_file
[j
];
// Compare this window against the best one found so far.
1010 if (!found
|| k
> num_moves
)
1012 if (k
== num_moves
&& n
< num_vars
)
1014 if (!aligned
&& k
== num_moves
&& n
== num_vars
)
// No usable window was found.
1024 if (num_moves
== 0xFF) {
1025 /* remove killed operands from reg_file once again */
1026 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
1027 if (instr
->operands
[i
].isTemp() && instr
->operands
[i
].isFirstKillBeforeDef())
1028 reg_file
.clear(instr
->operands
[i
]);
1030 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1031 Definition def
= instr
->definitions
[i
];
1032 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
// Snapshot of the register file, restored below if placing the displaced
// values fails.
1038 RegisterFile register_file
= reg_file
;
1040 /* now, we figured the placement for our definition */
1041 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
1043 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1044 /* move killed operands which aren't yet at the correct position (GFX9+)
1045 * or which are in the definition space */
// `reg` tracks where each operand would sit inside the new vector.
1046 PhysReg reg
= PhysReg
{best_pos
};
1047 for (Operand
& op
: instr
->operands
) {
1048 if (op
.isTemp() && op
.isFirstKillBeforeDef() &&
1049 op
.getTemp().type() == rc
.type()) {
1050 if (op
.physReg() != reg
&&
1051 (ctx
.program
->chip_class
>= GFX9
||
1052 (op
.physReg().advance(op
.bytes()) > PhysReg
{best_pos
} &&
1053 op
.physReg() < PhysReg
{best_pos
+ size
}))) {
1054 vars
.emplace(op
.bytes(), op
.tempId());
1060 reg
.reg_b
+= op
.bytes();
1062 } else if (!is_phi(instr
)) {
1063 /* re-enable killed operands */
1064 for (Operand
& op
: instr
->operands
) {
1065 if (op
.isTemp() && op
.isFirstKillBeforeDef())
// Place the displaced values; the copies go into a local list first so
// they can be discarded on failure.
1070 std::vector
<std::pair
<Operand
, Definition
>> pc
;
1071 if (!get_regs_for_copies(ctx
, reg_file
, pc
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1)) {
// Failure: restore the snapshot taken above.
1072 reg_file
= std::move(register_file
);
1073 /* remove killed operands from reg_file once again */
1074 if (!is_phi(instr
)) {
1075 for (const Operand
& op
: instr
->operands
) {
1076 if (op
.isTemp() && op
.isFirstKillBeforeDef())
1080 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1081 Definition
& def
= instr
->definitions
[i
];
1082 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
// Success: publish the local copies.
1088 parallelcopies
.insert(parallelcopies
.end(), pc
.begin(), pc
.end());
1090 /* we set the definition regs == 0. the actual caller is responsible for correct setting */
1091 reg_file
.clear(PhysReg
{best_pos
}, rc
);
// Operands that are not killed are renamed too, except for p_create_vector.
1093 update_renames(ctx
, reg_file
, parallelcopies
, instr
, instr
->opcode
!= aco_opcode::p_create_vector
);
1095 /* remove killed operands from reg_file once again */
1096 for (unsigned i
= 0; !is_phi(instr
) && i
< instr
->operands
.size(); i
++) {
1097 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
1099 assert(!instr
->operands
[i
].isUndefined());
1100 if (instr
->operands
[i
].isFirstKillBeforeDef())
1101 reg_file
.clear(instr
->operands
[i
]);
1103 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1104 Definition def
= instr
->definitions
[i
];
1105 if (def
.isTemp() && def
.isFixed() && ctx
.defs_done
.test(i
))
1109 adjust_max_used_regs(ctx
, rc
, best_pos
);
1110 return {PhysReg
{best_pos
}, true};
/* Tests whether a value of register class rc can be placed at exactly the
 * register reg (rc and reg are used below; their declarations are not on
 * the lines shown here).
 * Visible checks, in order: sub-dword byte alignment, stride alignment,
 * the lb/ub bounds for the register type, and whether the target bytes are
 * already occupied in reg_file. When control reaches the end, the
 * max-used-register bookkeeping is updated for reg_lo.
 * NOTE(review): the statements guarded by the if-conditions below are not
 * visible here; presumably each failed check rejects the register - confirm. */
1113 bool get_reg_specified(ra_ctx
& ctx
,
1114 RegisterFile
& reg_file
,
1116 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
1117 aco_ptr
<Instruction
>& instr
,
/* For sub-dword classes: .first is used below as the required byte
 * alignment, .second as the size of the region tested for occupancy. */
1120 std::pair
<unsigned, unsigned> sdw_def_info
;
1121 if (rc
.is_subdword())
1122 sdw_def_info
= get_subdword_definition_info(ctx
.program
, instr
, rc
);
/* sub-dword value whose byte offset is not a multiple of the alignment */
1124 if (rc
.is_subdword() && reg
.byte() % sdw_def_info
.first
)
/* full-dword class that does not start on a dword boundary */
1126 if (!rc
.is_subdword() && reg
.byte())
1129 uint32_t size
= rc
.size();
1130 uint32_t stride
= 1;
/* the VGPR upper bound is 256 plus the current VGPR demand */
1133 if (rc
.type() == RegType::vgpr
) {
1135 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
/* the register must be a multiple of the stride */
1141 if (reg
% stride
!= 0)
1144 ub
= ctx
.program
->max_reg_demand
.sgpr
;
1147 uint32_t reg_lo
= reg
.reg();
1148 uint32_t reg_hi
= reg
+ (size
- 1);
/* range outside the bounds, or a range whose end wrapped around */
1150 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
> reg_hi
)
/* sub-dword: test the whole aligned region of sdw_def_info.second bytes
 * that contains reg, rather than only rc.bytes() */
1153 if (rc
.is_subdword()) {
1155 test_reg
.reg_b
= reg
.reg_b
& ~(sdw_def_info
.second
- 1);
1156 if (reg_file
.test(test_reg
, sdw_def_info
.second
))
1159 if (reg_file
.test(reg
, rc
.bytes()))
1163 adjust_max_used_regs(ctx
, rc
, reg_lo
);
/* Chooses a physical register for temp. The visible strategy tries, in
 * order:
 *   1. registers suggested by a recorded p_split_vector of temp
 *      (ctx.split_vectors) and the affinities of its definitions,
 *   2. the register of temp's own affinity target,
 *   3. a position derived from a recorded vector instruction that uses temp
 *      (ctx.vectors),
 *   4. get_reg_simple() (no live-range splits),
 *   5. get_reg_impl() (with live-range splits),
 *   6. raising the program's register demand by one and retrying.
 * If all of that fails, an error is reported through aco_err().
 * operand_index defaults to -1 and is forwarded to DefInfo.
 * NOTE(review): the return statements after each successful attempt are not
 * on the lines shown here - confirm against the full function. */
1167 PhysReg
get_reg(ra_ctx
& ctx
,
1168 RegisterFile
& reg_file
,
1170 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
1171 aco_ptr
<Instruction
>& instr
,
1172 int operand_index
=-1)
/* 1. temp is the source of a recorded split: for each split definition
 * whose affinity target is already assigned, take that register minus the
 * definition's byte offset within the vector as a candidate */
1174 auto split_vec
= ctx
.split_vectors
.find(temp
.id());
1175 if (split_vec
!= ctx
.split_vectors
.end()) {
1176 unsigned offset
= 0;
1177 for (Definition def
: split_vec
->second
->definitions
) {
1178 auto affinity_it
= ctx
.affinities
.find(def
.tempId());
1179 if (affinity_it
!= ctx
.affinities
.end() && ctx
.assignments
[affinity_it
->second
].assigned
) {
1180 PhysReg reg
= ctx
.assignments
[affinity_it
->second
].reg
;
1181 reg
.reg_b
-= offset
;
1182 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
1185 offset
+= def
.bytes();
/* 2. temp itself has an affinity to an already assigned temporary */
1189 if (ctx
.affinities
.find(temp
.id()) != ctx
.affinities
.end() &&
1190 ctx
.assignments
[ctx
.affinities
[temp
.id()]].assigned
) {
1191 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[temp
.id()]].reg
;
1192 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
/* 3. temp is an operand of a recorded vector instruction */
1196 if (ctx
.vectors
.find(temp
.id()) != ctx
.vectors
.end()) {
1197 Instruction
* vec
= ctx
.vectors
[temp
.id()];
/* byte_offset accumulates operand sizes while searching for temp among
 * the vector's operands */
1198 unsigned byte_offset
= 0;
1199 for (const Operand
& op
: vec
->operands
) {
1200 if (op
.isTemp() && op
.tempId() == temp
.id())
1203 byte_offset
+= op
.bytes();
/* use another, already assigned operand of the same register type as an
 * anchor: its register shifted by (byte_offset - k) is the candidate
 * (k is maintained on lines not shown here) */
1206 for (const Operand
& op
: vec
->operands
) {
1208 op
.tempId() != temp
.id() &&
1209 op
.getTemp().type() == temp
.type() &&
1210 ctx
.assignments
[op
.tempId()].assigned
) {
1211 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
1212 reg
.reg_b
+= (byte_offset
- k
);
1213 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
/* otherwise look for space for the vector's whole definition class and
 * place temp at its byte offset inside that space */
1219 DefInfo
info(ctx
, ctx
.pseudo_dummy
, vec
->definitions
[0].regClass(), -1);
1220 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
1221 PhysReg reg
= res
.first
;
1223 reg
.reg_b
+= byte_offset
;
1224 /* make sure to only use byte offset if the instruction supports it */
1225 if (get_reg_specified(ctx
, reg_file
, temp
.regClass(), parallelcopies
, instr
, reg
))
1230 DefInfo
info(ctx
, instr
, temp
.regClass(), operand_index
);
1232 /* try to find space without live-range splits */
1233 std::pair
<PhysReg
, bool> res
= get_reg_simple(ctx
, reg_file
, info
);
1238 /* try to find space with live-range splits */
1239 res
= get_reg_impl(ctx
, reg_file
, parallelcopies
, info
, instr
);
1244 /* try using more registers */
1246 /* We should only fail here because keeping under the limit would require
1247 * too many moves. */
1248 assert(reg_file
.count_zero(PhysReg
{info
.lb
}, info
.ub
-info
.lb
) >= info
.size
);
/* raise the demand of the relevant register type by one, as long as it is
 * below the program's limit, then retry recursively */
1250 uint16_t max_addressible_sgpr
= ctx
.program
->sgpr_limit
;
1251 uint16_t max_addressible_vgpr
= ctx
.program
->vgpr_limit
;
1252 if (info
.rc
.type() == RegType::vgpr
&& ctx
.program
->max_reg_demand
.vgpr
< max_addressible_vgpr
) {
1253 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
+ 1, ctx
.program
->max_reg_demand
.sgpr
));
1254 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
, operand_index
);
1255 } else if (info
.rc
.type() == RegType::sgpr
&& ctx
.program
->max_reg_demand
.sgpr
< max_addressible_sgpr
) {
1256 update_vgpr_sgpr_demand(ctx
.program
, RegisterDemand(ctx
.program
->max_reg_demand
.vgpr
, ctx
.program
->max_reg_demand
.sgpr
+ 1));
1257 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
, operand_index
);
1260 //FIXME: if nothing helps, shift-rotate the registers to make space
1262 aco_err(ctx
.program
, "Failed to allocate registers during shader compilation.");
/* Chooses the register for the definition of a p_create_vector.
 * Each operand that is a temporary killed before the definition and of the
 * same register type suggests a placement (the operand's register minus its
 * byte offset inside the vector). For every candidate the visible code
 * checks bounds and stride, inspects what currently occupies the range
 * (linear VGPRs, war_hint) and counts into k the bytes of operands that
 * would be in the wrong position. The best candidate is remembered in
 * best_pos / num_moves / best_war_hint.
 * If num_moves >= bytes the generic get_reg() is used. Otherwise the
 * variables occupying the chosen range are collected and moved through
 * get_regs_for_copies(), and PhysReg{best_pos} is returned. */
1266 PhysReg
get_reg_create_vector(ra_ctx
& ctx
,
1267 RegisterFile
& reg_file
,
1269 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopies
,
1270 aco_ptr
<Instruction
>& instr
)
1272 RegClass rc
= temp
.regClass();
1273 /* create_vector instructions have different costs w.r.t. register coalescing */
1274 uint32_t size
= rc
.size();
1275 uint32_t bytes
= rc
.bytes();
1276 uint32_t stride
= 1;
1278 if (rc
.type() == RegType::vgpr
) {
1280 ub
= 256 + ctx
.program
->max_reg_demand
.vgpr
;
1283 ub
= ctx
.program
->max_reg_demand
.sgpr
;
1290 //TODO: improve p_create_vector for sub-dword vectors
/* best_pos starts at the unsigned wrap-around of -1; num_moves starts at
 * 0xFF */
1292 unsigned best_pos
= -1;
1293 unsigned num_moves
= 0xFF;
1294 bool best_war_hint
= true;
1296 /* test for each operand which definition placement causes the least shuffle instructions */
1297 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1298 // TODO: think about, if we can alias live operands on the same register
1299 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isKillBeforeDef() || instr
->operands
[i
].getTemp().type() != rc
.type())
/* the subtraction below would underflow */
1302 if (offset
> instr
->operands
[i
].physReg().reg_b
)
/* NOTE(review): reg_lo is a byte address here, but is used as a register
 * index further down (reg_file[reg_lo], reg_lo * 4); the conversion is
 * presumably on lines not shown here - confirm. */
1305 unsigned reg_lo
= instr
->operands
[i
].physReg().reg_b
- offset
;
1309 unsigned reg_hi
= reg_lo
+ size
- 1;
1312 /* no need to check multiple times */
1313 if (reg_lo
== best_pos
)
1317 // TODO: this can be improved */
1318 if (reg_lo
< lb
|| reg_hi
>= ub
|| reg_lo
% stride
!= 0)
/* the occupant at the start of the range has the same id as the bytes just
 * before it, i.e. a variable straddles the lower boundary */
1320 if (reg_lo
> lb
&& reg_file
[reg_lo
] != 0 && reg_file
.get_id(PhysReg(reg_lo
)) == reg_file
.get_id(PhysReg(reg_lo
).advance(-1)))
/* same test for the upper boundary (last byte of reg_hi vs. the next byte) */
1322 if (reg_hi
< ub
- 1 && reg_file
[reg_hi
] != 0 && reg_file
.get_id(PhysReg(reg_hi
).advance(3)) == reg_file
.get_id(PhysReg(reg_hi
).advance(4)))
1325 /* count variables to be moved and check war_hint */
1326 bool war_hint
= false;
1327 bool linear_vgpr
= false;
1328 for (unsigned j
= reg_lo
; j
<= reg_hi
&& !linear_vgpr
; j
++) {
1329 if (reg_file
[j
] != 0) {
/* 0xF0000000 gets byte-wise treatment: each byte of the dword is tested
 * individually with reg_file.test(reg, 1) */
1330 if (reg_file
[j
] == 0xF0000000) {
1333 unsigned bytes_left
= bytes
- (j
- reg_lo
) * 4;
/* NOTE(review): this loop declares its own counter k and also adds the
 * test result to that same k. The name k is used again after the loop as
 * the move counter, so this declaration would shadow it and the occupied
 * bytes would not be counted as moves - verify against the full function. */
1334 for (unsigned k
= 0; k
< MIN2(bytes_left
, 4); k
++, reg
.reg_b
++)
1335 k
+= reg_file
.test(reg
, 1);
1338 /* we cannot split live ranges of linear vgprs */
1339 if (ctx
.assignments
[reg_file
[j
]].rc
& (1 << 6))
1343 war_hint
|= ctx
.war_hint
[j
];
1345 if (linear_vgpr
|| (war_hint
&& !best_war_hint
))
1348 /* count operands in wrong positions */
1349 for (unsigned j
= 0, offset
= 0; j
< instr
->operands
.size(); offset
+= instr
->operands
[j
].bytes(), j
++) {
1351 !instr
->operands
[j
].isTemp() ||
1352 instr
->operands
[j
].getTemp().type() != rc
.type())
1354 if (instr
->operands
[j
].physReg().reg_b
!= reg_lo
* 4 + offset
)
1355 k
+= instr
->operands
[j
].bytes();
/* on a tie in k, only a v4 candidate aligned to 4 registers replaces the
 * current best */
1357 bool aligned
= rc
== RegClass::v4
&& reg_lo
% 4 == 0;
1358 if (k
> num_moves
|| (!aligned
&& k
== num_moves
))
1363 best_war_hint
= war_hint
;
/* a candidate needing at least as many moved bytes as the vector has is not
 * worth it: use the generic allocator instead */
1366 if (num_moves
>= bytes
)
1367 return get_reg(ctx
, reg_file
, temp
, parallelcopies
, instr
);
1369 /* re-enable killed operands which are in the wrong position */
1370 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1371 if (instr
->operands
[i
].isTemp() &&
1372 instr
->operands
[i
].isFirstKillBeforeDef() &&
1373 instr
->operands
[i
].physReg().reg_b
!= best_pos
* 4 + offset
)
1374 reg_file
.fill(instr
->operands
[i
]);
1377 /* collect variables to be moved */
1378 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, reg_file
, PhysReg
{best_pos
}, size
);
1380 for (unsigned i
= 0, offset
= 0; i
< instr
->operands
.size(); offset
+= instr
->operands
[i
].bytes(), i
++) {
1381 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFirstKillBeforeDef() ||
1382 instr
->operands
[i
].getTemp().type() != rc
.type())
1384 bool correct_pos
= instr
->operands
[i
].physReg().reg_b
== best_pos
* 4 + offset
;
1385 /* GFX9+: move killed operands which aren't yet at the correct position
1386 * Moving all killed operands generally leads to more register swaps.
1387 * This is only done on GFX9+ because of the cheap v_swap instruction.
1389 if (ctx
.program
->chip_class
>= GFX9
&& !correct_pos
) {
1390 vars
.emplace(instr
->operands
[i
].bytes(), instr
->operands
[i
].tempId());
1391 reg_file
.clear(instr
->operands
[i
]);
1392 /* fill operands which are in the correct position to avoid overwriting */
1393 } else if (correct_pos
) {
1394 reg_file
.fill(instr
->operands
[i
]);
/* success is marked ASSERTED; presumably it is checked by an assert on a
 * line not shown here - confirm */
1397 ASSERTED
bool success
= false;
1398 success
= get_regs_for_copies(ctx
, reg_file
, parallelcopies
, vars
, lb
, ub
, instr
, best_pos
, best_pos
+ size
- 1);
1401 update_renames(ctx
, reg_file
, parallelcopies
, instr
, false);
1402 adjust_max_used_regs(ctx
, rc
, best_pos
);
1404 /* remove killed operands from reg_file once again */
1405 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
1406 if (!instr
->operands
[i
].isTemp() || !instr
->operands
[i
].isFixed())
1408 assert(!instr
->operands
[i
].isUndefined());
1409 if (instr
->operands
[i
].isFirstKillBeforeDef())
1410 reg_file
.clear(instr
->operands
[i
]);
1413 return PhysReg
{best_pos
};
/* Fills in the scratch-register information of a pseudo instruction
 * (tmp_in_scc and scratch_sgpr of Pseudo_instruction).
 * Only Format::PSEUDO instructions with one of the opcodes listed in the
 * switch are considered. A scratch register is needed when the instruction
 * both writes and reads SGPRs, or when it reads a sub-dword operand on
 * chip_class <= GFX7. reg_file is only read here. */
1416 void handle_pseudo(ra_ctx
& ctx
,
1417 const RegisterFile
& reg_file
,
1420 if (instr
->format
!= Format::PSEUDO
)
1423 /* all instructions which use handle_operands() need this information */
1424 switch (instr
->opcode
) {
1425 case aco_opcode::p_extract_vector
:
1426 case aco_opcode::p_create_vector
:
1427 case aco_opcode::p_split_vector
:
1428 case aco_opcode::p_parallelcopy
:
1429 case aco_opcode::p_wqm
:
1435 /* if all definitions are vgpr, no need to care for SCC */
1436 bool writes_sgpr
= false;
1437 for (Definition
& def
: instr
->definitions
) {
1438 if (def
.getTemp().type() == RegType::sgpr
) {
1443 /* if all operands are constant, no need to care either */
1444 bool reads_sgpr
= false;
1445 bool reads_subdword
= false;
1446 for (Operand
& op
: instr
->operands
) {
1447 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
1451 if (op
.isTemp() && op
.regClass().is_subdword())
1452 reads_subdword
= true;
1454 bool needs_scratch_reg
= (writes_sgpr
&& reads_sgpr
) ||
1455 (ctx
.program
->chip_class
<= GFX7
&& reads_subdword
);
1456 if (!needs_scratch_reg
)
1459 Pseudo_instruction
*pi
= (Pseudo_instruction
*)instr
;
/* scc currently holds a value: record that and look for a free SGPR */
1460 if (reg_file
[scc
.reg()]) {
1461 pi
->tmp_in_scc
= true;
/* first scan downwards from the highest SGPR used so far ... */
1463 int reg
= ctx
.max_used_sgpr
;
1464 for (; reg
>= 0 && reg_file
[reg
]; reg
--)
/* ... then upwards, staying below the program's SGPR demand */
1467 reg
= ctx
.max_used_sgpr
+ 1;
1468 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& reg_file
[reg
]; reg
++)
/* no free SGPR found: this is only expected for the sub-dword case, and m0
 * must be free then */
1470 if (reg
== ctx
.program
->max_reg_demand
.sgpr
) {
1471 assert(reads_subdword
&& reg_file
[m0
] == 0);
1476 adjust_max_used_regs(ctx
, s1
, reg
);
1477 pi
->scratch_sgpr
= PhysReg
{(unsigned)reg
};
1479 pi
->tmp_in_scc
= false;
/* Tells whether operand idx of instr may read its value from register reg.
 * A fixed operand only accepts its fixed register. Otherwise the register's
 * byte offset must be a multiple of the stride reported by
 * get_subdword_operand_stride(), and one instruction format imposes
 * additional restrictions on scc, m0 and vcc.
 * NOTE(review): the case label and the default branch of the switch, and
 * any condition guarding the stride check, are not on the lines shown here. */
1483 bool operand_can_use_reg(chip_class chip
, aco_ptr
<Instruction
>& instr
, unsigned idx
, PhysReg reg
, RegClass rc
)
1485 if (instr
->operands
[idx
].isFixed())
1486 return instr
->operands
[idx
].physReg() == reg
;
1489 unsigned stride
= get_subdword_operand_stride(chip
, instr
, idx
, rc
);
1490 if (reg
.byte() % stride
)
1494 switch (instr
->format
) {
/* scc is never allowed; m0 only for operand 1 or 3; vcc only for operand 2
 * of an instruction without definitions */
1496 return reg
!= scc
&&
1498 (reg
!= m0
|| idx
== 1 || idx
== 3) && /* offset can be m0 */
1499 (reg
!= vcc
|| (instr
->definitions
.empty() && idx
== 2)); /* sdata can be vcc */
1501 // TODO: there are more instructions with restrictions on registers
/* Moves an operand into a register the instruction can use and records the
 * necessary copies in parallelcopy.
 * For a fixed operand the destination is its fixed register; if that
 * register is occupied, the blocking variable is first moved to a register
 * obtained from get_reg(). For a non-fixed operand the destination comes
 * from get_reg(). Finally a copy from the operand's currently assigned
 * register to the destination is appended and update_renames() is called. */
1506 void get_reg_for_operand(ra_ctx
& ctx
, RegisterFile
& register_file
,
1507 std::vector
<std::pair
<Operand
, Definition
>>& parallelcopy
,
1508 aco_ptr
<Instruction
>& instr
, Operand
& operand
, unsigned operand_index
)
1510 /* check if the operand is fixed */
1512 if (operand
.isFixed()) {
/* callers are expected to use this path only when the operand is not
 * already in its fixed register */
1513 assert(operand
.physReg() != ctx
.assignments
[operand
.tempId()].reg
);
1515 /* check if target reg is blocked, and move away the blocking var */
1516 if (register_file
[operand
.physReg().reg()]) {
1517 assert(register_file
[operand
.physReg()] != 0xF0000000);
/* the register-file entry is used as the id of the blocking temporary */
1518 uint32_t blocking_id
= register_file
[operand
.physReg().reg()];
1519 RegClass rc
= ctx
.assignments
[blocking_id
].rc
;
1520 Operand pc_op
= Operand(Temp
{blocking_id
, rc
});
1521 pc_op
.setFixed(operand
.physReg());
/* find a new home for the blocking variable and queue the copy */
1524 PhysReg reg
= get_reg(ctx
, register_file
, pc_op
.getTemp(), parallelcopy
, ctx
.pseudo_dummy
);
1525 Definition pc_def
= Definition(PhysReg
{reg
}, pc_op
.regClass());
1526 register_file
.clear(pc_op
);
1527 parallelcopy
.emplace_back(pc_op
, pc_def
);
1529 dst
= operand
.physReg();
1532 dst
= get_reg(ctx
, register_file
, operand
.getTemp(), parallelcopy
, instr
, operand_index
);
/* queue the copy of the operand itself: from its assigned register to dst */
1535 Operand pc_op
= operand
;
1536 pc_op
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
1537 Definition pc_def
= Definition(dst
, pc_op
.regClass());
1538 register_file
.clear(pc_op
);
1539 parallelcopy
.emplace_back(pc_op
, pc_def
);
1540 update_renames(ctx
, register_file
, parallelcopy
, instr
, true);
/* Looks up val in the rename table of block block_idx
 * (ctx.renames[block_idx], keyed by temporary id).
 * NOTE(review): the return statements are not on the lines shown here;
 * presumably val is returned unchanged when there is no entry and the
 * renamed temporary otherwise - confirm. */
1543 Temp
read_variable(ra_ctx
& ctx
, Temp val
, unsigned block_idx
)
1545 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.renames
[block_idx
].find(val
.id());
1546 if (it
== ctx
.renames
[block_idx
].end())
/* Determines the name under which the live-in value val is known at the
 * start of block, creating a phi when the predecessors disagree.
 * Linear values use block->linear_preds, all others block->logical_preds.
 *   - block not sealed: an incomplete phi with val on every operand is
 *     inserted at the front of the block and remembered in
 *     ctx.incomplete_phis.
 *   - exactly one predecessor: the name is read from that predecessor.
 *   - several predecessors, block sealed: the names from all predecessors
 *     are compared and a phi is created only if they differ.
 * If the resulting name differs from val, ctx.renames and ctx.orig_names
 * are updated.
 * NOTE(review): the early-exit statement of the first check and the final
 * return are not on the lines shown here. */
1552 Temp
handle_live_in(ra_ctx
& ctx
, Temp val
, Block
* block
)
1554 std::vector
<unsigned>& preds
= val
.is_linear() ? block
->linear_preds
: block
->logical_preds
;
/* no predecessors, or val's register class is already its linear variant */
1555 if (preds
.size() == 0 || val
.regClass() == val
.regClass().as_linear())
1558 assert(preds
.size() > 0);
1561 if (!ctx
.sealed
[block
->index
]) {
1562 /* consider rename from already processed predecessor */
1563 Temp tmp
= read_variable(ctx
, val
, preds
[0]);
1565 /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */
1566 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
/* keep ctx.assignments in step with the newly allocated id */
1567 ctx
.assignments
.emplace_back();
1568 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1569 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1570 phi
->definitions
[0] = Definition(new_val
);
/* every operand is val for now */
1571 for (unsigned i
= 0; i
< preds
.size(); i
++)
1572 phi
->operands
[i
] = Operand(val
);
/* prefer the register of the first predecessor's name for the new phi */
1573 if (tmp
.regClass() == new_val
.regClass())
1574 ctx
.affinities
[new_val
.id()] = tmp
.id();
1576 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1577 ctx
.incomplete_phis
[block
->index
].emplace_back(phi
.get());
1578 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
1580 } else if (preds
.size() == 1) {
1581 /* if the block has only one predecessor, just look there for the name */
1582 new_val
= read_variable(ctx
, val
, preds
[0]);
1584 /* there are multiple predecessors and the block is sealed */
/* NOTE(review): an array sized by preds.size() is a variable-length array,
 * which is a compiler extension and not standard C++ */
1585 Temp ops
[preds
.size()];
1587 /* get the rename from each predecessor and check if they are the same */
1588 bool needs_phi
= false;
1589 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1590 ops
[i
] = read_variable(ctx
, val
, preds
[i
]);
1594 needs_phi
|= !(new_val
== ops
[i
]);
1598 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
1599 aco_opcode opcode
= val
.is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1600 aco_ptr
<Instruction
> phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1601 new_val
= Temp
{ctx
.program
->allocateId(), val
.regClass()};
1602 phi
->definitions
[0] = Definition(new_val
);
/* operands are the per-predecessor names, fixed to their assigned
 * registers; the affinity ends up pointing at the last operand whose
 * register class matches */
1603 for (unsigned i
= 0; i
< preds
.size(); i
++) {
1604 phi
->operands
[i
] = Operand(ops
[i
]);
1605 phi
->operands
[i
].setFixed(ctx
.assignments
[ops
[i
].id()].reg
);
1606 if (ops
[i
].regClass() == new_val
.regClass())
1607 ctx
.affinities
[new_val
.id()] = ops
[i
].id();
1609 ctx
.assignments
.emplace_back();
1610 assert(ctx
.assignments
.size() == ctx
.program
->peekAllocationId());
1611 ctx
.phi_map
.emplace(new_val
.id(), phi_info
{phi
.get(), block
->index
});
1612 block
->instructions
.insert(block
->instructions
.begin(), std::move(phi
));
/* record the rename for this block and remember the original name */
1616 if (new_val
!= val
) {
1617 ctx
.renames
[block
->index
][val
.id()] = new_val
;
1618 ctx
.orig_names
[new_val
.id()] = val
;
/* Removes the phi that defines temp if it is trivial, i.e. if all of its
 * operands are either one single other value ("same") or the phi's own
 * definition.
 * Nothing is done unless temp has an entry in ctx.phi_map and the phi's
 * block is sealed. For a trivial phi, the instructions recorded as its
 * users are visited, matching entries in ctx.renames of all blocks are
 * redirected to "same", the phi's definitions are cleared as a removal
 * marker, and its ctx.phi_map entry is erased. Phis among the users are then
 * re-examined recursively.
 * NOTE(review): the declaration of "same" and the statements that decide
 * non-triviality are not on the lines shown here. */
1623 void try_remove_trivial_phi(ra_ctx
& ctx
, Temp temp
)
1625 std::unordered_map
<unsigned, phi_info
>::iterator info
= ctx
.phi_map
.find(temp
.id());
1627 if (info
== ctx
.phi_map
.end() || !ctx
.sealed
[info
->second
.block_idx
])
1630 assert(info
->second
.block_idx
!= 0);
1631 Instruction
* phi
= info
->second
.phi
;
1633 Definition def
= phi
->definitions
[0];
1635 /* a phi node is trivial if all operands are the same as the definition of the phi */
1636 for (const Operand
& op
: phi
->operands
) {
1637 const Temp t
= op
.getTemp();
1638 if (t
== same
|| t
== def
.getTemp()) {
/* a self-reference must already live in the phi's own register */
1639 assert(t
== same
|| op
.physReg() == def
.physReg());
1647 assert(same
!= Temp() || same
== def
.getTemp());
1649 /* reroute all uses to same and remove phi */
1650 std::vector
<Temp
> phi_users
;
/* if "same" is itself defined by a phi, that phi's use set is extended with
 * the rerouted users below */
1651 std::unordered_map
<unsigned, phi_info
>::iterator same_phi_info
= ctx
.phi_map
.find(same
.id());
1652 for (Instruction
* instr
: info
->second
.uses
) {
1653 assert(phi
!= instr
);
1654 /* recursively try to remove trivial phis */
1655 if (is_phi(instr
)) {
1656 /* ignore if the phi was already flagged trivial */
1657 if (instr
->definitions
.empty())
/* remember phi users (other than temp itself) for the recursion at the end */
1660 if (instr
->definitions
[0].getTemp() != temp
)
1661 phi_users
.emplace_back(instr
->definitions
[0].getTemp());
1663 for (Operand
& op
: instr
->operands
) {
1664 if (op
.isTemp() && op
.tempId() == def
.tempId()) {
1666 if (same_phi_info
!= ctx
.phi_map
.end())
1667 same_phi_info
->second
.uses
.emplace(instr
);
/* redirect renames in every block that still map the original variable to
 * the removed phi's definition */
1672 auto it
= ctx
.orig_names
.find(same
.id());
1673 unsigned orig_var
= it
!= ctx
.orig_names
.end() ? it
->second
.id() : same
.id();
1674 for (unsigned i
= 0; i
< ctx
.program
->blocks
.size(); i
++) {
1675 auto it
= ctx
.renames
[i
].find(orig_var
);
1676 if (it
!= ctx
.renames
[i
].end() && it
->second
== def
.getTemp())
1677 ctx
.renames
[i
][orig_var
] = same
;
1680 phi
->definitions
.clear(); /* this indicates that the phi can be removed */
1681 ctx
.phi_map
.erase(info
);
1682 for (Temp t
: phi_users
)
1683 try_remove_trivial_phi(ctx
, t
);
1688 } /* end namespace */
1691 void register_allocation(Program
*program
, std::vector
<TempSet
>& live_out_per_block
)
1693 ra_ctx
ctx(program
);
1694 std::vector
<std::vector
<Temp
>> phi_ressources
;
1695 std::unordered_map
<unsigned, unsigned> temp_to_phi_ressources
;
1697 for (std::vector
<Block
>::reverse_iterator it
= program
->blocks
.rbegin(); it
!= program
->blocks
.rend(); it
++) {
1700 /* first, compute the death points of all live vars within the block */
1701 TempSet
& live
= live_out_per_block
[block
.index
];
1703 std::vector
<aco_ptr
<Instruction
>>::reverse_iterator rit
;
1704 for (rit
= block
.instructions
.rbegin(); rit
!= block
.instructions
.rend(); ++rit
) {
1705 aco_ptr
<Instruction
>& instr
= *rit
;
1706 if (is_phi(instr
)) {
1707 if (instr
->definitions
[0].isKill() || instr
->definitions
[0].isFixed()) {
1708 live
.erase(instr
->definitions
[0].getTemp());
1711 /* collect information about affinity-related temporaries */
1712 std::vector
<Temp
> affinity_related
;
1713 /* affinity_related[0] is the last seen affinity-related temp */
1714 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1715 affinity_related
.emplace_back(instr
->definitions
[0].getTemp());
1716 for (const Operand
& op
: instr
->operands
) {
1717 if (op
.isTemp() && op
.regClass() == instr
->definitions
[0].regClass()) {
1718 affinity_related
.emplace_back(op
.getTemp());
1719 temp_to_phi_ressources
[op
.tempId()] = phi_ressources
.size();
1722 phi_ressources
.emplace_back(std::move(affinity_related
));
1724 /* add vector affinities */
1725 if (instr
->opcode
== aco_opcode::p_create_vector
) {
1726 for (const Operand
& op
: instr
->operands
) {
1727 if (op
.isTemp() && op
.isFirstKill() && op
.getTemp().type() == instr
->definitions
[0].getTemp().type())
1728 ctx
.vectors
[op
.tempId()] = instr
.get();
1732 if (instr
->opcode
== aco_opcode::p_split_vector
&& instr
->operands
[0].isFirstKillBeforeDef())
1733 ctx
.split_vectors
[instr
->operands
[0].tempId()] = instr
.get();
1735 /* add operands to live variables */
1736 for (const Operand
& op
: instr
->operands
) {
1738 live
.emplace(op
.getTemp());
1742 /* erase definitions from live */
1743 for (unsigned i
= 0; i
< instr
->definitions
.size(); i
++) {
1744 const Definition
& def
= instr
->definitions
[i
];
1747 live
.erase(def
.getTemp());
1748 /* mark last-seen phi operand */
1749 std::unordered_map
<unsigned, unsigned>::iterator it
= temp_to_phi_ressources
.find(def
.tempId());
1750 if (it
!= temp_to_phi_ressources
.end() && def
.regClass() == phi_ressources
[it
->second
][0].regClass()) {
1751 phi_ressources
[it
->second
][0] = def
.getTemp();
1752 /* try to coalesce phi affinities with parallelcopies */
1753 Operand op
= Operand();
1754 if (!def
.isFixed() && instr
->opcode
== aco_opcode::p_parallelcopy
)
1755 op
= instr
->operands
[i
];
1756 else if ((instr
->opcode
== aco_opcode::v_mad_f32
||
1757 (instr
->opcode
== aco_opcode::v_fma_f32
&& program
->chip_class
>= GFX10
) ||
1758 instr
->opcode
== aco_opcode::v_mad_f16
||
1759 instr
->opcode
== aco_opcode::v_mad_legacy_f16
||
1760 (instr
->opcode
== aco_opcode::v_fma_f16
&& program
->chip_class
>= GFX10
)) && !instr
->usesModifiers())
1761 op
= instr
->operands
[2];
1763 if (op
.isTemp() && op
.isFirstKillBeforeDef() && def
.regClass() == op
.regClass()) {
1764 phi_ressources
[it
->second
].emplace_back(op
.getTemp());
1765 temp_to_phi_ressources
[op
.tempId()] = it
->second
;
1771 /* create affinities */
1772 for (std::vector
<Temp
>& vec
: phi_ressources
) {
1773 assert(vec
.size() > 1);
1774 for (unsigned i
= 1; i
< vec
.size(); i
++)
1775 if (vec
[i
].id() != vec
[0].id())
1776 ctx
.affinities
[vec
[i
].id()] = vec
[0].id();
1779 /* state of register file after phis */
1780 std::vector
<std::bitset
<128>> sgpr_live_in(program
->blocks
.size());
1782 for (Block
& block
: program
->blocks
) {
1783 TempSet
& live
= live_out_per_block
[block
.index
];
1784 /* initialize register file */
1785 assert(block
.index
!= 0 || live
.empty());
1786 RegisterFile register_file
;
1787 ctx
.war_hint
.reset();
1789 for (Temp t
: live
) {
1790 Temp renamed
= handle_live_in(ctx
, t
, &block
);
1791 assignment
& var
= ctx
.assignments
[renamed
.id()];
1792 /* due to live-range splits, the live-in might be a phi, now */
1794 register_file
.fill(Definition(renamed
.id(), var
.reg
, var
.rc
));
1797 std::vector
<aco_ptr
<Instruction
>> instructions
;
1798 std::vector
<aco_ptr
<Instruction
>>::iterator it
;
1800 /* this is a slight adjustment from the paper as we already have phi nodes:
1801 * We consider them incomplete phis and only handle the definition. */
1803 /* handle fixed phi definitions */
1804 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1805 aco_ptr
<Instruction
>& phi
= *it
;
1808 Definition
& definition
= phi
->definitions
[0];
1809 if (!definition
.isFixed())
1812 /* check if a dead exec mask phi is needed */
1813 if (definition
.isKill()) {
1814 for (Operand
& op
: phi
->operands
) {
1815 assert(op
.isTemp());
1816 if (!ctx
.assignments
[op
.tempId()].assigned
||
1817 ctx
.assignments
[op
.tempId()].reg
!= exec
) {
1818 definition
.setKill(false);
1824 if (definition
.isKill())
1827 assert(definition
.physReg() == exec
);
1828 assert(!register_file
.test(definition
.physReg(), definition
.bytes()));
1829 register_file
.fill(definition
);
1830 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1833 /* look up the affinities */
1834 for (it
= block
.instructions
.begin(); it
!= block
.instructions
.end(); ++it
) {
1835 aco_ptr
<Instruction
>& phi
= *it
;
1838 Definition
& definition
= phi
->definitions
[0];
1839 if (definition
.isKill() || definition
.isFixed())
1842 if (ctx
.affinities
.find(definition
.tempId()) != ctx
.affinities
.end() &&
1843 ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].assigned
) {
1844 assert(ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].rc
== definition
.regClass());
1845 PhysReg reg
= ctx
.assignments
[ctx
.affinities
[definition
.tempId()]].reg
;
1846 bool try_use_special_reg
= reg
== scc
|| reg
== exec
;
1847 if (try_use_special_reg
) {
1848 for (const Operand
& op
: phi
->operands
) {
1849 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
&&
1850 ctx
.assignments
[op
.tempId()].reg
== reg
)) {
1851 try_use_special_reg
= false;
1855 if (!try_use_special_reg
)
1858 /* only assign if register is still free */
1859 if (!register_file
.test(reg
, definition
.bytes())) {
1860 definition
.setFixed(reg
);
1861 register_file
.fill(definition
);
1862 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1867 /* find registers for phis without affinity or where the register was blocked */
1868 for (it
= block
.instructions
.begin();it
!= block
.instructions
.end(); ++it
) {
1869 aco_ptr
<Instruction
>& phi
= *it
;
1873 Definition
& definition
= phi
->definitions
[0];
1874 if (definition
.isKill())
1877 if (!definition
.isFixed()) {
1878 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1879 /* try to find a register that is used by at least one operand */
1880 for (const Operand
& op
: phi
->operands
) {
1881 if (!(op
.isTemp() && ctx
.assignments
[op
.tempId()].assigned
))
1883 PhysReg reg
= ctx
.assignments
[op
.tempId()].reg
;
1884 /* we tried this already on the previous loop */
1885 if (reg
== scc
|| reg
== exec
)
1887 if (get_reg_specified(ctx
, register_file
, definition
.regClass(), parallelcopy
, phi
, reg
)) {
1888 definition
.setFixed(reg
);
1892 if (!definition
.isFixed())
1893 definition
.setFixed(get_reg(ctx
, register_file
, definition
.getTemp(), parallelcopy
, phi
));
1895 /* process parallelcopy */
1896 for (std::pair
<Operand
, Definition
> pc
: parallelcopy
) {
1897 /* see if it's a copy from a different phi */
1898 //TODO: prefer moving some previous phis over live-ins
1899 //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec)
1900 Instruction
*prev_phi
= NULL
;
1901 std::vector
<aco_ptr
<Instruction
>>::iterator phi_it
;
1902 for (phi_it
= instructions
.begin(); phi_it
!= instructions
.end(); ++phi_it
) {
1903 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1904 prev_phi
= phi_it
->get();
1907 while (!prev_phi
&& is_phi(*++phi_it
)) {
1908 if ((*phi_it
)->definitions
[0].tempId() == pc
.first
.tempId())
1909 prev_phi
= phi_it
->get();
1912 /* if so, just update that phi's register */
1913 register_file
.clear(prev_phi
->definitions
[0]);
1914 prev_phi
->definitions
[0].setFixed(pc
.second
.physReg());
1915 ctx
.assignments
[prev_phi
->definitions
[0].tempId()] = {pc
.second
.physReg(), pc
.second
.regClass()};
1916 register_file
.fill(prev_phi
->definitions
[0]);
1921 std::unordered_map
<unsigned, Temp
>::iterator orig_it
= ctx
.orig_names
.find(pc
.first
.tempId());
1922 Temp orig
= pc
.first
.getTemp();
1923 if (orig_it
!= ctx
.orig_names
.end())
1924 orig
= orig_it
->second
;
1926 ctx
.orig_names
[pc
.second
.tempId()] = orig
;
1927 ctx
.renames
[block
.index
][orig
.id()] = pc
.second
.getTemp();
1929 /* otherwise, this is a live-in and we need to create a new phi
1930 * to move it in this block's predecessors */
1931 aco_opcode opcode
= pc
.first
.getTemp().is_linear() ? aco_opcode::p_linear_phi
: aco_opcode::p_phi
;
1932 std::vector
<unsigned>& preds
= pc
.first
.getTemp().is_linear() ? block
.linear_preds
: block
.logical_preds
;
1933 aco_ptr
<Instruction
> new_phi
{create_instruction
<Pseudo_instruction
>(opcode
, Format::PSEUDO
, preds
.size(), 1)};
1934 new_phi
->definitions
[0] = pc
.second
;
1935 for (unsigned i
= 0; i
< preds
.size(); i
++)
1936 new_phi
->operands
[i
] = Operand(pc
.first
);
1937 instructions
.emplace_back(std::move(new_phi
));
1940 register_file
.fill(definition
);
1941 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
1943 live
.emplace(definition
.getTemp());
1945 /* update phi affinities */
1946 for (const Operand
& op
: phi
->operands
) {
1947 if (op
.isTemp() && op
.regClass() == phi
->definitions
[0].regClass())
1948 ctx
.affinities
[op
.tempId()] = definition
.tempId();
1951 instructions
.emplace_back(std::move(*it
));
1954 /* fill in sgpr_live_in */
1955 for (unsigned i
= 0; i
<= ctx
.max_used_sgpr
; i
++)
1956 sgpr_live_in
[block
.index
][i
] = register_file
[i
];
1957 sgpr_live_in
[block
.index
][127] = register_file
[scc
.reg()];
1959 /* Handle all other instructions of the block */
1960 for (; it
!= block
.instructions
.end(); ++it
) {
1961 aco_ptr
<Instruction
>& instr
= *it
;
1963 /* parallelcopies from p_phi are inserted here which means
1964 * live ranges of killed operands end here as well */
1965 if (instr
->opcode
== aco_opcode::p_logical_end
) {
1966 /* no need to process this instruction any further */
1967 if (block
.logical_succs
.size() != 1) {
1968 instructions
.emplace_back(std::move(instr
));
1972 Block
& succ
= program
->blocks
[block
.logical_succs
[0]];
1974 for (; idx
< succ
.logical_preds
.size(); idx
++) {
1975 if (succ
.logical_preds
[idx
] == block
.index
)
1978 for (aco_ptr
<Instruction
>& phi
: succ
.instructions
) {
1979 if (phi
->opcode
== aco_opcode::p_phi
) {
1980 if (phi
->operands
[idx
].isTemp() &&
1981 phi
->operands
[idx
].getTemp().type() == RegType::sgpr
&&
1982 phi
->operands
[idx
].isFirstKillBeforeDef()) {
1983 Temp phi_op
= read_variable(ctx
, phi
->operands
[idx
].getTemp(), block
.index
);
1984 PhysReg reg
= ctx
.assignments
[phi_op
.id()].reg
;
1985 assert(register_file
[reg
] == phi_op
.id());
1986 register_file
[reg
] = 0;
1988 } else if (phi
->opcode
!= aco_opcode::p_linear_phi
) {
1992 instructions
.emplace_back(std::move(instr
));
1996 std::vector
<std::pair
<Operand
, Definition
>> parallelcopy
;
1998 assert(!is_phi(instr
));
2000 /* handle operands */
2001 for (unsigned i
= 0; i
< instr
->operands
.size(); ++i
) {
2002 auto& operand
= instr
->operands
[i
];
2003 if (!operand
.isTemp())
2006 /* rename operands */
2007 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), block
.index
));
2008 assert(ctx
.assignments
[operand
.tempId()].assigned
);
2010 PhysReg reg
= ctx
.assignments
[operand
.tempId()].reg
;
2011 if (operand_can_use_reg(program
->chip_class
, instr
, i
, reg
, operand
.regClass()))
2012 operand
.setFixed(reg
);
2014 get_reg_for_operand(ctx
, register_file
, parallelcopy
, instr
, operand
, i
);
2016 if (instr
->format
== Format::EXP
||
2017 (instr
->isVMEM() && i
== 3 && ctx
.program
->chip_class
== GFX6
) ||
2018 (instr
->format
== Format::DS
&& static_cast<DS_instruction
*>(instr
.get())->gds
)) {
2019 for (unsigned j
= 0; j
< operand
.size(); j
++)
2020 ctx
.war_hint
.set(operand
.physReg().reg() + j
);
2023 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
2024 if (phi
!= ctx
.phi_map
.end())
2025 phi
->second
.uses
.emplace(instr
.get());
2028 /* remove dead vars from register file */
2029 for (const Operand
& op
: instr
->operands
) {
2030 if (op
.isTemp() && op
.isFirstKillBeforeDef())
2031 register_file
.clear(op
);
2034 /* try to optimize v_mad_f32 -> v_mac_f32 */
2035 if ((instr
->opcode
== aco_opcode::v_mad_f32
||
2036 (instr
->opcode
== aco_opcode::v_fma_f32
&& program
->chip_class
>= GFX10
) ||
2037 instr
->opcode
== aco_opcode::v_mad_f16
||
2038 instr
->opcode
== aco_opcode::v_mad_legacy_f16
||
2039 (instr
->opcode
== aco_opcode::v_fma_f16
&& program
->chip_class
>= GFX10
)) &&
2040 instr
->operands
[2].isTemp() &&
2041 instr
->operands
[2].isKillBeforeDef() &&
2042 instr
->operands
[2].getTemp().type() == RegType::vgpr
&&
2043 instr
->operands
[1].isTemp() &&
2044 instr
->operands
[1].getTemp().type() == RegType::vgpr
&&
2045 !instr
->usesModifiers() &&
2046 instr
->operands
[0].physReg().byte() == 0 &&
2047 instr
->operands
[1].physReg().byte() == 0 &&
2048 instr
->operands
[2].physReg().byte() == 0) {
2049 unsigned def_id
= instr
->definitions
[0].tempId();
2050 auto it
= ctx
.affinities
.find(def_id
);
2051 if (it
== ctx
.affinities
.end() || !ctx
.assignments
[it
->second
].assigned
||
2052 instr
->operands
[2].physReg() == ctx
.assignments
[it
->second
].reg
||
2053 register_file
.test(ctx
.assignments
[it
->second
].reg
, instr
->operands
[2].bytes())) {
2054 instr
->format
= Format::VOP2
;
2055 switch (instr
->opcode
) {
2056 case aco_opcode::v_mad_f32
:
2057 instr
->opcode
= aco_opcode::v_mac_f32
;
2059 case aco_opcode::v_fma_f32
:
2060 instr
->opcode
= aco_opcode::v_fmac_f32
;
2062 case aco_opcode::v_mad_f16
:
2063 case aco_opcode::v_mad_legacy_f16
:
2064 instr
->opcode
= aco_opcode::v_mac_f16
;
2066 case aco_opcode::v_fma_f16
:
2067 instr
->opcode
= aco_opcode::v_fmac_f16
;
2075 /* handle definitions which must have the same register as an operand */
2076 if (instr
->opcode
== aco_opcode::v_interp_p2_f32
||
2077 instr
->opcode
== aco_opcode::v_mac_f32
||
2078 instr
->opcode
== aco_opcode::v_fmac_f32
||
2079 instr
->opcode
== aco_opcode::v_mac_f16
||
2080 instr
->opcode
== aco_opcode::v_fmac_f16
||
2081 instr
->opcode
== aco_opcode::v_writelane_b32
||
2082 instr
->opcode
== aco_opcode::v_writelane_b32_e64
) {
2083 instr
->definitions
[0].setFixed(instr
->operands
[2].physReg());
2084 } else if (instr
->opcode
== aco_opcode::s_addk_i32
||
2085 instr
->opcode
== aco_opcode::s_mulk_i32
) {
2086 instr
->definitions
[0].setFixed(instr
->operands
[0].physReg());
2087 } else if (instr
->format
== Format::MUBUF
&&
2088 instr
->definitions
.size() == 1 &&
2089 instr
->operands
.size() == 4) {
2090 instr
->definitions
[0].setFixed(instr
->operands
[3].physReg());
2091 } else if (instr
->format
== Format::MIMG
&&
2092 instr
->definitions
.size() == 1 &&
2093 instr
->operands
[1].regClass().type() == RegType::vgpr
) {
2094 instr
->definitions
[0].setFixed(instr
->operands
[1].physReg());
2097 ctx
.defs_done
.reset();
2099 /* handle fixed definitions first */
2100 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
2101 auto& definition
= instr
->definitions
[i
];
2102 if (!definition
.isFixed())
2105 adjust_max_used_regs(ctx
, definition
.regClass(), definition
.physReg());
2106 /* check if the target register is blocked */
2107 if (register_file
.test(definition
.physReg(), definition
.bytes())) {
2108 /* create parallelcopy pair to move blocking vars */
2109 std::set
<std::pair
<unsigned, unsigned>> vars
= collect_vars(ctx
, register_file
, definition
.physReg(), definition
.size());
2111 /* re-enable the killed operands, so that we don't move the blocking vars there */
2112 for (const Operand
& op
: instr
->operands
) {
2113 if (op
.isTemp() && op
.isFirstKillBeforeDef())
2114 register_file
.fill(op
);
2117 ASSERTED
bool success
= false;
2118 DefInfo
info(ctx
, instr
, definition
.regClass(), -1);
2119 success
= get_regs_for_copies(ctx
, register_file
, parallelcopy
,
2120 vars
, info
.lb
, info
.ub
, instr
,
2121 definition
.physReg(),
2122 definition
.physReg() + definition
.size() - 1);
2125 update_renames(ctx
, register_file
, parallelcopy
, instr
, false);
2127 /* once again, disable killed operands */
2128 for (const Operand
& op
: instr
->operands
) {
2129 if (op
.isTemp() && op
.isFirstKillBeforeDef())
2130 register_file
.clear(op
);
2132 for (unsigned k
= 0; k
< i
; k
++) {
2133 if (instr
->definitions
[k
].isTemp() && ctx
.defs_done
.test(k
) && !instr
->definitions
[k
].isKill())
2134 register_file
.fill(instr
->definitions
[k
]);
2137 ctx
.defs_done
.set(i
);
2139 if (!definition
.isTemp())
2142 /* set live if it has a kill point */
2143 if (!definition
.isKill())
2144 live
.emplace(definition
.getTemp());
2146 ctx
.assignments
[definition
.tempId()] = {definition
.physReg(), definition
.regClass()};
2147 register_file
.fill(definition
);
2150 /* handle all other definitions */
2151 for (unsigned i
= 0; i
< instr
->definitions
.size(); ++i
) {
2152 Definition
*definition
= &instr
->definitions
[i
];
2154 if (definition
->isFixed() || !definition
->isTemp())
2158 if (definition
->hasHint() && register_file
[definition
->physReg().reg()] == 0)
2159 definition
->setFixed(definition
->physReg());
2160 else if (instr
->opcode
== aco_opcode::p_split_vector
) {
2161 PhysReg reg
= instr
->operands
[0].physReg();
2162 for (unsigned j
= 0; j
< i
; j
++)
2163 reg
.reg_b
+= instr
->definitions
[j
].bytes();
2164 if (get_reg_specified(ctx
, register_file
, definition
->regClass(), parallelcopy
, instr
, reg
))
2165 definition
->setFixed(reg
);
2166 } else if (instr
->opcode
== aco_opcode::p_wqm
|| instr
->opcode
== aco_opcode::p_parallelcopy
) {
2167 PhysReg reg
= instr
->operands
[i
].physReg();
2168 if (instr
->operands
[i
].isTemp() &&
2169 instr
->operands
[i
].getTemp().type() == definition
->getTemp().type() &&
2170 !register_file
.test(reg
, definition
->bytes()))
2171 definition
->setFixed(reg
);
2172 } else if (instr
->opcode
== aco_opcode::p_extract_vector
) {
2174 if (instr
->operands
[0].isKillBeforeDef() &&
2175 instr
->operands
[0].getTemp().type() == definition
->getTemp().type()) {
2176 reg
= instr
->operands
[0].physReg();
2177 reg
.reg_b
+= definition
->bytes() * instr
->operands
[1].constantValue();
2178 assert(!register_file
.test(reg
, definition
->bytes()));
2179 definition
->setFixed(reg
);
2181 } else if (instr
->opcode
== aco_opcode::p_create_vector
) {
2182 PhysReg reg
= get_reg_create_vector(ctx
, register_file
, definition
->getTemp(),
2183 parallelcopy
, instr
);
2184 definition
->setFixed(reg
);
2187 if (!definition
->isFixed()) {
2188 Temp tmp
= definition
->getTemp();
2189 if (definition
->regClass().is_subdword() && definition
->bytes() < 4) {
2190 PhysReg reg
= get_reg(ctx
, register_file
, tmp
, parallelcopy
, instr
);
2191 bool partial
= !(tmp
.bytes() <= 4 && reg
.byte() == 0 && !register_file
.test(reg
, 4));
2192 add_subdword_definition(program
, instr
, i
, reg
, partial
);
2193 definition
= &instr
->definitions
[i
]; /* add_subdword_definition can invalidate the reference */
2195 definition
->setFixed(get_reg(ctx
, register_file
, tmp
, parallelcopy
, instr
));
2199 assert(definition
->isFixed() && ((definition
->getTemp().type() == RegType::vgpr
&& definition
->physReg() >= 256) ||
2200 (definition
->getTemp().type() != RegType::vgpr
&& definition
->physReg() < 256)));
2201 ctx
.defs_done
.set(i
);
2203 /* set live if it has a kill point */
2204 if (!definition
->isKill())
2205 live
.emplace(definition
->getTemp());
2207 ctx
.assignments
[definition
->tempId()] = {definition
->physReg(), definition
->regClass()};
2208 register_file
.fill(*definition
);
2211 handle_pseudo(ctx
, register_file
, instr
.get());
2213 /* kill definitions and late-kill operands and ensure that sub-dword operands can actually be read */
2214 for (const Definition
& def
: instr
->definitions
) {
2215 if (def
.isTemp() && def
.isKill())
2216 register_file
.clear(def
);
2218 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2219 const Operand
& op
= instr
->operands
[i
];
2220 if (op
.isTemp() && op
.isFirstKill() && op
.isLateKill())
2221 register_file
.clear(op
);
2222 if (op
.isTemp() && op
.physReg().byte() != 0)
2223 add_subdword_operand(program
->chip_class
, instr
, i
, op
.physReg().byte(), op
.regClass());
2226 /* emit parallelcopy */
2227 if (!parallelcopy
.empty()) {
2228 aco_ptr
<Pseudo_instruction
> pc
;
2229 pc
.reset(create_instruction
<Pseudo_instruction
>(aco_opcode::p_parallelcopy
, Format::PSEUDO
, parallelcopy
.size(), parallelcopy
.size()));
2230 bool temp_in_scc
= register_file
[scc
.reg()];
2231 bool sgpr_operands_alias_defs
= false;
2232 uint64_t sgpr_operands
[4] = {0, 0, 0, 0};
2233 for (unsigned i
= 0; i
< parallelcopy
.size(); i
++) {
2234 if (temp_in_scc
&& parallelcopy
[i
].first
.isTemp() && parallelcopy
[i
].first
.getTemp().type() == RegType::sgpr
) {
2235 if (!sgpr_operands_alias_defs
) {
2236 unsigned reg
= parallelcopy
[i
].first
.physReg().reg();
2237 unsigned size
= parallelcopy
[i
].first
.getTemp().size();
2238 sgpr_operands
[reg
/ 64u] |= u_bit_consecutive64(reg
% 64u, size
);
2240 reg
= parallelcopy
[i
].second
.physReg().reg();
2241 size
= parallelcopy
[i
].second
.getTemp().size();
2242 if (sgpr_operands
[reg
/ 64u] & u_bit_consecutive64(reg
% 64u, size
))
2243 sgpr_operands_alias_defs
= true;
2247 pc
->operands
[i
] = parallelcopy
[i
].first
;
2248 pc
->definitions
[i
] = parallelcopy
[i
].second
;
2249 assert(pc
->operands
[i
].size() == pc
->definitions
[i
].size());
2251 /* it might happen that the operand is already renamed. we have to restore the original name. */
2252 std::unordered_map
<unsigned, Temp
>::iterator it
= ctx
.orig_names
.find(pc
->operands
[i
].tempId());
2253 Temp orig
= it
!= ctx
.orig_names
.end() ? it
->second
: pc
->operands
[i
].getTemp();
2254 ctx
.orig_names
[pc
->definitions
[i
].tempId()] = orig
;
2255 ctx
.renames
[block
.index
][orig
.id()] = pc
->definitions
[i
].getTemp();
2257 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(pc
->operands
[i
].tempId());
2258 if (phi
!= ctx
.phi_map
.end())
2259 phi
->second
.uses
.emplace(pc
.get());
2262 if (temp_in_scc
&& sgpr_operands_alias_defs
) {
2263 /* disable definitions and re-enable operands */
2264 for (const Definition
& def
: instr
->definitions
) {
2265 if (def
.isTemp() && !def
.isKill())
2266 register_file
.clear(def
);
2268 for (const Operand
& op
: instr
->operands
) {
2269 if (op
.isTemp() && op
.isFirstKill())
2270 register_file
.block(op
.physReg(), op
.regClass());
2273 handle_pseudo(ctx
, register_file
, pc
.get());
2275 /* re-enable live vars */
2276 for (const Operand
& op
: instr
->operands
) {
2277 if (op
.isTemp() && op
.isFirstKill())
2278 register_file
.clear(op
);
2280 for (const Definition
& def
: instr
->definitions
) {
2281 if (def
.isTemp() && !def
.isKill())
2282 register_file
.fill(def
);
2285 pc
->tmp_in_scc
= false;
2288 instructions
.emplace_back(std::move(pc
));
2291 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
2292 bool instr_needs_vop3
= !instr
->isVOP3() &&
2293 ((instr
->format
== Format::VOPC
&& !(instr
->definitions
[0].physReg() == vcc
)) ||
2294 (instr
->opcode
== aco_opcode::v_cndmask_b32
&& !(instr
->operands
[2].physReg() == vcc
)) ||
2295 ((instr
->opcode
== aco_opcode::v_add_co_u32
||
2296 instr
->opcode
== aco_opcode::v_addc_co_u32
||
2297 instr
->opcode
== aco_opcode::v_sub_co_u32
||
2298 instr
->opcode
== aco_opcode::v_subb_co_u32
||
2299 instr
->opcode
== aco_opcode::v_subrev_co_u32
||
2300 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
2301 !(instr
->definitions
[1].physReg() == vcc
)) ||
2302 ((instr
->opcode
== aco_opcode::v_addc_co_u32
||
2303 instr
->opcode
== aco_opcode::v_subb_co_u32
||
2304 instr
->opcode
== aco_opcode::v_subbrev_co_u32
) &&
2305 !(instr
->operands
[2].physReg() == vcc
)));
2306 if (instr_needs_vop3
) {
2308 /* if the first operand is a literal, we have to move it to a reg */
2309 if (instr
->operands
.size() && instr
->operands
[0].isLiteral() && program
->chip_class
< GFX10
) {
2310 bool can_sgpr
= true;
2311 /* check, if we have to move to vgpr */
2312 for (const Operand
& op
: instr
->operands
) {
2313 if (op
.isTemp() && op
.getTemp().type() == RegType::sgpr
) {
2318 /* disable definitions and re-enable operands */
2319 for (const Definition
& def
: instr
->definitions
)
2320 register_file
.clear(def
);
2321 for (const Operand
& op
: instr
->operands
) {
2322 if (op
.isTemp() && op
.isFirstKill())
2323 register_file
.block(op
.physReg(), op
.regClass());
2325 Temp tmp
= {program
->allocateId(), can_sgpr
? s1
: v1
};
2326 ctx
.assignments
.emplace_back();
2327 PhysReg reg
= get_reg(ctx
, register_file
, tmp
, parallelcopy
, instr
);
2329 aco_ptr
<Instruction
> mov
;
2331 mov
.reset(create_instruction
<SOP1_instruction
>(aco_opcode::s_mov_b32
, Format::SOP1
, 1, 1));
2333 mov
.reset(create_instruction
<VOP1_instruction
>(aco_opcode::v_mov_b32
, Format::VOP1
, 1, 1));
2334 mov
->operands
[0] = instr
->operands
[0];
2335 mov
->definitions
[0] = Definition(tmp
);
2336 mov
->definitions
[0].setFixed(reg
);
2338 instr
->operands
[0] = Operand(tmp
);
2339 instr
->operands
[0].setFixed(reg
);
2340 instructions
.emplace_back(std::move(mov
));
2341 /* re-enable live vars */
2342 for (const Operand
& op
: instr
->operands
) {
2343 if (op
.isTemp() && op
.isFirstKill())
2344 register_file
.clear(op
);
2346 for (const Definition
& def
: instr
->definitions
) {
2347 if (def
.isTemp() && !def
.isKill())
2348 register_file
.fill(def
);
2352 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
2353 aco_ptr
<Instruction
> tmp
= std::move(instr
);
2354 Format format
= asVOP3(tmp
->format
);
2355 instr
.reset(create_instruction
<VOP3A_instruction
>(tmp
->opcode
, format
, tmp
->operands
.size(), tmp
->definitions
.size()));
2356 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2357 Operand
& operand
= tmp
->operands
[i
];
2358 instr
->operands
[i
] = operand
;
2359 /* keep phi_map up to date */
2360 if (operand
.isTemp()) {
2361 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.tempId());
2362 if (phi
!= ctx
.phi_map
.end()) {
2363 phi
->second
.uses
.erase(tmp
.get());
2364 phi
->second
.uses
.emplace(instr
.get());
2368 std::copy(tmp
->definitions
.begin(), tmp
->definitions
.end(), instr
->definitions
.begin());
2371 instructions
.emplace_back(std::move(*it
));
2373 } /* end for Instr */
2375 block
.instructions
= std::move(instructions
);
2377 ctx
.filled
[block
.index
] = true;
2378 for (unsigned succ_idx
: block
.linear_succs
) {
2379 Block
& succ
= program
->blocks
[succ_idx
];
2380 /* seal block if all predecessors are filled */
2381 bool all_filled
= true;
2382 for (unsigned pred_idx
: succ
.linear_preds
) {
2383 if (!ctx
.filled
[pred_idx
]) {
2389 ctx
.sealed
[succ_idx
] = true;
2391 /* finish incomplete phis and check if they became trivial */
2392 for (Instruction
* phi
: ctx
.incomplete_phis
[succ_idx
]) {
2393 std::vector
<unsigned> preds
= phi
->definitions
[0].getTemp().is_linear() ? succ
.linear_preds
: succ
.logical_preds
;
2394 for (unsigned i
= 0; i
< phi
->operands
.size(); i
++) {
2395 phi
->operands
[i
].setTemp(read_variable(ctx
, phi
->operands
[i
].getTemp(), preds
[i
]));
2396 phi
->operands
[i
].setFixed(ctx
.assignments
[phi
->operands
[i
].tempId()].reg
);
2398 try_remove_trivial_phi(ctx
, phi
->definitions
[0].getTemp());
2400 /* complete the original phi nodes, but no need to check triviality */
2401 for (aco_ptr
<Instruction
>& instr
: succ
.instructions
) {
2404 std::vector
<unsigned> preds
= instr
->opcode
== aco_opcode::p_phi
? succ
.logical_preds
: succ
.linear_preds
;
2406 for (unsigned i
= 0; i
< instr
->operands
.size(); i
++) {
2407 auto& operand
= instr
->operands
[i
];
2408 if (!operand
.isTemp())
2410 operand
.setTemp(read_variable(ctx
, operand
.getTemp(), preds
[i
]));
2411 operand
.setFixed(ctx
.assignments
[operand
.tempId()].reg
);
2412 std::unordered_map
<unsigned, phi_info
>::iterator phi
= ctx
.phi_map
.find(operand
.getTemp().id());
2413 if (phi
!= ctx
.phi_map
.end())
2414 phi
->second
.uses
.emplace(instr
.get());
2421 /* remove trivial phis */
2422 for (Block
& block
: program
->blocks
) {
2423 auto end
= std::find_if(block
.instructions
.begin(), block
.instructions
.end(),
2424 [](aco_ptr
<Instruction
>& instr
) { return !is_phi(instr
);});
2425 auto middle
= std::remove_if(block
.instructions
.begin(), end
,
2426 [](const aco_ptr
<Instruction
>& instr
) { return instr
->definitions
.empty();});
2427 block
.instructions
.erase(middle
, end
);
2430 /* find scc spill registers which may be needed for parallelcopies created by phis */
2431 for (Block
& block
: program
->blocks
) {
2432 if (block
.linear_preds
.size() <= 1)
2435 std::bitset
<128> regs
= sgpr_live_in
[block
.index
];
2439 /* choose a register */
2441 for (; reg
< ctx
.program
->max_reg_demand
.sgpr
&& regs
[reg
]; reg
++)
2443 assert(reg
< ctx
.program
->max_reg_demand
.sgpr
);
2444 adjust_max_used_regs(ctx
, s1
, reg
);
2446 /* update predecessors */
2447 for (unsigned& pred_index
: block
.linear_preds
) {
2448 Block
& pred
= program
->blocks
[pred_index
];
2449 pred
.scc_live_out
= true;
2450 pred
.scratch_sgpr
= PhysReg
{(uint16_t)reg
};
2454 /* num_gpr = rnd_up(max_used_gpr + 1) */
2455 program
->config
->num_vgprs
= align(ctx
.max_used_vgpr
+ 1, 4);
2456 if (program
->family
== CHIP_TONGA
|| program
->family
== CHIP_ICELAND
) /* workaround hardware bug */
2457 program
->config
->num_sgprs
= get_sgpr_alloc(program
, program
->sgpr_limit
);
2459 program
->config
->num_sgprs
= align(ctx
.max_used_sgpr
+ 1 + get_extra_sgprs(program
), 8);