/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "vc4_context.h"
/* Expands to a positional brace initializer whose first member is the
 * QPU_MUX_<file> constant (built by token pasting) and whose second member
 * is `index`.  Used for the entries of the vc4_regs[] table of
 * struct qpu_reg.
 */
#define QPU_R(file, index) { QPU_MUX_##file, index }
32 static const struct qpu_reg vc4_regs
[] = {
/* Index in vc4_regs[] of the first of the 64 trailing entries; it sits five
 * slots after ACC_INDEX.  vc4_alloc_reg_set() asserts that exactly 64 entries
 * follow this index and that the first two of them have addr 0.
 */
#define AB_INDEX (ACC_INDEX + 5)
107 vc4_alloc_reg_set(struct vc4_context
*vc4
)
109 assert(vc4_regs
[AB_INDEX
].addr
== 0);
110 assert(vc4_regs
[AB_INDEX
+ 1].addr
== 0);
111 STATIC_ASSERT(ARRAY_SIZE(vc4_regs
) == AB_INDEX
+ 64);
116 vc4
->regs
= ra_alloc_reg_set(vc4
, ARRAY_SIZE(vc4_regs
), true);
118 vc4
->reg_class_any
= ra_alloc_reg_class(vc4
->regs
);
119 vc4
->reg_class_a_or_b_or_acc
= ra_alloc_reg_class(vc4
->regs
);
120 vc4
->reg_class_r4_or_a
= ra_alloc_reg_class(vc4
->regs
);
121 vc4
->reg_class_a
= ra_alloc_reg_class(vc4
->regs
);
122 for (uint32_t i
= 0; i
< ARRAY_SIZE(vc4_regs
); i
++) {
123 /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
126 if (vc4_regs
[i
].addr
== 31)
129 /* R4 can't be written as a general purpose register. (it's
130 * TMU_NOSWAP as a write address).
132 if (vc4_regs
[i
].mux
== QPU_MUX_R4
) {
133 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_r4_or_a
, i
);
134 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_any
, i
);
138 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_any
, i
);
139 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b_or_acc
, i
);
142 for (uint32_t i
= AB_INDEX
; i
< AB_INDEX
+ 64; i
+= 2) {
143 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a
, i
);
144 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_r4_or_a
, i
);
147 ra_set_finalize(vc4
->regs
, NULL
);
150 struct node_to_temp_map
{
156 node_to_temp_priority(const void *in_a
, const void *in_b
)
158 const struct node_to_temp_map
*a
= in_a
;
159 const struct node_to_temp_map
*b
= in_b
;
161 return a
->priority
- b
->priority
;
/* Per-temp bitmask of the register files a temp may still be placed in.
 * vc4_register_allocate() starts every temp with all three bits set and
 * clears bits as instructions rule files out; the resulting combination
 * selects the temp's register class.
 */
#define CLASS_BIT_A                     (1 << 0)
#define CLASS_BIT_B_OR_ACC              (1 << 1)
#define CLASS_BIT_R4                    (1 << 2)
/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
174 vc4_register_allocate(struct vc4_context
*vc4
, struct vc4_compile
*c
)
176 struct node_to_temp_map map
[c
->num_temps
];
177 uint32_t temp_to_node
[c
->num_temps
];
178 uint32_t def
[c
->num_temps
];
179 uint32_t use
[c
->num_temps
];
180 uint8_t class_bits
[c
->num_temps
];
181 struct qpu_reg
*temp_registers
= calloc(c
->num_temps
,
182 sizeof(*temp_registers
));
183 for (int i
= 0; i
< ARRAY_SIZE(def
); i
++)
185 memset(use
, 0, sizeof(use
));
187 /* If things aren't ever written (undefined values), just read from
190 for (uint32_t i
= 0; i
< c
->num_temps
; i
++)
191 temp_registers
[i
] = qpu_rn(0);
193 vc4_alloc_reg_set(vc4
);
195 struct ra_graph
*g
= ra_alloc_interference_graph(vc4
->regs
,
198 /* Compute the live ranges so we can figure out interference.
201 list_for_each_entry(struct qinst
, inst
, &c
->instructions
, link
) {
202 if (inst
->dst
.file
== QFILE_TEMP
) {
203 def
[inst
->dst
.index
] = MIN2(ip
, def
[inst
->dst
.index
]);
204 use
[inst
->dst
.index
] = ip
;
207 for (int i
= 0; i
< qir_get_op_nsrc(inst
->op
); i
++) {
208 if (inst
->src
[i
].file
== QFILE_TEMP
)
209 use
[inst
->src
[i
].index
] = ip
;
215 /* The payload registers have values implicitly loaded
216 * at the start of the program.
218 def
[inst
->dst
.index
] = 0;
227 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
229 map
[i
].priority
= use
[i
] - def
[i
];
231 qsort(map
, c
->num_temps
, sizeof(map
[0]), node_to_temp_priority
);
232 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
233 temp_to_node
[map
[i
].temp
] = i
;
236 /* Figure out our register classes and preallocated registers. We
237 * start with any temp being able to be in any file, then instructions
238 * incrementally remove bits that the temp definitely can't be in.
241 CLASS_BIT_A
| CLASS_BIT_B_OR_ACC
| CLASS_BIT_R4
,
245 list_for_each_entry(struct qinst
, inst
, &c
->instructions
, link
) {
246 if (qir_writes_r4(inst
)) {
247 /* This instruction writes r4 (and optionally moves
248 * its result to a temp), so nothing else can be
249 * stored in r4 across it.
251 for (int i
= 0; i
< c
->num_temps
; i
++) {
252 if (def
[i
] < ip
&& use
[i
] > ip
)
253 class_bits
[i
] &= ~CLASS_BIT_R4
;
256 /* R4 can't be written as a general purpose
257 * register. (it's TMU_NOSWAP as a write address).
259 if (inst
->dst
.file
== QFILE_TEMP
)
260 class_bits
[inst
->dst
.index
] &= ~CLASS_BIT_R4
;
265 ra_set_node_reg(g
, temp_to_node
[inst
->dst
.index
],
266 AB_INDEX
+ QPU_R_FRAG_PAYLOAD_ZW
* 2 + 1);
270 ra_set_node_reg(g
, temp_to_node
[inst
->dst
.index
],
271 AB_INDEX
+ QPU_R_FRAG_PAYLOAD_ZW
* 2);
278 if (inst
->dst
.pack
&& !qir_is_mul(inst
)) {
279 /* The non-MUL pack flags require an A-file dst
282 class_bits
[inst
->dst
.index
] &= CLASS_BIT_A
;
285 if (qir_src_needs_a_file(inst
)) {
286 if (qir_is_float_input(inst
)) {
287 /* Special case: these can be done as R4
290 class_bits
[inst
->src
[0].index
] &= (CLASS_BIT_A
|
293 class_bits
[inst
->src
[0].index
] &= CLASS_BIT_A
;
299 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
300 int node
= temp_to_node
[i
];
302 switch (class_bits
[i
]) {
303 case CLASS_BIT_A
| CLASS_BIT_B_OR_ACC
| CLASS_BIT_R4
:
304 ra_set_node_class(g
, node
, vc4
->reg_class_any
);
306 case CLASS_BIT_A
| CLASS_BIT_B_OR_ACC
:
307 ra_set_node_class(g
, node
, vc4
->reg_class_a_or_b_or_acc
);
309 case CLASS_BIT_A
| CLASS_BIT_R4
:
310 ra_set_node_class(g
, node
, vc4
->reg_class_r4_or_a
);
313 ra_set_node_class(g
, node
, vc4
->reg_class_a
);
316 fprintf(stderr
, "temp %d: bad class bits: 0x%x\n",
323 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
324 for (uint32_t j
= i
+ 1; j
< c
->num_temps
; j
++) {
325 if (!(def
[i
] >= use
[j
] || def
[j
] >= use
[i
])) {
326 ra_add_node_interference(g
,
333 bool ok
= ra_allocate(g
);
335 fprintf(stderr
, "Failed to register allocate:\n");
340 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
341 temp_registers
[i
] = vc4_regs
[ra_get_node_reg(g
, temp_to_node
[i
])];
343 /* If the value's never used, just write to the NOP register
344 * for clarity in debug output.
346 if (def
[i
] == use
[i
])
347 temp_registers
[i
] = qpu_ra(QPU_W_NOP
);
352 return temp_registers
;