2 * Copyright © 2014 Broadcom
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "vc4_context.h"
30 #define QPU_R(file, index) { QPU_MUX_##file, index }
32 static const struct qpu_reg vc4_regs
[] = {
105 #define AB_INDEX (ACC_INDEX + ACC_COUNT)
109 vc4_alloc_reg_set(struct vc4_context
*vc4
)
111 assert(vc4_regs
[AB_INDEX
].addr
== 0);
112 assert(vc4_regs
[AB_INDEX
+ 1].addr
== 0);
113 STATIC_ASSERT(ARRAY_SIZE(vc4_regs
) == AB_INDEX
+ 64);
118 vc4
->regs
= ra_alloc_reg_set(vc4
, ARRAY_SIZE(vc4_regs
), true);
120 /* The physical regfiles split us into two classes, with [0] being the
121 * whole space and [1] being the bottom half (for threaded fragment
124 for (int i
= 0; i
< 2; i
++) {
125 vc4
->reg_class_any
[i
] = ra_alloc_reg_class(vc4
->regs
);
126 vc4
->reg_class_a_or_b
[i
] = ra_alloc_reg_class(vc4
->regs
);
127 vc4
->reg_class_a_or_b_or_acc
[i
] = ra_alloc_reg_class(vc4
->regs
);
128 vc4
->reg_class_r4_or_a
[i
] = ra_alloc_reg_class(vc4
->regs
);
129 vc4
->reg_class_a
[i
] = ra_alloc_reg_class(vc4
->regs
);
131 vc4
->reg_class_r0_r3
= ra_alloc_reg_class(vc4
->regs
);
134 for (uint32_t i
= ACC_INDEX
; i
< ACC_INDEX
+ 4; i
++) {
135 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_r0_r3
, i
);
136 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b_or_acc
[0], i
);
137 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b_or_acc
[1], i
);
140 /* R4 gets a special class because it can't be written as a general
141 * purpose register. (it's TMU_NOSWAP as a write address).
143 for (int i
= 0; i
< 2; i
++) {
144 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_r4_or_a
[i
],
146 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_any
[i
],
151 for (uint32_t i
= AB_INDEX
; i
< AB_INDEX
+ 64; i
++) {
152 /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in
155 if (vc4_regs
[i
].addr
== 14)
158 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_any
[0], i
);
159 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b
[0], i
);
160 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b_or_acc
[0], i
);
162 if (vc4_regs
[i
].addr
< 16) {
163 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_any
[1], i
);
164 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b
[1], i
);
165 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a_or_b_or_acc
[1], i
);
170 if (((i
- AB_INDEX
) & 1) == 0) {
171 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_a
[0], i
);
172 ra_class_add_reg(vc4
->regs
, vc4
->reg_class_r4_or_a
[0], i
);
174 if (vc4_regs
[i
].addr
< 16) {
175 ra_class_add_reg(vc4
->regs
,
176 vc4
->reg_class_a
[1], i
);
177 ra_class_add_reg(vc4
->regs
,
178 vc4
->reg_class_r4_or_a
[1], i
);
183 ra_set_finalize(vc4
->regs
, NULL
);
/* Pairs a QFILE_TEMP index with its allocation priority (the length of its
 * live range).
 */
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

/**
 * qsort() comparator ordering entries by ascending priority.
 *
 * Compares explicitly instead of subtracting: priorities are uint32_t, and
 * "a->priority - b->priority" narrowed to int could wrap for very large
 * live ranges.  qsort() only needs the sign, which this preserves.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        if (a->priority < b->priority)
                return -1;
        if (a->priority > b->priority)
                return 1;
        return 0;
}
/* Bitmask of which register files a temp may still live in.  Instructions
 * incrementally clear bits as they impose restrictions.
 * NOTE(review): (1 << 3) is unused and R0_R3 sits at bit 4 -- confirm no
 * consumer depends on bit 3.
 */
#define CLASS_BIT_A                     (1 << 0)
#define CLASS_BIT_B                     (1 << 1)
#define CLASS_BIT_R4                    (1 << 2)
#define CLASS_BIT_R0_R3                 (1 << 4)

/* Round-robin state for vc4_ra_select_callback(): the next accumulator and
 * next A/B physical-register offsets to try first.
 */
struct vc4_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_ab;
};
211 vc4_ra_select_callback(struct ra_graph
*g
, BITSET_WORD
*regs
, void *data
)
213 struct vc4_ra_select_callback_data
*vc4_ra
= data
;
215 /* If r4 is available, always choose it -- few other things can go
216 * there, and choosing anything else means inserting a mov.
218 if (BITSET_TEST(regs
, ACC_INDEX
+ 4))
219 return ACC_INDEX
+ 4;
221 /* Choose an accumulator if possible (no delay between write and
222 * read), but round-robin through them to give post-RA instruction
223 * selection more options.
225 for (int i
= 0; i
< ACC_COUNT
; i
++) {
226 int acc_off
= (vc4_ra
->next_acc
+ i
) % ACC_COUNT
;
227 int acc
= ACC_INDEX
+ acc_off
;
229 if (BITSET_TEST(regs
, acc
)) {
230 vc4_ra
->next_acc
= acc_off
+ 1;
235 for (int i
= 0; i
< AB_COUNT
; i
++) {
236 int ab_off
= (vc4_ra
->next_ab
+ i
) % AB_COUNT
;
237 int ab
= AB_INDEX
+ ab_off
;
239 if (BITSET_TEST(regs
, ab
)) {
240 vc4_ra
->next_ab
= ab_off
+ 1;
245 unreachable("RA must pass us at least one possible reg.");
249 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
251 * The return value should be freed by the caller.
254 vc4_register_allocate(struct vc4_context
*vc4
, struct vc4_compile
*c
)
256 struct node_to_temp_map map
[c
->num_temps
];
257 uint32_t temp_to_node
[c
->num_temps
];
258 uint8_t class_bits
[c
->num_temps
];
259 struct qpu_reg
*temp_registers
= calloc(c
->num_temps
,
260 sizeof(*temp_registers
));
261 struct vc4_ra_select_callback_data callback_data
= {
266 /* If things aren't ever written (undefined values), just read from
269 for (uint32_t i
= 0; i
< c
->num_temps
; i
++)
270 temp_registers
[i
] = qpu_rn(0);
272 vc4_alloc_reg_set(vc4
);
274 struct ra_graph
*g
= ra_alloc_interference_graph(vc4
->regs
,
277 /* Compute the live ranges so we can figure out interference. */
278 qir_calculate_live_intervals(c
);
280 ra_set_select_reg_callback(g
, vc4_ra_select_callback
, &callback_data
);
282 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
284 map
[i
].priority
= c
->temp_end
[i
] - c
->temp_start
[i
];
286 qsort(map
, c
->num_temps
, sizeof(map
[0]), node_to_temp_priority
);
287 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
288 temp_to_node
[map
[i
].temp
] = i
;
291 /* Figure out our register classes and preallocated registers. We
292 * start with any temp being able to be in any file, then instructions
293 * incrementally remove bits that the temp definitely can't be in.
296 CLASS_BIT_A
| CLASS_BIT_B
| CLASS_BIT_R4
| CLASS_BIT_R0_R3
,
300 qir_for_each_inst_inorder(inst
, c
) {
301 if (qir_writes_r4(inst
)) {
302 /* This instruction writes r4 (and optionally moves
303 * its result to a temp), so nothing else can be
304 * stored in r4 across it.
306 for (int i
= 0; i
< c
->num_temps
; i
++) {
307 if (c
->temp_start
[i
] < ip
&& c
->temp_end
[i
] > ip
)
308 class_bits
[i
] &= ~CLASS_BIT_R4
;
311 /* If we're doing a conditional write of something
312 * writing R4 (math, tex results), then make sure that
313 * we store in a temp so that we actually
314 * conditionally move the result.
316 if (inst
->cond
!= QPU_COND_ALWAYS
)
317 class_bits
[inst
->dst
.index
] &= ~CLASS_BIT_R4
;
319 /* R4 can't be written as a general purpose
320 * register. (it's TMU_NOSWAP as a write address).
322 if (inst
->dst
.file
== QFILE_TEMP
)
323 class_bits
[inst
->dst
.index
] &= ~CLASS_BIT_R4
;
328 ra_set_node_reg(g
, temp_to_node
[inst
->dst
.index
],
329 AB_INDEX
+ QPU_R_FRAG_PAYLOAD_ZW
* 2 + 1);
333 ra_set_node_reg(g
, temp_to_node
[inst
->dst
.index
],
334 AB_INDEX
+ QPU_R_FRAG_PAYLOAD_ZW
* 2);
338 assert(inst
->src
[0].file
== QFILE_TEMP
);
339 class_bits
[inst
->src
[0].index
] &= CLASS_BIT_R0_R3
;
343 /* All accumulators are invalidated across a thread
346 for (int i
= 0; i
< c
->num_temps
; i
++) {
347 if (c
->temp_start
[i
] < ip
&& c
->temp_end
[i
] > ip
)
348 class_bits
[i
] &= ~(CLASS_BIT_R0_R3
|
357 if (inst
->dst
.pack
&& !qir_is_mul(inst
)) {
358 /* The non-MUL pack flags require an A-file dst
361 class_bits
[inst
->dst
.index
] &= CLASS_BIT_A
;
364 /* Apply restrictions for src unpacks. The integer unpacks
365 * can only be done from regfile A, while float unpacks can be
368 for (int i
= 0; i
< qir_get_nsrc(inst
); i
++) {
369 if (inst
->src
[i
].file
== QFILE_TEMP
&&
371 if (qir_is_float_input(inst
)) {
372 class_bits
[inst
->src
[i
].index
] &=
373 CLASS_BIT_A
| CLASS_BIT_R4
;
375 class_bits
[inst
->src
[i
].index
] &=
384 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
385 int node
= temp_to_node
[i
];
387 switch (class_bits
[i
]) {
388 case CLASS_BIT_A
| CLASS_BIT_B
| CLASS_BIT_R4
| CLASS_BIT_R0_R3
:
389 ra_set_node_class(g
, node
,
390 vc4
->reg_class_any
[c
->fs_threaded
]);
392 case CLASS_BIT_A
| CLASS_BIT_B
:
393 ra_set_node_class(g
, node
,
394 vc4
->reg_class_a_or_b
[c
->fs_threaded
]);
396 case CLASS_BIT_A
| CLASS_BIT_B
| CLASS_BIT_R0_R3
:
397 ra_set_node_class(g
, node
,
398 vc4
->reg_class_a_or_b_or_acc
[c
->fs_threaded
]);
400 case CLASS_BIT_A
| CLASS_BIT_R4
:
401 ra_set_node_class(g
, node
,
402 vc4
->reg_class_r4_or_a
[c
->fs_threaded
]);
405 ra_set_node_class(g
, node
,
406 vc4
->reg_class_a
[c
->fs_threaded
]);
408 case CLASS_BIT_R0_R3
:
409 ra_set_node_class(g
, node
, vc4
->reg_class_r0_r3
);
413 /* DDX/DDY used across thread switched might get us
416 if (c
->fs_threaded
) {
418 free(temp_registers
);
422 fprintf(stderr
, "temp %d: bad class bits: 0x%x\n",
429 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
430 for (uint32_t j
= i
+ 1; j
< c
->num_temps
; j
++) {
431 if (!(c
->temp_start
[i
] >= c
->temp_end
[j
] ||
432 c
->temp_start
[j
] >= c
->temp_end
[i
])) {
433 ra_add_node_interference(g
,
440 bool ok
= ra_allocate(g
);
442 if (!c
->fs_threaded
) {
443 fprintf(stderr
, "Failed to register allocate:\n");
448 free(temp_registers
);
452 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
453 temp_registers
[i
] = vc4_regs
[ra_get_node_reg(g
, temp_to_node
[i
])];
455 /* If the value's never used, just write to the NOP register
456 * for clarity in debug output.
458 if (c
->temp_start
[i
] == c
->temp_end
[i
])
459 temp_registers
[i
] = qpu_ra(QPU_W_NOP
);
464 return temp_registers
;