/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
/* Expands to a brace initializer with .magic = false and .index = i. */
#define QPU_R(i) { .magic = false, .index = (i) }

/*
 * Register numbering handed to the register allocator: the accumulators
 * occupy [ACC_INDEX, ACC_INDEX + ACC_COUNT) and the physical register file
 * starts right after them at PHYS_INDEX.
 *
 * NOTE(review): ACC_INDEX, ACC_COUNT and PHYS_COUNT are used throughout this
 * file but their definitions were missing from the text under review.  The
 * values below are inferred (accumulators r0..r4, and the "64 physical regs"
 * remark in v3d_register_allocate) -- confirm against the authoritative
 * source.
 */
#define ACC_INDEX     0
#define ACC_COUNT     5
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64
37 vir_init_reg_sets(struct v3d_compiler
*compiler
)
39 /* Allocate up to 3 regfile classes, for the ways the physical
40 * register file can be divided up for fragment shader threading.
42 int max_thread_index
= (compiler
->devinfo
->ver
>= 40 ? 2 : 3);
44 compiler
->regs
= ra_alloc_reg_set(compiler
, PHYS_INDEX
+ PHYS_COUNT
,
49 for (int threads
= 0; threads
< max_thread_index
; threads
++) {
50 compiler
->reg_class_phys_or_acc
[threads
] =
51 ra_alloc_reg_class(compiler
->regs
);
52 compiler
->reg_class_phys
[threads
] =
53 ra_alloc_reg_class(compiler
->regs
);
55 for (int i
= PHYS_INDEX
;
56 i
< PHYS_INDEX
+ (PHYS_COUNT
>> threads
); i
++) {
57 ra_class_add_reg(compiler
->regs
,
58 compiler
->reg_class_phys_or_acc
[threads
], i
);
59 ra_class_add_reg(compiler
->regs
,
60 compiler
->reg_class_phys
[threads
], i
);
63 for (int i
= ACC_INDEX
+ 0; i
< ACC_INDEX
+ ACC_COUNT
; i
++) {
64 ra_class_add_reg(compiler
->regs
,
65 compiler
->reg_class_phys_or_acc
[threads
], i
);
69 ra_set_finalize(compiler
->regs
, NULL
);
/* Associates a temp index with the priority used to order graph nodes. */
struct node_to_temp_map {
        /* NOTE(review): the member declarations were missing from the text
         * under review.  The names come from their uses in
         * v3d_register_allocate; the uint32_t types are assumed -- confirm.
         */
        uint32_t temp;
        uint32_t priority;
};

/**
 * qsort() comparator ordering struct node_to_temp_map entries by ascending
 * priority.
 *
 * NOTE(review): the return type/linkage line was missing from the text under
 * review; "static int" is assumed -- confirm.
 *
 * @return negative, zero or positive when a's priority is respectively
 *         lower than, equal to or higher than b's.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        /* Three-way compare instead of returning the difference: squeezing
         * a subtraction into an int yields the wrong sign once the
         * priorities are far enough apart.
         */
        return (a->priority > b->priority) - (a->priority < b->priority);
}
/* Bit flags tracking which register locations a temp may still be placed in. */
#define CLASS_BIT_PHYS                  (1 << 0)
#define CLASS_BIT_R0_R2                 (1 << 1)
#define CLASS_BIT_R3                    (1 << 2)
#define CLASS_BIT_R4                    (1 << 3)
94 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
96 * The return value should be freed by the caller.
99 v3d_register_allocate(struct v3d_compile
*c
)
101 struct node_to_temp_map map
[c
->num_temps
];
102 uint32_t temp_to_node
[c
->num_temps
];
103 uint8_t class_bits
[c
->num_temps
];
104 struct qpu_reg
*temp_registers
= calloc(c
->num_temps
,
105 sizeof(*temp_registers
));
106 int acc_nodes
[ACC_COUNT
];
108 struct ra_graph
*g
= ra_alloc_interference_graph(c
->compiler
->regs
,
110 ARRAY_SIZE(acc_nodes
));
111 /* Convert 1, 2, 4 threads to 0, 1, 2 index.
113 * V3D 4.x has double the physical register space, so 64 physical regs
114 * are available at both 1x and 2x threading, and 4x has 32.
116 int thread_index
= ffs(c
->threads
) - 1;
117 if (c
->devinfo
->ver
>= 40) {
118 if (thread_index
>= 1)
122 /* Make some fixed nodes for the accumulators, which we will need to
123 * interfere with when ops have implied r3/r4 writes or for the thread
124 * switches. We could represent these as classes for the nodes to
125 * live in, but the classes take up a lot of memory to set up, so we
126 * don't want to make too many.
128 for (int i
= 0; i
< ARRAY_SIZE(acc_nodes
); i
++) {
129 acc_nodes
[i
] = c
->num_temps
+ i
;
130 ra_set_node_reg(g
, acc_nodes
[i
], ACC_INDEX
+ i
);
133 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
135 map
[i
].priority
= c
->temp_end
[i
] - c
->temp_start
[i
];
137 qsort(map
, c
->num_temps
, sizeof(map
[0]), node_to_temp_priority
);
138 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
139 temp_to_node
[map
[i
].temp
] = i
;
142 /* Figure out our register classes and preallocated registers. We
143 * start with any temp being able to be in any file, then instructions
144 * incrementally remove bits that the temp definitely can't be in.
147 CLASS_BIT_PHYS
| CLASS_BIT_R0_R2
| CLASS_BIT_R3
| CLASS_BIT_R4
,
151 vir_for_each_inst_inorder(inst
, c
) {
152 /* If the instruction writes r3/r4 (and optionally moves its
153 * result to a temp), nothing else can be stored in r3/r4 across
156 if (vir_writes_r3(c
->devinfo
, inst
)) {
157 for (int i
= 0; i
< c
->num_temps
; i
++) {
158 if (c
->temp_start
[i
] < ip
&&
159 c
->temp_end
[i
] > ip
) {
160 ra_add_node_interference(g
,
166 if (vir_writes_r4(c
->devinfo
, inst
)) {
167 for (int i
= 0; i
< c
->num_temps
; i
++) {
168 if (c
->temp_start
[i
] < ip
&&
169 c
->temp_end
[i
] > ip
) {
170 ra_add_node_interference(g
,
177 if (inst
->qpu
.type
== V3D_QPU_INSTR_TYPE_ALU
) {
178 switch (inst
->qpu
.alu
.add
.op
) {
179 case V3D_QPU_A_LDVPMV_IN
:
180 case V3D_QPU_A_LDVPMV_OUT
:
181 case V3D_QPU_A_LDVPMD_IN
:
182 case V3D_QPU_A_LDVPMD_OUT
:
183 case V3D_QPU_A_LDVPMP
:
184 case V3D_QPU_A_LDVPMG_IN
:
185 case V3D_QPU_A_LDVPMG_OUT
:
186 /* LDVPMs only store to temps (the MA flag
187 * decides whether the LDVPM is in or out)
189 assert(inst
->dst
.file
== QFILE_TEMP
);
190 class_bits
[inst
->dst
.index
] &= CLASS_BIT_PHYS
;
198 if (inst
->src
[0].file
== QFILE_REG
) {
199 switch (inst
->src
[0].index
) {
203 /* Payload setup instructions: Force allocate
204 * the dst to the given register (so the MOV
207 assert(inst
->qpu
.alu
.mul
.op
== V3D_QPU_M_MOV
);
208 assert(inst
->dst
.file
== QFILE_TEMP
);
210 temp_to_node
[inst
->dst
.index
],
217 if (inst
->qpu
.sig
.thrsw
) {
218 /* All accumulators are invalidated across a thread
221 for (int i
= 0; i
< c
->num_temps
; i
++) {
222 if (c
->temp_start
[i
] < ip
&& c
->temp_end
[i
] > ip
)
223 class_bits
[i
] &= CLASS_BIT_PHYS
;
230 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
231 if (class_bits
[i
] == CLASS_BIT_PHYS
) {
232 ra_set_node_class(g
, temp_to_node
[i
],
233 c
->compiler
->reg_class_phys
[thread_index
]);
235 assert(class_bits
[i
] == (CLASS_BIT_PHYS
|
239 ra_set_node_class(g
, temp_to_node
[i
],
240 c
->compiler
->reg_class_phys_or_acc
[thread_index
]);
244 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
245 for (uint32_t j
= i
+ 1; j
< c
->num_temps
; j
++) {
246 if (!(c
->temp_start
[i
] >= c
->temp_end
[j
] ||
247 c
->temp_start
[j
] >= c
->temp_end
[i
])) {
248 ra_add_node_interference(g
,
255 bool ok
= ra_allocate(g
);
257 free(temp_registers
);
261 for (uint32_t i
= 0; i
< c
->num_temps
; i
++) {
262 int ra_reg
= ra_get_node_reg(g
, temp_to_node
[i
]);
263 if (ra_reg
< PHYS_INDEX
) {
264 temp_registers
[i
].magic
= true;
265 temp_registers
[i
].index
= (V3D_QPU_WADDR_R0
+
268 temp_registers
[i
].magic
= false;
269 temp_registers
[i
].index
= ra_reg
- PHYS_INDEX
;
272 /* If the value's never used, just write to the NOP register
273 * for clarity in debug output.
275 if (c
->temp_start
[i
] == c
->temp_end
[i
]) {
276 temp_registers
[i
].magic
= true;
277 temp_registers
[i
].index
= V3D_QPU_WADDR_NOP
;
283 return temp_registers
;