broadcom/vc5: Use THRSW to enable multi-threaded shaders.
[mesa.git] / src / broadcom / compiler / vir_register_allocate.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28
29 #define QPU_R(i) { .magic = false, .index = i }
30
31 #define ACC_INDEX 0
32 #define ACC_COUNT 5
33 #define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
34 #define PHYS_COUNT 64
35
36 bool
37 vir_init_reg_sets(struct v3d_compiler *compiler)
38 {
39 /* Allocate up to 3 regfile classes, for the ways the physical
40 * register file can be divided up for fragment shader threading.
41 */
42 int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
43
44 compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
45 true);
46 if (!compiler->regs)
47 return false;
48
49 for (int threads = 0; threads < max_thread_index; threads++) {
50 compiler->reg_class_phys_or_acc[threads] =
51 ra_alloc_reg_class(compiler->regs);
52 compiler->reg_class_phys[threads] =
53 ra_alloc_reg_class(compiler->regs);
54
55 for (int i = PHYS_INDEX;
56 i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
57 ra_class_add_reg(compiler->regs,
58 compiler->reg_class_phys_or_acc[threads], i);
59 ra_class_add_reg(compiler->regs,
60 compiler->reg_class_phys[threads], i);
61 }
62
63 for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
64 ra_class_add_reg(compiler->regs,
65 compiler->reg_class_phys_or_acc[threads], i);
66 }
67 }
68
69 ra_set_finalize(compiler->regs, NULL);
70
71 return true;
72 }
73
/**
 * Sort record pairing a temp with the key it is ordered by.
 */
struct node_to_temp_map {
        /* Index of the temp this entry describes. */
        uint32_t temp;
        /* Sort key; entries are ordered by ascending priority. */
        uint32_t priority;
};

/**
 * qsort() comparator ordering struct node_to_temp_map entries by ascending
 * priority.
 *
 * The priorities are compared directly instead of returning their
 * difference: a uint32_t difference wraps around and, once converted to
 * int, reports the wrong sign whenever the two priorities are more than
 * INT_MAX apart.
 *
 * Returns a negative value, zero, or a positive value when a's priority is
 * respectively lower than, equal to, or higher than b's.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return (a->priority > b->priority) - (a->priority < b->priority);
}
87
88 #define CLASS_BIT_PHYS (1 << 0)
89 #define CLASS_BIT_R0_R2 (1 << 1)
90 #define CLASS_BIT_R3 (1 << 2)
91 #define CLASS_BIT_R4 (1 << 3)
92
93 /**
94 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
95 *
96 * The return value should be freed by the caller.
97 */
98 struct qpu_reg *
99 v3d_register_allocate(struct v3d_compile *c)
100 {
101 struct node_to_temp_map map[c->num_temps];
102 uint32_t temp_to_node[c->num_temps];
103 uint8_t class_bits[c->num_temps];
104 struct qpu_reg *temp_registers = calloc(c->num_temps,
105 sizeof(*temp_registers));
106 int acc_nodes[ACC_COUNT];
107
108 struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
109 c->num_temps +
110 ARRAY_SIZE(acc_nodes));
111 /* Convert 1, 2, 4 threads to 0, 1, 2 index.
112 *
113 * V3D 4.x has double the physical register space, so 64 physical regs
114 * are available at both 1x and 2x threading, and 4x has 32.
115 */
116 int thread_index = ffs(c->threads) - 1;
117 if (c->devinfo->ver >= 40) {
118 if (thread_index >= 1)
119 thread_index--;
120 }
121
122 /* Make some fixed nodes for the accumulators, which we will need to
123 * interfere with when ops have implied r3/r4 writes or for the thread
124 * switches. We could represent these as classes for the nodes to
125 * live in, but the classes take up a lot of memory to set up, so we
126 * don't want to make too many.
127 */
128 for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
129 acc_nodes[i] = c->num_temps + i;
130 ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
131 }
132
133 for (uint32_t i = 0; i < c->num_temps; i++) {
134 map[i].temp = i;
135 map[i].priority = c->temp_end[i] - c->temp_start[i];
136 }
137 qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
138 for (uint32_t i = 0; i < c->num_temps; i++) {
139 temp_to_node[map[i].temp] = i;
140 }
141
142 /* Figure out our register classes and preallocated registers. We
143 * start with any temp being able to be in any file, then instructions
144 * incrementally remove bits that the temp definitely can't be in.
145 */
146 memset(class_bits,
147 CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
148 sizeof(class_bits));
149
150 int ip = 0;
151 vir_for_each_inst_inorder(inst, c) {
152 /* If the instruction writes r3/r4 (and optionally moves its
153 * result to a temp), nothing else can be stored in r3/r4 across
154 * it.
155 */
156 if (vir_writes_r3(c->devinfo, inst)) {
157 for (int i = 0; i < c->num_temps; i++) {
158 if (c->temp_start[i] < ip &&
159 c->temp_end[i] > ip) {
160 ra_add_node_interference(g,
161 temp_to_node[i],
162 acc_nodes[3]);
163 }
164 }
165 }
166 if (vir_writes_r4(c->devinfo, inst)) {
167 for (int i = 0; i < c->num_temps; i++) {
168 if (c->temp_start[i] < ip &&
169 c->temp_end[i] > ip) {
170 ra_add_node_interference(g,
171 temp_to_node[i],
172 acc_nodes[4]);
173 }
174 }
175 }
176
177 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
178 switch (inst->qpu.alu.add.op) {
179 case V3D_QPU_A_LDVPMV_IN:
180 case V3D_QPU_A_LDVPMV_OUT:
181 case V3D_QPU_A_LDVPMD_IN:
182 case V3D_QPU_A_LDVPMD_OUT:
183 case V3D_QPU_A_LDVPMP:
184 case V3D_QPU_A_LDVPMG_IN:
185 case V3D_QPU_A_LDVPMG_OUT:
186 /* LDVPMs only store to temps (the MA flag
187 * decides whether the LDVPM is in or out)
188 */
189 assert(inst->dst.file == QFILE_TEMP);
190 class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
191 break;
192
193 default:
194 break;
195 }
196 }
197
198 if (inst->src[0].file == QFILE_REG) {
199 switch (inst->src[0].index) {
200 case 0:
201 case 1:
202 case 2:
203 /* Payload setup instructions: Force allocate
204 * the dst to the given register (so the MOV
205 * will disappear).
206 */
207 assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
208 assert(inst->dst.file == QFILE_TEMP);
209 ra_set_node_reg(g,
210 temp_to_node[inst->dst.index],
211 PHYS_INDEX +
212 inst->src[0].index);
213 break;
214 }
215 }
216
217 if (inst->qpu.sig.thrsw) {
218 /* All accumulators are invalidated across a thread
219 * switch.
220 */
221 for (int i = 0; i < c->num_temps; i++) {
222 if (c->temp_start[i] < ip && c->temp_end[i] > ip)
223 class_bits[i] &= CLASS_BIT_PHYS;
224 }
225 }
226
227 ip++;
228 }
229
230 for (uint32_t i = 0; i < c->num_temps; i++) {
231 if (class_bits[i] == CLASS_BIT_PHYS) {
232 ra_set_node_class(g, temp_to_node[i],
233 c->compiler->reg_class_phys[thread_index]);
234 } else {
235 assert(class_bits[i] == (CLASS_BIT_PHYS |
236 CLASS_BIT_R0_R2 |
237 CLASS_BIT_R3 |
238 CLASS_BIT_R4));
239 ra_set_node_class(g, temp_to_node[i],
240 c->compiler->reg_class_phys_or_acc[thread_index]);
241 }
242 }
243
244 for (uint32_t i = 0; i < c->num_temps; i++) {
245 for (uint32_t j = i + 1; j < c->num_temps; j++) {
246 if (!(c->temp_start[i] >= c->temp_end[j] ||
247 c->temp_start[j] >= c->temp_end[i])) {
248 ra_add_node_interference(g,
249 temp_to_node[i],
250 temp_to_node[j]);
251 }
252 }
253 }
254
255 bool ok = ra_allocate(g);
256 if (!ok) {
257 free(temp_registers);
258 return NULL;
259 }
260
261 for (uint32_t i = 0; i < c->num_temps; i++) {
262 int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
263 if (ra_reg < PHYS_INDEX) {
264 temp_registers[i].magic = true;
265 temp_registers[i].index = (V3D_QPU_WADDR_R0 +
266 ra_reg - ACC_INDEX);
267 } else {
268 temp_registers[i].magic = false;
269 temp_registers[i].index = ra_reg - PHYS_INDEX;
270 }
271
272 /* If the value's never used, just write to the NOP register
273 * for clarity in debug output.
274 */
275 if (c->temp_start[i] == c->temp_end[i]) {
276 temp_registers[i].magic = true;
277 temp_registers[i].index = V3D_QPU_WADDR_NOP;
278 }
279 }
280
281 ralloc_free(g);
282
283 return temp_registers;
284 }