broadcom/vc5: Add support for V3Dv4 signal bits.
[mesa.git] / src / broadcom / compiler / vir_register_allocate.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "v3d_compiler.h"
27
/* Initializer for a register reference with .magic cleared and .index = i. */
#define QPU_R(i) { .magic = false, .index = (i) }

/* Layout of the register set built by vir_init_reg_sets(): the accumulator
 * entries come first and the physical register file entries follow them.
 */
#define ACC_INDEX     0
#define ACC_COUNT     5
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64
34
35 bool
36 vir_init_reg_sets(struct v3d_compiler *compiler)
37 {
38 compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
39 true);
40 if (!compiler->regs)
41 return false;
42
43 /* Allocate 3 regfile classes, for the ways the physical register file
44 * can be divided up for fragment shader threading.
45 */
46 for (int threads = 0; threads < 3; threads++) {
47 compiler->reg_class[threads] =
48 ra_alloc_reg_class(compiler->regs);
49
50 for (int i = PHYS_INDEX;
51 i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
52 ra_class_add_reg(compiler->regs,
53 compiler->reg_class[threads], i);
54 }
55
56 for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
57 ra_class_add_reg(compiler->regs,
58 compiler->reg_class[threads], i);
59 }
60 }
61
62 ra_set_finalize(compiler->regs, NULL);
63
64 return true;
65 }
66
/* Pairs a temp index with the priority value used to order the graph nodes. */
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

/**
 * qsort() comparator that orders node_to_temp_map entries by ascending
 * priority.
 *
 * The priorities are compared rather than subtracted: the difference of two
 * uint32_t values wraps around and, once narrowed to int, reports the wrong
 * sign for widely separated values, which would hand qsort() an inconsistent
 * ordering.
 *
 * Returns a negative value, zero, or a positive value when a sorts before,
 * equal to, or after b.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return (a->priority > b->priority) - (a->priority < b->priority);
}
80
/* Bit flags for the per-temp class mask kept by v3d_register_allocate(): a
 * set bit means the temp may still be placed in that location.
 */
#define CLASS_BIT_PHYS     (1 << 0)
#define CLASS_BIT_R0_R2    (1 << 1)
#define CLASS_BIT_R3       (1 << 2)
#define CLASS_BIT_R4       (1 << 3)
85
86 /**
87 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
88 *
89 * The return value should be freed by the caller.
90 */
91 struct qpu_reg *
92 v3d_register_allocate(struct v3d_compile *c)
93 {
94 struct node_to_temp_map map[c->num_temps];
95 uint32_t temp_to_node[c->num_temps];
96 uint8_t class_bits[c->num_temps];
97 struct qpu_reg *temp_registers = calloc(c->num_temps,
98 sizeof(*temp_registers));
99 int acc_nodes[ACC_COUNT];
100
101 struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
102 c->num_temps +
103 ARRAY_SIZE(acc_nodes));
104
105 /* Make some fixed nodes for the accumulators, which we will need to
106 * interfere with when ops have implied r3/r4 writes or for the thread
107 * switches. We could represent these as classes for the nodes to
108 * live in, but the classes take up a lot of memory to set up, so we
109 * don't want to make too many.
110 */
111 for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
112 acc_nodes[i] = c->num_temps + i;
113 ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
114 }
115
116 /* Compute the live ranges so we can figure out interference. */
117 vir_calculate_live_intervals(c);
118
119 for (uint32_t i = 0; i < c->num_temps; i++) {
120 map[i].temp = i;
121 map[i].priority = c->temp_end[i] - c->temp_start[i];
122 }
123 qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
124 for (uint32_t i = 0; i < c->num_temps; i++) {
125 temp_to_node[map[i].temp] = i;
126 }
127
128 /* Figure out our register classes and preallocated registers. We
129 * start with any temp being able to be in any file, then instructions
130 * incrementally remove bits that the temp definitely can't be in.
131 */
132 memset(class_bits,
133 CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
134 sizeof(class_bits));
135
136 int ip = 0;
137 vir_for_each_inst_inorder(inst, c) {
138 /* If the instruction writes r3/r4 (and optionally moves its
139 * result to a temp), nothing else can be stored in r3/r4 across
140 * it.
141 */
142 if (vir_writes_r3(c->devinfo, inst)) {
143 for (int i = 0; i < c->num_temps; i++) {
144 if (c->temp_start[i] < ip &&
145 c->temp_end[i] > ip) {
146 ra_add_node_interference(g,
147 temp_to_node[i],
148 acc_nodes[3]);
149 }
150 }
151 }
152 if (vir_writes_r4(c->devinfo, inst)) {
153 for (int i = 0; i < c->num_temps; i++) {
154 if (c->temp_start[i] < ip &&
155 c->temp_end[i] > ip) {
156 ra_add_node_interference(g,
157 temp_to_node[i],
158 acc_nodes[4]);
159 }
160 }
161 }
162
163 if (inst->src[0].file == QFILE_REG) {
164 switch (inst->src[0].index) {
165 case 0:
166 case 1:
167 case 2:
168 /* Payload setup instructions: Force allocate
169 * the dst to the given register (so the MOV
170 * will disappear).
171 */
172 assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
173 assert(inst->dst.file == QFILE_TEMP);
174 ra_set_node_reg(g,
175 temp_to_node[inst->dst.index],
176 PHYS_INDEX +
177 inst->src[0].index);
178 break;
179 }
180 }
181
182 #if 0
183 switch (inst->op) {
184 case QOP_THRSW:
185 /* All accumulators are invalidated across a thread
186 * switch.
187 */
188 for (int i = 0; i < c->num_temps; i++) {
189 if (c->temp_start[i] < ip && c->temp_end[i] > ip)
190 class_bits[i] &= ~(CLASS_BIT_R0_R3 |
191 CLASS_BIT_R4);
192 }
193 break;
194
195 default:
196 break;
197 }
198 #endif
199
200 ip++;
201 }
202
203 for (uint32_t i = 0; i < c->num_temps; i++) {
204 ra_set_node_class(g, temp_to_node[i],
205 c->compiler->reg_class[c->fs_threaded]);
206 }
207
208 for (uint32_t i = 0; i < c->num_temps; i++) {
209 for (uint32_t j = i + 1; j < c->num_temps; j++) {
210 if (!(c->temp_start[i] >= c->temp_end[j] ||
211 c->temp_start[j] >= c->temp_end[i])) {
212 ra_add_node_interference(g,
213 temp_to_node[i],
214 temp_to_node[j]);
215 }
216 }
217 }
218
219 bool ok = ra_allocate(g);
220 if (!ok) {
221 if (!c->fs_threaded) {
222 fprintf(stderr, "Failed to register allocate:\n");
223 vir_dump(c);
224 }
225
226 c->failed = true;
227 free(temp_registers);
228 return NULL;
229 }
230
231 for (uint32_t i = 0; i < c->num_temps; i++) {
232 int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
233 if (ra_reg < PHYS_INDEX) {
234 temp_registers[i].magic = true;
235 temp_registers[i].index = (V3D_QPU_WADDR_R0 +
236 ra_reg - ACC_INDEX);
237 } else {
238 temp_registers[i].magic = false;
239 temp_registers[i].index = ra_reg - PHYS_INDEX;
240 }
241
242 /* If the value's never used, just write to the NOP register
243 * for clarity in debug output.
244 */
245 if (c->temp_start[i] == c->temp_end[i]) {
246 temp_registers[i].magic = true;
247 temp_registers[i].index = V3D_QPU_WADDR_NOP;
248 }
249 }
250
251 ralloc_free(g);
252
253 return temp_registers;
254 }