8cb5de15fff4a8fbae7e89afdcab3feb55c1adbc
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <stdio.h>
25 #include <inttypes.h>
26
27 #include "vc4_context.h"
28 #include "vc4_qir.h"
29 #include "vc4_qpu.h"
30
31 static void
32 vc4_dump_program(struct qcompile *c)
33 {
34 fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
35
36 for (int i = 0; i < c->num_qpu_insts; i++) {
37 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
38 vc4_qpu_disasm(&c->qpu_insts[i], 1);
39 fprintf(stderr, "\n");
40 }
41 }
42
43 /**
44 * This is used to resolve the fact that we might register-allocate two
45 * different operands of an instruction to the same physical register file
46 * even though instructions have only one field for the register file source
47 * address.
48 *
49 * In that case, we need to move one to a temporary that can be used in the
50 * instruction, instead.
51 */
52 static void
53 fixup_raddr_conflict(uint64_t *insts, uint32_t *ni,
54 struct qpu_reg src0, struct qpu_reg *src1)
55 {
56 if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
57 (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
58 src0.addr != src1->addr) {
59 insts[(*ni)++] = qpu_inst(qpu_a_MOV(qpu_r3(), *src1),
60 qpu_m_NOP());
61 *src1 = qpu_r3();
62 }
63 }
64
65 void
66 vc4_generate_code(struct qcompile *c)
67 {
68 uint64_t *insts = malloc(sizeof(uint64_t) * 1024); /* XXX: sizing */
69 uint32_t ni = 0;
70 struct qpu_reg allocate_to_qpu_reg[3 + 32 + 32];
71 bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
72 int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
73 int *reg_uses_remaining =
74 calloc(c->num_temps, sizeof(*reg_uses_remaining));
75
76 for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
77 reg_in_use[i] = false;
78 for (int i = 0; i < c->num_temps; i++)
79 reg_allocated[i] = -1;
80 for (int i = 0; i < 3; i++)
81 allocate_to_qpu_reg[i] = qpu_rn(i);
82 for (int i = 0; i < 32; i++)
83 allocate_to_qpu_reg[i + 3] = qpu_ra(i);
84 for (int i = 0; i < 32; i++)
85 allocate_to_qpu_reg[i + 3 + 32] = qpu_rb(i);
86
87 struct simple_node *node;
88 foreach(node, &c->instructions) {
89 struct qinst *qinst = (struct qinst *)node;
90
91 if (qinst->dst.file == QFILE_TEMP)
92 reg_uses_remaining[qinst->dst.index]++;
93 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
94 if (qinst->src[i].file == QFILE_TEMP)
95 reg_uses_remaining[qinst->src[i].index]++;
96 }
97 }
98
99 switch (c->stage) {
100 case QSTAGE_VERT:
101 case QSTAGE_COORD:
102 insts[ni++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00);
103 insts[ni++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00);
104 break;
105 case QSTAGE_FRAG:
106 break;
107 }
108
109 foreach(node, &c->instructions) {
110 struct qinst *qinst = (struct qinst *)node;
111
112 #if 0
113 fprintf(stderr, "translating qinst to qpu: ");
114 qir_dump_inst(qinst);
115 fprintf(stderr, "\n");
116 #endif
117
118 static const struct {
119 uint32_t op;
120 bool is_mul;
121 } translate[] = {
122 #define A(name) [QOP_##name] = {QPU_A_##name, false}
123 #define M(name) [QOP_##name] = {QPU_M_##name, true}
124 A(FADD),
125 A(FSUB),
126 A(FMIN),
127 A(FMAX),
128 A(FMINABS),
129 A(FMAXABS),
130 A(FTOI),
131
132 M(FMUL),
133 };
134
135 static const uint32_t compareflags[] = {
136 [QOP_SEQ - QOP_SEQ] = QPU_COND_ZS,
137 [QOP_SNE - QOP_SEQ] = QPU_COND_ZC,
138 [QOP_SLT - QOP_SEQ] = QPU_COND_NS,
139 [QOP_SGE - QOP_SEQ] = QPU_COND_NC,
140 };
141
142 struct qpu_reg src[4];
143 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
144 int index = qinst->src[i].index;
145 switch (qinst->src[i].file) {
146 case QFILE_NULL:
147 src[i] = qpu_rn(0);
148 break;
149 case QFILE_TEMP:
150 assert(reg_allocated[index] != -1);
151 src[i] = allocate_to_qpu_reg[reg_allocated[index]];
152 reg_uses_remaining[index]--;
153 if (reg_uses_remaining[index] == 0)
154 reg_in_use[reg_allocated[index]] = false;
155 break;
156 case QFILE_UNIF:
157 src[i] = qpu_unif();
158 break;
159 case QFILE_VARY:
160 src[i] = qpu_vary();
161 break;
162 }
163 }
164
165 struct qpu_reg dst;
166 switch (qinst->dst.file) {
167 case QFILE_NULL:
168 dst = qpu_ra(QPU_W_NOP);
169 break;
170
171 case QFILE_TEMP:
172 if (reg_allocated[qinst->dst.index] == -1) {
173 int alloc;
174 for (alloc = 0;
175 alloc < ARRAY_SIZE(reg_in_use);
176 alloc++) {
177 /* The pack flags require an A-file register. */
178 if (qinst->op == QOP_PACK_SCALED &&
179 allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
180 continue;
181 }
182
183 if (!reg_in_use[alloc])
184 break;
185 }
186 assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
187 reg_in_use[alloc] = true;
188 reg_allocated[qinst->dst.index] = alloc;
189 }
190
191 dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];
192
193 reg_uses_remaining[qinst->dst.index]--;
194 if (reg_uses_remaining[qinst->dst.index] == 0) {
195 reg_in_use[reg_allocated[qinst->dst.index]] =
196 false;
197 }
198 break;
199
200 case QFILE_VARY:
201 case QFILE_UNIF:
202 assert(!"not reached");
203 break;
204 }
205
206 switch (qinst->op) {
207 case QOP_MOV:
208 /* Skip emitting the MOV if it's a no-op. */
209 if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
210 dst.mux != src[0].mux || dst.addr != src[0].addr) {
211 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
212 qpu_m_NOP());
213 }
214 break;
215
216 case QOP_SEQ:
217 case QOP_SNE:
218 case QOP_SGE:
219 case QOP_SLT:
220 fixup_raddr_conflict(insts, &ni, src[0], &src[1]);
221 insts[ni++] = qpu_inst(qpu_a_SUB(qpu_ra(QPU_W_NOP),
222 src[0], src[1]),
223 qpu_m_NOP());
224 insts[ni - 1] |= QPU_SF;
225
226 insts[ni++] = qpu_load_imm_f(dst, 0.0);
227 insts[ni++] = qpu_load_imm_f(dst, 1.0);
228 insts[ni - 1] = ((insts[ni - 1] & ~QPU_COND_ADD_MASK)
229 | QPU_SET_FIELD(compareflags[qinst->op - QOP_SEQ],
230 QPU_COND_ADD));
231
232 break;
233
234 case QOP_VPM_WRITE:
235 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM),
236 src[0]),
237 qpu_m_NOP());
238 break;
239
240 case QOP_VPM_READ:
241 insts[ni++] = qpu_inst(qpu_a_MOV(dst,
242 qpu_ra(QPU_R_VPM)),
243 qpu_m_NOP());
244 break;
245
246 case QOP_RCP:
247 case QOP_RSQ:
248 case QOP_EXP2:
249 case QOP_LOG2:
250 switch (qinst->op) {
251 case QOP_RCP:
252 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
253 src[0]),
254 qpu_m_NOP());
255 break;
256 case QOP_RSQ:
257 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
258 src[0]),
259 qpu_m_NOP());
260 break;
261 case QOP_EXP2:
262 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
263 src[0]),
264 qpu_m_NOP());
265 break;
266 case QOP_LOG2:
267 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
268 src[0]),
269 qpu_m_NOP());
270 break;
271 default:
272 abort();
273 }
274
275 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
276 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
277
278 insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r4()),
279 qpu_m_NOP());
280
281 break;
282
283 case QOP_PACK_COLORS:
284 for (int i = 0; i < 4; i++) {
285 insts[ni++] = qpu_inst(qpu_a_NOP(),
286 qpu_m_MOV(qpu_r3(), src[i]));
287 insts[ni - 1] |= QPU_PM;
288 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
289 QPU_PACK);
290 }
291
292 insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r3()),
293 qpu_m_NOP());
294 break;
295
296 case QOP_TLB_COLOR_WRITE:
297 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_tlbc(),
298 src[0]),
299 qpu_m_NOP());
300 break;
301
302 case QOP_PACK_SCALED:
303 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
304 qpu_m_NOP());
305 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK);
306
307 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[1]),
308 qpu_m_NOP());
309 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK);
310
311 break;
312
313 default:
314 assert(qinst->op < ARRAY_SIZE(translate));
315 assert(translate[qinst->op].op != 0); /* NOPs */
316
317 /* If we have only one source, put it in the second
318 * argument slot as well so that we don't take up
319 * another raddr just to get unused data.
320 */
321 if (qir_get_op_nsrc(qinst->op) == 1)
322 src[1] = src[0];
323
324 fixup_raddr_conflict(insts, &ni, src[0], &src[1]);
325
326 if (translate[qinst->op].is_mul) {
327 insts[ni++] = qpu_inst(qpu_a_NOP(),
328 qpu_m_alu2(translate[qinst->op].op,
329 dst, src[0], src[1]));
330 } else {
331 insts[ni++] = qpu_inst(qpu_a_alu2(translate[qinst->op].op,
332 dst, src[0], src[1]),
333 qpu_m_NOP());
334 }
335 break;
336 }
337
338 if ((dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B) &&
339 dst.addr < 32)
340 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
341 }
342
343 /* thread end can't have VPM write */
344 if (QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_ADD) == QPU_W_VPM ||
345 QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_MUL) == QPU_W_VPM)
346 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
347
348 insts[ni - 1] = qpu_set_sig(insts[ni - 1], QPU_SIG_PROG_END);
349 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
350 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
351
352 switch (c->stage) {
353 case QSTAGE_VERT:
354 case QSTAGE_COORD:
355 break;
356 case QSTAGE_FRAG:
357 insts[2] = qpu_set_sig(insts[2], QPU_SIG_WAIT_FOR_SCOREBOARD);
358 insts[ni - 1] = qpu_set_sig(insts[ni - 1],
359 QPU_SIG_SCOREBOARD_UNLOCK);
360 break;
361 }
362
363 c->qpu_insts = insts;
364 c->num_qpu_insts = ni;
365
366 if (vc4_debug & VC4_DEBUG_QPU)
367 vc4_dump_program(c);
368
369 vc4_qpu_validate(insts, ni);
370 }
371