330876734d1b6bbee068553d489e5b7a332f4b07
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <stdio.h>
25 #include <inttypes.h>
26
27 #include "vc4_context.h"
28 #include "vc4_qir.h"
29 #include "vc4_qpu.h"
30
31 static void
32 vc4_dump_program(struct qcompile *c)
33 {
34 fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
35
36 for (int i = 0; i < c->num_qpu_insts; i++) {
37 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
38 vc4_qpu_disasm(&c->qpu_insts[i], 1);
39 fprintf(stderr, "\n");
40 }
41 }
42
43 void
44 vc4_generate_code(struct qcompile *c)
45 {
46 uint64_t *insts = malloc(sizeof(uint64_t) * 1024); /* XXX: sizing */
47 uint32_t ni = 0;
48 struct qpu_reg allocate_to_qpu_reg[4 + 32 + 32];
49 bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
50 int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
51 int *reg_uses_remaining =
52 calloc(c->num_temps, sizeof(*reg_uses_remaining));
53
54 for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
55 reg_in_use[i] = false;
56 for (int i = 0; i < c->num_temps; i++)
57 reg_allocated[i] = -1;
58 for (int i = 0; i < 4; i++)
59 allocate_to_qpu_reg[i] = qpu_rn(i);
60 for (int i = 0; i < 32; i++)
61 allocate_to_qpu_reg[i + 4] = qpu_ra(i);
62 for (int i = 0; i < 32; i++)
63 allocate_to_qpu_reg[i + 4 + 32] = qpu_rb(i);
64
65 struct simple_node *node;
66 foreach(node, &c->instructions) {
67 struct qinst *qinst = (struct qinst *)node;
68
69 if (qinst->dst.file == QFILE_TEMP)
70 reg_uses_remaining[qinst->dst.index]++;
71 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
72 if (qinst->src[i].file == QFILE_TEMP)
73 reg_uses_remaining[qinst->src[i].index]++;
74 }
75 }
76
77 switch (c->stage) {
78 case QSTAGE_VERT:
79 case QSTAGE_COORD:
80 insts[ni++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00);
81 insts[ni++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00);
82 break;
83 case QSTAGE_FRAG:
84 break;
85 }
86
87 foreach(node, &c->instructions) {
88 struct qinst *qinst = (struct qinst *)node;
89
90 #if 0
91 fprintf(stderr, "translating qinst to qpu: ");
92 qir_dump_inst(qinst);
93 fprintf(stderr, "\n");
94 #endif
95
96 static const struct {
97 uint32_t op;
98 bool is_mul;
99 } translate[] = {
100 #define A(name) [QOP_##name] = {QPU_A_##name, false}
101 #define M(name) [QOP_##name] = {QPU_M_##name, true}
102 A(FADD),
103 A(FSUB),
104 A(FMIN),
105 A(FMAX),
106 A(FMINABS),
107 A(FMAXABS),
108 A(FTOI),
109
110 M(FMUL),
111 };
112
113 struct qpu_reg src[4];
114 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
115 int index = qinst->src[i].index;
116 switch (qinst->src[i].file) {
117 case QFILE_NULL:
118 src[i] = qpu_rn(0);
119 break;
120 case QFILE_TEMP:
121 assert(reg_allocated[index] != -1);
122 src[i] = allocate_to_qpu_reg[reg_allocated[index]];
123 reg_uses_remaining[index]--;
124 if (reg_uses_remaining[index] == 0)
125 reg_in_use[reg_allocated[index]] = false;
126 break;
127 case QFILE_UNIF:
128 src[i] = qpu_unif();
129 break;
130 case QFILE_VARY:
131 src[i] = qpu_vary();
132 break;
133 }
134 }
135
136 struct qpu_reg dst;
137 switch (qinst->dst.file) {
138 case QFILE_NULL:
139 dst = qpu_ra(QPU_W_NOP);
140 break;
141
142 case QFILE_TEMP:
143 if (reg_allocated[qinst->dst.index] == -1) {
144 int alloc;
145 for (alloc = 0;
146 alloc < ARRAY_SIZE(reg_in_use);
147 alloc++) {
148 /* The pack flags require an A-file register. */
149 if (qinst->op == QOP_PACK_SCALED &&
150 allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
151 continue;
152 }
153
154 if (!reg_in_use[alloc])
155 break;
156 }
157 assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
158 reg_in_use[alloc] = true;
159 reg_allocated[qinst->dst.index] = alloc;
160 }
161
162 dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];
163
164 reg_uses_remaining[qinst->dst.index]--;
165 if (reg_uses_remaining[qinst->dst.index] == 0) {
166 reg_in_use[reg_allocated[qinst->dst.index]] =
167 false;
168 }
169 break;
170
171 case QFILE_VARY:
172 case QFILE_UNIF:
173 assert(!"not reached");
174 break;
175 }
176
177 switch (qinst->op) {
178 case QOP_MOV:
179 /* Skip emitting the MOV if it's a no-op. */
180 if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
181 dst.mux != src[0].mux || dst.addr != src[0].addr) {
182 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
183 qpu_m_NOP());
184 }
185 break;
186
187 case QOP_VPM_WRITE:
188 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM),
189 src[0]),
190 qpu_m_NOP());
191 break;
192
193 case QOP_VPM_READ:
194 insts[ni++] = qpu_inst(qpu_a_MOV(dst,
195 qpu_ra(QPU_R_VPM)),
196 qpu_m_NOP());
197 break;
198
199 case QOP_RCP:
200 case QOP_RSQ:
201 case QOP_EXP2:
202 case QOP_LOG2:
203 switch (qinst->op) {
204 case QOP_RCP:
205 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
206 src[0]),
207 qpu_m_NOP());
208 break;
209 case QOP_RSQ:
210 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
211 src[0]),
212 qpu_m_NOP());
213 break;
214 case QOP_EXP2:
215 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
216 src[0]),
217 qpu_m_NOP());
218 break;
219 case QOP_LOG2:
220 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
221 src[0]),
222 qpu_m_NOP());
223 break;
224 default:
225 abort();
226 }
227
228 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
229 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
230
231 insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r4()),
232 qpu_m_NOP());
233
234 break;
235
236 case QOP_PACK_COLORS:
237 for (int i = 0; i < 4; i++) {
238 insts[ni++] = qpu_inst(qpu_a_NOP(),
239 qpu_m_MOV(qpu_r5(), src[i]));
240 insts[ni - 1] |= QPU_PM;
241 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
242 QPU_PACK);
243 }
244
245 insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r5()),
246 qpu_m_NOP());
247 break;
248
249 case QOP_TLB_COLOR_WRITE:
250 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_tlbc(),
251 src[0]),
252 qpu_m_NOP());
253 break;
254
255 case QOP_PACK_SCALED:
256 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
257 qpu_m_NOP());
258 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK);
259
260 insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[1]),
261 qpu_m_NOP());
262 insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK);
263
264 break;
265
266 default:
267 assert(qinst->op < ARRAY_SIZE(translate));
268 assert(translate[qinst->op].op != 0); /* NOPs */
269
270 /* If we have only one source, put it in the second
271 * argument slot as well so that we don't take up
272 * another raddr just to get unused data.
273 */
274 if (qir_get_op_nsrc(qinst->op) == 1)
275 src[1] = src[0];
276
277 if ((src[0].mux == QPU_MUX_A || src[0].mux == QPU_MUX_B) &&
278 (src[1].mux == QPU_MUX_A || src[1].mux == QPU_MUX_B) &&
279 src[0].addr != src[1].addr) {
280 insts[ni++] = qpu_inst(qpu_a_MOV(qpu_r5(), src[1]),
281 qpu_m_NOP());
282 src[1] = qpu_r5();
283 }
284
285 if (translate[qinst->op].is_mul) {
286 insts[ni++] = qpu_inst(qpu_a_NOP(),
287 qpu_m_alu2(translate[qinst->op].op,
288 dst, src[0], src[1]));
289 } else {
290 insts[ni++] = qpu_inst(qpu_a_alu2(translate[qinst->op].op,
291 dst, src[0], src[1]),
292 qpu_m_NOP());
293 }
294 break;
295 }
296
297 if ((dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B) &&
298 dst.addr < 32)
299 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
300 }
301
302 /* thread end can't have VPM write */
303 if (QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_ADD) == QPU_W_VPM ||
304 QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_MUL) == QPU_W_VPM)
305 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
306
307 insts[ni - 1] = qpu_set_sig(insts[ni - 1], QPU_SIG_PROG_END);
308 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
309 insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
310
311 switch (c->stage) {
312 case QSTAGE_VERT:
313 case QSTAGE_COORD:
314 break;
315 case QSTAGE_FRAG:
316 insts[2] = qpu_set_sig(insts[2], QPU_SIG_WAIT_FOR_SCOREBOARD);
317 insts[ni - 1] = qpu_set_sig(insts[ni - 1],
318 QPU_SIG_SCOREBOARD_UNLOCK);
319 break;
320 }
321
322 c->qpu_insts = insts;
323 c->num_qpu_insts = ni;
324
325 vc4_dump_program(c);
326 vc4_qpu_validate(insts, ni);
327 }
328