1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include <stdio.h>
24 #include <errno.h>
25 #include <byteswap.h>
26 #include "util/u_format.h"
27 #include "util/u_memory.h"
28 #include "pipe/p_shader_tokens.h"
29 #include "r600_pipe.h"
30 #include "r600_sq.h"
31 #include "r600_opcodes.h"
32 #include "r600_asm.h"
33 #include "r600_formats.h"
34 #include "r600d.h"
35
36 #ifdef PIPE_ARCH_BIG_ENDIAN
37 #define CPU_TO_LE32(x) bswap_32(x)
38 #else
39 #define CPU_TO_LE32(x) (x)
40 #endif
41
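/* Size of the GPR read-port tracking table used for bank swizzle checks:
 * up to three read cycles per ALU instruction group and four vector channels. */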
42 #define NUM_OF_CYCLES 3
43 #define NUM_OF_COMPONENTS 4
44
45 static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
46 {
47 if(alu->is_op3)
48 return 3;
49
50 switch (bc->chiprev) {
51 case CHIPREV_R600:
52 case CHIPREV_R700:
53 switch (alu->inst) {
54 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
55 return 0;
56 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
57 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
58 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
59 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
60 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
61 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
62 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
63 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
64 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
65 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
66 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
67 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
68 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
69 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
70 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
71 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
72 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
73 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
74 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
75 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
76 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
77 return 2;
78
79 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
80 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
81 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
82 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
83 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
84 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
85 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
86 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
87 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
88 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
89 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
90 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
91 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
92 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
93 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
94 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
95 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
96 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
97 return 1;
98 default: R600_ERR(
99 "Need instruction operand number for 0x%x.\n", alu->inst);
100 }
101 break;
102 case CHIPREV_EVERGREEN:
103 switch (alu->inst) {
104 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
105 return 0;
106 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
107 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
108 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
109 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
110 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
111 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
112 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
113 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
114 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
115 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
116 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
117 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
118 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
119 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
120 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
121 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
122 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
123 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
124 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
125 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
126 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
127 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
128 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
129 return 2;
130
131 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
132 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
133 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
134 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
135 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
136 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
137 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
138 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
139 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
140 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
141 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
142 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
143 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
144 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
145 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
146 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
147 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
148 return 1;
149 default: R600_ERR(
150 "Need instruction operand number for 0x%x.\n", alu->inst);
151 }
152 break;
153 }
154
155 return 3;
156 }
157
158 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
159
160 static struct r600_bc_cf *r600_bc_cf(void)
161 {
162 struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);
163
164 if (cf == NULL)
165 return NULL;
166 LIST_INITHEAD(&cf->list);
167 LIST_INITHEAD(&cf->alu);
168 LIST_INITHEAD(&cf->vtx);
169 LIST_INITHEAD(&cf->tex);
170 return cf;
171 }
172
173 static struct r600_bc_alu *r600_bc_alu(void)
174 {
175 struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);
176
177 if (alu == NULL)
178 return NULL;
179 LIST_INITHEAD(&alu->list);
180 return alu;
181 }
182
183 static struct r600_bc_vtx *r600_bc_vtx(void)
184 {
185 struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);
186
187 if (vtx == NULL)
188 return NULL;
189 LIST_INITHEAD(&vtx->list);
190 return vtx;
191 }
192
193 static struct r600_bc_tex *r600_bc_tex(void)
194 {
195 struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);
196
197 if (tex == NULL)
198 return NULL;
199 LIST_INITHEAD(&tex->list);
200 return tex;
201 }
202
203 int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
204 {
205 LIST_INITHEAD(&bc->cf);
206 bc->family = family;
207 switch (bc->family) {
208 case CHIP_R600:
209 case CHIP_RV610:
210 case CHIP_RV630:
211 case CHIP_RV670:
212 case CHIP_RV620:
213 case CHIP_RV635:
214 case CHIP_RS780:
215 case CHIP_RS880:
216 bc->chiprev = CHIPREV_R600;
217 break;
218 case CHIP_RV770:
219 case CHIP_RV730:
220 case CHIP_RV710:
221 case CHIP_RV740:
222 bc->chiprev = CHIPREV_R700;
223 break;
224 case CHIP_CEDAR:
225 case CHIP_REDWOOD:
226 case CHIP_JUNIPER:
227 case CHIP_CYPRESS:
228 case CHIP_HEMLOCK:
229 case CHIP_PALM:
230 case CHIP_BARTS:
231 case CHIP_TURKS:
232 case CHIP_CAICOS:
233 bc->chiprev = CHIPREV_EVERGREEN;
234 break;
235 default:
236 R600_ERR("unknown family %d\n", bc->family);
237 return -EINVAL;
238 }
239 return 0;
240 }
241
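/* Append a new CF (control flow) node; every CF entry takes two dwords,
 * which is why ids advance by 2 and the dword count grows by 2. */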
242 static int r600_bc_add_cf(struct r600_bc *bc)
243 {
244 struct r600_bc_cf *cf = r600_bc_cf();
245
246 if (cf == NULL)
247 return -ENOMEM;
248 LIST_ADDTAIL(&cf->list, &bc->cf);
249 if (bc->cf_last)
250 cf->id = bc->cf_last->id + 2;
251 bc->cf_last = cf;
252 bc->ncf++;
253 bc->ndw += 2;
254 bc->force_add_cf = 0;
255 return 0;
256 }
257
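/* Add an export. Consecutive exports with compatible instructions (an
 * EXPORT_DONE may extend a pending EXPORT) and identical type, swizzle and
 * element size are merged into a single burst of up to 16 elements when
 * their gpr and array_base ranges line up. */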
258 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
259 {
260 int r;
261
262 if (bc->cf_last && (bc->cf_last->inst == output->inst ||
263 (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
264 output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
265 output->type == bc->cf_last->output.type &&
266 output->elem_size == bc->cf_last->output.elem_size &&
267 output->swizzle_x == bc->cf_last->output.swizzle_x &&
268 output->swizzle_y == bc->cf_last->output.swizzle_y &&
269 output->swizzle_z == bc->cf_last->output.swizzle_z &&
270 output->swizzle_w == bc->cf_last->output.swizzle_w &&
271 (output->burst_count + bc->cf_last->output.burst_count) <= 16) {
272
273 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
274 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
275
276 bc->cf_last->output.end_of_program |= output->end_of_program;
277 bc->cf_last->output.inst = output->inst;
278 bc->cf_last->output.gpr = output->gpr;
279 bc->cf_last->output.array_base = output->array_base;
280 bc->cf_last->output.burst_count += output->burst_count;
281 return 0;
282
283 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
284 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
285
286 bc->cf_last->output.end_of_program |= output->end_of_program;
287 bc->cf_last->output.inst = output->inst;
288 bc->cf_last->output.burst_count += output->burst_count;
289 return 0;
290 }
291 }
292
293 r = r600_bc_add_cf(bc);
294 if (r)
295 return r;
296 bc->cf_last->inst = output->inst;
297 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
298 return 0;
299 }
300
301 /* alu instructions that can only exist once per group */
302 static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
303 {
304 switch (bc->chiprev) {
305 case CHIPREV_R600:
306 case CHIPREV_R700:
307 return !alu->is_op3 && (
308 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
309 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
310 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
311 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
312 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
313 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
314 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
315 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
316 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
317 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
318 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
319 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
320 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
321 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
322 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
323 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
324 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
325 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
326 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
327 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
328 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
329 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
330 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
331 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
332 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
333 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
334 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
335 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
336 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
337 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
338 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
339 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
340 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
341 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
342 case CHIPREV_EVERGREEN:
343 default:
344 return !alu->is_op3 && (
345 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
346 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
347 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
348 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
349 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
350 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
351 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
352 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
353 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
354 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
355 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
356 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
357 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
358 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
359 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
360 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
361 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
362 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
363 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
364 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
365 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
366 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
367 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
368 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
369 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
370 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
371 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
372 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
373 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
374 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
375 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
376 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
377 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
378 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
379 }
380 }
381
382 static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
383 {
384 switch (bc->chiprev) {
385 case CHIPREV_R600:
386 case CHIPREV_R700:
387 return !alu->is_op3 && (
388 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
389 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
390 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
391 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
392 case CHIPREV_EVERGREEN:
393 default:
394 return !alu->is_op3 && (
395 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
396 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
397 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
398 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
399 }
400 }
401
402 static int is_alu_cube_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
403 {
404 switch (bc->chiprev) {
405 case CHIPREV_R600:
406 case CHIPREV_R700:
407 return !alu->is_op3 &&
408 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
409 case CHIPREV_EVERGREEN:
410 default:
411 return !alu->is_op3 &&
412 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
413 }
414 }
415
416 static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
417 {
418 switch (bc->chiprev) {
419 case CHIPREV_R600:
420 case CHIPREV_R700:
421 return !alu->is_op3 && (
422 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
423 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
424 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
425 case CHIPREV_EVERGREEN:
426 default:
427 return !alu->is_op3 && (
428 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
429 }
430 }
431
432 /* alu instructions that can only execute on the vector unit */
433 static int is_alu_vec_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
434 {
435 return is_alu_reduction_inst(bc, alu) ||
436 is_alu_mova_inst(bc, alu) ||
437 (bc->chiprev == CHIPREV_EVERGREEN &&
438 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR);
439 }
440
441 /* alu instructions that can only execute on the trans unit */
442 static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
443 {
444 switch (bc->chiprev) {
445 case CHIPREV_R600:
446 case CHIPREV_R700:
447 if (!alu->is_op3)
448 return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
449 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
450 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
451 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
452 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
453 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
454 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
455 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
456 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
457 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
458 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
459 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
460 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
461 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
462 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
463 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
464 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
465 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
466 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
467 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
468 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
469 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
470 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
471 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
472 else
473 return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
474 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
475 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
476 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
477 case CHIPREV_EVERGREEN:
478 default:
479 if (!alu->is_op3)
480 /* Note that FLT_TO_INT_* instructions are vector-only instructions
481 * on Evergreen, despite what the documentation says. FLT_TO_INT
482 * can do both vector and scalar. */
483 return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
484 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
485 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
486 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
487 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
488 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
489 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
490 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
491 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
492 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
493 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
494 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
495 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
496 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
497 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
498 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
499 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
500 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
501 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
502 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
503 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
504 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
505 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
506 else
507 return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
508 }
509 }
510
511 /* alu instructions that can execute on any unit */
512 static int is_alu_any_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
513 {
514 return !is_alu_vec_unit_inst(bc, alu) &&
515 !is_alu_trans_unit_inst(bc, alu);
516 }
517
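/* Split one ALU instruction group (terminated by the 'last' flag) into
 * hardware slots: vector units x/y/z/w (indices 0-3, keyed by destination
 * channel) and the trans unit (index 4). Fails if two instructions need
 * the same slot. */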
518 static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
519 struct r600_bc_alu *assignment[5])
520 {
521 struct r600_bc_alu *alu;
522 unsigned i, chan, trans;
523
524 for (i = 0; i < 5; i++)
525 assignment[i] = NULL;
526
527 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
528 chan = alu->dst.chan;
529 if (is_alu_trans_unit_inst(bc, alu))
530 trans = 1;
531 else if (is_alu_vec_unit_inst(bc, alu))
532 trans = 0;
533 else if (assignment[chan])
534 trans = 1; // assume ALU_INST_PREFER_VECTOR
535 else
536 trans = 0;
537
538 if (trans) {
539 if (assignment[4]) {
540 assert(0); //ALU.Trans has already been allocated
541 return -1;
542 }
543 assignment[4] = alu;
544 } else {
545 if (assignment[chan]) {
546 assert(0); //ALU.chan has already been allocated
547 return -1;
548 }
549 assignment[chan] = alu;
550 }
551
552 if (alu->last)
553 break;
554 }
555 return 0;
556 }
557
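/* Bookkeeping for one candidate bank swizzle: which GPR read port is taken
 * in each cycle/channel and which constant-file reads are already reserved. */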
558 struct alu_bank_swizzle {
559 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
560 int hw_cfile_addr[4];
561 int hw_cfile_elem[4];
562 };
563
564 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
565 [SQ_ALU_VEC_012] = { 0, 1, 2 },
566 [SQ_ALU_VEC_021] = { 0, 2, 1 },
567 [SQ_ALU_VEC_120] = { 1, 2, 0 },
568 [SQ_ALU_VEC_102] = { 1, 0, 2 },
569 [SQ_ALU_VEC_201] = { 2, 0, 1 },
570 [SQ_ALU_VEC_210] = { 2, 1, 0 }
571 };
572
573 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
574 [SQ_ALU_SCL_210] = { 2, 1, 0 },
575 [SQ_ALU_SCL_122] = { 1, 2, 2 },
576 [SQ_ALU_SCL_212] = { 2, 1, 2 },
577 [SQ_ALU_SCL_221] = { 2, 2, 1 }
578 };
579
580 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
581 {
582 int i, cycle, component;
583 /* set up gpr use */
584 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
585 for (component = 0; component < NUM_OF_COMPONENTS; component++)
586 bs->hw_gpr[cycle][component] = -1;
587 for (i = 0; i < 4; i++)
588 bs->hw_cfile_addr[i] = -1;
589 for (i = 0; i < 4; i++)
590 bs->hw_cfile_elem[i] = -1;
591 }
592
593 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
594 {
595 if (bs->hw_gpr[cycle][chan] == -1)
596 bs->hw_gpr[cycle][chan] = sel;
597 else if (bs->hw_gpr[cycle][chan] != (int)sel) {
598 // Another scalar operation has already used the GPR read port for this channel
599 return -1;
600 }
601 return 0;
602 }
603
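/* Reserve a constant-file read port. R700 and later only track two
 * reservations, keyed by channel pair; R600 tracks four. */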
604 static int reserve_cfile(struct r600_bc *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
605 {
606 int res, num_res = 4;
607 if (bc->chiprev >= CHIPREV_R700) {
608 num_res = 2;
609 chan /= 2;
610 }
611 for (res = 0; res < num_res; ++res) {
612 if (bs->hw_cfile_addr[res] == -1) {
613 bs->hw_cfile_addr[res] = sel;
614 bs->hw_cfile_elem[res] = chan;
615 return 0;
616 } else if (bs->hw_cfile_addr[res] == sel &&
617 bs->hw_cfile_elem[res] == chan)
618 return 0; // Read for this scalar element already reserved, nothing to do here.
619 }
620 // All cfile read ports are used, cannot reference vector element
621 return -1;
622 }
623
624 static int is_gpr(unsigned sel)
625 {
626 return (sel >= 0 && sel <= 127);
627 }
628
629 /* CB constants start at 512, and get translated to a kcache index when ALU
630 * clauses are constructed. Note that we handle kcache constants the same way
631 * as (the now gone) cfile constants; is that really required? */
632 static int is_cfile(unsigned sel)
633 {
634 return (sel > 255 && sel < 512) ||
635 (sel > 511 && sel < 4607) || // Kcache before translate
636 (sel > 127 && sel < 192); // Kcache after translate
637 }
638
639 static int is_const(int sel)
640 {
641 return is_cfile(sel) ||
642 (sel >= V_SQ_ALU_SRC_0 &&
643 sel <= V_SQ_ALU_SRC_LITERAL);
644 }
645
646 static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
647 struct alu_bank_swizzle *bs, int bank_swizzle)
648 {
649 int r, src, num_src, sel, elem, cycle;
650
651 num_src = r600_bc_get_num_operands(bc, alu);
652 for (src = 0; src < num_src; src++) {
653 sel = alu->src[src].sel;
654 elem = alu->src[src].chan;
655 if (is_gpr(sel)) {
656 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
657 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
658 // Nothing to do; special-case optimization,
659 // second source uses first source's reservation
660 continue;
661 else {
662 r = reserve_gpr(bs, sel, elem, cycle);
663 if (r)
664 return r;
665 }
666 } else if (is_cfile(sel)) {
667 r = reserve_cfile(bc, bs, sel, elem);
668 if (r)
669 return r;
670 }
671 // No restrictions on PV, PS, literal or special constants
672 }
673 return 0;
674 }
675
676 static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
677 struct alu_bank_swizzle *bs, int bank_swizzle)
678 {
679 int r, src, num_src, const_count, sel, elem, cycle;
680
681 num_src = r600_bc_get_num_operands(bc, alu);
682 for (const_count = 0, src = 0; src < num_src; ++src) {
683 sel = alu->src[src].sel;
684 elem = alu->src[src].chan;
685 if (is_const(sel)) { // Any constant, including literal and inline constants
686 if (const_count >= 2)
687 // More than two references to a constant in
688 // transcendental operation.
689 return -1;
690 else
691 const_count++;
692 }
693 if (is_cfile(sel)) {
694 r = reserve_cfile(bc, bs, sel, elem);
695 if (r)
696 return r;
697 }
698 }
699 for (src = 0; src < num_src; ++src) {
700 sel = alu->src[src].sel;
701 elem = alu->src[src].chan;
702 if (is_gpr(sel)) {
703 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
704 if (cycle < const_count)
705 // Cycle for GPR load conflicts with
706 // constant load in transcendental operation.
707 return -1;
708 r = reserve_gpr(bs, sel, elem, cycle);
709 if (r)
710 return r;
711 }
712 // PV PS restrictions
713 if (const_count && (sel == 254 || sel == 255)) {
714 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
715 if (cycle < const_count)
716 return -1;
717 }
718 }
719 return 0;
720 }
721
722 static int check_and_set_bank_swizzle(struct r600_bc *bc,
723 struct r600_bc_alu *slots[5])
724 {
725 struct alu_bank_swizzle bs;
726 int bank_swizzle[5];
727 int i, r = 0, forced = 0;
728 boolean scalar_only = true;
729 for (i = 0; i < 5; i++) {
730 if (slots[i] && slots[i]->bank_swizzle_force) {
731 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
732 forced = 1;
733 }
734 if (i < 4 && slots[i])
735 scalar_only = false;
736 }
737 if (forced)
738 return 0;
739
740 // just check every possible combination of bank swizzle
741 // not very efficient, but works on the first try in most cases
742 for (i = 0; i < 4; i++)
743 bank_swizzle[i] = SQ_ALU_VEC_012;
744 bank_swizzle[4] = SQ_ALU_SCL_210;
745 while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
746 init_bank_swizzle(&bs);
747 if (scalar_only == false) {
748 for (i = 0; i < 4; i++) {
749 if (slots[i]) {
750 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
751 if (r)
752 break;
753 }
754 }
755 } else
756 r = 0;
757
758 if (!r && slots[4]) {
759 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
760 }
761 if (!r) {
762 for (i = 0; i < 5; i++) {
763 if (slots[i])
764 slots[i]->bank_swizzle = bank_swizzle[i];
765 }
766 return 0;
767 }
768
769 if (scalar_only) {
770 bank_swizzle[4]++;
771 } else {
772 for (i = 0; i < 5; i++) {
773 bank_swizzle[i]++;
774 if (bank_swizzle[i] <= SQ_ALU_VEC_210)
775 break;
776 else
777 bank_swizzle[i] = SQ_ALU_VEC_012;
778 }
779 }
780 }
781
782 // couldn't find a working swizzle
783 return -1;
784 }
785
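/* Rewrite sources that read a GPR written by the previous instruction group
 * so that they use PV (previous vector result, per channel) or PS (previous
 * scalar result) instead, freeing up GPR read ports. */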
786 static int replace_gpr_with_pv_ps(struct r600_bc *bc,
787 struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
788 {
789 struct r600_bc_alu *prev[5];
790 int gpr[5], chan[5];
791 int i, j, r, src, num_src;
792
793 r = assign_alu_units(bc, alu_prev, prev);
794 if (r)
795 return r;
796
797 for (i = 0; i < 5; ++i) {
798 if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
799 gpr[i] = prev[i]->dst.sel;
800 /* cube writes more than PV.X */
801 if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
802 chan[i] = 0;
803 else
804 chan[i] = prev[i]->dst.chan;
805 } else
806 gpr[i] = -1;
807 }
808
809 for (i = 0; i < 5; ++i) {
810 struct r600_bc_alu *alu = slots[i];
811 if(!alu)
812 continue;
813
814 num_src = r600_bc_get_num_operands(bc, alu);
815 for (src = 0; src < num_src; ++src) {
816 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
817 continue;
818
819 if (alu->src[src].sel == gpr[4] &&
820 alu->src[src].chan == chan[4]) {
821 alu->src[src].sel = V_SQ_ALU_SRC_PS;
822 alu->src[src].chan = 0;
823 continue;
824 }
825
826 for (j = 0; j < 4; ++j) {
827 if (alu->src[src].sel == gpr[j] &&
828 alu->src[src].chan == j) {
829 alu->src[src].sel = V_SQ_ALU_SRC_PV;
830 alu->src[src].chan = chan[j];
831 break;
832 }
833 }
834 }
835 }
836
837 return 0;
838 }
839
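/* Map a literal bit pattern to one of the hardware inline constants
 * (0, 1, -1, 1.0f, 0.5f and the negated float forms) when possible;
 * otherwise keep it as an ALU_SRC_LITERAL. */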
840 void r600_bc_special_constants(u32 value, unsigned *sel, unsigned *neg)
841 {
842 switch(value) {
843 case 0:
844 *sel = V_SQ_ALU_SRC_0;
845 break;
846 case 1:
847 *sel = V_SQ_ALU_SRC_1_INT;
848 break;
849 case -1:
850 *sel = V_SQ_ALU_SRC_M_1_INT;
851 break;
852 case 0x3F800000: // 1.0f
853 *sel = V_SQ_ALU_SRC_1;
854 break;
855 case 0x3F000000: // 0.5f
856 *sel = V_SQ_ALU_SRC_0_5;
857 break;
858 case 0xBF800000: // -1.0f
859 *sel = V_SQ_ALU_SRC_1;
860 *neg ^= 1;
861 break;
862 case 0xBF000000: // -0.5f
863 *sel = V_SQ_ALU_SRC_0_5;
864 *neg ^= 1;
865 break;
866 default:
867 *sel = V_SQ_ALU_SRC_LITERAL;
868 break;
869 }
870 }
871
872 /* compute how many literals are needed */
873 static int r600_bc_alu_nliterals(struct r600_bc *bc, struct r600_bc_alu *alu,
874 uint32_t literal[4], unsigned *nliteral)
875 {
876 unsigned num_src = r600_bc_get_num_operands(bc, alu);
877 unsigned i, j;
878
879 for (i = 0; i < num_src; ++i) {
880 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
881 uint32_t value = alu->src[i].value;
882 unsigned found = 0;
883 for (j = 0; j < *nliteral; ++j) {
884 if (literal[j] == value) {
885 found = 1;
886 break;
887 }
888 }
889 if (!found) {
890 if (*nliteral >= 4)
891 return -EINVAL;
892 literal[(*nliteral)++] = value;
893 }
894 }
895 }
896 return 0;
897 }
898
899 static void r600_bc_alu_adjust_literals(struct r600_bc *bc,
900 struct r600_bc_alu *alu,
901 uint32_t literal[4], unsigned nliteral)
902 {
903 unsigned num_src = r600_bc_get_num_operands(bc, alu);
904 unsigned i, j;
905
906 for (i = 0; i < num_src; ++i) {
907 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
908 uint32_t value = alu->src[i].value;
909 for (j = 0; j < nliteral; ++j) {
910 if (literal[j] == value) {
911 alu->src[i].chan = j;
912 break;
913 }
914 }
915 }
916 }
917 }
918
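/* Try to fold the current instruction group into the previous one. Returns 0
 * whether or not the merge happens; the groups are left untouched when slots
 * collide, too many literals are needed, more than one PRED/KILL would land
 * in one group, MOVA and relative addressing would mix, or no valid bank
 * swizzle exists for the combined group. */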
919 static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
920 struct r600_bc_alu *alu_prev)
921 {
922 struct r600_bc_alu *prev[5];
923 struct r600_bc_alu *result[5] = { NULL };
924
925 uint32_t literal[4], prev_literal[4];
926 unsigned nliteral = 0, prev_nliteral = 0;
927
928 int i, j, r, src, num_src;
929 int num_once_inst = 0;
930 int have_mova = 0, have_rel = 0;
931
932 r = assign_alu_units(bc, alu_prev, prev);
933 if (r)
934 return r;
935
936 for (i = 0; i < 5; ++i) {
937 struct r600_bc_alu *alu;
938
939 /* check number of literals */
940 if (prev[i]) {
941 if (r600_bc_alu_nliterals(bc, prev[i], literal, &nliteral))
942 return 0;
943 if (r600_bc_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
944 return 0;
945 if (is_alu_mova_inst(bc, prev[i])) {
946 if (have_rel)
947 return 0;
948 have_mova = 1;
949 }
950 num_once_inst += is_alu_once_inst(bc, prev[i]);
951 }
952 if (slots[i] && r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral))
953 return 0;
954
955 // let's check used slots
956 if (prev[i] && !slots[i]) {
957 result[i] = prev[i];
958 continue;
959 } else if (prev[i] && slots[i]) {
960 if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
961 // trans unit is still free, try to use it
962 if (is_alu_any_unit_inst(bc, slots[i])) {
963 result[i] = prev[i];
964 result[4] = slots[i];
965 } else if (is_alu_any_unit_inst(bc, prev[i])) {
966 result[i] = slots[i];
967 result[4] = prev[i];
968 } else
969 return 0;
970 } else
971 return 0;
972 } else if(!slots[i]) {
973 continue;
974 } else
975 result[i] = slots[i];
976
977 alu = slots[i];
978 num_once_inst += is_alu_once_inst(bc, alu);
979
980 // let's check dst gpr
981 if (alu->dst.rel) {
982 if (have_mova)
983 return 0;
984 have_rel = 1;
985 }
986
987 // let's check source gprs
988 num_src = r600_bc_get_num_operands(bc, alu);
989 for (src = 0; src < num_src; ++src) {
990 if (alu->src[src].rel) {
991 if (have_mova)
992 return 0;
993 have_rel = 1;
994 }
995
996 // constants don't matter
997 if (!is_gpr(alu->src[src].sel))
998 continue;
999
1000 for (j = 0; j < 5; ++j) {
1001 if (!prev[j] || !prev[j]->dst.write)
1002 continue;
1003
1004 // if it's relative then we can't determine which gpr is really used
1005 if (prev[j]->dst.chan == alu->src[src].chan &&
1006 (prev[j]->dst.sel == alu->src[src].sel ||
1007 prev[j]->dst.rel || alu->src[src].rel))
1008 return 0;
1009 }
1010 }
1011 }
1012
1013 /* more than one PRED_ or KILL_ ? */
1014 if (num_once_inst > 1)
1015 return 0;
1016
1017 /* check if the result can still be swizzled */
1018 r = check_and_set_bank_swizzle(bc, result);
1019 if (r)
1020 return 0;
1021
1022 /* looks like everything worked out right, apply the changes */
1023
1024 /* undo adding the previous literals */
1025 bc->cf_last->ndw -= align(prev_nliteral, 2);
1026
1027 /* sort instructions */
1028 for (i = 0; i < 5; ++i) {
1029 slots[i] = result[i];
1030 if (result[i]) {
1031 LIST_DEL(&result[i]->list);
1032 result[i]->last = 0;
1033 LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
1034 }
1035 }
1036
1037 /* determine new last instruction */
1038 LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;
1039
1040 /* determine new first instruction */
1041 for (i = 0; i < 5; ++i) {
1042 if (result[i]) {
1043 bc->cf_last->curr_bs_head = result[i];
1044 break;
1045 }
1046 }
1047
1048 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
1049 bc->cf_last->prev2_bs_head = NULL;
1050
1051 return 0;
1052 }
1053
1054 /* This code handles kcache lines as single blocks of 32 constants. We could
1055 * probably do slightly better by recognizing that we actually have two
1056 * consecutive lines of 16 constants, but the resulting code would also be
1057 * somewhat more complicated. */
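/* For example (following the arithmetic below): a source sel of 600 is CB
 * constant 88, which needs kcache line ((600 - 512) / 32) * 2 = 4; once that
 * line is locked into kcache slot 0, the source is rewritten to
 * (88 & 0x1f) + 128 = 152. */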
1058 static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
1059 {
1060 struct r600_bc_kcache *kcache = bc->cf_last->kcache;
1061 unsigned int required_lines;
1062 unsigned int free_lines = 0;
1063 unsigned int cache_line[3];
1064 unsigned int count = 0;
1065 unsigned int i, j;
1066 int r;
1067
1068 /* Collect required cache lines. */
1069 for (i = 0; i < 3; ++i) {
1070 bool found = false;
1071 unsigned int line;
1072
1073 if (alu->src[i].sel < 512)
1074 continue;
1075
1076 line = ((alu->src[i].sel - 512) / 32) * 2;
1077
1078 for (j = 0; j < count; ++j) {
1079 if (cache_line[j] == line) {
1080 found = true;
1081 break;
1082 }
1083 }
1084
1085 if (!found)
1086 cache_line[count++] = line;
1087 }
1088
1089 /* This should never actually happen. */
1090 if (count >= 3) return -ENOMEM;
1091
1092 for (i = 0; i < 2; ++i) {
1093 if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
1094 ++free_lines;
1095 }
1096 }
1097
1098 /* Filter lines pulled in by previous instructions. Note that this is
1099 * only for the required_lines count, we can't remove these from the
1100 * cache_line array since we may have to start a new ALU clause. */
1101 for (i = 0, required_lines = count; i < count; ++i) {
1102 for (j = 0; j < 2; ++j) {
1103 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1104 kcache[j].addr == cache_line[i]) {
1105 --required_lines;
1106 break;
1107 }
1108 }
1109 }
1110
1111 /* Start a new ALU clause if needed. */
1112 if (required_lines > free_lines) {
1113 if ((r = r600_bc_add_cf(bc))) {
1114 return r;
1115 }
1116 bc->cf_last->inst = (type << 3);
1117 kcache = bc->cf_last->kcache;
1118 }
1119
1120 /* Setup the kcache lines. */
1121 for (i = 0; i < count; ++i) {
1122 bool found = false;
1123
1124 for (j = 0; j < 2; ++j) {
1125 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1126 kcache[j].addr == cache_line[i]) {
1127 found = true;
1128 break;
1129 }
1130 }
1131
1132 if (found) continue;
1133
1134 for (j = 0; j < 2; ++j) {
1135 if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
1136 kcache[j].bank = 0;
1137 kcache[j].addr = cache_line[i];
1138 kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
1139 break;
1140 }
1141 }
1142 }
1143
1144 /* Alter the src operands to refer to the kcache. */
1145 for (i = 0; i < 3; ++i) {
1146 static const unsigned int base[] = {128, 160, 256, 288};
1147 unsigned int line;
1148
1149 if (alu->src[i].sel < 512)
1150 continue;
1151
1152 alu->src[i].sel -= 512;
1153 line = (alu->src[i].sel / 32) * 2;
1154
1155 for (j = 0; j < 2; ++j) {
1156 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1157 kcache[j].addr == line) {
1158 alu->src[i].sel &= 0x1f;
1159 alu->src[i].sel += base[j];
1160 break;
1161 }
1162 }
1163 }
1164
1165 return 0;
1166 }
1167
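/* Append an ALU instruction. A new CF/ALU clause is started when the clause
 * type changes (except that an ALU_PUSH_BEFORE may join a plain ALU clause as
 * long as nothing in it is predicated), when the kcache runs out of free
 * lines, or once the clause approaches the 128-slot limit. When the group's
 * 'last' instruction arrives, the group is assigned to slots, merged with the
 * previous group if possible, rewritten to use PV/PS, and given a bank
 * swizzle. */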
1168 int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
1169 {
1170 struct r600_bc_alu *nalu = r600_bc_alu();
1171 struct r600_bc_alu *lalu;
1172 int i, r;
1173
1174 if (nalu == NULL)
1175 return -ENOMEM;
1176 memcpy(nalu, alu, sizeof(struct r600_bc_alu));
1177
1178 if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
1179 /* check if we could add it anyway */
1180 if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
1181 type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
1182 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1183 if (lalu->predicate) {
1184 bc->force_add_cf = 1;
1185 break;
1186 }
1187 }
1188 } else
1189 bc->force_add_cf = 1;
1190 }
1191
1192 /* a cf can contain only alu, only vtx, or only tex */
1193 if (bc->cf_last == NULL || bc->force_add_cf) {
1194 r = r600_bc_add_cf(bc);
1195 if (r) {
1196 free(nalu);
1197 return r;
1198 }
1199 }
1200 bc->cf_last->inst = (type << 3);
1201
1202 /* Setup the kcache for this ALU instruction. This will start a new
1203 * ALU clause if needed. */
1204 if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
1205 free(nalu);
1206 return r;
1207 }
1208
1209 if (!bc->cf_last->curr_bs_head) {
1210 bc->cf_last->curr_bs_head = nalu;
1211 }
1212 /* gpr count == the highest gpr used in any alu + 1 */
1213 for (i = 0; i < 3; i++) {
1214 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1215 bc->ngpr = nalu->src[i].sel + 1;
1216 }
1217 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1218 r600_bc_special_constants(nalu->src[i].value,
1219 &nalu->src[i].sel, &nalu->src[i].neg);
1220 }
1221 if (nalu->dst.sel >= bc->ngpr) {
1222 bc->ngpr = nalu->dst.sel + 1;
1223 }
1224 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1225 /* each alu uses 2 dwords */
1226 bc->cf_last->ndw += 2;
1227 bc->ndw += 2;
1228
1229 /* process the current ALU instruction group for bank swizzle */
1230 if (nalu->last) {
1231 uint32_t literal[4];
1232 unsigned nliteral;
1233 struct r600_bc_alu *slots[5];
1234 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1235 if (r)
1236 return r;
1237
1238 if (bc->cf_last->prev_bs_head) {
1239 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1240 if (r)
1241 return r;
1242 }
1243
1244 if (bc->cf_last->prev_bs_head) {
1245 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1246 if (r)
1247 return r;
1248 }
1249
1250 r = check_and_set_bank_swizzle(bc, slots);
1251 if (r)
1252 return r;
1253
1254 for (i = 0, nliteral = 0; i < 5; i++) {
1255 if (slots[i]) {
1256 r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
1257 if (r)
1258 return r;
1259 }
1260 }
1261 bc->cf_last->ndw += align(nliteral, 2);
1262
1263 /* at most 128 slots; adding one alu can add 5 slots + 4 constants
1264 * (2 slots) in the worst case */
1265 if ((bc->cf_last->ndw >> 1) >= 120) {
1266 bc->force_add_cf = 1;
1267 }
1268
1269 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1270 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1271 bc->cf_last->curr_bs_head = NULL;
1272 }
1273 return 0;
1274 }
1275
1276 int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
1277 {
1278 return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1279 }
1280
1281 static unsigned r600_bc_num_tex_and_vtx_instructions(const struct r600_bc *bc)
1282 {
1283 switch (bc->chiprev) {
1284 case CHIPREV_R600:
1285 return 8;
1286
1287 case CHIPREV_R700:
1288 return 16;
1289
1290 case CHIPREV_EVERGREEN:
1291 return 64;
1292
1293 default:
1294 R600_ERR("Unknown chiprev %d.\n", bc->chiprev);
1295 return 8;
1296 }
1297 }
1298
1299 int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
1300 {
1301 struct r600_bc_vtx *nvtx = r600_bc_vtx();
1302 int r;
1303
1304 if (nvtx == NULL)
1305 return -ENOMEM;
1306 memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));
1307
1308 /* a cf can contain only alu, only vtx, or only tex */
1309 if (bc->cf_last == NULL ||
1310 (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1311 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) ||
1312 bc->force_add_cf) {
1313 r = r600_bc_add_cf(bc);
1314 if (r) {
1315 free(nvtx);
1316 return r;
1317 }
1318 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1319 }
1320 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1321 /* each fetch uses 4 dwords */
1322 bc->cf_last->ndw += 4;
1323 bc->ndw += 4;
1324 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1325 bc->force_add_cf = 1;
1326 return 0;
1327 }
1328
1329 int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
1330 {
1331 struct r600_bc_tex *ntex = r600_bc_tex();
1332 int r;
1333
1334 if (ntex == NULL)
1335 return -ENOMEM;
1336 memcpy(ntex, tex, sizeof(struct r600_bc_tex));
1337
1338 /* we can't fetch data and use it as a texture lookup address in the same TEX clause */
1339 if (bc->cf_last != NULL &&
1340 bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
1341 struct r600_bc_tex *ttex;
1342 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1343 if (ttex->dst_gpr == ntex->src_gpr) {
1344 bc->force_add_cf = 1;
1345 break;
1346 }
1347 }
1348 }
1349
1350 /* a cf can contain only alu, only vtx, or only tex */
1351 if (bc->cf_last == NULL ||
1352 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
1353 bc->force_add_cf) {
1354 r = r600_bc_add_cf(bc);
1355 if (r) {
1356 free(ntex);
1357 return r;
1358 }
1359 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
1360 }
1361 if (ntex->src_gpr >= bc->ngpr) {
1362 bc->ngpr = ntex->src_gpr + 1;
1363 }
1364 if (ntex->dst_gpr >= bc->ngpr) {
1365 bc->ngpr = ntex->dst_gpr + 1;
1366 }
1367 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1368 /* each texture fetch uses 4 dwords */
1369 bc->cf_last->ndw += 4;
1370 bc->ndw += 4;
1371 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1372 bc->force_add_cf = 1;
1373 return 0;
1374 }
1375
1376 int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
1377 {
1378 int r;
1379 r = r600_bc_add_cf(bc);
1380 if (r)
1381 return r;
1382
1383 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1384 bc->cf_last->inst = inst;
1385 return 0;
1386 }
1387
1388 /* common to all 3 families */
1389 static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
1390 {
1391 bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1392 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1393 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1394 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x) |
1395 S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1396 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1397 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1398 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1399 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1400 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1401 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1402 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1403 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1404 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1405 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1406 bc->bytecode[id++] = S_SQ_VTX_WORD2_OFFSET(vtx->offset) |
1407 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian) |
1408 S_SQ_VTX_WORD2_MEGA_FETCH(1);
1409 bc->bytecode[id++] = 0;
1410 return 0;
1411 }
1412
1413 /* common to all 3 families */
1414 static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
1415 {
1416 bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1417 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1418 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1419 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1420 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1421 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1422 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1423 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1424 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1425 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1426 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1427 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1428 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1429 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1430 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1431 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1432 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1433 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1434 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1435 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1436 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1437 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1438 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1439 bc->bytecode[id++] = 0;
1440 return 0;
1441 }
1442
1443 /* r600 only, r700/eg bits in r700_asm.c */
1444 static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
1445 {
1446 /* don't replace gpr by pv or ps for destination register */
1447 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1448 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1449 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1450 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1451 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1452 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1453 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1454 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1455 S_SQ_ALU_WORD0_LAST(alu->last);
1456
1457 if (alu->is_op3) {
1458 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1459 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1460 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1461 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1462 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1463 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1464 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1465 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1466 S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1467 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1468 } else {
1469 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1470 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1471 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1472 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1473 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1474 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1475 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1476 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1477 S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1478 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1479 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
1480 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
1481 }
1482 return 0;
1483 }
1484
1485 static void r600_bc_cf_vtx_build(uint32_t *bytecode, const struct r600_bc_cf *cf)
1486 {
1487 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1488 *bytecode++ = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1489 S_SQ_CF_WORD1_BARRIER(1) |
1490 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1491 }
1492
1493 /* common for r600/r700 - eg in eg_asm.c */
1494 static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
1495 {
1496 unsigned id = cf->id;
1497
1498 switch (cf->inst) {
1499 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1500 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1501 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1502 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1503 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1504 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1505 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1506 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1507
1508 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
1509 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1510 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1511 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1512 S_SQ_CF_ALU_WORD1_BARRIER(1) |
1513 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
1514 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1515 break;
1516 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1517 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1518 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1519 if (bc->chiprev == CHIPREV_R700)
1520 r700_bc_cf_vtx_build(&bc->bytecode[id], cf);
1521 else
1522 r600_bc_cf_vtx_build(&bc->bytecode[id], cf);
1523 break;
1524 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1525 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1526 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1527 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1528 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1529 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1530 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1531 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1532 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1533 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1534 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1535 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1536 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
1537 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
1538 break;
1539 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1540 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1541 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1542 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1543 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1544 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1545 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1546 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1547 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1548 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1549 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1550 S_SQ_CF_WORD1_BARRIER(1) |
1551 S_SQ_CF_WORD1_COND(cf->cond) |
1552 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1553
1554 break;
1555 default:
1556 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1557 return -EINVAL;
1558 }
1559 return 0;
1560 }
1561
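/* Final assembly. Pass one walks the CF list to assign each block its dword
 * address (fetch clauses aligned to 4 dwords); pass two emits the CF, ALU,
 * VTX and TEX words plus literal constants into bc->bytecode. */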
1562 int r600_bc_build(struct r600_bc *bc)
1563 {
1564 struct r600_bc_cf *cf;
1565 struct r600_bc_alu *alu;
1566 struct r600_bc_vtx *vtx;
1567 struct r600_bc_tex *tex;
1568 uint32_t literal[4];
1569 unsigned nliteral;
1570 unsigned addr;
1571 int i, r;
1572
1573 if (bc->callstack[0].max > 0)
1574 bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
1575 if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
1576 bc->nstack = 1;
1577 }
1578
1579 /* first pass: compute the addr of each CF block */
1580 /* addrs start after all the CF instructions */
1581 addr = bc->cf_last->id + 2;
1582 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1583 switch (cf->inst) {
1584 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1585 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1586 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1587 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1588 break;
1589 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1590 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1591 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1592 /* fetch nodes need to be 16-byte aligned */
1593 addr += 3;
1594 addr &= 0xFFFFFFFCUL;
1595 break;
1596 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1597 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1598 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1599 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1600 break;
1601 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1602 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1603 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1604 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1605 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1606 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1607 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1608 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1609 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1610 break;
1611 default:
1612 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1613 return -EINVAL;
1614 }
1615 cf->addr = addr;
1616 addr += cf->ndw;
1617 bc->ndw = cf->addr + cf->ndw;
1618 }
1619 free(bc->bytecode);
1620 bc->bytecode = calloc(1, bc->ndw * 4);
1621 if (bc->bytecode == NULL)
1622 return -ENOMEM;
1623 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1624 addr = cf->addr;
1625 if (bc->chiprev == CHIPREV_EVERGREEN)
1626 r = eg_bc_cf_build(bc, cf);
1627 else
1628 r = r600_bc_cf_build(bc, cf);
1629 if (r)
1630 return r;
1631 switch (cf->inst) {
1632 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1633 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1634 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1635 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1636 nliteral = 0;
1637 memset(literal, 0, sizeof(literal));
1638 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1639 r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
1640 if (r)
1641 return r;
1642 r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
1643 switch(bc->chiprev) {
1644 case CHIPREV_R600:
1645 r = r600_bc_alu_build(bc, alu, addr);
1646 break;
1647 case CHIPREV_R700:
1648 case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
1649 r = r700_bc_alu_build(bc, alu, addr);
1650 break;
1651 default:
1652 R600_ERR("unknown family %d\n", bc->family);
1653 return -EINVAL;
1654 }
1655 if (r)
1656 return r;
1657 addr += 2;
1658 if (alu->last) {
1659 for (i = 0; i < align(nliteral, 2); ++i) {
1660 bc->bytecode[addr++] = literal[i];
1661 }
1662 nliteral = 0;
1663 memset(literal, 0, sizeof(literal));
1664 }
1665 }
1666 break;
1667 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1668 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1669 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1670 r = r600_bc_vtx_build(bc, vtx, addr);
1671 if (r)
1672 return r;
1673 addr += 4;
1674 }
1675 break;
1676 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1677 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1678 r = r600_bc_tex_build(bc, tex, addr);
1679 if (r)
1680 return r;
1681 addr += 4;
1682 }
1683 break;
1684 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1685 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1686 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1687 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1688 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1689 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1690 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1691 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1692 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1693 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1694 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1695 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1696 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1697 break;
1698 default:
1699 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1700 return -EINVAL;
1701 }
1702 }
1703 return 0;
1704 }
1705
1706 void r600_bc_clear(struct r600_bc *bc)
1707 {
1708 struct r600_bc_cf *cf = NULL, *next_cf;
1709
1710 free(bc->bytecode);
1711 bc->bytecode = NULL;
1712
1713 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1714 struct r600_bc_alu *alu = NULL, *next_alu;
1715 struct r600_bc_tex *tex = NULL, *next_tex;
1716 		struct r600_bc_vtx *vtx = NULL, *next_vtx;
1717
1718 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1719 free(alu);
1720 }
1721
1722 LIST_INITHEAD(&cf->alu);
1723
1724 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1725 free(tex);
1726 }
1727
1728 LIST_INITHEAD(&cf->tex);
1729
1730 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1731 free(vtx);
1732 }
1733
1734 LIST_INITHEAD(&cf->vtx);
1735
1736 free(cf);
1737 }
1738
1739 	LIST_INITHEAD(&bc->cf);
1740 }
1741
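/* Print a human-readable listing of the built bytecode to stderr.  The single
 * character after the header marks the encoding family: '6' for R600, '7' for
 * R700, 'E' for Evergreen. */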
1742 void r600_bc_dump(struct r600_bc *bc)
1743 {
1744 struct r600_bc_cf *cf = NULL;
1745 struct r600_bc_alu *alu = NULL;
1746 struct r600_bc_vtx *vtx = NULL;
1747 struct r600_bc_tex *tex = NULL;
1748
1749 unsigned i, id;
1750 uint32_t literal[4];
1751 unsigned nliteral;
1752 char chip = '6';
1753
1754 	switch (bc->chiprev) {
1755 	case CHIPREV_R700:
1756 		chip = '7';
1757 		break;
1758 	case CHIPREV_EVERGREEN:
1759 		chip = 'E';
1760 		break;
1761 	case CHIPREV_R600:
1762 	default:
1763 		chip = '6';
1764 		break;
1765 	}
1766 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
1767 fprintf(stderr, " %c\n", chip);
1768
1769 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1770 id = cf->id;
1771
1772 switch (cf->inst) {
1773 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1774 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1775 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1776 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1777 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1778 fprintf(stderr, "ADDR:%d ", cf->addr);
1779 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
1780 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
1781 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
1782 id++;
1783 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1784 fprintf(stderr, "INST:%d ", cf->inst);
1785 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
1786 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
1787 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
1788 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
1789 break;
1790 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1791 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1792 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1793 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1794 fprintf(stderr, "ADDR:%d\n", cf->addr);
1795 id++;
1796 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1797 fprintf(stderr, "INST:%d ", cf->inst);
1798 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
1799 break;
1800 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1801 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1802 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1803 fprintf(stderr, "GPR:%X ", cf->output.gpr);
1804 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
1805 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
1806 fprintf(stderr, "TYPE:%X\n", cf->output.type);
1807 id++;
1808 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1809 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
1810 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
1811 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
1812 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
1813 fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
1814 fprintf(stderr, "INST:%d ", cf->output.inst);
1815 fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
1816 fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
1817 break;
1818 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1819 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1820 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1821 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1822 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1823 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1824 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1825 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1826 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1827 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1828 fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
1829 id++;
1830 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1831 fprintf(stderr, "INST:%d ", cf->inst);
1832 fprintf(stderr, "COND:%X ", cf->cond);
1833 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
1834 break;
1835 }
1836
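		/* Dump the clause payload (ALU groups with their literals, then TEX
		 * and VTX words) starting at the clause address. */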
1837 id = cf->addr;
1838 nliteral = 0;
1839 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1840 r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
1841
1842 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1843 fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
1844 fprintf(stderr, "REL:%d ", alu->src[0].rel);
1845 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
1846 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
1847 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
1848 fprintf(stderr, "REL:%d ", alu->src[1].rel);
1849 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
1850 fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
1851 			fprintf(stderr, "LAST:%d\n", alu->last);
1852 id++;
1853 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
1854 fprintf(stderr, "INST:%d ", alu->inst);
1855 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
1856 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
1857 fprintf(stderr, "REL:%d ", alu->dst.rel);
1858 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
1859 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
1860 if (alu->is_op3) {
1861 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
1862 fprintf(stderr, "REL:%d ", alu->src[2].rel);
1863 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
1864 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
1865 } else {
1866 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
1867 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
1868 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
1869 fprintf(stderr, "OMOD:%d ", alu->omod);
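				/* Note: both fields below are printed from the single
				 * alu->predicate flag kept in the IR. */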
1870 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
1871 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
1872 }
1873
1874 id++;
1875 if (alu->last) {
1876 for (i = 0; i < nliteral; i++, id++) {
1877 float *f = (float*)(bc->bytecode + id);
1878 fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
1879 }
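				/* Skip the pad dword that was emitted when the literal
				 * count was odd. */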
1880 id += nliteral & 1;
1881 nliteral = 0;
1882 }
1883 }
1884
1885 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1886 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1887 fprintf(stderr, "INST:%d ", tex->inst);
1888 fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
1889 fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
1890 fprintf(stderr, "REL:%d)\n", tex->src_rel);
1891 id++;
1892 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1893 fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
1894 fprintf(stderr, "REL:%d ", tex->dst_rel);
1895 fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
1896 fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
1897 fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
1898 fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
1899 fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
1900 fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
1901 fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
1902 fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
1903 fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
1904 id++;
1905 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1906 fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
1907 fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
1908 fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
1909 fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
1910 fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
1911 fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
1912 fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
1913 fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
1914 id++;
1915 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
1916 id++;
1917 }
1918
1919 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1920 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1921 fprintf(stderr, "INST:%d ", vtx->inst);
1922 fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
1923 fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
1924 id++;
1925 /* This assumes that no semantic fetches exist */
1926 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1927 fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
1928 fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
1929 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
1930 fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
1931 fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
1932 fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
1933 fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
1934 fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
1935 fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
1936 fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
1937 fprintf(stderr, "NUM:%d ", vtx->num_format_all);
1938 fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
1939 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
1940 id++;
1941 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1942 fprintf(stderr, "ENDIAN:%d ", vtx->endian);
1943 fprintf(stderr, "OFFSET:%d\n", vtx->offset);
1944 					/* TODO */
1945 id++;
1946 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
1947 id++;
1948 }
1949 }
1950
1951 fprintf(stderr, "--------------------------------------\n");
1952 }
1953
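/* Translate a gallium vertex pipe_format into the VTX fetch parameters used
 * below: the FMT_* data format, the number format (0 for normalized, 2
 * otherwise), the component type (1 for signed) and the endian swap mode. */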
1954 static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
1955 unsigned *num_format, unsigned *format_comp, unsigned *endian)
1956 {
1957 const struct util_format_description *desc;
1958 unsigned i;
1959
1960 *format = 0;
1961 *num_format = 0;
1962 *format_comp = 0;
1963 *endian = ENDIAN_NONE;
1964
1965 desc = util_format_description(pformat);
1966 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
1967 goto out_unknown;
1968 }
1969
1970 /* Find the first non-VOID channel. */
1971 for (i = 0; i < 4; i++) {
1972 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
1973 break;
1974 }
1975 }
1976
1977 switch (desc->channel[i].type) {
1978 /* Half-floats, floats, ints */
1979 case UTIL_FORMAT_TYPE_FLOAT:
1980 switch (desc->channel[i].size) {
1981 case 16:
1982 switch (desc->nr_channels) {
1983 case 1:
1984 *format = FMT_16_FLOAT;
1985 break;
1986 case 2:
1987 *format = FMT_16_16_FLOAT;
1988 break;
1989 case 3:
1990 case 4:
1991 *format = FMT_16_16_16_16_FLOAT;
1992 break;
1993 }
1994 #ifdef PIPE_ARCH_BIG_ENDIAN
1995 *endian = ENDIAN_8IN16;
1996 #endif
1997 break;
1998 case 32:
1999 switch (desc->nr_channels) {
2000 case 1:
2001 *format = FMT_32_FLOAT;
2002 break;
2003 case 2:
2004 *format = FMT_32_32_FLOAT;
2005 break;
2006 case 3:
2007 *format = FMT_32_32_32_FLOAT;
2008 break;
2009 case 4:
2010 *format = FMT_32_32_32_32_FLOAT;
2011 break;
2012 }
2013 #ifdef PIPE_ARCH_BIG_ENDIAN
2014 *endian = ENDIAN_8IN32;
2015 #endif
2016 break;
2017 default:
2018 goto out_unknown;
2019 }
2020 break;
2021 /* Unsigned ints */
2022 case UTIL_FORMAT_TYPE_UNSIGNED:
2023 /* Signed ints */
2024 case UTIL_FORMAT_TYPE_SIGNED:
2025 switch (desc->channel[i].size) {
2026 case 8:
2027 switch (desc->nr_channels) {
2028 case 1:
2029 *format = FMT_8;
2030 break;
2031 case 2:
2032 *format = FMT_8_8;
2033 break;
2034 case 3:
2035 case 4:
2036 *format = FMT_8_8_8_8;
2037 break;
2038 }
2039 break;
2040 case 16:
2041 switch (desc->nr_channels) {
2042 case 1:
2043 *format = FMT_16;
2044 break;
2045 case 2:
2046 *format = FMT_16_16;
2047 break;
2048 case 3:
2049 case 4:
2050 *format = FMT_16_16_16_16;
2051 break;
2052 }
2053 #ifdef PIPE_ARCH_BIG_ENDIAN
2054 *endian = ENDIAN_8IN16;
2055 #endif
2056 break;
2057 case 32:
2058 switch (desc->nr_channels) {
2059 case 1:
2060 *format = FMT_32;
2061 break;
2062 case 2:
2063 *format = FMT_32_32;
2064 break;
2065 case 3:
2066 *format = FMT_32_32_32;
2067 break;
2068 case 4:
2069 *format = FMT_32_32_32_32;
2070 break;
2071 }
2072 #ifdef PIPE_ARCH_BIG_ENDIAN
2073 *endian = ENDIAN_8IN32;
2074 #endif
2075 break;
2076 default:
2077 goto out_unknown;
2078 }
2079 break;
2080 default:
2081 goto out_unknown;
2082 }
2083
2084 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2085 *format_comp = 1;
2086 }
2087 if (desc->channel[i].normalized) {
2088 *num_format = 0;
2089 } else {
2090 *num_format = 2;
2091 }
2092 return;
2093 out_unknown:
2094 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2095 }
2096
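/* Build a small fetch shader for the given vertex element layout: one VTX
 * fetch per element writing into GPR[i + 1], with extra ALU code when an
 * instance divisor greater than one requires dividing the instance index.
 * The resulting bytecode is uploaded into a buffer object (ve->fetch_shader). */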
2097 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2098 {
2099 static int dump_shaders = -1;
2100
2101 struct r600_bc bc;
2102 struct r600_bc_vtx vtx;
2103 struct pipe_vertex_element *elements = ve->elements;
2104 const struct util_format_description *desc;
2105 unsigned fetch_resource_start = rctx->family >= CHIP_CEDAR ? 0 : 160;
2106 unsigned format, num_format, format_comp, endian;
2107 u32 *bytecode;
2108 int i, r;
2109
2110 	/* Vertex element offsets need special handling: if an offset is bigger
2111 	 * than what fits in the fetch instruction, we have to adjust the vertex
2112 	 * resource offset instead.  In that case, to keep the code simple, we
2113 	 * bind one resource per element, which is the worst-case scenario.
2114 	 */
2115 for (i = 0; i < ve->count; i++) {
2116 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
2117 if (ve->vbuffer_offset[i]) {
2118 ve->vbuffer_need_offset = 1;
2119 }
2120 }
2121
2122 memset(&bc, 0, sizeof(bc));
2123 r = r600_bc_init(&bc, r600_get_family(rctx->radeon));
2124 if (r)
2125 return r;
2126
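	/* For elements with an instance divisor > 1, emit a MULHI_UINT that
	 * divides the instance index (expected in GPR0.W) by the divisor and
	 * stores the result in GPR[i + 1].W for the fetch below. */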
2127 for (i = 0; i < ve->count; i++) {
2128 if (elements[i].instance_divisor > 1) {
2129 struct r600_bc_alu alu;
2130
2131 memset(&alu, 0, sizeof(alu));
2132 alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2133 alu.src[0].sel = 0;
2134 alu.src[0].chan = 3;
2135
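			/* 2^32 / divisor + 1 as a literal: MULHI_UINT with this value
			 * approximates an unsigned divide by the instance divisor. */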
2136 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2137 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2138
2139 alu.dst.sel = i + 1;
2140 alu.dst.chan = 3;
2141 alu.dst.write = 1;
2142 alu.last = 1;
2143
2144 if ((r = r600_bc_add_alu(&bc, &alu))) {
2145 r600_bc_clear(&bc);
2146 return r;
2147 }
2148 }
2149 }
2150
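	/* Emit one VTX fetch per element, swizzled according to the format
	 * description, writing the fetched attribute into GPR[i + 1]. */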
2151 for (i = 0; i < ve->count; i++) {
2152 unsigned vbuffer_index;
2153 r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp, &endian);
2154 desc = util_format_description(ve->elements[i].src_format);
2155 if (desc == NULL) {
2156 r600_bc_clear(&bc);
2157 R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2158 return -EINVAL;
2159 }
2160
2161 /* see above for vbuffer_need_offset explanation */
2162 vbuffer_index = elements[i].vertex_buffer_index;
2163 memset(&vtx, 0, sizeof(vtx));
2164 vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start;
2165 vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2166 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2167 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2168 vtx.mega_fetch_count = 0x1F;
2169 vtx.dst_gpr = i + 1;
2170 vtx.dst_sel_x = desc->swizzle[0];
2171 vtx.dst_sel_y = desc->swizzle[1];
2172 vtx.dst_sel_z = desc->swizzle[2];
2173 vtx.dst_sel_w = desc->swizzle[3];
2174 vtx.data_format = format;
2175 vtx.num_format_all = num_format;
2176 vtx.format_comp_all = format_comp;
2177 vtx.srf_mode_all = 1;
2178 vtx.offset = elements[i].src_offset;
2179 vtx.endian = endian;
2180
2181 if ((r = r600_bc_add_vtx(&bc, &vtx))) {
2182 r600_bc_clear(&bc);
2183 return r;
2184 }
2185 }
2186
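	/* Terminate the fetch shader; the vertex shader transfers control to it
	 * with CALL_FS and this RETURN brings execution back. */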
2187 r600_bc_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2188
2189 if ((r = r600_bc_build(&bc))) {
2190 r600_bc_clear(&bc);
2191 return r;
2192 }
2193
2194 if (dump_shaders == -1)
2195 dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2196
2197 if (dump_shaders) {
2198 fprintf(stderr, "--------------------------------------------------------------\n");
2199 r600_bc_dump(&bc);
2200 fprintf(stderr, "______________________________________________________________\n");
2201 }
2202
2203 ve->fs_size = bc.ndw*4;
2204
2205 /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
2206 ve->fetch_shader = r600_bo(rctx->radeon, ve->fs_size, 256, PIPE_BIND_VERTEX_BUFFER, 0);
2207 if (ve->fetch_shader == NULL) {
2208 r600_bc_clear(&bc);
2209 return -ENOMEM;
2210 }
2211
2212 bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
2213 if (bytecode == NULL) {
2214 r600_bc_clear(&bc);
2215 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
2216 return -ENOMEM;
2217 }
2218
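	/* Copy the bytecode into the mapped buffer, converting each dword to
	 * little endian (CPU_TO_LE32 is a no-op on little-endian hosts). */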
2219 	for (i = 0; i < ve->fs_size / 4; i++) {
2220 *(bytecode + i) = CPU_TO_LE32(*(bc.bytecode + i));
2221 }
2222
2223 r600_bo_unmap(rctx->radeon, ve->fetch_shader);
2224 r600_bc_clear(&bc);
2225
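	/* Set up the family-specific fetch shader state for the new buffer
	 * object. */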
2226 if (rctx->family >= CHIP_CEDAR)
2227 evergreen_fetch_shader(ve);
2228 else
2229 r600_fetch_shader(ve);
2230
2231 return 0;
2232 }