r600g: remove an unused parameter from r600_bo_destroy
[mesa.git] / src / gallium / drivers / r600 / r600_asm.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include <stdio.h>
24 #include <errno.h>
25 #include <byteswap.h>
26 #include "util/u_format.h"
27 #include "util/u_memory.h"
28 #include "pipe/p_shader_tokens.h"
29 #include "r600_pipe.h"
30 #include "r600_sq.h"
31 #include "r600_opcodes.h"
32 #include "r600_asm.h"
33 #include "r600_formats.h"
34 #include "r600d.h"
35
36 #define NUM_OF_CYCLES 3
37 #define NUM_OF_COMPONENTS 4
38
39 static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
40 {
41 if(alu->is_op3)
42 return 3;
43
44 switch (bc->chip_class) {
45 case R600:
46 case R700:
47 switch (alu->inst) {
48 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
49 return 0;
50 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
51 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
52 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
53 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
54 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
55 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
56 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
57 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
58 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
59 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
60 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
61 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
62 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
63 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
64 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
65 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
66 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
67 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
68 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
69 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
70 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
71 return 2;
72
73 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
74 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
75 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
76 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
77 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
78 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
79 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
80 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
81 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
82 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
83 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
84 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
85 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
86 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
87 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
88 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
89 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
90 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
91 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
92 return 1;
93 default: R600_ERR(
94 "Need instruction operand number for 0x%x.\n", alu->inst);
95 }
96 break;
97 case EVERGREEN:
98 case CAYMAN:
99 switch (alu->inst) {
100 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
101 return 0;
102 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
103 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
104 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
105 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
106 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
107 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
108 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
109 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
110 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
111 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
112 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
113 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
114 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
115 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
116 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
117 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
118 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
119 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
120 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
121 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
122 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
123 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
124 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
125 return 2;
126
127 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
128 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
129 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
130 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
131 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
132 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
133 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
134 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
135 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
136 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
137 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
138 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
139 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
140 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
141 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
142 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
143 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
144 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
145 return 1;
146 default: R600_ERR(
147 "Need instruction operand number for 0x%x.\n", alu->inst);
148 }
149 break;
150 }
151
152 return 3;
153 }
154
155 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
156
157 static struct r600_bc_cf *r600_bc_cf(void)
158 {
159 struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);
160
161 if (cf == NULL)
162 return NULL;
163 LIST_INITHEAD(&cf->list);
164 LIST_INITHEAD(&cf->alu);
165 LIST_INITHEAD(&cf->vtx);
166 LIST_INITHEAD(&cf->tex);
167 return cf;
168 }
169
170 static struct r600_bc_alu *r600_bc_alu(void)
171 {
172 struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);
173
174 if (alu == NULL)
175 return NULL;
176 LIST_INITHEAD(&alu->list);
177 return alu;
178 }
179
180 static struct r600_bc_vtx *r600_bc_vtx(void)
181 {
182 struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);
183
184 if (vtx == NULL)
185 return NULL;
186 LIST_INITHEAD(&vtx->list);
187 return vtx;
188 }
189
190 static struct r600_bc_tex *r600_bc_tex(void)
191 {
192 struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);
193
194 if (tex == NULL)
195 return NULL;
196 LIST_INITHEAD(&tex->list);
197 return tex;
198 }
199
200 void r600_bc_init(struct r600_bc *bc, enum chip_class chip_class)
201 {
202 LIST_INITHEAD(&bc->cf);
203 bc->chip_class = chip_class;
204 }
205
206 static int r600_bc_add_cf(struct r600_bc *bc)
207 {
208 struct r600_bc_cf *cf = r600_bc_cf();
209
210 if (cf == NULL)
211 return -ENOMEM;
212 LIST_ADDTAIL(&cf->list, &bc->cf);
213 if (bc->cf_last)
214 cf->id = bc->cf_last->id + 2;
215 bc->cf_last = cf;
216 bc->ncf++;
217 bc->ndw += 2;
218 bc->force_add_cf = 0;
219 return 0;
220 }
221
222 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
223 {
224 int r;
225
226 if (bc->cf_last && (bc->cf_last->inst == output->inst ||
227 (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
228 output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
229 output->type == bc->cf_last->output.type &&
230 output->elem_size == bc->cf_last->output.elem_size &&
231 output->swizzle_x == bc->cf_last->output.swizzle_x &&
232 output->swizzle_y == bc->cf_last->output.swizzle_y &&
233 output->swizzle_z == bc->cf_last->output.swizzle_z &&
234 output->swizzle_w == bc->cf_last->output.swizzle_w &&
235 (output->burst_count + bc->cf_last->output.burst_count) <= 16) {
236
237 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
238 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
239
240 bc->cf_last->output.end_of_program |= output->end_of_program;
241 bc->cf_last->output.inst = output->inst;
242 bc->cf_last->output.gpr = output->gpr;
243 bc->cf_last->output.array_base = output->array_base;
244 bc->cf_last->output.burst_count += output->burst_count;
245 return 0;
246
247 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
248 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
249
250 bc->cf_last->output.end_of_program |= output->end_of_program;
251 bc->cf_last->output.inst = output->inst;
252 bc->cf_last->output.burst_count += output->burst_count;
253 return 0;
254 }
255 }
256
257 r = r600_bc_add_cf(bc);
258 if (r)
259 return r;
260 bc->cf_last->inst = output->inst;
261 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
262 return 0;
263 }
264
265 /* alu instructions that can ony exits once per group */
266 static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
267 {
268 switch (bc->chip_class) {
269 case R600:
270 case R700:
271 return !alu->is_op3 && (
272 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
273 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
274 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
275 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
276 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
277 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
278 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
279 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
280 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
281 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
282 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
283 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
284 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
285 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
286 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
287 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
288 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
289 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
290 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
291 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
292 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
293 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
294 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
295 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
296 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
297 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
298 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
299 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
300 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
301 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
302 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
303 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
304 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
305 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
306 case EVERGREEN:
307 case CAYMAN:
308 default:
309 return !alu->is_op3 && (
310 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
311 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
312 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
313 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
314 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
315 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
316 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
317 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
318 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
319 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
320 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
321 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
322 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
323 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
324 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
325 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
326 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
327 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
328 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
329 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
330 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
331 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
332 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
333 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
334 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
335 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
336 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
337 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
338 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
339 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
340 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
341 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
342 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
343 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
344 }
345 }
346
347 static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
348 {
349 switch (bc->chip_class) {
350 case R600:
351 case R700:
352 return !alu->is_op3 && (
353 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
354 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
355 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
356 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
357 case EVERGREEN:
358 case CAYMAN:
359 default:
360 return !alu->is_op3 && (
361 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
362 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
363 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
364 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
365 }
366 }
367
368 static int is_alu_cube_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
369 {
370 switch (bc->chip_class) {
371 case R600:
372 case R700:
373 return !alu->is_op3 &&
374 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
375 case EVERGREEN:
376 case CAYMAN:
377 default:
378 return !alu->is_op3 &&
379 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
380 }
381 }
382
383 static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
384 {
385 switch (bc->chip_class) {
386 case R600:
387 case R700:
388 return !alu->is_op3 && (
389 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
390 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
391 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
392 case EVERGREEN:
393 case CAYMAN:
394 default:
395 return !alu->is_op3 && (
396 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
397 }
398 }
399
400 /* alu instructions that can only execute on the vector unit */
401 static int is_alu_vec_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
402 {
403 return is_alu_reduction_inst(bc, alu) ||
404 is_alu_mova_inst(bc, alu) ||
405 (bc->chip_class == EVERGREEN &&
406 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR);
407 }
408
409 /* alu instructions that can only execute on the trans unit */
410 static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
411 {
412 switch (bc->chip_class) {
413 case R600:
414 case R700:
415 if (!alu->is_op3)
416 return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
417 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
418 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
419 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
420 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
421 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
422 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
423 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
424 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
425 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
426 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
427 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
428 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
429 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
430 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
431 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
432 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
433 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
434 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
435 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
436 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
437 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
438 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
439 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
440 else
441 return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
442 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
443 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
444 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
445 case EVERGREEN:
446 case CAYMAN:
447 default:
448 if (!alu->is_op3)
449 /* Note that FLT_TO_INT_* instructions are vector-only instructions
450 * on Evergreen, despite what the documentation says. FLT_TO_INT
451 * can do both vector and scalar. */
452 return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
453 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
454 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
455 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
456 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
457 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
458 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
459 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
460 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
461 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
462 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
463 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
464 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
465 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
466 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
467 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
468 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
469 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
470 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
471 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
472 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
473 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
474 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
475 else
476 return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
477 }
478 }
479
/*
 * ALU instructions that can execute on any unit, i.e. those that are neither
 * vector-only nor trans-only.  Returns 1 for those and 0 otherwise.
 */
static int is_alu_any_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	return !(is_alu_vec_unit_inst(bc, alu) ||
		 is_alu_trans_unit_inst(bc, alu));
}
486
487 static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
488 struct r600_bc_alu *assignment[5])
489 {
490 struct r600_bc_alu *alu;
491 unsigned i, chan, trans;
492 int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
493
494 for (i = 0; i < max_slots; i++)
495 assignment[i] = NULL;
496
497 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
498 chan = alu->dst.chan;
499 if (max_slots == 4)
500 trans = 0;
501 else if (is_alu_trans_unit_inst(bc, alu))
502 trans = 1;
503 else if (is_alu_vec_unit_inst(bc, alu))
504 trans = 0;
505 else if (assignment[chan])
506 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
507 else
508 trans = 0;
509
510 if (trans) {
511 if (assignment[4]) {
512 assert(0); /* ALU.Trans has already been allocated. */
513 return -1;
514 }
515 assignment[4] = alu;
516 } else {
517 if (assignment[chan]) {
518 assert(0); /* ALU.chan has already been allocated. */
519 return -1;
520 }
521 assignment[chan] = alu;
522 }
523
524 if (alu->last)
525 break;
526 }
527 return 0;
528 }
529
530 struct alu_bank_swizzle {
531 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
532 int hw_cfile_addr[4];
533 int hw_cfile_elem[4];
534 };
535
536 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
537 [SQ_ALU_VEC_012] = { 0, 1, 2 },
538 [SQ_ALU_VEC_021] = { 0, 2, 1 },
539 [SQ_ALU_VEC_120] = { 1, 2, 0 },
540 [SQ_ALU_VEC_102] = { 1, 0, 2 },
541 [SQ_ALU_VEC_201] = { 2, 0, 1 },
542 [SQ_ALU_VEC_210] = { 2, 1, 0 }
543 };
544
545 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
546 [SQ_ALU_SCL_210] = { 2, 1, 0 },
547 [SQ_ALU_SCL_122] = { 1, 2, 2 },
548 [SQ_ALU_SCL_212] = { 2, 1, 2 },
549 [SQ_ALU_SCL_221] = { 2, 2, 1 }
550 };
551
552 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
553 {
554 int i, cycle, component;
555 /* set up gpr use */
556 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
557 for (component = 0; component < NUM_OF_COMPONENTS; component++)
558 bs->hw_gpr[cycle][component] = -1;
559 for (i = 0; i < 4; i++)
560 bs->hw_cfile_addr[i] = -1;
561 for (i = 0; i < 4; i++)
562 bs->hw_cfile_elem[i] = -1;
563 }
564
565 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
566 {
567 if (bs->hw_gpr[cycle][chan] == -1)
568 bs->hw_gpr[cycle][chan] = sel;
569 else if (bs->hw_gpr[cycle][chan] != (int)sel) {
570 /* Another scalar operation has already used the GPR read port for the channel. */
571 return -1;
572 }
573 return 0;
574 }
575
576 static int reserve_cfile(struct r600_bc *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
577 {
578 int res, num_res = 4;
579 if (bc->chip_class >= R700) {
580 num_res = 2;
581 chan /= 2;
582 }
583 for (res = 0; res < num_res; ++res) {
584 if (bs->hw_cfile_addr[res] == -1) {
585 bs->hw_cfile_addr[res] = sel;
586 bs->hw_cfile_elem[res] = chan;
587 return 0;
588 } else if (bs->hw_cfile_addr[res] == sel &&
589 bs->hw_cfile_elem[res] == chan)
590 return 0; /* Read for this scalar element already reserved, nothing to do here. */
591 }
592 /* All cfile read ports are used, cannot reference vector element. */
593 return -1;
594 }
595
/*
 * Return 1 if the source selector is in the range 0..127, which this file
 * treats as a general purpose register, else 0.
 *
 * `sel` is unsigned, so only the upper bound needs checking; a lower-bound
 * test against 0 would always be true.
 */
static int is_gpr(unsigned sel)
{
	return sel <= 127;
}
600
/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants, is that really required?
 *
 * Returns 1 when sel lies in 128..191, 256..511 or 512..4606, else 0. */
static int is_cfile(unsigned sel)
{
	/* Kcache after translation. */
	if (sel >= 128 && sel <= 191) {
		return 1;
	}
	/* Constant file range. */
	if (sel >= 256 && sel <= 511) {
		return 1;
	}
	/* Kcache before translation. */
	return sel >= 512 && sel <= 4606;
}
610
611 static int is_const(int sel)
612 {
613 return is_cfile(sel) ||
614 (sel >= V_SQ_ALU_SRC_0 &&
615 sel <= V_SQ_ALU_SRC_LITERAL);
616 }
617
618 static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
619 struct alu_bank_swizzle *bs, int bank_swizzle)
620 {
621 int r, src, num_src, sel, elem, cycle;
622
623 num_src = r600_bc_get_num_operands(bc, alu);
624 for (src = 0; src < num_src; src++) {
625 sel = alu->src[src].sel;
626 elem = alu->src[src].chan;
627 if (is_gpr(sel)) {
628 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
629 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
630 /* Nothing to do; special-case optimization,
631 * second source uses first source’s reservation. */
632 continue;
633 else {
634 r = reserve_gpr(bs, sel, elem, cycle);
635 if (r)
636 return r;
637 }
638 } else if (is_cfile(sel)) {
639 r = reserve_cfile(bc, bs, sel, elem);
640 if (r)
641 return r;
642 }
643 /* No restrictions on PV, PS, literal or special constants. */
644 }
645 return 0;
646 }
647
648 static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
649 struct alu_bank_swizzle *bs, int bank_swizzle)
650 {
651 int r, src, num_src, const_count, sel, elem, cycle;
652
653 num_src = r600_bc_get_num_operands(bc, alu);
654 for (const_count = 0, src = 0; src < num_src; ++src) {
655 sel = alu->src[src].sel;
656 elem = alu->src[src].chan;
657 if (is_const(sel)) { /* Any constant, including literal and inline constants. */
658 if (const_count >= 2)
659 /* More than two references to a constant in
660 * transcendental operation. */
661 return -1;
662 else
663 const_count++;
664 }
665 if (is_cfile(sel)) {
666 r = reserve_cfile(bc, bs, sel, elem);
667 if (r)
668 return r;
669 }
670 }
671 for (src = 0; src < num_src; ++src) {
672 sel = alu->src[src].sel;
673 elem = alu->src[src].chan;
674 if (is_gpr(sel)) {
675 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
676 if (cycle < const_count)
677 /* Cycle for GPR load conflicts with
678 * constant load in transcendental operation. */
679 return -1;
680 r = reserve_gpr(bs, sel, elem, cycle);
681 if (r)
682 return r;
683 }
684 /* PV PS restrictions */
685 if (const_count && (sel == 254 || sel == 255)) {
686 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
687 if (cycle < const_count)
688 return -1;
689 }
690 }
691 return 0;
692 }
693
694 static int check_and_set_bank_swizzle(struct r600_bc *bc,
695 struct r600_bc_alu *slots[5])
696 {
697 struct alu_bank_swizzle bs;
698 int bank_swizzle[5];
699 int i, r = 0, forced = 0;
700 boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
701 int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
702
703 for (i = 0; i < max_slots; i++) {
704 if (slots[i] && slots[i]->bank_swizzle_force) {
705 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
706 forced = 1;
707 }
708 if (i < 4 && slots[i])
709 scalar_only = false;
710 }
711 if (forced)
712 return 0;
713
714 /* Just check every possible combination of bank swizzle.
715 * Not very efficent, but works on the first try in most of the cases. */
716 for (i = 0; i < 4; i++)
717 bank_swizzle[i] = SQ_ALU_VEC_012;
718 bank_swizzle[4] = SQ_ALU_SCL_210;
719 while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
720
721 if (max_slots == 4) {
722 for (i = 0; i < max_slots; i++) {
723 if (bank_swizzle[i] == SQ_ALU_VEC_210)
724 return -1;
725 }
726 }
727 init_bank_swizzle(&bs);
728 if (scalar_only == false) {
729 for (i = 0; i < 4; i++) {
730 if (slots[i]) {
731 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
732 if (r)
733 break;
734 }
735 }
736 } else
737 r = 0;
738
739 if (!r && slots[4] && max_slots == 5) {
740 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
741 }
742 if (!r) {
743 for (i = 0; i < max_slots; i++) {
744 if (slots[i])
745 slots[i]->bank_swizzle = bank_swizzle[i];
746 }
747 return 0;
748 }
749
750 if (scalar_only) {
751 bank_swizzle[4]++;
752 } else {
753 for (i = 0; i < max_slots; i++) {
754 bank_swizzle[i]++;
755 if (bank_swizzle[i] <= SQ_ALU_VEC_210)
756 break;
757 else
758 bank_swizzle[i] = SQ_ALU_VEC_012;
759 }
760 }
761 }
762
763 /* Couldn't find a working swizzle. */
764 return -1;
765 }
766
767 static int replace_gpr_with_pv_ps(struct r600_bc *bc,
768 struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
769 {
770 struct r600_bc_alu *prev[5];
771 int gpr[5], chan[5];
772 int i, j, r, src, num_src;
773 int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
774
775 r = assign_alu_units(bc, alu_prev, prev);
776 if (r)
777 return r;
778
779 for (i = 0; i < max_slots; ++i) {
780 if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
781 gpr[i] = prev[i]->dst.sel;
782 /* cube writes more than PV.X */
783 if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
784 chan[i] = 0;
785 else
786 chan[i] = prev[i]->dst.chan;
787 } else
788 gpr[i] = -1;
789 }
790
791 for (i = 0; i < max_slots; ++i) {
792 struct r600_bc_alu *alu = slots[i];
793 if(!alu)
794 continue;
795
796 num_src = r600_bc_get_num_operands(bc, alu);
797 for (src = 0; src < num_src; ++src) {
798 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
799 continue;
800
801 if (bc->chip_class < CAYMAN) {
802 if (alu->src[src].sel == gpr[4] &&
803 alu->src[src].chan == chan[4]) {
804 alu->src[src].sel = V_SQ_ALU_SRC_PS;
805 alu->src[src].chan = 0;
806 continue;
807 }
808 }
809
810 for (j = 0; j < 4; ++j) {
811 if (alu->src[src].sel == gpr[j] &&
812 alu->src[src].chan == j) {
813 alu->src[src].sel = V_SQ_ALU_SRC_PV;
814 alu->src[src].chan = chan[j];
815 break;
816 }
817 }
818 }
819 }
820
821 return 0;
822 }
823
824 void r600_bc_special_constants(u32 value, unsigned *sel, unsigned *neg)
825 {
826 switch(value) {
827 case 0:
828 *sel = V_SQ_ALU_SRC_0;
829 break;
830 case 1:
831 *sel = V_SQ_ALU_SRC_1_INT;
832 break;
833 case -1:
834 *sel = V_SQ_ALU_SRC_M_1_INT;
835 break;
836 case 0x3F800000: /* 1.0f */
837 *sel = V_SQ_ALU_SRC_1;
838 break;
839 case 0x3F000000: /* 0.5f */
840 *sel = V_SQ_ALU_SRC_0_5;
841 break;
842 case 0xBF800000: /* -1.0f */
843 *sel = V_SQ_ALU_SRC_1;
844 *neg ^= 1;
845 break;
846 case 0xBF000000: /* -0.5f */
847 *sel = V_SQ_ALU_SRC_0_5;
848 *neg ^= 1;
849 break;
850 default:
851 *sel = V_SQ_ALU_SRC_LITERAL;
852 break;
853 }
854 }
855
856 /* compute how many literal are needed */
857 static int r600_bc_alu_nliterals(struct r600_bc *bc, struct r600_bc_alu *alu,
858 uint32_t literal[4], unsigned *nliteral)
859 {
860 unsigned num_src = r600_bc_get_num_operands(bc, alu);
861 unsigned i, j;
862
863 for (i = 0; i < num_src; ++i) {
864 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
865 uint32_t value = alu->src[i].value;
866 unsigned found = 0;
867 for (j = 0; j < *nliteral; ++j) {
868 if (literal[j] == value) {
869 found = 1;
870 break;
871 }
872 }
873 if (!found) {
874 if (*nliteral >= 4)
875 return -EINVAL;
876 literal[(*nliteral)++] = value;
877 }
878 }
879 }
880 return 0;
881 }
882
883 static void r600_bc_alu_adjust_literals(struct r600_bc *bc,
884 struct r600_bc_alu *alu,
885 uint32_t literal[4], unsigned nliteral)
886 {
887 unsigned num_src = r600_bc_get_num_operands(bc, alu);
888 unsigned i, j;
889
890 for (i = 0; i < num_src; ++i) {
891 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
892 uint32_t value = alu->src[i].value;
893 for (j = 0; j < nliteral; ++j) {
894 if (literal[j] == value) {
895 alu->src[i].chan = j;
896 break;
897 }
898 }
899 }
900 }
901 }
902
/*
 * Try to merge the current instruction group (slots[]) with the previous
 * group that starts at alu_prev.
 *
 * The previous group is first spread over its units with assign_alu_units().
 * Each unit index is then examined; the merge is abandoned (return 0) when
 *  - the combined group would need more literals than
 *    r600_bc_alu_nliterals() accepts,
 *  - a MOVA instruction of the previous group would be combined with relative
 *    addressing in the current group,
 *  - both groups use the same unit and neither instruction can be moved to
 *    index 4 (only tried when max_slots is 5 and index 4 is unused),
 *  - a source of the current group reads a GPR channel written by the
 *    previous group,
 *  - more than one "once" instruction (PRED_ or KILL_) would end up in the
 *    group, or
 *  - check_and_set_bank_swizzle() rejects the merged group.
 *
 * On success the merged instructions are re-linked at the tail of
 * bc->cf_last->alu, slots[] is overwritten with the merged layout, and the
 * clause's ndw / bank-swizzle head pointers are updated.
 *
 * Returns the error from assign_alu_units() if it fails. Otherwise returns 0,
 * both when the merge was applied and when it was rejected; every rejection
 * returns before this function re-links any instruction.
 */
static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
			     struct r600_bc_alu *alu_prev)
{
	struct r600_bc_alu *prev[5];
	struct r600_bc_alu *result[5] = { NULL };

	/* literal[] collects literals of both groups, prev_literal[] only those
	 * of the previous group (needed to undo its dword accounting below) */
	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;
	/* CAYMAN is handled with 4 unit indices, all other chips with 5 */
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	for (i = 0; i < max_slots; ++i) {
		struct r600_bc_alu *alu;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bc_alu_nliterals(bc, prev[i], literal, &nliteral))
				return 0;
			if (r600_bc_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
				return 0;
			if (is_alu_mova_inst(bc, prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}
			num_once_inst += is_alu_once_inst(bc, prev[i]);
		}
		if (slots[i] && r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral))
			return 0;

		/* Let's check used slots. */
		if (prev[i] && !slots[i]) {
			/* only the previous group uses this unit: keep it */
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			/* both groups want this unit */
			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				/* Trans unit is still free, try to use it. */
				if (is_alu_any_unit_inst(bc, slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if(!slots[i]) {
			/* unit unused by both groups */
			continue;
		} else
			result[i] = slots[i];

		/* from here on only instructions of the current group are checked */
		alu = slots[i];
		num_once_inst += is_alu_once_inst(bc, alu);

		/* Let's check dst gpr. */
		if (alu->dst.rel) {
			if (have_mova)
				return 0;
			have_rel = 1;
		}

		/* Let's check source gprs */
		num_src = r600_bc_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (alu->src[src].rel) {
				if (have_mova)
					return 0;
				have_rel = 1;
			}

			/* Constants don't matter. */
			if (!is_gpr(alu->src[src].sel))
				continue;

			/* reject if this source reads something the previous
			 * group writes */
			for (j = 0; j < max_slots; ++j) {
				if (!prev[j] || !prev[j]->dst.write)
					continue;

				/* If it's relative then we can't determine which gpr is really used. */
				if (prev[j]->dst.chan == alu->src[src].chan &&
					(prev[j]->dst.sel == alu->src[src].sel ||
					prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions: re-append them in unit order and clear every
	 * `last` flag; the flag is set again on the final entry below */
	for (i = 0; i < max_slots; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < max_slots; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	/* the previous group no longer exists on its own, so the group before
	 * it becomes the new "previous" one */
	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}
1038
1039 /* This code handles kcache lines as single blocks of 32 constants. We could
1040 * probably do slightly better by recognizing that we actually have two
1041 * consecutive lines of 16 constants, but the resulting code would also be
1042 * somewhat more complicated. */
1043 static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
1044 {
1045 struct r600_bc_kcache *kcache = bc->cf_last->kcache;
1046 unsigned int required_lines;
1047 unsigned int free_lines = 0;
1048 unsigned int cache_line[3];
1049 unsigned int count = 0;
1050 unsigned int i, j;
1051 int r;
1052
1053 /* Collect required cache lines. */
1054 for (i = 0; i < 3; ++i) {
1055 boolean found = false;
1056 unsigned int line;
1057
1058 if (alu->src[i].sel < 512)
1059 continue;
1060
1061 line = ((alu->src[i].sel - 512) / 32) * 2;
1062
1063 for (j = 0; j < count; ++j) {
1064 if (cache_line[j] == line) {
1065 found = true;
1066 break;
1067 }
1068 }
1069
1070 if (!found)
1071 cache_line[count++] = line;
1072 }
1073
1074 /* This should never actually happen. */
1075 if (count >= 3) return -ENOMEM;
1076
1077 for (i = 0; i < 2; ++i) {
1078 if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
1079 ++free_lines;
1080 }
1081 }
1082
1083 /* Filter lines pulled in by previous intructions. Note that this is
1084 * only for the required_lines count, we can't remove these from the
1085 * cache_line array since we may have to start a new ALU clause. */
1086 for (i = 0, required_lines = count; i < count; ++i) {
1087 for (j = 0; j < 2; ++j) {
1088 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1089 kcache[j].addr == cache_line[i]) {
1090 --required_lines;
1091 break;
1092 }
1093 }
1094 }
1095
1096 /* Start a new ALU clause if needed. */
1097 if (required_lines > free_lines) {
1098 if ((r = r600_bc_add_cf(bc))) {
1099 return r;
1100 }
1101 bc->cf_last->inst = (type << 3);
1102 kcache = bc->cf_last->kcache;
1103 }
1104
1105 /* Setup the kcache lines. */
1106 for (i = 0; i < count; ++i) {
1107 boolean found = false;
1108
1109 for (j = 0; j < 2; ++j) {
1110 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1111 kcache[j].addr == cache_line[i]) {
1112 found = true;
1113 break;
1114 }
1115 }
1116
1117 if (found) continue;
1118
1119 for (j = 0; j < 2; ++j) {
1120 if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
1121 kcache[j].bank = 0;
1122 kcache[j].addr = cache_line[i];
1123 kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
1124 break;
1125 }
1126 }
1127 }
1128
1129 /* Alter the src operands to refer to the kcache. */
1130 for (i = 0; i < 3; ++i) {
1131 static const unsigned int base[] = {128, 160, 256, 288};
1132 unsigned int line;
1133
1134 if (alu->src[i].sel < 512)
1135 continue;
1136
1137 alu->src[i].sel -= 512;
1138 line = (alu->src[i].sel / 32) * 2;
1139
1140 for (j = 0; j < 2; ++j) {
1141 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1142 kcache[j].addr == line) {
1143 alu->src[i].sel &= 0x1f;
1144 alu->src[i].sel += base[j];
1145 break;
1146 }
1147 }
1148 }
1149
1150 return 0;
1151 }
1152
/*
 * Append a copy of `alu` to the current ALU clause, opening a new clause of
 * kind `type` when required.
 *
 * bc:   bytecode being assembled.
 * alu:  instruction template; it is copied, the caller keeps ownership.
 * type: CF ALU instruction; it is stored in cf->inst shifted left by 3.
 *
 * When the copied instruction has its `last` flag set, the just completed
 * instruction group is post-processed: units are assigned, a merge with the
 * previous group is attempted, GPR reads are replaced via
 * replace_gpr_with_pv_ps(), bank swizzles are checked and the literal dwords
 * of the group are accounted for.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or the
 * error code of whichever helper failed.
 */
int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
{
	struct r600_bc_alu *nalu = r600_bc_alu();
	struct r600_bc_alu *lalu;
	int i, r;

	if (nalu == NULL)
		return -ENOMEM;
	memcpy(nalu, alu, sizeof(struct r600_bc_alu));

	if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
		/* check if we could add it anyway: a plain ALU clause may be
		 * turned into ALU_PUSH_BEFORE as long as none of its
		 * instructions has `predicate` set */
		if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
			type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
				if (lalu->predicate) {
					bc->force_add_cf = 1;
					break;
				}
			}
		} else
			bc->force_add_cf = 1;
	}

	/* cf can contains only alu or only vtx or only tex */
	if (bc->cf_last == NULL || bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(nalu);
			return r;
		}
	}
	/* also retypes a reused plain ALU clause (see the check above) */
	bc->cf_last->inst = (type << 3);

	/* Setup the kcache for this ALU instruction. This will start a new
	 * ALU clause if needed. */
	if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
		free(nalu);
		return r;
	}

	/* first instruction of a new group becomes its bank-swizzle head */
	if (!bc->cf_last->curr_bs_head) {
		bc->cf_last->curr_bs_head = nalu;
	}
	/* number of gpr == the last gpr used in any alu */
	for (i = 0; i < 3; i++) {
		/* only sel values below 128 are counted as GPRs here */
		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
			bc->ngpr = nalu->src[i].sel + 1;
		}
		/* let r600_bc_special_constants() pick a different sel (and
		 * adjust neg) for literal values it recognizes */
		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
			r600_bc_special_constants(nalu->src[i].value,
				&nalu->src[i].sel, &nalu->src[i].neg);
	}
	if (nalu->dst.sel >= bc->ngpr) {
		bc->ngpr = nalu->dst.sel + 1;
	}
	/* from here on nalu is linked into the clause; the error returns below
	 * leave it there instead of freeing it */
	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
	/* each alu use 2 dwords */
	bc->cf_last->ndw += 2;
	bc->ndw += 2;

	/* process cur ALU instructions for bank swizzle */
	if (nalu->last) {
		uint32_t literal[4];
		unsigned nliteral;
		struct r600_bc_alu *slots[5];
		int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
		if (r)
			return r;

		if (bc->cf_last->prev_bs_head) {
			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
			if (r)
				return r;
		}

		/* checked again: a successful merge rewrites prev_bs_head */
		if (bc->cf_last->prev_bs_head) {
			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
			if (r)
				return r;
		}

		r = check_and_set_bank_swizzle(bc, slots);
		if (r)
			return r;

		/* count the distinct literals of the finished group */
		for (i = 0, nliteral = 0; i < max_slots; i++) {
			if (slots[i]) {
				r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
				if (r)
					return r;
			}
		}
		/* literal count is aligned to 2 before being added to the
		 * clause size */
		bc->cf_last->ndw += align(nliteral, 2);

		/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
		 * worst case */
		if ((bc->cf_last->ndw >> 1) >= 120) {
			bc->force_add_cf = 1;
		}

		/* shift the group history: current -> previous -> previous2 */
		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
		bc->cf_last->curr_bs_head = NULL;
	}
	return 0;
}
1261
1262 int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
1263 {
1264 return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1265 }
1266
1267 static unsigned r600_bc_num_tex_and_vtx_instructions(const struct r600_bc *bc)
1268 {
1269 switch (bc->chip_class) {
1270 case R600:
1271 return 8;
1272
1273 case R700:
1274 return 16;
1275
1276 case EVERGREEN:
1277 case CAYMAN:
1278 return 64;
1279
1280 default:
1281 R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1282 return 8;
1283 }
1284 }
1285
1286 static inline boolean last_inst_was_vtx_fetch(struct r600_bc *bc)
1287 {
1288 if (bc->chip_class == CAYMAN) {
1289 if (bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC)
1290 return TRUE;
1291 } else {
1292 if (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1293 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC)
1294 return TRUE;
1295 }
1296 return FALSE;
1297 }
1298
1299 int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
1300 {
1301 struct r600_bc_vtx *nvtx = r600_bc_vtx();
1302 int r;
1303
1304 if (nvtx == NULL)
1305 return -ENOMEM;
1306 memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));
1307
1308 /* cf can contains only alu or only vtx or only tex */
1309 if (bc->cf_last == NULL ||
1310 last_inst_was_vtx_fetch(bc) ||
1311 bc->force_add_cf) {
1312 r = r600_bc_add_cf(bc);
1313 if (r) {
1314 free(nvtx);
1315 return r;
1316 }
1317 if (bc->chip_class == CAYMAN)
1318 bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1319 else
1320 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1321 }
1322 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1323 /* each fetch use 4 dwords */
1324 bc->cf_last->ndw += 4;
1325 bc->ndw += 4;
1326 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1327 bc->force_add_cf = 1;
1328 return 0;
1329 }
1330
1331 int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
1332 {
1333 struct r600_bc_tex *ntex = r600_bc_tex();
1334 int r;
1335
1336 if (ntex == NULL)
1337 return -ENOMEM;
1338 memcpy(ntex, tex, sizeof(struct r600_bc_tex));
1339
1340 /* we can't fetch data und use it as texture lookup address in the same TEX clause */
1341 if (bc->cf_last != NULL &&
1342 bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
1343 struct r600_bc_tex *ttex;
1344 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1345 if (ttex->dst_gpr == ntex->src_gpr) {
1346 bc->force_add_cf = 1;
1347 break;
1348 }
1349 }
1350 /* slight hack to make gradients always go into same cf */
1351 if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
1352 bc->force_add_cf = 1;
1353 }
1354
1355 /* cf can contains only alu or only vtx or only tex */
1356 if (bc->cf_last == NULL ||
1357 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
1358 bc->force_add_cf) {
1359 r = r600_bc_add_cf(bc);
1360 if (r) {
1361 free(ntex);
1362 return r;
1363 }
1364 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
1365 }
1366 if (ntex->src_gpr >= bc->ngpr) {
1367 bc->ngpr = ntex->src_gpr + 1;
1368 }
1369 if (ntex->dst_gpr >= bc->ngpr) {
1370 bc->ngpr = ntex->dst_gpr + 1;
1371 }
1372 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1373 /* each texture fetch use 4 dwords */
1374 bc->cf_last->ndw += 4;
1375 bc->ndw += 4;
1376 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1377 bc->force_add_cf = 1;
1378 return 0;
1379 }
1380
1381 int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
1382 {
1383 int r;
1384 r = r600_bc_add_cf(bc);
1385 if (r)
1386 return r;
1387
1388 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1389 bc->cf_last->inst = inst;
1390 return 0;
1391 }
1392
1393 int cm_bc_add_cf_end(struct r600_bc *bc)
1394 {
1395 return r600_bc_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
1396 }
1397
1398 /* common to all 3 families */
1399 static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
1400 {
1401 bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1402 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1403 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1404 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1405 if (bc->chip_class < CAYMAN)
1406 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1407 id++;
1408 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1409 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1410 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1411 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1412 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1413 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1414 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1415 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1416 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1417 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1418 bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1419 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1420 if (bc->chip_class < CAYMAN)
1421 bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1422 id++;
1423 bc->bytecode[id++] = 0;
1424 return 0;
1425 }
1426
1427 /* common to all 3 families */
1428 static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
1429 {
1430 bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1431 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1432 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1433 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1434 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1435 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1436 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1437 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1438 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1439 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1440 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1441 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1442 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1443 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1444 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1445 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1446 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1447 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1448 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1449 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1450 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1451 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1452 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1453 bc->bytecode[id++] = 0;
1454 return 0;
1455 }
1456
1457 /* r600 only, r700/eg bits in r700_asm.c */
1458 static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
1459 {
1460 /* don't replace gpr by pv or ps for destination register */
1461 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1462 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1463 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1464 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1465 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1466 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1467 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1468 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1469 S_SQ_ALU_WORD0_LAST(alu->last);
1470
1471 if (alu->is_op3) {
1472 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1473 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1474 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1475 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1476 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1477 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1478 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1479 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1480 S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1481 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1482 } else {
1483 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1484 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1485 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1486 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1487 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1488 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1489 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1490 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1491 S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1492 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1493 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
1494 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
1495 }
1496 return 0;
1497 }
1498
1499 static void r600_bc_cf_vtx_build(uint32_t *bytecode, const struct r600_bc_cf *cf)
1500 {
1501 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1502 *bytecode++ = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1503 S_SQ_CF_WORD1_BARRIER(1) |
1504 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1505 }
1506
1507 /* common for r600/r700 - eg in eg_asm.c */
1508 static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
1509 {
1510 unsigned id = cf->id;
1511
1512 switch (cf->inst) {
1513 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1514 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1515 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1516 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1517 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1518 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1519 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1520 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1521
1522 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
1523 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1524 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1525 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1526 S_SQ_CF_ALU_WORD1_BARRIER(1) |
1527 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
1528 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1529 break;
1530 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1531 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1532 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1533 if (bc->chip_class == R700)
1534 r700_bc_cf_vtx_build(&bc->bytecode[id], cf);
1535 else
1536 r600_bc_cf_vtx_build(&bc->bytecode[id], cf);
1537 break;
1538 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1539 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1540 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1541 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1542 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1543 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1544 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1545 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1546 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1547 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1548 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1549 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1550 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
1551 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
1552 break;
1553 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1554 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1555 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1556 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1557 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1558 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1559 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1560 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1561 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1562 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1563 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1564 S_SQ_CF_WORD1_BARRIER(1) |
1565 S_SQ_CF_WORD1_COND(cf->cond) |
1566 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1567
1568 break;
1569 default:
1570 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1571 return -EINVAL;
1572 }
1573 return 0;
1574 }
1575
/*
 * Lay out and encode the whole program into bc->bytecode.
 *
 * Pass 1 walks the CF list and assigns each clause its start address
 * (cf->addr) behind the block of CF instructions; bc->ndw ends up as the end
 * of the last clause. Any previous bc->bytecode is freed and a zeroed buffer
 * of bc->ndw dwords is allocated. Pass 2 encodes every CF instruction and
 * then the ALU / VTX / TEX instructions (plus ALU literals) of its clause.
 *
 * bc->nstack is also derived here from bc->callstack[0].max.
 *
 * Returns 0 on success, -EINVAL for an unsupported CF instruction or unknown
 * chip class, -ENOMEM if the buffer cannot be allocated, or the error of a
 * failing encoder.
 *
 * NOTE(review): bc->cf_last is dereferenced unconditionally, so this assumes
 * at least one CF instruction has been added - confirm against callers.
 */
int r600_bc_build(struct r600_bc *bc)
{
	struct r600_bc_cf *cf;
	struct r600_bc_alu *alu;
	struct r600_bc_vtx *vtx;
	struct r600_bc_tex *tex;
	uint32_t literal[4];
	unsigned nliteral;
	unsigned addr;
	int i, r;

	/* stack size: callstack depth rounded up to multiples of 4, plus 2 */
	if (bc->callstack[0].max > 0)
		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
	/* vertex shaders get a stack size of at least 1 */
	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
		bc->nstack = 1;
	}

	/* first pass: compute addr of each CF block */
	/* addr start after all the CF instructions (2 dwords each) */
	addr = bc->cf_last->id + 2;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		switch (cf->inst) {
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
			/* fetch node need to be 16 bytes aligned: round addr up
			 * to a multiple of 4 dwords */
			addr += 3;
			addr &= 0xFFFFFFFCUL;
			break;
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
		case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
		case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
		case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
			break;
		default:
			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
			return -EINVAL;
		}
		cf->addr = addr;
		addr += cf->ndw;
		/* total size so far: end of the clause just placed */
		bc->ndw = cf->addr + cf->ndw;
	}
	/* drop any earlier encoding and allocate a zeroed buffer
	 * (4 bytes per dword) */
	free(bc->bytecode);
	bc->bytecode = calloc(1, bc->ndw * 4);
	if (bc->bytecode == NULL)
		return -ENOMEM;
	/* second pass: encode each CF instruction and its clause */
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		addr = cf->addr;
		if (bc->chip_class >= EVERGREEN)
			r = eg_bc_cf_build(bc, cf);
		else
			r = r600_bc_cf_build(bc, cf);
		if (r)
			return r;
		switch (cf->inst) {
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
			/* literals are collected per instruction group and
			 * emitted after the group's last instruction */
			nliteral = 0;
			memset(literal, 0, sizeof(literal));
			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
				r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
				if (r)
					return r;
				/* make literal operands refer to their slot */
				r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
				switch(bc->chip_class) {
				case R600:
					r = r600_bc_alu_build(bc, alu, addr);
					break;
				case R700:
				case EVERGREEN: /* eg alu is same encoding as r700 */
				case CAYMAN: /* eg alu is same encoding as r700 */
					r = r700_bc_alu_build(bc, alu, addr);
					break;
				default:
					R600_ERR("unknown chip class %d.\n", bc->chip_class);
					return -EINVAL;
				}
				if (r)
					return r;
				addr += 2;
				if (alu->last) {
					/* emit the literal count aligned to 2;
					 * unused entries are zero because
					 * literal[] is cleared per group */
					for (i = 0; i < align(nliteral, 2); ++i) {
						bc->bytecode[addr++] = literal[i];
					}
					nliteral = 0;
					memset(literal, 0, sizeof(literal));
				}
			}
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				r = r600_bc_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
			/* on CAYMAN this clause kind may also carry vertex
			 * fetches; they are emitted before the tex list */
			if (bc->chip_class == CAYMAN) {
				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
					r = r600_bc_vtx_build(bc, vtx, addr);
					if (r)
						return r;
					addr += 4;
				}
			}
			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
				r = r600_bc_tex_build(bc, tex, addr);
				if (r)
					return r;
				addr += 4;
			}
			break;
		/* the remaining CF instructions have no clause body to emit */
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
		case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
		case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
		case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
			break;
		default:
			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
			return -EINVAL;
		}
	}
	return 0;
}
1730
1731 void r600_bc_clear(struct r600_bc *bc)
1732 {
1733 struct r600_bc_cf *cf = NULL, *next_cf;
1734
1735 free(bc->bytecode);
1736 bc->bytecode = NULL;
1737
1738 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1739 struct r600_bc_alu *alu = NULL, *next_alu;
1740 struct r600_bc_tex *tex = NULL, *next_tex;
1741 struct r600_bc_tex *vtx = NULL, *next_vtx;
1742
1743 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1744 free(alu);
1745 }
1746
1747 LIST_INITHEAD(&cf->alu);
1748
1749 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1750 free(tex);
1751 }
1752
1753 LIST_INITHEAD(&cf->tex);
1754
1755 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1756 free(vtx);
1757 }
1758
1759 LIST_INITHEAD(&cf->vtx);
1760
1761 free(cf);
1762 }
1763
1764 LIST_INITHEAD(&cf->list);
1765 }
1766
1767 void r600_bc_dump(struct r600_bc *bc)
1768 {
1769 struct r600_bc_cf *cf = NULL;
1770 struct r600_bc_alu *alu = NULL;
1771 struct r600_bc_vtx *vtx = NULL;
1772 struct r600_bc_tex *tex = NULL;
1773
1774 unsigned i, id;
1775 uint32_t literal[4];
1776 unsigned nliteral;
1777 char chip = '6';
1778
1779 switch (bc->chip_class) {
1780 case R700:
1781 chip = '7';
1782 break;
1783 case EVERGREEN:
1784 chip = 'E';
1785 break;
1786 case CAYMAN:
1787 chip = 'C';
1788 break;
1789 case R600:
1790 default:
1791 chip = '6';
1792 break;
1793 }
1794 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
1795 fprintf(stderr, " %c\n", chip);
1796
1797 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1798 id = cf->id;
1799
1800 switch (cf->inst) {
1801 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1802 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1803 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1804 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1805 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1806 fprintf(stderr, "ADDR:%d ", cf->addr);
1807 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
1808 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
1809 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
1810 id++;
1811 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1812 fprintf(stderr, "INST:%d ", cf->inst);
1813 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
1814 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
1815 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
1816 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
1817 break;
1818 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1819 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1820 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1821 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1822 fprintf(stderr, "ADDR:%d\n", cf->addr);
1823 id++;
1824 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1825 fprintf(stderr, "INST:%d ", cf->inst);
1826 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
1827 break;
1828 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1829 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1830 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1831 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1832 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1833 fprintf(stderr, "GPR:%X ", cf->output.gpr);
1834 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
1835 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
1836 fprintf(stderr, "TYPE:%X\n", cf->output.type);
1837 id++;
1838 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1839 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
1840 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
1841 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
1842 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
1843 fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
1844 fprintf(stderr, "INST:%d ", cf->output.inst);
1845 fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
1846 fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
1847 break;
1848 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1849 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1850 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1851 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1852 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1853 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1854 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1855 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1856 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1857 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1858 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1859 fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
1860 id++;
1861 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1862 fprintf(stderr, "INST:%d ", cf->inst);
1863 fprintf(stderr, "COND:%X ", cf->cond);
1864 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
1865 break;
1866 }
1867
1868 id = cf->addr;
1869 nliteral = 0;
1870 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1871 r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
1872
1873 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1874 fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
1875 fprintf(stderr, "REL:%d ", alu->src[0].rel);
1876 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
1877 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
1878 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
1879 fprintf(stderr, "REL:%d ", alu->src[1].rel);
1880 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
1881 fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
1882 fprintf(stderr, "LAST:%d)\n", alu->last);
1883 id++;
1884 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
1885 fprintf(stderr, "INST:%d ", alu->inst);
1886 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
1887 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
1888 fprintf(stderr, "REL:%d ", alu->dst.rel);
1889 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
1890 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
1891 if (alu->is_op3) {
1892 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
1893 fprintf(stderr, "REL:%d ", alu->src[2].rel);
1894 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
1895 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
1896 } else {
1897 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
1898 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
1899 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
1900 fprintf(stderr, "OMOD:%d ", alu->omod);
1901 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
1902 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
1903 }
1904
1905 id++;
1906 if (alu->last) {
1907 for (i = 0; i < nliteral; i++, id++) {
1908 float *f = (float*)(bc->bytecode + id);
1909 fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
1910 }
1911 id += nliteral & 1;
1912 nliteral = 0;
1913 }
1914 }
1915
1916 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1917 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1918 fprintf(stderr, "INST:%d ", tex->inst);
1919 fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
1920 fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
1921 fprintf(stderr, "REL:%d)\n", tex->src_rel);
1922 id++;
1923 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1924 fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
1925 fprintf(stderr, "REL:%d ", tex->dst_rel);
1926 fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
1927 fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
1928 fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
1929 fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
1930 fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
1931 fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
1932 fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
1933 fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
1934 fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
1935 id++;
1936 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1937 fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
1938 fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
1939 fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
1940 fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
1941 fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
1942 fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
1943 fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
1944 fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
1945 id++;
1946 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
1947 id++;
1948 }
1949
1950 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1951 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1952 fprintf(stderr, "INST:%d ", vtx->inst);
1953 fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
1954 fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
1955 id++;
1956 /* This assumes that no semantic fetches exist */
1957 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1958 fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
1959 fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
1960 if (bc->chip_class < CAYMAN)
1961 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
1962 else
1963 fprintf(stderr, "SEL_Y:%d) ", 0);
1964 fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
1965 fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
1966 fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
1967 fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
1968 fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
1969 fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
1970 fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
1971 fprintf(stderr, "NUM:%d ", vtx->num_format_all);
1972 fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
1973 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
1974 id++;
1975 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1976 fprintf(stderr, "ENDIAN:%d ", vtx->endian);
1977 fprintf(stderr, "OFFSET:%d\n", vtx->offset);
1978 /* TODO */
1979 id++;
1980 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
1981 id++;
1982 }
1983 }
1984
1985 fprintf(stderr, "--------------------------------------\n");
1986 }
1987
1988 static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
1989 unsigned *num_format, unsigned *format_comp, unsigned *endian)
1990 {
1991 const struct util_format_description *desc;
1992 unsigned i;
1993
1994 *format = 0;
1995 *num_format = 0;
1996 *format_comp = 0;
1997 *endian = ENDIAN_NONE;
1998
1999 desc = util_format_description(pformat);
2000 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2001 goto out_unknown;
2002 }
2003
2004 /* Find the first non-VOID channel. */
2005 for (i = 0; i < 4; i++) {
2006 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2007 break;
2008 }
2009 }
2010
2011 *endian = r600_endian_swap(desc->channel[i].size);
2012
2013 switch (desc->channel[i].type) {
2014 /* Half-floats, floats, ints */
2015 case UTIL_FORMAT_TYPE_FLOAT:
2016 switch (desc->channel[i].size) {
2017 case 16:
2018 switch (desc->nr_channels) {
2019 case 1:
2020 *format = FMT_16_FLOAT;
2021 break;
2022 case 2:
2023 *format = FMT_16_16_FLOAT;
2024 break;
2025 case 3:
2026 case 4:
2027 *format = FMT_16_16_16_16_FLOAT;
2028 break;
2029 }
2030 break;
2031 case 32:
2032 switch (desc->nr_channels) {
2033 case 1:
2034 *format = FMT_32_FLOAT;
2035 break;
2036 case 2:
2037 *format = FMT_32_32_FLOAT;
2038 break;
2039 case 3:
2040 *format = FMT_32_32_32_FLOAT;
2041 break;
2042 case 4:
2043 *format = FMT_32_32_32_32_FLOAT;
2044 break;
2045 }
2046 break;
2047 default:
2048 goto out_unknown;
2049 }
2050 break;
2051 /* Unsigned ints */
2052 case UTIL_FORMAT_TYPE_UNSIGNED:
2053 /* Signed ints */
2054 case UTIL_FORMAT_TYPE_SIGNED:
2055 switch (desc->channel[i].size) {
2056 case 8:
2057 switch (desc->nr_channels) {
2058 case 1:
2059 *format = FMT_8;
2060 break;
2061 case 2:
2062 *format = FMT_8_8;
2063 break;
2064 case 3:
2065 case 4:
2066 *format = FMT_8_8_8_8;
2067 break;
2068 }
2069 break;
2070 case 16:
2071 switch (desc->nr_channels) {
2072 case 1:
2073 *format = FMT_16;
2074 break;
2075 case 2:
2076 *format = FMT_16_16;
2077 break;
2078 case 3:
2079 case 4:
2080 *format = FMT_16_16_16_16;
2081 break;
2082 }
2083 break;
2084 case 32:
2085 switch (desc->nr_channels) {
2086 case 1:
2087 *format = FMT_32;
2088 break;
2089 case 2:
2090 *format = FMT_32_32;
2091 break;
2092 case 3:
2093 *format = FMT_32_32_32;
2094 break;
2095 case 4:
2096 *format = FMT_32_32_32_32;
2097 break;
2098 }
2099 break;
2100 default:
2101 goto out_unknown;
2102 }
2103 break;
2104 default:
2105 goto out_unknown;
2106 }
2107
2108 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2109 *format_comp = 1;
2110 }
2111 if (desc->channel[i].normalized) {
2112 *num_format = 0;
2113 } else {
2114 *num_format = 2;
2115 }
2116 return;
2117 out_unknown:
2118 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2119 }
2120
2121 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2122 {
2123 static int dump_shaders = -1;
2124
2125 struct r600_bc bc;
2126 struct r600_bc_vtx vtx;
2127 struct pipe_vertex_element *elements = ve->elements;
2128 const struct util_format_description *desc;
2129 unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
2130 unsigned format, num_format, format_comp, endian;
2131 u32 *bytecode;
2132 int i, r;
2133
2134 /* Vertex element offsets need special handling. If the offset is
2135 * bigger than what we can put in the fetch instruction we need to
2136 * alter the vertex resource offset. In order to simplify code we
2137 * will bind one resource per element in such cases. It's a worst
2138 * case scenario. */
2139 for (i = 0; i < ve->count; i++) {
2140 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
2141 if (ve->vbuffer_offset[i]) {
2142 ve->vbuffer_need_offset = 1;
2143 }
2144 }
2145
2146 memset(&bc, 0, sizeof(bc));
2147 r600_bc_init(&bc, rctx->chip_class);
2148
2149 for (i = 0; i < ve->count; i++) {
2150 if (elements[i].instance_divisor > 1) {
2151 struct r600_bc_alu alu;
2152
2153 memset(&alu, 0, sizeof(alu));
2154 alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2155 alu.src[0].sel = 0;
2156 alu.src[0].chan = 3;
2157
2158 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2159 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2160
2161 alu.dst.sel = i + 1;
2162 alu.dst.chan = 3;
2163 alu.dst.write = 1;
2164 alu.last = 1;
2165
2166 if ((r = r600_bc_add_alu(&bc, &alu))) {
2167 r600_bc_clear(&bc);
2168 return r;
2169 }
2170 }
2171 }
2172
2173 for (i = 0; i < ve->count; i++) {
2174 unsigned vbuffer_index;
2175 r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp, &endian);
2176 desc = util_format_description(ve->elements[i].src_format);
2177 if (desc == NULL) {
2178 r600_bc_clear(&bc);
2179 R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2180 return -EINVAL;
2181 }
2182
2183 /* see above for vbuffer_need_offset explanation */
2184 vbuffer_index = elements[i].vertex_buffer_index;
2185 memset(&vtx, 0, sizeof(vtx));
2186 vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start;
2187 vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2188 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2189 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2190 vtx.mega_fetch_count = 0x1F;
2191 vtx.dst_gpr = i + 1;
2192 vtx.dst_sel_x = desc->swizzle[0];
2193 vtx.dst_sel_y = desc->swizzle[1];
2194 vtx.dst_sel_z = desc->swizzle[2];
2195 vtx.dst_sel_w = desc->swizzle[3];
2196 vtx.data_format = format;
2197 vtx.num_format_all = num_format;
2198 vtx.format_comp_all = format_comp;
2199 vtx.srf_mode_all = 1;
2200 vtx.offset = elements[i].src_offset;
2201 vtx.endian = endian;
2202
2203 if ((r = r600_bc_add_vtx(&bc, &vtx))) {
2204 r600_bc_clear(&bc);
2205 return r;
2206 }
2207 }
2208
2209 r600_bc_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2210
2211 if ((r = r600_bc_build(&bc))) {
2212 r600_bc_clear(&bc);
2213 return r;
2214 }
2215
2216 if (dump_shaders == -1)
2217 dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2218
2219 if (dump_shaders) {
2220 fprintf(stderr, "--------------------------------------------------------------\n");
2221 r600_bc_dump(&bc);
2222 fprintf(stderr, "______________________________________________________________\n");
2223 }
2224
2225 ve->fs_size = bc.ndw*4;
2226
2227 /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
2228 ve->fetch_shader = r600_bo(rctx->radeon, ve->fs_size, 256, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_IMMUTABLE);
2229 if (ve->fetch_shader == NULL) {
2230 r600_bc_clear(&bc);
2231 return -ENOMEM;
2232 }
2233
2234 bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, rctx->ctx.cs, PIPE_TRANSFER_WRITE);
2235 if (bytecode == NULL) {
2236 r600_bc_clear(&bc);
2237 r600_bo_reference(&ve->fetch_shader, NULL);
2238 return -ENOMEM;
2239 }
2240
2241 if (R600_BIG_ENDIAN) {
2242 for (i = 0; i < ve->fs_size / 4; ++i) {
2243 bytecode[i] = bswap_32(bc.bytecode[i]);
2244 }
2245 } else {
2246 memcpy(bytecode, bc.bytecode, ve->fs_size);
2247 }
2248
2249 r600_bo_unmap(rctx->radeon, ve->fetch_shader);
2250 r600_bc_clear(&bc);
2251
2252 if (rctx->chip_class >= EVERGREEN)
2253 evergreen_fetch_shader(&rctx->context, ve);
2254 else
2255 r600_fetch_shader(&rctx->context, ve);
2256
2257 return 0;
2258 }