1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include <stdio.h>
24 #include <errno.h>
25 #include "util/u_format.h"
26 #include "util/u_memory.h"
27 #include "pipe/p_shader_tokens.h"
28 #include "r600_pipe.h"
29 #include "r600_sq.h"
30 #include "r600_opcodes.h"
31 #include "r600_asm.h"
32 #include "r600_formats.h"
33 #include "r600d.h"
34
35 #define NUM_OF_CYCLES 3
36 #define NUM_OF_COMPONENTS 4
37
38 #define PREV_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.prev, list)
39 #define NEXT_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)
40
41 static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
42 {
43 if(alu->is_op3)
44 return 3;
45
46 switch (bc->chiprev) {
47 case CHIPREV_R600:
48 case CHIPREV_R700:
49 switch (alu->inst) {
50 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
51 return 0;
52 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
53 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
54 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
55 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
56 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
57 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
58 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
59 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
60 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
61 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
62 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
63 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
64 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
65 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
66 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
67 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
68 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
69 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
70 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
71 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
72 return 2;
73
74 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
75 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
76 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
77 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
78 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
79 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
80 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
81 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
82 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
83 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
84 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
85 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
86 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
87 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
88 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
89 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
90 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
91 return 1;
92 default: R600_ERR(
93 "Need instruction operand number for 0x%x.\n", alu->inst);
94 }
95 break;
96 case CHIPREV_EVERGREEN:
97 switch (alu->inst) {
98 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
99 return 0;
100 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
101 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
102 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
103 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
104 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
105 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
106 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
107 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
108 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
109 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
110 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
111 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
112 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
113 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
114 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
115 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
116 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
117 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
118 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
119 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
120 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
121 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
122 return 2;
123
124 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
125 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
126 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
127 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
128 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
129 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
130 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
131 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
132 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
133 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
134 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
135 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
136 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
137 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
138 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
139 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
140 return 1;
141 default: R600_ERR(
142 "Need instruction operand number for 0x%x.\n", alu->inst);
143 }
144 break;
145 }
146
147 return 3;
148 }
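/* Worked example (illustrative, not from the original source): an OP2 MUL
 * has two source operands so this returns 2, a MOV returns 1, and any OP3
 * instruction such as MULADD short-circuits to 3 via the is_op3 check.
 * Unknown opcodes are reported and fall through to the conservative
 * default of 3. */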
149
150 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
151
152 static struct r600_bc_cf *r600_bc_cf(void)
153 {
154 struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);
155
156 if (cf == NULL)
157 return NULL;
158 LIST_INITHEAD(&cf->list);
159 LIST_INITHEAD(&cf->alu);
160 LIST_INITHEAD(&cf->vtx);
161 LIST_INITHEAD(&cf->tex);
162 cf->barrier = 1;
163 return cf;
164 }
165
166 static struct r600_bc_alu *r600_bc_alu(void)
167 {
168 struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);
169
170 if (alu == NULL)
171 return NULL;
172 LIST_INITHEAD(&alu->list);
173 return alu;
174 }
175
176 static struct r600_bc_vtx *r600_bc_vtx(void)
177 {
178 struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);
179
180 if (vtx == NULL)
181 return NULL;
182 LIST_INITHEAD(&vtx->list);
183 return vtx;
184 }
185
186 static struct r600_bc_tex *r600_bc_tex(void)
187 {
188 struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);
189
190 if (tex == NULL)
191 return NULL;
192 LIST_INITHEAD(&tex->list);
193 return tex;
194 }
195
196 int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
197 {
198 LIST_INITHEAD(&bc->cf);
199 bc->family = family;
200 switch (bc->family) {
201 case CHIP_R600:
202 case CHIP_RV610:
203 case CHIP_RV630:
204 case CHIP_RV670:
205 case CHIP_RV620:
206 case CHIP_RV635:
207 case CHIP_RS780:
208 case CHIP_RS880:
209 bc->chiprev = CHIPREV_R600;
210 break;
211 case CHIP_RV770:
212 case CHIP_RV730:
213 case CHIP_RV710:
214 case CHIP_RV740:
215 bc->chiprev = CHIPREV_R700;
216 break;
217 case CHIP_CEDAR:
218 case CHIP_REDWOOD:
219 case CHIP_JUNIPER:
220 case CHIP_CYPRESS:
221 case CHIP_HEMLOCK:
222 case CHIP_PALM:
223 case CHIP_BARTS:
224 case CHIP_TURKS:
225 case CHIP_CAICOS:
226 bc->chiprev = CHIPREV_EVERGREEN;
227 break;
228 default:
229 R600_ERR("unknown family %d\n", bc->family);
230 return -EINVAL;
231 }
232 return 0;
233 }
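/* Usage sketch (hypothetical caller, not part of this file); the chip
 * revision class is derived purely from the family:
 *
 *	struct r600_bc bc;
 *	memset(&bc, 0, sizeof(bc));
 *	if (r600_bc_init(&bc, CHIP_RV770) == 0)
 *		assert(bc.chiprev == CHIPREV_R700);
 *
 * assuming the caller zero-initializes the struct, as the counters and
 * pointers used throughout this file expect. */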
234
235 static int r600_bc_add_cf(struct r600_bc *bc)
236 {
237 struct r600_bc_cf *cf = r600_bc_cf();
238
239 if (cf == NULL)
240 return -ENOMEM;
241 LIST_ADDTAIL(&cf->list, &bc->cf);
242 if (bc->cf_last)
243 cf->id = bc->cf_last->id + 2;
244 bc->cf_last = cf;
245 bc->ncf++;
246 bc->ndw += 2;
247 bc->force_add_cf = 0;
248 return 0;
249 }
250
251 static void r600_bc_remove_cf(struct r600_bc *bc, struct r600_bc_cf *cf)
252 {
253 struct r600_bc_cf *other;
254 LIST_FOR_EACH_ENTRY(other, &bc->cf, list) {
255 if (other->id > cf->id)
256 other->id -= 2;
257 if (other->cf_addr > cf->id)
258 other->cf_addr -= 2;
259 }
260 LIST_DEL(&cf->list);
261 free(cf);
262 }
263
264 static void r600_bc_move_cf(struct r600_bc *bc, struct r600_bc_cf *cf, struct r600_bc_cf *next)
265 {
266 struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, next->list.prev, list);
267 unsigned old_id = cf->id;
268 unsigned new_id = next->list.prev == &bc->cf ? 0 : prev->id + 2;
269 struct r600_bc_cf *other;
270
271 if (prev == cf || next == cf)
272 return; /* position hasn't changed */
273
274 LIST_DEL(&cf->list);
275 LIST_FOR_EACH_ENTRY(other, &bc->cf, list) {
276 if (other->id > old_id)
277 other->id -= 2;
278 if (other->id >= new_id)
279 other->id += 2;
280 if (other->cf_addr > old_id)
281 other->cf_addr -= 2;
282 if (other->cf_addr > new_id)
283 other->cf_addr += 2;
284 }
285 cf->id = new_id;
286 LIST_ADD(&cf->list, &prev->list);
287 }
288
289 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
290 {
291 int r;
292
293 if (bc->cf_last && bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
294 output->type == bc->cf_last->output.type &&
295 output->elem_size == bc->cf_last->output.elem_size &&
296 output->swizzle_x == bc->cf_last->output.swizzle_x &&
297 output->swizzle_y == bc->cf_last->output.swizzle_y &&
298 output->swizzle_z == bc->cf_last->output.swizzle_z &&
299 output->swizzle_w == bc->cf_last->output.swizzle_w &&
300 (output->burst_count + bc->cf_last->output.burst_count) <= 16) {
301
302 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
303 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
304
305 bc->cf_last->output.gpr = output->gpr;
306 bc->cf_last->output.array_base = output->array_base;
307 bc->cf_last->output.burst_count += output->burst_count;
308 return 0;
309
310 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
311 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
312
313 bc->cf_last->output.burst_count += output->burst_count;
314 return 0;
315 }
316 }
317
318 r = r600_bc_add_cf(bc);
319 if (r)
320 return r;
321 bc->cf_last->inst = BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
322 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
323 bc->cf_last->output.burst_count = 1;
324 return 0;
325 }
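/* Illustrative example of the burst merging above (assumed values): two
 * otherwise identical exports, one with gpr 0 / array_base 60 and a second
 * with gpr 1 / array_base 61, collapse into a single EXPORT CF with
 * burst_count == 2 instead of costing another CF slot. The two branches
 * cover the new export landing directly before or directly after the
 * existing burst range. */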
326
327 /* alu predicate instructions */
328 static int is_alu_pred_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
329 {
330 switch (bc->chiprev) {
331 case CHIPREV_R600:
332 case CHIPREV_R700:
333 return !alu->is_op3 && (
334 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
335 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
336 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
337 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
338 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
339 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
340 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
341 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
342 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
343 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
344 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
345 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
346 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
347 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
348 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
349 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
350 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
351 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
352 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
353 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
354 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
355 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
356 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
357 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
358 case CHIPREV_EVERGREEN:
359 default:
360 return !alu->is_op3 && (
361 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
362 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
363 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
364 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
365 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
366 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
367 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
368 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
369 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
370 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
371 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
372 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
373 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
374 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
375 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
376 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
377 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
378 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
379 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
380 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
381 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
382 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
383 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
384 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
385 }
386 }
387
388 /* alu kill instructions */
389 static int is_alu_kill_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
390 {
391 switch (bc->chiprev) {
392 case CHIPREV_R600:
393 case CHIPREV_R700:
394 return !alu->is_op3 && (
395 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
396 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
397 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
398 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
399 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
400 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
401 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
402 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
403 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
404 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
405 case CHIPREV_EVERGREEN:
406 default:
407 return !alu->is_op3 && (
408 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
409 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
410 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
411 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
412 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
413 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
414 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
415 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
416 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
417 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
418 }
419 }
420
421 /* alu instructions that can only exist once per group */
422 static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
423 {
424 return is_alu_kill_inst(bc, alu) ||
425 is_alu_pred_inst(bc, alu);
426 }
427
428 static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
429 {
430 switch (bc->chiprev) {
431 case CHIPREV_R600:
432 case CHIPREV_R700:
433 return !alu->is_op3 && (
434 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
435 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
436 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
437 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
438 case CHIPREV_EVERGREEN:
439 default:
440 return !alu->is_op3 && (
441 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
442 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
443 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
444 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
445 }
446 }
447
448 static int is_alu_cube_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
449 {
450 switch (bc->chiprev) {
451 case CHIPREV_R600:
452 case CHIPREV_R700:
453 return !alu->is_op3 &&
454 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
455 case CHIPREV_EVERGREEN:
456 default:
457 return !alu->is_op3 &&
458 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
459 }
460 }
461
462 static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
463 {
464 switch (bc->chiprev) {
465 case CHIPREV_R600:
466 case CHIPREV_R700:
467 return !alu->is_op3 && (
468 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
469 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
470 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
471 case CHIPREV_EVERGREEN:
472 default:
473 return !alu->is_op3 && (
474 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
475 }
476 }
477
478 /* alu instructions that can only execute on the vector unit */
479 static int is_alu_vec_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
480 {
481 return is_alu_reduction_inst(bc, alu) ||
482 is_alu_mova_inst(bc, alu);
483 }
484
485 /* alu instructions that can only execute on the trans unit */
486 static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
487 {
488 switch (bc->chiprev) {
489 case CHIPREV_R600:
490 case CHIPREV_R700:
491 if (!alu->is_op3)
492 return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
493 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
494 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
495 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
496 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
497 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
498 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
499 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
500 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
501 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
502 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
503 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
504 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
505 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
506 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
507 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
508 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
509 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
510 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
511 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
512 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
513 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
514 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
515 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
516 else
517 return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
518 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
519 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
520 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
521 case CHIPREV_EVERGREEN:
522 default:
523 if (!alu->is_op3)
524 /* Note that FLT_TO_INT* instructions are vector instructions
525 * on Evergreen, despite what the documentation says. */
526 return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
527 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
528 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
529 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
530 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
531 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
532 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
533 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
534 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
535 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
536 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
537 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
538 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
539 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
540 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
541 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
542 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
543 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
544 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
545 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
546 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
547 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
548 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
549 else
550 return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
551 }
552 }
553
554 /* alu instructions that can execute on any unit */
555 static int is_alu_any_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
556 {
557 return !is_alu_vec_unit_inst(bc, alu) &&
558 !is_alu_trans_unit_inst(bc, alu);
559 }
560
561 static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
562 struct r600_bc_alu *assignment[5])
563 {
564 struct r600_bc_alu *alu;
565 unsigned i, chan, trans;
566
567 for (i = 0; i < 5; i++)
568 assignment[i] = NULL;
569
570 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
571 chan = alu->dst.chan;
572 if (is_alu_trans_unit_inst(bc, alu))
573 trans = 1;
574 else if (is_alu_vec_unit_inst(bc, alu))
575 trans = 0;
576 else if (assignment[chan])
577 trans = 1; // assume ALU_INST_PREFER_VECTOR
578 else
579 trans = 0;
580
581 if (trans) {
582 if (assignment[4]) {
583 assert(0); //ALU.Trans has already been allocated
584 return -1;
585 }
586 assignment[4] = alu;
587 } else {
588 if (assignment[chan]) {
589 assert(0); //ALU.chan has already been allocated
590 return -1;
591 }
592 assignment[chan] = alu;
593 }
594
595 if (alu->last)
596 break;
597 }
598 return 0;
599 }
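/* The assignment array filled in above has a fixed meaning per index:
 * slots 0-3 hold the instructions issued on the vector units for channels
 * x, y, z and w (indexed by destination channel), slot 4 holds the trans
 * unit instruction; a NULL entry just means that unit is idle in this
 * group. */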
600
601 struct alu_bank_swizzle {
602 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
603 int hw_cfile_addr[4];
604 int hw_cfile_elem[4];
605 };
606
607 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
608 [SQ_ALU_VEC_012] = { 0, 1, 2 },
609 [SQ_ALU_VEC_021] = { 0, 2, 1 },
610 [SQ_ALU_VEC_120] = { 1, 2, 0 },
611 [SQ_ALU_VEC_102] = { 1, 0, 2 },
612 [SQ_ALU_VEC_201] = { 2, 0, 1 },
613 [SQ_ALU_VEC_210] = { 2, 1, 0 }
614 };
615
616 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
617 [SQ_ALU_SCL_210] = { 2, 1, 0 },
618 [SQ_ALU_SCL_122] = { 1, 2, 2 },
619 [SQ_ALU_SCL_212] = { 2, 1, 2 },
620 [SQ_ALU_SCL_221] = { 2, 2, 1 }
621 };
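/* Example reading of the tables above: with SQ_ALU_VEC_102 the first
 * source operand of a vector instruction is fetched in cycle 1, the second
 * in cycle 0 and the third in cycle 2. check_vector() and check_scalar()
 * below use these cycles to detect GPR read port conflicts inside an
 * instruction group. */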
622
623 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
624 {
625 int i, cycle, component;
626 /* set up gpr use */
627 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
628 for (component = 0; component < NUM_OF_COMPONENTS; component++)
629 bs->hw_gpr[cycle][component] = -1;
630 for (i = 0; i < 4; i++)
631 bs->hw_cfile_addr[i] = -1;
632 for (i = 0; i < 4; i++)
633 bs->hw_cfile_elem[i] = -1;
634 }
635
636 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
637 {
638 if (bs->hw_gpr[cycle][chan] == -1)
639 bs->hw_gpr[cycle][chan] = sel;
640 else if (bs->hw_gpr[cycle][chan] != (int)sel) {
641 // Another scalar operation has already used GPR read port for channel
642 return -1;
643 }
644 return 0;
645 }
646
647 static int reserve_cfile(struct r600_bc *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
648 {
649 int res, num_res = 4;
650 if (bc->chiprev >= CHIPREV_R700) {
651 num_res = 2;
652 chan /= 2;
653 }
654 for (res = 0; res < num_res; ++res) {
655 if (bs->hw_cfile_addr[res] == -1) {
656 bs->hw_cfile_addr[res] = sel;
657 bs->hw_cfile_elem[res] = chan;
658 return 0;
659 } else if (bs->hw_cfile_addr[res] == sel &&
660 bs->hw_cfile_elem[res] == chan)
661 return 0; // Read for this scalar element already reserved, nothing to do here.
662 }
663 // All cfile read ports are used, cannot reference vector element
664 return -1;
665 }
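/* Note on the r700+ path above: halving the channel and checking only two
 * reservation slots reflects that r7xx apparently exposes two constant
 * file read ports, each serving a pair of channels, where r6xx had four
 * independent ones. */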
666
667 static int is_gpr(unsigned sel)
668 {
669 return (sel >= 0 && sel <= 127);
670 }
671
672 /* CB constants start at 512, and get translated to a kcache index when ALU
673 * clauses are constructed. Note that we handle kcache constants the same way
674  * as (the now gone) cfile constants; is that really required? */
675 static int is_cfile(unsigned sel)
676 {
677 return (sel > 255 && sel < 512) ||
678 (sel > 511 && sel < 4607) || // Kcache before translate
679 (sel > 127 && sel < 192); // Kcache after translate
680 }
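/* Summary of the source selector ranges the helpers here rely on:
 *
 *	0 .. 127	GPR
 *	128 .. 191	kcache constant, after translation
 *	256 .. 511	constant file
 *	512 ..		CB constant, before kcache translation
 *
 * plus the inline values V_SQ_ALU_SRC_0 .. V_SQ_ALU_SRC_LITERAL accepted
 * by is_const() below. */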
681
682 static int is_const(int sel)
683 {
684 return is_cfile(sel) ||
685 (sel >= V_SQ_ALU_SRC_0 &&
686 sel <= V_SQ_ALU_SRC_LITERAL);
687 }
688
689 static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
690 struct alu_bank_swizzle *bs, int bank_swizzle)
691 {
692 int r, src, num_src, sel, elem, cycle;
693
694 num_src = r600_bc_get_num_operands(bc, alu);
695 for (src = 0; src < num_src; src++) {
696 sel = alu->src[src].sel;
697 elem = alu->src[src].chan;
698 if (is_gpr(sel)) {
699 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
700 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
701 // Nothing to do; special-case optimization,
702 			// second source uses first source's reservation
703 continue;
704 else {
705 r = reserve_gpr(bs, sel, elem, cycle);
706 if (r)
707 return r;
708 }
709 } else if (is_cfile(sel)) {
710 r = reserve_cfile(bc, bs, sel, elem);
711 if (r)
712 return r;
713 }
714 // No restrictions on PV, PS, literal or special constants
715 }
716 return 0;
717 }
718
719 static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
720 struct alu_bank_swizzle *bs, int bank_swizzle)
721 {
722 int r, src, num_src, const_count, sel, elem, cycle;
723
724 num_src = r600_bc_get_num_operands(bc, alu);
725 for (const_count = 0, src = 0; src < num_src; ++src) {
726 sel = alu->src[src].sel;
727 elem = alu->src[src].chan;
728 if (is_const(sel)) { // Any constant, including literal and inline constants
729 if (const_count >= 2)
730 // More than two references to a constant in
731 // transcendental operation.
732 return -1;
733 else
734 const_count++;
735 }
736 if (is_cfile(sel)) {
737 r = reserve_cfile(bc, bs, sel, elem);
738 if (r)
739 return r;
740 }
741 }
742 for (src = 0; src < num_src; ++src) {
743 sel = alu->src[src].sel;
744 elem = alu->src[src].chan;
745 if (is_gpr(sel)) {
746 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
747 if (cycle < const_count)
748 // Cycle for GPR load conflicts with
749 // constant load in transcendental operation.
750 return -1;
751 r = reserve_gpr(bs, sel, elem, cycle);
752 if (r)
753 return r;
754 }
755 // Constants already processed
756 // No restrictions on PV, PS
757 }
758 return 0;
759 }
760
761 static int check_and_set_bank_swizzle(struct r600_bc *bc,
762 struct r600_bc_alu *slots[5])
763 {
764 struct alu_bank_swizzle bs;
765 int bank_swizzle[5];
766 int i, r = 0, forced = 0;
767
768 for (i = 0; i < 5; i++)
769 if (slots[i] && slots[i]->bank_swizzle_force) {
770 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
771 forced = 1;
772 }
773
774 if (forced)
775 return 0;
776
777 // just check every possible combination of bank swizzle
778 	// not very efficient, but works on the first try in most cases
779 for (i = 0; i < 4; i++)
780 bank_swizzle[i] = SQ_ALU_VEC_012;
781 bank_swizzle[4] = SQ_ALU_SCL_210;
782 while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
783 init_bank_swizzle(&bs);
784 for (i = 0; i < 4; i++) {
785 if (slots[i]) {
786 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
787 if (r)
788 break;
789 }
790 }
791 if (!r && slots[4]) {
792 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
793 }
794 if (!r) {
795 for (i = 0; i < 5; i++) {
796 if (slots[i])
797 slots[i]->bank_swizzle = bank_swizzle[i];
798 }
799 return 0;
800 }
801
802 for (i = 0; i < 5; i++) {
803 bank_swizzle[i]++;
804 if (bank_swizzle[i] <= SQ_ALU_VEC_210)
805 break;
806 else
807 bank_swizzle[i] = SQ_ALU_VEC_012;
808 }
809 }
810
811 // couldn't find a working swizzle
812 return -1;
813 }
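/* The loop above enumerates the swizzle combinations like an odometer:
 * each vector slot counts from SQ_ALU_VEC_012 up to SQ_ALU_VEC_210 before
 * carrying into the next slot, with the scalar swizzle in bank_swizzle[4]
 * as the most significant digit, so every combination is tried exactly
 * once before we give up. */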
814
815 static int replace_gpr_with_pv_ps(struct r600_bc *bc,
816 struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
817 {
818 struct r600_bc_alu *prev[5];
819 int gpr[5], chan[5];
820 int i, j, r, src, num_src;
821
822 r = assign_alu_units(bc, alu_prev, prev);
823 if (r)
824 return r;
825
826 for (i = 0; i < 5; ++i) {
827 if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
828 gpr[i] = prev[i]->dst.sel;
829 /* cube writes more than PV.X */
830 if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
831 chan[i] = 0;
832 else
833 chan[i] = prev[i]->dst.chan;
834 } else
835 gpr[i] = -1;
836 }
837
838 for (i = 0; i < 5; ++i) {
839 struct r600_bc_alu *alu = slots[i];
840 if(!alu)
841 continue;
842
843 num_src = r600_bc_get_num_operands(bc, alu);
844 for (src = 0; src < num_src; ++src) {
845 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
846 continue;
847
848 if (alu->src[src].sel == gpr[4] &&
849 alu->src[src].chan == chan[4]) {
850 alu->src[src].sel = V_SQ_ALU_SRC_PS;
851 alu->src[src].chan = 0;
852 continue;
853 }
854
855 for (j = 0; j < 4; ++j) {
856 if (alu->src[src].sel == gpr[j] &&
857 alu->src[src].chan == j) {
858 alu->src[src].sel = V_SQ_ALU_SRC_PV;
859 alu->src[src].chan = chan[j];
860 break;
861 }
862 }
863 }
864 }
865
866 return 0;
867 }
868
869 void r600_bc_special_constants(u32 value, unsigned *sel, unsigned *neg)
870 {
871 switch(value) {
872 case 0:
873 *sel = V_SQ_ALU_SRC_0;
874 break;
875 case 1:
876 *sel = V_SQ_ALU_SRC_1_INT;
877 break;
878 case -1:
879 *sel = V_SQ_ALU_SRC_M_1_INT;
880 break;
881 case 0x3F800000: // 1.0f
882 *sel = V_SQ_ALU_SRC_1;
883 break;
884 case 0x3F000000: // 0.5f
885 *sel = V_SQ_ALU_SRC_0_5;
886 break;
887 case 0xBF800000: // -1.0f
888 *sel = V_SQ_ALU_SRC_1;
889 *neg ^= 1;
890 break;
891 case 0xBF000000: // -0.5f
892 *sel = V_SQ_ALU_SRC_0_5;
893 *neg ^= 1;
894 break;
895 default:
896 *sel = V_SQ_ALU_SRC_LITERAL;
897 break;
898 }
899 }
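/* Worked example (illustrative): a literal of 0xBF000000 (-0.5f) becomes
 * the inline constant V_SQ_ALU_SRC_0_5 with the negate bit toggled, so no
 * literal dword needs to be emitted; a value like 0x40000000 (2.0f) has no
 * inline equivalent and stays V_SQ_ALU_SRC_LITERAL. */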
900
901 /* compute how many literals are needed */
902 static int r600_bc_alu_nliterals(struct r600_bc *bc, struct r600_bc_alu *alu,
903 uint32_t literal[4], unsigned *nliteral)
904 {
905 unsigned num_src = r600_bc_get_num_operands(bc, alu);
906 unsigned i, j;
907
908 for (i = 0; i < num_src; ++i) {
909 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
910 uint32_t value = alu->src[i].value;
911 unsigned found = 0;
912 for (j = 0; j < *nliteral; ++j) {
913 if (literal[j] == value) {
914 found = 1;
915 break;
916 }
917 }
918 if (!found) {
919 if (*nliteral >= 4)
920 return -EINVAL;
921 literal[(*nliteral)++] = value;
922 }
923 }
924 }
925 return 0;
926 }
927
928 static void r600_bc_alu_adjust_literals(struct r600_bc *bc,
929 struct r600_bc_alu *alu,
930 uint32_t literal[4], unsigned nliteral)
931 {
932 unsigned num_src = r600_bc_get_num_operands(bc, alu);
933 unsigned i, j;
934
935 for (i = 0; i < num_src; ++i) {
936 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
937 uint32_t value = alu->src[i].value;
938 for (j = 0; j < nliteral; ++j) {
939 if (literal[j] == value) {
940 alu->src[i].chan = j;
941 break;
942 }
943 }
944 }
945 }
946 }
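/* After this pass a literal source no longer identifies its value
 * directly: src.chan indexes into the (at most four) literal dwords
 * appended to the instruction group, which r600_bc_add_alu_type() pads to
 * an even count via align(nliteral, 2). */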
947
948 static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
949 struct r600_bc_alu *alu_prev)
950 {
951 struct r600_bc_alu *prev[5];
952 struct r600_bc_alu *result[5] = { NULL };
953
954 uint32_t literal[4], prev_literal[4];
955 unsigned nliteral = 0, prev_nliteral = 0;
956
957 int i, j, r, src, num_src;
958 int num_once_inst = 0;
959 int have_mova = 0, have_rel = 0;
960
961 r = assign_alu_units(bc, alu_prev, prev);
962 if (r)
963 return r;
964
965 for (i = 0; i < 5; ++i) {
966 struct r600_bc_alu *alu;
967
968 /* check number of literals */
969 if (prev[i]) {
970 if (r600_bc_alu_nliterals(bc, prev[i], literal, &nliteral))
971 return 0;
972 if (r600_bc_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
973 return 0;
974 if (is_alu_mova_inst(bc, prev[i])) {
975 if (have_rel)
976 return 0;
977 have_mova = 1;
978 }
979 num_once_inst += is_alu_once_inst(bc, prev[i]);
980 }
981 if (slots[i] && r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral))
982 return 0;
983
984 // let's check used slots
985 if (prev[i] && !slots[i]) {
986 result[i] = prev[i];
987 continue;
988 } else if (prev[i] && slots[i]) {
989 if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
990 				// trans unit is still free, try to use it
991 if (is_alu_any_unit_inst(bc, slots[i])) {
992 result[i] = prev[i];
993 result[4] = slots[i];
994 } else if (is_alu_any_unit_inst(bc, prev[i])) {
995 result[i] = slots[i];
996 result[4] = prev[i];
997 } else
998 return 0;
999 } else
1000 return 0;
1001 } else if(!slots[i]) {
1002 continue;
1003 } else
1004 result[i] = slots[i];
1005
1006 // let's check source gprs
1007 alu = slots[i];
1008 num_once_inst += is_alu_once_inst(bc, alu);
1009
1010 num_src = r600_bc_get_num_operands(bc, alu);
1011 for (src = 0; src < num_src; ++src) {
1012 if (alu->src[src].rel) {
1013 if (have_mova)
1014 return 0;
1015 have_rel = 1;
1016 }
1017
1018 			// constants don't matter
1019 if (!is_gpr(alu->src[src].sel))
1020 continue;
1021
1022 for (j = 0; j < 5; ++j) {
1023 if (!prev[j] || !prev[j]->dst.write)
1024 continue;
1025
1026 				// if it's relative then we can't determine which gpr is really used
1027 if (prev[j]->dst.chan == alu->src[src].chan &&
1028 (prev[j]->dst.sel == alu->src[src].sel ||
1029 prev[j]->dst.rel || alu->src[src].rel))
1030 return 0;
1031 }
1032 }
1033 }
1034
1035 /* more than one PRED_ or KILL_ ? */
1036 if (num_once_inst > 1)
1037 return 0;
1038
1039 	/* check if the result can still be bank swizzled */
1040 r = check_and_set_bank_swizzle(bc, result);
1041 if (r)
1042 return 0;
1043
1044 /* looks like everything worked out right, apply the changes */
1045
1046 	/* undo adding previous literals */
1047 bc->cf_last->ndw -= align(prev_nliteral, 2);
1048
1049 /* sort instructions */
1050 for (i = 0; i < 5; ++i) {
1051 slots[i] = result[i];
1052 if (result[i]) {
1053 LIST_DEL(&result[i]->list);
1054 result[i]->last = 0;
1055 LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
1056 }
1057 }
1058
1059 /* determine new last instruction */
1060 LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;
1061
1062 /* determine new first instruction */
1063 for (i = 0; i < 5; ++i) {
1064 if (result[i]) {
1065 bc->cf_last->curr_bs_head = result[i];
1066 break;
1067 }
1068 }
1069
1070 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
1071 bc->cf_last->prev2_bs_head = NULL;
1072
1073 return 0;
1074 }
1075
1076 /* This code handles kcache lines as single blocks of 32 constants. We could
1077 * probably do slightly better by recognizing that we actually have two
1078 * consecutive lines of 16 constants, but the resulting code would also be
1079 * somewhat more complicated. */
1080 static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
1081 {
1082 struct r600_bc_kcache *kcache = bc->cf_last->kcache;
1083 unsigned int required_lines;
1084 unsigned int free_lines = 0;
1085 unsigned int cache_line[3];
1086 unsigned int count = 0;
1087 unsigned int i, j;
1088 int r;
1089
1090 /* Collect required cache lines. */
1091 for (i = 0; i < 3; ++i) {
1092 bool found = false;
1093 unsigned int line;
1094
1095 if (alu->src[i].sel < 512)
1096 continue;
1097
1098 line = ((alu->src[i].sel - 512) / 32) * 2;
1099
1100 for (j = 0; j < count; ++j) {
1101 if (cache_line[j] == line) {
1102 found = true;
1103 break;
1104 }
1105 }
1106
1107 if (!found)
1108 cache_line[count++] = line;
1109 }
1110
1111 /* This should never actually happen. */
1112 if (count >= 3) return -ENOMEM;
1113
1114 for (i = 0; i < 2; ++i) {
1115 if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
1116 ++free_lines;
1117 }
1118 }
1119
1120 	/* Filter lines pulled in by previous instructions. Note that this is
1121 	 * only for the required_lines count; we can't remove these from the
1122 * cache_line array since we may have to start a new ALU clause. */
1123 for (i = 0, required_lines = count; i < count; ++i) {
1124 for (j = 0; j < 2; ++j) {
1125 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1126 kcache[j].addr == cache_line[i]) {
1127 --required_lines;
1128 break;
1129 }
1130 }
1131 }
1132
1133 /* Start a new ALU clause if needed. */
1134 if (required_lines > free_lines) {
1135 if ((r = r600_bc_add_cf(bc))) {
1136 return r;
1137 }
1138 bc->cf_last->inst = (type << 3);
1139 kcache = bc->cf_last->kcache;
1140 }
1141
1142 	/* Set up the kcache lines. */
1143 for (i = 0; i < count; ++i) {
1144 bool found = false;
1145
1146 for (j = 0; j < 2; ++j) {
1147 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1148 kcache[j].addr == cache_line[i]) {
1149 found = true;
1150 break;
1151 }
1152 }
1153
1154 if (found) continue;
1155
1156 for (j = 0; j < 2; ++j) {
1157 if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
1158 kcache[j].bank = 0;
1159 kcache[j].addr = cache_line[i];
1160 kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
1161 break;
1162 }
1163 }
1164 }
1165
1166 /* Alter the src operands to refer to the kcache. */
1167 for (i = 0; i < 3; ++i) {
1168 static const unsigned int base[] = {128, 160, 256, 288};
1169 unsigned int line;
1170
1171 if (alu->src[i].sel < 512)
1172 continue;
1173
1174 alu->src[i].sel -= 512;
1175 line = (alu->src[i].sel / 32) * 2;
1176
1177 for (j = 0; j < 2; ++j) {
1178 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1179 kcache[j].addr == line) {
1180 alu->src[i].sel &= 0x1f;
1181 alu->src[i].sel += base[j];
1182 break;
1183 }
1184 }
1185 }
1186
1187 return 0;
1188 }
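/* Worked example of the translation above (assumed selector): a CB
 * constant with sel == 515 needs cache line ((515 - 512) / 32) * 2 == 0.
 * Once that line is locked into kcache[0], the operand becomes
 * ((515 - 512) & 0x1f) + base[0] == 3 + 128 == 131, which is_cfile()
 * recognizes as a translated kcache constant. */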
1189
1190 int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
1191 {
1192 struct r600_bc_alu *nalu = r600_bc_alu();
1193 struct r600_bc_alu *lalu;
1194 int i, r;
1195
1196 if (nalu == NULL)
1197 return -ENOMEM;
1198 memcpy(nalu, alu, sizeof(struct r600_bc_alu));
1199
1200 if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
1201 /* check if we could add it anyway */
1202 if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
1203 type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
1204 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1205 if (lalu->predicate) {
1206 bc->force_add_cf = 1;
1207 break;
1208 }
1209 }
1210 } else
1211 bc->force_add_cf = 1;
1212 }
1213
1214 	/* a CF can contain only ALU, only VTX or only TEX instructions */
1215 if (bc->cf_last == NULL || bc->force_add_cf) {
1216 r = r600_bc_add_cf(bc);
1217 if (r) {
1218 free(nalu);
1219 return r;
1220 }
1221 }
1222 bc->cf_last->inst = (type << 3);
1223
1224 	/* Set up the kcache for this ALU instruction. This will start a new
1225 * ALU clause if needed. */
1226 if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
1227 free(nalu);
1228 return r;
1229 }
1230
1231 if (!bc->cf_last->curr_bs_head) {
1232 bc->cf_last->curr_bs_head = nalu;
1233 }
1234 	/* ngpr == the highest gpr used in any alu, plus one */
1235 for (i = 0; i < 3; i++) {
1236 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1237 bc->ngpr = nalu->src[i].sel + 1;
1238 }
1239 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1240 r600_bc_special_constants(nalu->src[i].value,
1241 &nalu->src[i].sel, &nalu->src[i].neg);
1242 }
1243 if (nalu->dst.sel >= bc->ngpr) {
1244 bc->ngpr = nalu->dst.sel + 1;
1245 }
1246 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1247 	/* each alu instruction uses 2 dwords */
1248 bc->cf_last->ndw += 2;
1249 bc->ndw += 2;
1250
1251 /* process cur ALU instructions for bank swizzle */
1252 if (nalu->last) {
1253 uint32_t literal[4];
1254 unsigned nliteral;
1255 struct r600_bc_alu *slots[5];
1256 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1257 if (r)
1258 return r;
1259
1260 if (bc->cf_last->prev_bs_head) {
1261 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1262 if (r)
1263 return r;
1264 }
1265
1266 if (bc->cf_last->prev_bs_head) {
1267 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1268 if (r)
1269 return r;
1270 }
1271
1272 r = check_and_set_bank_swizzle(bc, slots);
1273 if (r)
1274 return r;
1275
1276 for (i = 0, nliteral = 0; i < 5; i++) {
1277 if (slots[i]) {
1278 r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
1279 if (r)
1280 return r;
1281 }
1282 }
1283 bc->cf_last->ndw += align(nliteral, 2);
1284
1285 		/* at most 128 slots; adding one alu can add 5 slots + 4 constants
1286 		 * (2 slots) in the worst case */
1287 if ((bc->cf_last->ndw >> 1) >= 120) {
1288 bc->force_add_cf = 1;
1289 }
1290
1291 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1292 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1293 bc->cf_last->curr_bs_head = NULL;
1294 }
1295 return 0;
1296 }
1297
1298 int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
1299 {
1300 return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1301 }
1302
1303 static void r600_bc_remove_alu(struct r600_bc_cf *cf, struct r600_bc_alu *alu)
1304 {
1305 if (alu->last && alu->list.prev != &cf->alu) {
1306 PREV_ALU(alu)->last = 1;
1307 }
1308 LIST_DEL(&alu->list);
1309 free(alu);
1310 cf->ndw -= 2;
1311 }
1312
1313 int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
1314 {
1315 struct r600_bc_vtx *nvtx = r600_bc_vtx();
1316 int r;
1317
1318 if (nvtx == NULL)
1319 return -ENOMEM;
1320 memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));
1321
1322 	/* a CF can contain only ALU, only VTX or only TEX instructions */
1323 if (bc->cf_last == NULL ||
1324 (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1325 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) ||
1326 bc->force_add_cf) {
1327 r = r600_bc_add_cf(bc);
1328 if (r) {
1329 free(nvtx);
1330 return r;
1331 }
1332 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1333 }
1334 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1335 	/* each fetch uses 4 dwords */
1336 bc->cf_last->ndw += 4;
1337 bc->ndw += 4;
1338 if ((bc->cf_last->ndw / 4) > 7)
1339 bc->force_add_cf = 1;
1340 return 0;
1341 }
1342
1343 int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
1344 {
1345 struct r600_bc_tex *ntex = r600_bc_tex();
1346 int r;
1347
1348 if (ntex == NULL)
1349 return -ENOMEM;
1350 memcpy(ntex, tex, sizeof(struct r600_bc_tex));
1351
1352 	/* we can't fetch data and use it as a texture lookup address in the same TEX clause */
1353 if (bc->cf_last != NULL &&
1354 bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
1355 struct r600_bc_tex *ttex;
1356 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1357 if (ttex->dst_gpr == ntex->src_gpr) {
1358 bc->force_add_cf = 1;
1359 break;
1360 }
1361 }
1362 }
1363
1364 	/* a CF can contain only ALU, only VTX or only TEX instructions */
1365 if (bc->cf_last == NULL ||
1366 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
1367 bc->force_add_cf) {
1368 r = r600_bc_add_cf(bc);
1369 if (r) {
1370 free(ntex);
1371 return r;
1372 }
1373 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
1374 }
1375 if (ntex->src_gpr >= bc->ngpr) {
1376 bc->ngpr = ntex->src_gpr + 1;
1377 }
1378 if (ntex->dst_gpr >= bc->ngpr) {
1379 bc->ngpr = ntex->dst_gpr + 1;
1380 }
1381 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1382 	/* each texture fetch uses 4 dwords */
1383 bc->cf_last->ndw += 4;
1384 bc->ndw += 4;
1385 if ((bc->cf_last->ndw / 4) > 7)
1386 bc->force_add_cf = 1;
1387 return 0;
1388 }
1389
1390 int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
1391 {
1392 int r;
1393 r = r600_bc_add_cf(bc);
1394 if (r)
1395 return r;
1396
1397 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1398 bc->cf_last->inst = inst;
1399 return 0;
1400 }
1401
1402 /* common to all 3 families */
1403 static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
1404 {
1405 unsigned fetch_resource_start = 0;
1406
1407 	/* check if this is a fetch shader */
1408 	/* a fetch shader can also access vertex resources;
1409 	 * the first fetch shader resource is at 160
1410 	 */
1411 if (bc->type == -1) {
1412 switch (bc->chiprev) {
1413 /* r600 */
1414 case CHIPREV_R600:
1415 /* r700 */
1416 case CHIPREV_R700:
1417 fetch_resource_start = 160;
1418 break;
1419 /* evergreen */
1420 case CHIPREV_EVERGREEN:
1421 fetch_resource_start = 0;
1422 break;
1423 default:
1424 fprintf(stderr, "%s:%s:%d unknown chiprev %d\n",
1425 __FILE__, __func__, __LINE__, bc->chiprev);
1426 break;
1427 }
1428 }
1429 bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id + fetch_resource_start) |
1430 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1431 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1432 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x) |
1433 S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1434 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1435 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1436 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1437 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1438 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1439 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1440 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1441 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1442 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1443 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1444 bc->bytecode[id++] = S_SQ_VTX_WORD2_MEGA_FETCH(1);
1445 bc->bytecode[id++] = 0;
1446 return 0;
1447 }
1448
1449 /* common to all 3 families */
1450 static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
1451 {
1452 bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1453 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1454 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1455 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1456 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1457 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1458 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1459 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1460 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1461 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1462 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1463 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1464 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1465 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1466 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1467 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1468 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1469 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1470 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1471 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1472 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1473 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1474 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1475 bc->bytecode[id++] = 0;
1476 return 0;
1477 }
1478
1479 /* r600 only, r700/eg bits in r700_asm.c */
1480 static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
1481 {
1482 	/* don't replace the gpr with pv or ps for the destination register */
1483 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1484 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1485 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1486 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1487 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1488 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1489 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1490 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1491 S_SQ_ALU_WORD0_LAST(alu->last);
1492
1493 if (alu->is_op3) {
1494 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1495 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1496 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1497 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1498 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1499 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1500 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1501 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1502 S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1503 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1504 } else {
1505 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1506 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1507 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1508 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1509 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1510 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1511 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1512 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1513 S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1514 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1515 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
1516 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
1517 }
1518 return 0;
1519 }
1520
1521 enum cf_class
1522 {
1523 CF_CLASS_ALU,
1524 CF_CLASS_TEXTURE,
1525 CF_CLASS_VERTEX,
1526 CF_CLASS_EXPORT,
1527 CF_CLASS_OTHER
1528 };
1529
1530 static enum cf_class r600_bc_cf_class(struct r600_bc_cf *cf)
1531 {
1532 switch (cf->inst) {
1533 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1534 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1535 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1536 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1537 return CF_CLASS_ALU;
1538
1539 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1540 return CF_CLASS_TEXTURE;
1541
1542 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1543 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1544 return CF_CLASS_VERTEX;
1545
1546 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1547 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1548 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1549 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1550 return CF_CLASS_EXPORT;
1551
1552 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1553 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1554 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1555 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1556 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1557 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1558 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1559 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1560 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1561 return CF_CLASS_OTHER;
1562
1563 default:
1564 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1565 return -EINVAL;
1566 }
1567 }
1568
1569 /* common for r600/r700 - eg in eg_asm.c */
1570 static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
1571 {
1572 unsigned id = cf->id;
1573 unsigned end_of_program = bc->cf.prev == &cf->list;
1574
1575 switch (r600_bc_cf_class(cf)) {
1576 case CF_CLASS_ALU:
1577 assert(!end_of_program);
1578 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1579 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1580 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1581 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1582
1583 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
1584 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1585 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1586 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1587 S_SQ_CF_ALU_WORD1_BARRIER(cf->barrier) |
1588 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
1589 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1590 break;
1591 case CF_CLASS_TEXTURE:
1592 case CF_CLASS_VERTEX:
1593 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1594 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1595 S_SQ_CF_WORD1_BARRIER(cf->barrier) |
1596 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1) |
1597 S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);
1598 break;
1599 case CF_CLASS_EXPORT:
1600 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1601 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1602 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1603 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1604 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1605 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1606 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1607 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1608 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1609 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
1610 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) |
1611 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(end_of_program);
1612 break;
1613 case CF_CLASS_OTHER:
1614 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1615 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1616 S_SQ_CF_WORD1_BARRIER(cf->barrier) |
1617 S_SQ_CF_WORD1_COND(cf->cond) |
1618 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
1619 S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);
1620
1621 break;
1622 default:
1623 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1624 return -EINVAL;
1625 }
1626 return 0;
1627 }
1628
1629 struct gpr_usage_range {
1630 int replacement;
1631 int rel_block;
1632 int start;
1633 int end;
1634 };
1635
1636 struct gpr_usage {
1637 unsigned channels:4;
1638 int first_write;
1639 int last_write[4];
1640 unsigned nranges;
1641 struct gpr_usage_range *ranges;
1642 };
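/* These structures implement a simple liveness analysis: each
 * gpr_usage_range is a [start, end] interval of instruction ids during
 * which a GPR holds a live value, replacement remembers a preferred remap
 * target (seeded from plain MOVs in notice_alu_dst_gprs()), and rel_block
 * marks ranges accessed through relative addressing, which pins a whole
 * block of registers. first_write and last_write track per-channel writes
 * while a range is still open. */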
1643
1644 static struct gpr_usage_range* last_gpr_usage_range(struct gpr_usage *usage)
1645 {
1646 if (usage->nranges)
1647 return usage->ranges + usage->nranges - 1;
1648 else
1649 return NULL;
1650 }
1651
1652 static struct gpr_usage_range* add_gpr_usage_range(struct gpr_usage *usage)
1653 {
1654 struct gpr_usage_range *range;
1655
1656 usage->nranges++;
1657 usage->ranges = realloc(usage->ranges, usage->nranges * sizeof(struct gpr_usage_range));
1658 if (!usage->ranges)
1659 return NULL;
1660
1661 range = last_gpr_usage_range(usage);
1662 	range->replacement = -1; /* no preferred replacement */
1663 range->rel_block = -1;
1664 range->start = -1;
1665 range->end = -1;
1666
1667 return range;
1668 }
1669
1670 static void notice_gpr_read(struct gpr_usage *usage, int id, unsigned chan)
1671 {
1672 struct gpr_usage_range* range;
1673
1674 usage->channels |= 1 << chan;
1675 usage->first_write = -1;
1676 if (!usage->nranges) {
1677 range = add_gpr_usage_range(usage);
1678 } else
1679 range = last_gpr_usage_range(usage);
1680
1681 if (range && range->end < id)
1682 range->end = id;
1683 }
1684
1685 static void notice_gpr_rel_read(struct r600_bc *bc, struct gpr_usage usage[128],
1686 int id, unsigned gpr, unsigned chan)
1687 {
1688 unsigned i;
1689 for (i = gpr; i < bc->ngpr; ++i)
1690 notice_gpr_read(&usage[i], id, chan);
1691
1692 last_gpr_usage_range(&usage[gpr])->rel_block = bc->ngpr - gpr;
1693 }
1694
1695 static void notice_gpr_last_write(struct gpr_usage *usage, int id, unsigned chan)
1696 {
1697 usage->last_write[chan] = id;
1698 }
1699
1700 static void notice_gpr_write(struct gpr_usage *usage, int id, unsigned chan,
1701 int predicate, int prefered_replacement)
1702 {
1703 struct gpr_usage_range* last_range = last_gpr_usage_range(usage);
1704 int start = usage->first_write != -1 ? usage->first_write : id;
1705 usage->channels &= ~(1 << chan);
1706 if (usage->channels) {
1707 if (usage->first_write == -1)
1708 usage->first_write = id;
1709 } else if (!last_range || (last_range->start != start && !predicate)) {
1710 usage->first_write = start;
1711 struct gpr_usage_range* range = add_gpr_usage_range(usage);
1712 range->replacement = prefered_replacement;
1713 range->start = start;
1714 } else if (last_range->start == start && prefered_replacement != -1) {
1715 last_range->replacement = prefered_replacement;
1716 }
1717 notice_gpr_last_write(usage, id, chan);
1718 }
1719
1720 static void notice_gpr_rel_last_write(struct gpr_usage usage[128], int id, unsigned chan)
1721 {
1722 unsigned i;
1723 for (i = 0; i < 128; ++i)
1724 notice_gpr_last_write(&usage[i], id, chan);
1725 }
1726
1727 static void notice_gpr_rel_write(struct gpr_usage usage[128], int id, unsigned chan)
1728 {
1729 unsigned i;
1730 for (i = 0; i < 128; ++i)
1731 notice_gpr_write(&usage[i], id, chan, 1, -1);
1732 }
1733
1734 static void notice_alu_src_gprs(struct r600_bc *bc, struct r600_bc_alu *alu,
1735 struct gpr_usage usage[128], int id)
1736 {
1737 unsigned src, num_src;
1738
1739 num_src = r600_bc_get_num_operands(bc, alu);
1740 for (src = 0; src < num_src; ++src) {
1741 // constants don't matter
1742 if (!is_gpr(alu->src[src].sel))
1743 continue;
1744
1745 if (alu->src[src].rel)
1746 notice_gpr_rel_read(bc, usage, id, alu->src[src].sel, alu->src[src].chan);
1747 else
1748 notice_gpr_read(&usage[alu->src[src].sel], id, alu->src[src].chan);
1749 }
1750 }
1751
1752 static void notice_alu_dst_gprs(struct r600_bc_alu *alu_first, struct gpr_usage usage[128],
1753 int id, int predicate)
1754 {
1755 struct r600_bc_alu *alu;
1756 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
1757 if (alu->dst.write) {
1758 if (alu->dst.rel)
1759 notice_gpr_rel_write(usage, id, alu->dst.chan);
1760 else if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV && is_gpr(alu->src[0].sel))
1761 notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan,
1762 predicate, alu->src[0].sel);
1763 else
1764 notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan, predicate, -1);
1765 }
1766
1767 if (alu->last)
1768 break;
1769 }
1770 }
1771
1772 static void notice_tex_gprs(struct r600_bc *bc, struct r600_bc_tex *tex,
1773 struct gpr_usage usage[128],
1774 int id, int predicate)
1775 {
1776 if (tex->src_rel) {
1777 if (tex->src_sel_x < 4)
1778 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_x);
1779 if (tex->src_sel_y < 4)
1780 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_y);
1781 if (tex->src_sel_z < 4)
1782 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_z);
1783 if (tex->src_sel_w < 4)
1784 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_w);
1785 } else {
1786 if (tex->src_sel_x < 4)
1787 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_x);
1788 if (tex->src_sel_y < 4)
1789 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_y);
1790 if (tex->src_sel_z < 4)
1791 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_z);
1792 if (tex->src_sel_w < 4)
1793 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_w);
1794 }
1795 if (tex->dst_rel) {
1796 if (tex->dst_sel_x != 7)
1797 notice_gpr_rel_write(usage, id, 0);
1798 if (tex->dst_sel_y != 7)
1799 notice_gpr_rel_write(usage, id, 1);
1800 if (tex->dst_sel_z != 7)
1801 notice_gpr_rel_write(usage, id, 2);
1802 if (tex->dst_sel_w != 7)
1803 notice_gpr_rel_write(usage, id, 3);
1804 } else {
1805 if (tex->dst_sel_x != 7)
1806 notice_gpr_write(&usage[tex->dst_gpr], id, 0, predicate, -1);
1807 if (tex->dst_sel_y != 7)
1808 notice_gpr_write(&usage[tex->dst_gpr], id, 1, predicate, -1);
1809 if (tex->dst_sel_z != 7)
1810 notice_gpr_write(&usage[tex->dst_gpr], id, 2, predicate, -1);
1811 if (tex->dst_sel_w != 7)
1812 notice_gpr_write(&usage[tex->dst_gpr], id, 3, predicate, -1);
1813 }
1814 }
1815
1816 static void notice_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
1817 int id, int predicate)
1818 {
1819 notice_gpr_read(&usage[vtx->src_gpr], id, vtx->src_sel_x);
1820
1821 if (vtx->dst_sel_x != 7)
1822 notice_gpr_write(&usage[vtx->dst_gpr], id, 0, predicate, -1);
1823 if (vtx->dst_sel_y != 7)
1824 notice_gpr_write(&usage[vtx->dst_gpr], id, 1, predicate, -1);
1825 if (vtx->dst_sel_z != 7)
1826 notice_gpr_write(&usage[vtx->dst_gpr], id, 2, predicate, -1);
1827 if (vtx->dst_sel_w != 7)
1828 notice_gpr_write(&usage[vtx->dst_gpr], id, 3, predicate, -1);
1829 }
1830
1831 static void notice_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
1832 struct r600_bc_cf *export_cf[128], int export_remap[128])
1833 {
1834 //TODO handle other memory operations
1835 struct gpr_usage *output = &usage[cf->output.gpr];
1836 int id = MAX4(output->last_write[0], output->last_write[1],
1837 output->last_write[2], output->last_write[3]);
1838 id += 0x100;
1839 id &= ~0xFF;
1840
1841 export_cf[cf->output.gpr] = cf;
1842 export_remap[cf->output.gpr] = id;
1843 if (cf->output.swizzle_x < 4)
1844 notice_gpr_read(output, id, cf->output.swizzle_x);
1845 if (cf->output.swizzle_y < 4)
1846 notice_gpr_read(output, id, cf->output.swizzle_y);
1847 if (cf->output.swizzle_z < 4)
1848 notice_gpr_read(output, id, cf->output.swizzle_z);
1849 if (cf->output.swizzle_w < 4)
1850 notice_gpr_read(output, id, cf->output.swizzle_w);
1851 }
1852
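/* Look up the range a read at `id` belongs to; reads match (start, end].
 * The dst variant below matches [start, end) instead, with end == -1
 * meaning the range is still open.
 */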
1853 static struct gpr_usage_range *find_src_range(struct gpr_usage *usage, int id)
1854 {
1855 unsigned i;
1856 for (i = 0; i < usage->nranges; ++i) {
1857 struct gpr_usage_range* range = &usage->ranges[i];
1858
1859 if (range->start < id && id <= range->end)
1860 return range;
1861 }
1862 return NULL;
1863 }
1864
1865 static struct gpr_usage_range *find_dst_range(struct gpr_usage *usage, int id)
1866 {
1867 unsigned i;
1868 for (i = 0; i < usage->nranges; ++i) {
1869 struct gpr_usage_range* range = &usage->ranges[i];
1870 int end = range->end;
1871
1872 if (range->start <= id && (id < end || end == -1))
1873 return range;
1874 }
1875 return NULL;
1876 }
1877
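/* A barrier is needed iff the channel's last write didn't happen at the
 * reading clause's base id and is not already older than the previous
 * barrier.
 */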
1878 static int is_barrier_needed(struct gpr_usage *usage, int id, unsigned chan, int last_barrier)
1879 {
1880 if (usage->last_write[chan] != (id & ~0xFF))
1881 return usage->last_write[chan] >= last_barrier;
1882 else
1883 return 0;
1884 }
1885
1886 static int is_intersection(struct gpr_usage_range* a, struct gpr_usage_range* b)
1887 {
1888 return a->start <= b->end && b->start < a->end;
1889 }
1890
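/* Score how well `gpr` could host `range`: -1 if any of its existing
 * ranges overlap (or a relative block would run past gpr 127), otherwise
 * the sum of the gaps to the nearest neighbouring ranges, so a lower
 * rating means tighter packing.
 */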
1891 static int rate_replacement(struct gpr_usage usage[128], unsigned current, unsigned gpr,
1892 struct gpr_usage_range* range)
1893 {
1894 int max_gpr = gpr + MAX2(range->rel_block, 1);
1895 int best_start = 0x3FFFFFFF, best_end = 0x3FFFFFFF;
1896 unsigned i;
1897
1898 for (; gpr < max_gpr; ++gpr) {
1899
1900 if (gpr >= 128) /* relative gpr block won't fit into clause temporaries */
1901 return -1; /* forget it */
1902
1903 if (gpr == current) /* ignore ranges of the register being replaced */
1904 continue;
1905
1906 for (i = 0; i < usage[gpr].nranges; ++i) {
1907 if (usage[gpr].ranges[i].replacement < gpr)
1908 continue; /* ignore already remapped ranges */
1909
1910 if (is_intersection(&usage[gpr].ranges[i], range))
1911 return -1; /* forget it if usages overlap */
1912
1913 if (range->start >= usage[gpr].ranges[i].end)
1914 best_start = MIN2(best_start, range->start - usage[gpr].ranges[i].end);
1915
1916 if (range->end != -1 && range->end <= usage[gpr].ranges[i].start)
1917 best_end = MIN2(best_end, usage[gpr].ranges[i].start - range->end);
1918 }
1919 }
1920 return best_start + best_end;
1921 }
1922
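/* Choose a replacement GPR for a usage range: first try the preferred
 * replacement (typically a MOV source), then the clause temporaries
 * (124-127) for ranges confined to a single ALU clause, and finally any
 * lower-numbered GPR. The winner gets a reservation range, and for
 * relative addressing the choice is propagated to the whole block.
 */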
1923 static void find_replacement(struct gpr_usage usage[128], unsigned current,
1924 struct gpr_usage_range *range)
1925 {
1926 unsigned i, j;
1927 int best_gpr = -1, best_rate = 0x7FFFFFFF;
1928
1929 if (range->replacement == current)
1930 return; /* register prefers not to be remapped */
1931
1932 if (range->replacement != -1 && range->replacement <= current) {
1933 struct gpr_usage_range *other = find_src_range(&usage[range->replacement], range->start);
1934 if (other && other->replacement != -1)
1935 range->replacement = other->replacement;
1936 }
1937
1938 if (range->replacement != -1 && range->replacement < current) {
1939 int rate = rate_replacement(usage, current, range->replacement, range);
1940
1941 /* check if the preferred replacement can be used */
1942 if (rate != -1) {
1943 best_rate = rate;
1944 best_gpr = range->replacement;
1945 }
1946 }
1947
1948 if (best_gpr == -1 && (range->start & ~0xFF) == (range->end & ~0xFF)) {
1949 /* register is just used inside one ALU clause */
1950 /* try to use clause temporaries for it */
1951 for (i = 127; i > 123; --i) {
1952 int rate = rate_replacement(usage, current, i, range);
1953
1954 if (rate == -1) /* can't be used because ranges overlap */
1955 continue;
1956
1957 if (rate < best_rate) {
1958 best_rate = rate;
1959 best_gpr = i;
1960
1961 /* can't get better than this */
1962 if (rate == 0)
1963 break;
1964 }
1965 }
1966 }
1967
1968 if (best_gpr == -1) {
1969 for (i = 0; i < current; ++i) {
1970 int rate = rate_replacement(usage, current, i, range);
1971
1972 if (rate == -1) /* can't be used because ranges overlap */
1973 continue;
1974
1975 if (rate < best_rate) {
1976 best_rate = rate;
1977 best_gpr = i;
1978
1979 /* can't get better than this */
1980 if (rate == 0)
1981 break;
1982 }
1983 }
1984 }
1985
1986 if (best_gpr != -1) {
1987 struct gpr_usage_range *reservation = add_gpr_usage_range(&usage[best_gpr]);
1988 reservation->replacement = best_gpr;
1989 reservation->rel_block = -1;
1990 reservation->start = range->start;
1991 reservation->end = range->end;
1992 } else
1993 best_gpr = current;
1994
1995 range->replacement = best_gpr;
1996 if (range->rel_block == -1)
1997 return; /* no relative block to handle, we are done here */
1998
1999 /* set preferred register for the whole relative register block */
2000 for (i = current + 1, ++best_gpr; i < current + range->rel_block; ++i, ++best_gpr) {
2001 for (j = 0; j < usage[i].nranges; ++j) {
2002 if (is_intersection(&usage[i].ranges[j], range))
2003 usage[i].ranges[j].replacement = best_gpr;
2004 }
2005 }
2006 }
2007
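/* Second pass over an ALU group: rewrite sources and destination with
 * the chosen replacements, drop writes whose value is never read (op3
 * results are redirected to register 123 instead, since they can't simply
 * be discarded), and accumulate the clause's barrier requirements.
 */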
2008 static void replace_alu_gprs(struct r600_bc *bc, struct r600_bc_alu *alu, struct gpr_usage usage[128],
2009 int id, int last_barrier, unsigned *barrier)
2010 {
2011 struct gpr_usage *cur_usage;
2012 struct gpr_usage_range *range;
2013 unsigned src, num_src;
2014
2015 num_src = r600_bc_get_num_operands(bc, alu);
2016 for (src = 0; src < num_src; ++src) {
2017 // constants don't matter
2018 if (!is_gpr(alu->src[src].sel))
2019 continue;
2020
2021 cur_usage = &usage[alu->src[src].sel];
2022 range = find_src_range(cur_usage, id);
2023 alu->src[src].sel = range->replacement;
2024
2025 *barrier |= is_barrier_needed(cur_usage, id, alu->src[src].chan, last_barrier);
2026 }
2027
2028 if (alu->dst.write) {
2029 cur_usage = &usage[alu->dst.sel];
2030 range = find_dst_range(cur_usage, id);
2031 if (!range || range->replacement == -1) {
2032 if (!alu->is_op3)
2033 alu->dst.write = 0;
2034 else
2035 /* TODO: really check that register 123 is usable */
2036 alu->dst.sel = 123;
2037 } else {
2038 alu->dst.sel = range->replacement;
2039 *barrier |= is_barrier_needed(cur_usage, id, alu->dst.chan, last_barrier);
2040 }
2041 }
2042 if (alu->dst.write) {
2043 if (alu->dst.rel)
2044 notice_gpr_rel_last_write(usage, id, alu->dst.chan);
2045 else
2046 notice_gpr_last_write(cur_usage, id, alu->dst.chan);
2047 }
2048 }
2049
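/* Texture variant of the rewrite; relative source addressing always
 * forces a barrier, and a destination without a live range is dead, so
 * its result is dumped into scratch register 123.
 */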
2050 static void replace_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
2051 int id, int last_barrier, unsigned *barrier)
2052 {
2053 struct gpr_usage *cur_usage = &usage[tex->src_gpr];
2054 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2055
2056 if (tex->src_rel) {
2057 *barrier = 1;
2058 } else {
2059 if (tex->src_sel_x < 4)
2060 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_x, last_barrier);
2061 if (tex->src_sel_y < 4)
2062 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_y, last_barrier);
2063 if (tex->src_sel_z < 4)
2064 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_z, last_barrier);
2065 if (tex->src_sel_w < 4)
2066 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_w, last_barrier);
2067 }
2068 tex->src_gpr = range->replacement;
2069
2070 cur_usage = &usage[tex->dst_gpr];
2071
2072 range = find_dst_range(cur_usage, id);
2073 if (range) {
2074 tex->dst_gpr = range->replacement;
2075
2076 if (tex->dst_rel) {
2077 if (tex->dst_sel_x != 7)
2078 notice_gpr_rel_last_write(usage, id, tex->dst_sel_x);
2079 if (tex->dst_sel_y != 7)
2080 notice_gpr_rel_last_write(usage, id, tex->dst_sel_y);
2081 if (tex->dst_sel_z != 7)
2082 notice_gpr_rel_last_write(usage, id, tex->dst_sel_z);
2083 if (tex->dst_sel_w != 7)
2084 notice_gpr_rel_last_write(usage, id, tex->dst_sel_w);
2085 } else {
2086 if (tex->dst_sel_x != 7)
2087 notice_gpr_last_write(cur_usage, id, tex->dst_sel_x);
2088 if (tex->dst_sel_y != 7)
2089 notice_gpr_last_write(cur_usage, id, tex->dst_sel_y);
2090 if (tex->dst_sel_z != 7)
2091 notice_gpr_last_write(cur_usage, id, tex->dst_sel_z);
2092 if (tex->dst_sel_w != 7)
2093 notice_gpr_last_write(cur_usage, id, tex->dst_sel_w);
2094 }
2095 } else {
2096 tex->dst_gpr = 123;
2097 }
2098 }
2099
2100 static void replace_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
2101 int id, int last_barrier, unsigned *barrier)
2102 {
2103 struct gpr_usage *cur_usage = &usage[vtx->src_gpr];
2104 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2105
2106 *barrier |= is_barrier_needed(cur_usage, id, vtx->src_sel_x, last_barrier);
2107
2108 vtx->src_gpr = range->replacement;
2109
2110 cur_usage = &usage[vtx->dst_gpr];
2111 range = find_dst_range(cur_usage, id);
2112 if (range) {
2113 vtx->dst_gpr = range->replacement;
2114
2115 if (vtx->dst_sel_x != 7)
2116 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_x);
2117 if (vtx->dst_sel_y != 7)
2118 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_y);
2119 if (vtx->dst_sel_z != 7)
2120 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_z);
2121 if (vtx->dst_sel_w != 7)
2122 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_w);
2123 } else {
2124 vtx->dst_gpr = 123;
2125 }
2126 }
2127
2128 static void replace_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
2129 int id, int last_barrier)
2130 {
2131 //TODO handle other memory operations
2132 struct gpr_usage *cur_usage = &usage[cf->output.gpr];
2133 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2134
2135 cf->barrier = 0;
2136 if (cf->output.swizzle_x < 4)
2137 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_x, last_barrier);
2138 if (cf->output.swizzle_y < 4)
2139 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_y, last_barrier);
2140 if (cf->output.swizzle_z < 4)
2141 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_z, last_barrier);
2142 if (cf->output.swizzle_w < 4)
2143 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_w, last_barrier);
2144
2145 cf->output.gpr = range->replacement;
2146 }
2147
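/* A MOV whose destination equals its source is a no-op unless the next
 * instruction group still consumes the value through PV/PS, so scan that
 * group before removing it.
 */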
2148 static void optimize_alu_inst(struct r600_bc *bc, struct r600_bc_cf *cf, struct r600_bc_alu *alu)
2149 {
2150 struct r600_bc_alu *alu_next;
2151 unsigned chan;
2152 unsigned src, num_src;
2153
2154 /* check if a MOV could be optimized away */
2155 if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV) {
2156
2157 /* destination equals source? */
2158 if (alu->dst.sel != alu->src[0].sel ||
2159 alu->dst.chan != alu->src[0].chan)
2160 return;
2161
2162 /* any special handling for the source? */
2163 if (alu->src[0].rel || alu->src[0].neg || alu->src[0].abs)
2164 return;
2165
2166 /* any special handling for the destination? */
2167 if (alu->dst.rel || alu->dst.clamp)
2168 return;
2169
2170 /* ok, find the next instruction group and check whether PV/PS is used */
2171 for (alu_next = alu; !alu_next->last; alu_next = NEXT_ALU(alu_next));
2172
2173 if (alu_next->list.next != &cf->alu) {
2174 chan = is_alu_reduction_inst(bc, alu) ? 0 : alu->dst.chan;
2175 for (alu_next = NEXT_ALU(alu_next); alu_next; alu_next = NEXT_ALU(alu_next)) {
2176 num_src = r600_bc_get_num_operands(bc, alu_next);
2177 for (src = 0; src < num_src; ++src) {
2178 if (alu_next->src[src].sel == V_SQ_ALU_SRC_PV &&
2179 alu_next->src[src].chan == chan)
2180 return;
2181
2182 if (alu_next->src[src].sel == V_SQ_ALU_SRC_PS)
2183 return;
2184 }
2185
2186 if (alu_next->last)
2187 break;
2188 }
2189 }
2190
2191 r600_bc_remove_alu(cf, alu);
2192 }
2193 }
2194
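/* Merge an export with its predecessor into a single burst when the
 * instruction, type and swizzles match and the GPR and array_base ranges
 * line up, as long as the combined burst stays within the 16-element
 * limit.
 */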
2195 static void optimize_export_inst(struct r600_bc *bc, struct r600_bc_cf *cf)
2196 {
2197 struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, cf->list.prev, list);
2198 if (&prev->list == &bc->cf ||
2199 prev->inst != cf->inst ||
2200 prev->output.type != cf->output.type ||
2201 prev->output.elem_size != cf->output.elem_size ||
2202 prev->output.swizzle_x != cf->output.swizzle_x ||
2203 prev->output.swizzle_y != cf->output.swizzle_y ||
2204 prev->output.swizzle_z != cf->output.swizzle_z ||
2205 prev->output.swizzle_w != cf->output.swizzle_w)
2206 return;
2207
2208 if ((prev->output.burst_count + cf->output.burst_count) > 16)
2209 return;
2210
2211 if ((prev->output.gpr + prev->output.burst_count) == cf->output.gpr &&
2212 (prev->output.array_base + prev->output.burst_count) == cf->output.array_base) {
2213
2214 prev->output.burst_count += cf->output.burst_count;
2215 r600_bc_remove_cf(bc, cf);
2216
2217 } else if (prev->output.gpr == (cf->output.gpr + cf->output.burst_count) &&
2218 prev->output.array_base == (cf->output.array_base + cf->output.burst_count)) {
2219
2220 cf->output.burst_count += prev->output.burst_count;
2221 r600_bc_remove_cf(bc, prev);
2222 }
2223 }
2224
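/* GPR optimization in three phases: gather usage ranges for every
 * register, pick replacements so the ranges pack into fewer GPRs, then
 * apply the remapping while recomputing barriers, folding redundant MOVs
 * and moving exports out of conditional blocks. Currently disabled in
 * r600_bc_build().
 */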
2225 static void r600_bc_optimize(struct r600_bc *bc)
2226 {
2227 struct r600_bc_cf *cf, *next_cf;
2228 struct r600_bc_alu *first, *next_alu;
2229 struct r600_bc_alu *alu;
2230 struct r600_bc_vtx *vtx;
2231 struct r600_bc_tex *tex;
2232 struct gpr_usage usage[128];
2233
2234 /* assume that each gpr is exported only once */
2235 struct r600_bc_cf *export_cf[128] = { NULL };
2236 int export_remap[128];
2237
2238 int id, cond_start, barrier[bc->nstack];
2239 unsigned i, j, stack, predicate, old_stack;
2240
2241 memset(&usage, 0, sizeof(usage));
2242 for (i = 0; i < 128; ++i) {
2243 usage[i].first_write = -1;
2244 usage[i].last_write[0] = -1;
2245 usage[i].last_write[1] = -1;
2246 usage[i].last_write[2] = -1;
2247 usage[i].last_write[3] = -1;
2248 }
2249
2250 /* first gather some information about the gpr usage */
2251 id = 0; stack = 0;
2252 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2253 old_stack = stack;
2254 if (stack == 0)
2255 cond_start = id; /* remember where a conditional block would start */
2256
2257 switch (r600_bc_cf_class(cf)) {
2258 case CF_CLASS_ALU:
2259 predicate = 0;
2260 first = NULL;
2261 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2262 if (!first)
2263 first = alu;
2264 notice_alu_src_gprs(bc, alu, usage, id);
2265 if (alu->last) {
2266 notice_alu_dst_gprs(first, usage, id, predicate || stack > 0);
2267 first = NULL;
2268 ++id;
2269 }
2270 if (is_alu_pred_inst(bc, alu))
2271 predicate++;
2272 }
2273 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
2274 stack += predicate;
2275 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
2276 stack -= 1;
2277 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
2278 stack -= 2;
2279 break;
2280 case CF_CLASS_TEXTURE:
2281 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2282 notice_tex_gprs(bc, tex, usage, id++, stack > 0);
2283 }
2284 break;
2285 case CF_CLASS_VERTEX:
2286 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2287 notice_vtx_gprs(vtx, usage, id++, stack > 0);
2288 }
2289 break;
2290 case CF_CLASS_EXPORT:
2291 notice_export_gprs(cf, usage, export_cf, export_remap);
2292 continue; // don't increment id
2293 case CF_CLASS_OTHER:
2294 switch (cf->inst) {
2295 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2296 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2297 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2298 break;
2299
2300 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2301 stack -= cf->pop_count;
2302 break;
2303
2304 default:
2305 // TODO implement loop handling
2306 goto out;
2307 }
2308 }
2309
2310 /* extend last_write after conditional block */
2311 if (stack == 0 && old_stack != 0)
2312 for (i = 0; i < 128; ++i)
2313 for (j = 0; j < 4; ++j)
2314 if (usage[i].last_write[j] >= cond_start)
2315 usage[i].last_write[j] = id;
2316
2317 id += 0x100;
2318 id &= ~0xFF;
2319 }
2320 assert(stack == 0);
2321
2322 /* try to optimize gpr usage */
2323 for (i = 0; i < 124; ++i) {
2324 for (j = 0; j < usage[i].nranges; ++j) {
2325 struct gpr_usage_range *range = &usage[i].ranges[j];
2326 if (range->start == -1)
2327 /* can't rearrange shader inputs */
2328 range->replacement = i;
2329 else if (range->end == -1)
2330 /* gpr isn't used any more after this instruction */
2331 range->replacement = -1;
2332 else
2333 find_replacement(usage, i, range);
2334
2335 if (range->replacement == i)
2336 bc->ngpr = i;
2337 else if (range->replacement < i && range->replacement > bc->ngpr)
2338 bc->ngpr = range->replacement;
2339 }
2340 }
2341 bc->ngpr++;
2342
2343 /* apply the changes */
2344 for (i = 0; i < 128; ++i) {
2345 usage[i].last_write[0] = -1;
2346 usage[i].last_write[1] = -1;
2347 usage[i].last_write[2] = -1;
2348 usage[i].last_write[3] = -1;
2349 }
2350 barrier[0] = 0;
2351 id = 0; stack = 0;
2352 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2353 old_stack = stack;
2354 switch (r600_bc_cf_class(cf)) {
2355 case CF_CLASS_ALU:
2356 predicate = 0;
2357 first = NULL;
2358 cf->barrier = 0;
2359 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2360 replace_alu_gprs(bc, alu, usage, id, barrier[stack], &cf->barrier);
2361 if (alu->last)
2362 ++id;
2363
2364 if (is_alu_pred_inst(bc, alu))
2365 predicate++;
2366
2367 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)
2368 optimize_alu_inst(bc, cf, alu);
2369 }
2370 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
2371 stack += predicate;
2372 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
2373 stack -= 1;
2374 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
2375 stack -= 2;
2376 if (LIST_IS_EMPTY(&cf->alu)) {
2377 r600_bc_remove_cf(bc, cf);
2378 cf = NULL;
2379 }
2380 break;
2381 case CF_CLASS_TEXTURE:
2382 cf->barrier = 0;
2383 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2384 replace_tex_gprs(tex, usage, id++, barrier[stack], &cf->barrier);
2385 }
2386 break;
2387 case CF_CLASS_VERTEX:
2388 cf->barrier = 0;
2389 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2390 replace_vtx_gprs(vtx, usage, id++, barrier[stack], &cf->barrier);
2391 }
2392 break;
2393 case CF_CLASS_EXPORT:
2394 continue; // don't increment id
2395 case CF_CLASS_OTHER:
2396 if (cf->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
2397 cf->barrier = 0;
2398 stack -= cf->pop_count;
2399 }
2400 break;
2401 }
2402
2403 id &= ~0xFF;
2404 if (cf && cf->barrier)
2405 barrier[old_stack] = id;
2406
2407 for (i = old_stack + 1; i <= stack; ++i)
2408 barrier[i] = barrier[old_stack];
2409
2410 id += 0x100;
2411 if (stack != 0) /* ensure exports are placed outside of conditional blocks */
2412 continue;
2413
2414 for (i = 0; i < 128; ++i) {
2415 if (!export_cf[i] || id < export_remap[i])
2416 continue;
2417
2418 r600_bc_move_cf(bc, export_cf[i], next_cf);
2419 replace_export_gprs(export_cf[i], usage, export_remap[i], barrier[stack]);
2420 if (export_cf[i]->barrier)
2421 barrier[stack] = id - 1;
2422 next_cf = LIST_ENTRY(struct r600_bc_cf, export_cf[i]->list.next, list);
2423 optimize_export_inst(bc, export_cf[i]);
2424 export_cf[i] = NULL;
2425 }
2426 }
2427 assert(stack == 0);
2428
2429 out:
2430 for (i = 0; i < 128; ++i) {
2431 free(usage[i].ranges);
2432 }
2433 }
2434
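/* Final bytecode emission: first lay out the address of every CF block
 * (fetch clauses must stay 16-byte aligned), mark the last export of
 * each type as EXPORT_DONE, then encode the CF, ALU, vertex and texture
 * words for the target chip revision, appending ALU literals after each
 * instruction group.
 */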
2435 int r600_bc_build(struct r600_bc *bc)
2436 {
2437 struct r600_bc_cf *cf;
2438 struct r600_bc_alu *alu;
2439 struct r600_bc_vtx *vtx;
2440 struct r600_bc_tex *tex;
2441 struct r600_bc_cf *exports[4] = { NULL };
2442 uint32_t literal[4];
2443 unsigned nliteral;
2444 unsigned addr;
2445 int i, r;
2446
2447 if (bc->callstack[0].max > 0)
2448 bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
2449 if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
2450 bc->nstack = 1;
2451 }
2452
2453 //r600_bc_optimize(bc);
2454
2455 /* first pass: compute the addr of each CF block */
2456 /* addrs start after all the CF instructions */
2457 addr = LIST_ENTRY(struct r600_bc_cf, bc->cf.prev, list)->id + 2;
2458 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2459 switch (r600_bc_cf_class(cf)) {
2460 case CF_CLASS_ALU:
2461 break;
2462 case CF_CLASS_TEXTURE:
2463 case CF_CLASS_VERTEX:
2464 /* fetch nodes need to be 16-byte aligned */
2465 addr += 3;
2466 addr &= 0xFFFFFFFCUL;
2467 break;
2468 case CF_CLASS_EXPORT:
2469 if (cf->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT))
2470 exports[cf->output.type] = cf;
2471 break;
2472 case CF_CLASS_OTHER:
2473 break;
2474 default:
2475 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2476 return -EINVAL;
2477 }
2478 cf->addr = addr;
2479 addr += cf->ndw;
2480 bc->ndw = cf->addr + cf->ndw;
2481 }
2482
2483 /* set export done on last export of each type */
2484 for (i = 0; i < 4; ++i) {
2485 if (exports[i]) {
2486 exports[i]->inst = BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
2487 }
2488 }
2489
2490 free(bc->bytecode);
2491 bc->bytecode = calloc(1, bc->ndw * 4);
2492 if (bc->bytecode == NULL)
2493 return -ENOMEM;
2494 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2495 addr = cf->addr;
2496 if (bc->chiprev == CHIPREV_EVERGREEN)
2497 r = eg_bc_cf_build(bc, cf);
2498 else
2499 r = r600_bc_cf_build(bc, cf);
2500 if (r)
2501 return r;
2502 switch (r600_bc_cf_class(cf)) {
2503 case CF_CLASS_ALU:
2504 nliteral = 0;
2505 memset(literal, 0, sizeof(literal));
2506 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2507 r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
2508 if (r)
2509 return r;
2510 r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
2511 switch(bc->chiprev) {
2512 case CHIPREV_R600:
2513 r = r600_bc_alu_build(bc, alu, addr);
2514 break;
2515 case CHIPREV_R700:
2516 case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
2517 r = r700_bc_alu_build(bc, alu, addr);
2518 break;
2519 default:
2520 R600_ERR("unknown family %d\n", bc->family);
2521 return -EINVAL;
2522 }
2523 if (r)
2524 return r;
2525 addr += 2;
2526 if (alu->last) {
2527 for (i = 0; i < align(nliteral, 2); ++i) {
2528 bc->bytecode[addr++] = literal[i];
2529 }
2530 nliteral = 0;
2531 memset(literal, 0, sizeof(literal));
2532 }
2533 }
2534 break;
2535 case CF_CLASS_VERTEX:
2536 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2537 r = r600_bc_vtx_build(bc, vtx, addr);
2538 if (r)
2539 return r;
2540 addr += 4;
2541 }
2542 break;
2543 case CF_CLASS_TEXTURE:
2544 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2545 r = r600_bc_tex_build(bc, tex, addr);
2546 if (r)
2547 return r;
2548 addr += 4;
2549 }
2550 break;
2551 case CF_CLASS_EXPORT:
2552 case CF_CLASS_OTHER:
2553 break;
2554 default:
2555 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2556 return -EINVAL;
2557 }
2558 }
2559 return 0;
2560 }
2561
2562 void r600_bc_clear(struct r600_bc *bc)
2563 {
2564 struct r600_bc_cf *cf = NULL, *next_cf;
2565
2566 free(bc->bytecode);
2567 bc->bytecode = NULL;
2568
2569 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2570 struct r600_bc_alu *alu = NULL, *next_alu;
2571 struct r600_bc_tex *tex = NULL, *next_tex;
2572 struct r600_bc_vtx *vtx = NULL, *next_vtx;
2573
2574 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2575 free(alu);
2576 }
2577
2578 LIST_INITHEAD(&cf->alu);
2579
2580 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
2581 free(tex);
2582 }
2583
2584 LIST_INITHEAD(&cf->tex);
2585
2586 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
2587 free(vtx);
2588 }
2589
2590 LIST_INITHEAD(&cf->vtx);
2591
2592 free(cf);
2593 }
2594
2595 LIST_INITHEAD(&bc->cf); /* all CF entries were freed, reset the list head */
2596 }
2597
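/* Debug dump of the final bytecode to stderr; the two leading columns
 * are the dword index and the raw encoding, followed by a decoded view
 * of the CF, ALU, TEX and VTX words.
 */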
2598 void r600_bc_dump(struct r600_bc *bc)
2599 {
2600 struct r600_bc_cf *cf = NULL;
2601 struct r600_bc_alu *alu = NULL;
2602 struct r600_bc_vtx *vtx = NULL;
2603 struct r600_bc_tex *tex = NULL;
2604
2605 unsigned i, id;
2606 uint32_t literal[4];
2607 unsigned nliteral;
2608 char chip = '6';
2609
2610 switch (bc->chiprev) {
2611 case CHIPREV_R700:
2612 chip = '7';
2613 break;
2614 case CHIPREV_EVERGREEN:
2615 chip = 'E';
2616 break;
2617 case CHIPREV_R600:
2618 default:
2619 chip = '6';
2620 break;
2621 }
2622 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
2623 fprintf(stderr, " %c\n", chip);
2624
2625 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2626 id = cf->id;
2627
2628 switch (r600_bc_cf_class(cf)) {
2629 case CF_CLASS_ALU:
2630 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2631 fprintf(stderr, "ADDR:%04d ", cf->addr);
2632 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2633 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2634 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2635 id++;
2636 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2637 fprintf(stderr, "INST:%d ", cf->inst);
2638 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2639 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2640 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2641 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2642 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2643 break;
2644 case CF_CLASS_TEXTURE:
2645 case CF_CLASS_VERTEX:
2646 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2647 fprintf(stderr, "ADDR:%04d\n", cf->addr);
2648 id++;
2649 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2650 fprintf(stderr, "INST:%d ", cf->inst);
2651 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2652 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2653 break;
2654 case CF_CLASS_EXPORT:
2655 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2656 fprintf(stderr, "GPR:%d ", cf->output.gpr);
2657 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2658 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2659 fprintf(stderr, "TYPE:%X\n", cf->output.type);
2660 id++;
2661 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2662 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2663 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2664 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2665 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2666 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2667 fprintf(stderr, "INST:%d ", cf->inst);
2668 fprintf(stderr, "BURST_COUNT:%d\n", cf->output.burst_count);
2669 break;
2670 case CF_CLASS_OTHER:
2671 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2672 fprintf(stderr, "ADDR:%04d\n", cf->cf_addr);
2673 id++;
2674 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2675 fprintf(stderr, "INST:%d ", cf->inst);
2676 fprintf(stderr, "COND:%X ", cf->cond);
2677 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2678 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2679 break;
2680 }
2681
2682 id = cf->addr;
2683 nliteral = 0;
2684 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2685 r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
2686
2687 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2688 fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
2689 fprintf(stderr, "REL:%d ", alu->src[0].rel);
2690 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
2691 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
2692 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
2693 fprintf(stderr, "REL:%d ", alu->src[1].rel);
2694 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
2695 fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
2696 fprintf(stderr, "LAST:%d)\n", alu->last);
2697 id++;
2698 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
2699 fprintf(stderr, "INST:%d ", alu->inst);
2700 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
2701 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
2702 fprintf(stderr, "REL:%d ", alu->dst.rel);
2703 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
2704 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
2705 if (alu->is_op3) {
2706 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
2707 fprintf(stderr, "REL:%d ", alu->src[2].rel);
2708 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
2709 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
2710 } else {
2711 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
2712 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
2713 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
2714 fprintf(stderr, "OMOD:%d ", alu->omod);
2715 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
2716 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
2717 }
2718
2719 id++;
2720 if (alu->last) {
2721 for (i = 0; i < nliteral; i++, id++) {
2722 float *f = (float*)(bc->bytecode + id);
2723 fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
2724 }
2725 id += nliteral & 1;
2726 nliteral = 0;
2727 }
2728 }
2729
2730 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2731 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2732 fprintf(stderr, "INST:%d ", tex->inst);
2733 fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
2734 fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
2735 fprintf(stderr, "REL:%d)\n", tex->src_rel);
2736 id++;
2737 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2738 fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
2739 fprintf(stderr, "REL:%d ", tex->dst_rel);
2740 fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
2741 fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
2742 fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
2743 fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
2744 fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
2745 fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
2746 fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
2747 fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
2748 fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
2749 id++;
2750 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2751 fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
2752 fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
2753 fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
2754 fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
2755 fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
2756 fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
2757 fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
2758 fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
2759 id++;
2760 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2761 id++;
2762 }
2763
2764 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2765 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2766 fprintf(stderr, "INST:%d ", vtx->inst);
2767 fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2768 fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2769 id++;
2770 /* This assumes that no semantic fetches exist */
2771 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2772 fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2773 fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2774 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2775 fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2776 fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2777 fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2778 fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2779 fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2780 fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2781 fprintf(stderr, "DATA_FORMAT:%d ", vtx->data_format);
2782 fprintf(stderr, "NUM_FORMAT_ALL:%d ", vtx->num_format_all);
2783 fprintf(stderr, "FORMAT_COMP_ALL:%d ", vtx->format_comp_all);
2784 fprintf(stderr, "SRF_MODE_ALL:%d\n", vtx->srf_mode_all);
2785 id++;
2786 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2787 //TODO
2788 id++;
2789 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2790 id++;
2791 }
2792 }
2793
2794 fprintf(stderr, "--------------------------------------\n");
2795 }
2796
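/* Emit the CF portion of the fetch shader: a vertex fetch clause can
 * hold at most 8 fetches, so more inputs need a second clause.
 * CF_WORD0_ADDR counts in 2-dword units, hence the ">> 1"; the fetch
 * instructions themselves start at dword 8, right after the padded CF
 * words.
 */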
2797 static void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
2798 {
2799 struct r600_pipe_state *rstate;
2800 unsigned i = 0;
2801
2802 if (count > 8) {
2803 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2804 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2805 S_SQ_CF_WORD1_BARRIER(1) |
2806 S_SQ_CF_WORD1_COUNT(8 - 1);
2807 bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
2808 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2809 S_SQ_CF_WORD1_BARRIER(1) |
2810 S_SQ_CF_WORD1_COUNT(count - 8 - 1);
2811 } else {
2812 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2813 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2814 S_SQ_CF_WORD1_BARRIER(1) |
2815 S_SQ_CF_WORD1_COUNT(count - 1);
2816 }
2817 bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
2818 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
2819 S_SQ_CF_WORD1_BARRIER(1);
2820
2821 rstate = &ve->rstate;
2822 rstate->id = R600_PIPE_STATE_FETCH_SHADER;
2823 rstate->nregs = 0;
2824 r600_pipe_state_add_reg(rstate, R_0288A4_SQ_PGM_RESOURCES_FS,
2825 0x00000000, 0xFFFFFFFF, NULL);
2826 r600_pipe_state_add_reg(rstate, R_0288DC_SQ_PGM_CF_OFFSET_FS,
2827 0x00000000, 0xFFFFFFFF, NULL);
2828 r600_pipe_state_add_reg(rstate, R_028894_SQ_PGM_START_FS,
2829 r600_bo_offset(ve->fetch_shader) >> 8,
2830 0xFFFFFFFF, ve->fetch_shader);
2831 }
2832
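/* Translate a gallium vertex format into the hardware FMT_* code plus
 * NUM_FORMAT_ALL (0 = normalized, 2 = scaled) and FORMAT_COMP_ALL
 * (1 = signed). PIPE_FORMAT_R8G8B8A8_UNORM, for example, becomes
 * FMT_8_8_8_8 with num_format 0 and format_comp 0.
 */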
2833 static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
2834 unsigned *num_format, unsigned *format_comp)
2835 {
2836 const struct util_format_description *desc;
2837 unsigned i;
2838
2839 *format = 0;
2840 *num_format = 0;
2841 *format_comp = 0;
2842
2843 desc = util_format_description(pformat);
2844 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2845 goto out_unknown;
2846 }
2847
2848 /* Find the first non-VOID channel. */
2849 for (i = 0; i < 4; i++) {
2850 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2851 break;
2852 }
2853 }
2854
2855 switch (desc->channel[i].type) {
2856 /* Half-floats, floats, ints */
2857 case UTIL_FORMAT_TYPE_FLOAT:
2858 switch (desc->channel[i].size) {
2859 case 16:
2860 switch (desc->nr_channels) {
2861 case 1:
2862 *format = FMT_16_FLOAT;
2863 break;
2864 case 2:
2865 *format = FMT_16_16_FLOAT;
2866 break;
2867 case 3:
2868 case 4:
2869 *format = FMT_16_16_16_16_FLOAT;
2870 break;
2871 }
2872 break;
2873 case 32:
2874 switch (desc->nr_channels) {
2875 case 1:
2876 *format = FMT_32_FLOAT;
2877 break;
2878 case 2:
2879 *format = FMT_32_32_FLOAT;
2880 break;
2881 case 3:
2882 *format = FMT_32_32_32_FLOAT;
2883 break;
2884 case 4:
2885 *format = FMT_32_32_32_32_FLOAT;
2886 break;
2887 }
2888 break;
2889 default:
2890 goto out_unknown;
2891 }
2892 break;
2893 /* Unsigned ints */
2894 case UTIL_FORMAT_TYPE_UNSIGNED:
2895 /* Signed ints */
2896 case UTIL_FORMAT_TYPE_SIGNED:
2897 switch (desc->channel[i].size) {
2898 case 8:
2899 switch (desc->nr_channels) {
2900 case 1:
2901 *format = FMT_8;
2902 break;
2903 case 2:
2904 *format = FMT_8_8;
2905 break;
2906 case 3:
2907 case 4:
2908 *format = FMT_8_8_8_8;
2909 break;
2910 }
2911 break;
2912 case 16:
2913 switch (desc->nr_channels) {
2914 case 1:
2915 *format = FMT_16;
2916 break;
2917 case 2:
2918 *format = FMT_16_16;
2919 break;
2920 case 3:
2921 case 4:
2922 *format = FMT_16_16_16_16;
2923 break;
2924 }
2925 break;
2926 case 32:
2927 switch (desc->nr_channels) {
2928 case 1:
2929 *format = FMT_32;
2930 break;
2931 case 2:
2932 *format = FMT_32_32;
2933 break;
2934 case 3:
2935 *format = FMT_32_32_32;
2936 break;
2937 case 4:
2938 *format = FMT_32_32_32_32;
2939 break;
2940 }
2941 break;
2942 default:
2943 goto out_unknown;
2944 }
2945 break;
2946 default:
2947 goto out_unknown;
2948 }
2949
2950 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2951 *format_comp = 1;
2952 }
2953 if (desc->channel[i].normalized) {
2954 *num_format = 0;
2955 } else {
2956 *num_format = 2;
2957 }
2958 return;
2959 out_unknown:
2960 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2961 }
2962
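/* Build a small fetch shader for the given vertex element layout: a CF
 * header (r600 or evergreen flavour) followed by one vertex fetch
 * instruction per element, writing element i into GPR i + 1.
 */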
2963 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2964 {
2965 unsigned ndw, i;
2966 u32 *bytecode;
2967 unsigned fetch_resource_start = 0, format, num_format, format_comp;
2968 struct pipe_vertex_element *elements = ve->elements;
2969 const struct util_format_description *desc;
2970
2971 /* 8 dwords for up to 3 CF instructions (2 dwords each, padded so the fetches start 16-byte aligned) + 4 dwords per input */
2972 ndw = 8 + ve->count * 4;
2973 ve->fs_size = ndw * 4;
2974
2975 /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
2976 ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0);
2977 if (ve->fetch_shader == NULL) {
2978 return -ENOMEM;
2979 }
2980
2981 bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
2982 if (bytecode == NULL) {
2983 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
2984 return -ENOMEM;
2985 }
2986
2987 if (rctx->family >= CHIP_CEDAR) {
2988 eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
2989 } else {
2990 r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
2991 fetch_resource_start = 160;
2992 }
2993
2994 /* vertex element offsets need special handling: if an offset is bigger
2995 * than what we can encode in the fetch instruction, we have to adjust
2996 * the vertex resource offset instead. In that case, to keep the code
2997 * simple, we bind one resource per element. It's a worst case scenario.
2998 */
2999 for (i = 0; i < ve->count; i++) {
3000 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
3001 if (ve->vbuffer_offset[i]) {
3002 ve->vbuffer_need_offset = 1;
3003 }
3004 }
3005
3006 for (i = 0; i < ve->count; i++) {
3007 unsigned vbuffer_index;
3008 r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp);
3009 desc = util_format_description(ve->elements[i].src_format);
3010 if (desc == NULL) {
3011 R600_ERR("unknown format %d\n", ve->elements[i].src_format);
3012 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
3013 return -EINVAL;
3014 }
3015
3016 /* see above for vbuffer_need_offset explanation */
3017 vbuffer_index = elements[i].vertex_buffer_index;
3018 if (ve->vbuffer_need_offset) {
3019 bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start);
3020 } else {
3021 bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start);
3022 }
3023 bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) |
3024 S_SQ_VTX_WORD0_SRC_SEL_X(0) |
3025 S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
3026 bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
3027 S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
3028 S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
3029 S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) |
3030 S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) |
3031 S_SQ_VTX_WORD1_DATA_FORMAT(format) |
3032 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) |
3033 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) |
3034 S_SQ_VTX_WORD1_SRF_MODE_ALL(1) |
3035 S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1);
3036 bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) |
3037 S_SQ_VTX_WORD2_MEGA_FETCH(1);
3038 bytecode[8 + i * 4 + 3] = 0;
3039 }
3040 r600_bo_unmap(rctx->radeon, ve->fetch_shader);
3041 return 0;
3042 }