r600g: bugfixing register remapping
src/gallium/drivers/r600/r600_asm.c
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <stdio.h>
#include <errno.h>
#include "util/u_format.h"
#include "util/u_memory.h"
#include "pipe/p_shader_tokens.h"
#include "r600_pipe.h"
#include "r600_sq.h"
#include "r600_opcodes.h"
#include "r600_asm.h"
#include "r600_formats.h"
#include "r600d.h"

#define NUM_OF_CYCLES 3
#define NUM_OF_COMPONENTS 4

#define PREV_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.prev, list)
#define NEXT_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)

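/* Returns the number of source operands an ALU instruction reads. OP3
 * instructions always take three sources; for OP2 the count is looked up
 * per chip revision below. Unknown opcodes log an error and fall through
 * to the conservative default of 3. */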
static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	if (alu->is_op3)
		return 3;

	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		switch (alu->inst) {
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
			return 0;
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
			return 2;

		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
			return 1;
		default: R600_ERR(
			"Need instruction operand number for 0x%x.\n", alu->inst);
		}
		break;
	case CHIPREV_EVERGREEN:
		switch (alu->inst) {
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
			return 0;
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
			return 2;

		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
			return 1;
		default: R600_ERR(
			"Need instruction operand number for 0x%x.\n", alu->inst);
		}
		break;
	}

	return 3;
}

int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);

static struct r600_bc_cf *r600_bc_cf(void)
{
	struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);

	if (cf == NULL)
		return NULL;
	LIST_INITHEAD(&cf->list);
	LIST_INITHEAD(&cf->alu);
	LIST_INITHEAD(&cf->vtx);
	LIST_INITHEAD(&cf->tex);
	cf->barrier = 1;
	return cf;
}

static struct r600_bc_alu *r600_bc_alu(void)
{
	struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);

	if (alu == NULL)
		return NULL;
	LIST_INITHEAD(&alu->list);
	return alu;
}

static struct r600_bc_vtx *r600_bc_vtx(void)
{
	struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);

	if (vtx == NULL)
		return NULL;
	LIST_INITHEAD(&vtx->list);
	return vtx;
}

static struct r600_bc_tex *r600_bc_tex(void)
{
	struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);

	if (tex == NULL)
		return NULL;
	LIST_INITHEAD(&tex->list);
	return tex;
}

int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
{
	LIST_INITHEAD(&bc->cf);
	bc->family = family;
	switch (bc->family) {
	case CHIP_R600:
	case CHIP_RV610:
	case CHIP_RV630:
	case CHIP_RV670:
	case CHIP_RV620:
	case CHIP_RV635:
	case CHIP_RS780:
	case CHIP_RS880:
		bc->chiprev = CHIPREV_R600;
		break;
	case CHIP_RV770:
	case CHIP_RV730:
	case CHIP_RV710:
	case CHIP_RV740:
		bc->chiprev = CHIPREV_R700;
		break;
	case CHIP_CEDAR:
	case CHIP_REDWOOD:
	case CHIP_JUNIPER:
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
	case CHIP_PALM:
	case CHIP_BARTS:
	case CHIP_TURKS:
	case CHIP_CAICOS:
		bc->chiprev = CHIPREV_EVERGREEN;
		break;
	default:
		R600_ERR("unknown family %d\n", bc->family);
		return -EINVAL;
	}
	return 0;
}

static int r600_bc_add_cf(struct r600_bc *bc)
{
	struct r600_bc_cf *cf = r600_bc_cf();

	if (cf == NULL)
		return -ENOMEM;
	LIST_ADDTAIL(&cf->list, &bc->cf);
	if (bc->cf_last)
		cf->id = bc->cf_last->id + 2;
	bc->cf_last = cf;
	bc->ncf++;
	bc->ndw += 2;
	bc->force_add_cf = 0;
	return 0;
}

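/* CF ids step by 2 because every CF instruction occupies two dwords in
 * the final bytecode, so removing or moving an entry has to shift the
 * ids and any recorded jump targets (cf_addr) of the entries behind it. */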
static void r600_bc_remove_cf(struct r600_bc *bc, struct r600_bc_cf *cf)
{
	struct r600_bc_cf *other;
	LIST_FOR_EACH_ENTRY(other, &bc->cf, list) {
		if (other->id > cf->id)
			other->id -= 2;
		if (other->cf_addr > cf->id)
			other->cf_addr -= 2;
	}
	LIST_DEL(&cf->list);
	free(cf);
}

static void r600_bc_move_cf(struct r600_bc *bc, struct r600_bc_cf *cf, struct r600_bc_cf *next)
{
	struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, next->list.prev, list);
	unsigned old_id = cf->id;
	unsigned new_id = next->list.prev == &bc->cf ? 0 : prev->id + 2;
	struct r600_bc_cf *other;

	if (prev == cf || next == cf)
		return; /* position hasn't changed */

	LIST_DEL(&cf->list);
	LIST_FOR_EACH_ENTRY(other, &bc->cf, list) {
		if (other->id > old_id)
			other->id -= 2;
		if (other->id >= new_id)
			other->id += 2;
		if (other->cf_addr > old_id)
			other->cf_addr -= 2;
		if (other->cf_addr > new_id)
			other->cf_addr += 2;
	}
	cf->id = new_id;
	LIST_ADD(&cf->list, &prev->list);
}

int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
{
	int r;

	r = r600_bc_add_cf(bc);
	if (r)
		return r;
	bc->cf_last->inst = BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
	bc->cf_last->output.burst_count = 1;
	return 0;
}

/* alu predicate instructions */
static int is_alu_pred_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		return !alu->is_op3 && (
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
	case CHIPREV_EVERGREEN:
	default:
		return !alu->is_op3 && (
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
	}
}

/* alu kill instructions */
static int is_alu_kill_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		return !alu->is_op3 && (
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
	case CHIPREV_EVERGREEN:
	default:
		return !alu->is_op3 && (
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
	}
}

/* alu instructions that can only exist once per group */
static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	return is_alu_kill_inst(bc, alu) ||
		is_alu_pred_inst(bc, alu);
}

static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		return !alu->is_op3 && (
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
	case CHIPREV_EVERGREEN:
	default:
		return !alu->is_op3 && (
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
	}
}

static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		return !alu->is_op3 && (
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
	case CHIPREV_EVERGREEN:
	default:
		return !alu->is_op3 && (
			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
	}
}

/* alu instructions that can only execute on the vector unit */
static int is_alu_vec_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	return is_alu_reduction_inst(bc, alu) ||
		is_alu_mova_inst(bc, alu);
}

/* alu instructions that can only execute on the trans unit */
static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	switch (bc->chiprev) {
	case CHIPREV_R600:
	case CHIPREV_R700:
		if (!alu->is_op3)
			return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
		else
			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
				alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
				alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
				alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
	case CHIPREV_EVERGREEN:
	default:
		if (!alu->is_op3)
			return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
				alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
		else
			return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
	}
}

/* alu instructions that can execute on any unit */
static int is_alu_any_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	return !is_alu_vec_unit_inst(bc, alu) &&
		!is_alu_trans_unit_inst(bc, alu);
}

static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
			    struct r600_bc_alu *assignment[5])
{
	struct r600_bc_alu *alu;
	unsigned i, chan, trans;

	for (i = 0; i < 5; i++)
		assignment[i] = NULL;

	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
		chan = alu->dst.chan;
		if (is_alu_trans_unit_inst(bc, alu))
			trans = 1;
		else if (is_alu_vec_unit_inst(bc, alu))
			trans = 0;
		else if (assignment[chan])
			trans = 1; // assume ALU_INST_PREFER_VECTOR
		else
			trans = 0;

		if (trans) {
			if (assignment[4]) {
				assert(0); //ALU.Trans has already been allocated
				return -1;
			}
			assignment[4] = alu;
		} else {
			if (assignment[chan]) {
				assert(0); //ALU.chan has already been allocated
				return -1;
			}
			assignment[chan] = alu;
		}

		if (alu->last)
			break;
	}
	return 0;
}

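/* Scratch state for validating a bank swizzle choice: which GPR is loaded
 * on each of the three read cycles for each channel, and which constant
 * file locations are claimed. The tables below give, for each swizzle
 * value, the cycle in which source operand 0/1/2 is fetched. */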
struct alu_bank_swizzle {
	int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
	int hw_cfile_addr[4];
	int hw_cfile_elem[4];
};

const unsigned cycle_for_bank_swizzle_vec[][3] = {
	[SQ_ALU_VEC_012] = { 0, 1, 2 },
	[SQ_ALU_VEC_021] = { 0, 2, 1 },
	[SQ_ALU_VEC_120] = { 1, 2, 0 },
	[SQ_ALU_VEC_102] = { 1, 0, 2 },
	[SQ_ALU_VEC_201] = { 2, 0, 1 },
	[SQ_ALU_VEC_210] = { 2, 1, 0 }
};

const unsigned cycle_for_bank_swizzle_scl[][3] = {
	[SQ_ALU_SCL_210] = { 2, 1, 0 },
	[SQ_ALU_SCL_122] = { 1, 2, 2 },
	[SQ_ALU_SCL_212] = { 2, 1, 2 },
	[SQ_ALU_SCL_221] = { 2, 2, 1 }
};

static void init_bank_swizzle(struct alu_bank_swizzle *bs)
{
	int i, cycle, component;
	/* set up gpr use */
	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
		for (component = 0; component < NUM_OF_COMPONENTS; component++)
			bs->hw_gpr[cycle][component] = -1;
	for (i = 0; i < 4; i++)
		bs->hw_cfile_addr[i] = -1;
	for (i = 0; i < 4; i++)
		bs->hw_cfile_elem[i] = -1;
}

static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
{
	if (bs->hw_gpr[cycle][chan] == -1)
		bs->hw_gpr[cycle][chan] = sel;
	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
		// Another scalar operation has already used GPR read port for channel
		return -1;
	}
	return 0;
}

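/* Reserve a constant-file read port. R600 tracks four ports here; from
 * R700 on only two are used and chan is halved, presumably because each
 * port fetches a pair of elements there. Returns -1 when every port is
 * already taken by a different address/element. */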
static int reserve_cfile(struct r600_bc *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
{
	int res, num_res = 4;
	if (bc->chiprev >= CHIPREV_R700) {
		num_res = 2;
		chan /= 2;
	}
	for (res = 0; res < num_res; ++res) {
		if (bs->hw_cfile_addr[res] == -1) {
			bs->hw_cfile_addr[res] = sel;
			bs->hw_cfile_elem[res] = chan;
			return 0;
		} else if (bs->hw_cfile_addr[res] == sel &&
			   bs->hw_cfile_elem[res] == chan)
			return 0; // Read for this scalar element already reserved, nothing to do here.
	}
	// All cfile read ports are used, cannot reference vector element
	return -1;
}

static int is_gpr(unsigned sel)
{
	return (sel >= 0 && sel <= 127);
}

/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants; is that really required? */
static int is_cfile(unsigned sel)
{
	return (sel > 255 && sel < 512) ||
		(sel > 511 && sel < 4607) || // Kcache before translate
		(sel > 127 && sel < 192); // Kcache after translate
}

static int is_const(int sel)
{
	return is_cfile(sel) ||
		(sel >= V_SQ_ALU_SRC_0 &&
		sel <= V_SQ_ALU_SRC_LITERAL);
}

static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
			struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, sel, elem, cycle;

	num_src = r600_bc_get_num_operands(bc, alu);
	for (src = 0; src < num_src; src++) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
				// Nothing to do; special-case optimization,
				// second source uses first source's reservation
				continue;
			else {
				r = reserve_gpr(bs, sel, elem, cycle);
				if (r)
					return r;
			}
		} else if (is_cfile(sel)) {
			r = reserve_cfile(bc, bs, sel, elem);
			if (r)
				return r;
		}
		// No restrictions on PV, PS, literal or special constants
	}
	return 0;
}

static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
			struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, const_count, sel, elem, cycle;

	num_src = r600_bc_get_num_operands(bc, alu);
	for (const_count = 0, src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_const(sel)) { // Any constant, including literal and inline constants
			if (const_count >= 2)
				// More than two references to a constant in
				// transcendental operation.
				return -1;
			else
				const_count++;
		}
		if (is_cfile(sel)) {
			r = reserve_cfile(bc, bs, sel, elem);
			if (r)
				return r;
		}
	}
	for (src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
			if (cycle < const_count)
				// Cycle for GPR load conflicts with
				// constant load in transcendental operation.
				return -1;
			r = reserve_gpr(bs, sel, elem, cycle);
			if (r)
				return r;
		}
		// Constants already processed
		// No restrictions on PV, PS
	}
	return 0;
}

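/* Find a bank swizzle assignment for a whole instruction group. If any
 * slot carries a forced swizzle it is taken as-is and validation is
 * skipped; otherwise the loop below enumerates every combination
 * odometer-style (vector slots run SQ_ALU_VEC_012..210, the trans slot
 * SQ_ALU_SCL_210..221) until one passes check_vector/check_scalar. */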
static int check_and_set_bank_swizzle(struct r600_bc *bc,
				      struct r600_bc_alu *slots[5])
{
	struct alu_bank_swizzle bs;
	int bank_swizzle[5];
	int i, r = 0, forced = 0;

	for (i = 0; i < 5; i++)
		if (slots[i] && slots[i]->bank_swizzle_force) {
			slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
			forced = 1;
		}

	if (forced)
		return 0;

	// just check every possible combination of bank swizzles
	// not very efficient, but works on the first try in most cases
	for (i = 0; i < 4; i++)
		bank_swizzle[i] = SQ_ALU_VEC_012;
	bank_swizzle[4] = SQ_ALU_SCL_210;
	while (bank_swizzle[4] <= SQ_ALU_SCL_221) {
		init_bank_swizzle(&bs);
		for (i = 0; i < 4; i++) {
			if (slots[i]) {
				r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
				if (r)
					break;
			}
		}
		if (!r && slots[4]) {
			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
		}
		if (!r) {
			for (i = 0; i < 5; i++) {
				if (slots[i])
					slots[i]->bank_swizzle = bank_swizzle[i];
			}
			return 0;
		}

		for (i = 0; i < 5; i++) {
			bank_swizzle[i]++;
			if (bank_swizzle[i] <= SQ_ALU_VEC_210)
				break;
			else
				bank_swizzle[i] = SQ_ALU_VEC_012;
		}
	}

	// couldn't find a working swizzle
	return -1;
}

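/* Sources that read a GPR written by the directly preceding group can use
 * the PV/PS bypass registers instead, which avoids GPR read-port pressure:
 * PV holds the previous group's vector results per channel, PS the
 * previous trans result. Reduction instructions broadcast their result,
 * hence chan is forced to 0 for them below. */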
static int replace_gpr_with_pv_ps(struct r600_bc *bc,
				  struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
{
	struct r600_bc_alu *prev[5];
	int gpr[5], chan[5];
	int i, j, r, src, num_src;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	for (i = 0; i < 5; ++i) {
		if (prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
			gpr[i] = prev[i]->dst.sel;
			if (is_alu_reduction_inst(bc, prev[i]))
				chan[i] = 0;
			else
				chan[i] = prev[i]->dst.chan;
		} else
			gpr[i] = -1;
	}

	for (i = 0; i < 5; ++i) {
		struct r600_bc_alu *alu = slots[i];
		if (!alu)
			continue;

		num_src = r600_bc_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
				continue;

			if (alu->src[src].sel == gpr[4] &&
			    alu->src[src].chan == chan[4]) {
				alu->src[src].sel = V_SQ_ALU_SRC_PS;
				alu->src[src].chan = 0;
				continue;
			}

			for (j = 0; j < 4; ++j) {
				if (alu->src[src].sel == gpr[j] &&
				    alu->src[src].chan == j) {
					alu->src[src].sel = V_SQ_ALU_SRC_PV;
					alu->src[src].chan = chan[j];
					break;
				}
			}
		}
	}

	return 0;
}

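/* Map a literal bit pattern onto one of the hardware's inline constants
 * (0, 1, -1, 1.0f, 0.5f, ...) where possible; -1.0f and -0.5f reuse the
 * positive constant with the negate bit toggled. Anything else stays a
 * real literal (V_SQ_ALU_SRC_LITERAL). */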
void r600_bc_special_constants(u32 value, unsigned *sel, unsigned *neg)
{
	switch (value) {
	case 0:
		*sel = V_SQ_ALU_SRC_0;
		break;
	case 1:
		*sel = V_SQ_ALU_SRC_1_INT;
		break;
	case -1:
		*sel = V_SQ_ALU_SRC_M_1_INT;
		break;
	case 0x3F800000: // 1.0f
		*sel = V_SQ_ALU_SRC_1;
		break;
	case 0x3F000000: // 0.5f
		*sel = V_SQ_ALU_SRC_0_5;
		break;
	case 0xBF800000: // -1.0f
		*sel = V_SQ_ALU_SRC_1;
		*neg ^= 1;
		break;
	case 0xBF000000: // -0.5f
		*sel = V_SQ_ALU_SRC_0_5;
		*neg ^= 1;
		break;
	default:
		*sel = V_SQ_ALU_SRC_LITERAL;
		break;
	}
}

/* compute how many literals are needed */
static int r600_bc_alu_nliterals(struct r600_bc *bc, struct r600_bc_alu *alu,
				 uint32_t literal[4], unsigned *nliteral)
{
	unsigned num_src = r600_bc_get_num_operands(bc, alu);
	unsigned i, j;

	for (i = 0; i < num_src; ++i) {
		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			uint32_t value = alu->src[i].value[alu->src[i].chan];
			unsigned found = 0;
			for (j = 0; j < *nliteral; ++j) {
				if (literal[j] == value) {
					found = 1;
					break;
				}
			}
			if (!found) {
				if (*nliteral >= 4)
					return -EINVAL;
				literal[(*nliteral)++] = value;
			}
		}
	}
	return 0;
}

static void r600_bc_alu_adjust_literals(struct r600_bc *bc,
					struct r600_bc_alu *alu,
					uint32_t literal[4], unsigned nliteral)
{
	unsigned num_src = r600_bc_get_num_operands(bc, alu);
	unsigned i, j;

	for (i = 0; i < num_src; ++i) {
		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			uint32_t value = alu->src[i].value[alu->src[i].chan];
			for (j = 0; j < nliteral; ++j) {
				if (literal[j] == value) {
					alu->src[i].chan = j;
					break;
				}
			}
		}
	}
}

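/* Try to merge the current instruction group into the previous one. The
 * merge is abandoned (return 0 without changes) whenever the combined
 * group would break a hardware rule: too many literals, MOVA mixed with
 * relative addressing, both groups claiming the same slot with no free
 * trans unit, a source depending on a previous-group write, more than one
 * PRED/KILL instruction, or no valid bank swizzle for the result. */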
static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
			     struct r600_bc_alu *alu_prev)
{
	struct r600_bc_alu *prev[5];
	struct r600_bc_alu *result[5] = { NULL };

	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	for (i = 0; i < 5; ++i) {
		struct r600_bc_alu *alu;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bc_alu_nliterals(bc, prev[i], literal, &nliteral))
				return 0;
			if (r600_bc_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
				return 0;
			if (is_alu_mova_inst(bc, prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}
			num_once_inst += is_alu_once_inst(bc, prev[i]);
		}
		if (slots[i] && r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral))
			return 0;

		// let's check used slots
		if (prev[i] && !slots[i]) {
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				// trans unit is still free, try to use it
				if (is_alu_any_unit_inst(bc, slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if (!slots[i]) {
			continue;
		} else
			result[i] = slots[i];

		// let's check source gprs
		alu = slots[i];
		num_once_inst += is_alu_once_inst(bc, alu);

		num_src = r600_bc_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (alu->src[src].rel) {
				if (have_mova)
					return 0;
				have_rel = 1;
			}

			// constants don't matter
			if (!is_gpr(alu->src[src].sel))
				continue;

			for (j = 0; j < 5; ++j) {
				if (!prev[j] || !prev[j]->dst.write)
					continue;

				// if it's relative then we can't determine which gpr is really used
				if (prev[j]->dst.chan == alu->src[src].chan &&
				    (prev[j]->dst.sel == alu->src[src].sel ||
				    prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions */
	for (i = 0; i < 5; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < 5; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}

/* This code handles kcache lines as single blocks of 32 constants. We could
 * probably do slightly better by recognizing that we actually have two
 * consecutive lines of 16 constants, but the resulting code would also be
 * somewhat more complicated. */
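/* For example, a constant at sel 520 sits in block (520 - 512) / 32 = 0,
 * i.e. kcache addr 0 (addresses are counted in 16-constant lines, hence
 * the "* 2" below); after translation the source is rebased to 128 or 160
 * depending on which of the two kcache slots ends up holding its line. */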
static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
{
	struct r600_bc_kcache *kcache = bc->cf_last->kcache;
	unsigned int required_lines;
	unsigned int free_lines = 0;
	unsigned int cache_line[3];
	unsigned int count = 0;
	unsigned int i, j;
	int r;

	/* Collect required cache lines. */
	for (i = 0; i < 3; ++i) {
		bool found = false;
		unsigned int line;

		if (alu->src[i].sel < 512)
			continue;

		line = ((alu->src[i].sel - 512) / 32) * 2;

		for (j = 0; j < count; ++j) {
			if (cache_line[j] == line) {
				found = true;
				break;
			}
		}

		if (!found)
			cache_line[count++] = line;
	}

	/* This should never actually happen. */
	if (count >= 3) return -ENOMEM;

	for (i = 0; i < 2; ++i) {
		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
			++free_lines;
		}
	}

	/* Filter lines pulled in by previous instructions. Note that this is
	 * only for the required_lines count, we can't remove these from the
	 * cache_line array since we may have to start a new ALU clause. */
	for (i = 0, required_lines = count; i < count; ++i) {
		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == cache_line[i]) {
				--required_lines;
				break;
			}
		}
	}

	/* Start a new ALU clause if needed. */
	if (required_lines > free_lines) {
		if ((r = r600_bc_add_cf(bc))) {
			return r;
		}
		bc->cf_last->inst = (type << 3);
		kcache = bc->cf_last->kcache;
	}

	/* Setup the kcache lines. */
	for (i = 0; i < count; ++i) {
		bool found = false;

		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == cache_line[i]) {
				found = true;
				break;
			}
		}

		if (found) continue;

		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
				kcache[j].bank = 0;
				kcache[j].addr = cache_line[i];
				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
				break;
			}
		}
	}

	/* Alter the src operands to refer to the kcache. */
	for (i = 0; i < 3; ++i) {
		static const unsigned int base[] = {128, 160, 256, 288};
		unsigned int line;

		if (alu->src[i].sel < 512)
			continue;

		alu->src[i].sel -= 512;
		line = (alu->src[i].sel / 32) * 2;

		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == line) {
				alu->src[i].sel &= 0x1f;
				alu->src[i].sel += base[j];
				break;
			}
		}
	}

	return 0;
}

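/* Add one ALU instruction. Besides appending it to the current (or a
 * freshly started) ALU clause and reserving its kcache lines, this is
 * where a finished group (nalu->last) gets post-processed: units are
 * assigned, the group is merged with its predecessor when possible, PV/PS
 * substitution and bank swizzling are applied, and the clause's dword
 * count is updated with the group's literal padding. */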
int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
{
	struct r600_bc_alu *nalu = r600_bc_alu();
	struct r600_bc_alu *lalu;
	int i, r;

	if (nalu == NULL)
		return -ENOMEM;
	memcpy(nalu, alu, sizeof(struct r600_bc_alu));

	if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
		/* check if we could add it anyway */
		if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
		    type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
				if (lalu->predicate) {
					bc->force_add_cf = 1;
					break;
				}
			}
		} else
			bc->force_add_cf = 1;
	}

	/* a cf can contain only alu, only vtx, or only tex */
	if (bc->cf_last == NULL || bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(nalu);
			return r;
		}
	}
	bc->cf_last->inst = (type << 3);

	/* Setup the kcache for this ALU instruction. This will start a new
	 * ALU clause if needed. */
	if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
		free(nalu);
		return r;
	}

	if (!bc->cf_last->curr_bs_head) {
		bc->cf_last->curr_bs_head = nalu;
	}
	/* number of gpr == the last gpr used in any alu */
	for (i = 0; i < 3; i++) {
		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
			bc->ngpr = nalu->src[i].sel + 1;
		}
		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
			r600_bc_special_constants(
				nalu->src[i].value[nalu->src[i].chan],
				&nalu->src[i].sel, &nalu->src[i].neg);
	}
	if (nalu->dst.sel >= bc->ngpr) {
		bc->ngpr = nalu->dst.sel + 1;
	}
	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
	/* each alu uses 2 dwords */
	bc->cf_last->ndw += 2;
	bc->ndw += 2;

	/* process current ALU instructions for bank swizzle */
	if (nalu->last) {
		uint32_t literal[4];
		unsigned nliteral;
		struct r600_bc_alu *slots[5];
		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
		if (r)
			return r;

		if (bc->cf_last->prev_bs_head) {
			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
			if (r)
				return r;
		}

		if (bc->cf_last->prev_bs_head) {
			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
			if (r)
				return r;
		}

		r = check_and_set_bank_swizzle(bc, slots);
		if (r)
			return r;

		for (i = 0, nliteral = 0; i < 5; i++) {
			if (slots[i]) {
				r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
				if (r)
					return r;
			}
		}
		bc->cf_last->ndw += align(nliteral, 2);

		/* at most 128 slots, one add alu can add 5 slots + 4 constants (2 slots)
		 * worst case */
		if ((bc->cf_last->ndw >> 1) >= 120) {
			bc->force_add_cf = 1;
		}

		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
		bc->cf_last->curr_bs_head = NULL;
	}
	return 0;
}

int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
{
	return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
}

static void r600_bc_remove_alu(struct r600_bc_cf *cf, struct r600_bc_alu *alu)
{
	if (alu->last && alu->list.prev != &cf->alu) {
		PREV_ALU(alu)->last = 1;
	}
	LIST_DEL(&alu->list);
	free(alu);
	cf->ndw -= 2;
}

int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
{
	struct r600_bc_vtx *nvtx = r600_bc_vtx();
	int r;

	if (nvtx == NULL)
		return -ENOMEM;
	memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));

	/* a cf can contain only alu, only vtx, or only tex */
	if (bc->cf_last == NULL ||
	    (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
	     bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) ||
	    bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(nvtx);
			return r;
		}
		bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
	}
	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
	/* each fetch uses 4 dwords */
	bc->cf_last->ndw += 4;
	bc->ndw += 4;
	if ((bc->cf_last->ndw / 4) > 7)
		bc->force_add_cf = 1;
	return 0;
}

int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
{
	struct r600_bc_tex *ntex = r600_bc_tex();
	int r;

	if (ntex == NULL)
		return -ENOMEM;
	memcpy(ntex, tex, sizeof(struct r600_bc_tex));

	/* a cf can contain only alu, only vtx, or only tex */
	if (bc->cf_last == NULL ||
	    bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
	    bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(ntex);
			return r;
		}
		bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
	}
	if (ntex->src_gpr >= bc->ngpr) {
		bc->ngpr = ntex->src_gpr + 1;
	}
	if (ntex->dst_gpr >= bc->ngpr) {
		bc->ngpr = ntex->dst_gpr + 1;
	}
	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
	/* each texture fetch uses 4 dwords */
	bc->cf_last->ndw += 4;
	bc->ndw += 4;
	if ((bc->cf_last->ndw / 4) > 7)
		bc->force_add_cf = 1;
	return 0;
}

int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
{
	int r;
	r = r600_bc_add_cf(bc);
	if (r)
		return r;

	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
	bc->cf_last->inst = inst;
	return 0;
}

/* common to all 3 families */
static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
{
	unsigned fetch_resource_start = 0;

	/* check if this is a fetch shader */
	/* a fetch shader can also access vertex resources;
	 * the first fetch shader resource is at 160
	 */
	if (bc->type == -1) {
		switch (bc->chiprev) {
		/* r600 */
		case CHIPREV_R600:
		/* r700 */
		case CHIPREV_R700:
			fetch_resource_start = 160;
			break;
		/* evergreen */
		case CHIPREV_EVERGREEN:
			fetch_resource_start = 0;
			break;
		default:
			fprintf(stderr, "%s:%s:%d unknown chiprev %d\n",
				__FILE__, __func__, __LINE__, bc->chiprev);
			break;
		}
	}
	bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id + fetch_resource_start) |
			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x) |
			S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
			S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
			S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
			S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
			S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
			S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
			S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
			S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
			S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
			S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
	bc->bytecode[id++] = S_SQ_VTX_WORD2_MEGA_FETCH(1);
	bc->bytecode[id++] = 0;
	return 0;
}

/* common to all 3 families */
static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
{
	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
			S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
			S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
			S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
			S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
			S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
			S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
			S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
			S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
			S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
			S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
			S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
			S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
			S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
			S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
			S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
			S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
			S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
			S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
			S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
			S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
	bc->bytecode[id++] = 0;
	return 0;
}

/* r600 only, r700/eg bits in r700_asm.c */
static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
{
	/* don't replace gpr by pv or ps for destination register */
	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
			S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
			S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
			S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
			S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
			S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
			S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
			S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
			S_SQ_ALU_WORD0_LAST(alu->last);

	if (alu->is_op3) {
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
				S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
				S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
				S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
				S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
				S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
				S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
				S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
				S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
				S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
	} else {
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
				S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
				S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
				S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
				S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
				S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
				S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
				S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
				S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
				S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
				S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
				S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
	}
	return 0;
}

enum cf_class
{
	CF_CLASS_ALU,
	CF_CLASS_TEXTURE,
	CF_CLASS_VERTEX,
	CF_CLASS_EXPORT,
	CF_CLASS_OTHER
};

static enum cf_class r600_bc_cf_class(struct r600_bc_cf *cf)
{
	switch (cf->inst) {
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
		return CF_CLASS_ALU;

	case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
		return CF_CLASS_TEXTURE;

	case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
	case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
		return CF_CLASS_VERTEX;

	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
	case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		return CF_CLASS_EXPORT;

	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
	case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
	case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
		return CF_CLASS_OTHER;

	default:
		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
		return -EINVAL;
	}
}

/* common for r600/r700 - eg in eg_asm.c */
static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
{
	unsigned id = cf->id;
	unsigned end_of_program = bc->cf.prev == &cf->list;

	switch (r600_bc_cf_class(cf)) {
	case CF_CLASS_ALU:
		assert(!end_of_program);
		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);

		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
			S_SQ_CF_ALU_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
			S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
		break;
	case CF_CLASS_TEXTURE:
	case CF_CLASS_VERTEX:
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
			S_SQ_CF_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1) |
			S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);
		break;
	case CF_CLASS_EXPORT:
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(end_of_program);
		break;
	case CF_CLASS_OTHER:
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
			S_SQ_CF_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_WORD1_COND(cf->cond) |
			S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
			S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);

		break;
	default:
		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
		return -EINVAL;
	}
	return 0;
}

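/* Liveness tracking for the register-remapping pass: every GPR gets a
 * list of usage ranges [start, end] built by the read/write "notice"
 * helpers below. replacement records a preferred remap target (e.g. the
 * source of a plain MOV), rel_block the size of a relatively addressed
 * block that has to stay contiguous. */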
struct gpr_usage_range {
	int replacement;
	int rel_block;
	int start;
	int end;
};

struct gpr_usage {
	unsigned channels:4;
	int first_write;
	int last_write[4];
	unsigned nranges;
	struct gpr_usage_range *ranges;
};

static struct gpr_usage_range* last_gpr_usage_range(struct gpr_usage *usage)
{
	if (usage->nranges)
		return usage->ranges + usage->nranges - 1;
	else
		return NULL;
}

static struct gpr_usage_range* add_gpr_usage_range(struct gpr_usage *usage)
{
	struct gpr_usage_range *range;

	usage->nranges++;
	usage->ranges = realloc(usage->ranges, usage->nranges * sizeof(struct gpr_usage_range));
	if (!usage->ranges)
		return NULL;

	range = last_gpr_usage_range(usage);
	range->replacement = -1; /* no preferred replacement */
	range->rel_block = -1;
	range->start = -1;
	range->end = -1;

	return range;
}

static void notice_gpr_read(struct gpr_usage *usage, int id, unsigned chan)
{
	struct gpr_usage_range* range;

	usage->channels |= 1 << chan;
	usage->first_write = -1;
	if (!usage->nranges) {
		range = add_gpr_usage_range(usage);
	} else
		range = last_gpr_usage_range(usage);

	if (range && range->end < id)
		range->end = id;
}

static void notice_gpr_rel_read(struct r600_bc *bc, struct gpr_usage usage[128],
				int id, unsigned gpr, unsigned chan)
{
	unsigned i;
	for (i = gpr; i < bc->ngpr; ++i)
		notice_gpr_read(&usage[i], id, chan);

	last_gpr_usage_range(&usage[gpr])->rel_block = bc->ngpr - gpr;
}

static void notice_gpr_last_write(struct gpr_usage *usage, int id, unsigned chan)
{
	usage->last_write[chan] = id;
}

static void notice_gpr_write(struct gpr_usage *usage, int id, unsigned chan,
			     int predicate, int prefered_replacement)
{
	struct gpr_usage_range* last_range = last_gpr_usage_range(usage);
	int start = usage->first_write != -1 ? usage->first_write : id;
	usage->channels &= ~(1 << chan);
	if (usage->channels) {
		if (usage->first_write == -1)
			usage->first_write = id;
	} else if (!last_range || (last_range->start != start && !predicate)) {
		usage->first_write = start;
		struct gpr_usage_range* range = add_gpr_usage_range(usage);
		range->replacement = prefered_replacement;
		range->start = start;
	} else if (last_range->start == start && prefered_replacement != -1) {
		last_range->replacement = prefered_replacement;
	}
	notice_gpr_last_write(usage, id, chan);
}

static void notice_gpr_rel_last_write(struct gpr_usage usage[128], int id, unsigned chan)
{
	unsigned i;
	for (i = 0; i < 128; ++i)
		notice_gpr_last_write(&usage[i], id, chan);
}

static void notice_gpr_rel_write(struct gpr_usage usage[128], int id, unsigned chan)
{
	unsigned i;
	for (i = 0; i < 128; ++i)
		notice_gpr_write(&usage[i], id, chan, 1, -1);
}

static void notice_alu_src_gprs(struct r600_bc *bc, struct r600_bc_alu *alu,
				struct gpr_usage usage[128], int id)
{
	unsigned src, num_src;

	num_src = r600_bc_get_num_operands(bc, alu);
	for (src = 0; src < num_src; ++src) {
		// constants don't matter
		if (!is_gpr(alu->src[src].sel))
			continue;

		if (alu->src[src].rel)
			notice_gpr_rel_read(bc, usage, id, alu->src[src].sel, alu->src[src].chan);
		else
			notice_gpr_read(&usage[alu->src[src].sel], id, alu->src[src].chan);
	}
}

1698 static void notice_alu_dst_gprs(struct r600_bc_alu *alu_first, struct gpr_usage usage[128],
1699 int id, int predicate)
1700 {
1701 struct r600_bc_alu *alu;
1702 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
1703 if (alu->dst.write) {
1704 if (alu->dst.rel)
1705 notice_gpr_rel_write(usage, id, alu->dst.chan);
1706 else if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV && is_gpr(alu->src[0].sel))
1707 notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan,
1708 predicate, alu->src[0].sel);
1709 else
1710 notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan, predicate, -1);
1711 }
1712
1713 if (alu->last)
1714 break;
1715 }
1716 }
1717
1718 static void notice_tex_gprs(struct r600_bc *bc, struct r600_bc_tex *tex,
1719 struct gpr_usage usage[128],
1720 int id, int predicate)
1721 {
1722 if (tex->src_rel) {
1723 if (tex->src_sel_x < 4)
1724 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_x);
1725 if (tex->src_sel_y < 4)
1726 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_y);
1727 if (tex->src_sel_z < 4)
1728 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_z);
1729 if (tex->src_sel_w < 4)
1730 notice_gpr_rel_read(bc, usage, id, tex->src_gpr, tex->src_sel_w);
1731 } else {
1732 if (tex->src_sel_x < 4)
1733 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_x);
1734 if (tex->src_sel_y < 4)
1735 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_y);
1736 if (tex->src_sel_z < 4)
1737 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_z);
1738 if (tex->src_sel_w < 4)
1739 notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_w);
1740 }
1741 if (tex->dst_rel) {
1742 if (tex->dst_sel_x != 7)
1743 notice_gpr_rel_write(usage, id, 0);
1744 if (tex->dst_sel_y != 7)
1745 notice_gpr_rel_write(usage, id, 1);
1746 if (tex->dst_sel_z != 7)
1747 notice_gpr_rel_write(usage, id, 2);
1748 if (tex->dst_sel_w != 7)
1749 notice_gpr_rel_write(usage, id, 3);
1750 } else {
1751 if (tex->dst_sel_x != 7)
1752 notice_gpr_write(&usage[tex->dst_gpr], id, 0, predicate, -1);
1753 if (tex->dst_sel_y != 7)
1754 notice_gpr_write(&usage[tex->dst_gpr], id, 1, predicate, -1);
1755 if (tex->dst_sel_z != 7)
1756 notice_gpr_write(&usage[tex->dst_gpr], id, 2, predicate, -1);
1757 if (tex->dst_sel_w != 7)
1758 notice_gpr_write(&usage[tex->dst_gpr], id, 3, predicate, -1);
1759 }
1760 }
1761
1762 static void notice_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
1763 int id, int predicate)
1764 {
1765 notice_gpr_read(&usage[vtx->src_gpr], id, vtx->src_sel_x);
1766
1767 if (vtx->dst_sel_x != 7)
1768 notice_gpr_write(&usage[vtx->dst_gpr], id, 0, predicate, -1);
1769 if (vtx->dst_sel_y != 7)
1770 notice_gpr_write(&usage[vtx->dst_gpr], id, 1, predicate, -1);
1771 if (vtx->dst_sel_z != 7)
1772 notice_gpr_write(&usage[vtx->dst_gpr], id, 2, predicate, -1);
1773 if (vtx->dst_sel_w != 7)
1774 notice_gpr_write(&usage[vtx->dst_gpr], id, 3, predicate, -1);
1775 }
1776
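/* Exports are handled specially: the read is accounted at the first free
 * clause-aligned id after the last write to the exported gpr, and the CF is
 * remembered in export_cf/export_remap so the apply pass can move it there.
 * E.g. with last_write = 0x305 the export id becomes 0x400. */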
1777 static void notice_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
1778 struct r600_bc_cf *export_cf[128], int export_remap[128])
1779 {
1780 //TODO handle other memory operations
1781 struct gpr_usage *output = &usage[cf->output.gpr];
1782 int id = MAX4(output->last_write[0], output->last_write[1],
1783 output->last_write[2], output->last_write[3]);
1784 id += 0x100;
1785 id &= ~0xFF;
1786
1787 export_cf[cf->output.gpr] = cf;
1788 export_remap[cf->output.gpr] = id;
1789 if (cf->output.swizzle_x < 4)
1790 notice_gpr_read(output, id, cf->output.swizzle_x);
1791 if (cf->output.swizzle_y < 4)
1792 notice_gpr_read(output, id, cf->output.swizzle_y);
1793 if (cf->output.swizzle_z < 4)
1794 notice_gpr_read(output, id, cf->output.swizzle_z);
1795 if (cf->output.swizzle_w < 4)
1796 notice_gpr_read(output, id, cf->output.swizzle_w);
1797 }
1798
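/* Range lookups: source ranges match with an exclusive start and inclusive
 * end (start < id <= end), destination ranges the other way around
 * (start <= id < end), where end == -1 keeps a destination range open until
 * the end of the shader. */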
1799 static struct gpr_usage_range *find_src_range(struct gpr_usage *usage, int id)
1800 {
1801 unsigned i;
1802 for (i = 0; i < usage->nranges; ++i) {
1803 struct gpr_usage_range* range = &usage->ranges[i];
1804
1805 if (range->start < id && id <= range->end)
1806 return range;
1807 }
1808 return NULL;
1809 }
1810
1811 static struct gpr_usage_range *find_dst_range(struct gpr_usage *usage, int id)
1812 {
1813 unsigned i;
1814 for (i = 0; i < usage->nranges; ++i) {
1815 struct gpr_usage_range* range = &usage->ranges[i];
1816 int end = range->end;
1817
1818 if (range->start <= id && (id < end || end == -1))
1819 return range;
1820 }
1821 return NULL;
1822 }
1823
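/* Decide whether a CF level barrier is required: roughly, the channel was
 * last written at an id other than the current clause base (ids are spaced
 * 0x100 apart per CF block) and no barrier has been emitted since that
 * write. */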
1824 static int is_barrier_needed(struct gpr_usage *usage, int id, unsigned chan, int last_barrier)
1825 {
1826 if (usage->last_write[chan] != (id & ~0xFF))
1827 return usage->last_write[chan] >= last_barrier;
1828 else
1829 return 0;
1830 }
1831
1832 static int is_intersection(struct gpr_usage_range* a, struct gpr_usage_range* b)
1833 {
1834 return a->start <= b->end && b->start < a->end;
1835 }
1836
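/* Rate how well 'range' would fit into 'gpr': the result is the summed
 * distance to the nearest neighbouring ranges already living there (smaller
 * means tighter packing), or -1 on any overlap. For relatively addressed
 * registers the whole rel_block of consecutive gprs has to fit. */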
1837 static int rate_replacement(struct gpr_usage usage[128], unsigned current, unsigned gpr,
1838 struct gpr_usage_range* range)
1839 {
1840 int max_gpr = gpr + MAX2(range->rel_block, 1);
1841 int best_start = 0x3FFFFFFF, best_end = 0x3FFFFFFF;
1842 unsigned i;
1843
1844 for (; gpr < max_gpr; ++gpr) {
1845
1846 if (gpr >= 128) /* relative gpr block won't fit into clause temporaries */
1847 return -1; /* forget it */
1848
1849 		if (gpr == current) /* ignore ranges of the register to be replaced */
1850 continue;
1851
1852 for (i = 0; i < usage[gpr].nranges; ++i) {
1853 if (usage[gpr].ranges[i].replacement < gpr)
1854 continue; /* ignore already remapped ranges */
1855
1856 if (is_intersection(&usage[gpr].ranges[i], range))
1857 return -1; /* forget it if usages overlap */
1858
1859 if (range->start >= usage[gpr].ranges[i].end)
1860 best_start = MIN2(best_start, range->start - usage[gpr].ranges[i].end);
1861
1862 if (range->end != -1 && range->end <= usage[gpr].ranges[i].start)
1863 best_end = MIN2(best_end, usage[gpr].ranges[i].start - range->end);
1864 }
1865 }
1866 return best_start + best_end;
1867 }
1868
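/* Pick a replacement gpr for one usage range. The search order is: the
 * preferred replacement recorded earlier (following replacement chains),
 * then the clause temporaries 127..124 for ranges confined to a single ALU
 * clause, and finally any lower numbered gpr. A reservation range is added
 * to the winner so later candidates see the slot as occupied. */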
1869 static void find_replacement(struct gpr_usage usage[128], unsigned current,
1870 struct gpr_usage_range *range)
1871 {
1872 unsigned i, j;
1873 int best_gpr = -1, best_rate = 0x7FFFFFFF;
1874
1875 if (range->replacement == current)
1876 		return; /* register prefers not to be remapped */
1877
1878 if (range->replacement != -1 && range->replacement <= current) {
1879 struct gpr_usage_range *other = find_src_range(&usage[range->replacement], range->start);
1880 if (other && other->replacement != -1)
1881 range->replacement = other->replacement;
1882 }
1883
1884 if (range->replacement != -1 && range->replacement < current) {
1885 int rate = rate_replacement(usage, current, range->replacement, range);
1886
1887 		/* check if preferred replacement can be used */
1888 if (rate != -1) {
1889 best_rate = rate;
1890 best_gpr = range->replacement;
1891 }
1892 }
1893
1894 if (best_gpr == -1 && (range->start & ~0xFF) == (range->end & ~0xFF)) {
1895 /* register is just used inside one ALU clause */
1896 /* try to use clause temporaries for it */
1897 for (i = 127; i > 123; --i) {
1898 int rate = rate_replacement(usage, current, i, range);
1899
1900 if (rate == -1) /* can't be used because ranges overlap */
1901 continue;
1902
1903 if (rate < best_rate) {
1904 best_rate = rate;
1905 best_gpr = i;
1906
1907 /* can't get better than this */
1908 if (rate == 0)
1909 break;
1910 }
1911 }
1912 }
1913
1914 if (best_gpr == -1) {
1915 for (i = 0; i < current; ++i) {
1916 int rate = rate_replacement(usage, current, i, range);
1917
1918 if (rate == -1) /* can't be used because ranges overlap */
1919 continue;
1920
1921 if (rate < best_rate) {
1922 best_rate = rate;
1923 best_gpr = i;
1924
1925 /* can't get better than this */
1926 if (rate == 0)
1927 break;
1928 }
1929 }
1930 }
1931
1932 if (best_gpr != -1) {
1933 struct gpr_usage_range *reservation = add_gpr_usage_range(&usage[best_gpr]);
1934 reservation->replacement = best_gpr;
1935 reservation->rel_block = -1;
1936 reservation->start = range->start;
1937 reservation->end = range->end;
1938 } else
1939 best_gpr = current;
1940
1941 range->replacement = best_gpr;
1942 if (range->rel_block == -1)
1943 		return; /* no relative block to handle, we are done here */
1944
1945 	/* set preferred register for the whole relative register block */
1946 for (i = current + 1, ++best_gpr; i < current + range->rel_block; ++i, ++best_gpr) {
1947 for (j = 0; j < usage[i].nranges; ++j) {
1948 if (is_intersection(&usage[i].ranges[j], range))
1949 usage[i].ranges[j].replacement = best_gpr;
1950 }
1951 }
1952 }
1953
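/* Apply-pass helpers: rewrite the actual instructions with the chosen
 * replacements and recompute last_write/barrier information on the fly. */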
1954 static void replace_alu_gprs(struct r600_bc *bc, struct r600_bc_alu *alu, struct gpr_usage usage[128],
1955 int id, int last_barrier, unsigned *barrier)
1956 {
1957 struct gpr_usage *cur_usage;
1958 struct gpr_usage_range *range;
1959 unsigned src, num_src;
1960
1961 num_src = r600_bc_get_num_operands(bc, alu);
1962 for (src = 0; src < num_src; ++src) {
1963 		// constants don't matter
1964 if (!is_gpr(alu->src[src].sel))
1965 continue;
1966
1967 cur_usage = &usage[alu->src[src].sel];
1968 range = find_src_range(cur_usage, id);
1969 alu->src[src].sel = range->replacement;
1970
1971 *barrier |= is_barrier_needed(cur_usage, id, alu->src[src].chan, last_barrier);
1972 }
1973
1974 if (alu->dst.write) {
1975 cur_usage = &usage[alu->dst.sel];
1976 range = find_dst_range(cur_usage, id);
1977 if (!range || range->replacement == -1) {
1978 if (!alu->is_op3)
1979 alu->dst.write = 0;
1980 else
1981 			/* TODO: really check that register 123 is usable */
1982 alu->dst.sel = 123;
1983 } else {
1984 alu->dst.sel = range->replacement;
1985 *barrier |= is_barrier_needed(cur_usage, id, alu->dst.chan, last_barrier);
1986 }
1987 }
1988 if (alu->dst.write) {
1989 if (alu->dst.rel)
1990 notice_gpr_rel_last_write(usage, id, alu->dst.chan);
1991 else
1992 notice_gpr_last_write(cur_usage, id, alu->dst.chan);
1993 }
1994 }
1995
1996 static void replace_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
1997 int id, int last_barrier, unsigned *barrier)
1998 {
1999 struct gpr_usage *cur_usage = &usage[tex->src_gpr];
2000 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2001
2002 if (tex->src_rel) {
2003 *barrier = 1;
2004 } else {
2005 if (tex->src_sel_x < 4)
2006 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_x, last_barrier);
2007 if (tex->src_sel_y < 4)
2008 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_y, last_barrier);
2009 if (tex->src_sel_z < 4)
2010 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_z, last_barrier);
2011 if (tex->src_sel_w < 4)
2012 *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_w, last_barrier);
2013 }
2014 tex->src_gpr = range->replacement;
2015
2016 cur_usage = &usage[tex->dst_gpr];
2017
2018 range = find_dst_range(cur_usage, id);
2019 if (range) {
2020 tex->dst_gpr = range->replacement;
2021
2022 if (tex->dst_rel) {
2023 if (tex->dst_sel_x != 7)
2024 notice_gpr_rel_last_write(usage, id, tex->dst_sel_x);
2025 if (tex->dst_sel_y != 7)
2026 notice_gpr_rel_last_write(usage, id, tex->dst_sel_y);
2027 if (tex->dst_sel_z != 7)
2028 notice_gpr_rel_last_write(usage, id, tex->dst_sel_z);
2029 if (tex->dst_sel_w != 7)
2030 notice_gpr_rel_last_write(usage, id, tex->dst_sel_w);
2031 } else {
2032 if (tex->dst_sel_x != 7)
2033 notice_gpr_last_write(cur_usage, id, tex->dst_sel_x);
2034 if (tex->dst_sel_y != 7)
2035 notice_gpr_last_write(cur_usage, id, tex->dst_sel_y);
2036 if (tex->dst_sel_z != 7)
2037 notice_gpr_last_write(cur_usage, id, tex->dst_sel_z);
2038 if (tex->dst_sel_w != 7)
2039 notice_gpr_last_write(cur_usage, id, tex->dst_sel_w);
2040 }
2041 } else {
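		/* destination is never read again; park the result in a
		 * clause temporary (same caveat as the TODO above) */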
2042 tex->dst_gpr = 123;
2043 }
2044 }
2045
2046 static void replace_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
2047 int id, int last_barrier, unsigned *barrier)
2048 {
2049 struct gpr_usage *cur_usage = &usage[vtx->src_gpr];
2050 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2051
2052 *barrier |= is_barrier_needed(cur_usage, id, vtx->src_sel_x, last_barrier);
2053
2054 vtx->src_gpr = range->replacement;
2055
2056 cur_usage = &usage[vtx->dst_gpr];
2057 range = find_dst_range(cur_usage, id);
2058 if (range) {
2059 vtx->dst_gpr = range->replacement;
2060
2061 if (vtx->dst_sel_x != 7)
2062 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_x);
2063 if (vtx->dst_sel_y != 7)
2064 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_y);
2065 if (vtx->dst_sel_z != 7)
2066 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_z);
2067 if (vtx->dst_sel_w != 7)
2068 notice_gpr_last_write(cur_usage, id, vtx->dst_sel_w);
2069 } else {
2070 vtx->dst_gpr = 123;
2071 }
2072 }
2073
2074 static void replace_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
2075 int id, int last_barrier)
2076 {
2077 //TODO handle other memory operations
2078 struct gpr_usage *cur_usage = &usage[cf->output.gpr];
2079 struct gpr_usage_range *range = find_src_range(cur_usage, id);
2080
2081 cf->barrier = 0;
2082 if (cf->output.swizzle_x < 4)
2083 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_x, last_barrier);
2084 if (cf->output.swizzle_y < 4)
2085 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_y, last_barrier);
2086 if (cf->output.swizzle_z < 4)
2087 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_z, last_barrier);
2088 if (cf->output.swizzle_w < 4)
2089 cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_w, last_barrier);
2090
2091 cf->output.gpr = range->replacement;
2092 }
2093
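/* Drop MOVs that copy a register onto itself without any modifiers. The
 * only subtlety is PV/PS forwarding: if the next instruction group reads
 * the previous vector/scalar result, removing the MOV would change what
 * PV/PS hold, so the copy must stay. */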
2094 static void optimize_alu_inst(struct r600_bc *bc, struct r600_bc_cf *cf, struct r600_bc_alu *alu)
2095 {
2096 struct r600_bc_alu *alu_next;
2097 unsigned chan;
2098 unsigned src, num_src;
2099
2100 /* check if a MOV could be optimized away */
2101 if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV) {
2102
2103 /* destination equals source? */
2104 if (alu->dst.sel != alu->src[0].sel ||
2105 alu->dst.chan != alu->src[0].chan)
2106 return;
2107
2108 /* any special handling for the source? */
2109 if (alu->src[0].rel || alu->src[0].neg || alu->src[0].abs)
2110 return;
2111
2112 /* any special handling for destination? */
2113 if (alu->dst.rel || alu->dst.clamp)
2114 return;
2115
2116 /* ok find next instruction group and check if ps/pv is used */
2117 for (alu_next = alu; !alu_next->last; alu_next = NEXT_ALU(alu_next));
2118
2119 if (alu_next->list.next != &cf->alu) {
2120 chan = is_alu_reduction_inst(bc, alu) ? 0 : alu->dst.chan;
2121 for (alu_next = NEXT_ALU(alu_next); alu_next; alu_next = NEXT_ALU(alu_next)) {
2122 num_src = r600_bc_get_num_operands(bc, alu_next);
2123 for (src = 0; src < num_src; ++src) {
2124 if (alu_next->src[src].sel == V_SQ_ALU_SRC_PV &&
2125 alu_next->src[src].chan == chan)
2126 return;
2127
2128 if (alu_next->src[src].sel == V_SQ_ALU_SRC_PS)
2129 return;
2130 }
2131
2132 if (alu_next->last)
2133 break;
2134 }
2135 }
2136
2137 r600_bc_remove_alu(cf, alu);
2138 }
2139 }
2140
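/* Try to merge an export with its predecessor into a single burst export.
 * This works when type, element size and swizzles match and the gpr and
 * array_base of one directly continue the other; bursts are capped at 16. */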
2141 static void optimize_export_inst(struct r600_bc *bc, struct r600_bc_cf *cf)
2142 {
2143 struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, cf->list.prev, list);
2144 if (&prev->list == &bc->cf ||
2145 prev->inst != cf->inst ||
2146 prev->output.type != cf->output.type ||
2147 prev->output.elem_size != cf->output.elem_size ||
2148 prev->output.swizzle_x != cf->output.swizzle_x ||
2149 prev->output.swizzle_y != cf->output.swizzle_y ||
2150 prev->output.swizzle_z != cf->output.swizzle_z ||
2151 prev->output.swizzle_w != cf->output.swizzle_w)
2152 return;
2153
2154 if ((prev->output.burst_count + cf->output.burst_count) > 16)
2155 return;
2156
2157 if ((prev->output.gpr + prev->output.burst_count) == cf->output.gpr &&
2158 (prev->output.array_base + prev->output.burst_count) == cf->output.array_base) {
2159
2160 prev->output.burst_count += cf->output.burst_count;
2161 r600_bc_remove_cf(bc, cf);
2162
2163 } else if (prev->output.gpr == (cf->output.gpr + cf->output.burst_count) &&
2164 prev->output.array_base == (cf->output.array_base + cf->output.burst_count)) {
2165
2166 cf->output.burst_count += prev->output.burst_count;
2167 r600_bc_remove_cf(bc, prev);
2168 }
2169 }
2170
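/* Overall flow of the gpr optimization: pass 1 walks the CF list and builds
 * per-gpr usage ranges (notice_*), pass 2 picks a replacement for every
 * range (find_replacement), pass 3 rewrites the instructions (replace_*),
 * moves exports behind the last write and merges them into bursts. Loops
 * are not handled yet; in that case the code bails out before pass 2. */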
2171 static void r600_bc_optimize(struct r600_bc *bc)
2172 {
2173 struct r600_bc_cf *cf, *next_cf;
2174 struct r600_bc_alu *first, *next_alu;
2175 struct r600_bc_alu *alu;
2176 struct r600_bc_vtx *vtx;
2177 struct r600_bc_tex *tex;
2178 struct gpr_usage usage[128];
2179
2180 /* assume that each gpr is exported only once */
2181 struct r600_bc_cf *export_cf[128] = { NULL };
2182 int export_remap[128];
2183
2184 int id, cond_start, barrier[bc->nstack];
2185 unsigned i, j, stack, predicate, old_stack;
2186
2187 memset(&usage, 0, sizeof(usage));
2188 for (i = 0; i < 128; ++i) {
2189 usage[i].first_write = -1;
2190 usage[i].last_write[0] = -1;
2191 usage[i].last_write[1] = -1;
2192 usage[i].last_write[2] = -1;
2193 usage[i].last_write[3] = -1;
2194 }
2195
2196 	/* first gather some information about the gpr usage */
2197 id = 0; stack = 0;
2198 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2199 old_stack = stack;
2200 		if (stack == 0)
2201 			cond_start = id; /* id at which the conditional block starts */
2202
2203 switch (r600_bc_cf_class(cf)) {
2204 case CF_CLASS_ALU:
2205 predicate = 0;
2206 first = NULL;
2207 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2208 if (!first)
2209 first = alu;
2210 notice_alu_src_gprs(bc, alu, usage, id);
2211 if (alu->last) {
2212 notice_alu_dst_gprs(first, usage, id, predicate || stack > 0);
2213 first = NULL;
2214 ++id;
2215 }
2216 if (is_alu_pred_inst(bc, alu))
2217 predicate++;
2218 }
2219 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
2220 stack += predicate;
2221 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
2222 stack -= 1;
2223 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
2224 stack -= 2;
2225 break;
2226 case CF_CLASS_TEXTURE:
2227 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2228 notice_tex_gprs(bc, tex, usage, id++, stack > 0);
2229 }
2230 break;
2231 case CF_CLASS_VERTEX:
2232 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2233 notice_vtx_gprs(vtx, usage, id++, stack > 0);
2234 }
2235 break;
2236 case CF_CLASS_EXPORT:
2237 notice_export_gprs(cf, usage, export_cf, export_remap);
2238 continue; // don't increment id
2239 case CF_CLASS_OTHER:
2240 switch (cf->inst) {
2241 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2242 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2243 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2244 break;
2245
2246 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2247 stack -= cf->pop_count;
2248 break;
2249
2250 default:
2251 // TODO implement loop handling
2252 goto out;
2253 }
2254 }
2255
2256 /* extend last_write after conditional block */
2257 if (stack == 0 && old_stack != 0)
2258 for (i = 0; i < 128; ++i)
2259 for (j = 0; j < 4; ++j)
2260 if (usage[i].last_write[j] >= cond_start)
2261 usage[i].last_write[j] = id;
2262
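		/* round up to the next 0x100 aligned clause id, e.g. 0x305 -> 0x400 */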
2263 id += 0x100;
2264 id &= ~0xFF;
2265 }
2266 assert(stack == 0);
2267
2268 /* try to optimize gpr usage */
2269 for (i = 0; i < 124; ++i) {
2270 for (j = 0; j < usage[i].nranges; ++j) {
2271 struct gpr_usage_range *range = &usage[i].ranges[j];
2272 if (range->start == -1)
2273 				/* can't rearrange shader inputs */
2274 range->replacement = i;
2275 else if (range->end == -1)
2276 /* gpr isn't used any more after this instruction */
2277 range->replacement = -1;
2278 else
2279 find_replacement(usage, i, range);
2280
2281 if (range->replacement == i)
2282 bc->ngpr = i;
2283 else if (range->replacement < i && range->replacement > bc->ngpr)
2284 bc->ngpr = range->replacement;
2285 }
2286 }
2287 bc->ngpr++;
2288
2289 /* apply the changes */
2290 for (i = 0; i < 128; ++i) {
2291 usage[i].last_write[0] = -1;
2292 usage[i].last_write[1] = -1;
2293 usage[i].last_write[2] = -1;
2294 usage[i].last_write[3] = -1;
2295 }
2296 barrier[0] = 0;
2297 id = 0; stack = 0;
2298 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2299 old_stack = stack;
2300 switch (r600_bc_cf_class(cf)) {
2301 case CF_CLASS_ALU:
2302 predicate = 0;
2303 first = NULL;
2304 cf->barrier = 0;
2305 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2306 replace_alu_gprs(bc, alu, usage, id, barrier[stack], &cf->barrier);
2307 if (alu->last)
2308 ++id;
2309
2310 if (is_alu_pred_inst(bc, alu))
2311 predicate++;
2312
2313 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)
2314 optimize_alu_inst(bc, cf, alu);
2315 }
2316 if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
2317 stack += predicate;
2318 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
2319 stack -= 1;
2320 else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
2321 stack -= 2;
2322 if (LIST_IS_EMPTY(&cf->alu)) {
2323 r600_bc_remove_cf(bc, cf);
2324 cf = NULL;
2325 }
2326 break;
2327 case CF_CLASS_TEXTURE:
2328 cf->barrier = 0;
2329 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2330 replace_tex_gprs(tex, usage, id++, barrier[stack], &cf->barrier);
2331 }
2332 break;
2333 case CF_CLASS_VERTEX:
2334 cf->barrier = 0;
2335 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2336 replace_vtx_gprs(vtx, usage, id++, barrier[stack], &cf->barrier);
2337 }
2338 break;
2339 case CF_CLASS_EXPORT:
2340 continue; // don't increment id
2341 case CF_CLASS_OTHER:
2342 if (cf->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
2343 cf->barrier = 0;
2344 stack -= cf->pop_count;
2345 }
2346 break;
2347 }
2348
2349 id &= ~0xFF;
2350 if (cf && cf->barrier)
2351 barrier[old_stack] = id;
2352
2353 for (i = old_stack + 1; i <= stack; ++i)
2354 barrier[i] = barrier[old_stack];
2355
2356 id += 0x100;
2357 if (stack != 0) /* ensure exports are placed outside of conditional blocks */
2358 continue;
2359
2360 for (i = 0; i < 128; ++i) {
2361 if (!export_cf[i] || id < export_remap[i])
2362 continue;
2363
2364 r600_bc_move_cf(bc, export_cf[i], next_cf);
2365 replace_export_gprs(export_cf[i], usage, export_remap[i], barrier[stack]);
2366 if (export_cf[i]->barrier)
2367 barrier[stack] = id - 1;
2368 next_cf = LIST_ENTRY(struct r600_bc_cf, export_cf[i]->list.next, list);
2369 optimize_export_inst(bc, export_cf[i]);
2370 export_cf[i] = NULL;
2371 }
2372 }
2373 assert(stack == 0);
2374
2375 out:
2376 for (i = 0; i < 128; ++i) {
2377 free(usage[i].ranges);
2378 }
2379 }
2380
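/* Final bytecode emission: compute the address of every CF block first,
 * then encode CF, ALU, vertex and texture words for the target chip
 * revision. ALU literal constants are appended after each instruction
 * group, padded to a multiple of two dwords. */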
2381 int r600_bc_build(struct r600_bc *bc)
2382 {
2383 struct r600_bc_cf *cf;
2384 struct r600_bc_alu *alu;
2385 struct r600_bc_vtx *vtx;
2386 struct r600_bc_tex *tex;
2387 struct r600_bc_cf *exports[4] = { NULL };
2388 uint32_t literal[4];
2389 unsigned nliteral;
2390 unsigned addr;
2391 int i, r;
2392
2393 if (bc->callstack[0].max > 0)
2394 bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
2395 if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
2396 bc->nstack = 1;
2397 }
2398
2399 	//r600_bc_optimize(bc); /* gpr remapping pass, currently disabled */
2400
2401 	/* first pass: compute addr of each CF block */
2402 	/* addresses start after all the CF instructions */
2403 addr = LIST_ENTRY(struct r600_bc_cf, bc->cf.prev, list)->id + 2;
2404 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2405 switch (r600_bc_cf_class(cf)) {
2406 case CF_CLASS_ALU:
2407 break;
2408 case CF_CLASS_TEXTURE:
2409 case CF_CLASS_VERTEX:
2410 			/* fetch nodes need to be 16 byte aligned */
2411 addr += 3;
2412 addr &= 0xFFFFFFFCUL;
2413 break;
2414 case CF_CLASS_EXPORT:
2415 if (cf->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT))
2416 exports[cf->output.type] = cf;
2417 break;
2418 case CF_CLASS_OTHER:
2419 break;
2420 default:
2421 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2422 return -EINVAL;
2423 }
2424 cf->addr = addr;
2425 addr += cf->ndw;
2426 bc->ndw = cf->addr + cf->ndw;
2427 }
2428
2429 /* set export done on last export of each type */
2430 for (i = 0; i < 4; ++i) {
2431 if (exports[i]) {
2432 exports[i]->inst = BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
2433 }
2434 }
2435
2436 free(bc->bytecode);
2437 bc->bytecode = calloc(1, bc->ndw * 4);
2438 if (bc->bytecode == NULL)
2439 return -ENOMEM;
2440 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2441 addr = cf->addr;
2442 if (bc->chiprev == CHIPREV_EVERGREEN)
2443 r = eg_bc_cf_build(bc, cf);
2444 else
2445 r = r600_bc_cf_build(bc, cf);
2446 if (r)
2447 return r;
2448 switch (r600_bc_cf_class(cf)) {
2449 case CF_CLASS_ALU:
2450 nliteral = 0;
2451 memset(literal, 0, sizeof(literal));
2452 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2453 r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
2454 if (r)
2455 return r;
2456 r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
2457 switch(bc->chiprev) {
2458 case CHIPREV_R600:
2459 r = r600_bc_alu_build(bc, alu, addr);
2460 break;
2461 case CHIPREV_R700:
2462 case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
2463 r = r700_bc_alu_build(bc, alu, addr);
2464 break;
2465 default:
2466 R600_ERR("unknown family %d\n", bc->family);
2467 return -EINVAL;
2468 }
2469 if (r)
2470 return r;
2471 addr += 2;
2472 if (alu->last) {
2473 for (i = 0; i < align(nliteral, 2); ++i) {
2474 bc->bytecode[addr++] = literal[i];
2475 }
2476 nliteral = 0;
2477 memset(literal, 0, sizeof(literal));
2478 }
2479 }
2480 break;
2481 case CF_CLASS_VERTEX:
2482 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2483 r = r600_bc_vtx_build(bc, vtx, addr);
2484 if (r)
2485 return r;
2486 addr += 4;
2487 }
2488 break;
2489 case CF_CLASS_TEXTURE:
2490 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2491 r = r600_bc_tex_build(bc, tex, addr);
2492 if (r)
2493 return r;
2494 addr += 4;
2495 }
2496 break;
2497 case CF_CLASS_EXPORT:
2498 case CF_CLASS_OTHER:
2499 break;
2500 default:
2501 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2502 return -EINVAL;
2503 }
2504 }
2505 return 0;
2506 }
2507
2508 void r600_bc_clear(struct r600_bc *bc)
2509 {
2510 struct r600_bc_cf *cf = NULL, *next_cf;
2511
2512 free(bc->bytecode);
2513 bc->bytecode = NULL;
2514
2515 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2516 struct r600_bc_alu *alu = NULL, *next_alu;
2517 struct r600_bc_tex *tex = NULL, *next_tex;
2518 		struct r600_bc_vtx *vtx = NULL, *next_vtx;
2519
2520 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2521 free(alu);
2522 }
2523
2524 LIST_INITHEAD(&cf->alu);
2525
2526 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
2527 free(tex);
2528 }
2529
2530 LIST_INITHEAD(&cf->tex);
2531
2532 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
2533 free(vtx);
2534 }
2535
2536 LIST_INITHEAD(&cf->vtx);
2537
2538 free(cf);
2539 }
2540
2541 	LIST_INITHEAD(&bc->cf);
2542 }
2543
2544 void r600_bc_dump(struct r600_bc *bc)
2545 {
2546 struct r600_bc_cf *cf = NULL;
2547 struct r600_bc_alu *alu = NULL;
2548 struct r600_bc_vtx *vtx = NULL;
2549 struct r600_bc_tex *tex = NULL;
2550
2551 unsigned i, id;
2552 uint32_t literal[4];
2553 unsigned nliteral;
2554 char chip = '6';
2555
2556 switch (bc->chiprev) {
2557 	case CHIPREV_R700:
2558 		chip = '7';
2559 		break;
2560 	case CHIPREV_EVERGREEN:
2561 		chip = 'E';
2562 		break;
2563 	case CHIPREV_R600:
2564 default:
2565 chip = '6';
2566 break;
2567 }
2568 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
2569 fprintf(stderr, " %c\n", chip);
2570
2571 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2572 id = cf->id;
2573
2574 switch (r600_bc_cf_class(cf)) {
2575 case CF_CLASS_ALU:
2576 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2577 fprintf(stderr, "ADDR:%04d ", cf->addr);
2578 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2579 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2580 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2581 id++;
2582 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2583 fprintf(stderr, "INST:%d ", cf->inst);
2584 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2585 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2586 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2587 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2588 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2589 break;
2590 case CF_CLASS_TEXTURE:
2591 case CF_CLASS_VERTEX:
2592 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2593 fprintf(stderr, "ADDR:%04d\n", cf->addr);
2594 id++;
2595 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2596 fprintf(stderr, "INST:%d ", cf->inst);
2597 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2598 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2599 break;
2600 case CF_CLASS_EXPORT:
2601 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2602 fprintf(stderr, "GPR:%d ", cf->output.gpr);
2603 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2604 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2605 fprintf(stderr, "TYPE:%X\n", cf->output.type);
2606 id++;
2607 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2608 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2609 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2610 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2611 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2612 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2613 fprintf(stderr, "INST:%d ", cf->inst);
2614 fprintf(stderr, "BURST_COUNT:%d\n", cf->output.burst_count);
2615 break;
2616 case CF_CLASS_OTHER:
2617 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2618 fprintf(stderr, "ADDR:%04d\n", cf->cf_addr);
2619 id++;
2620 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2621 fprintf(stderr, "INST:%d ", cf->inst);
2622 fprintf(stderr, "COND:%X ", cf->cond);
2623 fprintf(stderr, "BARRIER:%d ", cf->barrier);
2624 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2625 break;
2626 }
2627
2628 id = cf->addr;
2629 nliteral = 0;
2630 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2631 r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
2632
2633 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2634 fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
2635 fprintf(stderr, "REL:%d ", alu->src[0].rel);
2636 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
2637 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
2638 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
2639 fprintf(stderr, "REL:%d ", alu->src[1].rel);
2640 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
2641 fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
2642 			fprintf(stderr, "LAST:%d\n", alu->last);
2643 id++;
2644 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
2645 fprintf(stderr, "INST:%d ", alu->inst);
2646 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
2647 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
2648 fprintf(stderr, "REL:%d ", alu->dst.rel);
2649 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
2650 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
2651 if (alu->is_op3) {
2652 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
2653 fprintf(stderr, "REL:%d ", alu->src[2].rel);
2654 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
2655 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
2656 } else {
2657 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
2658 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
2659 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
2660 fprintf(stderr, "OMOD:%d ", alu->omod);
2661 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
2662 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
2663 }
2664
2665 id++;
2666 if (alu->last) {
2667 for (i = 0; i < nliteral; i++, id++) {
2668 float *f = (float*)(bc->bytecode + id);
2669 fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
2670 }
2671 id += nliteral & 1;
2672 nliteral = 0;
2673 }
2674 }
2675
2676 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2677 //TODO
2678 }
2679
2680 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2681 //TODO
2682 }
2683 }
2684
2685 fprintf(stderr, "--------------------------------------\n");
2686 }
2687
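/* Emit the CF program of a fetch shader: one or two VTX clauses followed by
 * a RETURN. More than 8 fetches are split into a second clause, presumably
 * because the CF COUNT field cannot encode larger clauses here. */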
2688 void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
2689 {
2690 struct r600_pipe_state *rstate;
2691 unsigned i = 0;
2692
2693 if (count > 8) {
2694 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2695 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2696 S_SQ_CF_WORD1_BARRIER(1) |
2697 S_SQ_CF_WORD1_COUNT(8 - 1);
2698 bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
2699 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2700 S_SQ_CF_WORD1_BARRIER(1) |
2701 S_SQ_CF_WORD1_COUNT(count - 8 - 1);
2702 } else {
2703 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2704 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
2705 S_SQ_CF_WORD1_BARRIER(1) |
2706 S_SQ_CF_WORD1_COUNT(count - 1);
2707 }
2708 bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
2709 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
2710 S_SQ_CF_WORD1_BARRIER(1);
2711
2712 rstate = &ve->rstate;
2713 rstate->id = R600_PIPE_STATE_FETCH_SHADER;
2714 rstate->nregs = 0;
2715 r600_pipe_state_add_reg(rstate, R_0288A4_SQ_PGM_RESOURCES_FS,
2716 0x00000000, 0xFFFFFFFF, NULL);
2717 r600_pipe_state_add_reg(rstate, R_0288DC_SQ_PGM_CF_OFFSET_FS,
2718 0x00000000, 0xFFFFFFFF, NULL);
2719 r600_pipe_state_add_reg(rstate, R_028894_SQ_PGM_START_FS,
2720 r600_bo_offset(ve->fetch_shader) >> 8,
2721 0xFFFFFFFF, ve->fetch_shader);
2722 }
2723
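/* Identical to r600_cf_vtx() except that the clauses fetch through the
 * texture cache (VTX_TC) instead of the vertex cache. */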
2724 void r600_cf_vtx_tc(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
2725 {
2726 struct r600_pipe_state *rstate;
2727 unsigned i = 0;
2728
2729 if (count > 8) {
2730 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2731 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
2732 S_SQ_CF_WORD1_BARRIER(1) |
2733 S_SQ_CF_WORD1_COUNT(8 - 1);
2734 bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
2735 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
2736 S_SQ_CF_WORD1_BARRIER(1) |
2737 S_SQ_CF_WORD1_COUNT((count - 8) - 1);
2738 } else {
2739 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
2740 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
2741 S_SQ_CF_WORD1_BARRIER(1) |
2742 S_SQ_CF_WORD1_COUNT(count - 1);
2743 }
2744 bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
2745 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
2746 S_SQ_CF_WORD1_BARRIER(1);
2747
2748 rstate = &ve->rstate;
2749 rstate->id = R600_PIPE_STATE_FETCH_SHADER;
2750 rstate->nregs = 0;
2751 r600_pipe_state_add_reg(rstate, R_0288A4_SQ_PGM_RESOURCES_FS,
2752 0x00000000, 0xFFFFFFFF, NULL);
2753 r600_pipe_state_add_reg(rstate, R_0288DC_SQ_PGM_CF_OFFSET_FS,
2754 0x00000000, 0xFFFFFFFF, NULL);
2755 r600_pipe_state_add_reg(rstate, R_028894_SQ_PGM_START_FS,
2756 r600_bo_offset(ve->fetch_shader) >> 8,
2757 0xFFFFFFFF, ve->fetch_shader);
2758 }
2759
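/* Translate a pipe_format into the hardware DATA_FORMAT / NUM_FORMAT /
 * FORMAT_COMP fields of a vertex fetch: num_format 0 selects normalized
 * fetches, 2 scaled ones, and format_comp 1 marks signed components. */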
2760 static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
2761 unsigned *num_format, unsigned *format_comp)
2762 {
2763 const struct util_format_description *desc;
2764 unsigned i;
2765
2766 *format = 0;
2767 *num_format = 0;
2768 *format_comp = 0;
2769
2770 desc = util_format_description(pformat);
2771 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2772 goto out_unknown;
2773 }
2774
2775 /* Find the first non-VOID channel. */
2776 for (i = 0; i < 4; i++) {
2777 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2778 break;
2779 }
2780 }
2781
2782 switch (desc->channel[i].type) {
2783 /* Half-floats, floats, doubles */
2784 case UTIL_FORMAT_TYPE_FLOAT:
2785 switch (desc->channel[i].size) {
2786 case 16:
2787 switch (desc->nr_channels) {
2788 case 1:
2789 *format = FMT_16_FLOAT;
2790 break;
2791 case 2:
2792 *format = FMT_16_16_FLOAT;
2793 break;
2794 case 3:
2795 *format = FMT_16_16_16_FLOAT;
2796 break;
2797 case 4:
2798 *format = FMT_16_16_16_16_FLOAT;
2799 break;
2800 }
2801 break;
2802 case 32:
2803 switch (desc->nr_channels) {
2804 case 1:
2805 *format = FMT_32_FLOAT;
2806 break;
2807 case 2:
2808 *format = FMT_32_32_FLOAT;
2809 break;
2810 case 3:
2811 *format = FMT_32_32_32_FLOAT;
2812 break;
2813 case 4:
2814 *format = FMT_32_32_32_32_FLOAT;
2815 break;
2816 }
2817 break;
2818 default:
2819 goto out_unknown;
2820 }
2821 break;
2822 /* Unsigned ints */
2823 case UTIL_FORMAT_TYPE_UNSIGNED:
2824 /* Signed ints */
2825 case UTIL_FORMAT_TYPE_SIGNED:
2826 switch (desc->channel[i].size) {
2827 case 8:
2828 switch (desc->nr_channels) {
2829 case 1:
2830 *format = FMT_8;
2831 break;
2832 case 2:
2833 *format = FMT_8_8;
2834 break;
2835 case 3:
2836 // *format = FMT_8_8_8; /* fails piglit draw-vertices test */
2837 // break;
2838 case 4:
2839 *format = FMT_8_8_8_8;
2840 break;
2841 }
2842 break;
2843 case 16:
2844 switch (desc->nr_channels) {
2845 case 1:
2846 *format = FMT_16;
2847 break;
2848 case 2:
2849 *format = FMT_16_16;
2850 break;
2851 case 3:
2852 // *format = FMT_16_16_16; /* fails piglit draw-vertices test */
2853 // break;
2854 case 4:
2855 *format = FMT_16_16_16_16;
2856 break;
2857 }
2858 break;
2859 case 32:
2860 switch (desc->nr_channels) {
2861 case 1:
2862 *format = FMT_32;
2863 break;
2864 case 2:
2865 *format = FMT_32_32;
2866 break;
2867 case 3:
2868 *format = FMT_32_32_32;
2869 break;
2870 case 4:
2871 *format = FMT_32_32_32_32;
2872 break;
2873 }
2874 break;
2875 default:
2876 goto out_unknown;
2877 }
2878 break;
2879 default:
2880 goto out_unknown;
2881 }
2882
2883 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2884 *format_comp = 1;
2885 }
2886 if (desc->channel[i].normalized) {
2887 *num_format = 0;
2888 } else {
2889 *num_format = 2;
2890 }
2891 return;
2892 out_unknown:
2893 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2894 }
2895
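/* Build the fetch shader for a vertex element state: dwords 0-7 hold the
 * CF instructions, followed by one 4 dword VTX fetch per element that loads
 * input i into GPR i+1 with the format determined above. */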
2896 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2897 {
2898 unsigned ndw, i;
2899 u32 *bytecode;
2900 unsigned fetch_resource_start = 0, format, num_format, format_comp;
2901 struct pipe_vertex_element *elements = ve->elements;
2902 const struct util_format_description *desc;
2903
2904 	/* 8 dwords for the CF instructions (2 dwords each, fetches start 16 byte aligned) + 4 dwords per input */
2905 ndw = 8 + ve->count * 4;
2906 ve->fs_size = ndw * 4;
2907
2908 /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
2909 ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0);
2910 if (ve->fetch_shader == NULL) {
2911 return -ENOMEM;
2912 }
2913
2914 bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
2915 if (bytecode == NULL) {
2916 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
2917 return -ENOMEM;
2918 }
2919
2920 if (rctx->family >= CHIP_CEDAR) {
2921 eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
2922 } else {
2923 r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
2924 fetch_resource_start = 160;
2925 }
2926
2927 	/* vertex element offsets need special handling: if the offset is bigger
2928 	 * than what we can put in the fetch instruction then we need to alter
2929 	 * the vertex resource offset. In such a case, to simplify the code, we
2930 	 * will bind one resource per element. It's a worst case scenario.
2931 */
2932 for (i = 0; i < ve->count; i++) {
2933 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
2934 if (ve->vbuffer_offset[i]) {
2935 ve->vbuffer_need_offset = 1;
2936 }
2937 }
2938
2939 for (i = 0; i < ve->count; i++) {
2940 unsigned vbuffer_index;
2941 r600_vertex_data_type(ve->hw_format[i], &format, &num_format, &format_comp);
2942 desc = util_format_description(ve->hw_format[i]);
2943 if (desc == NULL) {
2944 R600_ERR("unknown format %d\n", ve->hw_format[i]);
2945 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
2946 return -EINVAL;
2947 }
2948
2949 /* see above for vbuffer_need_offset explanation */
2950 vbuffer_index = elements[i].vertex_buffer_index;
2951 if (ve->vbuffer_need_offset) {
2952 bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start);
2953 } else {
2954 bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start);
2955 }
2956 bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) |
2957 S_SQ_VTX_WORD0_SRC_SEL_X(0) |
2958 S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
2959 bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
2960 S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
2961 S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
2962 S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) |
2963 S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) |
2964 S_SQ_VTX_WORD1_DATA_FORMAT(format) |
2965 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) |
2966 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) |
2967 S_SQ_VTX_WORD1_SRF_MODE_ALL(1) |
2968 S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1);
2969 bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) |
2970 S_SQ_VTX_WORD2_MEGA_FETCH(1);
2971 bytecode[8 + i * 4 + 3] = 0;
2972 }
2973 r600_bo_unmap(rctx->radeon, ve->fetch_shader);
2974 return 0;
2975 }