r600g: Split ALU clauses based on used constant cache lines.
[mesa.git] / src / gallium / drivers / r600 / r600_asm.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include <stdio.h>
24 #include <errno.h>
25 #include "util/u_format.h"
26 #include "util/u_memory.h"
27 #include "pipe/p_shader_tokens.h"
28 #include "r600_pipe.h"
29 #include "r600_sq.h"
30 #include "r600_opcodes.h"
31 #include "r600_asm.h"
32 #include "r600_formats.h"
33 #include "r600d.h"
34
35 static inline unsigned int r600_bc_get_num_operands(struct r600_bc_alu *alu)
36 {
37 if(alu->is_op3)
38 return 3;
39
40 switch (alu->inst) {
41 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
42 return 0;
43 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
44 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
45 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
46 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
47 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
48 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
49 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
50 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
51 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
52 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
53 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
54 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
55 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
56 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
57 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
58 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
59 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
60 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
61 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
62 return 2;
63
64 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
65 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
66 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
67 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
68 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
69 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
70 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
71 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
72 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
73 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
74 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
75 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
76 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
77 return 1;
78 default: R600_ERR(
79 "Need instruction operand number for 0x%x.\n", alu->inst);
80 };
81
82 return 3;
83 }
84
85 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
86
87 static struct r600_bc_cf *r600_bc_cf(void)
88 {
89 struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);
90
91 if (cf == NULL)
92 return NULL;
93 LIST_INITHEAD(&cf->list);
94 LIST_INITHEAD(&cf->alu);
95 LIST_INITHEAD(&cf->vtx);
96 LIST_INITHEAD(&cf->tex);
97 return cf;
98 }
99
100 static struct r600_bc_alu *r600_bc_alu(void)
101 {
102 struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);
103
104 if (alu == NULL)
105 return NULL;
106 LIST_INITHEAD(&alu->list);
107 LIST_INITHEAD(&alu->bs_list);
108 return alu;
109 }
110
111 static struct r600_bc_vtx *r600_bc_vtx(void)
112 {
113 struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);
114
115 if (vtx == NULL)
116 return NULL;
117 LIST_INITHEAD(&vtx->list);
118 return vtx;
119 }
120
121 static struct r600_bc_tex *r600_bc_tex(void)
122 {
123 struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);
124
125 if (tex == NULL)
126 return NULL;
127 LIST_INITHEAD(&tex->list);
128 return tex;
129 }
130
/* Initialize a bytecode stream for the given GPU family.
 *
 * Maps the concrete chip to one of three encoder classes (chiprev):
 * R600, R700 or Evergreen — the rest of the assembler selects opcode
 * encodings based on this.  Returns 0 on success, -EINVAL for an
 * unrecognized family. */
int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
{
	LIST_INITHEAD(&bc->cf);
	bc->family = family;
	switch (bc->family) {
	/* R6xx-class chips. */
	case CHIP_R600:
	case CHIP_RV610:
	case CHIP_RV630:
	case CHIP_RV670:
	case CHIP_RV620:
	case CHIP_RV635:
	case CHIP_RS780:
	case CHIP_RS880:
		bc->chiprev = CHIPREV_R600;
		break;
	/* R7xx-class chips. */
	case CHIP_RV770:
	case CHIP_RV730:
	case CHIP_RV710:
	case CHIP_RV740:
		bc->chiprev = CHIPREV_R700;
		break;
	/* Evergreen / Northern Islands class chips. */
	case CHIP_CEDAR:
	case CHIP_REDWOOD:
	case CHIP_JUNIPER:
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
	case CHIP_PALM:
	case CHIP_BARTS:
	case CHIP_TURKS:
	case CHIP_CAICOS:
		bc->chiprev = CHIPREV_EVERGREEN;
		break;
	default:
		R600_ERR("unknown family %d\n", bc->family);
		return -EINVAL;
	}
	return 0;
}
169
170 static int r600_bc_add_cf(struct r600_bc *bc)
171 {
172 struct r600_bc_cf *cf = r600_bc_cf();
173
174 if (cf == NULL)
175 return -ENOMEM;
176 LIST_ADDTAIL(&cf->list, &bc->cf);
177 if (bc->cf_last)
178 cf->id = bc->cf_last->id + 2;
179 bc->cf_last = cf;
180 bc->ncf++;
181 bc->ndw += 2;
182 bc->force_add_cf = 0;
183 return 0;
184 }
185
186 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
187 {
188 int r;
189
190 r = r600_bc_add_cf(bc);
191 if (r)
192 return r;
193 bc->cf_last->inst = output->inst;
194 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
195 return 0;
196 }
197
/* Vector-unit bank swizzle lookup, indexed by a 3-bit key where bit 2,
 * bit 1 and bit 0 are set when src0, src1 and src2 respectively read a
 * constant (see check_vector()). */
const unsigned bank_swizzle_vec[8] = {SQ_ALU_VEC_210,  /* 000 */
				      SQ_ALU_VEC_120,  /* 001 */
				      SQ_ALU_VEC_102,  /* 010 */

				      SQ_ALU_VEC_201,  /* 011 */
				      SQ_ALU_VEC_012,  /* 100 */
				      SQ_ALU_VEC_021,  /* 101 */

				      SQ_ALU_VEC_012,  /* 110 */
				      SQ_ALU_VEC_012}; /* 111 */
208
/* Scalar (trans) unit bank swizzle lookup; same 3-bit "source reads a
 * constant" key as bank_swizzle_vec (see check_scalar()). */
const unsigned bank_swizzle_scl[8] = {SQ_ALU_SCL_210,  /* 000 */
				      SQ_ALU_SCL_122,  /* 001 */
				      SQ_ALU_SCL_122,  /* 010 */

				      SQ_ALU_SCL_221,  /* 011 */
				      SQ_ALU_SCL_212,  /* 100 */
				      SQ_ALU_SCL_122,  /* 101 */

				      SQ_ALU_SCL_122,  /* 110 */
				      SQ_ALU_SCL_122}; /* 111 */
219
220 static int init_gpr(struct r600_bc_alu *alu)
221 {
222 int cycle, component;
223 /* set up gpr use */
224 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
225 for (component = 0; component < NUM_OF_COMPONENTS; component++)
226 alu->hw_gpr[cycle][component] = -1;
227 return 0;
228 }
229
230 #if 0
/* (Currently compiled out — enclosing #if 0.)
 * Reserve the GPR read port for `sel` at the given (cycle, chan) slot.
 * Succeeds if the slot is free or already holds the same sel; fails
 * with -1 if another GPR already claimed that read port. */
static int reserve_gpr(struct r600_bc_alu *alu, unsigned sel, unsigned chan, unsigned cycle)
{
	if (alu->hw_gpr[cycle][chan] < 0)
		alu->hw_gpr[cycle][chan] = sel;
	else if (alu->hw_gpr[cycle][chan] != (int)sel) {
		R600_ERR("Another scalar operation has already used GPR read port for channel\n");
		return -1;
	}
	return 0;
}
241
242 static int cycle_for_scalar_bank_swizzle(const int swiz, const int sel, unsigned *p_cycle)
243 {
244 int table[3];
245 int ret = 0;
246 switch (swiz) {
247 case SQ_ALU_SCL_210:
248 table[0] = 2; table[1] = 1; table[2] = 0;
249 *p_cycle = table[sel];
250 break;
251 case SQ_ALU_SCL_122:
252 table[0] = 1; table[1] = 2; table[2] = 2;
253 *p_cycle = table[sel];
254 break;
255 case SQ_ALU_SCL_212:
256 table[0] = 2; table[1] = 1; table[2] = 2;
257 *p_cycle = table[sel];
258 break;
259 case SQ_ALU_SCL_221:
260 table[0] = 2; table[1] = 2; table[2] = 1;
261 *p_cycle = table[sel];
262 break;
263 break;
264 default:
265 R600_ERR("bad scalar bank swizzle value\n");
266 ret = -1;
267 break;
268 }
269 return ret;
270 }
271
272 static int cycle_for_vector_bank_swizzle(const int swiz, const int sel, unsigned *p_cycle)
273 {
274 int table[3];
275 int ret;
276
277 switch (swiz) {
278 case SQ_ALU_VEC_012:
279 table[0] = 0; table[1] = 1; table[2] = 2;
280 *p_cycle = table[sel];
281 break;
282 case SQ_ALU_VEC_021:
283 table[0] = 0; table[1] = 2; table[2] = 1;
284 *p_cycle = table[sel];
285 break;
286 case SQ_ALU_VEC_120:
287 table[0] = 1; table[1] = 2; table[2] = 0;
288 *p_cycle = table[sel];
289 break;
290 case SQ_ALU_VEC_102:
291 table[0] = 1; table[1] = 0; table[2] = 2;
292 *p_cycle = table[sel];
293 break;
294 case SQ_ALU_VEC_201:
295 table[0] = 2; table[1] = 0; table[2] = 1;
296 *p_cycle = table[sel];
297 break;
298 case SQ_ALU_VEC_210:
299 table[0] = 2; table[1] = 1; table[2] = 0;
300 *p_cycle = table[sel];
301 break;
302 default:
303 R600_ERR("bad vector bank swizzle value\n");
304 ret = -1;
305 break;
306 }
307 return ret;
308 }
309
310
311
/* (Currently compiled out — enclosing #if 0.)
 * Accumulate, per channel (x/y/z/w), how many GPR reads this ALU
 * instruction performs.  Only sels in (0, 128) — GPR sources — count;
 * constants and special sels are ignored. */
static void update_chan_counter(struct r600_bc_alu *alu, int *chan_counter)
{
	int num_src;
	int i;
	int channel_swizzle;

	num_src = r600_bc_get_num_operands(alu);

	for (i = 0; i < num_src; i++) {
		channel_swizzle = alu->src[i].chan;
		/* GPR range check; chan <= 3 guards the 4-entry counter array. */
		if ((alu->src[i].sel > 0 && alu->src[i].sel < 128) && channel_swizzle <= 3)
			chan_counter[channel_swizzle]++;
	}
}
326
/* we need something like this I think - but this is bogus */
/* (Currently compiled out — enclosing #if 0.)
 * Sanity-check that an instruction group does not exceed the hardware's
 * GPR read ports: at most 3 reads per channel across the group headed
 * by alu_first.  Returns 0 if ok, -1 if the group must be split. */
int check_read_slots(struct r600_bc *bc, struct r600_bc_alu *alu_first)
{
	struct r600_bc_alu *alu;
	int chan_counter[4] = { 0 };

	update_chan_counter(alu_first, chan_counter);

	LIST_FOR_EACH_ENTRY(alu, &alu_first->bs_list, bs_list) {
		update_chan_counter(alu, chan_counter);
	}

	if (chan_counter[0] > 3 ||
	    chan_counter[1] > 3 ||
	    chan_counter[2] > 3 ||
	    chan_counter[3] > 3) {
		R600_ERR("needed to split instruction for input ran out of banks %x %d %d %d %d\n",
			 alu_first->inst, chan_counter[0], chan_counter[1], chan_counter[2], chan_counter[3]);
		return -1;
	}
	return 0;
}
349 #endif
350
/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants, is that really required? */
/* Return 1 when sel addresses a CB constant (512..4606), else 0. */
static int is_const(int sel)
{
	return (sel > 511 && sel < 4607) ? 1 : 0;
}
360
361 static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu)
362 {
363 unsigned swizzle_key;
364
365 if (alu->bank_swizzle_force) {
366 alu->bank_swizzle = alu->bank_swizzle_force;
367 return 0;
368 }
369 swizzle_key = (is_const(alu->src[0].sel) ? 4 : 0 ) +
370 (is_const(alu->src[1].sel) ? 2 : 0 ) +
371 (is_const(alu->src[2].sel) ? 1 : 0 );
372
373 alu->bank_swizzle = bank_swizzle_scl[swizzle_key];
374 return 0;
375 }
376
377 static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu)
378 {
379 unsigned swizzle_key;
380
381 if (alu->bank_swizzle_force) {
382 alu->bank_swizzle = alu->bank_swizzle_force;
383 return 0;
384 }
385 swizzle_key = (is_const(alu->src[0].sel) ? 4 : 0 ) +
386 (is_const(alu->src[1].sel) ? 2 : 0 ) +
387 (is_const(alu->src[2].sel) ? 1 : 0 );
388
389 alu->bank_swizzle = bank_swizzle_vec[swizzle_key];
390 return 0;
391 }
392
/* Assign bank swizzles to a completed instruction group.
 *
 * alu_first heads the group; the remaining co-issued instructions hang
 * off its bs_list.  A group of one is treated as a scalar operation,
 * anything larger as a vector group where every member gets a vector
 * swizzle.  Always returns 0. */
static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *alu_first)
{
	struct r600_bc_alu *alu = NULL;
	int num_instr = 1;

	init_gpr(alu_first);

	/* Count the group members (head + everything on its bs_list). */
	LIST_FOR_EACH_ENTRY(alu, &alu_first->bs_list, bs_list) {
		num_instr++;
	}

	if (num_instr == 1) {
		check_scalar(bc, alu_first);

	} else {
/*		check_read_slots(bc, bc->cf_last->curr_bs_head);*/
		check_vector(bc, alu_first);
		LIST_FOR_EACH_ENTRY(alu, &alu_first->bs_list, bs_list) {
			check_vector(bc, alu);
		}
	}
	return 0;
}
416
/* This code handles kcache lines as single blocks of 32 constants. We could
 * probably do slightly better by recognizing that we actually have two
 * consecutive lines of 16 constants, but the resulting code would also be
 * somewhat more complicated. */
/* Lock the constant-cache (kcache) lines this ALU instruction needs into
 * the current ALU clause, starting a new clause when the current one has
 * no free kcache slots left, then rewrite the instruction's constant
 * sels (>= 512) into kcache-relative addresses.
 * Returns 0 on success or a negative errno. */
static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
{
	struct r600_bc_kcache *kcache = bc->cf_last->kcache;
	unsigned int required_lines;
	unsigned int free_lines = 0;
	unsigned int cache_line[3];	/* distinct lines needed; at most one per source */
	unsigned int count = 0;
	unsigned int i, j;
	int r;

	/* Collect required cache lines. */
	for (i = 0; i < 3; ++i) {
		bool found = false;
		unsigned int line;

		/* sels below 512 are not CB constants — nothing to cache. */
		if (alu->src[i].sel < 512)
			continue;

		/* One "line" spans 32 constants; the address step is 2
		 * because the hardware LOCK_2 mode covers two 16-entry
		 * lines at a time. */
		line = ((alu->src[i].sel - 512) / 32) * 2;

		/* De-duplicate against lines already collected. */
		for (j = 0; j < count; ++j) {
			if (cache_line[j] == line) {
				found = true;
				break;
			}
		}

		if (!found)
			cache_line[count++] = line;
	}

	/* This should never actually happen: three distinct lines cannot
	 * fit in the two kcache slots of a clause. */
	if (count >= 3) return -ENOMEM;

	/* Count unused kcache slots in the current clause. */
	for (i = 0; i < 2; ++i) {
		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
			++free_lines;
		}
	}

	/* Filter lines pulled in by previous instructions. Note that this is
	 * only for the required_lines count, we can't remove these from the
	 * cache_line array since we may have to start a new ALU clause. */
	for (i = 0, required_lines = count; i < count; ++i) {
		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == cache_line[i]) {
				--required_lines;
				break;
			}
		}
	}

	/* Start a new ALU clause if needed. */
	if (required_lines > free_lines) {
		if ((r = r600_bc_add_cf(bc))) {
			return r;
		}
		bc->cf_last->inst = (type << 3);
		kcache = bc->cf_last->kcache;
	}

	/* Setup the kcache lines. */
	for (i = 0; i < count; ++i) {
		bool found = false;

		/* Skip lines already locked into this clause. */
		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == cache_line[i]) {
				found = true;
				break;
			}
		}

		if (found) continue;

		/* Claim the first free slot for this line. */
		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
				kcache[j].bank = 0;
				kcache[j].addr = cache_line[i];
				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
				break;
			}
		}
	}

	/* Alter the src operands to refer to the kcache. */
	for (i = 0; i < 3; ++i) {
		/* base[] gives the sel base of each kcache slot; slots 2/3
		 * appear reserved here — TODO confirm against the ISA doc. */
		static const unsigned int base[] = {128, 160, 256, 288};
		unsigned int line;

		if (alu->src[i].sel < 512)
			continue;

		alu->src[i].sel -= 512;
		line = (alu->src[i].sel / 32) * 2;

		for (j = 0; j < 2; ++j) {
			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
			    kcache[j].addr == line) {
				/* Offset within the 32-constant line plus
				 * the slot's sel base. */
				alu->src[i].sel &= 0x1f;
				alu->src[i].sel += base[j];
				break;
			}
		}
	}

	return 0;
}
530
/* Append one ALU instruction of the given clause type to the stream.
 *
 * Takes a private copy of *alu, opens a new CF clause when the current
 * one is of a different type (or a split was forced), allocates kcache
 * lines (which may itself split the clause), links the instruction into
 * the current bank-swizzle group, and updates ngpr / literal / dword
 * bookkeeping.  When this instruction closes a group (alu->last), bank
 * swizzles for the whole group are resolved.
 * Returns 0 on success or a negative errno (the copy is freed on error). */
int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
{
	struct r600_bc_alu *nalu = r600_bc_alu();
	struct r600_bc_alu *lalu;
	int i, r;

	if (nalu == NULL)
		return -ENOMEM;
	memcpy(nalu, alu, sizeof(struct r600_bc_alu));
	nalu->nliteral = 0;

	/* cf can contains only alu or only vtx or only tex */
	if (bc->cf_last == NULL || bc->cf_last->inst != (type << 3) ||
	    bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(nalu);
			return r;
		}
		bc->cf_last->inst = (type << 3);
	}

	/* Setup the kcache for this ALU instruction. This will start a new
	 * ALU clause if needed. */
	if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
		free(nalu);
		return r;
	}

	/* First instruction of a group becomes the group head; later ones
	 * are chained onto the head's bs_list. */
	if (!bc->cf_last->curr_bs_head) {
		bc->cf_last->curr_bs_head = nalu;
		LIST_INITHEAD(&nalu->bs_list);
	} else {
		LIST_ADDTAIL(&nalu->bs_list, &bc->cf_last->curr_bs_head->bs_list);
	}
	/* at most 128 slots, one add alu can add 4 slots + 4 constants(2 slots)
	 * worst case */
	if (nalu->last && (bc->cf_last->ndw >> 1) >= 120) {
		bc->force_add_cf = 1;
	}
	/* number of gpr == the last gpr used in any alu */
	for (i = 0; i < 3; i++) {
		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
			bc->ngpr = nalu->src[i].sel + 1;
		}
		/* compute how many literal are needed
		 * either 2 or 4 literals
		 * (sel 253 presumably is the ALU literal source — TODO confirm
		 * against r600_sq.h) */
		if (nalu->src[i].sel == 253) {
			if (((nalu->src[i].chan + 2) & 0x6) > nalu->nliteral) {
				nalu->nliteral = (nalu->src[i].chan + 2) & 0x6;
			}
		}
	}
	/* Inherit the literal count of an open (not-yet-closed) group. */
	if (!LIST_IS_EMPTY(&bc->cf_last->alu)) {
		lalu = LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list);
		if (!lalu->last && lalu->nliteral > nalu->nliteral) {
			nalu->nliteral = lalu->nliteral;
		}
	}
	if (nalu->dst.sel >= bc->ngpr) {
		bc->ngpr = nalu->dst.sel + 1;
	}
	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
	/* each alu use 2 dwords */
	bc->cf_last->ndw += 2;
	bc->ndw += 2;

	/* process cur ALU instructions for bank swizzle */
	if (nalu->last) {
		check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head);
		bc->cf_last->curr_bs_head = NULL;
	}
	return 0;
}
606
/* Append a plain (non-push) ALU instruction; thin wrapper around
 * r600_bc_add_alu_type() with the default ALU clause opcode. */
int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
{
	return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
}
611
/* Attach literal constant dwords to the last ALU instruction group.
 *
 * Silently returns 0 when the current CF is one that takes no literals
 * (TEX, flow control), or when the last ALU either needs no literals or
 * already received them.  Errors with -EINVAL if the caller asks to add
 * literals while the last CF is not an ALU clause.  Copies all four
 * dwords from value but only accounts nliteral of them in the dword
 * counters. */
int r600_bc_add_literal(struct r600_bc *bc, const u32 *value)
{
	struct r600_bc_alu *alu;

	if (bc->cf_last == NULL) {
		return 0;
	}
	if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
		return 0;
	}
	/* all same on EG */
	if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_JUMP ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_ELSE ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END ||
	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
		return 0;
	}
	/* same on EG */
	if (((bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)) &&
	     (bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3))) ||
	    LIST_IS_EMPTY(&bc->cf_last->alu)) {
		R600_ERR("last CF is not ALU (%p)\n", bc->cf_last);
		return -EINVAL;
	}
	alu = LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list);
	/* Only the group-closing instruction carries literals, exactly once. */
	if (!alu->last || !alu->nliteral || alu->literal_added) {
		return 0;
	}
	/* Four 32-bit dwords. */
	memcpy(alu->value, value, 4 * 4);
	bc->cf_last->ndw += alu->nliteral;
	bc->ndw += alu->nliteral;
	alu->literal_added = 1;
	return 0;
}
649
/* Append a vertex-fetch instruction, opening a new VTX CF clause when
 * the current CF is of another type or a split was forced.
 * Returns 0 on success or a negative errno. */
int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
{
	struct r600_bc_vtx *nvtx = r600_bc_vtx();
	int r;

	if (nvtx == NULL)
		return -ENOMEM;
	memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));

	/* cf can contains only alu or only vtx or only tex */
	if (bc->cf_last == NULL ||
	    (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
	     bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) ||
	    bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(nvtx);
			return r;
		}
		bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
	}
	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
	/* each fetch use 4 dwords */
	bc->cf_last->ndw += 4;
	bc->ndw += 4;
	/* NOTE(review): this bounds the *whole stream* size (bc->ndw), but
	 * the comment context suggests it was meant to bound the current
	 * fetch clause (bc->cf_last->ndw) — confirm intent. */
	if ((bc->ndw / 4) > 7)
		bc->force_add_cf = 1;
	return 0;
}
679
/* Append a texture-fetch instruction, opening a new TEX CF clause when
 * the current CF is of another type or a split was forced.
 * Returns 0 on success or a negative errno. */
int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
{
	struct r600_bc_tex *ntex = r600_bc_tex();
	int r;

	if (ntex == NULL)
		return -ENOMEM;
	memcpy(ntex, tex, sizeof(struct r600_bc_tex));

	/* cf can contains only alu or only vtx or only tex */
	if (bc->cf_last == NULL ||
	    bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
	    bc->force_add_cf) {
		r = r600_bc_add_cf(bc);
		if (r) {
			free(ntex);
			return r;
		}
		bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
	}
	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
	/* each texture fetch use 4 dwords */
	bc->cf_last->ndw += 4;
	bc->ndw += 4;
	/* NOTE(review): as in r600_bc_add_vtx(), this checks the total
	 * stream size rather than the current clause size — confirm. */
	if ((bc->ndw / 4) > 7)
		bc->force_add_cf = 1;
	return 0;
}
708
709 int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
710 {
711 int r;
712 r = r600_bc_add_cf(bc);
713 if (r)
714 return r;
715
716 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
717 bc->cf_last->inst = inst;
718 return 0;
719 }
720
/* common to all 3 families */
/* Encode one vertex-fetch instruction into four dwords at bc->bytecode[id].
 * Fetch shaders (bc->type == -1) address vertex resources at an offset
 * that depends on the chip revision.  Always returns 0. */
static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
{
	unsigned fetch_resource_start = 0;

	/* check if we are fetch shader */
	/* fetch shader can also access vertex resource,
	 * first fetch shader resource is at 160
	 */
	if (bc->type == -1) {
		switch (bc->chiprev) {
		/* r600 */
		case CHIPREV_R600:
		/* r700 */
		case CHIPREV_R700:
			fetch_resource_start = 160;
			break;
		/* evergreen */
		case CHIPREV_EVERGREEN:
			fetch_resource_start = 0;
			break;
		default:
			fprintf(stderr, "%s:%s:%d unknown chiprev %d\n",
				__FILE__, __func__, __LINE__, bc->chiprev);
			break;
		}
	}
	/* VTX_WORD0: resource id, source GPR/select, fetch count. */
	bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id + fetch_resource_start) |
			     S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
			     S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x) |
			     S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
	/* VTX_WORD1: destination swizzle, data format and destination GPR. */
	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
			     S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
			     S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
			     S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
			     S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
			     S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
			     S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
			     S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
			     S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
			     S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
	/* VTX_WORD2 + mandatory padding dword. */
	bc->bytecode[id++] = S_SQ_VTX_WORD2_MEGA_FETCH(1);
	bc->bytecode[id++] = 0;
	return 0;
}
766
/* common to all 3 families */
/* Encode one texture-fetch instruction into four dwords at
 * bc->bytecode[id] (WORD0..WORD2 plus a zero padding dword).
 * Always returns 0. */
static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
{
	/* TEX_WORD0: opcode, resource and source GPR. */
	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
			     S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
			     S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
			     S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
	/* TEX_WORD1: destination GPR/swizzle, LOD bias, coordinate types. */
	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
			     S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
			     S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
			     S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
			     S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
			     S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
			     S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
			     S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
			     S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
			     S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
			     S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
	/* TEX_WORD2: texel offsets, sampler and source swizzle. */
	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
			     S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
			     S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
			     S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
			     S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
			     S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
			     S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
			     S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
	/* Padding dword. */
	bc->bytecode[id++] = 0;
	return 0;
}
796
/* r600 only, r700/eg bits in r700_asm.c */
/* Encode one ALU instruction (two dwords) at bc->bytecode[id], followed
 * by its literal constants if it closes an instruction group.
 * OP2 and OP3 encodings share WORD0 but differ in WORD1.
 * Always returns 0; a missing-literal condition is only reported. */
static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
{
	unsigned i;

	/* don't replace gpr by pv or ps for destination register */
	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
			     S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
			     S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
			     S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
			     S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
			     S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
			     S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
			     S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
			     S_SQ_ALU_WORD0_LAST(alu->last);

	if (alu->is_op3) {
		/* OP3 WORD1: third source instead of abs/write-mask bits. */
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
				     S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
				     S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
				     S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
				     S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
				     S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
				     S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
				     S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
				     S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
				     S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
	} else {
		/* OP2 WORD1: abs modifiers, write mask and predicate update. */
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
				     S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
				     S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
				     S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
				     S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
				     S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
				     S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
				     S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
				     S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
				     S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
				     S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
	}
	if (alu->last) {
		if (alu->nliteral && !alu->literal_added) {
			R600_ERR("Bug in ALU processing for instruction 0x%08x, literal not added correctly\n", alu->inst);
		}
		/* Literal dwords trail the group-closing instruction. */
		for (i = 0; i < alu->nliteral; i++) {
			bc->bytecode[id++] = alu->value[i];
		}
	}
	return 0;
}
847
/* common for r600/r700 - eg in eg_asm.c */
/* Encode one CF instruction (two dwords) at its precomputed position
 * (cf->id) in bc->bytecode.  ALU clauses carry kcache state, fetch
 * clauses carry clause address/size, exports carry the output
 * descriptor, flow-control carries target address and pop count.
 * Returns 0 or -EINVAL for an unknown CF opcode. */
static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
{
	unsigned id = cf->id;

	switch (cf->inst) {
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
		/* addr is in 64-bit units, hence the >> 1 of the dword addr. */
		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
				     S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
				     S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
				     S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);

		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
				     S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
				     S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
				     S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
				     S_SQ_CF_ALU_WORD1_BARRIER(1) |
				     S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
				     S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
		break;
	case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
	case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
	case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
		/* Fetch clauses: 4 dwords per fetch, COUNT is (n-1). */
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
				     S_SQ_CF_WORD1_BARRIER(1) |
				     S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
		break;
	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
				     S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
				     S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
				     S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
				     S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
		break;
	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
	case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
	case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
		/* Flow control: target address, condition and pop count. */
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
				     S_SQ_CF_WORD1_BARRIER(1) |
				     S_SQ_CF_WORD1_COND(cf->cond) |
				     S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);

		break;
	default:
		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
		return -EINVAL;
	}
	return 0;
}
913
/* Finalize the bytecode stream: two passes over the CF list.
 *
 * Pass 1 lays out clause addresses — clause bodies start after all CF
 * instructions, with fetch clauses aligned to 16 bytes — and computes
 * the total dword count.  Pass 2 allocates the bytecode buffer and
 * encodes every CF instruction plus its clause contents, dispatching
 * to the chip-revision-specific encoders.
 * Returns 0 on success or a negative errno. */
int r600_bc_build(struct r600_bc *bc)
{
	struct r600_bc_cf *cf;
	struct r600_bc_alu *alu;
	struct r600_bc_vtx *vtx;
	struct r600_bc_tex *tex;
	unsigned addr;
	int r;

	/* Derive the required hardware stack depth from the deepest
	 * recorded call/branch nesting.  NOTE(review): the "+ 2" slack
	 * and the vertex-shader minimum of 1 look empirical — confirm. */
	if (bc->callstack[0].max > 0)
		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
		bc->nstack = 1;
	}

	/* first path compute addr of each CF block */
	/* addr start after all the CF instructions */
	addr = bc->cf_last->id + 2;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		switch (cf->inst) {
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
			/* fetch node need to be 16 bytes aligned*/
			addr += 3;
			addr &= 0xFFFFFFFCUL;
			break;
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
		case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
		case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
			break;
		default:
			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
			return -EINVAL;
		}
		cf->addr = addr;
		addr += cf->ndw;
		bc->ndw = cf->addr + cf->ndw;
	}
	/* Re-allocate the output buffer at the final size. */
	free(bc->bytecode);
	bc->bytecode = calloc(1, bc->ndw * 4);
	if (bc->bytecode == NULL)
		return -ENOMEM;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		addr = cf->addr;
		/* Encode the CF instruction itself. */
		if (bc->chiprev == CHIPREV_EVERGREEN)
			r = eg_bc_cf_build(bc, cf);
		else
			r = r600_bc_cf_build(bc, cf);
		if (r)
			return r;
		/* Encode the clause body the CF instruction points at. */
		switch (cf->inst) {
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
				switch(bc->chiprev) {
				case CHIPREV_R600:
					r = r600_bc_alu_build(bc, alu, addr);
					break;
				case CHIPREV_R700:
				case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
					r = r700_bc_alu_build(bc, alu, addr);
					break;
				default:
					R600_ERR("unknown family %d\n", bc->family);
					return -EINVAL;
				}
				if (r)
					return r;
				addr += 2;
				if (alu->last) {
					addr += alu->nliteral;
				}
			}
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
		case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				r = r600_bc_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
			break;
		case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
				r = r600_bc_tex_build(bc, tex, addr);
				if (r)
					return r;
				addr += 4;
			}
			break;
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
		case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
		case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
		case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
			/* No clause body to emit. */
			break;
		default:
			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
			return -EINVAL;
		}
	}
	return 0;
}
1041
1042 void r600_bc_clear(struct r600_bc *bc)
1043 {
1044 struct r600_bc_cf *cf = NULL, *next_cf;
1045
1046 free(bc->bytecode);
1047 bc->bytecode = NULL;
1048
1049 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1050 struct r600_bc_alu *alu = NULL, *next_alu;
1051 struct r600_bc_tex *tex = NULL, *next_tex;
1052 struct r600_bc_tex *vtx = NULL, *next_vtx;
1053
1054 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1055 free(alu);
1056 }
1057
1058 LIST_INITHEAD(&cf->alu);
1059
1060 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1061 free(tex);
1062 }
1063
1064 LIST_INITHEAD(&cf->tex);
1065
1066 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1067 free(vtx);
1068 }
1069
1070 LIST_INITHEAD(&cf->vtx);
1071
1072 free(cf);
1073 }
1074
1075 LIST_INITHEAD(&cf->list);
1076 }
1077
1078 void r600_bc_dump(struct r600_bc *bc)
1079 {
1080 unsigned i;
1081 char chip = '6';
1082
1083 switch (bc->chiprev) {
1084 case 1:
1085 chip = '7';
1086 break;
1087 case 2:
1088 chip = 'E';
1089 break;
1090 case 0:
1091 default:
1092 chip = '6';
1093 break;
1094 }
1095 fprintf(stderr, "bytecode %d dw -----------------------\n", bc->ndw);
1096 fprintf(stderr, " %c\n", chip);
1097 for (i = 0; i < bc->ndw; i++) {
1098 fprintf(stderr, "0x%08X\n", bc->bytecode[i]);
1099 }
1100 fprintf(stderr, "--------------------------------------\n");
1101 }
1102
1103 void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
1104 {
1105 struct r600_pipe_state *rstate;
1106 unsigned i = 0;
1107
1108 if (count > 8) {
1109 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
1110 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
1111 S_SQ_CF_WORD1_BARRIER(1) |
1112 S_SQ_CF_WORD1_COUNT(8 - 1);
1113 bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
1114 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
1115 S_SQ_CF_WORD1_BARRIER(1) |
1116 S_SQ_CF_WORD1_COUNT(count - 8 - 1);
1117 } else {
1118 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
1119 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
1120 S_SQ_CF_WORD1_BARRIER(1) |
1121 S_SQ_CF_WORD1_COUNT(count - 1);
1122 }
1123 bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
1124 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
1125 S_SQ_CF_WORD1_BARRIER(1);
1126
1127 rstate = &ve->rstate;
1128 rstate->id = R600_PIPE_STATE_FETCH_SHADER;
1129 rstate->nregs = 0;
1130 r600_pipe_state_add_reg(rstate, R_0288A4_SQ_PGM_RESOURCES_FS,
1131 0x00000000, 0xFFFFFFFF, NULL);
1132 r600_pipe_state_add_reg(rstate, R_0288DC_SQ_PGM_CF_OFFSET_FS,
1133 0x00000000, 0xFFFFFFFF, NULL);
1134 r600_pipe_state_add_reg(rstate, R_028894_SQ_PGM_START_FS,
1135 r600_bo_offset(ve->fetch_shader) >> 8,
1136 0xFFFFFFFF, ve->fetch_shader);
1137 }
1138
1139 void r600_cf_vtx_tc(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
1140 {
1141 struct r600_pipe_state *rstate;
1142 unsigned i = 0;
1143
1144 if (count > 8) {
1145 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
1146 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
1147 S_SQ_CF_WORD1_BARRIER(1) |
1148 S_SQ_CF_WORD1_COUNT(8 - 1);
1149 bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
1150 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
1151 S_SQ_CF_WORD1_BARRIER(1) |
1152 S_SQ_CF_WORD1_COUNT((count - 8) - 1);
1153 } else {
1154 bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
1155 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
1156 S_SQ_CF_WORD1_BARRIER(1) |
1157 S_SQ_CF_WORD1_COUNT(count - 1);
1158 }
1159 bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
1160 bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
1161 S_SQ_CF_WORD1_BARRIER(1);
1162
1163 rstate = &ve->rstate;
1164 rstate->id = R600_PIPE_STATE_FETCH_SHADER;
1165 rstate->nregs = 0;
1166 r600_pipe_state_add_reg(rstate, R_0288A4_SQ_PGM_RESOURCES_FS,
1167 0x00000000, 0xFFFFFFFF, NULL);
1168 r600_pipe_state_add_reg(rstate, R_0288DC_SQ_PGM_CF_OFFSET_FS,
1169 0x00000000, 0xFFFFFFFF, NULL);
1170 r600_pipe_state_add_reg(rstate, R_028894_SQ_PGM_START_FS,
1171 r600_bo_offset(ve->fetch_shader) >> 8,
1172 0xFFFFFFFF, ve->fetch_shader);
1173 }
1174
/* Translate a gallium vertex format into its r600 vertex-fetch encoding.
 *
 * pformat     - gallium format of the vertex element
 * format      - out: FMT_* hardware data format
 * num_format  - out: NUM_FORMAT_ALL field (0 = normalized, 2 = scaled)
 * format_comp - out: FORMAT_COMP_ALL field (1 = signed components)
 *
 * On an unsupported format an error is logged and all three outputs are
 * left at 0.  NOTE(review): nr_channels values not handled by a case
 * below also leave *format at 0, silently — no error is reported for
 * those; confirm callers tolerate a zero format.
 */
static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
				unsigned *num_format, unsigned *format_comp)
{
	const struct util_format_description *desc;
	unsigned i;

	*format = 0;
	*num_format = 0;
	*format_comp = 0;

	desc = util_format_description(pformat);
	/* Only plain (non-compressed, non-subsampled) layouts map onto a
	 * fetch format. */
	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
		goto out_unknown;
	}

	/* Find the first non-VOID channel. */
	for (i = 0; i < 4; i++) {
		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
			break;
		}
	}

	/* Select FMT_* from channel type, channel size and channel count. */
	switch (desc->channel[i].type) {
	/* Half-floats, floats, doubles */
	case UTIL_FORMAT_TYPE_FLOAT:
		switch (desc->channel[i].size) {
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16_FLOAT;
				break;
			case 2:
				*format = FMT_16_16_FLOAT;
				break;
			case 3:
				*format = FMT_16_16_16_FLOAT;
				break;
			case 4:
				*format = FMT_16_16_16_16_FLOAT;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32_FLOAT;
				break;
			case 2:
				*format = FMT_32_32_FLOAT;
				break;
			case 3:
				*format = FMT_32_32_32_FLOAT;
				break;
			case 4:
				*format = FMT_32_32_32_32_FLOAT;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
	/* Unsigned ints */
	case UTIL_FORMAT_TYPE_UNSIGNED:
	/* Signed ints */
	case UTIL_FORMAT_TYPE_SIGNED:
		switch (desc->channel[i].size) {
		case 8:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_8;
				break;
			case 2:
				*format = FMT_8_8;
				break;
			case 3:
//				*format = FMT_8_8_8; /* fails piglit draw-vertices test */
//				break;
				/* fallthrough: 3-channel fetched as 4-channel */
			case 4:
				*format = FMT_8_8_8_8;
				break;
			}
			break;
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16;
				break;
			case 2:
				*format = FMT_16_16;
				break;
			case 3:
//				*format = FMT_16_16_16; /* fails piglit draw-vertices test */
//				break;
				/* fallthrough: 3-channel fetched as 4-channel */
			case 4:
				*format = FMT_16_16_16_16;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32;
				break;
			case 2:
				*format = FMT_32_32;
				break;
			case 3:
				*format = FMT_32_32_32;
				break;
			case 4:
				*format = FMT_32_32_32_32;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
	default:
		goto out_unknown;
	}

	/* Component sign and normalization are orthogonal to FMT_*. */
	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
		*format_comp = 1;
	}
	if (desc->channel[i].normalized) {
		*num_format = 0;
	} else {
		*num_format = 2;
	}
	return;
out_unknown:
	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
}
1310
1311 static void r600_bc(unsigned ndw, unsigned chiprev, u32 *bytecode)
1312 {
1313 unsigned i;
1314 char chip = '6';
1315
1316 switch (chiprev) {
1317 case 1:
1318 chip = '7';
1319 break;
1320 case 2:
1321 chip = 'E';
1322 break;
1323 case 0:
1324 default:
1325 chip = '6';
1326 break;
1327 }
1328 fprintf(stderr, "bytecode %d dw -----------------------\n", ndw);
1329 fprintf(stderr, " %c\n", chip);
1330 for (i = 0; i < ndw; i++) {
1331 fprintf(stderr, "0x%08X\n", bytecode[i]);
1332 }
1333 fprintf(stderr, "--------------------------------------\n");
1334 }
1335
/* Assemble the fetch shader for a vertex-element state: CF clause(s)
 * emitted by eg_cf_vtx()/r600_cf_vtx() followed by one 4-dword VTX fetch
 * instruction per element.  Returns 0 on success or a negative errno.
 */
int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
{
	unsigned ndw, i;
	u32 *bytecode;
	unsigned fetch_resource_start = 0, format, num_format, format_comp;
	struct pipe_vertex_element *elements = ve->elements;
	const struct util_format_description *desc;

	/* 8 dwords reserved for the CF instructions (2 dwords each, padded
	 * to an 8-dword boundary) + 4 dwords per vertex fetch. */
	ndw = 8 + ve->count * 4;
	ve->fs_size = ndw * 4;

	/* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
	ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0);
	if (ve->fetch_shader == NULL) {
		return -ENOMEM;
	}

	bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
	if (bytecode == NULL) {
		r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
		return -ENOMEM;
	}

	/* Evergreen uses a different CF encoding; on r6xx/r7xx the fetch
	 * resource ids used below are offset by 160. */
	if (rctx->family >= CHIP_CEDAR) {
		eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
	} else {
		r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
		fetch_resource_start = 160;
	}

	/* vertex elements offset need special handling, if offset is bigger
	 * than what we can put in fetch instruction then we need to alterate
	 * the vertex resource offset. In such case in order to simplify code
	 * we will bound one resource per elements. It's a worst case scenario.
	 */
	for (i = 0; i < ve->count; i++) {
		/* Part of src_offset that does not fit in the OFFSET field. */
		ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
		if (ve->vbuffer_offset[i]) {
			ve->vbuffer_need_offset = 1;
		}
	}

	/* Encode one VTX fetch instruction (4 dwords) per element. */
	for (i = 0; i < ve->count; i++) {
		unsigned vbuffer_index;
		r600_vertex_data_type(ve->hw_format[i], &format, &num_format, &format_comp);
		desc = util_format_description(ve->hw_format[i]);
		if (desc == NULL) {
			R600_ERR("unknown format %d\n", ve->hw_format[i]);
			/* NOTE(review): the bo is still mapped here; presumably
			 * dropping the last reference also unmaps it — verify,
			 * otherwise an r600_bo_unmap() is missing on this path. */
			r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
			return -EINVAL;
		}

		/* see above for vbuffer_need_offset explanation */
		vbuffer_index = elements[i].vertex_buffer_index;
		if (ve->vbuffer_need_offset) {
			/* One resource per element in the rebased case. */
			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start);
		} else {
			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start);
		}
		bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) |
					S_SQ_VTX_WORD0_SRC_SEL_X(0) |
					S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
		/* Destination GPR i+1; swizzle taken from the format. */
		bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
					S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
					S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
					S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) |
					S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) |
					S_SQ_VTX_WORD1_DATA_FORMAT(format) |
					S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) |
					S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) |
					S_SQ_VTX_WORD1_SRF_MODE_ALL(1) |
					S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1);
		/* Full src_offset goes in WORD2; overflow beyond the field was
		 * rebased into vbuffer_offset[] above. */
		bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) |
					S_SQ_VTX_WORD2_MEGA_FETCH(1);
		bytecode[8 + i * 4 + 3] = 0;
	}
	r600_bo_unmap(rctx->radeon, ve->fetch_shader);
	return 0;
}