nv50: add functions for swizzle resolution
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
34 #define NV50_SU_MAX_TEMP 64
35 //#define NV50_PROGRAM_DUMP
36
/* Known issues and open questions.
 *
 * ARL - gallium fails on progs/vp/arl.txt
 *
 * MSB - Like MAD, but MUL+SUB
 *     - Drop the dedicated op; instead introduce a general way to negate
 *       arguments for ops that support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 *     - Maybe even relax restrictions a bit: we can't do P_RESULT + P_IMMD,
 *       but we can emit to P_TEMP first and then MOV later. NVIDIA does this.
 *
 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
 * case, if the emit_src() causes the inst to suddenly become long.
 *
 * Verify half-insns work where expected - and force disable them where they
 * don't work - MUL has them forcibly disabled at the moment as that fixes POW.
 *
 * WARNING: watch out for dst == src vectors; components that are still
 * needed can be overwritten, e.g. SUB R0, R0.yzxw, R0
 *
 * Things to check with renouveau:
 *      FP attr/result assignment - how?
 *              attrib
 *                      - 0x16bc maps vp output onto fp hpos
 *                      - 0x16c0 maps vp output onto fp col0
 *              result
 *                      - colr always 0-3
 *                      - depr always 4
 * 0x16bc->0x16e8 --> some binding between vp/fp regs
 * 0x16b8 --> VP output count
 *
 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
 *            "MOV rcol.x, fcol.y" = 0x00000004
 * 0x19a8 --> as above but 0x00000100 and 0x00000000
 *      - 0x00100000 used when KIL used
 * 0x196c --> as above but 0x00000011 and 0x00000000
 *
 * 0x1988 --> 0xXXNNNNNN
 *      - XX == FP high something
 */
/* One scalar register operand as seen by the code generator. */
struct nv50_reg {
	/* register file this operand lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;

	/* -1 marks an internally allocated register (see alloc_temp());
	 * such registers are heap objects released by free_temp(). */
	int index;

	/* hardware slot; -1 while a P_TEMP has no slot assigned yet.
	 * For P_IMMD this is an index into nv50_pc::immd_buf. */
	int hw;

	/* non-zero when the operand is to be read negated */
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
93 struct nv50_pc {
94 struct nv50_program *p;
95
96 /* hw resources */
97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99 /* tgsi resources */
100 struct nv50_reg *temp;
101 int temp_nr;
102 struct nv50_reg *attr;
103 int attr_nr;
104 struct nv50_reg *result;
105 int result_nr;
106 struct nv50_reg *param;
107 int param_nr;
108 struct nv50_reg *immd;
109 float *immd_buf;
110 int immd_nr;
111
112 struct nv50_reg *temp_temp[16];
113 unsigned temp_temp_nr;
114
115 unsigned interp_mode[32];
116 /* perspective interpolation registers */
117 struct nv50_reg *iv_p;
118 struct nv50_reg *iv_c;
119
120 /* current instruction and total number of insns */
121 unsigned insn_cur;
122 unsigned insn_nr;
123
124 boolean allow32;
125 };
126
127 static void
128 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129 {
130 int i = 0;
131
132 if (reg->type == P_RESULT) {
133 if (pc->p->cfg.high_result < (reg->hw + 1))
134 pc->p->cfg.high_result = reg->hw + 1;
135 }
136
137 if (reg->type != P_TEMP)
138 return;
139
140 if (reg->hw >= 0) {
141 /*XXX: do this here too to catch FP temp-as-attr usage..
142 * not clean, but works */
143 if (pc->p->cfg.high_temp < (reg->hw + 1))
144 pc->p->cfg.high_temp = reg->hw + 1;
145 return;
146 }
147
148 if (reg->rhw != -1) {
149 /* try to allocate temporary with index rhw first */
150 if (!(pc->r_temp[reg->rhw])) {
151 pc->r_temp[reg->rhw] = reg;
152 reg->hw = reg->rhw;
153 if (pc->p->cfg.high_temp < (reg->rhw + 1))
154 pc->p->cfg.high_temp = reg->rhw + 1;
155 return;
156 }
157 /* make sure we don't get things like $r0 needs to go
158 * in $r1 and $r1 in $r0
159 */
160 i = pc->result_nr * 4;
161 }
162
163 for (; i < NV50_SU_MAX_TEMP; i++) {
164 if (!(pc->r_temp[i])) {
165 pc->r_temp[i] = reg;
166 reg->hw = i;
167 if (pc->p->cfg.high_temp < (i + 1))
168 pc->p->cfg.high_temp = i + 1;
169 return;
170 }
171 }
172
173 assert(0);
174 }
175
176 static struct nv50_reg *
177 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178 {
179 struct nv50_reg *r;
180 int i;
181
182 if (dst && dst->type == P_TEMP && dst->hw == -1)
183 return dst;
184
185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186 if (!pc->r_temp[i]) {
187 r = CALLOC_STRUCT(nv50_reg);
188 r->type = P_TEMP;
189 r->index = -1;
190 r->hw = i;
191 r->rhw = -1;
192 pc->r_temp[i] = r;
193 return r;
194 }
195 }
196
197 assert(0);
198 return NULL;
199 }
200
201 /* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204 static void
205 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206 {
207 assert(src->index == -1 && src->hw != -1);
208
209 if (dst->hw != -1)
210 pc->r_temp[dst->hw] = NULL;
211 pc->r_temp[src->hw] = dst;
212 dst->hw = src->hw;
213
214 FREE(src);
215 }
216
217 /* release the hardware resource held by r */
218 static void
219 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
220 {
221 assert(r->type == P_TEMP);
222 if (r->hw == -1)
223 return;
224
225 assert(pc->r_temp[r->hw] == r);
226 pc->r_temp[r->hw] = NULL;
227
228 r->acc = 0;
229 if (r->index == -1)
230 FREE(r);
231 }
232
233 static void
234 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
235 {
236 if (r->index == -1) {
237 unsigned hw = r->hw;
238
239 FREE(pc->r_temp[hw]);
240 pc->r_temp[hw] = NULL;
241 }
242 }
243
244 static int
245 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
246 {
247 int i;
248
249 if ((idx + 4) >= NV50_SU_MAX_TEMP)
250 return 1;
251
252 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
253 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
254 return alloc_temp4(pc, dst, idx + 4);
255
256 for (i = 0; i < 4; i++) {
257 dst[i] = CALLOC_STRUCT(nv50_reg);
258 dst[i]->type = P_TEMP;
259 dst[i]->index = -1;
260 dst[i]->hw = idx + i;
261 pc->r_temp[idx + i] = dst[i];
262 }
263
264 return 0;
265 }
266
/* Release the four registers obtained from alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c = 0;

	while (c < 4) {
		free_temp(pc, reg[c]);
		c++;
	}
}
275
276 static struct nv50_reg *
277 temp_temp(struct nv50_pc *pc)
278 {
279 if (pc->temp_temp_nr >= 16)
280 assert(0);
281
282 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
283 return pc->temp_temp[pc->temp_temp_nr++];
284 }
285
286 static void
287 kill_temp_temp(struct nv50_pc *pc)
288 {
289 int i;
290
291 for (i = 0; i < pc->temp_temp_nr; i++)
292 free_temp(pc, pc->temp_temp[i]);
293 pc->temp_temp_nr = 0;
294 }
295
296 static int
297 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
298 {
299 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
300 (pc->immd_nr + 1) * 4 * sizeof(float));
301 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
302 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
303 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
304 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
305
306 return pc->immd_nr++;
307 }
308
309 static struct nv50_reg *
310 alloc_immd(struct nv50_pc *pc, float f)
311 {
312 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
313 unsigned hw;
314
315 for (hw = 0; hw < pc->immd_nr * 4; hw++)
316 if (pc->immd_buf[hw] == f)
317 break;
318
319 if (hw == pc->immd_nr * 4)
320 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
321
322 r->type = P_IMMD;
323 r->hw = hw;
324 r->index = -1;
325 return r;
326 }
327
328 static struct nv50_program_exec *
329 exec(struct nv50_pc *pc)
330 {
331 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
332
333 e->param.index = -1;
334 return e;
335 }
336
337 static void
338 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
339 {
340 struct nv50_program *p = pc->p;
341
342 if (p->exec_tail)
343 p->exec_tail->next = e;
344 if (!p->exec_head)
345 p->exec_head = e;
346 p->exec_tail = e;
347 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
348 }
349
350 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
351
352 static boolean
353 is_long(struct nv50_program_exec *e)
354 {
355 if (e->inst[0] & 1)
356 return TRUE;
357 return FALSE;
358 }
359
360 static boolean
361 is_immd(struct nv50_program_exec *e)
362 {
363 if (is_long(e) && (e->inst[1] & 3) == 3)
364 return TRUE;
365 return FALSE;
366 }
367
368 static INLINE void
369 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
370 struct nv50_program_exec *e)
371 {
372 set_long(pc, e);
373 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
374 e->inst[1] |= (pred << 7) | (idx << 12);
375 }
376
377 static INLINE void
378 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
379 struct nv50_program_exec *e)
380 {
381 set_long(pc, e);
382 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
383 e->inst[1] |= (idx << 4) | (on << 6);
384 }
385
386 static INLINE void
387 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
388 {
389 if (is_long(e))
390 return;
391
392 e->inst[0] |= 1;
393 set_pred(pc, 0xf, 0, e);
394 set_pred_wr(pc, 0, 0, e);
395 }
396
397 static INLINE void
398 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
399 {
400 if (dst->type == P_RESULT) {
401 set_long(pc, e);
402 e->inst[1] |= 0x00000008;
403 }
404
405 alloc_reg(pc, dst);
406 e->inst[0] |= (dst->hw << 2);
407 }
408
409 static INLINE void
410 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
411 {
412 float f = pc->immd_buf[imm->hw];
413 unsigned val = fui(imm->neg ? -f : f);
414
415 set_long(pc, e);
416 /*XXX: can't be predicated - bits overlap.. catch cases where both
417 * are required and avoid them. */
418 set_pred(pc, 0, 0, e);
419 set_pred_wr(pc, 0, 0, e);
420
421 e->inst[1] |= 0x00000002 | 0x00000001;
422 e->inst[0] |= (val & 0x3f) << 16;
423 e->inst[1] |= (val >> 6) << 2;
424 }
425
426
427 #define INTERP_LINEAR 0
428 #define INTERP_FLAT 1
429 #define INTERP_PERSPECTIVE 2
430 #define INTERP_CENTROID 4
431
432 /* interpolant index has been stored in dst->rhw */
433 static void
434 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
435 unsigned mode)
436 {
437 assert(dst->rhw != -1);
438 struct nv50_program_exec *e = exec(pc);
439
440 e->inst[0] |= 0x80000000;
441 set_dst(pc, dst, e);
442 e->inst[0] |= (dst->rhw << 16);
443
444 if (mode & INTERP_FLAT) {
445 e->inst[0] |= (1 << 8);
446 } else {
447 if (mode & INTERP_PERSPECTIVE) {
448 e->inst[0] |= (1 << 25);
449 alloc_reg(pc, iv);
450 e->inst[0] |= (iv->hw << 9);
451 }
452
453 if (mode & INTERP_CENTROID)
454 e->inst[0] |= (1 << 24);
455 }
456
457 emit(pc, e);
458 }
459
460 static void
461 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
462 struct nv50_program_exec *e)
463 {
464 set_long(pc, e);
465
466 e->param.index = src->hw;
467 e->param.shift = s;
468 e->param.mask = m << (s % 32);
469
470 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
471 }
472
473 static void
474 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
475 {
476 struct nv50_program_exec *e = exec(pc);
477
478 e->inst[0] |= 0x10000000;
479
480 set_dst(pc, dst, e);
481
482 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
483 set_immd(pc, src, e);
484 /*XXX: 32-bit, but steals part of "half" reg space - need to
485 * catch and handle this case if/when we do half-regs
486 */
487 } else
488 if (src->type == P_IMMD || src->type == P_CONST) {
489 set_long(pc, e);
490 set_data(pc, src, 0x7f, 9, e);
491 e->inst[1] |= 0x20000000; /* src0 const? */
492 } else {
493 if (src->type == P_ATTR) {
494 set_long(pc, e);
495 e->inst[1] |= 0x00200000;
496 }
497
498 alloc_reg(pc, src);
499 e->inst[0] |= (src->hw << 9);
500 }
501
502 if (is_long(e) && !is_immd(e)) {
503 e->inst[1] |= 0x04000000; /* 32-bit */
504 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
505 if (!(e->inst[1] & 0x20000000))
506 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
507 } else
508 e->inst[0] |= 0x00008000;
509
510 emit(pc, e);
511 }
512
513 static INLINE void
514 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
515 {
516 struct nv50_reg *imm = alloc_immd(pc, f);
517 emit_mov(pc, dst, imm);
518 FREE(imm);
519 }
520
521 static boolean
522 check_swap_src_0_1(struct nv50_pc *pc,
523 struct nv50_reg **s0, struct nv50_reg **s1)
524 {
525 struct nv50_reg *src0 = *s0, *src1 = *s1;
526
527 if (src0->type == P_CONST) {
528 if (src1->type != P_CONST) {
529 *s0 = src1;
530 *s1 = src0;
531 return TRUE;
532 }
533 } else
534 if (src1->type == P_ATTR) {
535 if (src0->type != P_ATTR) {
536 *s0 = src1;
537 *s1 = src0;
538 return TRUE;
539 }
540 }
541
542 return FALSE;
543 }
544
545 static void
546 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
547 {
548 if (src->type == P_ATTR) {
549 set_long(pc, e);
550 e->inst[1] |= 0x00200000;
551 } else
552 if (src->type == P_CONST || src->type == P_IMMD) {
553 struct nv50_reg *temp = temp_temp(pc);
554
555 emit_mov(pc, temp, src);
556 src = temp;
557 }
558
559 alloc_reg(pc, src);
560 e->inst[0] |= (src->hw << 9);
561 }
562
563 static void
564 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
565 {
566 if (src->type == P_ATTR) {
567 struct nv50_reg *temp = temp_temp(pc);
568
569 emit_mov(pc, temp, src);
570 src = temp;
571 } else
572 if (src->type == P_CONST || src->type == P_IMMD) {
573 assert(!(e->inst[0] & 0x00800000));
574 if (e->inst[0] & 0x01000000) {
575 struct nv50_reg *temp = temp_temp(pc);
576
577 emit_mov(pc, temp, src);
578 src = temp;
579 } else {
580 set_data(pc, src, 0x7f, 16, e);
581 e->inst[0] |= 0x00800000;
582 }
583 }
584
585 alloc_reg(pc, src);
586 e->inst[0] |= (src->hw << 16);
587 }
588
589 static void
590 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
591 {
592 set_long(pc, e);
593
594 if (src->type == P_ATTR) {
595 struct nv50_reg *temp = temp_temp(pc);
596
597 emit_mov(pc, temp, src);
598 src = temp;
599 } else
600 if (src->type == P_CONST || src->type == P_IMMD) {
601 assert(!(e->inst[0] & 0x01000000));
602 if (e->inst[0] & 0x00800000) {
603 struct nv50_reg *temp = temp_temp(pc);
604
605 emit_mov(pc, temp, src);
606 src = temp;
607 } else {
608 set_data(pc, src, 0x7f, 32+14, e);
609 e->inst[0] |= 0x01000000;
610 }
611 }
612
613 alloc_reg(pc, src);
614 e->inst[1] |= (src->hw << 14);
615 }
616
617 static void
618 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
619 struct nv50_reg *src1)
620 {
621 struct nv50_program_exec *e = exec(pc);
622
623 e->inst[0] |= 0xc0000000;
624
625 if (!pc->allow32)
626 set_long(pc, e);
627
628 check_swap_src_0_1(pc, &src0, &src1);
629 set_dst(pc, dst, e);
630 set_src_0(pc, src0, e);
631 if (src1->type == P_IMMD && !is_long(e)) {
632 if (src0->neg)
633 e->inst[0] |= 0x00008000;
634 set_immd(pc, src1, e);
635 } else {
636 set_src_1(pc, src1, e);
637 if (src0->neg ^ src1->neg) {
638 if (is_long(e))
639 e->inst[1] |= 0x08000000;
640 else
641 e->inst[0] |= 0x00008000;
642 }
643 }
644
645 emit(pc, e);
646 }
647
648 static void
649 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
650 struct nv50_reg *src0, struct nv50_reg *src1)
651 {
652 struct nv50_program_exec *e = exec(pc);
653
654 e->inst[0] |= 0xb0000000;
655
656 check_swap_src_0_1(pc, &src0, &src1);
657
658 if (!pc->allow32 || src0->neg || src1->neg) {
659 set_long(pc, e);
660 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
661 }
662
663 set_dst(pc, dst, e);
664 set_src_0(pc, src0, e);
665 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
666 set_src_2(pc, src1, e);
667 else
668 if (src1->type == P_IMMD)
669 set_immd(pc, src1, e);
670 else
671 set_src_1(pc, src1, e);
672
673 emit(pc, e);
674 }
675
676 static void
677 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
678 struct nv50_reg *src0, struct nv50_reg *src1)
679 {
680 struct nv50_program_exec *e = exec(pc);
681
682 set_long(pc, e);
683 e->inst[0] |= 0xb0000000;
684 e->inst[1] |= (sub << 29);
685
686 check_swap_src_0_1(pc, &src0, &src1);
687 set_dst(pc, dst, e);
688 set_src_0(pc, src0, e);
689 set_src_1(pc, src1, e);
690
691 emit(pc, e);
692 }
693
694 static INLINE void
695 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
696 struct nv50_reg *src1)
697 {
698 src1->neg ^= 1;
699 emit_add(pc, dst, src0, src1);
700 src1->neg ^= 1;
701 }
702
703 static void
704 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
705 struct nv50_reg *src1, struct nv50_reg *src2)
706 {
707 struct nv50_program_exec *e = exec(pc);
708
709 e->inst[0] |= 0xe0000000;
710
711 check_swap_src_0_1(pc, &src0, &src1);
712 set_dst(pc, dst, e);
713 set_src_0(pc, src0, e);
714 set_src_1(pc, src1, e);
715 set_src_2(pc, src2, e);
716
717 if (src0->neg ^ src1->neg)
718 e->inst[1] |= 0x04000000;
719 if (src2->neg)
720 e->inst[1] |= 0x08000000;
721
722 emit(pc, e);
723 }
724
725 static INLINE void
726 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
727 struct nv50_reg *src1, struct nv50_reg *src2)
728 {
729 src2->neg ^= 1;
730 emit_mad(pc, dst, src0, src1, src2);
731 src2->neg ^= 1;
732 }
733
734 static void
735 emit_flop(struct nv50_pc *pc, unsigned sub,
736 struct nv50_reg *dst, struct nv50_reg *src)
737 {
738 struct nv50_program_exec *e = exec(pc);
739
740 e->inst[0] |= 0x90000000;
741 if (sub) {
742 set_long(pc, e);
743 e->inst[1] |= (sub << 29);
744 }
745
746 set_dst(pc, dst, e);
747 set_src_0(pc, src, e);
748
749 emit(pc, e);
750 }
751
752 static void
753 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
754 {
755 struct nv50_program_exec *e = exec(pc);
756
757 e->inst[0] |= 0xb0000000;
758
759 set_dst(pc, dst, e);
760 set_src_0(pc, src, e);
761 set_long(pc, e);
762 e->inst[1] |= (6 << 29) | 0x00004000;
763
764 emit(pc, e);
765 }
766
767 static void
768 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
769 {
770 struct nv50_program_exec *e = exec(pc);
771
772 e->inst[0] |= 0xb0000000;
773
774 set_dst(pc, dst, e);
775 set_src_0(pc, src, e);
776 set_long(pc, e);
777 e->inst[1] |= (6 << 29);
778
779 emit(pc, e);
780 }
781
782 #define CVTOP_RN 0x01
783 #define CVTOP_FLOOR 0x03
784 #define CVTOP_CEIL 0x05
785 #define CVTOP_TRUNC 0x07
786 #define CVTOP_SAT 0x08
787 #define CVTOP_ABS 0x10
788
789 #define CVT_F32_F32 0xc4
790 #define CVT_F32_S32 0x44
791 #define CVT_F32_U32 0x64
792 #define CVT_S32_F32 0x8c
793 #define CVT_S32_S32 0x0c
794 #define CVT_F32_F32_ROP 0xcc
795
796 static void
797 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
798 int wp, unsigned cop, unsigned fmt)
799 {
800 struct nv50_program_exec *e;
801
802 e = exec(pc);
803 set_long(pc, e);
804
805 e->inst[0] |= 0xa0000000;
806 e->inst[1] |= 0x00004000;
807 e->inst[1] |= (cop << 16);
808 e->inst[1] |= (fmt << 24);
809 set_src_0(pc, src, e);
810
811 if (wp >= 0)
812 set_pred_wr(pc, 1, wp, e);
813
814 if (dst)
815 set_dst(pc, dst, e);
816 else {
817 e->inst[0] |= 0x000001fc;
818 e->inst[1] |= 0x00000008;
819 }
820
821 emit(pc, e);
822 }
823
824 static void
825 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
826 struct nv50_reg *src0, struct nv50_reg *src1)
827 {
828 struct nv50_program_exec *e = exec(pc);
829 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
830 struct nv50_reg *rdst;
831
832 assert(c_op <= 7);
833 if (check_swap_src_0_1(pc, &src0, &src1))
834 c_op = inv_cop[c_op];
835
836 rdst = dst;
837 if (dst->type != P_TEMP)
838 dst = alloc_temp(pc, NULL);
839
840 /* set.u32 */
841 set_long(pc, e);
842 e->inst[0] |= 0xb0000000;
843 e->inst[1] |= (3 << 29);
844 e->inst[1] |= (c_op << 14);
845 /*XXX: breaks things, .u32 by default?
846 * decuda will disasm as .u16 and use .lo/.hi regs, but this
847 * doesn't seem to match what the hw actually does.
848 inst[1] |= 0x04000000; << breaks things.. .u32 by default?
849 */
850 set_dst(pc, dst, e);
851 set_src_0(pc, src0, e);
852 set_src_1(pc, src1, e);
853 emit(pc, e);
854
855 /* cvt.f32.u32 */
856 e = exec(pc);
857 e->inst[0] = 0xa0000001;
858 e->inst[1] = 0x64014780;
859 set_dst(pc, rdst, e);
860 set_src_0(pc, dst, e);
861 emit(pc, e);
862
863 if (dst != rdst)
864 free_temp(pc, dst);
865 }
866
867 static INLINE void
868 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
869 {
870 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
871 }
872
873 static void
874 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
875 struct nv50_reg *v, struct nv50_reg *e)
876 {
877 struct nv50_reg *temp = alloc_temp(pc, NULL);
878
879 emit_flop(pc, 3, temp, v);
880 emit_mul(pc, temp, temp, e);
881 emit_preex2(pc, temp, temp);
882 emit_flop(pc, 6, dst, temp);
883
884 free_temp(pc, temp);
885 }
886
887 static INLINE void
888 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
889 {
890 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
891 }
892
893 static void
894 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
895 struct nv50_reg **src)
896 {
897 struct nv50_reg *one = alloc_immd(pc, 1.0);
898 struct nv50_reg *zero = alloc_immd(pc, 0.0);
899 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
900 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
901 struct nv50_reg *tmp[4];
902 boolean allow32 = pc->allow32;
903
904 pc->allow32 = FALSE;
905
906 if (mask & (3 << 1)) {
907 tmp[0] = alloc_temp(pc, NULL);
908 emit_minmax(pc, 4, tmp[0], src[0], zero);
909 }
910
911 if (mask & (1 << 2)) {
912 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
913
914 tmp[1] = temp_temp(pc);
915 emit_minmax(pc, 4, tmp[1], src[1], zero);
916
917 tmp[3] = temp_temp(pc);
918 emit_minmax(pc, 4, tmp[3], src[3], neg128);
919 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
920
921 emit_pow(pc, dst[2], tmp[1], tmp[3]);
922 emit_mov(pc, dst[2], zero);
923 set_pred(pc, 3, 0, pc->p->exec_tail);
924 }
925
926 if (mask & (1 << 1))
927 assimilate_temp(pc, dst[1], tmp[0]);
928 else
929 if (mask & (1 << 2))
930 free_temp(pc, tmp[0]);
931
932 pc->allow32 = allow32;
933
934 /* do this last, in case src[i,j] == dst[0,3] */
935 if (mask & (1 << 0))
936 emit_mov(pc, dst[0], one);
937
938 if (mask & (1 << 3))
939 emit_mov(pc, dst[3], one);
940
941 FREE(pos128);
942 FREE(neg128);
943 FREE(zero);
944 FREE(one);
945 }
946
947 static void
948 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
949 {
950 struct nv50_program_exec *e = exec(pc);
951
952 set_long(pc, e);
953 e->inst[0] |= 0xa0000000; /* delta */
954 e->inst[1] |= (7 << 29); /* delta */
955 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
956 e->inst[1] |= (1 << 14); /* src .f32 */
957 set_dst(pc, dst, e);
958 set_src_0(pc, src, e);
959
960 emit(pc, e);
961 }
962
963 static void
964 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
965 {
966 struct nv50_program_exec *e;
967 const int r_pred = 1;
968
969 /* Sets predicate reg ? */
970 e = exec(pc);
971 e->inst[0] = 0xa00001fd;
972 e->inst[1] = 0xc4014788;
973 set_src_0(pc, src, e);
974 set_pred_wr(pc, 1, r_pred, e);
975 if (src->neg)
976 e->inst[1] |= 0x20000000;
977 emit(pc, e);
978
979 /* This is probably KILP */
980 e = exec(pc);
981 e->inst[0] = 0x000001fe;
982 set_long(pc, e);
983 set_pred(pc, 1 /* LT? */, r_pred, e);
984 emit(pc, e);
985 }
986
987 static void
988 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
989 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
990 {
991 struct nv50_reg *temp, *t[4];
992 struct nv50_program_exec *e;
993
994 unsigned c, mode, dim;
995
996 switch (type) {
997 case TGSI_TEXTURE_1D:
998 dim = 1;
999 break;
1000 case TGSI_TEXTURE_UNKNOWN:
1001 case TGSI_TEXTURE_2D:
1002 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1003 case TGSI_TEXTURE_RECT:
1004 dim = 2;
1005 break;
1006 case TGSI_TEXTURE_3D:
1007 case TGSI_TEXTURE_CUBE:
1008 case TGSI_TEXTURE_SHADOW2D:
1009 case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1010 dim = 3;
1011 break;
1012 default:
1013 assert(0);
1014 break;
1015 }
1016
1017 /* some cards need t[0]'s hw index to be a multiple of 4 */
1018 alloc_temp4(pc, t, 0);
1019
1020 if (proj) {
1021 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1022 mode = pc->interp_mode[src[0]->index];
1023
1024 t[3]->rhw = src[3]->rhw;
1025 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1026 emit_flop(pc, 0, t[3], t[3]);
1027
1028 for (c = 0; c < dim; c++) {
1029 t[c]->rhw = src[c]->rhw;
1030 emit_interp(pc, t[c], t[3],
1031 (mode | INTERP_PERSPECTIVE));
1032 }
1033 } else {
1034 emit_flop(pc, 0, t[3], src[3]);
1035 for (c = 0; c < dim; c++)
1036 emit_mul(pc, t[c], src[c], t[3]);
1037
1038 /* XXX: for some reason the blob sometimes uses MAD:
1039 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1040 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1041 */
1042 }
1043 } else {
1044 if (type == TGSI_TEXTURE_CUBE) {
1045 temp = temp_temp(pc);
1046 emit_minmax(pc, 4, temp, src[0], src[1]);
1047 emit_minmax(pc, 4, temp, temp, src[2]);
1048 emit_flop(pc, 0, temp, temp);
1049 for (c = 0; c < 3; c++)
1050 emit_mul(pc, t[c], src[c], temp);
1051 } else {
1052 for (c = 0; c < dim; c++)
1053 emit_mov(pc, t[c], src[c]);
1054 }
1055 }
1056
1057 e = exec(pc);
1058 set_long(pc, e);
1059 e->inst[0] |= 0xf0000000;
1060 e->inst[1] |= 0x00000004;
1061 set_dst(pc, t[0], e);
1062 e->inst[0] |= (unit << 9);
1063
1064 if (dim == 2)
1065 e->inst[0] |= 0x00400000;
1066 else
1067 if (dim == 3)
1068 e->inst[0] |= 0x00800000;
1069
1070 e->inst[0] |= (mask & 0x3) << 25;
1071 e->inst[1] |= (mask & 0xc) << 12;
1072
1073 emit(pc, e);
1074
1075 #if 1
1076 if (mask & 1) emit_mov(pc, dst[0], t[0]);
1077 if (mask & 2) emit_mov(pc, dst[1], t[1]);
1078 if (mask & 4) emit_mov(pc, dst[2], t[2]);
1079 if (mask & 8) emit_mov(pc, dst[3], t[3]);
1080
1081 free_temp4(pc, t);
1082 #else
1083 /* XXX: if p.e. MUL is used directly after TEX, it would still use
1084 * the texture coordinates, not the fetched values: latency ? */
1085
1086 for (c = 0; c < 4; c++) {
1087 if (mask & (1 << c))
1088 assimilate_temp(pc, dst[c], t[c]);
1089 else
1090 free_temp(pc, t[c]);
1091 }
1092 #endif
1093 }
1094
1095 static void
1096 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1097 {
1098 unsigned q = 0, m = ~0;
1099
1100 assert(!is_long(e));
1101
1102 switch (e->inst[0] >> 28) {
1103 case 0x1:
1104 /* MOV */
1105 q = 0x0403c000;
1106 m = 0xffff7fff;
1107 break;
1108 case 0x8:
1109 /* INTERP (move centroid, perspective and flat bits) */
1110 m = ~0x03000100;
1111 q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1112 q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1113 break;
1114 case 0x9:
1115 /* RCP */
1116 break;
1117 case 0xB:
1118 /* ADD */
1119 m = ~(127 << 16);
1120 q = ((e->inst[0] & (~m)) >> 2);
1121 break;
1122 case 0xC:
1123 /* MUL */
1124 m = ~0x00008000;
1125 q = ((e->inst[0] & (~m)) << 12);
1126 break;
1127 case 0xE:
1128 /* MAD (if src2 == dst) */
1129 q = ((e->inst[0] & 0x1fc) << 12);
1130 break;
1131 default:
1132 assert(0);
1133 break;
1134 }
1135
1136 set_long(pc, e);
1137 pc->p->exec_size++;
1138
1139 e->inst[0] &= m;
1140 e->inst[1] |= q;
1141 }
1142
1143 static boolean
1144 negate_supported(const struct tgsi_full_instruction *insn, int i)
1145 {
1146 switch (insn->Instruction.Opcode) {
1147 case TGSI_OPCODE_DP3:
1148 case TGSI_OPCODE_DP4:
1149 case TGSI_OPCODE_MUL:
1150 case TGSI_OPCODE_KIL:
1151 case TGSI_OPCODE_ADD:
1152 case TGSI_OPCODE_SUB:
1153 case TGSI_OPCODE_MAD:
1154 return TRUE;
1155 case TGSI_OPCODE_POW:
1156 return (i == 1) ? TRUE : FALSE;
1157 default:
1158 return FALSE;
1159 }
1160 }
1161
1162 /* Return a read mask for source registers deduced from opcode & write mask. */
1163 static unsigned
1164 nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1165 {
1166 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1167
1168 switch (insn->Instruction.Opcode) {
1169 case TGSI_OPCODE_COS:
1170 case TGSI_OPCODE_SIN:
1171 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1172 case TGSI_OPCODE_DP3:
1173 return 0x7;
1174 case TGSI_OPCODE_DP4:
1175 case TGSI_OPCODE_DPH:
1176 case TGSI_OPCODE_KIL: /* WriteMask ignored */
1177 return 0xf;
1178 case TGSI_OPCODE_DST:
1179 return mask & (c ? 0xa : 0x6);
1180 case TGSI_OPCODE_EX2:
1181 case TGSI_OPCODE_LG2:
1182 case TGSI_OPCODE_POW:
1183 case TGSI_OPCODE_RCP:
1184 case TGSI_OPCODE_RSQ:
1185 case TGSI_OPCODE_SCS:
1186 return 0x1;
1187 case TGSI_OPCODE_LIT:
1188 return 0xb;
1189 case TGSI_OPCODE_TEX:
1190 case TGSI_OPCODE_TXP:
1191 {
1192 const struct tgsi_instruction_ext_texture *tex;
1193
1194 assert(insn->Instruction.Extended);
1195 tex = &insn->InstructionExtTexture;
1196
1197 mask = 0x7;
1198 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1199 mask |= 0x8;
1200
1201 switch (tex->Texture) {
1202 case TGSI_TEXTURE_1D:
1203 mask &= 0x9;
1204 break;
1205 case TGSI_TEXTURE_2D:
1206 mask &= 0xb;
1207 break;
1208 default:
1209 break;
1210 }
1211 }
1212 return mask;
1213 case TGSI_OPCODE_XPD:
1214 x = 0;
1215 if (mask & 1) x |= 0x6;
1216 if (mask & 2) x |= 0x5;
1217 if (mask & 4) x |= 0x3;
1218 return x;
1219 default:
1220 break;
1221 }
1222
1223 return mask;
1224 }
1225
1226 static struct nv50_reg *
1227 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1228 {
1229 switch (dst->DstRegister.File) {
1230 case TGSI_FILE_TEMPORARY:
1231 return &pc->temp[dst->DstRegister.Index * 4 + c];
1232 case TGSI_FILE_OUTPUT:
1233 return &pc->result[dst->DstRegister.Index * 4 + c];
1234 case TGSI_FILE_NULL:
1235 return NULL;
1236 default:
1237 break;
1238 }
1239
1240 return NULL;
1241 }
1242
1243 static struct nv50_reg *
1244 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1245 boolean neg)
1246 {
1247 struct nv50_reg *r = NULL;
1248 struct nv50_reg *temp;
1249 unsigned sgn, c;
1250
1251 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1252
1253 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1254 switch (c) {
1255 case TGSI_EXTSWIZZLE_X:
1256 case TGSI_EXTSWIZZLE_Y:
1257 case TGSI_EXTSWIZZLE_Z:
1258 case TGSI_EXTSWIZZLE_W:
1259 switch (src->SrcRegister.File) {
1260 case TGSI_FILE_INPUT:
1261 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1262 break;
1263 case TGSI_FILE_TEMPORARY:
1264 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1265 break;
1266 case TGSI_FILE_CONSTANT:
1267 r = &pc->param[src->SrcRegister.Index * 4 + c];
1268 break;
1269 case TGSI_FILE_IMMEDIATE:
1270 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1271 break;
1272 case TGSI_FILE_SAMPLER:
1273 break;
1274 default:
1275 assert(0);
1276 break;
1277 }
1278 break;
1279 case TGSI_EXTSWIZZLE_ZERO:
1280 r = alloc_immd(pc, 0.0);
1281 return r;
1282 case TGSI_EXTSWIZZLE_ONE:
1283 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1284 return alloc_immd(pc, -1.0);
1285 return alloc_immd(pc, 1.0);
1286 default:
1287 assert(0);
1288 break;
1289 }
1290
1291 switch (sgn) {
1292 case TGSI_UTIL_SIGN_KEEP:
1293 break;
1294 case TGSI_UTIL_SIGN_CLEAR:
1295 temp = temp_temp(pc);
1296 emit_abs(pc, temp, r);
1297 r = temp;
1298 break;
1299 case TGSI_UTIL_SIGN_TOGGLE:
1300 if (neg)
1301 r->neg = 1;
1302 else {
1303 temp = temp_temp(pc);
1304 emit_neg(pc, temp, r);
1305 r = temp;
1306 }
1307 break;
1308 case TGSI_UTIL_SIGN_SET:
1309 temp = temp_temp(pc);
1310 emit_abs(pc, temp, r);
1311 if (neg)
1312 temp->neg = 1;
1313 else
1314 emit_neg(pc, temp, temp);
1315 r = temp;
1316 break;
1317 default:
1318 assert(0);
1319 break;
1320 }
1321
1322 return r;
1323 }
1324
1325 /* return TRUE for ops that produce only a single result */
1326 static boolean
1327 is_scalar_op(unsigned op)
1328 {
1329 switch (op) {
1330 case TGSI_OPCODE_DP2:
1331 case TGSI_OPCODE_DP3:
1332 case TGSI_OPCODE_DP4:
1333 case TGSI_OPCODE_DPH:
1334 case TGSI_OPCODE_EX2:
1335 case TGSI_OPCODE_LG2:
1336 case TGSI_OPCODE_POW:
1337 case TGSI_OPCODE_RCP:
1338 case TGSI_OPCODE_RSQ:
1339 /*
1340 case TGSI_OPCODE_COS:
1341 case TGSI_OPCODE_KIL:
1342 case TGSI_OPCODE_LIT:
1343 case TGSI_OPCODE_SCS:
1344 case TGSI_OPCODE_SIN:
1345 */
1346 return TRUE;
1347 default:
1348 return FALSE;
1349 }
1350 }
1351
1352 /* Returns a bitmask indicating which dst components depend
1353 * on source s, component c (reverse of nv50_tgsi_src_mask).
1354 */
1355 static unsigned
1356 nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1357 {
1358 if (is_scalar_op(op))
1359 return 0x1;
1360
1361 switch (op) {
1362 case TGSI_OPCODE_DST:
1363 return (1 << c) & (s ? 0xa : 0x6);
1364 case TGSI_OPCODE_XPD:
1365 switch (c) {
1366 case 0: return 0x6;
1367 case 1: return 0x5;
1368 case 2: return 0x3;
1369 case 3: return 0x0;
1370 default:
1371 assert(0);
1372 return 0x0;
1373 }
1374 case TGSI_OPCODE_LIT:
1375 case TGSI_OPCODE_SCS:
1376 case TGSI_OPCODE_TEX:
1377 case TGSI_OPCODE_TXP:
1378 /* these take care of dangerous swizzles themselves */
1379 return 0x0;
1380 case TGSI_OPCODE_IF:
1381 case TGSI_OPCODE_KIL:
1382 /* don't call this function for these ops */
1383 assert(0);
1384 return 0;
1385 default:
1386 /* linear vector instruction */
1387 return (1 << c);
1388 }
1389 }
1390
1391 static boolean
1392 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1393 {
1394 const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1395 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1396 unsigned mask, sat, unit;
1397 boolean assimilate = FALSE;
1398 int i, c;
1399
1400 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1401 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1402
1403 for (c = 0; c < 4; c++) {
1404 if (mask & (1 << c))
1405 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1406 else
1407 dst[c] = NULL;
1408 rdst[c] = NULL;
1409 src[0][c] = NULL;
1410 src[1][c] = NULL;
1411 src[2][c] = NULL;
1412 }
1413
1414 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1415 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1416 unsigned src_mask;
1417 boolean neg_supp;
1418
1419 src_mask = nv50_tgsi_src_mask(inst, i);
1420 neg_supp = negate_supported(inst, i);
1421
1422 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1423 unit = fs->SrcRegister.Index;
1424
1425 for (c = 0; c < 4; c++)
1426 if (src_mask & (1 << c))
1427 src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1428 }
1429
1430 if (sat) {
1431 for (c = 0; c < 4; c++) {
1432 rdst[c] = dst[c];
1433 dst[c] = temp_temp(pc);
1434 }
1435 }
1436
1437 switch (inst->Instruction.Opcode) {
1438 case TGSI_OPCODE_ABS:
1439 for (c = 0; c < 4; c++) {
1440 if (!(mask & (1 << c)))
1441 continue;
1442 emit_abs(pc, dst[c], src[0][c]);
1443 }
1444 break;
1445 case TGSI_OPCODE_ADD:
1446 for (c = 0; c < 4; c++) {
1447 if (!(mask & (1 << c)))
1448 continue;
1449 emit_add(pc, dst[c], src[0][c], src[1][c]);
1450 }
1451 break;
1452 case TGSI_OPCODE_COS:
1453 temp = temp_temp(pc);
1454 emit_precossin(pc, temp, src[0][0]);
1455 emit_flop(pc, 5, temp, temp);
1456 for (c = 0; c < 4; c++) {
1457 if (!(mask & (1 << c)))
1458 continue;
1459 emit_mov(pc, dst[c], temp);
1460 }
1461 break;
1462 case TGSI_OPCODE_DP3:
1463 temp = temp_temp(pc);
1464 emit_mul(pc, temp, src[0][0], src[1][0]);
1465 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1466 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1467 for (c = 0; c < 4; c++) {
1468 if (!(mask & (1 << c)))
1469 continue;
1470 emit_mov(pc, dst[c], temp);
1471 }
1472 break;
1473 case TGSI_OPCODE_DP4:
1474 temp = temp_temp(pc);
1475 emit_mul(pc, temp, src[0][0], src[1][0]);
1476 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1477 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1478 emit_mad(pc, temp, src[0][3], src[1][3], temp);
1479 for (c = 0; c < 4; c++) {
1480 if (!(mask & (1 << c)))
1481 continue;
1482 emit_mov(pc, dst[c], temp);
1483 }
1484 break;
1485 case TGSI_OPCODE_DPH:
1486 temp = temp_temp(pc);
1487 emit_mul(pc, temp, src[0][0], src[1][0]);
1488 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1489 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1490 emit_add(pc, temp, src[1][3], temp);
1491 for (c = 0; c < 4; c++) {
1492 if (!(mask & (1 << c)))
1493 continue;
1494 emit_mov(pc, dst[c], temp);
1495 }
1496 break;
1497 case TGSI_OPCODE_DST:
1498 {
1499 struct nv50_reg *one = alloc_immd(pc, 1.0);
1500 if (mask & (1 << 0))
1501 emit_mov(pc, dst[0], one);
1502 if (mask & (1 << 1))
1503 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1504 if (mask & (1 << 2))
1505 emit_mov(pc, dst[2], src[0][2]);
1506 if (mask & (1 << 3))
1507 emit_mov(pc, dst[3], src[1][3]);
1508 FREE(one);
1509 }
1510 break;
1511 case TGSI_OPCODE_EX2:
1512 temp = temp_temp(pc);
1513 emit_preex2(pc, temp, src[0][0]);
1514 emit_flop(pc, 6, temp, temp);
1515 for (c = 0; c < 4; c++) {
1516 if (!(mask & (1 << c)))
1517 continue;
1518 emit_mov(pc, dst[c], temp);
1519 }
1520 break;
1521 case TGSI_OPCODE_FLR:
1522 for (c = 0; c < 4; c++) {
1523 if (!(mask & (1 << c)))
1524 continue;
1525 emit_flr(pc, dst[c], src[0][c]);
1526 }
1527 break;
1528 case TGSI_OPCODE_FRC:
1529 temp = temp_temp(pc);
1530 for (c = 0; c < 4; c++) {
1531 if (!(mask & (1 << c)))
1532 continue;
1533 emit_flr(pc, temp, src[0][c]);
1534 emit_sub(pc, dst[c], src[0][c], temp);
1535 }
1536 break;
1537 case TGSI_OPCODE_KIL:
1538 emit_kil(pc, src[0][0]);
1539 emit_kil(pc, src[0][1]);
1540 emit_kil(pc, src[0][2]);
1541 emit_kil(pc, src[0][3]);
1542 pc->p->cfg.fp.regs[2] |= 0x00100000;
1543 break;
1544 case TGSI_OPCODE_LIT:
1545 emit_lit(pc, &dst[0], mask, &src[0][0]);
1546 break;
1547 case TGSI_OPCODE_LG2:
1548 temp = temp_temp(pc);
1549 emit_flop(pc, 3, temp, src[0][0]);
1550 for (c = 0; c < 4; c++) {
1551 if (!(mask & (1 << c)))
1552 continue;
1553 emit_mov(pc, dst[c], temp);
1554 }
1555 break;
1556 case TGSI_OPCODE_LRP:
1557 temp = temp_temp(pc);
1558 for (c = 0; c < 4; c++) {
1559 if (!(mask & (1 << c)))
1560 continue;
1561 emit_sub(pc, temp, src[1][c], src[2][c]);
1562 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1563 }
1564 break;
1565 case TGSI_OPCODE_MAD:
1566 for (c = 0; c < 4; c++) {
1567 if (!(mask & (1 << c)))
1568 continue;
1569 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1570 }
1571 break;
1572 case TGSI_OPCODE_MAX:
1573 for (c = 0; c < 4; c++) {
1574 if (!(mask & (1 << c)))
1575 continue;
1576 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1577 }
1578 break;
1579 case TGSI_OPCODE_MIN:
1580 for (c = 0; c < 4; c++) {
1581 if (!(mask & (1 << c)))
1582 continue;
1583 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1584 }
1585 break;
1586 case TGSI_OPCODE_MOV:
1587 case TGSI_OPCODE_SWZ:
1588 for (c = 0; c < 4; c++) {
1589 if (!(mask & (1 << c)))
1590 continue;
1591 emit_mov(pc, dst[c], src[0][c]);
1592 }
1593 break;
1594 case TGSI_OPCODE_MUL:
1595 for (c = 0; c < 4; c++) {
1596 if (!(mask & (1 << c)))
1597 continue;
1598 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1599 }
1600 break;
1601 case TGSI_OPCODE_POW:
1602 temp = temp_temp(pc);
1603 emit_pow(pc, temp, src[0][0], src[1][0]);
1604 for (c = 0; c < 4; c++) {
1605 if (!(mask & (1 << c)))
1606 continue;
1607 emit_mov(pc, dst[c], temp);
1608 }
1609 break;
1610 case TGSI_OPCODE_RCP:
1611 for (c = 3; c >= 0; c--) {
1612 if (!(mask & (1 << c)))
1613 continue;
1614 emit_flop(pc, 0, dst[c], src[0][0]);
1615 }
1616 break;
1617 case TGSI_OPCODE_RSQ:
1618 for (c = 3; c >= 0; c--) {
1619 if (!(mask & (1 << c)))
1620 continue;
1621 emit_flop(pc, 2, dst[c], src[0][0]);
1622 }
1623 break;
1624 case TGSI_OPCODE_SCS:
1625 temp = temp_temp(pc);
1626 emit_precossin(pc, temp, src[0][0]);
1627 if (mask & (1 << 0))
1628 emit_flop(pc, 5, dst[0], temp);
1629 if (mask & (1 << 1))
1630 emit_flop(pc, 4, dst[1], temp);
1631 if (mask & (1 << 2))
1632 emit_mov_immdval(pc, dst[2], 0.0);
1633 if (mask & (1 << 3))
1634 emit_mov_immdval(pc, dst[3], 1.0);
1635 break;
1636 case TGSI_OPCODE_SGE:
1637 for (c = 0; c < 4; c++) {
1638 if (!(mask & (1 << c)))
1639 continue;
1640 emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1641 }
1642 break;
1643 case TGSI_OPCODE_SIN:
1644 temp = temp_temp(pc);
1645 emit_precossin(pc, temp, src[0][0]);
1646 emit_flop(pc, 4, temp, temp);
1647 for (c = 0; c < 4; c++) {
1648 if (!(mask & (1 << c)))
1649 continue;
1650 emit_mov(pc, dst[c], temp);
1651 }
1652 break;
1653 case TGSI_OPCODE_SLT:
1654 for (c = 0; c < 4; c++) {
1655 if (!(mask & (1 << c)))
1656 continue;
1657 emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1658 }
1659 break;
1660 case TGSI_OPCODE_SUB:
1661 for (c = 0; c < 4; c++) {
1662 if (!(mask & (1 << c)))
1663 continue;
1664 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1665 }
1666 break;
1667 case TGSI_OPCODE_TEX:
1668 emit_tex(pc, dst, mask, src[0], unit,
1669 inst->InstructionExtTexture.Texture, FALSE);
1670 break;
1671 case TGSI_OPCODE_TXP:
1672 emit_tex(pc, dst, mask, src[0], unit,
1673 inst->InstructionExtTexture.Texture, TRUE);
1674 break;
1675 case TGSI_OPCODE_XPD:
1676 temp = temp_temp(pc);
1677 if (mask & (1 << 0)) {
1678 emit_mul(pc, temp, src[0][2], src[1][1]);
1679 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1680 }
1681 if (mask & (1 << 1)) {
1682 emit_mul(pc, temp, src[0][0], src[1][2]);
1683 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1684 }
1685 if (mask & (1 << 2)) {
1686 emit_mul(pc, temp, src[0][1], src[1][0]);
1687 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1688 }
1689 if (mask & (1 << 3))
1690 emit_mov_immdval(pc, dst[3], 1.0);
1691 break;
1692 case TGSI_OPCODE_END:
1693 break;
1694 default:
1695 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1696 return FALSE;
1697 }
1698
1699 if (sat) {
1700 for (c = 0; c < 4; c++) {
1701 if (!(mask & (1 << c)))
1702 continue;
1703 emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
1704 CVT_F32_F32);
1705 }
1706 } else if (assimilate) {
1707 for (c = 0; c < 4; c++)
1708 if (rdst[c])
1709 assimilate_temp(pc, rdst[c], dst[c]);
1710 }
1711
1712 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1713 for (c = 0; c < 4; c++) {
1714 if (!src[i][c])
1715 continue;
1716 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1717 FREE(src[i][c]);
1718 else
1719 if (src[i][c]->acc == pc->insn_cur)
1720 release_hw(pc, src[i][c]);
1721 }
1722 }
1723
1724 kill_temp_temp(pc);
1725 return TRUE;
1726 }
1727
1728 static void
1729 prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1730 unsigned *r_usage[2])
1731 {
1732 const struct tgsi_full_instruction *insn;
1733 const struct tgsi_full_src_register *src;
1734 const struct tgsi_dst_register *dst;
1735
1736 unsigned i, c, k, n, mask, *acc_p;
1737
1738 insn = &tok->FullInstruction;
1739 dst = &insn->FullDstRegisters[0].DstRegister;
1740 mask = dst->WriteMask;
1741
1742 if (!r_usage[0])
1743 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1744 if (!r_usage[1])
1745 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1746
1747 if (dst->File == TGSI_FILE_TEMPORARY) {
1748 for (c = 0; c < 4; c++) {
1749 if (!(mask & (1 << c)))
1750 continue;
1751 r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1752 }
1753 }
1754
1755 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1756 src = &insn->FullSrcRegisters[i];
1757
1758 switch (src->SrcRegister.File) {
1759 case TGSI_FILE_TEMPORARY:
1760 acc_p = r_usage[0];
1761 break;
1762 case TGSI_FILE_INPUT:
1763 acc_p = r_usage[1];
1764 break;
1765 default:
1766 continue;
1767 }
1768
1769 mask = nv50_tgsi_src_mask(insn, i);
1770
1771 for (c = 0; c < 4; c++) {
1772 if (!(mask & (1 << c)))
1773 continue;
1774
1775 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1776 switch (k) {
1777 case TGSI_EXTSWIZZLE_X:
1778 case TGSI_EXTSWIZZLE_Y:
1779 case TGSI_EXTSWIZZLE_Z:
1780 case TGSI_EXTSWIZZLE_W:
1781 n = src->SrcRegister.Index * 4 + k;
1782 acc_p[n] = pc->insn_nr;
1783 break;
1784 default:
1785 break;
1786 }
1787 }
1788 }
1789 }
1790
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i] (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 *
 * The returned mask is 0 when the computed order resolves every
 * dependency. Bit i of the result refers to the i-th position in the
 * order m[], not to component i.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR-ed into, so it has to start out empty */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart (the loop increment resumes the scan at
		 * position 1)
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1835
1836 /* Select a suitable dst register for broadcasting scalar results,
1837 * or return NULL if we have to allocate an extra TEMP.
1838 *
1839 * If e.g. only 1 component is written, we may also emit the final
1840 * result to a write-only register.
1841 */
1842 static struct nv50_reg *
1843 tgsi_broadcast_dst(struct nv50_pc *pc,
1844 const struct tgsi_full_dst_register *fd, unsigned mask)
1845 {
1846 if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1847 int c = ffs(~mask & fd->DstRegister.WriteMask);
1848 if (c)
1849 return tgsi_dst(pc, c - 1, fd);
1850 } else {
1851 int c = ffs(fd->DstRegister.WriteMask) - 1;
1852 if ((1 << c) == fd->DstRegister.WriteMask)
1853 return tgsi_dst(pc, c, fd);
1854 }
1855
1856 return NULL;
1857 }
1858
1859 static unsigned
1860 load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1861 int *aid, int *p_oid)
1862 {
1863 struct nv50_reg *iv;
1864 int oid, c, n;
1865 unsigned mask = 0;
1866
1867 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1868
1869 for (c = 0, n = i * 4; c < 4; c++, n++) {
1870 oid = (*p_oid)++;
1871 pc->attr[n].type = P_TEMP;
1872 pc->attr[n].index = i;
1873
1874 if (pc->attr[n].acc == acc[n])
1875 continue;
1876 mask |= (1 << c);
1877
1878 pc->attr[n].acc = acc[n];
1879 pc->attr[n].rhw = pc->attr[n].hw = -1;
1880 alloc_reg(pc, &pc->attr[n]);
1881
1882 pc->attr[n].rhw = (*aid)++;
1883 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1884
1885 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1886 (*mid)++;
1887 pc->p->cfg.fp.regs[1] += 0x00010001;
1888 }
1889
1890 return mask;
1891 }
1892
1893 static boolean
1894 nv50_program_tx_prep(struct nv50_pc *pc)
1895 {
1896 struct tgsi_parse_context p;
1897 boolean ret = FALSE;
1898 unsigned i, c;
1899 unsigned fcol, bcol, fcrd, depr;
1900
1901 /* count (centroid) perspective interpolations */
1902 unsigned centroid_loads = 0;
1903 unsigned perspect_loads = 0;
1904
1905 /* track register access for temps and attrs */
1906 unsigned *r_usage[2];
1907 r_usage[0] = NULL;
1908 r_usage[1] = NULL;
1909
1910 depr = fcol = bcol = fcrd = 0xffff;
1911
1912 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1913 pc->p->cfg.fp.regs[0] = 0x01000404;
1914 pc->p->cfg.fp.regs[1] = 0x00000400;
1915 }
1916
1917 tgsi_parse_init(&p, pc->p->pipe.tokens);
1918 while (!tgsi_parse_end_of_tokens(&p)) {
1919 const union tgsi_full_token *tok = &p.FullToken;
1920
1921 tgsi_parse_token(&p);
1922 switch (tok->Token.Type) {
1923 case TGSI_TOKEN_TYPE_IMMEDIATE:
1924 {
1925 const struct tgsi_full_immediate *imm =
1926 &p.FullToken.FullImmediate;
1927
1928 ctor_immd(pc, imm->u[0].Float,
1929 imm->u[1].Float,
1930 imm->u[2].Float,
1931 imm->u[3].Float);
1932 }
1933 break;
1934 case TGSI_TOKEN_TYPE_DECLARATION:
1935 {
1936 const struct tgsi_full_declaration *d;
1937 unsigned last, first, mode;
1938
1939 d = &p.FullToken.FullDeclaration;
1940 first = d->DeclarationRange.First;
1941 last = d->DeclarationRange.Last;
1942
1943 switch (d->Declaration.File) {
1944 case TGSI_FILE_TEMPORARY:
1945 if (pc->temp_nr < (last + 1))
1946 pc->temp_nr = last + 1;
1947 break;
1948 case TGSI_FILE_OUTPUT:
1949 if (pc->result_nr < (last + 1))
1950 pc->result_nr = last + 1;
1951
1952 if (!d->Declaration.Semantic)
1953 break;
1954
1955 switch (d->Semantic.SemanticName) {
1956 case TGSI_SEMANTIC_POSITION:
1957 depr = first;
1958 pc->p->cfg.fp.regs[2] |= 0x00000100;
1959 pc->p->cfg.fp.regs[3] |= 0x00000011;
1960 break;
1961 default:
1962 break;
1963 }
1964
1965 break;
1966 case TGSI_FILE_INPUT:
1967 {
1968 if (pc->attr_nr < (last + 1))
1969 pc->attr_nr = last + 1;
1970
1971 if (pc->p->type != PIPE_SHADER_FRAGMENT)
1972 break;
1973
1974 switch (d->Declaration.Interpolate) {
1975 case TGSI_INTERPOLATE_CONSTANT:
1976 mode = INTERP_FLAT;
1977 break;
1978 case TGSI_INTERPOLATE_PERSPECTIVE:
1979 mode = INTERP_PERSPECTIVE;
1980 break;
1981 default:
1982 mode = INTERP_LINEAR;
1983 break;
1984 }
1985
1986 if (d->Declaration.Semantic) {
1987 switch (d->Semantic.SemanticName) {
1988 case TGSI_SEMANTIC_POSITION:
1989 fcrd = first;
1990 break;
1991 case TGSI_SEMANTIC_COLOR:
1992 fcol = first;
1993 mode = INTERP_PERSPECTIVE;
1994 break;
1995 case TGSI_SEMANTIC_BCOLOR:
1996 bcol = first;
1997 mode = INTERP_PERSPECTIVE;
1998 break;
1999 }
2000 }
2001
2002 if (d->Declaration.Centroid) {
2003 mode |= INTERP_CENTROID;
2004 if (mode & INTERP_PERSPECTIVE)
2005 centroid_loads++;
2006 } else
2007 if (mode & INTERP_PERSPECTIVE)
2008 perspect_loads++;
2009
2010 assert(last < 32);
2011 for (i = first; i <= last; i++)
2012 pc->interp_mode[i] = mode;
2013 }
2014 break;
2015 case TGSI_FILE_CONSTANT:
2016 if (pc->param_nr < (last + 1))
2017 pc->param_nr = last + 1;
2018 break;
2019 case TGSI_FILE_SAMPLER:
2020 break;
2021 default:
2022 NOUVEAU_ERR("bad decl file %d\n",
2023 d->Declaration.File);
2024 goto out_err;
2025 }
2026 }
2027 break;
2028 case TGSI_TOKEN_TYPE_INSTRUCTION:
2029 pc->insn_nr++;
2030 prep_inspect_insn(pc, tok, r_usage);
2031 break;
2032 default:
2033 break;
2034 }
2035 }
2036
2037 if (pc->temp_nr) {
2038 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
2039 if (!pc->temp)
2040 goto out_err;
2041
2042 for (i = 0; i < pc->temp_nr; i++) {
2043 for (c = 0; c < 4; c++) {
2044 pc->temp[i*4+c].type = P_TEMP;
2045 pc->temp[i*4+c].hw = -1;
2046 pc->temp[i*4+c].rhw = -1;
2047 pc->temp[i*4+c].index = i;
2048 pc->temp[i*4+c].acc = r_usage[0][i*4+c];
2049 }
2050 }
2051 }
2052
2053 if (pc->attr_nr) {
2054 int oid = 4, mid = 4, aid = 0;
2055 /* oid = VP output id
2056 * aid = FP attribute/interpolant id
2057 * mid = VP output mapping field ID
2058 */
2059
2060 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
2061 if (!pc->attr)
2062 goto out_err;
2063
2064 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2065 /* position should be loaded first */
2066 if (fcrd != 0xffff) {
2067 unsigned mask;
2068 mid = 0;
2069 mask = load_fp_attrib(pc, fcrd, r_usage[1],
2070 &mid, &aid, &oid);
2071 oid = 0;
2072 pc->p->cfg.fp.regs[1] |= (mask << 24);
2073 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
2074 }
2075 pc->p->cfg.fp.map[0] += 0x03020100;
2076
2077 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
2078
2079 if (perspect_loads) {
2080 pc->iv_p = alloc_temp(pc, NULL);
2081
2082 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
2083 pc->p->cfg.fp.regs[1] |= 0x08000000;
2084 pc->iv_p->rhw = aid++;
2085 emit_interp(pc, pc->iv_p, NULL,
2086 INTERP_LINEAR);
2087 emit_flop(pc, 0, pc->iv_p, pc->iv_p);
2088 } else {
2089 pc->iv_p->rhw = aid - 1;
2090 emit_flop(pc, 0, pc->iv_p,
2091 &pc->attr[fcrd * 4 + 3]);
2092 }
2093 }
2094
2095 if (centroid_loads) {
2096 pc->iv_c = alloc_temp(pc, NULL);
2097 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
2098 emit_interp(pc, pc->iv_c, NULL,
2099 INTERP_CENTROID);
2100 emit_flop(pc, 0, pc->iv_c, pc->iv_c);
2101 pc->p->cfg.fp.regs[1] |= 0x08000000;
2102 }
2103
2104 for (c = 0; c < 4; c++) {
2105 /* I don't know what these values do, but
2106 * let's set them like the blob does:
2107 */
2108 if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
2109 pc->p->cfg.fp.regs[0] += 0x00010000;
2110 if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
2111 pc->p->cfg.fp.regs[0] += 0x00010000;
2112 }
2113
2114 for (i = 0; i < pc->attr_nr; i++)
2115 load_fp_attrib(pc, i, r_usage[1],
2116 &mid, &aid, &oid);
2117
2118 if (pc->iv_p)
2119 free_temp(pc, pc->iv_p);
2120 if (pc->iv_c)
2121 free_temp(pc, pc->iv_c);
2122
2123 pc->p->cfg.fp.high_map = (mid / 4);
2124 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
2125 } else {
2126 /* vertex program */
2127 for (i = 0; i < pc->attr_nr * 4; i++) {
2128 pc->p->cfg.vp.attr[aid / 32] |=
2129 (1 << (aid % 32));
2130 pc->attr[i].type = P_ATTR;
2131 pc->attr[i].hw = aid++;
2132 pc->attr[i].index = i / 4;
2133 }
2134 }
2135 }
2136
2137 if (pc->result_nr) {
2138 int rid = 0;
2139
2140 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
2141 if (!pc->result)
2142 goto out_err;
2143
2144 for (i = 0; i < pc->result_nr; i++) {
2145 for (c = 0; c < 4; c++) {
2146 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2147 pc->result[i*4+c].type = P_TEMP;
2148 pc->result[i*4+c].hw = -1;
2149 pc->result[i*4+c].rhw = (i == depr) ?
2150 -1 : rid++;
2151 } else {
2152 pc->result[i*4+c].type = P_RESULT;
2153 pc->result[i*4+c].hw = rid++;
2154 }
2155 pc->result[i*4+c].index = i;
2156 }
2157
2158 if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2159 depr != 0xffff) {
2160 pc->result[depr * 4 + 2].rhw =
2161 (pc->result_nr - 1) * 4;
2162 }
2163 }
2164 }
2165
2166 if (pc->param_nr) {
2167 int rid = 0;
2168
2169 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2170 if (!pc->param)
2171 goto out_err;
2172
2173 for (i = 0; i < pc->param_nr; i++) {
2174 for (c = 0; c < 4; c++) {
2175 pc->param[i*4+c].type = P_CONST;
2176 pc->param[i*4+c].hw = rid++;
2177 pc->param[i*4+c].index = i;
2178 }
2179 }
2180 }
2181
2182 if (pc->immd_nr) {
2183 int rid = 0;
2184
2185 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2186 if (!pc->immd)
2187 goto out_err;
2188
2189 for (i = 0; i < pc->immd_nr; i++) {
2190 for (c = 0; c < 4; c++) {
2191 pc->immd[i*4+c].type = P_IMMD;
2192 pc->immd[i*4+c].hw = rid++;
2193 pc->immd[i*4+c].index = i;
2194 }
2195 }
2196 }
2197
2198 ret = TRUE;
2199 out_err:
2200 if (r_usage[0])
2201 FREE(r_usage[0]);
2202 if (r_usage[1])
2203 FREE(r_usage[1]);
2204
2205 tgsi_parse_free(&p);
2206 return ret;
2207 }
2208
2209 static void
2210 free_nv50_pc(struct nv50_pc *pc)
2211 {
2212 if (pc->immd)
2213 FREE(pc->immd);
2214 if (pc->param)
2215 FREE(pc->param);
2216 if (pc->result)
2217 FREE(pc->result);
2218 if (pc->attr)
2219 FREE(pc->attr);
2220 if (pc->temp)
2221 FREE(pc->temp);
2222
2223 FREE(pc);
2224 }
2225
2226 static boolean
2227 nv50_program_tx(struct nv50_program *p)
2228 {
2229 struct tgsi_parse_context parse;
2230 struct nv50_pc *pc;
2231 unsigned k;
2232 boolean ret;
2233
2234 pc = CALLOC_STRUCT(nv50_pc);
2235 if (!pc)
2236 return FALSE;
2237 pc->p = p;
2238 pc->p->cfg.high_temp = 4;
2239
2240 ret = nv50_program_tx_prep(pc);
2241 if (ret == FALSE)
2242 goto out_cleanup;
2243
2244 tgsi_parse_init(&parse, pc->p->pipe.tokens);
2245 while (!tgsi_parse_end_of_tokens(&parse)) {
2246 const union tgsi_full_token *tok = &parse.FullToken;
2247
2248 /* don't allow half insn/immd on first and last instruction */
2249 pc->allow32 = TRUE;
2250 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2251 pc->allow32 = FALSE;
2252
2253 tgsi_parse_token(&parse);
2254
2255 switch (tok->Token.Type) {
2256 case TGSI_TOKEN_TYPE_INSTRUCTION:
2257 ++pc->insn_cur;
2258 ret = nv50_program_tx_insn(pc, tok);
2259 if (ret == FALSE)
2260 goto out_err;
2261 break;
2262 default:
2263 break;
2264 }
2265 }
2266
2267 if (p->type == PIPE_SHADER_FRAGMENT) {
2268 struct nv50_reg out;
2269
2270 out.type = P_TEMP;
2271 for (k = 0; k < pc->result_nr * 4; k++) {
2272 if (pc->result[k].rhw == -1)
2273 continue;
2274 if (pc->result[k].hw != pc->result[k].rhw) {
2275 out.hw = pc->result[k].rhw;
2276 emit_mov(pc, &out, &pc->result[k]);
2277 }
2278 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2279 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2280 }
2281 }
2282
2283 /* look for single half instructions and make them long */
2284 struct nv50_program_exec *e, *e_prev;
2285
2286 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2287 if (!is_long(e))
2288 k++;
2289
2290 if (!e->next || is_long(e->next)) {
2291 if (k & 1)
2292 convert_to_long(pc, e);
2293 k = 0;
2294 }
2295
2296 if (e->next)
2297 e_prev = e;
2298 }
2299
2300 if (!is_long(pc->p->exec_tail)) {
2301 /* this may occur if moving FP results */
2302 assert(e_prev && !is_long(e_prev));
2303 convert_to_long(pc, e_prev);
2304 convert_to_long(pc, pc->p->exec_tail);
2305 }
2306
2307 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2308 pc->p->exec_tail->inst[1] |= 0x00000001;
2309
2310 p->param_nr = pc->param_nr * 4;
2311 p->immd_nr = pc->immd_nr * 4;
2312 p->immd = pc->immd_buf;
2313
2314 out_err:
2315 tgsi_parse_free(&parse);
2316
2317 out_cleanup:
2318 free_nv50_pc(pc);
2319 return ret;
2320 }
2321
2322 static void
2323 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2324 {
2325 if (nv50_program_tx(p) == FALSE)
2326 assert(0);
2327 p->translated = TRUE;
2328 }
2329
2330 static void
2331 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2332 unsigned start, unsigned count, unsigned cbuf)
2333 {
2334 struct nouveau_channel *chan = nv50->screen->base.channel;
2335 struct nouveau_grobj *tesla = nv50->screen->tesla;
2336
2337 while (count) {
2338 unsigned nr = count > 2047 ? 2047 : count;
2339
2340 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2341 OUT_RING (chan, (cbuf << 0) | (start << 8));
2342 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2343 OUT_RINGp (chan, map, nr);
2344
2345 map += nr;
2346 start += nr;
2347 count -= nr;
2348 }
2349 }
2350
2351 static void
2352 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2353 {
2354 struct pipe_screen *pscreen = nv50->pipe.screen;
2355
2356 if (!p->data[0] && p->immd_nr) {
2357 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2358
2359 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2360 while (heap->next && heap->size < p->immd_nr) {
2361 struct nv50_program *evict = heap->next->priv;
2362 nouveau_resource_free(&evict->data[0]);
2363 }
2364
2365 if (nouveau_resource_alloc(heap, p->immd_nr, p,
2366 &p->data[0]))
2367 assert(0);
2368 }
2369
2370 /* immediates only need to be uploaded again when freed */
2371 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2372 p->immd_nr, NV50_CB_PMISC);
2373 }
2374
2375 if (!p->data[1] && p->param_nr) {
2376 struct nouveau_resource *heap =
2377 nv50->screen->parm_heap[p->type];
2378
2379 if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
2380 while (heap->next && heap->size < p->param_nr) {
2381 struct nv50_program *evict = heap->next->priv;
2382 nouveau_resource_free(&evict->data[1]);
2383 }
2384
2385 if (nouveau_resource_alloc(heap, p->param_nr, p,
2386 &p->data[1]))
2387 assert(0);
2388 }
2389 }
2390
2391 if (p->param_nr) {
2392 unsigned cbuf = NV50_CB_PVP;
2393 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2394 PIPE_BUFFER_USAGE_CPU_READ);
2395 if (p->type == PIPE_SHADER_FRAGMENT)
2396 cbuf = NV50_CB_PFP;
2397 nv50_program_upload_data(nv50, map, p->data[1]->start,
2398 p->param_nr, cbuf);
2399 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2400 }
2401 }
2402
2403 static void
2404 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2405 {
2406 struct nouveau_channel *chan = nv50->screen->base.channel;
2407 struct nouveau_grobj *tesla = nv50->screen->tesla;
2408 struct nv50_program_exec *e;
2409 struct nouveau_stateobj *so;
2410 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2411 unsigned start, count, *up, *ptr;
2412 boolean upload = FALSE;
2413
2414 if (!p->bo) {
2415 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2416 p->exec_size * 4, &p->bo);
2417 upload = TRUE;
2418 }
2419
2420 if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2421 (p->data[1] && p->data[1]->start != p->data_start[1])) {
2422 for (e = p->exec_head; e; e = e->next) {
2423 unsigned ei, ci, bs;
2424
2425 if (e->param.index < 0)
2426 continue;
2427 bs = (e->inst[1] >> 22) & 0x07;
2428 assert(bs < 2);
2429 ei = e->param.shift >> 5;
2430 ci = e->param.index + p->data[bs]->start;
2431
2432 e->inst[ei] &= ~e->param.mask;
2433 e->inst[ei] |= (ci << e->param.shift);
2434 }
2435
2436 if (p->data[0])
2437 p->data_start[0] = p->data[0]->start;
2438 if (p->data[1])
2439 p->data_start[1] = p->data[1]->start;
2440
2441 upload = TRUE;
2442 }
2443
2444 if (!upload)
2445 return;
2446
2447 #ifdef NV50_PROGRAM_DUMP
2448 NOUVEAU_ERR("-------\n");
2449 for (e = p->exec_head; e; e = e->next) {
2450 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2451 if (is_long(e))
2452 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2453 }
2454 #endif
2455
2456 up = ptr = MALLOC(p->exec_size * 4);
2457 for (e = p->exec_head; e; e = e->next) {
2458 *(ptr++) = e->inst[0];
2459 if (is_long(e))
2460 *(ptr++) = e->inst[1];
2461 }
2462
2463 so = so_new(4,2);
2464 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2465 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2466 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2467 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2468
2469 start = 0; count = p->exec_size;
2470 while (count) {
2471 struct nouveau_channel *chan = nv50->screen->base.channel;
2472 unsigned nr;
2473
2474 so_emit(chan, so);
2475
2476 nr = MIN2(count, 2047);
2477 nr = MIN2(chan->pushbuf->remaining, nr);
2478 if (chan->pushbuf->remaining < (nr + 3)) {
2479 FIRE_RING(chan);
2480 continue;
2481 }
2482
2483 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2484 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2485 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2486 OUT_RINGp (chan, up + start, nr);
2487
2488 start += nr;
2489 count -= nr;
2490 }
2491
2492 FREE(up);
2493 so_ref(NULL, &so);
2494 }
2495
2496 void
2497 nv50_vertprog_validate(struct nv50_context *nv50)
2498 {
2499 struct nouveau_grobj *tesla = nv50->screen->tesla;
2500 struct nv50_program *p = nv50->vertprog;
2501 struct nouveau_stateobj *so;
2502
2503 if (!p->translated) {
2504 nv50_program_validate(nv50, p);
2505 if (!p->translated)
2506 assert(0);
2507 }
2508
2509 nv50_program_validate_data(nv50, p);
2510 nv50_program_validate_code(nv50, p);
2511
2512 so = so_new(13, 2);
2513 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2514 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2515 NOUVEAU_BO_HIGH, 0, 0);
2516 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2517 NOUVEAU_BO_LOW, 0, 0);
2518 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2519 so_data (so, p->cfg.vp.attr[0]);
2520 so_data (so, p->cfg.vp.attr[1]);
2521 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2522 so_data (so, p->cfg.high_result);
2523 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2524 so_data (so, p->cfg.high_result); //8);
2525 so_data (so, p->cfg.high_temp);
2526 so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2527 so_data (so, 0); /* program start offset */
2528 so_ref(so, &nv50->state.vertprog);
2529 so_ref(NULL, &so);
2530 }
2531
2532 void
2533 nv50_fragprog_validate(struct nv50_context *nv50)
2534 {
2535 struct nouveau_grobj *tesla = nv50->screen->tesla;
2536 struct nv50_program *p = nv50->fragprog;
2537 struct nouveau_stateobj *so;
2538 unsigned i;
2539
2540 if (!p->translated) {
2541 nv50_program_validate(nv50, p);
2542 if (!p->translated)
2543 assert(0);
2544 }
2545
2546 nv50_program_validate_data(nv50, p);
2547 nv50_program_validate_code(nv50, p);
2548
2549 so = so_new(64, 2);
2550 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2551 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2552 NOUVEAU_BO_HIGH, 0, 0);
2553 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2554 NOUVEAU_BO_LOW, 0, 0);
2555 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2556 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2557 so_data (so, 0x00000004);
2558 so_data (so, 0x00000000);
2559 so_data (so, 0x00000000);
2560 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
2561 for (i = 0; i < p->cfg.fp.high_map; i++)
2562 so_data(so, p->cfg.fp.map[i]);
2563 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
2564 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2565 so_data (so, p->cfg.high_temp);
2566 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2567 so_data (so, p->cfg.high_result);
2568 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2569 so_data (so, p->cfg.fp.regs[2]);
2570 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2571 so_data (so, p->cfg.fp.regs[3]);
2572 so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2573 so_data (so, 0); /* program start offset */
2574 so_ref(so, &nv50->state.fragprog);
2575 so_ref(NULL, &so);
2576 }
2577
2578 void
2579 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2580 {
2581 while (p->exec_head) {
2582 struct nv50_program_exec *e = p->exec_head;
2583
2584 p->exec_head = e->next;
2585 FREE(e);
2586 }
2587 p->exec_tail = NULL;
2588 p->exec_size = 0;
2589
2590 nouveau_bo_ref(NULL, &p->bo);
2591
2592 nouveau_resource_free(&p->data[0]);
2593 nouveau_resource_free(&p->data[1]);
2594
2595 p->translated = 0;
2596 }