[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
34 #define NV50_SU_MAX_TEMP 64
35 //#define NV50_PROGRAM_DUMP
36
37 /* ARL - gallium currently chokes on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * - Drop it; instead, introduce a way to negate args for ops that
41 * support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled at the moment, since that fixes POW.
52 *
53 * Watch out for dst == src vectors; we can overwrite components that are
54 * still needed, e.g. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * FP attr/result assignment - how?
58 * attrib
59 * - 0x16bc maps vp output onto fp hpos
60 * - 0x16c0 maps vp output onto fp col0
61 * result
62 * - colr always 0-3
63 * - depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * - 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * - XX == FP high something
75 */
76 struct nv50_reg {
77 enum {
78 P_TEMP,
79 P_ATTR,
80 P_RESULT,
81 P_CONST,
82 P_IMMD
83 } type;
84 int index;
85
86 int hw;
87 int neg;
88
89 int rhw; /* result hw for FP outputs, or interpolant index */
90 int acc; /* instruction where this reg is last read (first insn == 1) */
91 };
92
93 struct nv50_pc {
94 struct nv50_program *p;
95
96 /* hw resources */
97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99 /* tgsi resources */
100 struct nv50_reg *temp;
101 int temp_nr;
102 struct nv50_reg *attr;
103 int attr_nr;
104 struct nv50_reg *result;
105 int result_nr;
106 struct nv50_reg *param;
107 int param_nr;
108 struct nv50_reg *immd;
109 float *immd_buf;
110 int immd_nr;
111
112 struct nv50_reg *temp_temp[16];
113 unsigned temp_temp_nr;
114
115 /* broadcast and destination replacement regs */
116 struct nv50_reg *r_brdc;
117 struct nv50_reg *r_dst[4];
118
119 unsigned interp_mode[32];
120 /* perspective interpolation registers */
121 struct nv50_reg *iv_p;
122 struct nv50_reg *iv_c;
123
124 /* current instruction and total number of insns */
125 unsigned insn_cur;
126 unsigned insn_nr;
127
128 boolean allow32;
129 };
130
131 static INLINE void
132 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
133 {
134 reg->type = type;
135 reg->index = index;
136 reg->hw = hw;
137 reg->neg = 0;
138 reg->rhw = -1;
139 reg->acc = 0;
140 }
141
142 static INLINE unsigned
143 popcnt4(uint32_t val)
144 {
145 static const unsigned cnt[16]
146 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
147 return cnt[val & 0xf];
148 }
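/* Editor's note (illustration, not from the original source): popcnt4()
 * counts the set bits in the low nibble, e.g. popcnt4(0xb) == 3 for a mask
 * with x, y and w enabled. It is used below to turn i/o component masks
 * into counts of consecutive hw registers/interpolants.
 */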
149
150 static void
151 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
152 {
153 int i = 0;
154
155 if (reg->type == P_RESULT) {
156 if (pc->p->cfg.high_result < (reg->hw + 1))
157 pc->p->cfg.high_result = reg->hw + 1;
158 }
159
160 if (reg->type != P_TEMP)
161 return;
162
163 if (reg->hw >= 0) {
164 /*XXX: do this here too to catch FP temp-as-attr usage..
165 * not clean, but works */
166 if (pc->p->cfg.high_temp < (reg->hw + 1))
167 pc->p->cfg.high_temp = reg->hw + 1;
168 return;
169 }
170
171 if (reg->rhw != -1) {
172 /* try to allocate temporary with index rhw first */
173 if (!(pc->r_temp[reg->rhw])) {
174 pc->r_temp[reg->rhw] = reg;
175 reg->hw = reg->rhw;
176 if (pc->p->cfg.high_temp < (reg->rhw + 1))
177 pc->p->cfg.high_temp = reg->rhw + 1;
178 return;
179 }
180 /* make sure we don't get things like $r0 needs to go
181 * in $r1 and $r1 in $r0
182 */
183 i = pc->result_nr * 4;
184 }
185
186 for (; i < NV50_SU_MAX_TEMP; i++) {
187 if (!(pc->r_temp[i])) {
188 pc->r_temp[i] = reg;
189 reg->hw = i;
190 if (pc->p->cfg.high_temp < (i + 1))
191 pc->p->cfg.high_temp = i + 1;
192 return;
193 }
194 }
195
196 assert(0);
197 }
198
199 static struct nv50_reg *
200 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
201 {
202 struct nv50_reg *r;
203 int i;
204
205 if (dst && dst->type == P_TEMP && dst->hw == -1)
206 return dst;
207
208 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
209 if (!pc->r_temp[i]) {
210 r = MALLOC_STRUCT(nv50_reg);
211 ctor_reg(r, P_TEMP, -1, i);
212 pc->r_temp[i] = r;
213 return r;
214 }
215 }
216
217 assert(0);
218 return NULL;
219 }
220
221 /* Assign the hw of the discarded temporary register src
222 * to the tgsi register dst and free src.
223 */
224 static void
225 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
226 {
227 assert(src->index == -1 && src->hw != -1);
228
229 if (dst->hw != -1)
230 pc->r_temp[dst->hw] = NULL;
231 pc->r_temp[src->hw] = dst;
232 dst->hw = src->hw;
233
234 FREE(src);
235 }
236
237 /* release the hardware resource held by r */
238 static void
239 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
240 {
241 assert(r->type == P_TEMP);
242 if (r->hw == -1)
243 return;
244
245 assert(pc->r_temp[r->hw] == r);
246 pc->r_temp[r->hw] = NULL;
247
248 r->acc = 0;
249 if (r->index == -1)
250 FREE(r);
251 }
252
253 static void
254 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
255 {
256 if (r->index == -1) {
257 unsigned hw = r->hw;
258
259 FREE(pc->r_temp[hw]);
260 pc->r_temp[hw] = NULL;
261 }
262 }
263
264 static int
265 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
266 {
267 int i;
268
269 if ((idx + 4) >= NV50_SU_MAX_TEMP)
270 return 1;
271
272 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
273 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
274 return alloc_temp4(pc, dst, idx + 4);
275
276 for (i = 0; i < 4; i++) {
277 dst[i] = MALLOC_STRUCT(nv50_reg);
278 ctor_reg(dst[i], P_TEMP, -1, idx + i);
279 pc->r_temp[idx + i] = dst[i];
280 }
281
282 return 0;
283 }
284
285 static void
286 free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
287 {
288 int i;
289
290 for (i = 0; i < 4; i++)
291 free_temp(pc, reg[i]);
292 }
293
294 static struct nv50_reg *
295 temp_temp(struct nv50_pc *pc)
296 {
297 if (pc->temp_temp_nr >= 16)
298 assert(0);
299
300 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
301 return pc->temp_temp[pc->temp_temp_nr++];
302 }
303
304 static void
305 kill_temp_temp(struct nv50_pc *pc)
306 {
307 int i;
308
309 for (i = 0; i < pc->temp_temp_nr; i++)
310 free_temp(pc, pc->temp_temp[i]);
311 pc->temp_temp_nr = 0;
312 }
313
314 static int
315 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
316 {
317 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
318 (pc->immd_nr + 1) * 4 * sizeof(float));
319 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
320 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
321 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
322 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
323
324 return pc->immd_nr++;
325 }
326
327 static struct nv50_reg *
328 alloc_immd(struct nv50_pc *pc, float f)
329 {
330 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
331 unsigned hw;
332
333 for (hw = 0; hw < pc->immd_nr * 4; hw++)
334 if (pc->immd_buf[hw] == f)
335 break;
336
337 if (hw == pc->immd_nr * 4)
338 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
339
340 ctor_reg(r, P_IMMD, -1, hw);
341 return r;
342 }
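/* Editor's note on alloc_immd() above: the immediate buffer is searched
 * for f first, and a new vec4 (f, -f, 0.5*f, 0) is only appended on a
 * miss, so later requests for the negated or halved value can hit the
 * same slot. The typical usage pattern (exactly what emit_mov_immdval()
 * below does) is:
 *
 *	struct nv50_reg *imm = alloc_immd(pc, f);
 *	emit_mov(pc, dst, imm);
 *	FREE(imm);
 */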
343
344 static struct nv50_program_exec *
345 exec(struct nv50_pc *pc)
346 {
347 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
348
349 e->param.index = -1;
350 return e;
351 }
352
353 static void
354 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
355 {
356 struct nv50_program *p = pc->p;
357
358 if (p->exec_tail)
359 p->exec_tail->next = e;
360 if (!p->exec_head)
361 p->exec_head = e;
362 p->exec_tail = e;
363 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
364 }
365
366 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
367
368 static boolean
369 is_long(struct nv50_program_exec *e)
370 {
371 if (e->inst[0] & 1)
372 return TRUE;
373 return FALSE;
374 }
375
376 static boolean
377 is_immd(struct nv50_program_exec *e)
378 {
379 if (is_long(e) && (e->inst[1] & 3) == 3)
380 return TRUE;
381 return FALSE;
382 }
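/* Editor's note, derived from is_long()/is_immd() and emit() above: bit 0
 * of inst[0] selects the 64-bit ("long") encoding, which is why emit()
 * adds two words to exec_size for long instructions and one otherwise;
 * a long instruction with the two lowest bits of inst[1] set (as done by
 * set_immd() below) carries an inline immediate operand.
 */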
383
384 static INLINE void
385 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
386 struct nv50_program_exec *e)
387 {
388 set_long(pc, e);
389 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
390 e->inst[1] |= (pred << 7) | (idx << 12);
391 }
392
393 static INLINE void
394 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
395 struct nv50_program_exec *e)
396 {
397 set_long(pc, e);
398 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
399 e->inst[1] |= (idx << 4) | (on << 6);
400 }
401
402 static INLINE void
403 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
404 {
405 if (is_long(e))
406 return;
407
408 e->inst[0] |= 1;
409 set_pred(pc, 0xf, 0, e);
410 set_pred_wr(pc, 0, 0, e);
411 }
412
413 static INLINE void
414 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
415 {
416 if (dst->type == P_RESULT) {
417 set_long(pc, e);
418 e->inst[1] |= 0x00000008;
419 }
420
421 alloc_reg(pc, dst);
422 e->inst[0] |= (dst->hw << 2);
423 }
424
425 static INLINE void
426 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
427 {
428 float f = pc->immd_buf[imm->hw];
429 unsigned val = fui(imm->neg ? -f : f);
430
431 set_long(pc, e);
432 /*XXX: can't be predicated - bits overlap.. catch cases where both
433 * are required and avoid them. */
434 set_pred(pc, 0, 0, e);
435 set_pred_wr(pc, 0, 0, e);
436
437 e->inst[1] |= 0x00000002 | 0x00000001;
438 e->inst[0] |= (val & 0x3f) << 16;
439 e->inst[1] |= (val >> 6) << 2;
440 }
441
442
443 #define INTERP_LINEAR 0
444 #define INTERP_FLAT 1
445 #define INTERP_PERSPECTIVE 2
446 #define INTERP_CENTROID 4
447
448 /* interpolant index has been stored in dst->rhw */
449 static void
450 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
451 unsigned mode)
452 {
453 assert(dst->rhw != -1);
454 struct nv50_program_exec *e = exec(pc);
455
456 e->inst[0] |= 0x80000000;
457 set_dst(pc, dst, e);
458 e->inst[0] |= (dst->rhw << 16);
459
460 if (mode & INTERP_FLAT) {
461 e->inst[0] |= (1 << 8);
462 } else {
463 if (mode & INTERP_PERSPECTIVE) {
464 e->inst[0] |= (1 << 25);
465 alloc_reg(pc, iv);
466 e->inst[0] |= (iv->hw << 9);
467 }
468
469 if (mode & INTERP_CENTROID)
470 e->inst[0] |= (1 << 24);
471 }
472
473 emit(pc, e);
474 }
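/* Editor's summary of the encoding used by emit_interp() above (a reading
 * of the code, not of hw documentation): opcode 0x8 in the top nibble of
 * inst[0], the interpolant index (dst->rhw) at bit 16, the perspective
 * correction register (iv->hw) at bit 9, the flat/centroid/perspective
 * flags at bits 8/24/25, and the destination register placed at bit 2 by
 * set_dst().
 */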
475
476 static void
477 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
478 struct nv50_program_exec *e)
479 {
480 set_long(pc, e);
481
482 e->param.index = src->hw;
483 e->param.shift = s;
484 e->param.mask = m << (s % 32);
485
486 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
487 }
488
489 static void
490 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
491 {
492 struct nv50_program_exec *e = exec(pc);
493
494 e->inst[0] |= 0x10000000;
495
496 set_dst(pc, dst, e);
497
498 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
499 set_immd(pc, src, e);
500 /*XXX: 32-bit, but steals part of "half" reg space - need to
501 * catch and handle this case if/when we do half-regs
502 */
503 } else
504 if (src->type == P_IMMD || src->type == P_CONST) {
505 set_long(pc, e);
506 set_data(pc, src, 0x7f, 9, e);
507 e->inst[1] |= 0x20000000; /* src0 const? */
508 } else {
509 if (src->type == P_ATTR) {
510 set_long(pc, e);
511 e->inst[1] |= 0x00200000;
512 }
513
514 alloc_reg(pc, src);
515 e->inst[0] |= (src->hw << 9);
516 }
517
518 if (is_long(e) && !is_immd(e)) {
519 e->inst[1] |= 0x04000000; /* 32-bit */
520 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
521 if (!(e->inst[1] & 0x20000000))
522 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
523 } else
524 e->inst[0] |= 0x00008000;
525
526 emit(pc, e);
527 }
528
529 static INLINE void
530 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
531 {
532 struct nv50_reg *imm = alloc_immd(pc, f);
533 emit_mov(pc, dst, imm);
534 FREE(imm);
535 }
536
537 static boolean
538 check_swap_src_0_1(struct nv50_pc *pc,
539 struct nv50_reg **s0, struct nv50_reg **s1)
540 {
541 struct nv50_reg *src0 = *s0, *src1 = *s1;
542
543 if (src0->type == P_CONST) {
544 if (src1->type != P_CONST) {
545 *s0 = src1;
546 *s1 = src0;
547 return TRUE;
548 }
549 } else
550 if (src1->type == P_ATTR) {
551 if (src0->type != P_ATTR) {
552 *s0 = src1;
553 *s1 = src0;
554 return TRUE;
555 }
556 }
557
558 return FALSE;
559 }
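/* Editor's note on why the swap above helps: set_src_0() below reads an
 * ATTR directly but has to copy CONST/IMMD operands through a temporary,
 * while set_src_1()/set_src_2() can encode one constant operand in place
 * via set_data(). For commutative ops, moving a constant out of src0 (or
 * an attribute into src0) therefore saves an extra MOV.
 */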
560
561 static void
562 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
563 {
564 if (src->type == P_ATTR) {
565 set_long(pc, e);
566 e->inst[1] |= 0x00200000;
567 } else
568 if (src->type == P_CONST || src->type == P_IMMD) {
569 struct nv50_reg *temp = temp_temp(pc);
570
571 emit_mov(pc, temp, src);
572 src = temp;
573 }
574
575 alloc_reg(pc, src);
576 e->inst[0] |= (src->hw << 9);
577 }
578
579 static void
580 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
581 {
582 if (src->type == P_ATTR) {
583 struct nv50_reg *temp = temp_temp(pc);
584
585 emit_mov(pc, temp, src);
586 src = temp;
587 } else
588 if (src->type == P_CONST || src->type == P_IMMD) {
589 assert(!(e->inst[0] & 0x00800000));
590 if (e->inst[0] & 0x01000000) {
591 struct nv50_reg *temp = temp_temp(pc);
592
593 emit_mov(pc, temp, src);
594 src = temp;
595 } else {
596 set_data(pc, src, 0x7f, 16, e);
597 e->inst[0] |= 0x00800000;
598 }
599 }
600
601 alloc_reg(pc, src);
602 e->inst[0] |= (src->hw << 16);
603 }
604
605 static void
606 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
607 {
608 set_long(pc, e);
609
610 if (src->type == P_ATTR) {
611 struct nv50_reg *temp = temp_temp(pc);
612
613 emit_mov(pc, temp, src);
614 src = temp;
615 } else
616 if (src->type == P_CONST || src->type == P_IMMD) {
617 assert(!(e->inst[0] & 0x01000000));
618 if (e->inst[0] & 0x00800000) {
619 struct nv50_reg *temp = temp_temp(pc);
620
621 emit_mov(pc, temp, src);
622 src = temp;
623 } else {
624 set_data(pc, src, 0x7f, 32+14, e);
625 e->inst[0] |= 0x01000000;
626 }
627 }
628
629 alloc_reg(pc, src);
630 e->inst[1] |= (src->hw << 14);
631 }
632
633 static void
634 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
635 struct nv50_reg *src1)
636 {
637 struct nv50_program_exec *e = exec(pc);
638
639 e->inst[0] |= 0xc0000000;
640
641 if (!pc->allow32)
642 set_long(pc, e);
643
644 check_swap_src_0_1(pc, &src0, &src1);
645 set_dst(pc, dst, e);
646 set_src_0(pc, src0, e);
647 if (src1->type == P_IMMD && !is_long(e)) {
648 if (src0->neg)
649 e->inst[0] |= 0x00008000;
650 set_immd(pc, src1, e);
651 } else {
652 set_src_1(pc, src1, e);
653 if (src0->neg ^ src1->neg) {
654 if (is_long(e))
655 e->inst[1] |= 0x08000000;
656 else
657 e->inst[0] |= 0x00008000;
658 }
659 }
660
661 emit(pc, e);
662 }
663
664 static void
665 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
666 struct nv50_reg *src0, struct nv50_reg *src1)
667 {
668 struct nv50_program_exec *e = exec(pc);
669
670 e->inst[0] |= 0xb0000000;
671
672 check_swap_src_0_1(pc, &src0, &src1);
673
674 if (!pc->allow32 || src0->neg || src1->neg) {
675 set_long(pc, e);
676 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
677 }
678
679 set_dst(pc, dst, e);
680 set_src_0(pc, src0, e);
681 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
682 set_src_2(pc, src1, e);
683 else
684 if (src1->type == P_IMMD)
685 set_immd(pc, src1, e);
686 else
687 set_src_1(pc, src1, e);
688
689 emit(pc, e);
690 }
691
692 static void
693 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
694 struct nv50_reg *src0, struct nv50_reg *src1)
695 {
696 struct nv50_program_exec *e = exec(pc);
697
698 set_long(pc, e);
699 e->inst[0] |= 0xb0000000;
700 e->inst[1] |= (sub << 29);
701
702 check_swap_src_0_1(pc, &src0, &src1);
703 set_dst(pc, dst, e);
704 set_src_0(pc, src0, e);
705 set_src_1(pc, src1, e);
706
707 emit(pc, e);
708 }
709
710 static INLINE void
711 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
712 struct nv50_reg *src1)
713 {
714 src1->neg ^= 1;
715 emit_add(pc, dst, src0, src1);
716 src1->neg ^= 1;
717 }
718
719 static void
720 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
721 struct nv50_reg *src1, struct nv50_reg *src2)
722 {
723 struct nv50_program_exec *e = exec(pc);
724
725 e->inst[0] |= 0xe0000000;
726
727 check_swap_src_0_1(pc, &src0, &src1);
728 set_dst(pc, dst, e);
729 set_src_0(pc, src0, e);
730 set_src_1(pc, src1, e);
731 set_src_2(pc, src2, e);
732
733 if (src0->neg ^ src1->neg)
734 e->inst[1] |= 0x04000000;
735 if (src2->neg)
736 e->inst[1] |= 0x08000000;
737
738 emit(pc, e);
739 }
740
741 static INLINE void
742 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
743 struct nv50_reg *src1, struct nv50_reg *src2)
744 {
745 src2->neg ^= 1;
746 emit_mad(pc, dst, src0, src1, src2);
747 src2->neg ^= 1;
748 }
749
750 static void
751 emit_flop(struct nv50_pc *pc, unsigned sub,
752 struct nv50_reg *dst, struct nv50_reg *src)
753 {
754 struct nv50_program_exec *e = exec(pc);
755
756 e->inst[0] |= 0x90000000;
757 if (sub) {
758 set_long(pc, e);
759 e->inst[1] |= (sub << 29);
760 }
761
762 set_dst(pc, dst, e);
763 set_src_0(pc, src, e);
764
765 emit(pc, e);
766 }
767
768 static void
769 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
770 {
771 struct nv50_program_exec *e = exec(pc);
772
773 e->inst[0] |= 0xb0000000;
774
775 set_dst(pc, dst, e);
776 set_src_0(pc, src, e);
777 set_long(pc, e);
778 e->inst[1] |= (6 << 29) | 0x00004000;
779
780 emit(pc, e);
781 }
782
783 static void
784 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
785 {
786 struct nv50_program_exec *e = exec(pc);
787
788 e->inst[0] |= 0xb0000000;
789
790 set_dst(pc, dst, e);
791 set_src_0(pc, src, e);
792 set_long(pc, e);
793 e->inst[1] |= (6 << 29);
794
795 emit(pc, e);
796 }
797
798 #define CVTOP_RN 0x01
799 #define CVTOP_FLOOR 0x03
800 #define CVTOP_CEIL 0x05
801 #define CVTOP_TRUNC 0x07
802 #define CVTOP_SAT 0x08
803 #define CVTOP_ABS 0x10
804
805 /* 0x04 == 32 bit */
806 /* 0x40 == dst is float */
807 /* 0x80 == src is float */
808 #define CVT_F32_F32 0xc4
809 #define CVT_F32_S32 0x44
810 #define CVT_F32_U32 0x64
811 #define CVT_S32_F32 0x8c
812 #define CVT_S32_S32 0x0c
813 #define CVT_F32_F32_ROP 0xcc
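/* Editor's worked decomposition of the codes above, using only the bit
 * meanings documented in the comments: CVT_F32_S32 (0x44) is
 * 0x40 (dst is float) | 0x04 (32 bit), and CVT_F32_F32 (0xc4) additionally
 * sets 0x80 (src is float). The remaining bits appearing in other codes
 * (0x08, 0x20) are not documented here and are left uninterpreted.
 */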
814
815 static void
816 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
817 int wp, unsigned cvn, unsigned fmt)
818 {
819 struct nv50_program_exec *e;
820
821 e = exec(pc);
822 set_long(pc, e);
823
824 e->inst[0] |= 0xa0000000;
825 e->inst[1] |= 0x00004000;
826 e->inst[1] |= (cvn << 16);
827 e->inst[1] |= (fmt << 24);
828 set_src_0(pc, src, e);
829
830 if (wp >= 0)
831 set_pred_wr(pc, 1, wp, e);
832
833 if (dst)
834 set_dst(pc, dst, e);
835 else {
836 e->inst[0] |= 0x000001fc;
837 e->inst[1] |= 0x00000008;
838 }
839
840 emit(pc, e);
841 }
842
843 /* nv50 Condition codes:
844 * 0x1 = LT
845 * 0x2 = EQ
846 * 0x3 = LE
847 * 0x4 = GT
848 * 0x5 = NE
849 * 0x6 = GE
850 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge)
851 * 0x8 = unordered bit (allows NaN)
852 */
853 static void
854 emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
855 struct nv50_reg *src0, struct nv50_reg *src1)
856 {
857 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
858
859 struct nv50_program_exec *e = exec(pc);
860 struct nv50_reg *rdst;
861
862 assert(ccode < 16);
863 if (check_swap_src_0_1(pc, &src0, &src1))
864 ccode = cc_swapped[ccode & 7] | (ccode & 8);
865
866 rdst = dst;
867 if (dst && dst->type != P_TEMP)
868 dst = alloc_temp(pc, NULL);
869
870 /* set.u32 */
871 set_long(pc, e);
872 e->inst[0] |= 0xb0000000;
873 e->inst[1] |= 0x60000000 | (ccode << 14);
874
875 /* XXX: decuda will disassemble this as .u16 and use .lo/.hi regs, but
876 * that doesn't seem to match what the hw actually does:
877 * e->inst[1] |= 0x04000000; <- breaks things, u32 by default ?
878 */
879
880 if (wp >= 0)
881 set_pred_wr(pc, 1, wp, e);
882 if (dst)
883 set_dst(pc, dst, e);
884 else {
885 e->inst[0] |= 0x000001fc;
886 e->inst[1] |= 0x00000008;
887 }
888
889 set_src_0(pc, src0, e);
890 set_src_1(pc, src1, e);
891
892 emit(pc, e);
893
894 /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
895 if (rdst)
896 emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
897 if (rdst && rdst != dst)
898 free_temp(pc, dst);
899 }
900
901 static INLINE unsigned
902 map_tgsi_setop_cc(unsigned op)
903 {
904 switch (op) {
905 case TGSI_OPCODE_SLT: return 0x1;
906 case TGSI_OPCODE_SGE: return 0x6;
907 case TGSI_OPCODE_SEQ: return 0x2;
908 case TGSI_OPCODE_SGT: return 0x4;
909 case TGSI_OPCODE_SLE: return 0x3;
910 case TGSI_OPCODE_SNE: return 0xd;
911 default:
912 assert(0);
913 return 0;
914 }
915 }
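/* Editor's worked example tying the condition-code table before emit_set()
 * to the map above: SNE maps to 0xd = 0x8 (unordered bit) | 0x5 (NE), so
 * comparisons involving NaN come out as "not equal", while the other SET
 * opcodes use the plain ordered condition codes.
 */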
916
917 static INLINE void
918 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
919 {
920 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
921 }
922
923 static void
924 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
925 struct nv50_reg *v, struct nv50_reg *e)
926 {
927 struct nv50_reg *temp = alloc_temp(pc, NULL);
928
929 emit_flop(pc, 3, temp, v);
930 emit_mul(pc, temp, temp, e);
931 emit_preex2(pc, temp, temp);
932 emit_flop(pc, 6, dst, temp);
933
934 free_temp(pc, temp);
935 }
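/* Editor's note: emit_pow() above computes dst = 2^(e * log2(v)), i.e.
 * pow(v, e); flop subop 3 is the LG2 path and subop 6 the EX2 path, the
 * same subops used by the TGSI LG2/EX2 cases further down.
 */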
936
937 static INLINE void
938 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
939 {
940 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
941 }
942
943 static INLINE void
944 emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
945 {
946 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
947 }
948
949 static void
950 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
951 struct nv50_reg **src)
952 {
953 struct nv50_reg *one = alloc_immd(pc, 1.0);
954 struct nv50_reg *zero = alloc_immd(pc, 0.0);
955 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
956 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
957 struct nv50_reg *tmp[4];
958 boolean allow32 = pc->allow32;
959
960 pc->allow32 = FALSE;
961
962 if (mask & (3 << 1)) {
963 tmp[0] = alloc_temp(pc, NULL);
964 emit_minmax(pc, 4, tmp[0], src[0], zero);
965 }
966
967 if (mask & (1 << 2)) {
968 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
969
970 tmp[1] = temp_temp(pc);
971 emit_minmax(pc, 4, tmp[1], src[1], zero);
972
973 tmp[3] = temp_temp(pc);
974 emit_minmax(pc, 4, tmp[3], src[3], neg128);
975 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
976
977 emit_pow(pc, dst[2], tmp[1], tmp[3]);
978 emit_mov(pc, dst[2], zero);
979 set_pred(pc, 3, 0, pc->p->exec_tail);
980 }
981
982 if (mask & (1 << 1))
983 assimilate_temp(pc, dst[1], tmp[0]);
984 else
985 if (mask & (1 << 2))
986 free_temp(pc, tmp[0]);
987
988 pc->allow32 = allow32;
989
990 /* do this last, in case src[i,j] == dst[0,3] */
991 if (mask & (1 << 0))
992 emit_mov(pc, dst[0], one);
993
994 if (mask & (1 << 3))
995 emit_mov(pc, dst[3], one);
996
997 FREE(pos128);
998 FREE(neg128);
999 FREE(zero);
1000 FREE(one);
1001 }
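/* Editor's summary of emit_lit() above (standard LIT semantics, as far as
 * the code reads): dst.x = dst.w = 1.0, dst.y = max(src.x, 0), and
 * dst.z = pow(max(src.y, 0), clamp(src.w, -127.999999, 127.999999)),
 * with dst.z forced to 0 by the predicated MOV when src.x <= 0.
 */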
1002
1003 static void
1004 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1005 {
1006 struct nv50_program_exec *e = exec(pc);
1007
1008 set_long(pc, e);
1009 e->inst[0] |= 0xa0000000; /* delta */
1010 e->inst[1] |= (7 << 29); /* delta */
1011 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
1012 e->inst[1] |= (1 << 14); /* src .f32 */
1013 set_dst(pc, dst, e);
1014 set_src_0(pc, src, e);
1015
1016 emit(pc, e);
1017 }
1018
1019 static void
1020 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1021 {
1022 struct nv50_program_exec *e;
1023 const int r_pred = 1;
1024
1025 /* Sets predicate reg ? */
1026 e = exec(pc);
1027 e->inst[0] = 0xa00001fd;
1028 e->inst[1] = 0xc4014788;
1029 set_src_0(pc, src, e);
1030 set_pred_wr(pc, 1, r_pred, e);
1031 if (src->neg)
1032 e->inst[1] |= 0x20000000;
1033 emit(pc, e);
1034
1035 /* This is probably KILP */
1036 e = exec(pc);
1037 e->inst[0] = 0x000001fe;
1038 set_long(pc, e);
1039 set_pred(pc, 1 /* LT? */, r_pred, e);
1040 emit(pc, e);
1041 }
1042
1043 static void
1044 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1045 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1046 {
1047 struct nv50_reg *temp, *t[4];
1048 struct nv50_program_exec *e;
1049
1050 unsigned c, mode, dim;
1051
1052 switch (type) {
1053 case TGSI_TEXTURE_1D:
1054 dim = 1;
1055 break;
1056 case TGSI_TEXTURE_UNKNOWN:
1057 case TGSI_TEXTURE_2D:
1058 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1059 case TGSI_TEXTURE_RECT:
1060 dim = 2;
1061 break;
1062 case TGSI_TEXTURE_3D:
1063 case TGSI_TEXTURE_CUBE:
1064 case TGSI_TEXTURE_SHADOW2D:
1065 case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1066 dim = 3;
1067 break;
1068 default:
1069 assert(0);
1070 break;
1071 }
1072
1073 /* some cards need t[0]'s hw index to be a multiple of 4 */
1074 alloc_temp4(pc, t, 0);
1075
1076 if (proj) {
1077 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1078 mode = pc->interp_mode[src[0]->index];
1079
1080 t[3]->rhw = src[3]->rhw;
1081 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1082 emit_flop(pc, 0, t[3], t[3]);
1083
1084 for (c = 0; c < dim; c++) {
1085 t[c]->rhw = src[c]->rhw;
1086 emit_interp(pc, t[c], t[3],
1087 (mode | INTERP_PERSPECTIVE));
1088 }
1089 } else {
1090 emit_flop(pc, 0, t[3], src[3]);
1091 for (c = 0; c < dim; c++)
1092 emit_mul(pc, t[c], src[c], t[3]);
1093
1094 /* XXX: for some reason the blob sometimes uses MAD:
1095 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1096 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1097 */
1098 }
1099 } else {
1100 if (type == TGSI_TEXTURE_CUBE) {
1101 temp = temp_temp(pc);
1102 emit_minmax(pc, 4, temp, src[0], src[1]);
1103 emit_minmax(pc, 4, temp, temp, src[2]);
1104 emit_flop(pc, 0, temp, temp);
1105 for (c = 0; c < 3; c++)
1106 emit_mul(pc, t[c], src[c], temp);
1107 } else {
1108 for (c = 0; c < dim; c++)
1109 emit_mov(pc, t[c], src[c]);
1110 }
1111 }
1112
1113 e = exec(pc);
1114 set_long(pc, e);
1115 e->inst[0] |= 0xf0000000;
1116 e->inst[1] |= 0x00000004;
1117 set_dst(pc, t[0], e);
1118 e->inst[0] |= (unit << 9);
1119
1120 if (dim == 2)
1121 e->inst[0] |= 0x00400000;
1122 else
1123 if (dim == 3)
1124 e->inst[0] |= 0x00800000;
1125
1126 e->inst[0] |= (mask & 0x3) << 25;
1127 e->inst[1] |= (mask & 0xc) << 12;
1128
1129 emit(pc, e);
1130
1131 #if 1
1132 if (mask & 1) emit_mov(pc, dst[0], t[0]);
1133 if (mask & 2) emit_mov(pc, dst[1], t[1]);
1134 if (mask & 4) emit_mov(pc, dst[2], t[2]);
1135 if (mask & 8) emit_mov(pc, dst[3], t[3]);
1136
1137 free_temp4(pc, t);
1138 #else
1139 /* XXX: if e.g. MUL is used directly after TEX, it would still use
1140 * the texture coordinates, not the fetched values: latency ? */
1141
1142 for (c = 0; c < 4; c++) {
1143 if (mask & (1 << c))
1144 assimilate_temp(pc, dst[c], t[c]);
1145 else
1146 free_temp(pc, t[c]);
1147 }
1148 #endif
1149 }
1150
1151 static void
1152 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1153 {
1154 unsigned q = 0, m = ~0;
1155
1156 assert(!is_long(e));
1157
1158 switch (e->inst[0] >> 28) {
1159 case 0x1:
1160 /* MOV */
1161 q = 0x0403c000;
1162 m = 0xffff7fff;
1163 break;
1164 case 0x8:
1165 /* INTERP (move centroid, perspective and flat bits) */
1166 m = ~0x03000100;
1167 q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1168 q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1169 break;
1170 case 0x9:
1171 /* RCP */
1172 break;
1173 case 0xB:
1174 /* ADD */
1175 m = ~(127 << 16);
1176 q = ((e->inst[0] & (~m)) >> 2);
1177 break;
1178 case 0xC:
1179 /* MUL */
1180 m = ~0x00008000;
1181 q = ((e->inst[0] & (~m)) << 12);
1182 break;
1183 case 0xE:
1184 /* MAD (if src2 == dst) */
1185 q = ((e->inst[0] & 0x1fc) << 12);
1186 break;
1187 default:
1188 assert(0);
1189 break;
1190 }
1191
1192 set_long(pc, e);
1193 pc->p->exec_size++;
1194
1195 e->inst[0] &= m;
1196 e->inst[1] |= q;
1197 }
1198
1199 static boolean
1200 negate_supported(const struct tgsi_full_instruction *insn, int i)
1201 {
1202 switch (insn->Instruction.Opcode) {
1203 case TGSI_OPCODE_DP3:
1204 case TGSI_OPCODE_DP4:
1205 case TGSI_OPCODE_MUL:
1206 case TGSI_OPCODE_KIL:
1207 case TGSI_OPCODE_ADD:
1208 case TGSI_OPCODE_SUB:
1209 case TGSI_OPCODE_MAD:
1210 return TRUE;
1211 case TGSI_OPCODE_POW:
1212 return (i == 1) ? TRUE : FALSE;
1213 default:
1214 return FALSE;
1215 }
1216 }
1217
1218 /* Return a read mask for source registers deduced from opcode & write mask. */
1219 static unsigned
1220 nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1221 {
1222 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1223
1224 switch (insn->Instruction.Opcode) {
1225 case TGSI_OPCODE_COS:
1226 case TGSI_OPCODE_SIN:
1227 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1228 case TGSI_OPCODE_DP3:
1229 return 0x7;
1230 case TGSI_OPCODE_DP4:
1231 case TGSI_OPCODE_DPH:
1232 case TGSI_OPCODE_KIL: /* WriteMask ignored */
1233 return 0xf;
1234 case TGSI_OPCODE_DST:
1235 return mask & (c ? 0xa : 0x6);
1236 case TGSI_OPCODE_EX2:
1237 case TGSI_OPCODE_LG2:
1238 case TGSI_OPCODE_POW:
1239 case TGSI_OPCODE_RCP:
1240 case TGSI_OPCODE_RSQ:
1241 case TGSI_OPCODE_SCS:
1242 return 0x1;
1243 case TGSI_OPCODE_LIT:
1244 return 0xb;
1245 case TGSI_OPCODE_TEX:
1246 case TGSI_OPCODE_TXP:
1247 {
1248 const struct tgsi_instruction_ext_texture *tex;
1249
1250 assert(insn->Instruction.Extended);
1251 tex = &insn->InstructionExtTexture;
1252
1253 mask = 0x7;
1254 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1255 mask |= 0x8;
1256
1257 switch (tex->Texture) {
1258 case TGSI_TEXTURE_1D:
1259 mask &= 0x9;
1260 break;
1261 case TGSI_TEXTURE_2D:
1262 mask &= 0xb;
1263 break;
1264 default:
1265 break;
1266 }
1267 }
1268 return mask;
1269 case TGSI_OPCODE_XPD:
1270 x = 0;
1271 if (mask & 1) x |= 0x6;
1272 if (mask & 2) x |= 0x5;
1273 if (mask & 4) x |= 0x3;
1274 return x;
1275 default:
1276 break;
1277 }
1278
1279 return mask;
1280 }
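/* Editor's worked example for nv50_tgsi_src_mask() above: for DST with a
 * full xyzw write mask, src 0 gets read mask 0x6 (y, z) and src 1 gets
 * 0xa (y, w), matching the DST expansion further down
 * (dst.y = src0.y * src1.y, dst.z = src0.z, dst.w = src1.w, dst.x = 1.0).
 */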
1281
1282 static struct nv50_reg *
1283 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1284 {
1285 switch (dst->DstRegister.File) {
1286 case TGSI_FILE_TEMPORARY:
1287 return &pc->temp[dst->DstRegister.Index * 4 + c];
1288 case TGSI_FILE_OUTPUT:
1289 return &pc->result[dst->DstRegister.Index * 4 + c];
1290 case TGSI_FILE_NULL:
1291 return NULL;
1292 default:
1293 break;
1294 }
1295
1296 return NULL;
1297 }
1298
1299 static struct nv50_reg *
1300 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1301 boolean neg)
1302 {
1303 struct nv50_reg *r = NULL;
1304 struct nv50_reg *temp;
1305 unsigned sgn, c;
1306
1307 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1308
1309 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1310 switch (c) {
1311 case TGSI_EXTSWIZZLE_X:
1312 case TGSI_EXTSWIZZLE_Y:
1313 case TGSI_EXTSWIZZLE_Z:
1314 case TGSI_EXTSWIZZLE_W:
1315 switch (src->SrcRegister.File) {
1316 case TGSI_FILE_INPUT:
1317 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1318 break;
1319 case TGSI_FILE_TEMPORARY:
1320 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1321 break;
1322 case TGSI_FILE_CONSTANT:
1323 r = &pc->param[src->SrcRegister.Index * 4 + c];
1324 break;
1325 case TGSI_FILE_IMMEDIATE:
1326 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1327 break;
1328 case TGSI_FILE_SAMPLER:
1329 break;
1330 default:
1331 assert(0);
1332 break;
1333 }
1334 break;
1335 case TGSI_EXTSWIZZLE_ZERO:
1336 r = alloc_immd(pc, 0.0);
1337 return r;
1338 case TGSI_EXTSWIZZLE_ONE:
1339 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1340 return alloc_immd(pc, -1.0);
1341 return alloc_immd(pc, 1.0);
1342 default:
1343 assert(0);
1344 break;
1345 }
1346
1347 switch (sgn) {
1348 case TGSI_UTIL_SIGN_KEEP:
1349 break;
1350 case TGSI_UTIL_SIGN_CLEAR:
1351 temp = temp_temp(pc);
1352 emit_abs(pc, temp, r);
1353 r = temp;
1354 break;
1355 case TGSI_UTIL_SIGN_TOGGLE:
1356 if (neg)
1357 r->neg = 1;
1358 else {
1359 temp = temp_temp(pc);
1360 emit_neg(pc, temp, r);
1361 r = temp;
1362 }
1363 break;
1364 case TGSI_UTIL_SIGN_SET:
1365 temp = temp_temp(pc);
1366 emit_abs(pc, temp, r);
1367 if (neg)
1368 temp->neg = 1;
1369 else
1370 emit_neg(pc, temp, temp);
1371 r = temp;
1372 break;
1373 default:
1374 assert(0);
1375 break;
1376 }
1377
1378 return r;
1379 }
1380
1381 /* return TRUE for ops that produce only a single result */
1382 static boolean
1383 is_scalar_op(unsigned op)
1384 {
1385 switch (op) {
1386 case TGSI_OPCODE_COS:
1387 case TGSI_OPCODE_DP2:
1388 case TGSI_OPCODE_DP3:
1389 case TGSI_OPCODE_DP4:
1390 case TGSI_OPCODE_DPH:
1391 case TGSI_OPCODE_EX2:
1392 case TGSI_OPCODE_LG2:
1393 case TGSI_OPCODE_POW:
1394 case TGSI_OPCODE_RCP:
1395 case TGSI_OPCODE_RSQ:
1396 case TGSI_OPCODE_SIN:
1397 /*
1398 case TGSI_OPCODE_KIL:
1399 case TGSI_OPCODE_LIT:
1400 case TGSI_OPCODE_SCS:
1401 */
1402 return TRUE;
1403 default:
1404 return FALSE;
1405 }
1406 }
1407
1408 /* Returns a bitmask indicating which dst components depend
1409 * on source s, component c (reverse of nv50_tgsi_src_mask).
1410 */
1411 static unsigned
1412 nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1413 {
1414 if (is_scalar_op(op))
1415 return 0x1;
1416
1417 switch (op) {
1418 case TGSI_OPCODE_DST:
1419 return (1 << c) & (s ? 0xa : 0x6);
1420 case TGSI_OPCODE_XPD:
1421 switch (c) {
1422 case 0: return 0x6;
1423 case 1: return 0x5;
1424 case 2: return 0x3;
1425 case 3: return 0x0;
1426 default:
1427 assert(0);
1428 return 0x0;
1429 }
1430 case TGSI_OPCODE_LIT:
1431 case TGSI_OPCODE_SCS:
1432 case TGSI_OPCODE_TEX:
1433 case TGSI_OPCODE_TXP:
1434 /* these take care of dangerous swizzles themselves */
1435 return 0x0;
1436 case TGSI_OPCODE_IF:
1437 case TGSI_OPCODE_KIL:
1438 /* don't call this function for these ops */
1439 assert(0);
1440 return 0;
1441 default:
1442 /* linear vector instruction */
1443 return (1 << c);
1444 }
1445 }
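/* Editor's worked example: nv50_tgsi_dst_revdep(TGSI_OPCODE_XPD, s, 0)
 * returns 0x6, because the y and z components of a cross product are the
 * ones that read source component x; this is the reverse of the XPD case
 * in nv50_tgsi_src_mask() above.
 */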
1446
1447 static boolean
1448 nv50_program_tx_insn(struct nv50_pc *pc,
1449 const struct tgsi_full_instruction *inst)
1450 {
1451 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1452 unsigned mask, sat, unit;
1453 int i, c;
1454
1455 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1456 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1457
1458 memset(src, 0, sizeof(src));
1459
1460 for (c = 0; c < 4; c++) {
1461 if ((mask & (1 << c)) && !pc->r_dst[c])
1462 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1463 else
1464 dst[c] = pc->r_dst[c];
1465 rdst[c] = dst[c];
1466 }
1467
1468 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1469 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1470 unsigned src_mask;
1471 boolean neg_supp;
1472
1473 src_mask = nv50_tgsi_src_mask(inst, i);
1474 neg_supp = negate_supported(inst, i);
1475
1476 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1477 unit = fs->SrcRegister.Index;
1478
1479 for (c = 0; c < 4; c++)
1480 if (src_mask & (1 << c))
1481 src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1482 }
1483
1484 brdc = temp = pc->r_brdc;
1485 if (brdc && brdc->type != P_TEMP) {
1486 temp = temp_temp(pc);
1487 if (sat)
1488 brdc = temp;
1489 } else
1490 if (sat) {
1491 for (c = 0; c < 4; c++) {
1492 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1493 continue;
1494 rdst[c] = dst[c];
1495 dst[c] = temp_temp(pc);
1496 }
1497 }
1498
1499 assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1500
1501 switch (inst->Instruction.Opcode) {
1502 case TGSI_OPCODE_ABS:
1503 for (c = 0; c < 4; c++) {
1504 if (!(mask & (1 << c)))
1505 continue;
1506 emit_abs(pc, dst[c], src[0][c]);
1507 }
1508 break;
1509 case TGSI_OPCODE_ADD:
1510 for (c = 0; c < 4; c++) {
1511 if (!(mask & (1 << c)))
1512 continue;
1513 emit_add(pc, dst[c], src[0][c], src[1][c]);
1514 }
1515 break;
1516 case TGSI_OPCODE_CEIL:
1517 for (c = 0; c < 4; c++) {
1518 if (!(mask & (1 << c)))
1519 continue;
1520 emit_cvt(pc, dst[c], src[0][c], -1,
1521 CVTOP_CEIL, CVT_F32_F32);
1522 }
1523 break;
1524 case TGSI_OPCODE_COS:
1525 if (mask & 8) {
1526 emit_precossin(pc, temp, src[0][3]);
1527 emit_flop(pc, 5, dst[3], temp);
1528 if (!(mask &= 7))
1529 break;
1530 if (temp == dst[3])
1531 temp = brdc = temp_temp(pc);
1532 }
1533 emit_precossin(pc, temp, src[0][0]);
1534 emit_flop(pc, 5, brdc, temp);
1535 break;
1536 case TGSI_OPCODE_DP3:
1537 emit_mul(pc, temp, src[0][0], src[1][0]);
1538 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1539 emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1540 break;
1541 case TGSI_OPCODE_DP4:
1542 emit_mul(pc, temp, src[0][0], src[1][0]);
1543 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1544 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1545 emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1546 break;
1547 case TGSI_OPCODE_DPH:
1548 emit_mul(pc, temp, src[0][0], src[1][0]);
1549 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1550 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1551 emit_add(pc, brdc, src[1][3], temp);
1552 break;
1553 case TGSI_OPCODE_DST:
1554 if (mask & (1 << 1))
1555 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1556 if (mask & (1 << 2))
1557 emit_mov(pc, dst[2], src[0][2]);
1558 if (mask & (1 << 3))
1559 emit_mov(pc, dst[3], src[1][3]);
1560 if (mask & (1 << 0))
1561 emit_mov_immdval(pc, dst[0], 1.0f);
1562 break;
1563 case TGSI_OPCODE_EX2:
1564 emit_preex2(pc, temp, src[0][0]);
1565 emit_flop(pc, 6, brdc, temp);
1566 break;
1567 case TGSI_OPCODE_FLR:
1568 for (c = 0; c < 4; c++) {
1569 if (!(mask & (1 << c)))
1570 continue;
1571 emit_flr(pc, dst[c], src[0][c]);
1572 }
1573 break;
1574 case TGSI_OPCODE_FRC:
1575 temp = temp_temp(pc);
1576 for (c = 0; c < 4; c++) {
1577 if (!(mask & (1 << c)))
1578 continue;
1579 emit_flr(pc, temp, src[0][c]);
1580 emit_sub(pc, dst[c], src[0][c], temp);
1581 }
1582 break;
1583 case TGSI_OPCODE_KIL:
1584 emit_kil(pc, src[0][0]);
1585 emit_kil(pc, src[0][1]);
1586 emit_kil(pc, src[0][2]);
1587 emit_kil(pc, src[0][3]);
1588 break;
1589 case TGSI_OPCODE_LIT:
1590 emit_lit(pc, &dst[0], mask, &src[0][0]);
1591 break;
1592 case TGSI_OPCODE_LG2:
1593 emit_flop(pc, 3, brdc, src[0][0]);
1594 break;
1595 case TGSI_OPCODE_LRP:
1596 temp = temp_temp(pc);
1597 for (c = 0; c < 4; c++) {
1598 if (!(mask & (1 << c)))
1599 continue;
1600 emit_sub(pc, temp, src[1][c], src[2][c]);
1601 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1602 }
1603 break;
1604 case TGSI_OPCODE_MAD:
1605 for (c = 0; c < 4; c++) {
1606 if (!(mask & (1 << c)))
1607 continue;
1608 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1609 }
1610 break;
1611 case TGSI_OPCODE_MAX:
1612 for (c = 0; c < 4; c++) {
1613 if (!(mask & (1 << c)))
1614 continue;
1615 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1616 }
1617 break;
1618 case TGSI_OPCODE_MIN:
1619 for (c = 0; c < 4; c++) {
1620 if (!(mask & (1 << c)))
1621 continue;
1622 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1623 }
1624 break;
1625 case TGSI_OPCODE_MOV:
1626 case TGSI_OPCODE_SWZ:
1627 for (c = 0; c < 4; c++) {
1628 if (!(mask & (1 << c)))
1629 continue;
1630 emit_mov(pc, dst[c], src[0][c]);
1631 }
1632 break;
1633 case TGSI_OPCODE_MUL:
1634 for (c = 0; c < 4; c++) {
1635 if (!(mask & (1 << c)))
1636 continue;
1637 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1638 }
1639 break;
1640 case TGSI_OPCODE_POW:
1641 emit_pow(pc, brdc, src[0][0], src[1][0]);
1642 break;
1643 case TGSI_OPCODE_RCP:
1644 emit_flop(pc, 0, brdc, src[0][0]);
1645 break;
1646 case TGSI_OPCODE_RSQ:
1647 emit_flop(pc, 2, brdc, src[0][0]);
1648 break;
1649 case TGSI_OPCODE_SCS:
1650 temp = temp_temp(pc);
1651 if (mask & 3)
1652 emit_precossin(pc, temp, src[0][0]);
1653 if (mask & (1 << 0))
1654 emit_flop(pc, 5, dst[0], temp);
1655 if (mask & (1 << 1))
1656 emit_flop(pc, 4, dst[1], temp);
1657 if (mask & (1 << 2))
1658 emit_mov_immdval(pc, dst[2], 0.0);
1659 if (mask & (1 << 3))
1660 emit_mov_immdval(pc, dst[3], 1.0);
1661 break;
1662 case TGSI_OPCODE_SIN:
1663 if (mask & 8) {
1664 emit_precossin(pc, temp, src[0][3]);
1665 emit_flop(pc, 4, dst[3], temp);
1666 if (!(mask &= 7))
1667 break;
1668 if (temp == dst[3])
1669 temp = brdc = temp_temp(pc);
1670 }
1671 emit_precossin(pc, temp, src[0][0]);
1672 emit_flop(pc, 4, brdc, temp);
1673 break;
1674 case TGSI_OPCODE_SLT:
1675 case TGSI_OPCODE_SGE:
1676 case TGSI_OPCODE_SEQ:
1677 case TGSI_OPCODE_SGT:
1678 case TGSI_OPCODE_SLE:
1679 case TGSI_OPCODE_SNE:
1680 i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1681 for (c = 0; c < 4; c++) {
1682 if (!(mask & (1 << c)))
1683 continue;
1684 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1685 }
1686 break;
1687 case TGSI_OPCODE_SUB:
1688 for (c = 0; c < 4; c++) {
1689 if (!(mask & (1 << c)))
1690 continue;
1691 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1692 }
1693 break;
1694 case TGSI_OPCODE_TEX:
1695 emit_tex(pc, dst, mask, src[0], unit,
1696 inst->InstructionExtTexture.Texture, FALSE);
1697 break;
1698 case TGSI_OPCODE_TXP:
1699 emit_tex(pc, dst, mask, src[0], unit,
1700 inst->InstructionExtTexture.Texture, TRUE);
1701 break;
1702 case TGSI_OPCODE_TRUNC:
1703 for (c = 0; c < 4; c++) {
1704 if (!(mask & (1 << c)))
1705 continue;
1706 emit_cvt(pc, dst[c], src[0][c], -1,
1707 CVTOP_TRUNC, CVT_F32_F32);
1708 }
1709 break;
1710 case TGSI_OPCODE_XPD:
1711 temp = temp_temp(pc);
1712 if (mask & (1 << 0)) {
1713 emit_mul(pc, temp, src[0][2], src[1][1]);
1714 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1715 }
1716 if (mask & (1 << 1)) {
1717 emit_mul(pc, temp, src[0][0], src[1][2]);
1718 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1719 }
1720 if (mask & (1 << 2)) {
1721 emit_mul(pc, temp, src[0][1], src[1][0]);
1722 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1723 }
1724 if (mask & (1 << 3))
1725 emit_mov_immdval(pc, dst[3], 1.0);
1726 break;
1727 case TGSI_OPCODE_END:
1728 break;
1729 default:
1730 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1731 return FALSE;
1732 }
1733
1734 if (brdc) {
1735 if (sat)
1736 emit_sat(pc, brdc, brdc);
1737 for (c = 0; c < 4; c++)
1738 if ((mask & (1 << c)) && dst[c] != brdc)
1739 emit_mov(pc, dst[c], brdc);
1740 } else
1741 if (sat) {
1742 for (c = 0; c < 4; c++) {
1743 if (!(mask & (1 << c)))
1744 continue;
1745 /* in this case we saturate later */
1746 if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1747 continue;
1748 emit_sat(pc, rdst[c], dst[c]);
1749 }
1750 }
1751
1752 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1753 for (c = 0; c < 4; c++) {
1754 if (!src[i][c])
1755 continue;
1756 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1757 FREE(src[i][c]);
1758 }
1759 }
1760
1761 kill_temp_temp(pc);
1762 return TRUE;
1763 }
1764
1765 static void
1766 prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1767 {
1768 struct nv50_reg *reg = NULL;
1769 const struct tgsi_full_src_register *src;
1770 const struct tgsi_dst_register *dst;
1771 unsigned i, c, k, mask;
1772
1773 dst = &insn->FullDstRegisters[0].DstRegister;
1774 mask = dst->WriteMask;
1775
1776 if (dst->File == TGSI_FILE_TEMPORARY)
1777 reg = pc->temp;
1778 else
1779 if (dst->File == TGSI_FILE_OUTPUT)
1780 reg = pc->result;
1781
1782 if (reg) {
1783 for (c = 0; c < 4; c++) {
1784 if (!(mask & (1 << c)))
1785 continue;
1786 reg[dst->Index * 4 + c].acc = pc->insn_nr;
1787 }
1788 }
1789
1790 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1791 src = &insn->FullSrcRegisters[i];
1792
1793 if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
1794 reg = pc->temp;
1795 else
1796 if (src->SrcRegister.File == TGSI_FILE_INPUT)
1797 reg = pc->attr;
1798 else
1799 continue;
1800
1801 mask = nv50_tgsi_src_mask(insn, i);
1802
1803 for (c = 0; c < 4; c++) {
1804 if (!(mask & (1 << c)))
1805 continue;
1806 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1807
1808 if (k > TGSI_EXTSWIZZLE_W)
1809 continue;
1810
1811 reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
1812 }
1813 }
1814 }
1815
1816 /* Returns a bitmask indicating which dst components need to be
1817 * written to temporaries first to avoid 'corrupting' sources.
1818 *
1819 * m[i] (out) indicates the component to write in the i-th position
1820 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
1821 */
1822 static unsigned
1823 nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
1824 {
1825 unsigned i, c, x, unsafe = 0;
1826
1827 for (c = 0; c < 4; c++)
1828 m[c] = c;
1829
1830 /* Swap as long as a dst component written earlier is depended on
1831 * by one written later, but the next one isn't depended on by it.
1832 */
1833 for (c = 0; c < 3; c++) {
1834 if (rdep[m[c + 1]] & (1 << m[c]))
1835 continue; /* if next one is depended on by us */
1836 for (i = c + 1; i < 4; i++)
1837 /* if we are depended on by a later one */
1838 if (rdep[m[c]] & (1 << m[i]))
1839 break;
1840 if (i == 4)
1841 continue;
1842 /* now, swap */
1843 x = m[c];
1844 m[c] = m[c + 1];
1845 m[c + 1] = x;
1846
1847 /* restart */
1848 c = 0;
1849 }
1850
1851 /* mark dependencies that could not be resolved by reordering */
1852 for (i = 0; i < 3; ++i)
1853 for (c = i + 1; c < 4; ++c)
1854 if (rdep[m[i]] & (1 << m[c]))
1855 unsafe |= (1 << i);
1856
1857 /* NOTE: $unsafe is with respect to order, not component */
1858 return unsafe;
1859 }
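/* Editor's worked example (a hand trace of the loops above): with
 * rdep = { 0x2, 0, 0, 0 }, i.e. only dst.y reads the old value of dst.x,
 * the first pass swaps the leading slots so that m = { 1, 0, 2, 3 };
 * component y is then emitted before x is overwritten and unsafe stays 0,
 * so no extra TEMPs are needed.
 */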
1860
1861 /* Select a suitable dst register for broadcasting scalar results,
1862 * or return NULL if we have to allocate an extra TEMP.
1863 *
1864 * If e.g. only 1 component is written, we may also emit the final
1865 * result to a write-only register.
1866 */
1867 static struct nv50_reg *
1868 tgsi_broadcast_dst(struct nv50_pc *pc,
1869 const struct tgsi_full_dst_register *fd, unsigned mask)
1870 {
1871 if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1872 int c = ffs(~mask & fd->DstRegister.WriteMask);
1873 if (c)
1874 return tgsi_dst(pc, c - 1, fd);
1875 } else {
1876 int c = ffs(fd->DstRegister.WriteMask) - 1;
1877 if ((1 << c) == fd->DstRegister.WriteMask)
1878 return tgsi_dst(pc, c, fd);
1879 }
1880
1881 return NULL;
1882 }
1883
1884 /* Scan source swizzles and return a bitmask indicating dst regs that
1885 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
1886 */
1887 static unsigned
1888 nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
1889 unsigned rdep[4])
1890 {
1891 const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
1892 const struct tgsi_full_src_register *fs;
1893 unsigned i, deqs = 0;
1894
1895 for (i = 0; i < 4; ++i)
1896 rdep[i] = 0;
1897
1898 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1899 unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
1900 boolean neg_supp = negate_supported(insn, i);
1901
1902 fs = &insn->FullSrcRegisters[i];
1903 if (fs->SrcRegister.File != fd->DstRegister.File ||
1904 fs->SrcRegister.Index != fd->DstRegister.Index)
1905 continue;
1906
1907 for (chn = 0; chn < 4; ++chn) {
1908 unsigned s, c;
1909
1910 if (!(mask & (1 << chn))) /* src is not read */
1911 continue;
1912 c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
1913 s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
1914
1915 if (c > TGSI_EXTSWIZZLE_W ||
1916 !(fd->DstRegister.WriteMask & (1 << c)))
1917 continue;
1918
1919 /* no danger if src is copied to TEMP first */
1920 if ((s != TGSI_UTIL_SIGN_KEEP) &&
1921 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
1922 continue;
1923
1924 rdep[c] |= nv50_tgsi_dst_revdep(
1925 insn->Instruction.Opcode, i, chn);
1926 deqs |= (1 << c);
1927 }
1928 }
1929
1930 return deqs;
1931 }
1932
1933 static boolean
1934 nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1935 {
1936 struct tgsi_full_instruction insn = tok->FullInstruction;
1937 const struct tgsi_full_dst_register *fd;
1938 unsigned i, deqs, rdep[4], m[4];
1939
1940 fd = &tok->FullInstruction.FullDstRegisters[0];
1941 deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
1942
1943 if (is_scalar_op(insn.Instruction.Opcode)) {
1944 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
1945 if (!pc->r_brdc)
1946 pc->r_brdc = temp_temp(pc);
1947 return nv50_program_tx_insn(pc, &insn);
1948 }
1949 pc->r_brdc = NULL;
1950
1951 if (!deqs)
1952 return nv50_program_tx_insn(pc, &insn);
1953
1954 deqs = nv50_revdep_reorder(m, rdep);
1955
1956 for (i = 0; i < 4; ++i) {
1957 assert(pc->r_dst[m[i]] == NULL);
1958
1959 insn.FullDstRegisters[0].DstRegister.WriteMask =
1960 fd->DstRegister.WriteMask & (1 << m[i]);
1961
1962 if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
1963 continue;
1964
1965 if (deqs & (1 << i))
1966 pc->r_dst[m[i]] = alloc_temp(pc, NULL);
1967
1968 if (!nv50_program_tx_insn(pc, &insn))
1969 return FALSE;
1970 }
1971
1972 for (i = 0; i < 4; i++) {
1973 struct nv50_reg *reg = pc->r_dst[i];
1974 if (!reg)
1975 continue;
1976 pc->r_dst[i] = NULL;
1977
1978 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
1979 emit_sat(pc, tgsi_dst(pc, i, fd), reg);
1980 else
1981 emit_mov(pc, tgsi_dst(pc, i, fd), reg);
1982 free_temp(pc, reg);
1983 }
1984
1985 return TRUE;
1986 }
1987
1988 static void
1989 load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
1990 {
1991 struct nv50_reg *iv, **ppiv;
1992 unsigned mode = pc->interp_mode[reg->index];
1993
1994 ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
1995 iv = *ppiv;
1996
1997 if ((mode & INTERP_PERSPECTIVE) && !iv) {
1998 iv = *ppiv = alloc_temp(pc, NULL);
1999 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2000
2001 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2002 emit_flop(pc, 0, iv, iv);
2003
2004 /* XXX: when loading interpolants dynamically, move these
2005 * to the program head, or make sure it can't be skipped.
2006 */
2007 }
2008
2009 emit_interp(pc, reg, iv, mode);
2010 }
2011
2012 static boolean
2013 nv50_program_tx_prep(struct nv50_pc *pc)
2014 {
2015 struct tgsi_parse_context tp;
2016 struct nv50_program *p = pc->p;
2017 boolean ret = FALSE;
2018 unsigned i, c, flat_nr = 0;
2019
2020 tgsi_parse_init(&tp, pc->p->pipe.tokens);
2021 while (!tgsi_parse_end_of_tokens(&tp)) {
2022 const union tgsi_full_token *tok = &tp.FullToken;
2023
2024 tgsi_parse_token(&tp);
2025 switch (tok->Token.Type) {
2026 case TGSI_TOKEN_TYPE_IMMEDIATE:
2027 {
2028 const struct tgsi_full_immediate *imm =
2029 &tp.FullToken.FullImmediate;
2030
2031 ctor_immd(pc, imm->u[0].Float,
2032 imm->u[1].Float,
2033 imm->u[2].Float,
2034 imm->u[3].Float);
2035 }
2036 break;
2037 case TGSI_TOKEN_TYPE_DECLARATION:
2038 {
2039 const struct tgsi_full_declaration *d;
2040 unsigned si, last, first, mode;
2041
2042 d = &tp.FullToken.FullDeclaration;
2043 first = d->DeclarationRange.First;
2044 last = d->DeclarationRange.Last;
2045
2046 switch (d->Declaration.File) {
2047 case TGSI_FILE_TEMPORARY:
2048 break;
2049 case TGSI_FILE_OUTPUT:
2050 if (!d->Declaration.Semantic ||
2051 p->type == PIPE_SHADER_FRAGMENT)
2052 break;
2053
2054 si = d->Semantic.SemanticIndex;
2055 switch (d->Semantic.SemanticName) {
2056 case TGSI_SEMANTIC_BCOLOR:
2057 p->cfg.two_side[si].hw = first;
2058 if (p->cfg.io_nr > first)
2059 p->cfg.io_nr = first;
2060 break;
2061 case TGSI_SEMANTIC_PSIZE:
2062 p->cfg.psiz = first;
2063 if (p->cfg.io_nr > first)
2064 p->cfg.io_nr = first;
2065 break;
2066 /*
2067 case TGSI_SEMANTIC_CLIP_DISTANCE:
2068 p->cfg.clpd = MIN2(p->cfg.clpd, first);
2069 break;
2070 */
2071 default:
2072 break;
2073 }
2074 break;
2075 case TGSI_FILE_INPUT:
2076 {
2077 if (p->type != PIPE_SHADER_FRAGMENT)
2078 break;
2079
2080 switch (d->Declaration.Interpolate) {
2081 case TGSI_INTERPOLATE_CONSTANT:
2082 mode = INTERP_FLAT;
2083 flat_nr++;
2084 break;
2085 case TGSI_INTERPOLATE_PERSPECTIVE:
2086 mode = INTERP_PERSPECTIVE;
2087 p->cfg.regs[1] |= 0x08 << 24;
2088 break;
2089 default:
2090 mode = INTERP_LINEAR;
2091 break;
2092 }
2093 if (d->Declaration.Centroid)
2094 mode |= INTERP_CENTROID;
2095
2096 assert(last < 32);
2097 for (i = first; i <= last; i++)
2098 pc->interp_mode[i] = mode;
2099 }
2100 break;
2101 case TGSI_FILE_CONSTANT:
2102 break;
2103 case TGSI_FILE_SAMPLER:
2104 break;
2105 default:
2106 NOUVEAU_ERR("bad decl file %d\n",
2107 d->Declaration.File);
2108 goto out_err;
2109 }
2110 }
2111 break;
2112 case TGSI_TOKEN_TYPE_INSTRUCTION:
2113 pc->insn_nr++;
2114 prep_inspect_insn(pc, &tok->FullInstruction);
2115 break;
2116 default:
2117 break;
2118 }
2119 }
2120
2121 if (p->type == PIPE_SHADER_VERTEX) {
2122 int rid = 0;
2123
2124 for (i = 0; i < pc->attr_nr * 4; ++i) {
2125 if (pc->attr[i].acc) {
2126 pc->attr[i].hw = rid++;
2127 p->cfg.attr[i / 32] |= 1 << (i % 32);
2128 }
2129 }
2130
2131 for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2132 p->cfg.io[i].hw = rid;
2133 p->cfg.io[i].id_vp = i;
2134
2135 for (c = 0; c < 4; ++c) {
2136 int n = i * 4 + c;
2137 if (!pc->result[n].acc)
2138 continue;
2139 pc->result[n].hw = rid++;
2140 p->cfg.io[i].mask |= 1 << c;
2141 }
2142 }
2143
2144 for (c = 0; c < 2; ++c)
2145 if (p->cfg.two_side[c].hw < 0x40)
2146 p->cfg.two_side[c] = p->cfg.io[
2147 p->cfg.two_side[c].hw];
2148
2149 if (p->cfg.psiz < 0x40)
2150 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2151 } else
2152 if (p->type == PIPE_SHADER_FRAGMENT) {
2153 int rid, aid;
2154 unsigned n = 0, m = pc->attr_nr - flat_nr;
2155
2156 int base = (TGSI_SEMANTIC_POSITION ==
2157 p->info.input_semantic_name[0]) ? 0 : 1;
2158
2159 /* non-flat interpolants have to be mapped to
2160 * the lower hardware IDs, so sort them:
2161 */
2162 for (i = 0; i < pc->attr_nr; i++) {
2163 if (pc->interp_mode[i] == INTERP_FLAT) {
2164 p->cfg.io[m].id_vp = i + base;
2165 p->cfg.io[m++].id_fp = i;
2166 } else {
2167 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2168 p->cfg.io[n].linear = TRUE;
2169 p->cfg.io[n].id_vp = i + base;
2170 p->cfg.io[n++].id_fp = i;
2171 }
2172 }
2173
2174 if (!base) /* set w-coordinate mask from perspective interp */
2175 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2176
2177 aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2178 base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2179
2180 for (n = 0; n < pc->attr_nr; ++n) {
2181 p->cfg.io[n].hw = rid = aid;
2182 i = p->cfg.io[n].id_fp;
2183
2184 for (c = 0; c < 4; ++c) {
2185 if (!pc->attr[i * 4 + c].acc)
2186 continue;
2187 pc->attr[i * 4 + c].rhw = rid++;
2188 p->cfg.io[n].mask |= 1 << c;
2189
2190 load_interpolant(pc, &pc->attr[i * 4 + c]);
2191 }
2192 aid += popcnt4(p->cfg.io[n].mask);
2193 }
2194
2195 if (!base)
2196 p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2197
2198 m = popcnt4(p->cfg.regs[1] >> 24);
2199
2200 /* set count of non-position inputs and of non-flat
2201 * non-position inputs for FP_INTERPOLANT_CTRL
2202 */
2203 p->cfg.regs[1] |= aid - m;
2204
2205 if (flat_nr) {
2206 i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2207 p->cfg.regs[1] |= (i - m) << 16;
2208 } else
2209 p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2210
2211 /* mark color semantic for light-twoside */
2212 n = 0x40;
2213 for (i = 0; i < pc->attr_nr; i++) {
2214 ubyte si, sn;
2215
2216 sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2217 si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2218
2219 if (sn == TGSI_SEMANTIC_COLOR) {
2220 p->cfg.two_side[si] = p->cfg.io[i];
2221
2222 /* increase colour count */
2223 p->cfg.regs[0] += popcnt4(
2224 p->cfg.two_side[si].mask) << 16;
2225
2226 n = MIN2(n, p->cfg.io[i].hw - m);
2227 }
2228 }
2229 if (n < 0x40)
2230 p->cfg.regs[0] += n;
2231
2232 /* Initialize FP results:
2233 * FragDepth is always first TGSI and last hw output
2234 */
2235 i = p->info.writes_z ? 4 : 0;
2236 for (rid = 0; i < pc->result_nr * 4; i++)
2237 pc->result[i].rhw = rid++;
2238 if (p->info.writes_z)
2239 pc->result[2].rhw = rid;
2240 }
2241
2242 if (pc->immd_nr) {
2243 int rid = 0;
2244
2245 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2246 if (!pc->immd)
2247 goto out_err;
2248
2249 for (i = 0; i < pc->immd_nr; i++) {
2250 for (c = 0; c < 4; c++, rid++)
2251 ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2252 }
2253 }
2254
2255 ret = TRUE;
2256 out_err:
2257 if (pc->iv_p)
2258 free_temp(pc, pc->iv_p);
2259 if (pc->iv_c)
2260 free_temp(pc, pc->iv_c);
2261
2262 tgsi_parse_free(&tp);
2263 return ret;
2264 }
2265
2266 static void
2267 free_nv50_pc(struct nv50_pc *pc)
2268 {
2269 if (pc->immd)
2270 FREE(pc->immd);
2271 if (pc->param)
2272 FREE(pc->param);
2273 if (pc->result)
2274 FREE(pc->result);
2275 if (pc->attr)
2276 FREE(pc->attr);
2277 if (pc->temp)
2278 FREE(pc->temp);
2279
2280 FREE(pc);
2281 }
2282
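/* Allocate the per-file nv50_reg arrays (TEMP/INPUT/OUTPUT/CONST) sized from
 * the TGSI scan info and seed the shader-type specific defaults: VP inputs
 * and outputs live in the ATTR/RESULT files, while the FP treats both as
 * plain temporaries; 0x40 is the "unassigned" sentinel for two_side, psiz
 * and clpd.
 */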
2283 static boolean
2284 ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2285 {
2286 int i, c;
2287 unsigned rtype[2] = { P_ATTR, P_RESULT };
2288
2289 pc->p = p;
2290 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2291 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2292 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2293 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2294
2295 p->cfg.high_temp = 4;
2296
2297 p->cfg.two_side[0].hw = 0x40;
2298 p->cfg.two_side[1].hw = 0x40;
2299
2300 switch (p->type) {
2301 case PIPE_SHADER_VERTEX:
2302 p->cfg.psiz = 0x40;
2303 p->cfg.clpd = 0x40;
2304 p->cfg.io_nr = pc->result_nr;
2305 break;
2306 case PIPE_SHADER_FRAGMENT:
2307 rtype[0] = rtype[1] = P_TEMP;
2308
2309 p->cfg.regs[0] = 0x01000004;
2310 p->cfg.io_nr = pc->attr_nr;
2311
2312 if (p->info.writes_z) {
2313 p->cfg.regs[2] |= 0x00000100;
2314 p->cfg.regs[3] |= 0x00000011;
2315 }
2316 if (p->info.uses_kill)
2317 p->cfg.regs[2] |= 0x00100000;
2318 break;
2319 }
2320
2321 if (pc->temp_nr) {
2322 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2323 if (!pc->temp)
2324 return FALSE;
2325
2326 for (i = 0; i < pc->temp_nr * 4; ++i)
2327 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2328 }
2329
2330 if (pc->attr_nr) {
2331 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2332 if (!pc->attr)
2333 return FALSE;
2334
2335 for (i = 0; i < pc->attr_nr * 4; ++i)
2336 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2337 }
2338
2339 if (pc->result_nr) {
2340 unsigned nr = pc->result_nr * 4;
2341
2342 pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2343 if (!pc->result)
2344 return FALSE;
2345
2346 for (i = 0; i < nr; ++i)
2347 ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2348 }
2349
2350 if (pc->param_nr) {
2351 int rid = 0;
2352
2353 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2354 if (!pc->param)
2355 return FALSE;
2356
2357 for (i = 0; i < pc->param_nr; ++i)
2358 for (c = 0; c < 4; ++c, ++rid)
2359 ctor_reg(&pc->param[rid], P_CONST, i, rid);
2360 }
2361
2362 return TRUE;
2363 }
2364
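/* Top-level translation: construct the pc, run the prep pass, translate the
 * TGSI instructions, move FP results into their final hardware regs, pair up
 * leftover half-size instructions and mark the last instruction as the
 * program end.
 */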
2365 static boolean
2366 nv50_program_tx(struct nv50_program *p)
2367 {
2368 struct tgsi_parse_context parse;
2369 struct nv50_pc *pc;
2370 unsigned k;
2371 boolean ret;
2372
2373 pc = CALLOC_STRUCT(nv50_pc);
2374 if (!pc)
2375 return FALSE;
2376
2377 ret = ctor_nv50_pc(pc, p);
2378 if (ret == FALSE)
2379 goto out_cleanup;
2380
2381 ret = nv50_program_tx_prep(pc);
2382 if (ret == FALSE)
2383 goto out_cleanup;
2384
2385 tgsi_parse_init(&parse, pc->p->pipe.tokens);
2386 while (!tgsi_parse_end_of_tokens(&parse)) {
2387 const union tgsi_full_token *tok = &parse.FullToken;
2388
2389 /* don't allow half insn/immd on first and last instruction */
2390 pc->allow32 = TRUE;
2391 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2392 pc->allow32 = FALSE;
2393
2394 tgsi_parse_token(&parse);
2395
2396 switch (tok->Token.Type) {
2397 case TGSI_TOKEN_TYPE_INSTRUCTION:
2398 ++pc->insn_cur;
2399 ret = nv50_tgsi_insn(pc, tok);
2400 if (ret == FALSE)
2401 goto out_err;
2402 break;
2403 default:
2404 break;
2405 }
2406 }
2407
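	/* The prep pass assigned final FP output regs (rhw); if a result was
	 * computed in a different temp, emit a MOV into its slot and track
	 * the highest result register used.
	 */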
2408 if (p->type == PIPE_SHADER_FRAGMENT) {
2409 struct nv50_reg out;
2410 ctor_reg(&out, P_TEMP, -1, -1);
2411
2412 for (k = 0; k < pc->result_nr * 4; k++) {
2413 if (pc->result[k].rhw == -1)
2414 continue;
2415 if (pc->result[k].hw != pc->result[k].rhw) {
2416 out.hw = pc->result[k].rhw;
2417 emit_mov(pc, &out, &pc->result[k]);
2418 }
2419 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2420 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2421 }
2422 }
2423
2424	/* look for unpaired half insns and make them long (half insns have to come in pairs) */
2425 struct nv50_program_exec *e, *e_prev;
2426
2427 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2428 if (!is_long(e))
2429 k++;
2430
2431 if (!e->next || is_long(e->next)) {
2432 if (k & 1)
2433 convert_to_long(pc, e);
2434 k = 0;
2435 }
2436
2437 if (e->next)
2438 e_prev = e;
2439 }
2440
2441 if (!is_long(pc->p->exec_tail)) {
2442 /* this may occur if moving FP results */
2443 assert(e_prev && !is_long(e_prev));
2444 convert_to_long(pc, e_prev);
2445 convert_to_long(pc, pc->p->exec_tail);
2446 }
2447
2448 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2449 pc->p->exec_tail->inst[1] |= 0x00000001;
2450
2451 p->param_nr = pc->param_nr * 4;
2452 p->immd_nr = pc->immd_nr * 4;
2453 p->immd = pc->immd_buf;
2454
2455 out_err:
2456 tgsi_parse_free(&parse);
2457
2458 out_cleanup:
2459 free_nv50_pc(pc);
2460 return ret;
2461 }
2462
2463 static void
2464 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2465 {
2466 if (nv50_program_tx(p) == FALSE)
2467 assert(0);
2468 p->translated = TRUE;
2469 }
2470
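/* Upload 'count' words from 'map' into constant buffer 'cbuf' starting at
 * word 'start', split into bursts of at most 2047 words per CB_DATA method.
 */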
2471 static void
2472 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2473 unsigned start, unsigned count, unsigned cbuf)
2474 {
2475 struct nouveau_channel *chan = nv50->screen->base.channel;
2476 struct nouveau_grobj *tesla = nv50->screen->tesla;
2477
2478 while (count) {
2479 unsigned nr = count > 2047 ? 2047 : count;
2480
2481 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2482 OUT_RING (chan, (cbuf << 0) | (start << 8));
2483 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2484 OUT_RINGp (chan, map, nr);
2485
2486 map += nr;
2487 start += nr;
2488 count -= nr;
2489 }
2490 }
2491
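/* Make sure the program's immediates have space in the PMISC constant buffer
 * heap (evicting other programs' allocations if necessary) and upload them,
 * then copy the user constant buffer into the PVP/PFP range.
 */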
2492 static void
2493 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2494 {
2495 struct pipe_screen *pscreen = nv50->pipe.screen;
2496
2497 if (!p->data[0] && p->immd_nr) {
2498 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2499
2500 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2501 while (heap->next && heap->size < p->immd_nr) {
2502 struct nv50_program *evict = heap->next->priv;
2503 nouveau_resource_free(&evict->data[0]);
2504 }
2505
2506 if (nouveau_resource_alloc(heap, p->immd_nr, p,
2507 &p->data[0]))
2508 assert(0);
2509 }
2510
2511		/* immediates only need to be uploaded again after their heap space has been freed (evicted) and re-allocated */
2512 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2513 p->immd_nr, NV50_CB_PMISC);
2514 }
2515
2516 assert(p->param_nr <= 128);
2517
2518 if (p->param_nr) {
2519 unsigned cb;
2520 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2521 PIPE_BUFFER_USAGE_CPU_READ);
2522
2523 if (p->type == PIPE_SHADER_VERTEX)
2524 cb = NV50_CB_PVP;
2525 else
2526 cb = NV50_CB_PFP;
2527
2528 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
2529 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2530 }
2531 }
2532
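/* Upload the program code: patch the final constant-buffer offsets into any
 * instructions that reference parameters or immediates, then stream the
 * opcodes through the PUPLOAD constant buffer into the program BO.  This is
 * redone whenever the BO is new or the immediates' heap offset has changed.
 */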
2533 static void
2534 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2535 {
2536 struct nouveau_channel *chan = nv50->screen->base.channel;
2537 struct nouveau_grobj *tesla = nv50->screen->tesla;
2538 struct nv50_program_exec *e;
2539 struct nouveau_stateobj *so;
2540 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2541 unsigned start, count, *up, *ptr;
2542 boolean upload = FALSE;
2543
2544 if (!p->bo) {
2545 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2546 p->exec_size * 4, &p->bo);
2547 upload = TRUE;
2548 }
2549
2550 if (p->data[0] && p->data[0]->start != p->data_start[0])
2551 upload = TRUE;
2552
2553 if (!upload)
2554 return;
2555
2556 for (e = p->exec_head; e; e = e->next) {
2557 unsigned ei, ci, bs;
2558
2559 if (e->param.index < 0)
2560 continue;
2561 bs = (e->inst[1] >> 22) & 0x07;
2562 assert(bs < 2);
2563 ei = e->param.shift >> 5;
2564 ci = e->param.index;
2565 if (bs == 0)
2566 ci += p->data[bs]->start;
2567
2568 e->inst[ei] &= ~e->param.mask;
2569 e->inst[ei] |= (ci << e->param.shift);
2570 }
2571
2572 if (p->data[0])
2573 p->data_start[0] = p->data[0]->start;
2574
2575 #ifdef NV50_PROGRAM_DUMP
2576 NOUVEAU_ERR("-------\n");
2577 for (e = p->exec_head; e; e = e->next) {
2578 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2579 if (is_long(e))
2580 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2581 }
2582 #endif
2583
2584 up = ptr = MALLOC(p->exec_size * 4);
2585 for (e = p->exec_head; e; e = e->next) {
2586 *(ptr++) = e->inst[0];
2587 if (is_long(e))
2588 *(ptr++) = e->inst[1];
2589 }
2590
2591	so = so_new(4, 2);
2592 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2593 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2594 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2595 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2596
2597 start = 0; count = p->exec_size;
2598 while (count) {
2599 struct nouveau_channel *chan = nv50->screen->base.channel;
2600 unsigned nr;
2601
2602 so_emit(chan, so);
2603
2604 nr = MIN2(count, 2047);
2605 nr = MIN2(chan->pushbuf->remaining, nr);
2606 if (chan->pushbuf->remaining < (nr + 3)) {
2607 FIRE_RING(chan);
2608 continue;
2609 }
2610
2611 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2612 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2613 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2614 OUT_RINGp (chan, up + start, nr);
2615
2616 start += nr;
2617 count -= nr;
2618 }
2619
2620 FREE(up);
2621 so_ref(NULL, &so);
2622 }
2623
2624 void
2625 nv50_vertprog_validate(struct nv50_context *nv50)
2626 {
2627 struct nouveau_grobj *tesla = nv50->screen->tesla;
2628 struct nv50_program *p = nv50->vertprog;
2629 struct nouveau_stateobj *so;
2630
2631 if (!p->translated) {
2632 nv50_program_validate(nv50, p);
2633 if (!p->translated)
2634 assert(0);
2635 }
2636
2637 nv50_program_validate_data(nv50, p);
2638 nv50_program_validate_code(nv50, p);
2639
2640 so = so_new(13, 2);
2641 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2642 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2643 NOUVEAU_BO_HIGH, 0, 0);
2644 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2645 NOUVEAU_BO_LOW, 0, 0);
2646 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2647 so_data (so, p->cfg.attr[0]);
2648 so_data (so, p->cfg.attr[1]);
2649 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2650 so_data (so, p->cfg.high_result);
2651 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2652 so_data (so, p->cfg.high_result); //8);
2653 so_data (so, p->cfg.high_temp);
2654 so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2655 so_data (so, 0); /* program start offset */
2656 so_ref(so, &nv50->state.vertprog);
2657 so_ref(NULL, &so);
2658 }
2659
2660 void
2661 nv50_fragprog_validate(struct nv50_context *nv50)
2662 {
2663 struct nouveau_grobj *tesla = nv50->screen->tesla;
2664 struct nv50_program *p = nv50->fragprog;
2665 struct nouveau_stateobj *so;
2666
2667 if (!p->translated) {
2668 nv50_program_validate(nv50, p);
2669 if (!p->translated)
2670 assert(0);
2671 }
2672
2673 nv50_program_validate_data(nv50, p);
2674 nv50_program_validate_code(nv50, p);
2675
2676 so = so_new(64, 2);
2677 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2678 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2679 NOUVEAU_BO_HIGH, 0, 0);
2680 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2681 NOUVEAU_BO_LOW, 0, 0);
2682 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
2683 so_data (so, p->cfg.high_temp);
2684 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2685 so_data (so, p->cfg.high_result);
2686 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2687 so_data (so, p->cfg.regs[2]);
2688 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2689 so_data (so, p->cfg.regs[3]);
2690 so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2691 so_data (so, 0); /* program start offset */
2692 so_ref(so, &nv50->state.fragprog);
2693 so_ref(NULL, &so);
2694 }
2695
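/* Build the eight POINT_COORD_REPLACE_MAP words for point sprites: each
 * nibble apparently selects which point coordinate component (if any)
 * replaces the corresponding FP interpolant; 'base' is the hw slot of the
 * first non-position interpolant.
 */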
2696 static void
2697 nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
2698 {
2699 struct nv50_program *fp = nv50->fragprog;
2700 struct nv50_program *vp = nv50->vertprog;
2701 unsigned i, c, m = base;
2702
2703 /* XXX: This can't work correctly in all cases yet, we either
2704 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
2705 * to be per FP input instead of per VP output
2706 */
2707 memset(pntc, 0, 8 * sizeof(uint32_t));
2708
2709 for (i = 0; i < fp->cfg.io_nr; i++) {
2710 uint8_t sn, si;
2711 uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
2712 unsigned n = popcnt4(fp->cfg.io[i].mask);
2713
2714 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
2715 m += n;
2716 continue;
2717 }
2718
2719 sn = vp->info.input_semantic_name[j];
2720 si = vp->info.input_semantic_index[j];
2721
2722 if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
2723 ubyte mode =
2724 nv50->rasterizer->pipe.sprite_coord_mode[si];
2725
2726 if (mode == PIPE_SPRITE_COORD_NONE) {
2727 m += n;
2728 continue;
2729 }
2730 }
2731
2732 /* this is either PointCoord or replaced by sprite coords */
2733 for (c = 0; c < 4; c++) {
2734 if (!(fp->cfg.io[i].mask & (1 << c)))
2735 continue;
2736 pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
2737 ++m;
2738 }
2739 }
2740 }
2741
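/* Fill the VP result map bytes for one FP input (fpi) from the matching VP
 * output (vpo): components the VP writes get its result slot, components it
 * doesn't write get 0x40 (x/y/z) or 0x41 (w), which appear to read back as
 * constant 0.0 and 1.0.  Hypothetical example: fpi->mask 0xf, vpo->mask 0x5,
 * vpo->hw 2 produces map bytes { 2, 0x40, 3, 0x41 }.
 */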
2742 static int
2743 nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
2744 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
2745 {
2746 int c;
2747 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
2748 uint8_t *map = (uint8_t *)p_map;
2749
2750 for (c = 0; c < 4; ++c) {
2751 if (mf & 1) {
2752 if (fpi->linear == TRUE)
2753 lin[mid / 32] |= 1 << (mid % 32);
2754 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
2755 }
2756
2757 oid += mv & 1;
2758 mf >>= 1;
2759 mv >>= 1;
2760 }
2761
2762 return mid;
2763 }
2764
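/* Build the state linking VP outputs to FP inputs: the VP result map (HPOS,
 * clip distances, back colours, generic varyings, point size), the semantic
 * and colour control regs, what appears to be the linear-interpolation mask
 * (method 0x1540) and, for point sprites, the coord replace map.
 */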
2765 void
2766 nv50_linkage_validate(struct nv50_context *nv50)
2767 {
2768 struct nouveau_grobj *tesla = nv50->screen->tesla;
2769 struct nv50_program *vp = nv50->vertprog;
2770 struct nv50_program *fp = nv50->fragprog;
2771 struct nouveau_stateobj *so;
2772 struct nv50_sreg4 dummy, *vpo;
2773 int i, n, c, m = 0;
2774 uint32_t map[16], lin[4], reg[5], pcrd[8];
2775
2776 memset(map, 0, sizeof(map));
2777 memset(lin, 0, sizeof(lin));
2778
2779 reg[1] = 0x00000004; /* low and high clip distance map ids */
2780 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
2781 reg[3] = 0x00000000; /* point size map id & enable */
2782 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
2783 reg[4] = fp->cfg.regs[1]; /* interpolant info */
2784
2785 dummy.linear = FALSE;
2786 dummy.mask = 0xf; /* map all components of HPOS */
2787 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
2788
2789 dummy.mask = 0x0;
2790
2791 if (vp->cfg.clpd < 0x40) {
2792 for (c = 0; c < vp->cfg.clpd_nr; ++c)
2793 map[m++] = vp->cfg.clpd + c;
2794 reg[1] = (m << 8);
2795 }
2796
2797 reg[0] |= m << 8; /* adjust BFC0 id */
2798
2799 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
2800 if (nv50->rasterizer->pipe.light_twoside) {
2801 vpo = &vp->cfg.two_side[0];
2802
2803 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
2804 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
2805 }
2806
2807 reg[0] += m - 4; /* adjust FFC0 id */
2808 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
2809
2810 i = 0;
2811 if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
2812 i = 1;
2813 for (; i < fp->cfg.io_nr; i++) {
2814 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
2815 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
2816
2817 n = fp->cfg.io[i].id_vp;
2818 if (n >= vp->cfg.io_nr ||
2819 vp->info.output_semantic_name[n] != sn ||
2820 vp->info.output_semantic_index[n] != si)
2821 vpo = &dummy;
2822 else
2823 vpo = &vp->cfg.io[n];
2824
2825 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
2826 }
2827
2828 if (nv50->rasterizer->pipe.point_size_per_vertex) {
2829 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
2830 reg[3] = (m++ << 4) | 1;
2831 }
2832
2833 /* now fill the stateobj */
2834 so = so_new(64, 0);
2835
2836 n = (m + 3) / 4;
2837 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
2838 so_data (so, m);
2839 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
2840 so_datap (so, map, n);
2841
2842 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2843 so_datap (so, reg, 4);
2844
2845 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
2846 so_data (so, reg[4]);
2847
2848 so_method(so, tesla, 0x1540, 4);
2849 so_datap (so, lin, 4);
2850
2851 if (nv50->rasterizer->pipe.point_sprite) {
2852 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
2853
2854 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
2855 so_datap (so, pcrd, 8);
2856 }
2857
2858 so_ref(so, &nv50->state.programs);
2859 so_ref(NULL, &so);
2860 }
2861
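/* Release everything translation created: the exec list, the code BO and the
 * immediate heap allocation, and mark the program as untranslated.
 */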
2862 void
2863 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2864 {
2865 while (p->exec_head) {
2866 struct nv50_program_exec *e = p->exec_head;
2867
2868 p->exec_head = e->next;
2869 FREE(e);
2870 }
2871 p->exec_tail = NULL;
2872 p->exec_size = 0;
2873
2874 nouveau_bo_ref(NULL, &p->bo);
2875
2876 nouveau_resource_free(&p->data[0]);
2877
2878 p->translated = 0;
2879 }