r300: Zero-initialize register for NV_vertex_program
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
34 #define NV50_SU_MAX_TEMP 64
35 //#define NV50_PROGRAM_DUMP
36
37 /* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * - Fuck it off, introduce a way to negate args for ops that
41 * support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * FP attr/result assignment - how?
58 * attrib
59 * - 0x16bc maps vp output onto fp hpos
60 * - 0x16c0 maps vp output onto fp col0
61 * result
62 * - colr always 0-3
63 * - depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * - 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * - XX == FP high something
75 */
/* One scalar register component as tracked by the program translator. */
struct nv50_reg {
	/* register file the component lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;

	/* -1 for registers created by alloc_temp(), alloc_temp4() and
	 * alloc_immd(), i.e. ones that are FREE()d individually */
	int index;

	/* hardware slot.  For P_TEMP, -1 means no slot has been assigned yet
	 * (see alloc_reg()); for P_IMMD it is the offset of the value inside
	 * nv50_pc.immd_buf (see set_immd()). */
	int hw;

	/* non-zero if the value is to be negated when read as a source */
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
93 struct nv50_pc {
94 struct nv50_program *p;
95
96 /* hw resources */
97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99 /* tgsi resources */
100 struct nv50_reg *temp;
101 int temp_nr;
102 struct nv50_reg *attr;
103 int attr_nr;
104 struct nv50_reg *result;
105 int result_nr;
106 struct nv50_reg *param;
107 int param_nr;
108 struct nv50_reg *immd;
109 float *immd_buf;
110 int immd_nr;
111
112 struct nv50_reg *temp_temp[16];
113 unsigned temp_temp_nr;
114
115 unsigned interp_mode[32];
116 /* perspective interpolation registers */
117 struct nv50_reg *iv_p;
118 struct nv50_reg *iv_c;
119
120 /* current instruction and total number of insns */
121 unsigned insn_cur;
122 unsigned insn_nr;
123
124 boolean allow32;
125 };
126
127 static void
128 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129 {
130 int i = 0;
131
132 if (reg->type == P_RESULT) {
133 if (pc->p->cfg.high_result < (reg->hw + 1))
134 pc->p->cfg.high_result = reg->hw + 1;
135 }
136
137 if (reg->type != P_TEMP)
138 return;
139
140 if (reg->hw >= 0) {
141 /*XXX: do this here too to catch FP temp-as-attr usage..
142 * not clean, but works */
143 if (pc->p->cfg.high_temp < (reg->hw + 1))
144 pc->p->cfg.high_temp = reg->hw + 1;
145 return;
146 }
147
148 if (reg->rhw != -1) {
149 /* try to allocate temporary with index rhw first */
150 if (!(pc->r_temp[reg->rhw])) {
151 pc->r_temp[reg->rhw] = reg;
152 reg->hw = reg->rhw;
153 if (pc->p->cfg.high_temp < (reg->rhw + 1))
154 pc->p->cfg.high_temp = reg->rhw + 1;
155 return;
156 }
157 /* make sure we don't get things like $r0 needs to go
158 * in $r1 and $r1 in $r0
159 */
160 i = pc->result_nr * 4;
161 }
162
163 for (; i < NV50_SU_MAX_TEMP; i++) {
164 if (!(pc->r_temp[i])) {
165 pc->r_temp[i] = reg;
166 reg->hw = i;
167 if (pc->p->cfg.high_temp < (i + 1))
168 pc->p->cfg.high_temp = i + 1;
169 return;
170 }
171 }
172
173 assert(0);
174 }
175
176 static struct nv50_reg *
177 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178 {
179 struct nv50_reg *r;
180 int i;
181
182 if (dst && dst->type == P_TEMP && dst->hw == -1)
183 return dst;
184
185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186 if (!pc->r_temp[i]) {
187 r = CALLOC_STRUCT(nv50_reg);
188 r->type = P_TEMP;
189 r->index = -1;
190 r->hw = i;
191 r->rhw = -1;
192 pc->r_temp[i] = r;
193 return r;
194 }
195 }
196
197 assert(0);
198 return NULL;
199 }
200
201 /* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204 static void
205 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206 {
207 assert(src->index == -1 && src->hw != -1);
208
209 if (dst->hw != -1)
210 pc->r_temp[dst->hw] = NULL;
211 pc->r_temp[src->hw] = dst;
212 dst->hw = src->hw;
213
214 FREE(src);
215 }
216
217 /* release the hardware resource held by r */
218 static void
219 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
220 {
221 assert(r->type == P_TEMP);
222 if (r->hw == -1)
223 return;
224
225 assert(pc->r_temp[r->hw] == r);
226 pc->r_temp[r->hw] = NULL;
227
228 r->acc = 0;
229 if (r->index == -1)
230 FREE(r);
231 }
232
233 static void
234 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
235 {
236 if (r->index == -1) {
237 unsigned hw = r->hw;
238
239 FREE(pc->r_temp[hw]);
240 pc->r_temp[hw] = NULL;
241 }
242 }
243
244 static int
245 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
246 {
247 int i;
248
249 if ((idx + 4) >= NV50_SU_MAX_TEMP)
250 return 1;
251
252 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
253 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
254 return alloc_temp4(pc, dst, idx + 4);
255
256 for (i = 0; i < 4; i++) {
257 dst[i] = CALLOC_STRUCT(nv50_reg);
258 dst[i]->type = P_TEMP;
259 dst[i]->index = -1;
260 dst[i]->hw = idx + i;
261 pc->r_temp[idx + i] = dst[i];
262 }
263
264 return 0;
265 }
266
/* Release the four registers of a group via free_temp(), in order. */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **cur = reg;
	struct nv50_reg **const end = reg + 4;

	while (cur != end)
		free_temp(pc, *cur++);
}
275
276 static struct nv50_reg *
277 temp_temp(struct nv50_pc *pc)
278 {
279 if (pc->temp_temp_nr >= 16)
280 assert(0);
281
282 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
283 return pc->temp_temp[pc->temp_temp_nr++];
284 }
285
286 static void
287 kill_temp_temp(struct nv50_pc *pc)
288 {
289 int i;
290
291 for (i = 0; i < pc->temp_temp_nr; i++)
292 free_temp(pc, pc->temp_temp[i]);
293 pc->temp_temp_nr = 0;
294 }
295
296 static int
297 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
298 {
299 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
300 (pc->immd_nr + 1) * 4 * sizeof(float));
301 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
302 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
303 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
304 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
305
306 return pc->immd_nr++;
307 }
308
309 static struct nv50_reg *
310 alloc_immd(struct nv50_pc *pc, float f)
311 {
312 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
313 unsigned hw;
314
315 for (hw = 0; hw < pc->immd_nr * 4; hw++)
316 if (pc->immd_buf[hw] == f)
317 break;
318
319 if (hw == pc->immd_nr * 4)
320 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
321
322 r->type = P_IMMD;
323 r->hw = hw;
324 r->index = -1;
325 return r;
326 }
327
328 static struct nv50_program_exec *
329 exec(struct nv50_pc *pc)
330 {
331 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
332
333 e->param.index = -1;
334 return e;
335 }
336
337 static void
338 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
339 {
340 struct nv50_program *p = pc->p;
341
342 if (p->exec_tail)
343 p->exec_tail->next = e;
344 if (!p->exec_head)
345 p->exec_head = e;
346 p->exec_tail = e;
347 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
348 }
349
350 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
351
352 static boolean
353 is_long(struct nv50_program_exec *e)
354 {
355 if (e->inst[0] & 1)
356 return TRUE;
357 return FALSE;
358 }
359
360 static boolean
361 is_immd(struct nv50_program_exec *e)
362 {
363 if (is_long(e) && (e->inst[1] & 3) == 3)
364 return TRUE;
365 return FALSE;
366 }
367
368 static INLINE void
369 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
370 struct nv50_program_exec *e)
371 {
372 set_long(pc, e);
373 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
374 e->inst[1] |= (pred << 7) | (idx << 12);
375 }
376
377 static INLINE void
378 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
379 struct nv50_program_exec *e)
380 {
381 set_long(pc, e);
382 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
383 e->inst[1] |= (idx << 4) | (on << 6);
384 }
385
386 static INLINE void
387 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
388 {
389 if (is_long(e))
390 return;
391
392 e->inst[0] |= 1;
393 set_pred(pc, 0xf, 0, e);
394 set_pred_wr(pc, 0, 0, e);
395 }
396
397 static INLINE void
398 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
399 {
400 if (dst->type == P_RESULT) {
401 set_long(pc, e);
402 e->inst[1] |= 0x00000008;
403 }
404
405 alloc_reg(pc, dst);
406 e->inst[0] |= (dst->hw << 2);
407 }
408
409 static INLINE void
410 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
411 {
412 float f = pc->immd_buf[imm->hw];
413 unsigned val = fui(imm->neg ? -f : f);
414
415 set_long(pc, e);
416 /*XXX: can't be predicated - bits overlap.. catch cases where both
417 * are required and avoid them. */
418 set_pred(pc, 0, 0, e);
419 set_pred_wr(pc, 0, 0, e);
420
421 e->inst[1] |= 0x00000002 | 0x00000001;
422 e->inst[0] |= (val & 0x3f) << 16;
423 e->inst[1] |= (val >> 6) << 2;
424 }
425
426
427 #define INTERP_LINEAR 0
428 #define INTERP_FLAT 1
429 #define INTERP_PERSPECTIVE 2
430 #define INTERP_CENTROID 4
431
432 /* interpolant index has been stored in dst->rhw */
433 static void
434 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
435 unsigned mode)
436 {
437 assert(dst->rhw != -1);
438 struct nv50_program_exec *e = exec(pc);
439
440 e->inst[0] |= 0x80000000;
441 set_dst(pc, dst, e);
442 e->inst[0] |= (dst->rhw << 16);
443
444 if (mode & INTERP_FLAT) {
445 e->inst[0] |= (1 << 8);
446 } else {
447 if (mode & INTERP_PERSPECTIVE) {
448 e->inst[0] |= (1 << 25);
449 alloc_reg(pc, iv);
450 e->inst[0] |= (iv->hw << 9);
451 }
452
453 if (mode & INTERP_CENTROID)
454 e->inst[0] |= (1 << 24);
455 }
456
457 emit(pc, e);
458 }
459
460 static void
461 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
462 struct nv50_program_exec *e)
463 {
464 set_long(pc, e);
465
466 e->param.index = src->hw;
467 e->param.shift = s;
468 e->param.mask = m << (s % 32);
469
470 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
471 }
472
473 static void
474 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
475 {
476 struct nv50_program_exec *e = exec(pc);
477
478 e->inst[0] |= 0x10000000;
479
480 set_dst(pc, dst, e);
481
482 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
483 set_immd(pc, src, e);
484 /*XXX: 32-bit, but steals part of "half" reg space - need to
485 * catch and handle this case if/when we do half-regs
486 */
487 } else
488 if (src->type == P_IMMD || src->type == P_CONST) {
489 set_long(pc, e);
490 set_data(pc, src, 0x7f, 9, e);
491 e->inst[1] |= 0x20000000; /* src0 const? */
492 } else {
493 if (src->type == P_ATTR) {
494 set_long(pc, e);
495 e->inst[1] |= 0x00200000;
496 }
497
498 alloc_reg(pc, src);
499 e->inst[0] |= (src->hw << 9);
500 }
501
502 if (is_long(e) && !is_immd(e)) {
503 e->inst[1] |= 0x04000000; /* 32-bit */
504 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
505 if (!(e->inst[1] & 0x20000000))
506 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
507 } else
508 e->inst[0] |= 0x00008000;
509
510 emit(pc, e);
511 }
512
513 static INLINE void
514 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
515 {
516 struct nv50_reg *imm = alloc_immd(pc, f);
517 emit_mov(pc, dst, imm);
518 FREE(imm);
519 }
520
521 static boolean
522 check_swap_src_0_1(struct nv50_pc *pc,
523 struct nv50_reg **s0, struct nv50_reg **s1)
524 {
525 struct nv50_reg *src0 = *s0, *src1 = *s1;
526
527 if (src0->type == P_CONST) {
528 if (src1->type != P_CONST) {
529 *s0 = src1;
530 *s1 = src0;
531 return TRUE;
532 }
533 } else
534 if (src1->type == P_ATTR) {
535 if (src0->type != P_ATTR) {
536 *s0 = src1;
537 *s1 = src0;
538 return TRUE;
539 }
540 }
541
542 return FALSE;
543 }
544
545 static void
546 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
547 {
548 if (src->type == P_ATTR) {
549 set_long(pc, e);
550 e->inst[1] |= 0x00200000;
551 } else
552 if (src->type == P_CONST || src->type == P_IMMD) {
553 struct nv50_reg *temp = temp_temp(pc);
554
555 emit_mov(pc, temp, src);
556 src = temp;
557 }
558
559 alloc_reg(pc, src);
560 e->inst[0] |= (src->hw << 9);
561 }
562
563 static void
564 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
565 {
566 if (src->type == P_ATTR) {
567 struct nv50_reg *temp = temp_temp(pc);
568
569 emit_mov(pc, temp, src);
570 src = temp;
571 } else
572 if (src->type == P_CONST || src->type == P_IMMD) {
573 assert(!(e->inst[0] & 0x00800000));
574 if (e->inst[0] & 0x01000000) {
575 struct nv50_reg *temp = temp_temp(pc);
576
577 emit_mov(pc, temp, src);
578 src = temp;
579 } else {
580 set_data(pc, src, 0x7f, 16, e);
581 e->inst[0] |= 0x00800000;
582 }
583 }
584
585 alloc_reg(pc, src);
586 e->inst[0] |= (src->hw << 16);
587 }
588
589 static void
590 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
591 {
592 set_long(pc, e);
593
594 if (src->type == P_ATTR) {
595 struct nv50_reg *temp = temp_temp(pc);
596
597 emit_mov(pc, temp, src);
598 src = temp;
599 } else
600 if (src->type == P_CONST || src->type == P_IMMD) {
601 assert(!(e->inst[0] & 0x01000000));
602 if (e->inst[0] & 0x00800000) {
603 struct nv50_reg *temp = temp_temp(pc);
604
605 emit_mov(pc, temp, src);
606 src = temp;
607 } else {
608 set_data(pc, src, 0x7f, 32+14, e);
609 e->inst[0] |= 0x01000000;
610 }
611 }
612
613 alloc_reg(pc, src);
614 e->inst[1] |= (src->hw << 14);
615 }
616
617 static void
618 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
619 struct nv50_reg *src1)
620 {
621 struct nv50_program_exec *e = exec(pc);
622
623 e->inst[0] |= 0xc0000000;
624
625 if (!pc->allow32)
626 set_long(pc, e);
627
628 check_swap_src_0_1(pc, &src0, &src1);
629 set_dst(pc, dst, e);
630 set_src_0(pc, src0, e);
631 if (src1->type == P_IMMD && !is_long(e)) {
632 if (src0->neg)
633 e->inst[0] |= 0x00008000;
634 set_immd(pc, src1, e);
635 } else {
636 set_src_1(pc, src1, e);
637 if (src0->neg ^ src1->neg) {
638 if (is_long(e))
639 e->inst[1] |= 0x08000000;
640 else
641 e->inst[0] |= 0x00008000;
642 }
643 }
644
645 emit(pc, e);
646 }
647
648 static void
649 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
650 struct nv50_reg *src0, struct nv50_reg *src1)
651 {
652 struct nv50_program_exec *e = exec(pc);
653
654 e->inst[0] |= 0xb0000000;
655
656 check_swap_src_0_1(pc, &src0, &src1);
657
658 if (!pc->allow32 || src0->neg || src1->neg) {
659 set_long(pc, e);
660 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
661 }
662
663 set_dst(pc, dst, e);
664 set_src_0(pc, src0, e);
665 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
666 set_src_2(pc, src1, e);
667 else
668 if (src1->type == P_IMMD)
669 set_immd(pc, src1, e);
670 else
671 set_src_1(pc, src1, e);
672
673 emit(pc, e);
674 }
675
676 static void
677 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
678 struct nv50_reg *src0, struct nv50_reg *src1)
679 {
680 struct nv50_program_exec *e = exec(pc);
681
682 set_long(pc, e);
683 e->inst[0] |= 0xb0000000;
684 e->inst[1] |= (sub << 29);
685
686 check_swap_src_0_1(pc, &src0, &src1);
687 set_dst(pc, dst, e);
688 set_src_0(pc, src0, e);
689 set_src_1(pc, src1, e);
690
691 emit(pc, e);
692 }
693
694 static INLINE void
695 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
696 struct nv50_reg *src1)
697 {
698 src1->neg ^= 1;
699 emit_add(pc, dst, src0, src1);
700 src1->neg ^= 1;
701 }
702
703 static void
704 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
705 struct nv50_reg *src1, struct nv50_reg *src2)
706 {
707 struct nv50_program_exec *e = exec(pc);
708
709 e->inst[0] |= 0xe0000000;
710
711 check_swap_src_0_1(pc, &src0, &src1);
712 set_dst(pc, dst, e);
713 set_src_0(pc, src0, e);
714 set_src_1(pc, src1, e);
715 set_src_2(pc, src2, e);
716
717 if (src0->neg ^ src1->neg)
718 e->inst[1] |= 0x04000000;
719 if (src2->neg)
720 e->inst[1] |= 0x08000000;
721
722 emit(pc, e);
723 }
724
725 static INLINE void
726 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
727 struct nv50_reg *src1, struct nv50_reg *src2)
728 {
729 src2->neg ^= 1;
730 emit_mad(pc, dst, src0, src1, src2);
731 src2->neg ^= 1;
732 }
733
734 static void
735 emit_flop(struct nv50_pc *pc, unsigned sub,
736 struct nv50_reg *dst, struct nv50_reg *src)
737 {
738 struct nv50_program_exec *e = exec(pc);
739
740 e->inst[0] |= 0x90000000;
741 if (sub) {
742 set_long(pc, e);
743 e->inst[1] |= (sub << 29);
744 }
745
746 set_dst(pc, dst, e);
747 set_src_0(pc, src, e);
748
749 emit(pc, e);
750 }
751
752 static void
753 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
754 {
755 struct nv50_program_exec *e = exec(pc);
756
757 e->inst[0] |= 0xb0000000;
758
759 set_dst(pc, dst, e);
760 set_src_0(pc, src, e);
761 set_long(pc, e);
762 e->inst[1] |= (6 << 29) | 0x00004000;
763
764 emit(pc, e);
765 }
766
767 static void
768 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
769 {
770 struct nv50_program_exec *e = exec(pc);
771
772 e->inst[0] |= 0xb0000000;
773
774 set_dst(pc, dst, e);
775 set_src_0(pc, src, e);
776 set_long(pc, e);
777 e->inst[1] |= (6 << 29);
778
779 emit(pc, e);
780 }
781
782 #define CVTOP_RN 0x01
783 #define CVTOP_FLOOR 0x03
784 #define CVTOP_CEIL 0x05
785 #define CVTOP_TRUNC 0x07
786 #define CVTOP_SAT 0x08
787 #define CVTOP_ABS 0x10
788
789 #define CVT_F32_F32 0xc4
790 #define CVT_F32_S32 0x44
791 #define CVT_F32_U32 0x64
792 #define CVT_S32_F32 0x8c
793 #define CVT_S32_S32 0x0c
794 #define CVT_F32_F32_ROP 0xcc
795
796 static void
797 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
798 int wp, unsigned cop, unsigned fmt)
799 {
800 struct nv50_program_exec *e;
801
802 e = exec(pc);
803 set_long(pc, e);
804
805 e->inst[0] |= 0xa0000000;
806 e->inst[1] |= 0x00004000;
807 e->inst[1] |= (cop << 16);
808 e->inst[1] |= (fmt << 24);
809 set_src_0(pc, src, e);
810
811 if (wp >= 0)
812 set_pred_wr(pc, 1, wp, e);
813
814 if (dst)
815 set_dst(pc, dst, e);
816 else {
817 e->inst[0] |= 0x000001fc;
818 e->inst[1] |= 0x00000008;
819 }
820
821 emit(pc, e);
822 }
823
824 static void
825 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
826 struct nv50_reg *src0, struct nv50_reg *src1)
827 {
828 struct nv50_program_exec *e = exec(pc);
829 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
830 struct nv50_reg *rdst;
831
832 assert(c_op <= 7);
833 if (check_swap_src_0_1(pc, &src0, &src1))
834 c_op = inv_cop[c_op];
835
836 rdst = dst;
837 if (dst->type != P_TEMP)
838 dst = alloc_temp(pc, NULL);
839
840 /* set.u32 */
841 set_long(pc, e);
842 e->inst[0] |= 0xb0000000;
843 e->inst[1] |= (3 << 29);
844 e->inst[1] |= (c_op << 14);
845 /*XXX: breaks things, .u32 by default?
846 * decuda will disasm as .u16 and use .lo/.hi regs, but this
847 * doesn't seem to match what the hw actually does.
848 inst[1] |= 0x04000000; << breaks things.. .u32 by default?
849 */
850 set_dst(pc, dst, e);
851 set_src_0(pc, src0, e);
852 set_src_1(pc, src1, e);
853 emit(pc, e);
854
855 /* cvt.f32.u32 */
856 e = exec(pc);
857 e->inst[0] = 0xa0000001;
858 e->inst[1] = 0x64014780;
859 set_dst(pc, rdst, e);
860 set_src_0(pc, dst, e);
861 emit(pc, e);
862
863 if (dst != rdst)
864 free_temp(pc, dst);
865 }
866
867 static INLINE void
868 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
869 {
870 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
871 }
872
873 static void
874 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
875 struct nv50_reg *v, struct nv50_reg *e)
876 {
877 struct nv50_reg *temp = alloc_temp(pc, NULL);
878
879 emit_flop(pc, 3, temp, v);
880 emit_mul(pc, temp, temp, e);
881 emit_preex2(pc, temp, temp);
882 emit_flop(pc, 6, dst, temp);
883
884 free_temp(pc, temp);
885 }
886
887 static INLINE void
888 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
889 {
890 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
891 }
892
893 static void
894 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
895 struct nv50_reg **src)
896 {
897 struct nv50_reg *one = alloc_immd(pc, 1.0);
898 struct nv50_reg *zero = alloc_immd(pc, 0.0);
899 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
900 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
901 struct nv50_reg *tmp[4];
902 boolean allow32 = pc->allow32;
903
904 pc->allow32 = FALSE;
905
906 if (mask & (3 << 1)) {
907 tmp[0] = alloc_temp(pc, NULL);
908 emit_minmax(pc, 4, tmp[0], src[0], zero);
909 }
910
911 if (mask & (1 << 2)) {
912 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
913
914 tmp[1] = temp_temp(pc);
915 emit_minmax(pc, 4, tmp[1], src[1], zero);
916
917 tmp[3] = temp_temp(pc);
918 emit_minmax(pc, 4, tmp[3], src[3], neg128);
919 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
920
921 emit_pow(pc, dst[2], tmp[1], tmp[3]);
922 emit_mov(pc, dst[2], zero);
923 set_pred(pc, 3, 0, pc->p->exec_tail);
924 }
925
926 if (mask & (1 << 1))
927 assimilate_temp(pc, dst[1], tmp[0]);
928 else
929 if (mask & (1 << 2))
930 free_temp(pc, tmp[0]);
931
932 pc->allow32 = allow32;
933
934 /* do this last, in case src[i,j] == dst[0,3] */
935 if (mask & (1 << 0))
936 emit_mov(pc, dst[0], one);
937
938 if (mask & (1 << 3))
939 emit_mov(pc, dst[3], one);
940
941 FREE(pos128);
942 FREE(neg128);
943 FREE(zero);
944 FREE(one);
945 }
946
947 static void
948 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
949 {
950 struct nv50_program_exec *e = exec(pc);
951
952 set_long(pc, e);
953 e->inst[0] |= 0xa0000000; /* delta */
954 e->inst[1] |= (7 << 29); /* delta */
955 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
956 e->inst[1] |= (1 << 14); /* src .f32 */
957 set_dst(pc, dst, e);
958 set_src_0(pc, src, e);
959
960 emit(pc, e);
961 }
962
963 static void
964 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
965 {
966 struct nv50_program_exec *e;
967 const int r_pred = 1;
968
969 /* Sets predicate reg ? */
970 e = exec(pc);
971 e->inst[0] = 0xa00001fd;
972 e->inst[1] = 0xc4014788;
973 set_src_0(pc, src, e);
974 set_pred_wr(pc, 1, r_pred, e);
975 if (src->neg)
976 e->inst[1] |= 0x20000000;
977 emit(pc, e);
978
979 /* This is probably KILP */
980 e = exec(pc);
981 e->inst[0] = 0x000001fe;
982 set_long(pc, e);
983 set_pred(pc, 1 /* LT? */, r_pred, e);
984 emit(pc, e);
985 }
986
987 static void
988 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
989 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
990 {
991 struct nv50_reg *temp, *t[4];
992 struct nv50_program_exec *e;
993
994 unsigned c, mode, dim;
995
996 switch (type) {
997 case TGSI_TEXTURE_1D:
998 dim = 1;
999 break;
1000 case TGSI_TEXTURE_UNKNOWN:
1001 case TGSI_TEXTURE_2D:
1002 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1003 case TGSI_TEXTURE_RECT:
1004 dim = 2;
1005 break;
1006 case TGSI_TEXTURE_3D:
1007 case TGSI_TEXTURE_CUBE:
1008 case TGSI_TEXTURE_SHADOW2D:
1009 case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1010 dim = 3;
1011 break;
1012 default:
1013 assert(0);
1014 break;
1015 }
1016
1017 /* some cards need t[0]'s hw index to be a multiple of 4 */
1018 alloc_temp4(pc, t, 0);
1019
1020 if (proj) {
1021 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1022 mode = pc->interp_mode[src[0]->index];
1023
1024 t[3]->rhw = src[3]->rhw;
1025 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1026 emit_flop(pc, 0, t[3], t[3]);
1027
1028 for (c = 0; c < dim; c++) {
1029 t[c]->rhw = src[c]->rhw;
1030 emit_interp(pc, t[c], t[3],
1031 (mode | INTERP_PERSPECTIVE));
1032 }
1033 } else {
1034 emit_flop(pc, 0, t[3], src[3]);
1035 for (c = 0; c < dim; c++)
1036 emit_mul(pc, t[c], src[c], t[3]);
1037
1038 /* XXX: for some reason the blob sometimes uses MAD:
1039 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1040 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1041 */
1042 }
1043 } else {
1044 if (type == TGSI_TEXTURE_CUBE) {
1045 temp = temp_temp(pc);
1046 emit_minmax(pc, 4, temp, src[0], src[1]);
1047 emit_minmax(pc, 4, temp, temp, src[2]);
1048 emit_flop(pc, 0, temp, temp);
1049 for (c = 0; c < 3; c++)
1050 emit_mul(pc, t[c], src[c], temp);
1051 } else {
1052 for (c = 0; c < dim; c++)
1053 emit_mov(pc, t[c], src[c]);
1054 }
1055 }
1056
1057 e = exec(pc);
1058 set_long(pc, e);
1059 e->inst[0] |= 0xf0000000;
1060 e->inst[1] |= 0x00000004;
1061 set_dst(pc, t[0], e);
1062 e->inst[0] |= (unit << 9);
1063
1064 if (dim == 2)
1065 e->inst[0] |= 0x00400000;
1066 else
1067 if (dim == 3)
1068 e->inst[0] |= 0x00800000;
1069
1070 e->inst[0] |= (mask & 0x3) << 25;
1071 e->inst[1] |= (mask & 0xc) << 12;
1072
1073 emit(pc, e);
1074
1075 #if 1
1076 if (mask & 1) emit_mov(pc, dst[0], t[0]);
1077 if (mask & 2) emit_mov(pc, dst[1], t[1]);
1078 if (mask & 4) emit_mov(pc, dst[2], t[2]);
1079 if (mask & 8) emit_mov(pc, dst[3], t[3]);
1080
1081 free_temp4(pc, t);
1082 #else
1083 /* XXX: if p.e. MUL is used directly after TEX, it would still use
1084 * the texture coordinates, not the fetched values: latency ? */
1085
1086 for (c = 0; c < 4; c++) {
1087 if (mask & (1 << c))
1088 assimilate_temp(pc, dst[c], t[c]);
1089 else
1090 free_temp(pc, t[c]);
1091 }
1092 #endif
1093 }
1094
1095 static void
1096 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1097 {
1098 unsigned q = 0, m = ~0;
1099
1100 assert(!is_long(e));
1101
1102 switch (e->inst[0] >> 28) {
1103 case 0x1:
1104 /* MOV */
1105 q = 0x0403c000;
1106 m = 0xffff7fff;
1107 break;
1108 case 0x8:
1109 /* INTERP (move centroid, perspective and flat bits) */
1110 m = ~0x03000100;
1111 q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1112 q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1113 break;
1114 case 0x9:
1115 /* RCP */
1116 break;
1117 case 0xB:
1118 /* ADD */
1119 m = ~(127 << 16);
1120 q = ((e->inst[0] & (~m)) >> 2);
1121 break;
1122 case 0xC:
1123 /* MUL */
1124 m = ~0x00008000;
1125 q = ((e->inst[0] & (~m)) << 12);
1126 break;
1127 case 0xE:
1128 /* MAD (if src2 == dst) */
1129 q = ((e->inst[0] & 0x1fc) << 12);
1130 break;
1131 default:
1132 assert(0);
1133 break;
1134 }
1135
1136 set_long(pc, e);
1137 pc->p->exec_size++;
1138
1139 e->inst[0] &= m;
1140 e->inst[1] |= q;
1141 }
1142
1143 static boolean
1144 negate_supported(const struct tgsi_full_instruction *insn, int i)
1145 {
1146 switch (insn->Instruction.Opcode) {
1147 case TGSI_OPCODE_DP3:
1148 case TGSI_OPCODE_DP4:
1149 case TGSI_OPCODE_MUL:
1150 case TGSI_OPCODE_KIL:
1151 case TGSI_OPCODE_ADD:
1152 case TGSI_OPCODE_SUB:
1153 case TGSI_OPCODE_MAD:
1154 return TRUE;
1155 case TGSI_OPCODE_POW:
1156 return (i == 1) ? TRUE : FALSE;
1157 default:
1158 return FALSE;
1159 }
1160 }
1161
1162 static struct nv50_reg *
1163 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1164 {
1165 switch (dst->DstRegister.File) {
1166 case TGSI_FILE_TEMPORARY:
1167 return &pc->temp[dst->DstRegister.Index * 4 + c];
1168 case TGSI_FILE_OUTPUT:
1169 return &pc->result[dst->DstRegister.Index * 4 + c];
1170 case TGSI_FILE_NULL:
1171 return NULL;
1172 default:
1173 break;
1174 }
1175
1176 return NULL;
1177 }
1178
1179 static struct nv50_reg *
1180 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1181 boolean neg)
1182 {
1183 struct nv50_reg *r = NULL;
1184 struct nv50_reg *temp;
1185 unsigned sgn, c;
1186
1187 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1188
1189 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1190 switch (c) {
1191 case TGSI_EXTSWIZZLE_X:
1192 case TGSI_EXTSWIZZLE_Y:
1193 case TGSI_EXTSWIZZLE_Z:
1194 case TGSI_EXTSWIZZLE_W:
1195 switch (src->SrcRegister.File) {
1196 case TGSI_FILE_INPUT:
1197 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1198 break;
1199 case TGSI_FILE_TEMPORARY:
1200 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1201 break;
1202 case TGSI_FILE_CONSTANT:
1203 r = &pc->param[src->SrcRegister.Index * 4 + c];
1204 break;
1205 case TGSI_FILE_IMMEDIATE:
1206 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1207 break;
1208 case TGSI_FILE_SAMPLER:
1209 break;
1210 default:
1211 assert(0);
1212 break;
1213 }
1214 break;
1215 case TGSI_EXTSWIZZLE_ZERO:
1216 r = alloc_immd(pc, 0.0);
1217 return r;
1218 case TGSI_EXTSWIZZLE_ONE:
1219 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1220 return alloc_immd(pc, -1.0);
1221 return alloc_immd(pc, 1.0);
1222 default:
1223 assert(0);
1224 break;
1225 }
1226
1227 switch (sgn) {
1228 case TGSI_UTIL_SIGN_KEEP:
1229 break;
1230 case TGSI_UTIL_SIGN_CLEAR:
1231 temp = temp_temp(pc);
1232 emit_abs(pc, temp, r);
1233 r = temp;
1234 break;
1235 case TGSI_UTIL_SIGN_TOGGLE:
1236 if (neg)
1237 r->neg = 1;
1238 else {
1239 temp = temp_temp(pc);
1240 emit_neg(pc, temp, r);
1241 r = temp;
1242 }
1243 break;
1244 case TGSI_UTIL_SIGN_SET:
1245 temp = temp_temp(pc);
1246 emit_abs(pc, temp, r);
1247 if (neg)
1248 temp->neg = 1;
1249 else
1250 emit_neg(pc, temp, temp);
1251 r = temp;
1252 break;
1253 default:
1254 assert(0);
1255 break;
1256 }
1257
1258 return r;
1259 }
1260
1261 /* returns TRUE if instruction can overwrite sources before they're read */
1262 static boolean
1263 direct2dest_op(const struct tgsi_full_instruction *insn)
1264 {
1265 if (insn->Instruction.Saturate)
1266 return FALSE;
1267
1268 switch (insn->Instruction.Opcode) {
1269 case TGSI_OPCODE_COS:
1270 case TGSI_OPCODE_DP3:
1271 case TGSI_OPCODE_DP4:
1272 case TGSI_OPCODE_DPH:
1273 case TGSI_OPCODE_KIL:
1274 case TGSI_OPCODE_LIT:
1275 case TGSI_OPCODE_POW:
1276 case TGSI_OPCODE_RCP:
1277 case TGSI_OPCODE_RSQ:
1278 case TGSI_OPCODE_SCS:
1279 case TGSI_OPCODE_SIN:
1280 case TGSI_OPCODE_TEX:
1281 case TGSI_OPCODE_TXP:
1282 return FALSE;
1283 default:
1284 return TRUE;
1285 }
1286 }
1287
1288 static boolean
1289 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1290 {
1291 const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1292 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1293 unsigned mask, sat, unit;
1294 boolean assimilate = FALSE;
1295 int i, c;
1296
1297 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1298 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1299
1300 for (c = 0; c < 4; c++) {
1301 if (mask & (1 << c))
1302 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1303 else
1304 dst[c] = NULL;
1305 rdst[c] = NULL;
1306 src[0][c] = NULL;
1307 src[1][c] = NULL;
1308 src[2][c] = NULL;
1309 }
1310
1311 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1312 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1313
1314 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1315 unit = fs->SrcRegister.Index;
1316
1317 for (c = 0; c < 4; c++)
1318 src[i][c] = tgsi_src(pc, c, fs,
1319 negate_supported(inst, i));
1320 }
1321
1322 if (sat) {
1323 for (c = 0; c < 4; c++) {
1324 rdst[c] = dst[c];
1325 dst[c] = temp_temp(pc);
1326 }
1327 } else
1328 if (direct2dest_op(inst)) {
1329 for (c = 0; c < 4; c++) {
1330 if (!dst[c] || dst[c]->type != P_TEMP)
1331 continue;
1332
1333 for (i = c + 1; i < 4; i++) {
1334 if (dst[c] == src[0][i] ||
1335 dst[c] == src[1][i] ||
1336 dst[c] == src[2][i])
1337 break;
1338 }
1339 if (i == 4)
1340 continue;
1341
1342 assimilate = TRUE;
1343 rdst[c] = dst[c];
1344 dst[c] = alloc_temp(pc, NULL);
1345 }
1346 }
1347
1348 switch (inst->Instruction.Opcode) {
1349 case TGSI_OPCODE_ABS:
1350 for (c = 0; c < 4; c++) {
1351 if (!(mask & (1 << c)))
1352 continue;
1353 emit_abs(pc, dst[c], src[0][c]);
1354 }
1355 break;
1356 case TGSI_OPCODE_ADD:
1357 for (c = 0; c < 4; c++) {
1358 if (!(mask & (1 << c)))
1359 continue;
1360 emit_add(pc, dst[c], src[0][c], src[1][c]);
1361 }
1362 break;
1363 case TGSI_OPCODE_COS:
1364 temp = temp_temp(pc);
1365 emit_precossin(pc, temp, src[0][0]);
1366 emit_flop(pc, 5, temp, temp);
1367 for (c = 0; c < 4; c++) {
1368 if (!(mask & (1 << c)))
1369 continue;
1370 emit_mov(pc, dst[c], temp);
1371 }
1372 break;
1373 case TGSI_OPCODE_DP3:
1374 temp = temp_temp(pc);
1375 emit_mul(pc, temp, src[0][0], src[1][0]);
1376 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1377 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1378 for (c = 0; c < 4; c++) {
1379 if (!(mask & (1 << c)))
1380 continue;
1381 emit_mov(pc, dst[c], temp);
1382 }
1383 break;
1384 case TGSI_OPCODE_DP4:
1385 temp = temp_temp(pc);
1386 emit_mul(pc, temp, src[0][0], src[1][0]);
1387 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1388 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1389 emit_mad(pc, temp, src[0][3], src[1][3], temp);
1390 for (c = 0; c < 4; c++) {
1391 if (!(mask & (1 << c)))
1392 continue;
1393 emit_mov(pc, dst[c], temp);
1394 }
1395 break;
1396 case TGSI_OPCODE_DPH:
1397 temp = temp_temp(pc);
1398 emit_mul(pc, temp, src[0][0], src[1][0]);
1399 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1400 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1401 emit_add(pc, temp, src[1][3], temp);
1402 for (c = 0; c < 4; c++) {
1403 if (!(mask & (1 << c)))
1404 continue;
1405 emit_mov(pc, dst[c], temp);
1406 }
1407 break;
1408 case TGSI_OPCODE_DST:
1409 {
1410 struct nv50_reg *one = alloc_immd(pc, 1.0);
1411 if (mask & (1 << 0))
1412 emit_mov(pc, dst[0], one);
1413 if (mask & (1 << 1))
1414 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1415 if (mask & (1 << 2))
1416 emit_mov(pc, dst[2], src[0][2]);
1417 if (mask & (1 << 3))
1418 emit_mov(pc, dst[3], src[1][3]);
1419 FREE(one);
1420 }
1421 break;
1422 case TGSI_OPCODE_EX2:
1423 temp = temp_temp(pc);
1424 emit_preex2(pc, temp, src[0][0]);
1425 emit_flop(pc, 6, temp, temp);
1426 for (c = 0; c < 4; c++) {
1427 if (!(mask & (1 << c)))
1428 continue;
1429 emit_mov(pc, dst[c], temp);
1430 }
1431 break;
1432 case TGSI_OPCODE_FLR:
1433 for (c = 0; c < 4; c++) {
1434 if (!(mask & (1 << c)))
1435 continue;
1436 emit_flr(pc, dst[c], src[0][c]);
1437 }
1438 break;
1439 case TGSI_OPCODE_FRC:
1440 temp = temp_temp(pc);
1441 for (c = 0; c < 4; c++) {
1442 if (!(mask & (1 << c)))
1443 continue;
1444 emit_flr(pc, temp, src[0][c]);
1445 emit_sub(pc, dst[c], src[0][c], temp);
1446 }
1447 break;
1448 case TGSI_OPCODE_KIL:
1449 emit_kil(pc, src[0][0]);
1450 emit_kil(pc, src[0][1]);
1451 emit_kil(pc, src[0][2]);
1452 emit_kil(pc, src[0][3]);
1453 pc->p->cfg.fp.regs[2] |= 0x00100000;
1454 break;
1455 case TGSI_OPCODE_LIT:
1456 emit_lit(pc, &dst[0], mask, &src[0][0]);
1457 break;
1458 case TGSI_OPCODE_LG2:
1459 temp = temp_temp(pc);
1460 emit_flop(pc, 3, temp, src[0][0]);
1461 for (c = 0; c < 4; c++) {
1462 if (!(mask & (1 << c)))
1463 continue;
1464 emit_mov(pc, dst[c], temp);
1465 }
1466 break;
1467 case TGSI_OPCODE_LRP:
1468 temp = temp_temp(pc);
1469 for (c = 0; c < 4; c++) {
1470 if (!(mask & (1 << c)))
1471 continue;
1472 emit_sub(pc, temp, src[1][c], src[2][c]);
1473 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1474 }
1475 break;
1476 case TGSI_OPCODE_MAD:
1477 for (c = 0; c < 4; c++) {
1478 if (!(mask & (1 << c)))
1479 continue;
1480 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1481 }
1482 break;
1483 case TGSI_OPCODE_MAX:
1484 for (c = 0; c < 4; c++) {
1485 if (!(mask & (1 << c)))
1486 continue;
1487 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1488 }
1489 break;
1490 case TGSI_OPCODE_MIN:
1491 for (c = 0; c < 4; c++) {
1492 if (!(mask & (1 << c)))
1493 continue;
1494 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1495 }
1496 break;
1497 case TGSI_OPCODE_MOV:
1498 case TGSI_OPCODE_SWZ:
1499 for (c = 0; c < 4; c++) {
1500 if (!(mask & (1 << c)))
1501 continue;
1502 emit_mov(pc, dst[c], src[0][c]);
1503 }
1504 break;
1505 case TGSI_OPCODE_MUL:
1506 for (c = 0; c < 4; c++) {
1507 if (!(mask & (1 << c)))
1508 continue;
1509 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1510 }
1511 break;
1512 case TGSI_OPCODE_POW:
1513 temp = temp_temp(pc);
1514 emit_pow(pc, temp, src[0][0], src[1][0]);
1515 for (c = 0; c < 4; c++) {
1516 if (!(mask & (1 << c)))
1517 continue;
1518 emit_mov(pc, dst[c], temp);
1519 }
1520 break;
1521 case TGSI_OPCODE_RCP:
1522 for (c = 3; c >= 0; c--) {
1523 if (!(mask & (1 << c)))
1524 continue;
1525 emit_flop(pc, 0, dst[c], src[0][0]);
1526 }
1527 break;
1528 case TGSI_OPCODE_RSQ:
1529 for (c = 3; c >= 0; c--) {
1530 if (!(mask & (1 << c)))
1531 continue;
1532 emit_flop(pc, 2, dst[c], src[0][0]);
1533 }
1534 break;
1535 case TGSI_OPCODE_SCS:
1536 temp = temp_temp(pc);
1537 emit_precossin(pc, temp, src[0][0]);
1538 if (mask & (1 << 0))
1539 emit_flop(pc, 5, dst[0], temp);
1540 if (mask & (1 << 1))
1541 emit_flop(pc, 4, dst[1], temp);
1542 if (mask & (1 << 2))
1543 emit_mov_immdval(pc, dst[2], 0.0);
1544 if (mask & (1 << 3))
1545 emit_mov_immdval(pc, dst[3], 1.0);
1546 break;
1547 case TGSI_OPCODE_SGE:
1548 for (c = 0; c < 4; c++) {
1549 if (!(mask & (1 << c)))
1550 continue;
1551 emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1552 }
1553 break;
1554 case TGSI_OPCODE_SIN:
1555 temp = temp_temp(pc);
1556 emit_precossin(pc, temp, src[0][0]);
1557 emit_flop(pc, 4, temp, temp);
1558 for (c = 0; c < 4; c++) {
1559 if (!(mask & (1 << c)))
1560 continue;
1561 emit_mov(pc, dst[c], temp);
1562 }
1563 break;
1564 case TGSI_OPCODE_SLT:
1565 for (c = 0; c < 4; c++) {
1566 if (!(mask & (1 << c)))
1567 continue;
1568 emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1569 }
1570 break;
1571 case TGSI_OPCODE_SUB:
1572 for (c = 0; c < 4; c++) {
1573 if (!(mask & (1 << c)))
1574 continue;
1575 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1576 }
1577 break;
1578 case TGSI_OPCODE_TEX:
1579 emit_tex(pc, dst, mask, src[0], unit,
1580 inst->InstructionExtTexture.Texture, FALSE);
1581 break;
1582 case TGSI_OPCODE_TXP:
1583 emit_tex(pc, dst, mask, src[0], unit,
1584 inst->InstructionExtTexture.Texture, TRUE);
1585 break;
1586 case TGSI_OPCODE_XPD:
1587 temp = temp_temp(pc);
1588 if (mask & (1 << 0)) {
1589 emit_mul(pc, temp, src[0][2], src[1][1]);
1590 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1591 }
1592 if (mask & (1 << 1)) {
1593 emit_mul(pc, temp, src[0][0], src[1][2]);
1594 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1595 }
1596 if (mask & (1 << 2)) {
1597 emit_mul(pc, temp, src[0][1], src[1][0]);
1598 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1599 }
1600 if (mask & (1 << 3))
1601 emit_mov_immdval(pc, dst[3], 1.0);
1602 break;
1603 case TGSI_OPCODE_END:
1604 break;
1605 default:
1606 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1607 return FALSE;
1608 }
1609
1610 if (sat) {
1611 for (c = 0; c < 4; c++) {
1612 if (!(mask & (1 << c)))
1613 continue;
1614 emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
1615 CVT_F32_F32);
1616 }
1617 } else if (assimilate) {
1618 for (c = 0; c < 4; c++)
1619 if (rdst[c])
1620 assimilate_temp(pc, rdst[c], dst[c]);
1621 }
1622
1623 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1624 for (c = 0; c < 4; c++) {
1625 if (!src[i][c])
1626 continue;
1627 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1628 FREE(src[i][c]);
1629 else
1630 if (src[i][c]->acc == pc->insn_cur)
1631 release_hw(pc, src[i][c]);
1632 }
1633 }
1634
1635 kill_temp_temp(pc);
1636 return TRUE;
1637 }
1638
1639 /* Adjust a bitmask that indicates what components of a source are used,
1640 * we use this in tx_prep so we only load interpolants that are needed.
1641 */
1642 static void
1643 insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1644 {
1645 const struct tgsi_instruction_ext_texture *tex;
1646
1647 switch (insn->Instruction.Opcode) {
1648 case TGSI_OPCODE_DP3:
1649 *mask = 0x7;
1650 break;
1651 case TGSI_OPCODE_DP4:
1652 case TGSI_OPCODE_DPH:
1653 *mask = 0xF;
1654 break;
1655 case TGSI_OPCODE_LIT:
1656 *mask = 0xB;
1657 break;
1658 case TGSI_OPCODE_RCP:
1659 case TGSI_OPCODE_RSQ:
1660 *mask = 0x1;
1661 break;
1662 case TGSI_OPCODE_TEX:
1663 case TGSI_OPCODE_TXP:
1664 assert(insn->Instruction.Extended);
1665 tex = &insn->InstructionExtTexture;
1666
1667 *mask = 0x7;
1668 if (tex->Texture == TGSI_TEXTURE_1D)
1669 *mask = 0x1;
1670 else
1671 if (tex->Texture == TGSI_TEXTURE_2D)
1672 *mask = 0x3;
1673
1674 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1675 *mask |= 0x8;
1676 break;
1677 default:
1678 break;
1679 }
1680 }
1681
1682 static void
1683 prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1684 unsigned *r_usage[2])
1685 {
1686 const struct tgsi_full_instruction *insn;
1687 const struct tgsi_full_src_register *src;
1688 const struct tgsi_dst_register *dst;
1689
1690 unsigned i, c, k, n, mask, *acc_p;
1691
1692 insn = &tok->FullInstruction;
1693 dst = &insn->FullDstRegisters[0].DstRegister;
1694 mask = dst->WriteMask;
1695
1696 if (!r_usage[0])
1697 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1698 if (!r_usage[1])
1699 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1700
1701 if (dst->File == TGSI_FILE_TEMPORARY) {
1702 for (c = 0; c < 4; c++) {
1703 if (!(mask & (1 << c)))
1704 continue;
1705 r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1706 }
1707 }
1708
1709 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1710 src = &insn->FullSrcRegisters[i];
1711
1712 switch (src->SrcRegister.File) {
1713 case TGSI_FILE_TEMPORARY:
1714 acc_p = r_usage[0];
1715 break;
1716 case TGSI_FILE_INPUT:
1717 acc_p = r_usage[1];
1718 break;
1719 default:
1720 continue;
1721 }
1722
1723 insn_adjust_mask(insn, &mask);
1724
1725 for (c = 0; c < 4; c++) {
1726 if (!(mask & (1 << c)))
1727 continue;
1728
1729 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1730 switch (k) {
1731 case TGSI_EXTSWIZZLE_X:
1732 case TGSI_EXTSWIZZLE_Y:
1733 case TGSI_EXTSWIZZLE_Z:
1734 case TGSI_EXTSWIZZLE_W:
1735 n = src->SrcRegister.Index * 4 + k;
1736 acc_p[n] = pc->insn_nr;
1737 break;
1738 default:
1739 break;
1740 }
1741 }
1742 }
1743 }
1744
1745 static unsigned
1746 load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1747 int *aid, int *p_oid)
1748 {
1749 struct nv50_reg *iv;
1750 int oid, c, n;
1751 unsigned mask = 0;
1752
1753 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1754
1755 for (c = 0, n = i * 4; c < 4; c++, n++) {
1756 oid = (*p_oid)++;
1757 pc->attr[n].type = P_TEMP;
1758 pc->attr[n].index = i;
1759
1760 if (pc->attr[n].acc == acc[n])
1761 continue;
1762 mask |= (1 << c);
1763
1764 pc->attr[n].acc = acc[n];
1765 pc->attr[n].rhw = pc->attr[n].hw = -1;
1766 alloc_reg(pc, &pc->attr[n]);
1767
1768 pc->attr[n].rhw = (*aid)++;
1769 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1770
1771 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1772 (*mid)++;
1773 pc->p->cfg.fp.regs[1] += 0x00010001;
1774 }
1775
1776 return mask;
1777 }
1778
1779 static boolean
1780 nv50_program_tx_prep(struct nv50_pc *pc)
1781 {
1782 struct tgsi_parse_context p;
1783 boolean ret = FALSE;
1784 unsigned i, c;
1785 unsigned fcol, bcol, fcrd, depr;
1786
1787 /* count (centroid) perspective interpolations */
1788 unsigned centroid_loads = 0;
1789 unsigned perspect_loads = 0;
1790
1791 /* track register access for temps and attrs */
1792 unsigned *r_usage[2];
1793 r_usage[0] = NULL;
1794 r_usage[1] = NULL;
1795
1796 depr = fcol = bcol = fcrd = 0xffff;
1797
1798 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1799 pc->p->cfg.fp.regs[0] = 0x01000404;
1800 pc->p->cfg.fp.regs[1] = 0x00000400;
1801 }
1802
1803 tgsi_parse_init(&p, pc->p->pipe.tokens);
1804 while (!tgsi_parse_end_of_tokens(&p)) {
1805 const union tgsi_full_token *tok = &p.FullToken;
1806
1807 tgsi_parse_token(&p);
1808 switch (tok->Token.Type) {
1809 case TGSI_TOKEN_TYPE_IMMEDIATE:
1810 {
1811 const struct tgsi_full_immediate *imm =
1812 &p.FullToken.FullImmediate;
1813
1814 ctor_immd(pc, imm->u[0].Float,
1815 imm->u[1].Float,
1816 imm->u[2].Float,
1817 imm->u[3].Float);
1818 }
1819 break;
1820 case TGSI_TOKEN_TYPE_DECLARATION:
1821 {
1822 const struct tgsi_full_declaration *d;
1823 unsigned last, first, mode;
1824
1825 d = &p.FullToken.FullDeclaration;
1826 first = d->DeclarationRange.First;
1827 last = d->DeclarationRange.Last;
1828
1829 switch (d->Declaration.File) {
1830 case TGSI_FILE_TEMPORARY:
1831 if (pc->temp_nr < (last + 1))
1832 pc->temp_nr = last + 1;
1833 break;
1834 case TGSI_FILE_OUTPUT:
1835 if (pc->result_nr < (last + 1))
1836 pc->result_nr = last + 1;
1837
1838 if (!d->Declaration.Semantic)
1839 break;
1840
1841 switch (d->Semantic.SemanticName) {
1842 case TGSI_SEMANTIC_POSITION:
1843 depr = first;
1844 pc->p->cfg.fp.regs[2] |= 0x00000100;
1845 pc->p->cfg.fp.regs[3] |= 0x00000011;
1846 break;
1847 default:
1848 break;
1849 }
1850
1851 break;
1852 case TGSI_FILE_INPUT:
1853 {
1854 if (pc->attr_nr < (last + 1))
1855 pc->attr_nr = last + 1;
1856
1857 if (pc->p->type != PIPE_SHADER_FRAGMENT)
1858 break;
1859
1860 switch (d->Declaration.Interpolate) {
1861 case TGSI_INTERPOLATE_CONSTANT:
1862 mode = INTERP_FLAT;
1863 break;
1864 case TGSI_INTERPOLATE_PERSPECTIVE:
1865 mode = INTERP_PERSPECTIVE;
1866 break;
1867 default:
1868 mode = INTERP_LINEAR;
1869 break;
1870 }
1871
1872 if (d->Declaration.Semantic) {
1873 switch (d->Semantic.SemanticName) {
1874 case TGSI_SEMANTIC_POSITION:
1875 fcrd = first;
1876 break;
1877 case TGSI_SEMANTIC_COLOR:
1878 fcol = first;
1879 mode = INTERP_PERSPECTIVE;
1880 break;
1881 case TGSI_SEMANTIC_BCOLOR:
1882 bcol = first;
1883 mode = INTERP_PERSPECTIVE;
1884 break;
1885 }
1886 }
1887
1888 if (d->Declaration.Centroid) {
1889 mode |= INTERP_CENTROID;
1890 if (mode & INTERP_PERSPECTIVE)
1891 centroid_loads++;
1892 } else
1893 if (mode & INTERP_PERSPECTIVE)
1894 perspect_loads++;
1895
1896 assert(last < 32);
1897 for (i = first; i <= last; i++)
1898 pc->interp_mode[i] = mode;
1899 }
1900 break;
1901 case TGSI_FILE_CONSTANT:
1902 if (pc->param_nr < (last + 1))
1903 pc->param_nr = last + 1;
1904 break;
1905 case TGSI_FILE_SAMPLER:
1906 break;
1907 default:
1908 NOUVEAU_ERR("bad decl file %d\n",
1909 d->Declaration.File);
1910 goto out_err;
1911 }
1912 }
1913 break;
1914 case TGSI_TOKEN_TYPE_INSTRUCTION:
1915 pc->insn_nr++;
1916 prep_inspect_insn(pc, tok, r_usage);
1917 break;
1918 default:
1919 break;
1920 }
1921 }
1922
1923 if (pc->temp_nr) {
1924 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1925 if (!pc->temp)
1926 goto out_err;
1927
1928 for (i = 0; i < pc->temp_nr; i++) {
1929 for (c = 0; c < 4; c++) {
1930 pc->temp[i*4+c].type = P_TEMP;
1931 pc->temp[i*4+c].hw = -1;
1932 pc->temp[i*4+c].rhw = -1;
1933 pc->temp[i*4+c].index = i;
1934 pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1935 }
1936 }
1937 }
1938
1939 if (pc->attr_nr) {
1940 int oid = 4, mid = 4, aid = 0;
1941 /* oid = VP output id
1942 * aid = FP attribute/interpolant id
1943 * mid = VP output mapping field ID
1944 */
1945
1946 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1947 if (!pc->attr)
1948 goto out_err;
1949
1950 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1951 /* position should be loaded first */
1952 if (fcrd != 0xffff) {
1953 unsigned mask;
1954 mid = 0;
1955 mask = load_fp_attrib(pc, fcrd, r_usage[1],
1956 &mid, &aid, &oid);
1957 oid = 0;
1958 pc->p->cfg.fp.regs[1] |= (mask << 24);
1959 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1960 }
1961 pc->p->cfg.fp.map[0] += 0x03020100;
1962
1963 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1964
1965 if (perspect_loads) {
1966 pc->iv_p = alloc_temp(pc, NULL);
1967
1968 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1969 pc->p->cfg.fp.regs[1] |= 0x08000000;
1970 pc->iv_p->rhw = aid++;
1971 emit_interp(pc, pc->iv_p, NULL,
1972 INTERP_LINEAR);
1973 emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1974 } else {
1975 pc->iv_p->rhw = aid - 1;
1976 emit_flop(pc, 0, pc->iv_p,
1977 &pc->attr[fcrd * 4 + 3]);
1978 }
1979 }
1980
1981 if (centroid_loads) {
1982 pc->iv_c = alloc_temp(pc, NULL);
1983 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1984 emit_interp(pc, pc->iv_c, NULL,
1985 INTERP_CENTROID);
1986 emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1987 pc->p->cfg.fp.regs[1] |= 0x08000000;
1988 }
1989
1990 for (c = 0; c < 4; c++) {
1991 /* I don't know what these values do, but
1992 * let's set them like the blob does:
1993 */
1994 if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1995 pc->p->cfg.fp.regs[0] += 0x00010000;
1996 if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1997 pc->p->cfg.fp.regs[0] += 0x00010000;
1998 }
1999
2000 for (i = 0; i < pc->attr_nr; i++)
2001 load_fp_attrib(pc, i, r_usage[1],
2002 &mid, &aid, &oid);
2003
2004 if (pc->iv_p)
2005 free_temp(pc, pc->iv_p);
2006 if (pc->iv_c)
2007 free_temp(pc, pc->iv_c);
2008
2009 pc->p->cfg.fp.high_map = (mid / 4);
2010 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
2011 } else {
2012 /* vertex program */
2013 for (i = 0; i < pc->attr_nr * 4; i++) {
2014 pc->p->cfg.vp.attr[aid / 32] |=
2015 (1 << (aid % 32));
2016 pc->attr[i].type = P_ATTR;
2017 pc->attr[i].hw = aid++;
2018 pc->attr[i].index = i / 4;
2019 }
2020 }
2021 }
2022
2023 if (pc->result_nr) {
2024 int rid = 0;
2025
2026 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
2027 if (!pc->result)
2028 goto out_err;
2029
2030 for (i = 0; i < pc->result_nr; i++) {
2031 for (c = 0; c < 4; c++) {
2032 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2033 pc->result[i*4+c].type = P_TEMP;
2034 pc->result[i*4+c].hw = -1;
2035 pc->result[i*4+c].rhw = (i == depr) ?
2036 -1 : rid++;
2037 } else {
2038 pc->result[i*4+c].type = P_RESULT;
2039 pc->result[i*4+c].hw = rid++;
2040 }
2041 pc->result[i*4+c].index = i;
2042 }
2043
2044 if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2045 depr != 0xffff) {
2046 pc->result[depr * 4 + 2].rhw =
2047 (pc->result_nr - 1) * 4;
2048 }
2049 }
2050 }
2051
2052 if (pc->param_nr) {
2053 int rid = 0;
2054
2055 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2056 if (!pc->param)
2057 goto out_err;
2058
2059 for (i = 0; i < pc->param_nr; i++) {
2060 for (c = 0; c < 4; c++) {
2061 pc->param[i*4+c].type = P_CONST;
2062 pc->param[i*4+c].hw = rid++;
2063 pc->param[i*4+c].index = i;
2064 }
2065 }
2066 }
2067
2068 if (pc->immd_nr) {
2069 int rid = 0;
2070
2071 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2072 if (!pc->immd)
2073 goto out_err;
2074
2075 for (i = 0; i < pc->immd_nr; i++) {
2076 for (c = 0; c < 4; c++) {
2077 pc->immd[i*4+c].type = P_IMMD;
2078 pc->immd[i*4+c].hw = rid++;
2079 pc->immd[i*4+c].index = i;
2080 }
2081 }
2082 }
2083
2084 ret = TRUE;
2085 out_err:
2086 if (r_usage[0])
2087 FREE(r_usage[0]);
2088 if (r_usage[1])
2089 FREE(r_usage[1]);
2090
2091 tgsi_parse_free(&p);
2092 return ret;
2093 }
2094
2095 static void
2096 free_nv50_pc(struct nv50_pc *pc)
2097 {
2098 if (pc->immd)
2099 FREE(pc->immd);
2100 if (pc->param)
2101 FREE(pc->param);
2102 if (pc->result)
2103 FREE(pc->result);
2104 if (pc->attr)
2105 FREE(pc->attr);
2106 if (pc->temp)
2107 FREE(pc->temp);
2108
2109 FREE(pc);
2110 }
2111
2112 static boolean
2113 nv50_program_tx(struct nv50_program *p)
2114 {
2115 struct tgsi_parse_context parse;
2116 struct nv50_pc *pc;
2117 unsigned k;
2118 boolean ret;
2119
2120 pc = CALLOC_STRUCT(nv50_pc);
2121 if (!pc)
2122 return FALSE;
2123 pc->p = p;
2124 pc->p->cfg.high_temp = 4;
2125
2126 ret = nv50_program_tx_prep(pc);
2127 if (ret == FALSE)
2128 goto out_cleanup;
2129
2130 tgsi_parse_init(&parse, pc->p->pipe.tokens);
2131 while (!tgsi_parse_end_of_tokens(&parse)) {
2132 const union tgsi_full_token *tok = &parse.FullToken;
2133
2134 /* don't allow half insn/immd on first and last instruction */
2135 pc->allow32 = TRUE;
2136 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2137 pc->allow32 = FALSE;
2138
2139 tgsi_parse_token(&parse);
2140
2141 switch (tok->Token.Type) {
2142 case TGSI_TOKEN_TYPE_INSTRUCTION:
2143 ++pc->insn_cur;
2144 ret = nv50_program_tx_insn(pc, tok);
2145 if (ret == FALSE)
2146 goto out_err;
2147 break;
2148 default:
2149 break;
2150 }
2151 }
2152
2153 if (p->type == PIPE_SHADER_FRAGMENT) {
2154 struct nv50_reg out;
2155
2156 out.type = P_TEMP;
2157 for (k = 0; k < pc->result_nr * 4; k++) {
2158 if (pc->result[k].rhw == -1)
2159 continue;
2160 if (pc->result[k].hw != pc->result[k].rhw) {
2161 out.hw = pc->result[k].rhw;
2162 emit_mov(pc, &out, &pc->result[k]);
2163 }
2164 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2165 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2166 }
2167 }
2168
2169 /* look for single half instructions and make them long */
2170 struct nv50_program_exec *e, *e_prev;
2171
2172 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2173 if (!is_long(e))
2174 k++;
2175
2176 if (!e->next || is_long(e->next)) {
2177 if (k & 1)
2178 convert_to_long(pc, e);
2179 k = 0;
2180 }
2181
2182 if (e->next)
2183 e_prev = e;
2184 }
2185
2186 if (!is_long(pc->p->exec_tail)) {
2187 /* this may occur if moving FP results */
2188 assert(e_prev && !is_long(e_prev));
2189 convert_to_long(pc, e_prev);
2190 convert_to_long(pc, pc->p->exec_tail);
2191 }
2192
2193 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2194 pc->p->exec_tail->inst[1] |= 0x00000001;
2195
2196 p->param_nr = pc->param_nr * 4;
2197 p->immd_nr = pc->immd_nr * 4;
2198 p->immd = pc->immd_buf;
2199
2200 out_err:
2201 tgsi_parse_free(&parse);
2202
2203 out_cleanup:
2204 free_nv50_pc(pc);
2205 return ret;
2206 }
2207
2208 static void
2209 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2210 {
2211 if (nv50_program_tx(p) == FALSE)
2212 assert(0);
2213 p->translated = TRUE;
2214 }
2215
2216 static void
2217 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2218 unsigned start, unsigned count, unsigned cbuf)
2219 {
2220 struct nouveau_channel *chan = nv50->screen->base.channel;
2221 struct nouveau_grobj *tesla = nv50->screen->tesla;
2222
2223 while (count) {
2224 unsigned nr = count > 2047 ? 2047 : count;
2225
2226 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2227 OUT_RING (chan, (cbuf << 0) | (start << 8));
2228 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2229 OUT_RINGp (chan, map, nr);
2230
2231 map += nr;
2232 start += nr;
2233 count -= nr;
2234 }
2235 }
2236
2237 static void
2238 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2239 {
2240 struct pipe_screen *pscreen = nv50->pipe.screen;
2241
2242 if (!p->data[0] && p->immd_nr) {
2243 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2244
2245 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2246 while (heap->next && heap->size < p->immd_nr) {
2247 struct nv50_program *evict = heap->next->priv;
2248 nouveau_resource_free(&evict->data[0]);
2249 }
2250
2251 if (nouveau_resource_alloc(heap, p->immd_nr, p,
2252 &p->data[0]))
2253 assert(0);
2254 }
2255
2256 /* immediates only need to be uploaded again when freed */
2257 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2258 p->immd_nr, NV50_CB_PMISC);
2259 }
2260
2261 if (!p->data[1] && p->param_nr) {
2262 struct nouveau_resource *heap =
2263 nv50->screen->parm_heap[p->type];
2264
2265 if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
2266 while (heap->next && heap->size < p->param_nr) {
2267 struct nv50_program *evict = heap->next->priv;
2268 nouveau_resource_free(&evict->data[1]);
2269 }
2270
2271 if (nouveau_resource_alloc(heap, p->param_nr, p,
2272 &p->data[1]))
2273 assert(0);
2274 }
2275 }
2276
2277 if (p->param_nr) {
2278 unsigned cbuf = NV50_CB_PVP;
2279 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2280 PIPE_BUFFER_USAGE_CPU_READ);
2281 if (p->type == PIPE_SHADER_FRAGMENT)
2282 cbuf = NV50_CB_PFP;
2283 nv50_program_upload_data(nv50, map, p->data[1]->start,
2284 p->param_nr, cbuf);
2285 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2286 }
2287 }
2288
2289 static void
2290 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2291 {
2292 struct nouveau_channel *chan = nv50->screen->base.channel;
2293 struct nouveau_grobj *tesla = nv50->screen->tesla;
2294 struct nv50_program_exec *e;
2295 struct nouveau_stateobj *so;
2296 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2297 unsigned start, count, *up, *ptr;
2298 boolean upload = FALSE;
2299
2300 if (!p->bo) {
2301 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2302 p->exec_size * 4, &p->bo);
2303 upload = TRUE;
2304 }
2305
2306 if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2307 (p->data[1] && p->data[1]->start != p->data_start[1])) {
2308 for (e = p->exec_head; e; e = e->next) {
2309 unsigned ei, ci, bs;
2310
2311 if (e->param.index < 0)
2312 continue;
2313 bs = (e->inst[1] >> 22) & 0x07;
2314 assert(bs < 2);
2315 ei = e->param.shift >> 5;
2316 ci = e->param.index + p->data[bs]->start;
2317
2318 e->inst[ei] &= ~e->param.mask;
2319 e->inst[ei] |= (ci << e->param.shift);
2320 }
2321
2322 if (p->data[0])
2323 p->data_start[0] = p->data[0]->start;
2324 if (p->data[1])
2325 p->data_start[1] = p->data[1]->start;
2326
2327 upload = TRUE;
2328 }
2329
2330 if (!upload)
2331 return;
2332
2333 #ifdef NV50_PROGRAM_DUMP
2334 NOUVEAU_ERR("-------\n");
2335 for (e = p->exec_head; e; e = e->next) {
2336 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2337 if (is_long(e))
2338 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2339 }
2340 #endif
2341
2342 up = ptr = MALLOC(p->exec_size * 4);
2343 for (e = p->exec_head; e; e = e->next) {
2344 *(ptr++) = e->inst[0];
2345 if (is_long(e))
2346 *(ptr++) = e->inst[1];
2347 }
2348
2349 so = so_new(4,2);
2350 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2351 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2352 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2353 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2354
2355 start = 0; count = p->exec_size;
2356 while (count) {
2357 struct nouveau_channel *chan = nv50->screen->base.channel;
2358 unsigned nr;
2359
2360 so_emit(chan, so);
2361
2362 nr = MIN2(count, 2047);
2363 nr = MIN2(chan->pushbuf->remaining, nr);
2364 if (chan->pushbuf->remaining < (nr + 3)) {
2365 FIRE_RING(chan);
2366 continue;
2367 }
2368
2369 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2370 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2371 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2372 OUT_RINGp (chan, up + start, nr);
2373
2374 start += nr;
2375 count -= nr;
2376 }
2377
2378 FREE(up);
2379 so_ref(NULL, &so);
2380 }
2381
2382 void
2383 nv50_vertprog_validate(struct nv50_context *nv50)
2384 {
2385 struct nouveau_grobj *tesla = nv50->screen->tesla;
2386 struct nv50_program *p = nv50->vertprog;
2387 struct nouveau_stateobj *so;
2388
2389 if (!p->translated) {
2390 nv50_program_validate(nv50, p);
2391 if (!p->translated)
2392 assert(0);
2393 }
2394
2395 nv50_program_validate_data(nv50, p);
2396 nv50_program_validate_code(nv50, p);
2397
2398 so = so_new(13, 2);
2399 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2400 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2401 NOUVEAU_BO_HIGH, 0, 0);
2402 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2403 NOUVEAU_BO_LOW, 0, 0);
2404 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2405 so_data (so, p->cfg.vp.attr[0]);
2406 so_data (so, p->cfg.vp.attr[1]);
2407 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2408 so_data (so, p->cfg.high_result);
2409 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2410 so_data (so, p->cfg.high_result); //8);
2411 so_data (so, p->cfg.high_temp);
2412 so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2413 so_data (so, 0); /* program start offset */
2414 so_ref(so, &nv50->state.vertprog);
2415 so_ref(NULL, &so);
2416 }
2417
2418 void
2419 nv50_fragprog_validate(struct nv50_context *nv50)
2420 {
2421 struct nouveau_grobj *tesla = nv50->screen->tesla;
2422 struct nv50_program *p = nv50->fragprog;
2423 struct nouveau_stateobj *so;
2424 unsigned i;
2425
2426 if (!p->translated) {
2427 nv50_program_validate(nv50, p);
2428 if (!p->translated)
2429 assert(0);
2430 }
2431
2432 nv50_program_validate_data(nv50, p);
2433 nv50_program_validate_code(nv50, p);
2434
2435 so = so_new(64, 2);
2436 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2437 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2438 NOUVEAU_BO_HIGH, 0, 0);
2439 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2440 NOUVEAU_BO_LOW, 0, 0);
2441 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2442 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2443 so_data (so, 0x00000004);
2444 so_data (so, 0x00000000);
2445 so_data (so, 0x00000000);
2446 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
2447 for (i = 0; i < p->cfg.fp.high_map; i++)
2448 so_data(so, p->cfg.fp.map[i]);
2449 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
2450 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2451 so_data (so, p->cfg.high_temp);
2452 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2453 so_data (so, p->cfg.high_result);
2454 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2455 so_data (so, p->cfg.fp.regs[2]);
2456 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2457 so_data (so, p->cfg.fp.regs[3]);
2458 so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2459 so_data (so, 0); /* program start offset */
2460 so_ref(so, &nv50->state.fragprog);
2461 so_ref(NULL, &so);
2462 }
2463
2464 void
2465 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2466 {
2467 while (p->exec_head) {
2468 struct nv50_program_exec *e = p->exec_head;
2469
2470 p->exec_head = e->next;
2471 FREE(e);
2472 }
2473 p->exec_tail = NULL;
2474 p->exec_size = 0;
2475
2476 nouveau_bo_ref(NULL, &p->bo);
2477
2478 nouveau_resource_free(&p->data[0]);
2479 nouveau_resource_free(&p->data[1]);
2480
2481 p->translated = 0;
2482 }