nv50: negate sources directly where supported
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
/* Number of entries in nv50_pc::r_temp, i.e. the upper bound on the
 * hardware temporary slots the allocators in this file will hand out.
 */
#define NV50_SU_MAX_TEMP 64
35 //#define NV50_PROGRAM_DUMP
36
37 /* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * - Fuck it off, introduce a way to negate args for ops that
41 * support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * FP attr/result assignment - how?
58 * attrib
59 * - 0x16bc maps vp output onto fp hpos
60 * - 0x16c0 maps vp output onto fp col0
61 * result
62 * - colr always 0-3
63 * - depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * - 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * - XX == FP high something
75 */
/* Describes a single scalar register component as seen by the translator. */
struct nv50_reg {
	/* register file the component belongs to */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	int index; /* -1 for registers created by the allocators in this file */

	int hw; /* assigned hw slot; -1 means a P_TEMP has no slot yet.
		 * For P_IMMD this indexes nv50_pc::immd_buf (see set_immd). */
	int neg; /* non-zero: emitters should apply a negate to this source */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
/* Per-translation state shared by all emit and allocation helpers. */
struct nv50_pc {
	/* program being built; instructions are appended to its exec list */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is the register currently owning hw temp slot i, or NULL */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* each array is indexed as (tgsi register index * 4 + component) */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* immediate values, stored as immd_nr groups of 4 floats */
	float *immd_buf;
	int immd_nr;

	/* scratch registers handed out by temp_temp() and released in bulk
	 * by kill_temp_temp()
	 */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* INTERP_* flags, looked up by nv50_reg::index in emit_tex() */
	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	/* when FALSE, emit_mul/emit_add force the long encoding and
	 * emit_mov will not inline immediates
	 */
	boolean allow32;
};
126
127 static void
128 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129 {
130 int i = 0;
131
132 if (reg->type == P_RESULT) {
133 if (pc->p->cfg.high_result < (reg->hw + 1))
134 pc->p->cfg.high_result = reg->hw + 1;
135 }
136
137 if (reg->type != P_TEMP)
138 return;
139
140 if (reg->hw >= 0) {
141 /*XXX: do this here too to catch FP temp-as-attr usage..
142 * not clean, but works */
143 if (pc->p->cfg.high_temp < (reg->hw + 1))
144 pc->p->cfg.high_temp = reg->hw + 1;
145 return;
146 }
147
148 if (reg->rhw != -1) {
149 /* try to allocate temporary with index rhw first */
150 if (!(pc->r_temp[reg->rhw])) {
151 pc->r_temp[reg->rhw] = reg;
152 reg->hw = reg->rhw;
153 if (pc->p->cfg.high_temp < (reg->rhw + 1))
154 pc->p->cfg.high_temp = reg->rhw + 1;
155 return;
156 }
157 /* make sure we don't get things like $r0 needs to go
158 * in $r1 and $r1 in $r0
159 */
160 i = pc->result_nr * 4;
161 }
162
163 for (; i < NV50_SU_MAX_TEMP; i++) {
164 if (!(pc->r_temp[i])) {
165 pc->r_temp[i] = reg;
166 reg->hw = i;
167 if (pc->p->cfg.high_temp < (i + 1))
168 pc->p->cfg.high_temp = i + 1;
169 return;
170 }
171 }
172
173 assert(0);
174 }
175
176 static struct nv50_reg *
177 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178 {
179 struct nv50_reg *r;
180 int i;
181
182 if (dst && dst->type == P_TEMP && dst->hw == -1)
183 return dst;
184
185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186 if (!pc->r_temp[i]) {
187 r = CALLOC_STRUCT(nv50_reg);
188 r->type = P_TEMP;
189 r->index = -1;
190 r->hw = i;
191 r->rhw = -1;
192 pc->r_temp[i] = r;
193 return r;
194 }
195 }
196
197 assert(0);
198 return NULL;
199 }
200
201 /* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204 static void
205 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206 {
207 assert(src->index == -1 && src->hw != -1);
208
209 if (dst->hw != -1)
210 pc->r_temp[dst->hw] = NULL;
211 pc->r_temp[src->hw] = dst;
212 dst->hw = src->hw;
213
214 FREE(src);
215 }
216
217 /* release the hardware resource held by r */
218 static void
219 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
220 {
221 assert(r->type == P_TEMP);
222 if (r->hw == -1)
223 return;
224
225 assert(pc->r_temp[r->hw] == r);
226 pc->r_temp[r->hw] = NULL;
227
228 r->acc = 0;
229 if (r->index == -1)
230 FREE(r);
231 }
232
233 static void
234 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
235 {
236 if (r->index == -1) {
237 unsigned hw = r->hw;
238
239 FREE(pc->r_temp[hw]);
240 pc->r_temp[hw] = NULL;
241 }
242 }
243
244 static int
245 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
246 {
247 int i;
248
249 if ((idx + 4) >= NV50_SU_MAX_TEMP)
250 return 1;
251
252 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
253 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
254 return alloc_temp4(pc, dst, idx + 1);
255
256 for (i = 0; i < 4; i++) {
257 dst[i] = CALLOC_STRUCT(nv50_reg);
258 dst[i]->type = P_TEMP;
259 dst[i]->index = -1;
260 dst[i]->hw = idx + i;
261 pc->r_temp[idx + i] = dst[i];
262 }
263
264 return 0;
265 }
266
/* Release all four registers of a group via free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c = 0;

	while (c < 4) {
		free_temp(pc, reg[c]);
		c++;
	}
}
275
276 static struct nv50_reg *
277 temp_temp(struct nv50_pc *pc)
278 {
279 if (pc->temp_temp_nr >= 16)
280 assert(0);
281
282 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
283 return pc->temp_temp[pc->temp_temp_nr++];
284 }
285
286 static void
287 kill_temp_temp(struct nv50_pc *pc)
288 {
289 int i;
290
291 for (i = 0; i < pc->temp_temp_nr; i++)
292 free_temp(pc, pc->temp_temp[i]);
293 pc->temp_temp_nr = 0;
294 }
295
296 static int
297 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
298 {
299 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
300 (pc->immd_nr + 1) * 4 * sizeof(float));
301 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
302 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
303 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
304 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
305
306 return pc->immd_nr++;
307 }
308
309 static struct nv50_reg *
310 alloc_immd(struct nv50_pc *pc, float f)
311 {
312 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
313 unsigned hw;
314
315 for (hw = 0; hw < pc->immd_nr * 4; hw++)
316 if (pc->immd_buf[hw] == f)
317 break;
318
319 if (hw == pc->immd_nr * 4)
320 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
321
322 r->type = P_IMMD;
323 r->hw = hw;
324 r->index = -1;
325 return r;
326 }
327
328 static struct nv50_program_exec *
329 exec(struct nv50_pc *pc)
330 {
331 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
332
333 e->param.index = -1;
334 return e;
335 }
336
337 static void
338 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
339 {
340 struct nv50_program *p = pc->p;
341
342 if (p->exec_tail)
343 p->exec_tail->next = e;
344 if (!p->exec_head)
345 p->exec_head = e;
346 p->exec_tail = e;
347 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
348 }
349
/* Forward declaration: set_pred() and set_pred_wr() call set_long(),
 * which is defined after them and calls both of them in turn.
 */
static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
351
352 static boolean
353 is_long(struct nv50_program_exec *e)
354 {
355 if (e->inst[0] & 1)
356 return TRUE;
357 return FALSE;
358 }
359
360 static boolean
361 is_immd(struct nv50_program_exec *e)
362 {
363 if (is_long(e) && (e->inst[1] & 3) == 3)
364 return TRUE;
365 return FALSE;
366 }
367
368 static INLINE void
369 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
370 struct nv50_program_exec *e)
371 {
372 set_long(pc, e);
373 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
374 e->inst[1] |= (pred << 7) | (idx << 12);
375 }
376
377 static INLINE void
378 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
379 struct nv50_program_exec *e)
380 {
381 set_long(pc, e);
382 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
383 e->inst[1] |= (idx << 4) | (on << 6);
384 }
385
386 static INLINE void
387 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
388 {
389 if (is_long(e))
390 return;
391
392 e->inst[0] |= 1;
393 set_pred(pc, 0xf, 0, e);
394 set_pred_wr(pc, 0, 0, e);
395 }
396
397 static INLINE void
398 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
399 {
400 if (dst->type == P_RESULT) {
401 set_long(pc, e);
402 e->inst[1] |= 0x00000008;
403 }
404
405 alloc_reg(pc, dst);
406 e->inst[0] |= (dst->hw << 2);
407 }
408
409 static INLINE void
410 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
411 {
412 float f = pc->immd_buf[imm->hw];
413 unsigned val = fui(imm->neg ? -f : f);
414
415 set_long(pc, e);
416 /*XXX: can't be predicated - bits overlap.. catch cases where both
417 * are required and avoid them. */
418 set_pred(pc, 0, 0, e);
419 set_pred_wr(pc, 0, 0, e);
420
421 e->inst[1] |= 0x00000002 | 0x00000001;
422 e->inst[0] |= (val & 0x3f) << 16;
423 e->inst[1] |= (val >> 6) << 2;
424 }
425
426
/* Flags for the 'mode' argument of emit_interp(); they are tested with
 * bitwise AND there, so INTERP_LINEAR (0) simply means no flag is set.
 */
#define INTERP_LINEAR 0
#define INTERP_FLAT 1
#define INTERP_PERSPECTIVE 2
#define INTERP_CENTROID 4
431
432 /* interpolant index has been stored in dst->rhw */
433 static void
434 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
435 unsigned mode)
436 {
437 assert(dst->rhw != -1);
438 struct nv50_program_exec *e = exec(pc);
439
440 e->inst[0] |= 0x80000000;
441 set_dst(pc, dst, e);
442 e->inst[0] |= (dst->rhw << 16);
443
444 if (mode & INTERP_FLAT) {
445 e->inst[0] |= (1 << 8);
446 } else {
447 if (mode & INTERP_PERSPECTIVE) {
448 e->inst[0] |= (1 << 25);
449 alloc_reg(pc, iv);
450 e->inst[0] |= (iv->hw << 9);
451 }
452
453 if (mode & INTERP_CENTROID)
454 e->inst[0] |= (1 << 24);
455 }
456
457 emit(pc, e);
458 }
459
460 static void
461 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
462 struct nv50_program_exec *e)
463 {
464 set_long(pc, e);
465
466 e->param.index = src->hw;
467 e->param.shift = s;
468 e->param.mask = m << (s % 32);
469
470 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
471 }
472
473 static void
474 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
475 {
476 struct nv50_program_exec *e = exec(pc);
477
478 e->inst[0] |= 0x10000000;
479
480 set_dst(pc, dst, e);
481
482 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
483 set_immd(pc, src, e);
484 /*XXX: 32-bit, but steals part of "half" reg space - need to
485 * catch and handle this case if/when we do half-regs
486 */
487 } else
488 if (src->type == P_IMMD || src->type == P_CONST) {
489 set_long(pc, e);
490 set_data(pc, src, 0x7f, 9, e);
491 e->inst[1] |= 0x20000000; /* src0 const? */
492 } else {
493 if (src->type == P_ATTR) {
494 set_long(pc, e);
495 e->inst[1] |= 0x00200000;
496 }
497
498 alloc_reg(pc, src);
499 e->inst[0] |= (src->hw << 9);
500 }
501
502 if (is_long(e) && !is_immd(e)) {
503 e->inst[1] |= 0x04000000; /* 32-bit */
504 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
505 if (!(e->inst[1] & 0x20000000))
506 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
507 } else
508 e->inst[0] |= 0x00008000;
509
510 emit(pc, e);
511 }
512
513 static INLINE void
514 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
515 {
516 struct nv50_reg *imm = alloc_immd(pc, f);
517 emit_mov(pc, dst, imm);
518 FREE(imm);
519 }
520
521 static boolean
522 check_swap_src_0_1(struct nv50_pc *pc,
523 struct nv50_reg **s0, struct nv50_reg **s1)
524 {
525 struct nv50_reg *src0 = *s0, *src1 = *s1;
526
527 if (src0->type == P_CONST) {
528 if (src1->type != P_CONST) {
529 *s0 = src1;
530 *s1 = src0;
531 return TRUE;
532 }
533 } else
534 if (src1->type == P_ATTR) {
535 if (src0->type != P_ATTR) {
536 *s0 = src1;
537 *s1 = src0;
538 return TRUE;
539 }
540 }
541
542 return FALSE;
543 }
544
545 static void
546 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
547 {
548 if (src->type == P_ATTR) {
549 set_long(pc, e);
550 e->inst[1] |= 0x00200000;
551 } else
552 if (src->type == P_CONST || src->type == P_IMMD) {
553 struct nv50_reg *temp = temp_temp(pc);
554
555 emit_mov(pc, temp, src);
556 src = temp;
557 }
558
559 alloc_reg(pc, src);
560 e->inst[0] |= (src->hw << 9);
561 }
562
563 static void
564 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
565 {
566 if (src->type == P_ATTR) {
567 struct nv50_reg *temp = temp_temp(pc);
568
569 emit_mov(pc, temp, src);
570 src = temp;
571 } else
572 if (src->type == P_CONST || src->type == P_IMMD) {
573 assert(!(e->inst[0] & 0x00800000));
574 if (e->inst[0] & 0x01000000) {
575 struct nv50_reg *temp = temp_temp(pc);
576
577 emit_mov(pc, temp, src);
578 src = temp;
579 } else {
580 set_data(pc, src, 0x7f, 16, e);
581 e->inst[0] |= 0x00800000;
582 }
583 }
584
585 alloc_reg(pc, src);
586 e->inst[0] |= (src->hw << 16);
587 }
588
589 static void
590 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
591 {
592 set_long(pc, e);
593
594 if (src->type == P_ATTR) {
595 struct nv50_reg *temp = temp_temp(pc);
596
597 emit_mov(pc, temp, src);
598 src = temp;
599 } else
600 if (src->type == P_CONST || src->type == P_IMMD) {
601 assert(!(e->inst[0] & 0x01000000));
602 if (e->inst[0] & 0x00800000) {
603 struct nv50_reg *temp = temp_temp(pc);
604
605 emit_mov(pc, temp, src);
606 src = temp;
607 } else {
608 set_data(pc, src, 0x7f, 32+14, e);
609 e->inst[0] |= 0x01000000;
610 }
611 }
612
613 alloc_reg(pc, src);
614 e->inst[1] |= (src->hw << 14);
615 }
616
617 static void
618 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
619 struct nv50_reg *src1)
620 {
621 struct nv50_program_exec *e = exec(pc);
622
623 e->inst[0] |= 0xc0000000;
624
625 if (!pc->allow32)
626 set_long(pc, e);
627
628 check_swap_src_0_1(pc, &src0, &src1);
629 set_dst(pc, dst, e);
630 set_src_0(pc, src0, e);
631 if (src1->type == P_IMMD && !is_long(e)) {
632 if (src0->neg)
633 e->inst[0] |= 0x00008000;
634 set_immd(pc, src1, e);
635 } else {
636 set_src_1(pc, src1, e);
637 if (src0->neg ^ src1->neg) {
638 if (is_long(e))
639 e->inst[1] |= 0x08000000;
640 else
641 e->inst[0] |= 0x00008000;
642 }
643 }
644
645 emit(pc, e);
646 }
647
648 static void
649 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
650 struct nv50_reg *src0, struct nv50_reg *src1)
651 {
652 struct nv50_program_exec *e = exec(pc);
653
654 e->inst[0] |= 0xb0000000;
655
656 check_swap_src_0_1(pc, &src0, &src1);
657
658 if (!pc->allow32 || src0->neg || src1->neg) {
659 set_long(pc, e);
660 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
661 }
662
663 set_dst(pc, dst, e);
664 set_src_0(pc, src0, e);
665 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
666 set_src_2(pc, src1, e);
667 else
668 if (src1->type == P_IMMD)
669 set_immd(pc, src1, e);
670 else
671 set_src_1(pc, src1, e);
672
673 emit(pc, e);
674 }
675
676 static void
677 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
678 struct nv50_reg *src0, struct nv50_reg *src1)
679 {
680 struct nv50_program_exec *e = exec(pc);
681
682 set_long(pc, e);
683 e->inst[0] |= 0xb0000000;
684 e->inst[1] |= (sub << 29);
685
686 check_swap_src_0_1(pc, &src0, &src1);
687 set_dst(pc, dst, e);
688 set_src_0(pc, src0, e);
689 set_src_1(pc, src1, e);
690
691 emit(pc, e);
692 }
693
694 static INLINE void
695 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
696 struct nv50_reg *src1)
697 {
698 src1->neg ^= 1;
699 emit_add(pc, dst, src0, src1);
700 src1->neg ^= 1;
701 }
702
703 static void
704 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
705 struct nv50_reg *src1, struct nv50_reg *src2)
706 {
707 struct nv50_program_exec *e = exec(pc);
708
709 e->inst[0] |= 0xe0000000;
710
711 check_swap_src_0_1(pc, &src0, &src1);
712 set_dst(pc, dst, e);
713 set_src_0(pc, src0, e);
714 set_src_1(pc, src1, e);
715 set_src_2(pc, src2, e);
716
717 if (src0->neg ^ src1->neg)
718 e->inst[1] |= 0x04000000;
719 if (src2->neg)
720 e->inst[1] |= 0x08000000;
721
722 emit(pc, e);
723 }
724
725 static INLINE void
726 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
727 struct nv50_reg *src1, struct nv50_reg *src2)
728 {
729 src2->neg ^= 1;
730 emit_mad(pc, dst, src0, src1, src2);
731 src2->neg ^= 1;
732 }
733
734 static void
735 emit_flop(struct nv50_pc *pc, unsigned sub,
736 struct nv50_reg *dst, struct nv50_reg *src)
737 {
738 struct nv50_program_exec *e = exec(pc);
739
740 e->inst[0] |= 0x90000000;
741 if (sub) {
742 set_long(pc, e);
743 e->inst[1] |= (sub << 29);
744 }
745
746 set_dst(pc, dst, e);
747 set_src_0(pc, src, e);
748
749 emit(pc, e);
750 }
751
752 static void
753 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
754 {
755 struct nv50_program_exec *e = exec(pc);
756
757 e->inst[0] |= 0xb0000000;
758
759 set_dst(pc, dst, e);
760 set_src_0(pc, src, e);
761 set_long(pc, e);
762 e->inst[1] |= (6 << 29) | 0x00004000;
763
764 emit(pc, e);
765 }
766
767 static void
768 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
769 {
770 struct nv50_program_exec *e = exec(pc);
771
772 e->inst[0] |= 0xb0000000;
773
774 set_dst(pc, dst, e);
775 set_src_0(pc, src, e);
776 set_long(pc, e);
777 e->inst[1] |= (6 << 29);
778
779 emit(pc, e);
780 }
781
/* Values for the 'cop' argument of emit_cvt(), which places them in
 * inst[1] starting at bit 16.
 */
#define CVTOP_RN 0x01
#define CVTOP_FLOOR 0x03
#define CVTOP_CEIL 0x05
#define CVTOP_TRUNC 0x07
#define CVTOP_SAT 0x08
#define CVTOP_ABS 0x10

/* Values for the 'fmt' argument of emit_cvt(), which places them in
 * inst[1] starting at bit 24.
 */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_F32_U32 0x64
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_F32_F32_ROP 0xcc
795
796 static void
797 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
798 int wp, unsigned cop, unsigned fmt)
799 {
800 struct nv50_program_exec *e;
801
802 e = exec(pc);
803 set_long(pc, e);
804
805 e->inst[0] |= 0xa0000000;
806 e->inst[1] |= 0x00004000;
807 e->inst[1] |= (cop << 16);
808 e->inst[1] |= (fmt << 24);
809 set_src_0(pc, src, e);
810
811 if (wp >= 0)
812 set_pred_wr(pc, 1, wp, e);
813
814 if (dst)
815 set_dst(pc, dst, e);
816 else {
817 e->inst[0] |= 0x000001fc;
818 e->inst[1] |= 0x00000008;
819 }
820
821 emit(pc, e);
822 }
823
824 static void
825 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
826 struct nv50_reg *src0, struct nv50_reg *src1)
827 {
828 struct nv50_program_exec *e = exec(pc);
829 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
830 struct nv50_reg *rdst;
831
832 assert(c_op <= 7);
833 if (check_swap_src_0_1(pc, &src0, &src1))
834 c_op = inv_cop[c_op];
835
836 rdst = dst;
837 if (dst->type != P_TEMP)
838 dst = alloc_temp(pc, NULL);
839
840 /* set.u32 */
841 set_long(pc, e);
842 e->inst[0] |= 0xb0000000;
843 e->inst[1] |= (3 << 29);
844 e->inst[1] |= (c_op << 14);
845 /*XXX: breaks things, .u32 by default?
846 * decuda will disasm as .u16 and use .lo/.hi regs, but this
847 * doesn't seem to match what the hw actually does.
848 inst[1] |= 0x04000000; << breaks things.. .u32 by default?
849 */
850 set_dst(pc, dst, e);
851 set_src_0(pc, src0, e);
852 set_src_1(pc, src1, e);
853 emit(pc, e);
854
855 /* cvt.f32.u32 */
856 e = exec(pc);
857 e->inst[0] = 0xa0000001;
858 e->inst[1] = 0x64014780;
859 set_dst(pc, rdst, e);
860 set_src_0(pc, dst, e);
861 emit(pc, e);
862
863 if (dst != rdst)
864 free_temp(pc, dst);
865 }
866
867 static INLINE void
868 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
869 {
870 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
871 }
872
873 static void
874 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
875 struct nv50_reg *v, struct nv50_reg *e)
876 {
877 struct nv50_reg *temp = alloc_temp(pc, NULL);
878
879 emit_flop(pc, 3, temp, v);
880 emit_mul(pc, temp, temp, e);
881 emit_preex2(pc, temp, temp);
882 emit_flop(pc, 6, dst, temp);
883
884 free_temp(pc, temp);
885 }
886
887 static INLINE void
888 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
889 {
890 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
891 }
892
893 static void
894 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
895 struct nv50_reg **src)
896 {
897 struct nv50_reg *one = alloc_immd(pc, 1.0);
898 struct nv50_reg *zero = alloc_immd(pc, 0.0);
899 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
900 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
901 struct nv50_reg *tmp[4];
902 boolean allow32 = pc->allow32;
903
904 pc->allow32 = FALSE;
905
906 if (mask & (3 << 1)) {
907 tmp[0] = alloc_temp(pc, NULL);
908 emit_minmax(pc, 4, tmp[0], src[0], zero);
909 }
910
911 if (mask & (1 << 2)) {
912 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
913
914 tmp[1] = temp_temp(pc);
915 emit_minmax(pc, 4, tmp[1], src[1], zero);
916
917 tmp[3] = temp_temp(pc);
918 emit_minmax(pc, 4, tmp[3], src[3], neg128);
919 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
920
921 emit_pow(pc, dst[2], tmp[1], tmp[3]);
922 emit_mov(pc, dst[2], zero);
923 set_pred(pc, 3, 0, pc->p->exec_tail);
924 }
925
926 if (mask & (1 << 1))
927 assimilate_temp(pc, dst[1], tmp[0]);
928 else
929 if (mask & (1 << 2))
930 free_temp(pc, tmp[0]);
931
932 pc->allow32 = allow32;
933
934 /* do this last, in case src[i,j] == dst[0,3] */
935 if (mask & (1 << 0))
936 emit_mov(pc, dst[0], one);
937
938 if (mask & (1 << 3))
939 emit_mov(pc, dst[3], one);
940
941 FREE(pos128);
942 FREE(neg128);
943 FREE(zero);
944 FREE(one);
945 }
946
947 static void
948 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
949 {
950 struct nv50_program_exec *e = exec(pc);
951
952 set_long(pc, e);
953 e->inst[0] |= 0xa0000000; /* delta */
954 e->inst[1] |= (7 << 29); /* delta */
955 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
956 e->inst[1] |= (1 << 14); /* src .f32 */
957 set_dst(pc, dst, e);
958 set_src_0(pc, src, e);
959
960 emit(pc, e);
961 }
962
963 static void
964 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
965 {
966 struct nv50_program_exec *e;
967 const int r_pred = 1;
968
969 /* Sets predicate reg ? */
970 e = exec(pc);
971 e->inst[0] = 0xa00001fd;
972 e->inst[1] = 0xc4014788;
973 set_src_0(pc, src, e);
974 set_pred_wr(pc, 1, r_pred, e);
975 if (src->neg)
976 e->inst[1] |= 0x20000000;
977 emit(pc, e);
978
979 /* This is probably KILP */
980 e = exec(pc);
981 e->inst[0] = 0x000001fe;
982 set_long(pc, e);
983 set_pred(pc, 1 /* LT? */, r_pred, e);
984 emit(pc, e);
985 }
986
987 static void
988 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
989 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
990 {
991 struct nv50_reg *temp, *t[4];
992 struct nv50_program_exec *e;
993
994 unsigned c, mode, dim;
995
996 switch (type) {
997 case TGSI_TEXTURE_1D:
998 dim = 1;
999 break;
1000 case TGSI_TEXTURE_UNKNOWN:
1001 case TGSI_TEXTURE_2D:
1002 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1003 case TGSI_TEXTURE_RECT:
1004 dim = 2;
1005 break;
1006 case TGSI_TEXTURE_3D:
1007 case TGSI_TEXTURE_CUBE:
1008 case TGSI_TEXTURE_SHADOW2D:
1009 case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1010 dim = 3;
1011 break;
1012 default:
1013 assert(0);
1014 break;
1015 }
1016
1017 alloc_temp4(pc, t, 0);
1018
1019 if (proj) {
1020 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1021 mode = pc->interp_mode[src[0]->index];
1022
1023 t[3]->rhw = src[3]->rhw;
1024 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1025 emit_flop(pc, 0, t[3], t[3]);
1026
1027 for (c = 0; c < dim; c++) {
1028 t[c]->rhw = src[c]->rhw;
1029 emit_interp(pc, t[c], t[3],
1030 (mode | INTERP_PERSPECTIVE));
1031 }
1032 } else {
1033 emit_flop(pc, 0, t[3], src[3]);
1034 for (c = 0; c < dim; c++)
1035 emit_mul(pc, t[c], src[c], t[3]);
1036
1037 /* XXX: for some reason the blob sometimes uses MAD:
1038 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1039 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1040 */
1041 }
1042 } else {
1043 if (type == TGSI_TEXTURE_CUBE) {
1044 temp = temp_temp(pc);
1045 emit_minmax(pc, 4, temp, src[0], src[1]);
1046 emit_minmax(pc, 4, temp, temp, src[2]);
1047 emit_flop(pc, 0, temp, temp);
1048 for (c = 0; c < 3; c++)
1049 emit_mul(pc, t[c], src[c], temp);
1050 } else {
1051 for (c = 0; c < dim; c++)
1052 emit_mov(pc, t[c], src[c]);
1053 }
1054 }
1055
1056 e = exec(pc);
1057 set_long(pc, e);
1058 e->inst[0] |= 0xf0000000;
1059 e->inst[1] |= 0x00000004;
1060 set_dst(pc, t[0], e);
1061 e->inst[0] |= (unit << 9);
1062
1063 if (dim == 2)
1064 e->inst[0] |= 0x00400000;
1065 else
1066 if (dim == 3)
1067 e->inst[0] |= 0x00800000;
1068
1069 e->inst[0] |= (mask & 0x3) << 25;
1070 e->inst[1] |= (mask & 0xc) << 12;
1071
1072 emit(pc, e);
1073
1074 #if 1
1075 if (mask & 1) emit_mov(pc, dst[0], t[0]);
1076 if (mask & 2) emit_mov(pc, dst[1], t[1]);
1077 if (mask & 4) emit_mov(pc, dst[2], t[2]);
1078 if (mask & 8) emit_mov(pc, dst[3], t[3]);
1079
1080 free_temp4(pc, t);
1081 #else
1082 /* XXX: if p.e. MUL is used directly after TEX, it would still use
1083 * the texture coordinates, not the fetched values: latency ? */
1084
1085 for (c = 0; c < 4; c++) {
1086 if (mask & (1 << c))
1087 assimilate_temp(pc, dst[c], t[c]);
1088 else
1089 free_temp(pc, t[c]);
1090 }
1091 #endif
1092 }
1093
1094 static void
1095 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1096 {
1097 unsigned q = 0, m = ~0;
1098
1099 assert(!is_long(e));
1100
1101 switch (e->inst[0] >> 28) {
1102 case 0x1:
1103 /* MOV */
1104 q = 0x0403c000;
1105 m = 0xffff7fff;
1106 break;
1107 case 0x8:
1108 /* INTERP */
1109 m = ~0x02000000;
1110 if (e->inst[0] & 0x02000000)
1111 q = 0x00020000;
1112 break;
1113 case 0x9:
1114 /* RCP */
1115 break;
1116 case 0xB:
1117 /* ADD */
1118 m = ~(127 << 16);
1119 q = ((e->inst[0] & (~m)) >> 2);
1120 break;
1121 case 0xC:
1122 /* MUL */
1123 m = ~0x00008000;
1124 q = ((e->inst[0] & (~m)) << 12);
1125 break;
1126 case 0xE:
1127 /* MAD (if src2 == dst) */
1128 q = ((e->inst[0] & 0x1fc) << 12);
1129 break;
1130 default:
1131 assert(0);
1132 break;
1133 }
1134
1135 set_long(pc, e);
1136 pc->p->exec_size++;
1137
1138 e->inst[0] &= m;
1139 e->inst[1] |= q;
1140 }
1141
1142 static boolean
1143 negate_supported(const struct tgsi_full_instruction *insn, int i)
1144 {
1145 switch (insn->Instruction.Opcode) {
1146 case TGSI_OPCODE_DP3:
1147 case TGSI_OPCODE_DP4:
1148 case TGSI_OPCODE_MUL:
1149 case TGSI_OPCODE_KIL:
1150 case TGSI_OPCODE_ADD:
1151 case TGSI_OPCODE_SUB:
1152 case TGSI_OPCODE_MAD:
1153 return TRUE;
1154 case TGSI_OPCODE_POW:
1155 return (i == 1) ? TRUE : FALSE;
1156 default:
1157 return FALSE;
1158 }
1159 }
1160
1161 static struct nv50_reg *
1162 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1163 {
1164 switch (dst->DstRegister.File) {
1165 case TGSI_FILE_TEMPORARY:
1166 return &pc->temp[dst->DstRegister.Index * 4 + c];
1167 case TGSI_FILE_OUTPUT:
1168 return &pc->result[dst->DstRegister.Index * 4 + c];
1169 case TGSI_FILE_NULL:
1170 return NULL;
1171 default:
1172 break;
1173 }
1174
1175 return NULL;
1176 }
1177
1178 static struct nv50_reg *
1179 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1180 boolean neg)
1181 {
1182 struct nv50_reg *r = NULL;
1183 struct nv50_reg *temp;
1184 unsigned sgn, c;
1185
1186 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1187
1188 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1189 switch (c) {
1190 case TGSI_EXTSWIZZLE_X:
1191 case TGSI_EXTSWIZZLE_Y:
1192 case TGSI_EXTSWIZZLE_Z:
1193 case TGSI_EXTSWIZZLE_W:
1194 switch (src->SrcRegister.File) {
1195 case TGSI_FILE_INPUT:
1196 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1197 break;
1198 case TGSI_FILE_TEMPORARY:
1199 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1200 break;
1201 case TGSI_FILE_CONSTANT:
1202 r = &pc->param[src->SrcRegister.Index * 4 + c];
1203 break;
1204 case TGSI_FILE_IMMEDIATE:
1205 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1206 break;
1207 case TGSI_FILE_SAMPLER:
1208 break;
1209 default:
1210 assert(0);
1211 break;
1212 }
1213 break;
1214 case TGSI_EXTSWIZZLE_ZERO:
1215 r = alloc_immd(pc, 0.0);
1216 return r;
1217 case TGSI_EXTSWIZZLE_ONE:
1218 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1219 return alloc_immd(pc, -1.0);
1220 return alloc_immd(pc, 1.0);
1221 default:
1222 assert(0);
1223 break;
1224 }
1225
1226 switch (sgn) {
1227 case TGSI_UTIL_SIGN_KEEP:
1228 break;
1229 case TGSI_UTIL_SIGN_CLEAR:
1230 temp = temp_temp(pc);
1231 emit_abs(pc, temp, r);
1232 r = temp;
1233 break;
1234 case TGSI_UTIL_SIGN_TOGGLE:
1235 if (neg)
1236 r->neg = 1;
1237 else {
1238 temp = temp_temp(pc);
1239 emit_neg(pc, temp, r);
1240 r = temp;
1241 }
1242 break;
1243 case TGSI_UTIL_SIGN_SET:
1244 temp = temp_temp(pc);
1245 emit_abs(pc, temp, r);
1246 if (neg)
1247 temp->neg = 1;
1248 else
1249 emit_neg(pc, temp, temp);
1250 r = temp;
1251 break;
1252 default:
1253 assert(0);
1254 break;
1255 }
1256
1257 return r;
1258 }
1259
1260 /* returns TRUE if instruction can overwrite sources before they're read */
1261 static boolean
1262 direct2dest_op(const struct tgsi_full_instruction *insn)
1263 {
1264 if (insn->Instruction.Saturate)
1265 return FALSE;
1266
1267 switch (insn->Instruction.Opcode) {
1268 case TGSI_OPCODE_COS:
1269 case TGSI_OPCODE_DP3:
1270 case TGSI_OPCODE_DP4:
1271 case TGSI_OPCODE_DPH:
1272 case TGSI_OPCODE_KIL:
1273 case TGSI_OPCODE_LIT:
1274 case TGSI_OPCODE_POW:
1275 case TGSI_OPCODE_RCP:
1276 case TGSI_OPCODE_RSQ:
1277 case TGSI_OPCODE_SCS:
1278 case TGSI_OPCODE_SIN:
1279 case TGSI_OPCODE_TEX:
1280 case TGSI_OPCODE_TXP:
1281 return FALSE;
1282 default:
1283 return TRUE;
1284 }
1285 }
1286
1287 static boolean
1288 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1289 {
1290 const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1291 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1292 unsigned mask, sat, unit;
1293 boolean assimilate = FALSE;
1294 int i, c;
1295
1296 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1297 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1298
1299 for (c = 0; c < 4; c++) {
1300 if (mask & (1 << c))
1301 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1302 else
1303 dst[c] = NULL;
1304 rdst[c] = NULL;
1305 src[0][c] = NULL;
1306 src[1][c] = NULL;
1307 src[2][c] = NULL;
1308 }
1309
1310 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1311 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1312
1313 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1314 unit = fs->SrcRegister.Index;
1315
1316 for (c = 0; c < 4; c++)
1317 src[i][c] = tgsi_src(pc, c, fs,
1318 negate_supported(inst, i));
1319 }
1320
1321 if (sat) {
1322 for (c = 0; c < 4; c++) {
1323 rdst[c] = dst[c];
1324 dst[c] = temp_temp(pc);
1325 }
1326 } else
1327 if (direct2dest_op(inst)) {
1328 for (c = 0; c < 4; c++) {
1329 if (!dst[c] || dst[c]->type != P_TEMP)
1330 continue;
1331
1332 for (i = c + 1; i < 4; i++) {
1333 if (dst[c] == src[0][i] ||
1334 dst[c] == src[1][i] ||
1335 dst[c] == src[2][i])
1336 break;
1337 }
1338 if (i == 4)
1339 continue;
1340
1341 assimilate = TRUE;
1342 rdst[c] = dst[c];
1343 dst[c] = alloc_temp(pc, NULL);
1344 }
1345 }
1346
1347 switch (inst->Instruction.Opcode) {
1348 case TGSI_OPCODE_ABS:
1349 for (c = 0; c < 4; c++) {
1350 if (!(mask & (1 << c)))
1351 continue;
1352 emit_abs(pc, dst[c], src[0][c]);
1353 }
1354 break;
1355 case TGSI_OPCODE_ADD:
1356 for (c = 0; c < 4; c++) {
1357 if (!(mask & (1 << c)))
1358 continue;
1359 emit_add(pc, dst[c], src[0][c], src[1][c]);
1360 }
1361 break;
1362 case TGSI_OPCODE_COS:
1363 temp = temp_temp(pc);
1364 emit_precossin(pc, temp, src[0][0]);
1365 emit_flop(pc, 5, temp, temp);
1366 for (c = 0; c < 4; c++) {
1367 if (!(mask & (1 << c)))
1368 continue;
1369 emit_mov(pc, dst[c], temp);
1370 }
1371 break;
1372 case TGSI_OPCODE_DP3:
1373 temp = temp_temp(pc);
1374 emit_mul(pc, temp, src[0][0], src[1][0]);
1375 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1376 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1377 for (c = 0; c < 4; c++) {
1378 if (!(mask & (1 << c)))
1379 continue;
1380 emit_mov(pc, dst[c], temp);
1381 }
1382 break;
1383 case TGSI_OPCODE_DP4:
1384 temp = temp_temp(pc);
1385 emit_mul(pc, temp, src[0][0], src[1][0]);
1386 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1387 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1388 emit_mad(pc, temp, src[0][3], src[1][3], temp);
1389 for (c = 0; c < 4; c++) {
1390 if (!(mask & (1 << c)))
1391 continue;
1392 emit_mov(pc, dst[c], temp);
1393 }
1394 break;
1395 case TGSI_OPCODE_DPH:
1396 temp = temp_temp(pc);
1397 emit_mul(pc, temp, src[0][0], src[1][0]);
1398 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1399 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1400 emit_add(pc, temp, src[1][3], temp);
1401 for (c = 0; c < 4; c++) {
1402 if (!(mask & (1 << c)))
1403 continue;
1404 emit_mov(pc, dst[c], temp);
1405 }
1406 break;
1407 case TGSI_OPCODE_DST:
1408 {
1409 struct nv50_reg *one = alloc_immd(pc, 1.0);
1410 if (mask & (1 << 0))
1411 emit_mov(pc, dst[0], one);
1412 if (mask & (1 << 1))
1413 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1414 if (mask & (1 << 2))
1415 emit_mov(pc, dst[2], src[0][2]);
1416 if (mask & (1 << 3))
1417 emit_mov(pc, dst[3], src[1][3]);
1418 FREE(one);
1419 }
1420 break;
1421 case TGSI_OPCODE_EX2:
1422 temp = temp_temp(pc);
1423 emit_preex2(pc, temp, src[0][0]);
1424 emit_flop(pc, 6, temp, temp);
1425 for (c = 0; c < 4; c++) {
1426 if (!(mask & (1 << c)))
1427 continue;
1428 emit_mov(pc, dst[c], temp);
1429 }
1430 break;
1431 case TGSI_OPCODE_FLR:
1432 for (c = 0; c < 4; c++) {
1433 if (!(mask & (1 << c)))
1434 continue;
1435 emit_flr(pc, dst[c], src[0][c]);
1436 }
1437 break;
1438 case TGSI_OPCODE_FRC:
1439 temp = temp_temp(pc);
1440 for (c = 0; c < 4; c++) {
1441 if (!(mask & (1 << c)))
1442 continue;
1443 emit_flr(pc, temp, src[0][c]);
1444 emit_sub(pc, dst[c], src[0][c], temp);
1445 }
1446 break;
1447 case TGSI_OPCODE_KIL:
1448 emit_kil(pc, src[0][0]);
1449 emit_kil(pc, src[0][1]);
1450 emit_kil(pc, src[0][2]);
1451 emit_kil(pc, src[0][3]);
1452 pc->p->cfg.fp.regs[2] |= 0x00100000;
1453 break;
1454 case TGSI_OPCODE_LIT:
1455 emit_lit(pc, &dst[0], mask, &src[0][0]);
1456 break;
1457 case TGSI_OPCODE_LG2:
1458 temp = temp_temp(pc);
1459 emit_flop(pc, 3, temp, src[0][0]);
1460 for (c = 0; c < 4; c++) {
1461 if (!(mask & (1 << c)))
1462 continue;
1463 emit_mov(pc, dst[c], temp);
1464 }
1465 break;
1466 case TGSI_OPCODE_LRP:
1467 temp = temp_temp(pc);
1468 for (c = 0; c < 4; c++) {
1469 if (!(mask & (1 << c)))
1470 continue;
1471 emit_sub(pc, temp, src[1][c], src[2][c]);
1472 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1473 }
1474 break;
1475 case TGSI_OPCODE_MAD:
1476 for (c = 0; c < 4; c++) {
1477 if (!(mask & (1 << c)))
1478 continue;
1479 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1480 }
1481 break;
1482 case TGSI_OPCODE_MAX:
1483 for (c = 0; c < 4; c++) {
1484 if (!(mask & (1 << c)))
1485 continue;
1486 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1487 }
1488 break;
1489 case TGSI_OPCODE_MIN:
1490 for (c = 0; c < 4; c++) {
1491 if (!(mask & (1 << c)))
1492 continue;
1493 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1494 }
1495 break;
1496 case TGSI_OPCODE_MOV:
1497 for (c = 0; c < 4; c++) {
1498 if (!(mask & (1 << c)))
1499 continue;
1500 emit_mov(pc, dst[c], src[0][c]);
1501 }
1502 break;
1503 case TGSI_OPCODE_MUL:
1504 for (c = 0; c < 4; c++) {
1505 if (!(mask & (1 << c)))
1506 continue;
1507 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1508 }
1509 break;
1510 case TGSI_OPCODE_POW:
1511 temp = temp_temp(pc);
1512 emit_pow(pc, temp, src[0][0], src[1][0]);
1513 for (c = 0; c < 4; c++) {
1514 if (!(mask & (1 << c)))
1515 continue;
1516 emit_mov(pc, dst[c], temp);
1517 }
1518 break;
1519 case TGSI_OPCODE_RCP:
1520 for (c = 3; c >= 0; c--) {
1521 if (!(mask & (1 << c)))
1522 continue;
1523 emit_flop(pc, 0, dst[c], src[0][0]);
1524 }
1525 break;
1526 case TGSI_OPCODE_RSQ:
1527 for (c = 3; c >= 0; c--) {
1528 if (!(mask & (1 << c)))
1529 continue;
1530 emit_flop(pc, 2, dst[c], src[0][0]);
1531 }
1532 break;
1533 case TGSI_OPCODE_SCS:
1534 temp = temp_temp(pc);
1535 emit_precossin(pc, temp, src[0][0]);
1536 if (mask & (1 << 0))
1537 emit_flop(pc, 5, dst[0], temp);
1538 if (mask & (1 << 1))
1539 emit_flop(pc, 4, dst[1], temp);
1540 if (mask & (1 << 2))
1541 emit_mov_immdval(pc, dst[2], 0.0);
1542 if (mask & (1 << 3))
1543 emit_mov_immdval(pc, dst[3], 1.0);
1544 break;
1545 case TGSI_OPCODE_SGE:
1546 for (c = 0; c < 4; c++) {
1547 if (!(mask & (1 << c)))
1548 continue;
1549 emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1550 }
1551 break;
1552 case TGSI_OPCODE_SIN:
1553 temp = temp_temp(pc);
1554 emit_precossin(pc, temp, src[0][0]);
1555 emit_flop(pc, 4, temp, temp);
1556 for (c = 0; c < 4; c++) {
1557 if (!(mask & (1 << c)))
1558 continue;
1559 emit_mov(pc, dst[c], temp);
1560 }
1561 break;
1562 case TGSI_OPCODE_SLT:
1563 for (c = 0; c < 4; c++) {
1564 if (!(mask & (1 << c)))
1565 continue;
1566 emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1567 }
1568 break;
1569 case TGSI_OPCODE_SUB:
1570 for (c = 0; c < 4; c++) {
1571 if (!(mask & (1 << c)))
1572 continue;
1573 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1574 }
1575 break;
1576 case TGSI_OPCODE_TEX:
1577 emit_tex(pc, dst, mask, src[0], unit,
1578 inst->InstructionExtTexture.Texture, FALSE);
1579 break;
1580 case TGSI_OPCODE_TXP:
1581 emit_tex(pc, dst, mask, src[0], unit,
1582 inst->InstructionExtTexture.Texture, TRUE);
1583 break;
1584 case TGSI_OPCODE_XPD:
1585 temp = temp_temp(pc);
1586 if (mask & (1 << 0)) {
1587 emit_mul(pc, temp, src[0][2], src[1][1]);
1588 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1589 }
1590 if (mask & (1 << 1)) {
1591 emit_mul(pc, temp, src[0][0], src[1][2]);
1592 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1593 }
1594 if (mask & (1 << 2)) {
1595 emit_mul(pc, temp, src[0][1], src[1][0]);
1596 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1597 }
1598 if (mask & (1 << 3))
1599 emit_mov_immdval(pc, dst[3], 1.0);
1600 break;
1601 case TGSI_OPCODE_END:
1602 break;
1603 default:
1604 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1605 return FALSE;
1606 }
1607
1608 if (sat) {
1609 for (c = 0; c < 4; c++) {
1610 if (!(mask & (1 << c)))
1611 continue;
1612 emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
1613 CVT_F32_F32);
1614 }
1615 } else if (assimilate) {
1616 for (c = 0; c < 4; c++)
1617 if (rdst[c])
1618 assimilate_temp(pc, rdst[c], dst[c]);
1619 }
1620
1621 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1622 for (c = 0; c < 4; c++) {
1623 if (!src[i][c])
1624 continue;
1625 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1626 FREE(src[i][c]);
1627 else
1628 if (src[i][c]->acc == pc->insn_cur)
1629 release_hw(pc, src[i][c]);
1630 }
1631 }
1632
1633 kill_temp_temp(pc);
1634 return TRUE;
1635 }
1636
1637 /* Adjust a bitmask that indicates what components of a source are used,
1638 * we use this in tx_prep so we only load interpolants that are needed.
1639 */
1640 static void
1641 insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1642 {
1643 const struct tgsi_instruction_ext_texture *tex;
1644
1645 switch (insn->Instruction.Opcode) {
1646 case TGSI_OPCODE_DP3:
1647 *mask = 0x7;
1648 break;
1649 case TGSI_OPCODE_DP4:
1650 case TGSI_OPCODE_DPH:
1651 *mask = 0xF;
1652 break;
1653 case TGSI_OPCODE_LIT:
1654 *mask = 0xB;
1655 break;
1656 case TGSI_OPCODE_RCP:
1657 case TGSI_OPCODE_RSQ:
1658 *mask = 0x1;
1659 break;
1660 case TGSI_OPCODE_TEX:
1661 case TGSI_OPCODE_TXP:
1662 assert(insn->Instruction.Extended);
1663 tex = &insn->InstructionExtTexture;
1664
1665 *mask = 0x7;
1666 if (tex->Texture == TGSI_TEXTURE_1D)
1667 *mask = 0x1;
1668 else
1669 if (tex->Texture == TGSI_TEXTURE_2D)
1670 *mask = 0x3;
1671
1672 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1673 *mask |= 0x8;
1674 break;
1675 default:
1676 break;
1677 }
1678 }
1679
1680 static void
1681 prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1682 unsigned *r_usage[2])
1683 {
1684 const struct tgsi_full_instruction *insn;
1685 const struct tgsi_full_src_register *src;
1686 const struct tgsi_dst_register *dst;
1687
1688 unsigned i, c, k, n, mask, *acc_p;
1689
1690 insn = &tok->FullInstruction;
1691 dst = &insn->FullDstRegisters[0].DstRegister;
1692 mask = dst->WriteMask;
1693
1694 if (!r_usage[0])
1695 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1696 if (!r_usage[1])
1697 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1698
1699 if (dst->File == TGSI_FILE_TEMPORARY) {
1700 for (c = 0; c < 4; c++) {
1701 if (!(mask & (1 << c)))
1702 continue;
1703 r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1704 }
1705 }
1706
1707 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1708 src = &insn->FullSrcRegisters[i];
1709
1710 switch (src->SrcRegister.File) {
1711 case TGSI_FILE_TEMPORARY:
1712 acc_p = r_usage[0];
1713 break;
1714 case TGSI_FILE_INPUT:
1715 acc_p = r_usage[1];
1716 break;
1717 default:
1718 continue;
1719 }
1720
1721 insn_adjust_mask(insn, &mask);
1722
1723 for (c = 0; c < 4; c++) {
1724 if (!(mask & (1 << c)))
1725 continue;
1726
1727 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1728 switch (k) {
1729 case TGSI_EXTSWIZZLE_X:
1730 case TGSI_EXTSWIZZLE_Y:
1731 case TGSI_EXTSWIZZLE_Z:
1732 case TGSI_EXTSWIZZLE_W:
1733 n = src->SrcRegister.Index * 4 + k;
1734 acc_p[n] = pc->insn_nr;
1735 break;
1736 default:
1737 break;
1738 }
1739 }
1740 }
1741 }
1742
1743 static unsigned
1744 load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1745 int *aid, int *p_oid)
1746 {
1747 struct nv50_reg *iv;
1748 int oid, c, n;
1749 unsigned mask = 0;
1750
1751 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1752
1753 for (c = 0, n = i * 4; c < 4; c++, n++) {
1754 oid = (*p_oid)++;
1755 pc->attr[n].type = P_TEMP;
1756 pc->attr[n].index = i;
1757
1758 if (pc->attr[n].acc == acc[n])
1759 continue;
1760 mask |= (1 << c);
1761
1762 pc->attr[n].acc = acc[n];
1763 pc->attr[n].rhw = pc->attr[n].hw = -1;
1764 alloc_reg(pc, &pc->attr[n]);
1765
1766 pc->attr[n].rhw = (*aid)++;
1767 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1768
1769 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1770 (*mid)++;
1771 pc->p->cfg.fp.regs[1] += 0x00010001;
1772 }
1773
1774 return mask;
1775 }
1776
1777 static boolean
1778 nv50_program_tx_prep(struct nv50_pc *pc)
1779 {
1780 struct tgsi_parse_context p;
1781 boolean ret = FALSE;
1782 unsigned i, c;
1783 unsigned fcol, bcol, fcrd, depr;
1784
1785 /* count (centroid) perspective interpolations */
1786 unsigned centroid_loads = 0;
1787 unsigned perspect_loads = 0;
1788
1789 /* track register access for temps and attrs */
1790 unsigned *r_usage[2];
1791 r_usage[0] = NULL;
1792 r_usage[1] = NULL;
1793
1794 depr = fcol = bcol = fcrd = 0xffff;
1795
1796 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1797 pc->p->cfg.fp.regs[0] = 0x01000404;
1798 pc->p->cfg.fp.regs[1] = 0x00000400;
1799 }
1800
1801 tgsi_parse_init(&p, pc->p->pipe.tokens);
1802 while (!tgsi_parse_end_of_tokens(&p)) {
1803 const union tgsi_full_token *tok = &p.FullToken;
1804
1805 tgsi_parse_token(&p);
1806 switch (tok->Token.Type) {
1807 case TGSI_TOKEN_TYPE_IMMEDIATE:
1808 {
1809 const struct tgsi_full_immediate *imm =
1810 &p.FullToken.FullImmediate;
1811
1812 ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1813 imm->u.ImmediateFloat32[1].Float,
1814 imm->u.ImmediateFloat32[2].Float,
1815 imm->u.ImmediateFloat32[3].Float);
1816 }
1817 break;
1818 case TGSI_TOKEN_TYPE_DECLARATION:
1819 {
1820 const struct tgsi_full_declaration *d;
1821 unsigned last, first, mode;
1822
1823 d = &p.FullToken.FullDeclaration;
1824 first = d->DeclarationRange.First;
1825 last = d->DeclarationRange.Last;
1826
1827 switch (d->Declaration.File) {
1828 case TGSI_FILE_TEMPORARY:
1829 if (pc->temp_nr < (last + 1))
1830 pc->temp_nr = last + 1;
1831 break;
1832 case TGSI_FILE_OUTPUT:
1833 if (pc->result_nr < (last + 1))
1834 pc->result_nr = last + 1;
1835
1836 if (!d->Declaration.Semantic)
1837 break;
1838
1839 switch (d->Semantic.SemanticName) {
1840 case TGSI_SEMANTIC_POSITION:
1841 depr = first;
1842 pc->p->cfg.fp.regs[2] |= 0x00000100;
1843 pc->p->cfg.fp.regs[3] |= 0x00000011;
1844 break;
1845 default:
1846 break;
1847 }
1848
1849 break;
1850 case TGSI_FILE_INPUT:
1851 {
1852 if (pc->attr_nr < (last + 1))
1853 pc->attr_nr = last + 1;
1854
1855 if (pc->p->type != PIPE_SHADER_FRAGMENT)
1856 break;
1857
1858 switch (d->Declaration.Interpolate) {
1859 case TGSI_INTERPOLATE_CONSTANT:
1860 mode = INTERP_FLAT;
1861 break;
1862 case TGSI_INTERPOLATE_PERSPECTIVE:
1863 mode = INTERP_PERSPECTIVE;
1864 break;
1865 default:
1866 mode = INTERP_LINEAR;
1867 break;
1868 }
1869
1870 if (d->Declaration.Semantic) {
1871 switch (d->Semantic.SemanticName) {
1872 case TGSI_SEMANTIC_POSITION:
1873 fcrd = first;
1874 break;
1875 case TGSI_SEMANTIC_COLOR:
1876 fcol = first;
1877 mode = INTERP_PERSPECTIVE;
1878 break;
1879 case TGSI_SEMANTIC_BCOLOR:
1880 bcol = first;
1881 mode = INTERP_PERSPECTIVE;
1882 break;
1883 }
1884 }
1885
1886 if (d->Declaration.Centroid) {
1887 mode |= INTERP_CENTROID;
1888 if (mode & INTERP_PERSPECTIVE)
1889 centroid_loads++;
1890 } else
1891 if (mode & INTERP_PERSPECTIVE)
1892 perspect_loads++;
1893
1894 assert(last < 32);
1895 for (i = first; i <= last; i++)
1896 pc->interp_mode[i] = mode;
1897 }
1898 break;
1899 case TGSI_FILE_CONSTANT:
1900 if (pc->param_nr < (last + 1))
1901 pc->param_nr = last + 1;
1902 break;
1903 case TGSI_FILE_SAMPLER:
1904 break;
1905 default:
1906 NOUVEAU_ERR("bad decl file %d\n",
1907 d->Declaration.File);
1908 goto out_err;
1909 }
1910 }
1911 break;
1912 case TGSI_TOKEN_TYPE_INSTRUCTION:
1913 pc->insn_nr++;
1914 prep_inspect_insn(pc, tok, r_usage);
1915 break;
1916 default:
1917 break;
1918 }
1919 }
1920
1921 if (pc->temp_nr) {
1922 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1923 if (!pc->temp)
1924 goto out_err;
1925
1926 for (i = 0; i < pc->temp_nr; i++) {
1927 for (c = 0; c < 4; c++) {
1928 pc->temp[i*4+c].type = P_TEMP;
1929 pc->temp[i*4+c].hw = -1;
1930 pc->temp[i*4+c].rhw = -1;
1931 pc->temp[i*4+c].index = i;
1932 pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1933 }
1934 }
1935 }
1936
1937 if (pc->attr_nr) {
1938 int oid = 4, mid = 4, aid = 0;
1939 /* oid = VP output id
1940 * aid = FP attribute/interpolant id
1941 * mid = VP output mapping field ID
1942 */
1943
1944 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1945 if (!pc->attr)
1946 goto out_err;
1947
1948 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1949 /* position should be loaded first */
1950 if (fcrd != 0xffff) {
1951 unsigned mask;
1952 mid = 0;
1953 mask = load_fp_attrib(pc, fcrd, r_usage[1],
1954 &mid, &aid, &oid);
1955 oid = 0;
1956 pc->p->cfg.fp.regs[1] |= (mask << 24);
1957 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1958 }
1959 pc->p->cfg.fp.map[0] += 0x03020100;
1960
1961 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1962
1963 if (perspect_loads) {
1964 pc->iv_p = alloc_temp(pc, NULL);
1965
1966 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1967 pc->p->cfg.fp.regs[1] |= 0x08000000;
1968 pc->iv_p->rhw = aid++;
1969 emit_interp(pc, pc->iv_p, NULL,
1970 INTERP_LINEAR);
1971 emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1972 } else {
1973 pc->iv_p->rhw = aid - 1;
1974 emit_flop(pc, 0, pc->iv_p,
1975 &pc->attr[fcrd * 4 + 3]);
1976 }
1977 }
1978
1979 if (centroid_loads) {
1980 pc->iv_c = alloc_temp(pc, NULL);
1981 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1982 emit_interp(pc, pc->iv_c, NULL,
1983 INTERP_CENTROID);
1984 emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1985 pc->p->cfg.fp.regs[1] |= 0x08000000;
1986 }
1987
1988 for (c = 0; c < 4; c++) {
1989 /* I don't know what these values do, but
1990 * let's set them like the blob does:
1991 */
1992 if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1993 pc->p->cfg.fp.regs[0] += 0x00010000;
1994 if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1995 pc->p->cfg.fp.regs[0] += 0x00010000;
1996 }
1997
1998 for (i = 0; i < pc->attr_nr; i++)
1999 load_fp_attrib(pc, i, r_usage[1],
2000 &mid, &aid, &oid);
2001
2002 if (pc->iv_p)
2003 free_temp(pc, pc->iv_p);
2004 if (pc->iv_c)
2005 free_temp(pc, pc->iv_c);
2006
2007 pc->p->cfg.fp.high_map = (mid / 4);
2008 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
2009 } else {
2010 /* vertex program */
2011 for (i = 0; i < pc->attr_nr * 4; i++) {
2012 pc->p->cfg.vp.attr[aid / 32] |=
2013 (1 << (aid % 32));
2014 pc->attr[i].type = P_ATTR;
2015 pc->attr[i].hw = aid++;
2016 pc->attr[i].index = i / 4;
2017 }
2018 }
2019 }
2020
2021 if (pc->result_nr) {
2022 int rid = 0;
2023
2024 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
2025 if (!pc->result)
2026 goto out_err;
2027
2028 for (i = 0; i < pc->result_nr; i++) {
2029 for (c = 0; c < 4; c++) {
2030 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2031 pc->result[i*4+c].type = P_TEMP;
2032 pc->result[i*4+c].hw = -1;
2033 pc->result[i*4+c].rhw = (i == depr) ?
2034 -1 : rid++;
2035 } else {
2036 pc->result[i*4+c].type = P_RESULT;
2037 pc->result[i*4+c].hw = rid++;
2038 }
2039 pc->result[i*4+c].index = i;
2040 }
2041
2042 if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2043 depr != 0xffff) {
2044 pc->result[depr * 4 + 2].rhw =
2045 (pc->result_nr - 1) * 4;
2046 }
2047 }
2048 }
2049
2050 if (pc->param_nr) {
2051 int rid = 0;
2052
2053 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2054 if (!pc->param)
2055 goto out_err;
2056
2057 for (i = 0; i < pc->param_nr; i++) {
2058 for (c = 0; c < 4; c++) {
2059 pc->param[i*4+c].type = P_CONST;
2060 pc->param[i*4+c].hw = rid++;
2061 pc->param[i*4+c].index = i;
2062 }
2063 }
2064 }
2065
2066 if (pc->immd_nr) {
2067 int rid = 0;
2068
2069 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2070 if (!pc->immd)
2071 goto out_err;
2072
2073 for (i = 0; i < pc->immd_nr; i++) {
2074 for (c = 0; c < 4; c++) {
2075 pc->immd[i*4+c].type = P_IMMD;
2076 pc->immd[i*4+c].hw = rid++;
2077 pc->immd[i*4+c].index = i;
2078 }
2079 }
2080 }
2081
2082 ret = TRUE;
2083 out_err:
2084 if (r_usage[0])
2085 FREE(r_usage[0]);
2086 if (r_usage[1])
2087 FREE(r_usage[1]);
2088
2089 tgsi_parse_free(&p);
2090 return ret;
2091 }
2092
2093 static void
2094 free_nv50_pc(struct nv50_pc *pc)
2095 {
2096 if (pc->immd)
2097 FREE(pc->immd);
2098 if (pc->param)
2099 FREE(pc->param);
2100 if (pc->result)
2101 FREE(pc->result);
2102 if (pc->attr)
2103 FREE(pc->attr);
2104 if (pc->temp)
2105 FREE(pc->temp);
2106
2107 FREE(pc);
2108 }
2109
2110 static boolean
2111 nv50_program_tx(struct nv50_program *p)
2112 {
2113 struct tgsi_parse_context parse;
2114 struct nv50_pc *pc;
2115 unsigned k;
2116 boolean ret;
2117
2118 pc = CALLOC_STRUCT(nv50_pc);
2119 if (!pc)
2120 return FALSE;
2121 pc->p = p;
2122 pc->p->cfg.high_temp = 4;
2123
2124 ret = nv50_program_tx_prep(pc);
2125 if (ret == FALSE)
2126 goto out_cleanup;
2127
2128 tgsi_parse_init(&parse, pc->p->pipe.tokens);
2129 while (!tgsi_parse_end_of_tokens(&parse)) {
2130 const union tgsi_full_token *tok = &parse.FullToken;
2131
2132 /* don't allow half insn/immd on first and last instruction */
2133 pc->allow32 = TRUE;
2134 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2135 pc->allow32 = FALSE;
2136
2137 tgsi_parse_token(&parse);
2138
2139 switch (tok->Token.Type) {
2140 case TGSI_TOKEN_TYPE_INSTRUCTION:
2141 ++pc->insn_cur;
2142 ret = nv50_program_tx_insn(pc, tok);
2143 if (ret == FALSE)
2144 goto out_err;
2145 break;
2146 default:
2147 break;
2148 }
2149 }
2150
2151 if (p->type == PIPE_SHADER_FRAGMENT) {
2152 struct nv50_reg out;
2153
2154 out.type = P_TEMP;
2155 for (k = 0; k < pc->result_nr * 4; k++) {
2156 if (pc->result[k].rhw == -1)
2157 continue;
2158 if (pc->result[k].hw != pc->result[k].rhw) {
2159 out.hw = pc->result[k].rhw;
2160 emit_mov(pc, &out, &pc->result[k]);
2161 }
2162 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2163 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2164 }
2165 }
2166
2167 /* look for single half instructions and make them long */
2168 struct nv50_program_exec *e, *e_prev;
2169
2170 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2171 if (!is_long(e))
2172 k++;
2173
2174 if (!e->next || is_long(e->next)) {
2175 if (k & 1)
2176 convert_to_long(pc, e);
2177 k = 0;
2178 }
2179
2180 if (e->next)
2181 e_prev = e;
2182 }
2183
2184 if (!is_long(pc->p->exec_tail)) {
2185 /* this may occur if moving FP results */
2186 assert(e_prev && !is_long(e_prev));
2187 convert_to_long(pc, e_prev);
2188 convert_to_long(pc, pc->p->exec_tail);
2189 }
2190
2191 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2192 pc->p->exec_tail->inst[1] |= 0x00000001;
2193
2194 p->param_nr = pc->param_nr * 4;
2195 p->immd_nr = pc->immd_nr * 4;
2196 p->immd = pc->immd_buf;
2197
2198 out_err:
2199 tgsi_parse_free(&parse);
2200
2201 out_cleanup:
2202 free_nv50_pc(pc);
2203 return ret;
2204 }
2205
2206 static void
2207 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2208 {
2209 if (nv50_program_tx(p) == FALSE)
2210 assert(0);
2211 p->translated = TRUE;
2212 }
2213
2214 static void
2215 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2216 unsigned start, unsigned count, unsigned cbuf)
2217 {
2218 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2219 struct nouveau_grobj *tesla = nv50->screen->tesla;
2220
2221 while (count) {
2222 unsigned nr = count > 2047 ? 2047 : count;
2223
2224 BEGIN_RING(chan, tesla, 0x00000f00, 1);
2225 OUT_RING (chan, (cbuf << 0) | (start << 8));
2226 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2227 OUT_RINGp (chan, map, nr);
2228
2229 map += nr;
2230 start += nr;
2231 count -= nr;
2232 }
2233 }
2234
2235 static void
2236 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2237 {
2238 struct nouveau_winsys *nvws = nv50->screen->nvws;
2239 struct pipe_winsys *ws = nv50->pipe.winsys;
2240
2241 if (!p->data[0] && p->immd_nr) {
2242 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2243
2244 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {
2245 while (heap->next && heap->size < p->immd_nr) {
2246 struct nv50_program *evict = heap->next->priv;
2247 nvws->res_free(&evict->data[0]);
2248 }
2249
2250 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))
2251 assert(0);
2252 }
2253
2254 /* immediates only need to be uploaded again when freed */
2255 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2256 p->immd_nr, NV50_CB_PMISC);
2257 }
2258
2259 if (!p->data[1] && p->param_nr) {
2260 struct nouveau_resource *heap =
2261 nv50->screen->parm_heap[p->type];
2262
2263 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {
2264 while (heap->next && heap->size < p->param_nr) {
2265 struct nv50_program *evict = heap->next->priv;
2266 nvws->res_free(&evict->data[1]);
2267 }
2268
2269 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))
2270 assert(0);
2271 }
2272 }
2273
2274 if (p->param_nr) {
2275 unsigned cbuf = NV50_CB_PVP;
2276 float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
2277 PIPE_BUFFER_USAGE_CPU_READ);
2278 if (p->type == PIPE_SHADER_FRAGMENT)
2279 cbuf = NV50_CB_PFP;
2280 nv50_program_upload_data(nv50, map, p->data[1]->start,
2281 p->param_nr, cbuf);
2282 ws->buffer_unmap(ws, nv50->constbuf[p->type]);
2283 }
2284 }
2285
2286 static void
2287 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2288 {
2289 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2290 struct nouveau_grobj *tesla = nv50->screen->tesla;
2291 struct pipe_screen *screen = nv50->pipe.screen;
2292 struct nv50_program_exec *e;
2293 struct nouveau_stateobj *so;
2294 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2295 unsigned start, count, *up, *ptr;
2296 boolean upload = FALSE;
2297
2298 if (!p->buffer) {
2299 p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
2300 upload = TRUE;
2301 }
2302
2303 if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2304 (p->data[1] && p->data[1]->start != p->data_start[1])) {
2305 for (e = p->exec_head; e; e = e->next) {
2306 unsigned ei, ci, bs;
2307
2308 if (e->param.index < 0)
2309 continue;
2310 bs = (e->inst[1] >> 22) & 0x07;
2311 assert(bs < 2);
2312 ei = e->param.shift >> 5;
2313 ci = e->param.index + p->data[bs]->start;
2314
2315 e->inst[ei] &= ~e->param.mask;
2316 e->inst[ei] |= (ci << e->param.shift);
2317 }
2318
2319 if (p->data[0])
2320 p->data_start[0] = p->data[0]->start;
2321 if (p->data[1])
2322 p->data_start[1] = p->data[1]->start;
2323
2324 upload = TRUE;
2325 }
2326
2327 if (!upload)
2328 return;
2329
2330 #ifdef NV50_PROGRAM_DUMP
2331 NOUVEAU_ERR("-------\n");
2332 for (e = p->exec_head; e; e = e->next) {
2333 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2334 if (is_long(e))
2335 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2336 }
2337 #endif
2338
2339 up = ptr = MALLOC(p->exec_size * 4);
2340 for (e = p->exec_head; e; e = e->next) {
2341 *(ptr++) = e->inst[0];
2342 if (is_long(e))
2343 *(ptr++) = e->inst[1];
2344 }
2345
2346 so = so_new(4,2);
2347 so_method(so, nv50->screen->tesla, 0x1280, 3);
2348 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2349 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2350 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2351
2352 start = 0; count = p->exec_size;
2353 while (count) {
2354 struct nouveau_winsys *nvws = nv50->screen->nvws;
2355 unsigned nr;
2356
2357 so_emit(nvws, so);
2358
2359 nr = MIN2(count, 2047);
2360 nr = MIN2(nvws->channel->pushbuf->remaining, nr);
2361 if (nvws->channel->pushbuf->remaining < (nr + 3)) {
2362 FIRE_RING(chan);
2363 continue;
2364 }
2365
2366 BEGIN_RING(chan, tesla, 0x0f00, 1);
2367 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2368 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2369 OUT_RINGp (chan, up + start, nr);
2370
2371 start += nr;
2372 count -= nr;
2373 }
2374
2375 FREE(up);
2376 so_ref(NULL, &so);
2377 }
2378
2379 void
2380 nv50_vertprog_validate(struct nv50_context *nv50)
2381 {
2382 struct nouveau_grobj *tesla = nv50->screen->tesla;
2383 struct nv50_program *p = nv50->vertprog;
2384 struct nouveau_stateobj *so;
2385
2386 if (!p->translated) {
2387 nv50_program_validate(nv50, p);
2388 if (!p->translated)
2389 assert(0);
2390 }
2391
2392 nv50_program_validate_data(nv50, p);
2393 nv50_program_validate_code(nv50, p);
2394
2395 so = so_new(13, 2);
2396 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2397 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2398 NOUVEAU_BO_HIGH, 0, 0);
2399 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2400 NOUVEAU_BO_LOW, 0, 0);
2401 so_method(so, tesla, 0x1650, 2);
2402 so_data (so, p->cfg.vp.attr[0]);
2403 so_data (so, p->cfg.vp.attr[1]);
2404 so_method(so, tesla, 0x16b8, 1);
2405 so_data (so, p->cfg.high_result);
2406 so_method(so, tesla, 0x16ac, 2);
2407 so_data (so, p->cfg.high_result); //8);
2408 so_data (so, p->cfg.high_temp);
2409 so_method(so, tesla, 0x140c, 1);
2410 so_data (so, 0); /* program start offset */
2411 so_ref(so, &nv50->state.vertprog);
2412 so_ref(NULL, &so);
2413 }
2414
2415 void
2416 nv50_fragprog_validate(struct nv50_context *nv50)
2417 {
2418 struct nouveau_grobj *tesla = nv50->screen->tesla;
2419 struct nv50_program *p = nv50->fragprog;
2420 struct nouveau_stateobj *so;
2421 unsigned i;
2422
2423 if (!p->translated) {
2424 nv50_program_validate(nv50, p);
2425 if (!p->translated)
2426 assert(0);
2427 }
2428
2429 nv50_program_validate_data(nv50, p);
2430 nv50_program_validate_code(nv50, p);
2431
2432 so = so_new(64, 2);
2433 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2434 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2435 NOUVEAU_BO_HIGH, 0, 0);
2436 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2437 NOUVEAU_BO_LOW, 0, 0);
2438 so_method(so, tesla, 0x1904, 4);
2439 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2440 so_data (so, 0x00000004);
2441 so_data (so, 0x00000000);
2442 so_data (so, 0x00000000);
2443 so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
2444 for (i = 0; i < p->cfg.fp.high_map; i++)
2445 so_data(so, p->cfg.fp.map[i]);
2446 so_method(so, tesla, 0x1988, 2);
2447 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2448 so_data (so, p->cfg.high_temp);
2449 so_method(so, tesla, 0x1298, 1);
2450 so_data (so, p->cfg.high_result);
2451 so_method(so, tesla, 0x19a8, 1);
2452 so_data (so, p->cfg.fp.regs[2]);
2453 so_method(so, tesla, 0x196c, 1);
2454 so_data (so, p->cfg.fp.regs[3]);
2455 so_method(so, tesla, 0x1414, 1);
2456 so_data (so, 0); /* program start offset */
2457 so_ref(so, &nv50->state.fragprog);
2458 so_ref(NULL, &so);
2459 }
2460
2461 void
2462 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2463 {
2464 struct pipe_screen *pscreen = nv50->pipe.screen;
2465
2466 while (p->exec_head) {
2467 struct nv50_program_exec *e = p->exec_head;
2468
2469 p->exec_head = e->next;
2470 FREE(e);
2471 }
2472 p->exec_tail = NULL;
2473 p->exec_size = 0;
2474
2475 if (p->buffer)
2476 pipe_buffer_reference(&p->buffer, NULL);
2477
2478 nv50->screen->nvws->res_free(&p->data[0]);
2479 nv50->screen->nvws->res_free(&p->data[1]);
2480
2481 p->translated = 0;
2482 }
2483