89051069bc65ae0ae32a04315f598896b291d10c
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
34 #define NV50_SU_MAX_TEMP 64
35 //#define NV50_PROGRAM_DUMP
36
37 /* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * - Fuck it off, introduce a way to negate args for ops that
41 * support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * FP attr/result assignment - how?
58 * attrib
59 * - 0x16bc maps vp output onto fp hpos
60 * - 0x16c0 maps vp output onto fp col0
61 * result
62 * - colr always 0-3
63 * - depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * - 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * - XX == FP high something
75 */
/* Register file a translated shader operand belongs to. */
enum nv50_reg_type {
	P_TEMP,		/* temporary register */
	P_ATTR,		/* shader input */
	P_RESULT,	/* shader output */
	P_CONST,	/* constant buffer entry */
	P_IMMD		/* immediate value */
};

/* One scalar operand as tracked by the program translator. */
struct nv50_reg {
	enum nv50_reg_type type;

	/* -1 marks a driver-internal temporary (see alloc_temp()); such
	 * registers are heap allocated and released by free_temp().
	 */
	int index;

	/* Hardware slot, or -1 while no slot has been assigned to a P_TEMP. */
	int hw;
	/* NOTE(review): not used by the code visible here. */
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index; -1 if none */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
93 struct nv50_pc {
94 struct nv50_program *p;
95
96 /* hw resources */
97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99 /* tgsi resources */
100 struct nv50_reg *temp;
101 int temp_nr;
102 struct nv50_reg *attr;
103 int attr_nr;
104 struct nv50_reg *result;
105 int result_nr;
106 struct nv50_reg *param;
107 int param_nr;
108 struct nv50_reg *immd;
109 float *immd_buf;
110 int immd_nr;
111
112 struct nv50_reg *temp_temp[16];
113 unsigned temp_temp_nr;
114
115 unsigned interp_mode[32];
116 /* perspective interpolation registers */
117 struct nv50_reg *iv_p;
118 struct nv50_reg *iv_c;
119
120 /* current instruction and total number of insns */
121 unsigned insn_cur;
122 unsigned insn_nr;
123
124 boolean allow32;
125 };
126
127 static void
128 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129 {
130 int i = 0;
131
132 if (reg->type == P_RESULT) {
133 if (pc->p->cfg.high_result < (reg->hw + 1))
134 pc->p->cfg.high_result = reg->hw + 1;
135 }
136
137 if (reg->type != P_TEMP)
138 return;
139
140 if (reg->hw >= 0) {
141 /*XXX: do this here too to catch FP temp-as-attr usage..
142 * not clean, but works */
143 if (pc->p->cfg.high_temp < (reg->hw + 1))
144 pc->p->cfg.high_temp = reg->hw + 1;
145 return;
146 }
147
148 if (reg->rhw != -1) {
149 /* try to allocate temporary with index rhw first */
150 if (!(pc->r_temp[reg->rhw])) {
151 pc->r_temp[reg->rhw] = reg;
152 reg->hw = reg->rhw;
153 if (pc->p->cfg.high_temp < (reg->rhw + 1))
154 pc->p->cfg.high_temp = reg->rhw + 1;
155 return;
156 }
157 /* make sure we don't get things like $r0 needs to go
158 * in $r1 and $r1 in $r0
159 */
160 i = pc->result_nr * 4;
161 }
162
163 for (; i < NV50_SU_MAX_TEMP; i++) {
164 if (!(pc->r_temp[i])) {
165 pc->r_temp[i] = reg;
166 reg->hw = i;
167 if (pc->p->cfg.high_temp < (i + 1))
168 pc->p->cfg.high_temp = i + 1;
169 return;
170 }
171 }
172
173 assert(0);
174 }
175
176 static struct nv50_reg *
177 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178 {
179 struct nv50_reg *r;
180 int i;
181
182 if (dst && dst->type == P_TEMP && dst->hw == -1)
183 return dst;
184
185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186 if (!pc->r_temp[i]) {
187 r = CALLOC_STRUCT(nv50_reg);
188 r->type = P_TEMP;
189 r->index = -1;
190 r->hw = i;
191 r->rhw = -1;
192 pc->r_temp[i] = r;
193 return r;
194 }
195 }
196
197 assert(0);
198 return NULL;
199 }
200
201 /* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204 static void
205 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206 {
207 assert(src->index == -1 && src->hw != -1);
208
209 if (dst->hw != -1)
210 pc->r_temp[dst->hw] = NULL;
211 pc->r_temp[src->hw] = dst;
212 dst->hw = src->hw;
213
214 FREE(src);
215 }
216
217 /* release the hardware resource held by r */
218 static void
219 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
220 {
221 assert(r->type == P_TEMP);
222 if (r->hw == -1)
223 return;
224
225 assert(pc->r_temp[r->hw] == r);
226 pc->r_temp[r->hw] = NULL;
227
228 r->acc = 0;
229 if (r->index == -1)
230 FREE(r);
231 }
232
233 static void
234 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
235 {
236 if (r->index == -1) {
237 unsigned hw = r->hw;
238
239 FREE(pc->r_temp[hw]);
240 pc->r_temp[hw] = NULL;
241 }
242 }
243
244 static int
245 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
246 {
247 int i;
248
249 if ((idx + 4) >= NV50_SU_MAX_TEMP)
250 return 1;
251
252 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
253 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
254 return alloc_temp4(pc, dst, idx + 1);
255
256 for (i = 0; i < 4; i++) {
257 dst[i] = CALLOC_STRUCT(nv50_reg);
258 dst[i]->type = P_TEMP;
259 dst[i]->index = -1;
260 dst[i]->hw = idx + i;
261 pc->r_temp[idx + i] = dst[i];
262 }
263
264 return 0;
265 }
266
/* Release the four temporaries of a quad via free_temp(), lowest
 * component first.
 */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c = 0;

	while (c < 4) {
		free_temp(pc, reg[c]);
		c++;
	}
}
275
276 static struct nv50_reg *
277 temp_temp(struct nv50_pc *pc)
278 {
279 if (pc->temp_temp_nr >= 16)
280 assert(0);
281
282 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
283 return pc->temp_temp[pc->temp_temp_nr++];
284 }
285
286 static void
287 kill_temp_temp(struct nv50_pc *pc)
288 {
289 int i;
290
291 for (i = 0; i < pc->temp_temp_nr; i++)
292 free_temp(pc, pc->temp_temp[i]);
293 pc->temp_temp_nr = 0;
294 }
295
296 static int
297 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
298 {
299 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
300 (pc->immd_nr + 1) * 4 * sizeof(float));
301 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
302 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
303 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
304 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
305
306 return pc->immd_nr++;
307 }
308
309 static struct nv50_reg *
310 alloc_immd(struct nv50_pc *pc, float f)
311 {
312 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
313 unsigned hw;
314
315 for (hw = 0; hw < pc->immd_nr * 4; hw++)
316 if (pc->immd_buf[hw] == f)
317 break;
318
319 if (hw == pc->immd_nr * 4)
320 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
321
322 r->type = P_IMMD;
323 r->hw = hw;
324 r->index = -1;
325 return r;
326 }
327
328 static struct nv50_program_exec *
329 exec(struct nv50_pc *pc)
330 {
331 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
332
333 e->param.index = -1;
334 return e;
335 }
336
337 static void
338 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
339 {
340 struct nv50_program *p = pc->p;
341
342 if (p->exec_tail)
343 p->exec_tail->next = e;
344 if (!p->exec_head)
345 p->exec_head = e;
346 p->exec_tail = e;
347 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
348 }
349
350 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
351
352 static boolean
353 is_long(struct nv50_program_exec *e)
354 {
355 if (e->inst[0] & 1)
356 return TRUE;
357 return FALSE;
358 }
359
360 static boolean
361 is_immd(struct nv50_program_exec *e)
362 {
363 if (is_long(e) && (e->inst[1] & 3) == 3)
364 return TRUE;
365 return FALSE;
366 }
367
368 static INLINE void
369 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
370 struct nv50_program_exec *e)
371 {
372 set_long(pc, e);
373 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
374 e->inst[1] |= (pred << 7) | (idx << 12);
375 }
376
377 static INLINE void
378 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
379 struct nv50_program_exec *e)
380 {
381 set_long(pc, e);
382 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
383 e->inst[1] |= (idx << 4) | (on << 6);
384 }
385
386 static INLINE void
387 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
388 {
389 if (is_long(e))
390 return;
391
392 e->inst[0] |= 1;
393 set_pred(pc, 0xf, 0, e);
394 set_pred_wr(pc, 0, 0, e);
395 }
396
397 static INLINE void
398 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
399 {
400 if (dst->type == P_RESULT) {
401 set_long(pc, e);
402 e->inst[1] |= 0x00000008;
403 }
404
405 alloc_reg(pc, dst);
406 e->inst[0] |= (dst->hw << 2);
407 }
408
409 static INLINE void
410 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
411 {
412 unsigned val = fui(pc->immd_buf[imm->hw]);
413
414 set_long(pc, e);
415 /*XXX: can't be predicated - bits overlap.. catch cases where both
416 * are required and avoid them. */
417 set_pred(pc, 0, 0, e);
418 set_pred_wr(pc, 0, 0, e);
419
420 e->inst[1] |= 0x00000002 | 0x00000001;
421 e->inst[0] |= (val & 0x3f) << 16;
422 e->inst[1] |= (val >> 6) << 2;
423 }
424
425
426 #define INTERP_LINEAR 0
427 #define INTERP_FLAT 1
428 #define INTERP_PERSPECTIVE 2
429 #define INTERP_CENTROID 4
430
431 /* interpolant index has been stored in dst->rhw */
432 static void
433 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
434 unsigned mode)
435 {
436 assert(dst->rhw != -1);
437 struct nv50_program_exec *e = exec(pc);
438
439 e->inst[0] |= 0x80000000;
440 set_dst(pc, dst, e);
441 e->inst[0] |= (dst->rhw << 16);
442
443 if (mode & INTERP_FLAT) {
444 e->inst[0] |= (1 << 8);
445 } else {
446 if (mode & INTERP_PERSPECTIVE) {
447 e->inst[0] |= (1 << 25);
448 alloc_reg(pc, iv);
449 e->inst[0] |= (iv->hw << 9);
450 }
451
452 if (mode & INTERP_CENTROID)
453 e->inst[0] |= (1 << 24);
454 }
455
456 emit(pc, e);
457 }
458
459 static void
460 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
461 struct nv50_program_exec *e)
462 {
463 set_long(pc, e);
464
465 e->param.index = src->hw;
466 e->param.shift = s;
467 e->param.mask = m << (s % 32);
468
469 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
470 }
471
472 static void
473 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
474 {
475 struct nv50_program_exec *e = exec(pc);
476
477 e->inst[0] |= 0x10000000;
478
479 set_dst(pc, dst, e);
480
481 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
482 set_immd(pc, src, e);
483 /*XXX: 32-bit, but steals part of "half" reg space - need to
484 * catch and handle this case if/when we do half-regs
485 */
486 } else
487 if (src->type == P_IMMD || src->type == P_CONST) {
488 set_long(pc, e);
489 set_data(pc, src, 0x7f, 9, e);
490 e->inst[1] |= 0x20000000; /* src0 const? */
491 } else {
492 if (src->type == P_ATTR) {
493 set_long(pc, e);
494 e->inst[1] |= 0x00200000;
495 }
496
497 alloc_reg(pc, src);
498 e->inst[0] |= (src->hw << 9);
499 }
500
501 if (is_long(e) && !is_immd(e)) {
502 e->inst[1] |= 0x04000000; /* 32-bit */
503 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
504 if (!(e->inst[1] & 0x20000000))
505 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
506 } else
507 e->inst[0] |= 0x00008000;
508
509 emit(pc, e);
510 }
511
512 static INLINE void
513 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
514 {
515 struct nv50_reg *imm = alloc_immd(pc, f);
516 emit_mov(pc, dst, imm);
517 FREE(imm);
518 }
519
520 static boolean
521 check_swap_src_0_1(struct nv50_pc *pc,
522 struct nv50_reg **s0, struct nv50_reg **s1)
523 {
524 struct nv50_reg *src0 = *s0, *src1 = *s1;
525
526 if (src0->type == P_CONST) {
527 if (src1->type != P_CONST) {
528 *s0 = src1;
529 *s1 = src0;
530 return TRUE;
531 }
532 } else
533 if (src1->type == P_ATTR) {
534 if (src0->type != P_ATTR) {
535 *s0 = src1;
536 *s1 = src0;
537 return TRUE;
538 }
539 }
540
541 return FALSE;
542 }
543
544 static void
545 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
546 {
547 if (src->type == P_ATTR) {
548 set_long(pc, e);
549 e->inst[1] |= 0x00200000;
550 } else
551 if (src->type == P_CONST || src->type == P_IMMD) {
552 struct nv50_reg *temp = temp_temp(pc);
553
554 emit_mov(pc, temp, src);
555 src = temp;
556 }
557
558 alloc_reg(pc, src);
559 e->inst[0] |= (src->hw << 9);
560 }
561
562 static void
563 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
564 {
565 if (src->type == P_ATTR) {
566 struct nv50_reg *temp = temp_temp(pc);
567
568 emit_mov(pc, temp, src);
569 src = temp;
570 } else
571 if (src->type == P_CONST || src->type == P_IMMD) {
572 assert(!(e->inst[0] & 0x00800000));
573 if (e->inst[0] & 0x01000000) {
574 struct nv50_reg *temp = temp_temp(pc);
575
576 emit_mov(pc, temp, src);
577 src = temp;
578 } else {
579 set_data(pc, src, 0x7f, 16, e);
580 e->inst[0] |= 0x00800000;
581 }
582 }
583
584 alloc_reg(pc, src);
585 e->inst[0] |= (src->hw << 16);
586 }
587
588 static void
589 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
590 {
591 set_long(pc, e);
592
593 if (src->type == P_ATTR) {
594 struct nv50_reg *temp = temp_temp(pc);
595
596 emit_mov(pc, temp, src);
597 src = temp;
598 } else
599 if (src->type == P_CONST || src->type == P_IMMD) {
600 assert(!(e->inst[0] & 0x01000000));
601 if (e->inst[0] & 0x00800000) {
602 struct nv50_reg *temp = temp_temp(pc);
603
604 emit_mov(pc, temp, src);
605 src = temp;
606 } else {
607 set_data(pc, src, 0x7f, 32+14, e);
608 e->inst[0] |= 0x01000000;
609 }
610 }
611
612 alloc_reg(pc, src);
613 e->inst[1] |= (src->hw << 14);
614 }
615
616 static void
617 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
618 struct nv50_reg *src1)
619 {
620 struct nv50_program_exec *e = exec(pc);
621
622 e->inst[0] |= 0xc0000000;
623
624 if (!pc->allow32)
625 set_long(pc, e);
626
627 check_swap_src_0_1(pc, &src0, &src1);
628 set_dst(pc, dst, e);
629 set_src_0(pc, src0, e);
630 if (src1->type == P_IMMD && !is_long(e))
631 set_immd(pc, src1, e);
632 else
633 set_src_1(pc, src1, e);
634
635 emit(pc, e);
636 }
637
638 static void
639 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
640 struct nv50_reg *src0, struct nv50_reg *src1)
641 {
642 struct nv50_program_exec *e = exec(pc);
643
644 e->inst[0] |= 0xb0000000;
645
646 if (!pc->allow32)
647 set_long(pc, e);
648
649 check_swap_src_0_1(pc, &src0, &src1);
650 set_dst(pc, dst, e);
651 set_src_0(pc, src0, e);
652 if (is_long(e) || src1->type == P_CONST || src1->type == P_ATTR)
653 set_src_2(pc, src1, e);
654 else
655 if (src1->type == P_IMMD)
656 set_immd(pc, src1, e);
657 else
658 set_src_1(pc, src1, e);
659
660 emit(pc, e);
661 }
662
663 static void
664 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
665 struct nv50_reg *src0, struct nv50_reg *src1)
666 {
667 struct nv50_program_exec *e = exec(pc);
668
669 set_long(pc, e);
670 e->inst[0] |= 0xb0000000;
671 e->inst[1] |= (sub << 29);
672
673 check_swap_src_0_1(pc, &src0, &src1);
674 set_dst(pc, dst, e);
675 set_src_0(pc, src0, e);
676 set_src_1(pc, src1, e);
677
678 emit(pc, e);
679 }
680
681 static void
682 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
683 struct nv50_reg *src1)
684 {
685 struct nv50_program_exec *e = exec(pc);
686
687 e->inst[0] |= 0xb0000000;
688
689 set_long(pc, e);
690 if (check_swap_src_0_1(pc, &src0, &src1))
691 e->inst[1] |= 0x04000000;
692 else
693 e->inst[1] |= 0x08000000;
694
695 set_dst(pc, dst, e);
696 set_src_0(pc, src0, e);
697 set_src_2(pc, src1, e);
698
699 emit(pc, e);
700 }
701
702 static void
703 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
704 struct nv50_reg *src1, struct nv50_reg *src2)
705 {
706 struct nv50_program_exec *e = exec(pc);
707
708 e->inst[0] |= 0xe0000000;
709
710 check_swap_src_0_1(pc, &src0, &src1);
711 set_dst(pc, dst, e);
712 set_src_0(pc, src0, e);
713 set_src_1(pc, src1, e);
714 set_src_2(pc, src2, e);
715
716 emit(pc, e);
717 }
718
719 static void
720 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
721 struct nv50_reg *src1, struct nv50_reg *src2)
722 {
723 struct nv50_program_exec *e = exec(pc);
724
725 e->inst[0] |= 0xe0000000;
726 set_long(pc, e);
727 e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
728
729 check_swap_src_0_1(pc, &src0, &src1);
730 set_dst(pc, dst, e);
731 set_src_0(pc, src0, e);
732 set_src_1(pc, src1, e);
733 set_src_2(pc, src2, e);
734
735 emit(pc, e);
736 }
737
738 static void
739 emit_flop(struct nv50_pc *pc, unsigned sub,
740 struct nv50_reg *dst, struct nv50_reg *src)
741 {
742 struct nv50_program_exec *e = exec(pc);
743
744 e->inst[0] |= 0x90000000;
745 if (sub) {
746 set_long(pc, e);
747 e->inst[1] |= (sub << 29);
748 }
749
750 set_dst(pc, dst, e);
751 set_src_0(pc, src, e);
752
753 emit(pc, e);
754 }
755
756 static void
757 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
758 {
759 struct nv50_program_exec *e = exec(pc);
760
761 e->inst[0] |= 0xb0000000;
762
763 set_dst(pc, dst, e);
764 set_src_0(pc, src, e);
765 set_long(pc, e);
766 e->inst[1] |= (6 << 29) | 0x00004000;
767
768 emit(pc, e);
769 }
770
771 static void
772 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
773 {
774 struct nv50_program_exec *e = exec(pc);
775
776 e->inst[0] |= 0xb0000000;
777
778 set_dst(pc, dst, e);
779 set_src_0(pc, src, e);
780 set_long(pc, e);
781 e->inst[1] |= (6 << 29);
782
783 emit(pc, e);
784 }
785
786 #define CVTOP_RN 0x01
787 #define CVTOP_FLOOR 0x03
788 #define CVTOP_CEIL 0x05
789 #define CVTOP_TRUNC 0x07
790 #define CVTOP_SAT 0x08
791 #define CVTOP_ABS 0x10
792
793 #define CVT_F32_F32 0xc4
794 #define CVT_F32_S32 0x44
795 #define CVT_F32_U32 0x64
796 #define CVT_S32_F32 0x8c
797 #define CVT_S32_S32 0x0c
798 #define CVT_F32_F32_ROP 0xcc
799
800 static void
801 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
802 int wp, unsigned cop, unsigned fmt)
803 {
804 struct nv50_program_exec *e;
805
806 e = exec(pc);
807 set_long(pc, e);
808
809 e->inst[0] |= 0xa0000000;
810 e->inst[1] |= 0x00004000;
811 e->inst[1] |= (cop << 16);
812 e->inst[1] |= (fmt << 24);
813 set_src_0(pc, src, e);
814
815 if (wp >= 0)
816 set_pred_wr(pc, 1, wp, e);
817
818 if (dst)
819 set_dst(pc, dst, e);
820 else {
821 e->inst[0] |= 0x000001fc;
822 e->inst[1] |= 0x00000008;
823 }
824
825 emit(pc, e);
826 }
827
828 static void
829 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
830 struct nv50_reg *src0, struct nv50_reg *src1)
831 {
832 struct nv50_program_exec *e = exec(pc);
833 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
834 struct nv50_reg *rdst;
835
836 assert(c_op <= 7);
837 if (check_swap_src_0_1(pc, &src0, &src1))
838 c_op = inv_cop[c_op];
839
840 rdst = dst;
841 if (dst->type != P_TEMP)
842 dst = alloc_temp(pc, NULL);
843
844 /* set.u32 */
845 set_long(pc, e);
846 e->inst[0] |= 0xb0000000;
847 e->inst[1] |= (3 << 29);
848 e->inst[1] |= (c_op << 14);
849 /*XXX: breaks things, .u32 by default?
850 * decuda will disasm as .u16 and use .lo/.hi regs, but this
851 * doesn't seem to match what the hw actually does.
852 inst[1] |= 0x04000000; << breaks things.. .u32 by default?
853 */
854 set_dst(pc, dst, e);
855 set_src_0(pc, src0, e);
856 set_src_1(pc, src1, e);
857 emit(pc, e);
858
859 /* cvt.f32.u32 */
860 e = exec(pc);
861 e->inst[0] = 0xa0000001;
862 e->inst[1] = 0x64014780;
863 set_dst(pc, rdst, e);
864 set_src_0(pc, dst, e);
865 emit(pc, e);
866
867 if (dst != rdst)
868 free_temp(pc, dst);
869 }
870
871 static INLINE void
872 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
873 {
874 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
875 }
876
877 static void
878 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
879 struct nv50_reg *v, struct nv50_reg *e)
880 {
881 struct nv50_reg *temp = alloc_temp(pc, NULL);
882
883 emit_flop(pc, 3, temp, v);
884 emit_mul(pc, temp, temp, e);
885 emit_preex2(pc, temp, temp);
886 emit_flop(pc, 6, dst, temp);
887
888 free_temp(pc, temp);
889 }
890
891 static INLINE void
892 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
893 {
894 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
895 }
896
897 static void
898 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
899 struct nv50_reg **src)
900 {
901 struct nv50_reg *one = alloc_immd(pc, 1.0);
902 struct nv50_reg *zero = alloc_immd(pc, 0.0);
903 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
904 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
905 struct nv50_reg *tmp[4];
906 boolean allow32 = pc->allow32;
907
908 pc->allow32 = FALSE;
909
910 if (mask & (3 << 1)) {
911 tmp[0] = alloc_temp(pc, NULL);
912 emit_minmax(pc, 4, tmp[0], src[0], zero);
913 }
914
915 if (mask & (1 << 2)) {
916 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
917
918 tmp[1] = temp_temp(pc);
919 emit_minmax(pc, 4, tmp[1], src[1], zero);
920
921 tmp[3] = temp_temp(pc);
922 emit_minmax(pc, 4, tmp[3], src[3], neg128);
923 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
924
925 emit_pow(pc, dst[2], tmp[1], tmp[3]);
926 emit_mov(pc, dst[2], zero);
927 set_pred(pc, 3, 0, pc->p->exec_tail);
928 }
929
930 if (mask & (1 << 1))
931 assimilate_temp(pc, dst[1], tmp[0]);
932 else
933 if (mask & (1 << 2))
934 free_temp(pc, tmp[0]);
935
936 pc->allow32 = allow32;
937
938 /* do this last, in case src[i,j] == dst[0,3] */
939 if (mask & (1 << 0))
940 emit_mov(pc, dst[0], one);
941
942 if (mask & (1 << 3))
943 emit_mov(pc, dst[3], one);
944
945 FREE(pos128);
946 FREE(neg128);
947 FREE(zero);
948 FREE(one);
949 }
950
951 static void
952 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
953 {
954 struct nv50_program_exec *e = exec(pc);
955
956 set_long(pc, e);
957 e->inst[0] |= 0xa0000000; /* delta */
958 e->inst[1] |= (7 << 29); /* delta */
959 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
960 e->inst[1] |= (1 << 14); /* src .f32 */
961 set_dst(pc, dst, e);
962 set_src_0(pc, src, e);
963
964 emit(pc, e);
965 }
966
967 static void
968 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
969 {
970 struct nv50_program_exec *e;
971 const int r_pred = 1;
972
973 /* Sets predicate reg ? */
974 e = exec(pc);
975 e->inst[0] = 0xa00001fd;
976 e->inst[1] = 0xc4014788;
977 set_src_0(pc, src, e);
978 set_pred_wr(pc, 1, r_pred, e);
979 emit(pc, e);
980
981 /* This is probably KILP */
982 e = exec(pc);
983 e->inst[0] = 0x000001fe;
984 set_long(pc, e);
985 set_pred(pc, 1 /* LT? */, r_pred, e);
986 emit(pc, e);
987 }
988
989 static void
990 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
991 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
992 {
993 struct nv50_reg *temp, *t[4];
994 struct nv50_program_exec *e;
995
996 unsigned c, mode, dim;
997
998 switch (type) {
999 case TGSI_TEXTURE_1D:
1000 dim = 1;
1001 break;
1002 case TGSI_TEXTURE_UNKNOWN:
1003 case TGSI_TEXTURE_2D:
1004 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1005 case TGSI_TEXTURE_RECT:
1006 dim = 2;
1007 break;
1008 case TGSI_TEXTURE_3D:
1009 case TGSI_TEXTURE_CUBE:
1010 case TGSI_TEXTURE_SHADOW2D:
1011 case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1012 dim = 3;
1013 break;
1014 default:
1015 assert(0);
1016 break;
1017 }
1018
1019 alloc_temp4(pc, t, 0);
1020
1021 if (proj) {
1022 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1023 mode = pc->interp_mode[src[0]->index];
1024
1025 t[3]->rhw = src[3]->rhw;
1026 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1027 emit_flop(pc, 0, t[3], t[3]);
1028
1029 for (c = 0; c < dim; c++) {
1030 t[c]->rhw = src[c]->rhw;
1031 emit_interp(pc, t[c], t[3],
1032 (mode | INTERP_PERSPECTIVE));
1033 }
1034 } else {
1035 emit_flop(pc, 0, t[3], src[3]);
1036 for (c = 0; c < dim; c++)
1037 emit_mul(pc, t[c], src[c], t[3]);
1038
1039 /* XXX: for some reason the blob sometimes uses MAD:
1040 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1041 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1042 */
1043 }
1044 } else {
1045 if (type == TGSI_TEXTURE_CUBE) {
1046 temp = temp_temp(pc);
1047 emit_minmax(pc, 4, temp, src[0], src[1]);
1048 emit_minmax(pc, 4, temp, temp, src[2]);
1049 emit_flop(pc, 0, temp, temp);
1050 for (c = 0; c < 3; c++)
1051 emit_mul(pc, t[c], src[c], temp);
1052 } else {
1053 for (c = 0; c < dim; c++)
1054 emit_mov(pc, t[c], src[c]);
1055 }
1056 }
1057
1058 e = exec(pc);
1059 set_long(pc, e);
1060 e->inst[0] |= 0xf0000000;
1061 e->inst[1] |= 0x00000004;
1062 set_dst(pc, t[0], e);
1063 e->inst[0] |= (unit << 9);
1064
1065 if (dim == 2)
1066 e->inst[0] |= 0x00400000;
1067 else
1068 if (dim == 3)
1069 e->inst[0] |= 0x00800000;
1070
1071 e->inst[0] |= (mask & 0x3) << 25;
1072 e->inst[1] |= (mask & 0xc) << 12;
1073
1074 emit(pc, e);
1075
1076 #if 1
1077 if (mask & 1) emit_mov(pc, dst[0], t[0]);
1078 if (mask & 2) emit_mov(pc, dst[1], t[1]);
1079 if (mask & 4) emit_mov(pc, dst[2], t[2]);
1080 if (mask & 8) emit_mov(pc, dst[3], t[3]);
1081
1082 free_temp4(pc, t);
1083 #else
1084 /* XXX: if p.e. MUL is used directly after TEX, it would still use
1085 * the texture coordinates, not the fetched values: latency ? */
1086
1087 for (c = 0; c < 4; c++) {
1088 if (mask & (1 << c))
1089 assimilate_temp(pc, dst[c], t[c]);
1090 else
1091 free_temp(pc, t[c]);
1092 }
1093 #endif
1094 }
1095
1096 static void
1097 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1098 {
1099 unsigned q = 0, m = ~0;
1100
1101 assert(!is_long(e));
1102
1103 switch (e->inst[0] >> 28) {
1104 case 0x1:
1105 /* MOV */
1106 q = 0x0403c000;
1107 m = 0xffff7fff;
1108 break;
1109 case 0x8:
1110 /* INTERP */
1111 m = ~0x02000000;
1112 if (e->inst[0] & 0x02000000)
1113 q = 0x00020000;
1114 break;
1115 case 0x9:
1116 /* RCP */
1117 break;
1118 case 0xB:
1119 /* ADD */
1120 m = ~(127 << 16);
1121 q = ((e->inst[0] & (~m)) >> 2);
1122 break;
1123 case 0xC:
1124 /* MUL */
1125 m = ~0x00008000;
1126 q = ((e->inst[0] & (~m)) << 12);
1127 break;
1128 case 0xE:
1129 /* MAD (if src2 == dst) */
1130 q = ((e->inst[0] & 0x1fc) << 12);
1131 break;
1132 default:
1133 assert(0);
1134 break;
1135 }
1136
1137 set_long(pc, e);
1138 pc->p->exec_size++;
1139
1140 e->inst[0] &= m;
1141 e->inst[1] |= q;
1142 }
1143
1144 static struct nv50_reg *
1145 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1146 {
1147 switch (dst->DstRegister.File) {
1148 case TGSI_FILE_TEMPORARY:
1149 return &pc->temp[dst->DstRegister.Index * 4 + c];
1150 case TGSI_FILE_OUTPUT:
1151 return &pc->result[dst->DstRegister.Index * 4 + c];
1152 case TGSI_FILE_NULL:
1153 return NULL;
1154 default:
1155 break;
1156 }
1157
1158 return NULL;
1159 }
1160
1161 static struct nv50_reg *
1162 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
1163 {
1164 struct nv50_reg *r = NULL;
1165 struct nv50_reg *temp;
1166 unsigned sgn, c;
1167
1168 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1169
1170 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1171 switch (c) {
1172 case TGSI_EXTSWIZZLE_X:
1173 case TGSI_EXTSWIZZLE_Y:
1174 case TGSI_EXTSWIZZLE_Z:
1175 case TGSI_EXTSWIZZLE_W:
1176 switch (src->SrcRegister.File) {
1177 case TGSI_FILE_INPUT:
1178 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1179 break;
1180 case TGSI_FILE_TEMPORARY:
1181 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1182 break;
1183 case TGSI_FILE_CONSTANT:
1184 r = &pc->param[src->SrcRegister.Index * 4 + c];
1185 break;
1186 case TGSI_FILE_IMMEDIATE:
1187 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1188 break;
1189 case TGSI_FILE_SAMPLER:
1190 break;
1191 default:
1192 assert(0);
1193 break;
1194 }
1195 break;
1196 case TGSI_EXTSWIZZLE_ZERO:
1197 r = alloc_immd(pc, 0.0);
1198 return r;
1199 case TGSI_EXTSWIZZLE_ONE:
1200 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1201 return alloc_immd(pc, -1.0);
1202 return alloc_immd(pc, 1.0);
1203 default:
1204 assert(0);
1205 break;
1206 }
1207
1208 switch (sgn) {
1209 case TGSI_UTIL_SIGN_KEEP:
1210 break;
1211 case TGSI_UTIL_SIGN_CLEAR:
1212 temp = temp_temp(pc);
1213 emit_abs(pc, temp, r);
1214 r = temp;
1215 break;
1216 case TGSI_UTIL_SIGN_TOGGLE:
1217 temp = temp_temp(pc);
1218 emit_neg(pc, temp, r);
1219 r = temp;
1220 break;
1221 case TGSI_UTIL_SIGN_SET:
1222 temp = temp_temp(pc);
1223 emit_abs(pc, temp, r);
1224 emit_neg(pc, temp, temp);
1225 r = temp;
1226 break;
1227 default:
1228 assert(0);
1229 break;
1230 }
1231
1232 return r;
1233 }
1234
1235 /* returns TRUE if instruction can overwrite sources before they're read */
1236 static boolean
1237 direct2dest_op(const struct tgsi_full_instruction *insn)
1238 {
1239 if (insn->Instruction.Saturate)
1240 return FALSE;
1241
1242 switch (insn->Instruction.Opcode) {
1243 case TGSI_OPCODE_COS:
1244 case TGSI_OPCODE_DP3:
1245 case TGSI_OPCODE_DP4:
1246 case TGSI_OPCODE_DPH:
1247 case TGSI_OPCODE_KIL:
1248 case TGSI_OPCODE_LIT:
1249 case TGSI_OPCODE_POW:
1250 case TGSI_OPCODE_RCP:
1251 case TGSI_OPCODE_RSQ:
1252 case TGSI_OPCODE_SCS:
1253 case TGSI_OPCODE_SIN:
1254 case TGSI_OPCODE_TEX:
1255 case TGSI_OPCODE_TXP:
1256 return FALSE;
1257 default:
1258 return TRUE;
1259 }
1260 }
1261
1262 static boolean
1263 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1264 {
1265 const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1266 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1267 unsigned mask, sat, unit;
1268 boolean assimilate = FALSE;
1269 int i, c;
1270
1271 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1272 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1273
1274 for (c = 0; c < 4; c++) {
1275 if (mask & (1 << c))
1276 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1277 else
1278 dst[c] = NULL;
1279 rdst[c] = NULL;
1280 src[0][c] = NULL;
1281 src[1][c] = NULL;
1282 src[2][c] = NULL;
1283 }
1284
1285 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1286 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1287
1288 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1289 unit = fs->SrcRegister.Index;
1290
1291 for (c = 0; c < 4; c++)
1292 src[i][c] = tgsi_src(pc, c, fs);
1293 }
1294
1295 if (sat) {
1296 for (c = 0; c < 4; c++) {
1297 rdst[c] = dst[c];
1298 dst[c] = temp_temp(pc);
1299 }
1300 } else
1301 if (direct2dest_op(inst)) {
1302 for (c = 0; c < 4; c++) {
1303 if (!dst[c] || dst[c]->type != P_TEMP)
1304 continue;
1305
1306 for (i = c + 1; i < 4; i++) {
1307 if (dst[c] == src[0][i] ||
1308 dst[c] == src[1][i] ||
1309 dst[c] == src[2][i])
1310 break;
1311 }
1312 if (i == 4)
1313 continue;
1314
1315 assimilate = TRUE;
1316 rdst[c] = dst[c];
1317 dst[c] = alloc_temp(pc, NULL);
1318 }
1319 }
1320
1321 switch (inst->Instruction.Opcode) {
1322 case TGSI_OPCODE_ABS:
1323 for (c = 0; c < 4; c++) {
1324 if (!(mask & (1 << c)))
1325 continue;
1326 emit_abs(pc, dst[c], src[0][c]);
1327 }
1328 break;
1329 case TGSI_OPCODE_ADD:
1330 for (c = 0; c < 4; c++) {
1331 if (!(mask & (1 << c)))
1332 continue;
1333 emit_add(pc, dst[c], src[0][c], src[1][c]);
1334 }
1335 break;
1336 case TGSI_OPCODE_COS:
1337 temp = temp_temp(pc);
1338 emit_precossin(pc, temp, src[0][0]);
1339 emit_flop(pc, 5, temp, temp);
1340 for (c = 0; c < 4; c++) {
1341 if (!(mask & (1 << c)))
1342 continue;
1343 emit_mov(pc, dst[c], temp);
1344 }
1345 break;
1346 case TGSI_OPCODE_DP3:
1347 temp = temp_temp(pc);
1348 emit_mul(pc, temp, src[0][0], src[1][0]);
1349 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1350 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1351 for (c = 0; c < 4; c++) {
1352 if (!(mask & (1 << c)))
1353 continue;
1354 emit_mov(pc, dst[c], temp);
1355 }
1356 break;
1357 case TGSI_OPCODE_DP4:
1358 temp = temp_temp(pc);
1359 emit_mul(pc, temp, src[0][0], src[1][0]);
1360 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1361 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1362 emit_mad(pc, temp, src[0][3], src[1][3], temp);
1363 for (c = 0; c < 4; c++) {
1364 if (!(mask & (1 << c)))
1365 continue;
1366 emit_mov(pc, dst[c], temp);
1367 }
1368 break;
1369 case TGSI_OPCODE_DPH:
1370 temp = temp_temp(pc);
1371 emit_mul(pc, temp, src[0][0], src[1][0]);
1372 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1373 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1374 emit_add(pc, temp, src[1][3], temp);
1375 for (c = 0; c < 4; c++) {
1376 if (!(mask & (1 << c)))
1377 continue;
1378 emit_mov(pc, dst[c], temp);
1379 }
1380 break;
1381 case TGSI_OPCODE_DST:
1382 {
1383 struct nv50_reg *one = alloc_immd(pc, 1.0);
1384 if (mask & (1 << 0))
1385 emit_mov(pc, dst[0], one);
1386 if (mask & (1 << 1))
1387 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1388 if (mask & (1 << 2))
1389 emit_mov(pc, dst[2], src[0][2]);
1390 if (mask & (1 << 3))
1391 emit_mov(pc, dst[3], src[1][3]);
1392 FREE(one);
1393 }
1394 break;
1395 case TGSI_OPCODE_EX2:
1396 temp = temp_temp(pc);
1397 emit_preex2(pc, temp, src[0][0]);
1398 emit_flop(pc, 6, temp, temp);
1399 for (c = 0; c < 4; c++) {
1400 if (!(mask & (1 << c)))
1401 continue;
1402 emit_mov(pc, dst[c], temp);
1403 }
1404 break;
1405 case TGSI_OPCODE_FLR:
1406 for (c = 0; c < 4; c++) {
1407 if (!(mask & (1 << c)))
1408 continue;
1409 emit_flr(pc, dst[c], src[0][c]);
1410 }
1411 break;
1412 case TGSI_OPCODE_FRC:
1413 temp = temp_temp(pc);
1414 for (c = 0; c < 4; c++) {
1415 if (!(mask & (1 << c)))
1416 continue;
1417 emit_flr(pc, temp, src[0][c]);
1418 emit_sub(pc, dst[c], src[0][c], temp);
1419 }
1420 break;
1421 case TGSI_OPCODE_KIL:
1422 emit_kil(pc, src[0][0]);
1423 emit_kil(pc, src[0][1]);
1424 emit_kil(pc, src[0][2]);
1425 emit_kil(pc, src[0][3]);
1426 pc->p->cfg.fp.regs[2] |= 0x00100000;
1427 break;
1428 case TGSI_OPCODE_LIT:
1429 emit_lit(pc, &dst[0], mask, &src[0][0]);
1430 break;
1431 case TGSI_OPCODE_LG2:
1432 temp = temp_temp(pc);
1433 emit_flop(pc, 3, temp, src[0][0]);
1434 for (c = 0; c < 4; c++) {
1435 if (!(mask & (1 << c)))
1436 continue;
1437 emit_mov(pc, dst[c], temp);
1438 }
1439 break;
1440 case TGSI_OPCODE_LRP:
1441 temp = temp_temp(pc);
1442 for (c = 0; c < 4; c++) {
1443 if (!(mask & (1 << c)))
1444 continue;
1445 emit_sub(pc, temp, src[1][c], src[2][c]);
1446 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1447 }
1448 break;
1449 case TGSI_OPCODE_MAD:
1450 for (c = 0; c < 4; c++) {
1451 if (!(mask & (1 << c)))
1452 continue;
1453 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1454 }
1455 break;
1456 case TGSI_OPCODE_MAX:
1457 for (c = 0; c < 4; c++) {
1458 if (!(mask & (1 << c)))
1459 continue;
1460 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1461 }
1462 break;
1463 case TGSI_OPCODE_MIN:
1464 for (c = 0; c < 4; c++) {
1465 if (!(mask & (1 << c)))
1466 continue;
1467 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1468 }
1469 break;
1470 case TGSI_OPCODE_MOV:
1471 for (c = 0; c < 4; c++) {
1472 if (!(mask & (1 << c)))
1473 continue;
1474 emit_mov(pc, dst[c], src[0][c]);
1475 }
1476 break;
1477 case TGSI_OPCODE_MUL:
1478 for (c = 0; c < 4; c++) {
1479 if (!(mask & (1 << c)))
1480 continue;
1481 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1482 }
1483 break;
1484 case TGSI_OPCODE_POW:
1485 temp = temp_temp(pc);
1486 emit_pow(pc, temp, src[0][0], src[1][0]);
1487 for (c = 0; c < 4; c++) {
1488 if (!(mask & (1 << c)))
1489 continue;
1490 emit_mov(pc, dst[c], temp);
1491 }
1492 break;
1493 case TGSI_OPCODE_RCP:
1494 for (c = 3; c >= 0; c--) {
1495 if (!(mask & (1 << c)))
1496 continue;
1497 emit_flop(pc, 0, dst[c], src[0][0]);
1498 }
1499 break;
1500 case TGSI_OPCODE_RSQ:
1501 for (c = 3; c >= 0; c--) {
1502 if (!(mask & (1 << c)))
1503 continue;
1504 emit_flop(pc, 2, dst[c], src[0][0]);
1505 }
1506 break;
1507 case TGSI_OPCODE_SCS:
1508 temp = temp_temp(pc);
1509 emit_precossin(pc, temp, src[0][0]);
1510 if (mask & (1 << 0))
1511 emit_flop(pc, 5, dst[0], temp);
1512 if (mask & (1 << 1))
1513 emit_flop(pc, 4, dst[1], temp);
1514 if (mask & (1 << 2))
1515 emit_mov_immdval(pc, dst[2], 0.0);
1516 if (mask & (1 << 3))
1517 emit_mov_immdval(pc, dst[3], 1.0);
1518 break;
1519 case TGSI_OPCODE_SGE:
1520 for (c = 0; c < 4; c++) {
1521 if (!(mask & (1 << c)))
1522 continue;
1523 emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1524 }
1525 break;
1526 case TGSI_OPCODE_SIN:
1527 temp = temp_temp(pc);
1528 emit_precossin(pc, temp, src[0][0]);
1529 emit_flop(pc, 4, temp, temp);
1530 for (c = 0; c < 4; c++) {
1531 if (!(mask & (1 << c)))
1532 continue;
1533 emit_mov(pc, dst[c], temp);
1534 }
1535 break;
1536 case TGSI_OPCODE_SLT:
1537 for (c = 0; c < 4; c++) {
1538 if (!(mask & (1 << c)))
1539 continue;
1540 emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1541 }
1542 break;
1543 case TGSI_OPCODE_SUB:
1544 for (c = 0; c < 4; c++) {
1545 if (!(mask & (1 << c)))
1546 continue;
1547 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1548 }
1549 break;
1550 case TGSI_OPCODE_TEX:
1551 emit_tex(pc, dst, mask, src[0], unit,
1552 inst->InstructionExtTexture.Texture, FALSE);
1553 break;
1554 case TGSI_OPCODE_TXP:
1555 emit_tex(pc, dst, mask, src[0], unit,
1556 inst->InstructionExtTexture.Texture, TRUE);
1557 break;
1558 case TGSI_OPCODE_XPD:
1559 temp = temp_temp(pc);
1560 if (mask & (1 << 0)) {
1561 emit_mul(pc, temp, src[0][2], src[1][1]);
1562 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1563 }
1564 if (mask & (1 << 1)) {
1565 emit_mul(pc, temp, src[0][0], src[1][2]);
1566 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1567 }
1568 if (mask & (1 << 2)) {
1569 emit_mul(pc, temp, src[0][1], src[1][0]);
1570 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1571 }
1572 if (mask & (1 << 3))
1573 emit_mov_immdval(pc, dst[3], 1.0);
1574 break;
1575 case TGSI_OPCODE_END:
1576 break;
1577 default:
1578 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1579 return FALSE;
1580 }
1581
1582 if (sat) {
1583 for (c = 0; c < 4; c++) {
1584 if (!(mask & (1 << c)))
1585 continue;
1586 emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
1587 CVT_F32_F32);
1588 }
1589 } else if (assimilate) {
1590 for (c = 0; c < 4; c++)
1591 if (rdst[c])
1592 assimilate_temp(pc, rdst[c], dst[c]);
1593 }
1594
1595 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1596 for (c = 0; c < 4; c++) {
1597 if (!src[i][c])
1598 continue;
1599 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1600 FREE(src[i][c]);
1601 else
1602 if (src[i][c]->acc == pc->insn_cur)
1603 release_hw(pc, src[i][c]);
1604 }
1605 }
1606
1607 kill_temp_temp(pc);
1608 return TRUE;
1609 }
1610
1611 /* Adjust a bitmask that indicates what components of a source are used,
1612 * we use this in tx_prep so we only load interpolants that are needed.
1613 */
1614 static void
1615 insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1616 {
1617 const struct tgsi_instruction_ext_texture *tex;
1618
1619 switch (insn->Instruction.Opcode) {
1620 case TGSI_OPCODE_DP3:
1621 *mask = 0x7;
1622 break;
1623 case TGSI_OPCODE_DP4:
1624 case TGSI_OPCODE_DPH:
1625 *mask = 0xF;
1626 break;
1627 case TGSI_OPCODE_LIT:
1628 *mask = 0xB;
1629 break;
1630 case TGSI_OPCODE_RCP:
1631 case TGSI_OPCODE_RSQ:
1632 *mask = 0x1;
1633 break;
1634 case TGSI_OPCODE_TEX:
1635 case TGSI_OPCODE_TXP:
1636 assert(insn->Instruction.Extended);
1637 tex = &insn->InstructionExtTexture;
1638
1639 *mask = 0x7;
1640 if (tex->Texture == TGSI_TEXTURE_1D)
1641 *mask = 0x1;
1642 else
1643 if (tex->Texture == TGSI_TEXTURE_2D)
1644 *mask = 0x3;
1645
1646 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1647 *mask |= 0x8;
1648 break;
1649 default:
1650 break;
1651 }
1652 }
1653
1654 static void
1655 prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1656 unsigned *r_usage[2])
1657 {
1658 const struct tgsi_full_instruction *insn;
1659 const struct tgsi_full_src_register *src;
1660 const struct tgsi_dst_register *dst;
1661
1662 unsigned i, c, k, n, mask, *acc_p;
1663
1664 insn = &tok->FullInstruction;
1665 dst = &insn->FullDstRegisters[0].DstRegister;
1666 mask = dst->WriteMask;
1667
1668 if (!r_usage[0])
1669 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1670 if (!r_usage[1])
1671 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1672
1673 if (dst->File == TGSI_FILE_TEMPORARY) {
1674 for (c = 0; c < 4; c++) {
1675 if (!(mask & (1 << c)))
1676 continue;
1677 r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1678 }
1679 }
1680
1681 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1682 src = &insn->FullSrcRegisters[i];
1683
1684 switch (src->SrcRegister.File) {
1685 case TGSI_FILE_TEMPORARY:
1686 acc_p = r_usage[0];
1687 break;
1688 case TGSI_FILE_INPUT:
1689 acc_p = r_usage[1];
1690 break;
1691 default:
1692 continue;
1693 }
1694
1695 insn_adjust_mask(insn, &mask);
1696
1697 for (c = 0; c < 4; c++) {
1698 if (!(mask & (1 << c)))
1699 continue;
1700
1701 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1702 switch (k) {
1703 case TGSI_EXTSWIZZLE_X:
1704 case TGSI_EXTSWIZZLE_Y:
1705 case TGSI_EXTSWIZZLE_Z:
1706 case TGSI_EXTSWIZZLE_W:
1707 n = src->SrcRegister.Index * 4 + k;
1708 acc_p[n] = pc->insn_nr;
1709 break;
1710 default:
1711 break;
1712 }
1713 }
1714 }
1715 }
1716
1717 static unsigned
1718 load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1719 int *aid, int *p_oid)
1720 {
1721 struct nv50_reg *iv;
1722 int oid, c, n;
1723 unsigned mask = 0;
1724
1725 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1726
1727 for (c = 0, n = i * 4; c < 4; c++, n++) {
1728 oid = (*p_oid)++;
1729 pc->attr[n].type = P_TEMP;
1730 pc->attr[n].index = i;
1731
1732 if (pc->attr[n].acc == acc[n])
1733 continue;
1734 mask |= (1 << c);
1735
1736 pc->attr[n].acc = acc[n];
1737 pc->attr[n].rhw = pc->attr[n].hw = -1;
1738 alloc_reg(pc, &pc->attr[n]);
1739
1740 pc->attr[n].rhw = (*aid)++;
1741 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1742
1743 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1744 (*mid)++;
1745 pc->p->cfg.fp.regs[1] += 0x00010001;
1746 }
1747
1748 return mask;
1749 }
1750
1751 static boolean
1752 nv50_program_tx_prep(struct nv50_pc *pc)
1753 {
1754 struct tgsi_parse_context p;
1755 boolean ret = FALSE;
1756 unsigned i, c;
1757 unsigned fcol, bcol, fcrd, depr;
1758
1759 /* count (centroid) perspective interpolations */
1760 unsigned centroid_loads = 0;
1761 unsigned perspect_loads = 0;
1762
1763 /* track register access for temps and attrs */
1764 unsigned *r_usage[2];
1765 r_usage[0] = NULL;
1766 r_usage[1] = NULL;
1767
1768 depr = fcol = bcol = fcrd = 0xffff;
1769
1770 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1771 pc->p->cfg.fp.regs[0] = 0x01000404;
1772 pc->p->cfg.fp.regs[1] = 0x00000400;
1773 }
1774
1775 tgsi_parse_init(&p, pc->p->pipe.tokens);
1776 while (!tgsi_parse_end_of_tokens(&p)) {
1777 const union tgsi_full_token *tok = &p.FullToken;
1778
1779 tgsi_parse_token(&p);
1780 switch (tok->Token.Type) {
1781 case TGSI_TOKEN_TYPE_IMMEDIATE:
1782 {
1783 const struct tgsi_full_immediate *imm =
1784 &p.FullToken.FullImmediate;
1785
1786 ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1787 imm->u.ImmediateFloat32[1].Float,
1788 imm->u.ImmediateFloat32[2].Float,
1789 imm->u.ImmediateFloat32[3].Float);
1790 }
1791 break;
1792 case TGSI_TOKEN_TYPE_DECLARATION:
1793 {
1794 const struct tgsi_full_declaration *d;
1795 unsigned last, first, mode;
1796
1797 d = &p.FullToken.FullDeclaration;
1798 first = d->DeclarationRange.First;
1799 last = d->DeclarationRange.Last;
1800
1801 switch (d->Declaration.File) {
1802 case TGSI_FILE_TEMPORARY:
1803 if (pc->temp_nr < (last + 1))
1804 pc->temp_nr = last + 1;
1805 break;
1806 case TGSI_FILE_OUTPUT:
1807 if (pc->result_nr < (last + 1))
1808 pc->result_nr = last + 1;
1809
1810 if (!d->Declaration.Semantic)
1811 break;
1812
1813 switch (d->Semantic.SemanticName) {
1814 case TGSI_SEMANTIC_POSITION:
1815 depr = first;
1816 pc->p->cfg.fp.regs[2] |= 0x00000100;
1817 pc->p->cfg.fp.regs[3] |= 0x00000011;
1818 break;
1819 default:
1820 break;
1821 }
1822
1823 break;
1824 case TGSI_FILE_INPUT:
1825 {
1826 if (pc->attr_nr < (last + 1))
1827 pc->attr_nr = last + 1;
1828
1829 if (pc->p->type != PIPE_SHADER_FRAGMENT)
1830 break;
1831
1832 switch (d->Declaration.Interpolate) {
1833 case TGSI_INTERPOLATE_CONSTANT:
1834 mode = INTERP_FLAT;
1835 break;
1836 case TGSI_INTERPOLATE_PERSPECTIVE:
1837 mode = INTERP_PERSPECTIVE;
1838 break;
1839 default:
1840 mode = INTERP_LINEAR;
1841 break;
1842 }
1843
1844 if (d->Declaration.Semantic) {
1845 switch (d->Semantic.SemanticName) {
1846 case TGSI_SEMANTIC_POSITION:
1847 fcrd = first;
1848 break;
1849 case TGSI_SEMANTIC_COLOR:
1850 fcol = first;
1851 mode = INTERP_PERSPECTIVE;
1852 break;
1853 case TGSI_SEMANTIC_BCOLOR:
1854 bcol = first;
1855 mode = INTERP_PERSPECTIVE;
1856 break;
1857 }
1858 }
1859
1860 if (d->Declaration.Centroid) {
1861 mode |= INTERP_CENTROID;
1862 if (mode & INTERP_PERSPECTIVE)
1863 centroid_loads++;
1864 } else
1865 if (mode & INTERP_PERSPECTIVE)
1866 perspect_loads++;
1867
1868 assert(last < 32);
1869 for (i = first; i <= last; i++)
1870 pc->interp_mode[i] = mode;
1871 }
1872 break;
1873 case TGSI_FILE_CONSTANT:
1874 if (pc->param_nr < (last + 1))
1875 pc->param_nr = last + 1;
1876 break;
1877 case TGSI_FILE_SAMPLER:
1878 break;
1879 default:
1880 NOUVEAU_ERR("bad decl file %d\n",
1881 d->Declaration.File);
1882 goto out_err;
1883 }
1884 }
1885 break;
1886 case TGSI_TOKEN_TYPE_INSTRUCTION:
1887 pc->insn_nr++;
1888 prep_inspect_insn(pc, tok, r_usage);
1889 break;
1890 default:
1891 break;
1892 }
1893 }
1894
1895 if (pc->temp_nr) {
1896 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1897 if (!pc->temp)
1898 goto out_err;
1899
1900 for (i = 0; i < pc->temp_nr; i++) {
1901 for (c = 0; c < 4; c++) {
1902 pc->temp[i*4+c].type = P_TEMP;
1903 pc->temp[i*4+c].hw = -1;
1904 pc->temp[i*4+c].rhw = -1;
1905 pc->temp[i*4+c].index = i;
1906 pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1907 }
1908 }
1909 }
1910
1911 if (pc->attr_nr) {
1912 int oid = 4, mid = 4, aid = 0;
1913 /* oid = VP output id
1914 * aid = FP attribute/interpolant id
1915 * mid = VP output mapping field ID
1916 */
1917
1918 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1919 if (!pc->attr)
1920 goto out_err;
1921
1922 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1923 /* position should be loaded first */
1924 if (fcrd != 0xffff) {
1925 unsigned mask;
1926 mid = 0;
1927 mask = load_fp_attrib(pc, fcrd, r_usage[1],
1928 &mid, &aid, &oid);
1929 oid = 0;
1930 pc->p->cfg.fp.regs[1] |= (mask << 24);
1931 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1932 }
1933 pc->p->cfg.fp.map[0] += 0x03020100;
1934
1935 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1936
1937 if (perspect_loads) {
1938 pc->iv_p = alloc_temp(pc, NULL);
1939
1940 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1941 pc->p->cfg.fp.regs[1] |= 0x08000000;
1942 pc->iv_p->rhw = aid++;
1943 emit_interp(pc, pc->iv_p, NULL,
1944 INTERP_LINEAR);
1945 emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1946 } else {
1947 pc->iv_p->rhw = aid - 1;
1948 emit_flop(pc, 0, pc->iv_p,
1949 &pc->attr[fcrd * 4 + 3]);
1950 }
1951 }
1952
1953 if (centroid_loads) {
1954 pc->iv_c = alloc_temp(pc, NULL);
1955 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1956 emit_interp(pc, pc->iv_c, NULL,
1957 INTERP_CENTROID);
1958 emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1959 pc->p->cfg.fp.regs[1] |= 0x08000000;
1960 }
1961
1962 for (c = 0; c < 4; c++) {
1963 /* I don't know what these values do, but
1964 * let's set them like the blob does:
1965 */
1966 if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1967 pc->p->cfg.fp.regs[0] += 0x00010000;
1968 if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1969 pc->p->cfg.fp.regs[0] += 0x00010000;
1970 }
1971
1972 for (i = 0; i < pc->attr_nr; i++)
1973 load_fp_attrib(pc, i, r_usage[1],
1974 &mid, &aid, &oid);
1975
1976 if (pc->iv_p)
1977 free_temp(pc, pc->iv_p);
1978 if (pc->iv_c)
1979 free_temp(pc, pc->iv_c);
1980
1981 pc->p->cfg.fp.high_map = (mid / 4);
1982 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
1983 } else {
1984 /* vertex program */
1985 for (i = 0; i < pc->attr_nr * 4; i++) {
1986 pc->p->cfg.vp.attr[aid / 32] |=
1987 (1 << (aid % 32));
1988 pc->attr[i].type = P_ATTR;
1989 pc->attr[i].hw = aid++;
1990 pc->attr[i].index = i / 4;
1991 }
1992 }
1993 }
1994
1995 if (pc->result_nr) {
1996 int rid = 0;
1997
1998 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1999 if (!pc->result)
2000 goto out_err;
2001
2002 for (i = 0; i < pc->result_nr; i++) {
2003 for (c = 0; c < 4; c++) {
2004 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2005 pc->result[i*4+c].type = P_TEMP;
2006 pc->result[i*4+c].hw = -1;
2007 pc->result[i*4+c].rhw = (i == depr) ?
2008 -1 : rid++;
2009 } else {
2010 pc->result[i*4+c].type = P_RESULT;
2011 pc->result[i*4+c].hw = rid++;
2012 }
2013 pc->result[i*4+c].index = i;
2014 }
2015
2016 if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2017 depr != 0xffff) {
2018 pc->result[depr * 4 + 2].rhw =
2019 (pc->result_nr - 1) * 4;
2020 }
2021 }
2022 }
2023
2024 if (pc->param_nr) {
2025 int rid = 0;
2026
2027 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2028 if (!pc->param)
2029 goto out_err;
2030
2031 for (i = 0; i < pc->param_nr; i++) {
2032 for (c = 0; c < 4; c++) {
2033 pc->param[i*4+c].type = P_CONST;
2034 pc->param[i*4+c].hw = rid++;
2035 pc->param[i*4+c].index = i;
2036 }
2037 }
2038 }
2039
2040 if (pc->immd_nr) {
2041 int rid = 0;
2042
2043 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2044 if (!pc->immd)
2045 goto out_err;
2046
2047 for (i = 0; i < pc->immd_nr; i++) {
2048 for (c = 0; c < 4; c++) {
2049 pc->immd[i*4+c].type = P_IMMD;
2050 pc->immd[i*4+c].hw = rid++;
2051 pc->immd[i*4+c].index = i;
2052 }
2053 }
2054 }
2055
2056 ret = TRUE;
2057 out_err:
2058 if (r_usage[0])
2059 FREE(r_usage[0]);
2060 if (r_usage[1])
2061 FREE(r_usage[1]);
2062
2063 tgsi_parse_free(&p);
2064 return ret;
2065 }
2066
2067 static void
2068 free_nv50_pc(struct nv50_pc *pc)
2069 {
2070 if (pc->immd)
2071 FREE(pc->immd);
2072 if (pc->param)
2073 FREE(pc->param);
2074 if (pc->result)
2075 FREE(pc->result);
2076 if (pc->attr)
2077 FREE(pc->attr);
2078 if (pc->temp)
2079 FREE(pc->temp);
2080
2081 FREE(pc);
2082 }
2083
2084 static boolean
2085 nv50_program_tx(struct nv50_program *p)
2086 {
2087 struct tgsi_parse_context parse;
2088 struct nv50_pc *pc;
2089 unsigned k;
2090 boolean ret;
2091
2092 pc = CALLOC_STRUCT(nv50_pc);
2093 if (!pc)
2094 return FALSE;
2095 pc->p = p;
2096 pc->p->cfg.high_temp = 4;
2097
2098 ret = nv50_program_tx_prep(pc);
2099 if (ret == FALSE)
2100 goto out_cleanup;
2101
2102 tgsi_parse_init(&parse, pc->p->pipe.tokens);
2103 while (!tgsi_parse_end_of_tokens(&parse)) {
2104 const union tgsi_full_token *tok = &parse.FullToken;
2105
2106 /* don't allow half insn/immd on first and last instruction */
2107 pc->allow32 = TRUE;
2108 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2109 pc->allow32 = FALSE;
2110
2111 tgsi_parse_token(&parse);
2112
2113 switch (tok->Token.Type) {
2114 case TGSI_TOKEN_TYPE_INSTRUCTION:
2115 ++pc->insn_cur;
2116 ret = nv50_program_tx_insn(pc, tok);
2117 if (ret == FALSE)
2118 goto out_err;
2119 break;
2120 default:
2121 break;
2122 }
2123 }
2124
2125 if (p->type == PIPE_SHADER_FRAGMENT) {
2126 struct nv50_reg out;
2127
2128 out.type = P_TEMP;
2129 for (k = 0; k < pc->result_nr * 4; k++) {
2130 if (pc->result[k].rhw == -1)
2131 continue;
2132 if (pc->result[k].hw != pc->result[k].rhw) {
2133 out.hw = pc->result[k].rhw;
2134 emit_mov(pc, &out, &pc->result[k]);
2135 }
2136 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2137 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2138 }
2139 }
2140
2141 /* look for single half instructions and make them long */
2142 struct nv50_program_exec *e, *e_prev;
2143
2144 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2145 if (!is_long(e))
2146 k++;
2147
2148 if (!e->next || is_long(e->next)) {
2149 if (k & 1)
2150 convert_to_long(pc, e);
2151 k = 0;
2152 }
2153
2154 if (e->next)
2155 e_prev = e;
2156 }
2157
2158 if (!is_long(pc->p->exec_tail)) {
2159 /* this may occur if moving FP results */
2160 assert(e_prev && !is_long(e_prev));
2161 convert_to_long(pc, e_prev);
2162 convert_to_long(pc, pc->p->exec_tail);
2163 }
2164
2165 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2166 pc->p->exec_tail->inst[1] |= 0x00000001;
2167
2168 p->param_nr = pc->param_nr * 4;
2169 p->immd_nr = pc->immd_nr * 4;
2170 p->immd = pc->immd_buf;
2171
2172 out_err:
2173 tgsi_parse_free(&parse);
2174
2175 out_cleanup:
2176 free_nv50_pc(pc);
2177 return ret;
2178 }
2179
2180 static void
2181 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2182 {
2183 if (nv50_program_tx(p) == FALSE)
2184 assert(0);
2185 p->translated = TRUE;
2186 }
2187
2188 static void
2189 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2190 unsigned start, unsigned count, unsigned cbuf)
2191 {
2192 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2193 struct nouveau_grobj *tesla = nv50->screen->tesla;
2194
2195 while (count) {
2196 unsigned nr = count > 2047 ? 2047 : count;
2197
2198 BEGIN_RING(chan, tesla, 0x00000f00, 1);
2199 OUT_RING (chan, (cbuf << 0) | (start << 8));
2200 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2201 OUT_RINGp (chan, map, nr);
2202
2203 map += nr;
2204 start += nr;
2205 count -= nr;
2206 }
2207 }
2208
2209 static void
2210 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2211 {
2212 struct nouveau_winsys *nvws = nv50->screen->nvws;
2213 struct pipe_winsys *ws = nv50->pipe.winsys;
2214
2215 if (!p->data[0] && p->immd_nr) {
2216 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2217
2218 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {
2219 while (heap->next && heap->size < p->immd_nr) {
2220 struct nv50_program *evict = heap->next->priv;
2221 nvws->res_free(&evict->data[0]);
2222 }
2223
2224 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))
2225 assert(0);
2226 }
2227
2228 /* immediates only need to be uploaded again when freed */
2229 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2230 p->immd_nr, NV50_CB_PMISC);
2231 }
2232
2233 if (!p->data[1] && p->param_nr) {
2234 struct nouveau_resource *heap =
2235 nv50->screen->parm_heap[p->type];
2236
2237 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {
2238 while (heap->next && heap->size < p->param_nr) {
2239 struct nv50_program *evict = heap->next->priv;
2240 nvws->res_free(&evict->data[1]);
2241 }
2242
2243 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))
2244 assert(0);
2245 }
2246 }
2247
2248 if (p->param_nr) {
2249 unsigned cbuf = NV50_CB_PVP;
2250 float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
2251 PIPE_BUFFER_USAGE_CPU_READ);
2252 if (p->type == PIPE_SHADER_FRAGMENT)
2253 cbuf = NV50_CB_PFP;
2254 nv50_program_upload_data(nv50, map, p->data[1]->start,
2255 p->param_nr, cbuf);
2256 ws->buffer_unmap(ws, nv50->constbuf[p->type]);
2257 }
2258 }
2259
2260 static void
2261 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2262 {
2263 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2264 struct nouveau_grobj *tesla = nv50->screen->tesla;
2265 struct pipe_screen *screen = nv50->pipe.screen;
2266 struct nv50_program_exec *e;
2267 struct nouveau_stateobj *so;
2268 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2269 unsigned start, count, *up, *ptr;
2270 boolean upload = FALSE;
2271
2272 if (!p->buffer) {
2273 p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
2274 upload = TRUE;
2275 }
2276
2277 if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2278 (p->data[1] && p->data[1]->start != p->data_start[1])) {
2279 for (e = p->exec_head; e; e = e->next) {
2280 unsigned ei, ci, bs;
2281
2282 if (e->param.index < 0)
2283 continue;
2284 bs = (e->inst[1] >> 22) & 0x07;
2285 assert(bs < 2);
2286 ei = e->param.shift >> 5;
2287 ci = e->param.index + p->data[bs]->start;
2288
2289 e->inst[ei] &= ~e->param.mask;
2290 e->inst[ei] |= (ci << e->param.shift);
2291 }
2292
2293 if (p->data[0])
2294 p->data_start[0] = p->data[0]->start;
2295 if (p->data[1])
2296 p->data_start[1] = p->data[1]->start;
2297
2298 upload = TRUE;
2299 }
2300
2301 if (!upload)
2302 return;
2303
2304 #ifdef NV50_PROGRAM_DUMP
2305 NOUVEAU_ERR("-------\n");
2306 for (e = p->exec_head; e; e = e->next) {
2307 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2308 if (is_long(e))
2309 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2310 }
2311 #endif
2312
2313 up = ptr = MALLOC(p->exec_size * 4);
2314 for (e = p->exec_head; e; e = e->next) {
2315 *(ptr++) = e->inst[0];
2316 if (is_long(e))
2317 *(ptr++) = e->inst[1];
2318 }
2319
2320 so = so_new(4,2);
2321 so_method(so, nv50->screen->tesla, 0x1280, 3);
2322 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2323 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2324 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2325
2326 start = 0; count = p->exec_size;
2327 while (count) {
2328 struct nouveau_winsys *nvws = nv50->screen->nvws;
2329 unsigned nr;
2330
2331 so_emit(nvws, so);
2332
2333 nr = MIN2(count, 2047);
2334 nr = MIN2(nvws->channel->pushbuf->remaining, nr);
2335 if (nvws->channel->pushbuf->remaining < (nr + 3)) {
2336 FIRE_RING(chan);
2337 continue;
2338 }
2339
2340 BEGIN_RING(chan, tesla, 0x0f00, 1);
2341 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2342 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2343 OUT_RINGp (chan, up + start, nr);
2344
2345 start += nr;
2346 count -= nr;
2347 }
2348
2349 FREE(up);
2350 so_ref(NULL, &so);
2351 }
2352
2353 void
2354 nv50_vertprog_validate(struct nv50_context *nv50)
2355 {
2356 struct nouveau_grobj *tesla = nv50->screen->tesla;
2357 struct nv50_program *p = nv50->vertprog;
2358 struct nouveau_stateobj *so;
2359
2360 if (!p->translated) {
2361 nv50_program_validate(nv50, p);
2362 if (!p->translated)
2363 assert(0);
2364 }
2365
2366 nv50_program_validate_data(nv50, p);
2367 nv50_program_validate_code(nv50, p);
2368
2369 so = so_new(13, 2);
2370 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2371 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2372 NOUVEAU_BO_HIGH, 0, 0);
2373 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2374 NOUVEAU_BO_LOW, 0, 0);
2375 so_method(so, tesla, 0x1650, 2);
2376 so_data (so, p->cfg.vp.attr[0]);
2377 so_data (so, p->cfg.vp.attr[1]);
2378 so_method(so, tesla, 0x16b8, 1);
2379 so_data (so, p->cfg.high_result);
2380 so_method(so, tesla, 0x16ac, 2);
2381 so_data (so, p->cfg.high_result); //8);
2382 so_data (so, p->cfg.high_temp);
2383 so_method(so, tesla, 0x140c, 1);
2384 so_data (so, 0); /* program start offset */
2385 so_ref(so, &nv50->state.vertprog);
2386 so_ref(NULL, &so);
2387 }
2388
2389 void
2390 nv50_fragprog_validate(struct nv50_context *nv50)
2391 {
2392 struct nouveau_grobj *tesla = nv50->screen->tesla;
2393 struct nv50_program *p = nv50->fragprog;
2394 struct nouveau_stateobj *so;
2395 unsigned i;
2396
2397 if (!p->translated) {
2398 nv50_program_validate(nv50, p);
2399 if (!p->translated)
2400 assert(0);
2401 }
2402
2403 nv50_program_validate_data(nv50, p);
2404 nv50_program_validate_code(nv50, p);
2405
2406 so = so_new(64, 2);
2407 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2408 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2409 NOUVEAU_BO_HIGH, 0, 0);
2410 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2411 NOUVEAU_BO_LOW, 0, 0);
2412 so_method(so, tesla, 0x1904, 4);
2413 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2414 so_data (so, 0x00000004);
2415 so_data (so, 0x00000000);
2416 so_data (so, 0x00000000);
2417 so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
2418 for (i = 0; i < p->cfg.fp.high_map; i++)
2419 so_data(so, p->cfg.fp.map[i]);
2420 so_method(so, tesla, 0x1988, 2);
2421 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2422 so_data (so, p->cfg.high_temp);
2423 so_method(so, tesla, 0x1298, 1);
2424 so_data (so, p->cfg.high_result);
2425 so_method(so, tesla, 0x19a8, 1);
2426 so_data (so, p->cfg.fp.regs[2]);
2427 so_method(so, tesla, 0x196c, 1);
2428 so_data (so, p->cfg.fp.regs[3]);
2429 so_method(so, tesla, 0x1414, 1);
2430 so_data (so, 0); /* program start offset */
2431 so_ref(so, &nv50->state.fragprog);
2432 so_ref(NULL, &so);
2433 }
2434
2435 void
2436 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2437 {
2438 struct pipe_screen *pscreen = nv50->pipe.screen;
2439
2440 while (p->exec_head) {
2441 struct nv50_program_exec *e = p->exec_head;
2442
2443 p->exec_head = e->next;
2444 FREE(e);
2445 }
2446 p->exec_tail = NULL;
2447 p->exec_size = 0;
2448
2449 if (p->buffer)
2450 pipe_buffer_reference(&p->buffer, NULL);
2451
2452 nv50->screen->nvws->res_free(&p->data[0]);
2453 nv50->screen->nvws->res_free(&p->data[1]);
2454
2455 p->translated = 0;
2456 }
2457