nv50: enable half insns for MOV and MUL
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
/* Upper bound on hardware temporaries; this is the size of nv50_pc::r_temp. */
#define NV50_SU_MAX_TEMP 64
/* Debug switch, left disabled.  NOTE(review): its users are not visible in
 * this part of the file - presumably it dumps the generated program. */
//#define NV50_PROGRAM_DUMP
36
37 /* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * - Fuck it off, introduce a way to negate args for ops that
41 * support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force-disable them where they
51 * don't work (MUL used to have them disabled to fix POW - re-verify POW).
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * FP attr/result assignment - how?
58 * attrib
59 * - 0x16bc maps vp output onto fp hpos
60 * - 0x16c0 maps vp output onto fp col0
61 * result
62 * - colr always 0-3
63 * - depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * - 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * - XX == FP high something
75 */
/* One scalar register operand as tracked by the code generator. */
struct nv50_reg {
	/* register file this operand belongs to */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* -1 for registers created internally (alloc_temp/alloc_immd) */
	int index;

	/* hardware slot; -1 while a P_TEMP has no slot assigned yet */
	int hw;
	/* NOTE(review): presumably a "negate operand" flag - confirm with users */
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
93 struct nv50_pc {
94 struct nv50_program *p;
95
96 /* hw resources */
97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99 /* tgsi resources */
100 struct nv50_reg *temp;
101 int temp_nr;
102 struct nv50_reg *attr;
103 int attr_nr;
104 struct nv50_reg *result;
105 int result_nr;
106 struct nv50_reg *param;
107 int param_nr;
108 struct nv50_reg *immd;
109 float *immd_buf;
110 int immd_nr;
111
112 struct nv50_reg *temp_temp[16];
113 unsigned temp_temp_nr;
114
115 unsigned interp_mode[32];
116 /* perspective interpolation registers */
117 struct nv50_reg *iv_p;
118 struct nv50_reg *iv_c;
119
120 /* current instruction and total number of insns */
121 unsigned insn_cur;
122 unsigned insn_nr;
123
124 boolean allow32;
125 };
126
127 static void
128 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129 {
130 int i = 0;
131
132 if (reg->type == P_RESULT) {
133 if (pc->p->cfg.high_result < (reg->hw + 1))
134 pc->p->cfg.high_result = reg->hw + 1;
135 }
136
137 if (reg->type != P_TEMP)
138 return;
139
140 if (reg->hw >= 0) {
141 /*XXX: do this here too to catch FP temp-as-attr usage..
142 * not clean, but works */
143 if (pc->p->cfg.high_temp < (reg->hw + 1))
144 pc->p->cfg.high_temp = reg->hw + 1;
145 return;
146 }
147
148 if (reg->rhw != -1) {
149 /* try to allocate temporary with index rhw first */
150 if (!(pc->r_temp[reg->rhw])) {
151 pc->r_temp[reg->rhw] = reg;
152 reg->hw = reg->rhw;
153 if (pc->p->cfg.high_temp < (reg->rhw + 1))
154 pc->p->cfg.high_temp = reg->rhw + 1;
155 return;
156 }
157 /* make sure we don't get things like $r0 needs to go
158 * in $r1 and $r1 in $r0
159 */
160 i = pc->result_nr * 4;
161 }
162
163 for (; i < NV50_SU_MAX_TEMP; i++) {
164 if (!(pc->r_temp[i])) {
165 pc->r_temp[i] = reg;
166 reg->hw = i;
167 if (pc->p->cfg.high_temp < (i + 1))
168 pc->p->cfg.high_temp = i + 1;
169 return;
170 }
171 }
172
173 assert(0);
174 }
175
176 static struct nv50_reg *
177 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178 {
179 struct nv50_reg *r;
180 int i;
181
182 if (dst && dst->type == P_TEMP && dst->hw == -1)
183 return dst;
184
185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186 if (!pc->r_temp[i]) {
187 r = CALLOC_STRUCT(nv50_reg);
188 r->type = P_TEMP;
189 r->index = -1;
190 r->hw = i;
191 r->rhw = -1;
192 pc->r_temp[i] = r;
193 return r;
194 }
195 }
196
197 assert(0);
198 return NULL;
199 }
200
201 /* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204 static void
205 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206 {
207 assert(src->index == -1 && src->hw != -1);
208
209 if (dst->hw != -1)
210 pc->r_temp[dst->hw] = NULL;
211 pc->r_temp[src->hw] = dst;
212 dst->hw = src->hw;
213
214 FREE(src);
215 }
216
217 static void
218 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
219 {
220 if (r->index == -1) {
221 unsigned hw = r->hw;
222
223 FREE(pc->r_temp[hw]);
224 pc->r_temp[hw] = NULL;
225 }
226 }
227
228 static int
229 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
230 {
231 int i;
232
233 if ((idx + 4) >= NV50_SU_MAX_TEMP)
234 return 1;
235
236 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
237 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
238 return alloc_temp4(pc, dst, idx + 1);
239
240 for (i = 0; i < 4; i++) {
241 dst[i] = CALLOC_STRUCT(nv50_reg);
242 dst[i]->type = P_TEMP;
243 dst[i]->index = -1;
244 dst[i]->hw = idx + i;
245 pc->r_temp[idx + i] = dst[i];
246 }
247
248 return 0;
249 }
250
/* Release the four registers in 'reg', in order, through free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **it = reg;
	struct nv50_reg **const end = reg + 4;

	while (it != end)
		free_temp(pc, *it++);
}
259
260 static struct nv50_reg *
261 temp_temp(struct nv50_pc *pc)
262 {
263 if (pc->temp_temp_nr >= 16)
264 assert(0);
265
266 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
267 return pc->temp_temp[pc->temp_temp_nr++];
268 }
269
270 static void
271 kill_temp_temp(struct nv50_pc *pc)
272 {
273 int i;
274
275 for (i = 0; i < pc->temp_temp_nr; i++)
276 free_temp(pc, pc->temp_temp[i]);
277 pc->temp_temp_nr = 0;
278 }
279
280 static int
281 ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
282 {
283 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
284 (pc->immd_nr + 1) * 4 * sizeof(float));
285 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
286 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
287 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
288 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
289
290 return pc->immd_nr++;
291 }
292
293 static struct nv50_reg *
294 alloc_immd(struct nv50_pc *pc, float f)
295 {
296 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
297 unsigned hw;
298
299 for (hw = 0; hw < pc->immd_nr * 4; hw++)
300 if (pc->immd_buf[hw] == f)
301 break;
302
303 if (hw == pc->immd_nr * 4)
304 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
305
306 r->type = P_IMMD;
307 r->hw = hw;
308 r->index = -1;
309 return r;
310 }
311
312 static struct nv50_program_exec *
313 exec(struct nv50_pc *pc)
314 {
315 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
316
317 e->param.index = -1;
318 return e;
319 }
320
321 static void
322 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
323 {
324 struct nv50_program *p = pc->p;
325
326 if (p->exec_tail)
327 p->exec_tail->next = e;
328 if (!p->exec_head)
329 p->exec_head = e;
330 p->exec_tail = e;
331 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
332 }
333
334 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
335
336 static boolean
337 is_long(struct nv50_program_exec *e)
338 {
339 if (e->inst[0] & 1)
340 return TRUE;
341 return FALSE;
342 }
343
344 static boolean
345 is_immd(struct nv50_program_exec *e)
346 {
347 if (is_long(e) && (e->inst[1] & 3) == 3)
348 return TRUE;
349 return FALSE;
350 }
351
352 static INLINE void
353 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
354 struct nv50_program_exec *e)
355 {
356 set_long(pc, e);
357 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
358 e->inst[1] |= (pred << 7) | (idx << 12);
359 }
360
361 static INLINE void
362 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
363 struct nv50_program_exec *e)
364 {
365 set_long(pc, e);
366 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
367 e->inst[1] |= (idx << 4) | (on << 6);
368 }
369
370 static INLINE void
371 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
372 {
373 if (is_long(e))
374 return;
375
376 e->inst[0] |= 1;
377 set_pred(pc, 0xf, 0, e);
378 set_pred_wr(pc, 0, 0, e);
379 }
380
381 static INLINE void
382 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
383 {
384 if (dst->type == P_RESULT) {
385 set_long(pc, e);
386 e->inst[1] |= 0x00000008;
387 }
388
389 alloc_reg(pc, dst);
390 e->inst[0] |= (dst->hw << 2);
391 }
392
393 static INLINE void
394 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
395 {
396 unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
397
398 set_long(pc, e);
399 /*XXX: can't be predicated - bits overlap.. catch cases where both
400 * are required and avoid them. */
401 set_pred(pc, 0, 0, e);
402 set_pred_wr(pc, 0, 0, e);
403
404 e->inst[1] |= 0x00000002 | 0x00000001;
405 e->inst[0] |= (val & 0x3f) << 16;
406 e->inst[1] |= (val >> 6) << 2;
407 }
408
409
410 #define INTERP_LINEAR 0
411 #define INTERP_FLAT 1
412 #define INTERP_PERSPECTIVE 2
413 #define INTERP_CENTROID 4
414
415 /* interpolant index has been stored in dst->rhw */
416 static void
417 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
418 unsigned mode)
419 {
420 assert(dst->rhw != -1);
421 struct nv50_program_exec *e = exec(pc);
422
423 e->inst[0] |= 0x80000000;
424 set_dst(pc, dst, e);
425 e->inst[0] |= (dst->rhw << 16);
426
427 if (mode & INTERP_FLAT) {
428 e->inst[0] |= (1 << 8);
429 } else {
430 if (mode & INTERP_PERSPECTIVE) {
431 e->inst[0] |= (1 << 25);
432 alloc_reg(pc, iv);
433 e->inst[0] |= (iv->hw << 9);
434 }
435
436 if (mode & INTERP_CENTROID)
437 e->inst[0] |= (1 << 24);
438 }
439
440 emit(pc, e);
441 }
442
443 static void
444 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
445 struct nv50_program_exec *e)
446 {
447 set_long(pc, e);
448 #if 1
449 e->inst[1] |= (1 << 22);
450 #else
451 if (src->type == P_IMMD) {
452 e->inst[1] |= (NV50_CB_PMISC << 22);
453 } else {
454 if (pc->p->type == PIPE_SHADER_VERTEX)
455 e->inst[1] |= (NV50_CB_PVP << 22);
456 else
457 e->inst[1] |= (NV50_CB_PFP << 22);
458 }
459 #endif
460
461 e->param.index = src->hw;
462 e->param.shift = s;
463 e->param.mask = m << (s % 32);
464 }
465
466 static void
467 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
468 {
469 struct nv50_program_exec *e = exec(pc);
470
471 e->inst[0] |= 0x10000000;
472
473 set_dst(pc, dst, e);
474
475 if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
476 set_immd(pc, src, e);
477 /*XXX: 32-bit, but steals part of "half" reg space - need to
478 * catch and handle this case if/when we do half-regs
479 */
480 e->inst[0] |= 0x00008000;
481 } else
482 if (src->type == P_IMMD || src->type == P_CONST) {
483 set_long(pc, e);
484 set_data(pc, src, 0x7f, 9, e);
485 e->inst[1] |= 0x20000000; /* src0 const? */
486 } else {
487 if (src->type == P_ATTR) {
488 set_long(pc, e);
489 e->inst[1] |= 0x00200000;
490 }
491
492 alloc_reg(pc, src);
493 e->inst[0] |= (src->hw << 9);
494 }
495
496 if (is_long(e) && !is_immd(e)) {
497 e->inst[1] |= 0x04000000; /* 32-bit */
498 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
499 if (!(e->inst[1] & 0x20000000))
500 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
501 } else
502 e->inst[0] |= 0x00008000;
503
504 emit(pc, e);
505 }
506
507 static INLINE void
508 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
509 {
510 struct nv50_reg *imm = alloc_immd(pc, f);
511 emit_mov(pc, dst, imm);
512 FREE(imm);
513 }
514
515 static boolean
516 check_swap_src_0_1(struct nv50_pc *pc,
517 struct nv50_reg **s0, struct nv50_reg **s1)
518 {
519 struct nv50_reg *src0 = *s0, *src1 = *s1;
520
521 if (src0->type == P_CONST) {
522 if (src1->type != P_CONST) {
523 *s0 = src1;
524 *s1 = src0;
525 return TRUE;
526 }
527 } else
528 if (src1->type == P_ATTR) {
529 if (src0->type != P_ATTR) {
530 *s0 = src1;
531 *s1 = src0;
532 return TRUE;
533 }
534 }
535
536 return FALSE;
537 }
538
539 static void
540 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
541 {
542 if (src->type == P_ATTR) {
543 set_long(pc, e);
544 e->inst[1] |= 0x00200000;
545 } else
546 if (src->type == P_CONST || src->type == P_IMMD) {
547 struct nv50_reg *temp = temp_temp(pc);
548
549 emit_mov(pc, temp, src);
550 src = temp;
551 }
552
553 alloc_reg(pc, src);
554 e->inst[0] |= (src->hw << 9);
555 }
556
557 static void
558 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
559 {
560 if (src->type == P_ATTR) {
561 struct nv50_reg *temp = temp_temp(pc);
562
563 emit_mov(pc, temp, src);
564 src = temp;
565 } else
566 if (src->type == P_CONST || src->type == P_IMMD) {
567 assert(!(e->inst[0] & 0x00800000));
568 if (e->inst[0] & 0x01000000) {
569 struct nv50_reg *temp = temp_temp(pc);
570
571 emit_mov(pc, temp, src);
572 src = temp;
573 } else {
574 set_data(pc, src, 0x7f, 16, e);
575 e->inst[0] |= 0x00800000;
576 }
577 }
578
579 alloc_reg(pc, src);
580 e->inst[0] |= (src->hw << 16);
581 }
582
583 static void
584 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
585 {
586 set_long(pc, e);
587
588 if (src->type == P_ATTR) {
589 struct nv50_reg *temp = temp_temp(pc);
590
591 emit_mov(pc, temp, src);
592 src = temp;
593 } else
594 if (src->type == P_CONST || src->type == P_IMMD) {
595 assert(!(e->inst[0] & 0x01000000));
596 if (e->inst[0] & 0x00800000) {
597 struct nv50_reg *temp = temp_temp(pc);
598
599 emit_mov(pc, temp, src);
600 src = temp;
601 } else {
602 set_data(pc, src, 0x7f, 32+14, e);
603 e->inst[0] |= 0x01000000;
604 }
605 }
606
607 alloc_reg(pc, src);
608 e->inst[1] |= (src->hw << 14);
609 }
610
611 static void
612 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
613 struct nv50_reg *src1)
614 {
615 struct nv50_program_exec *e = exec(pc);
616
617 e->inst[0] |= 0xc0000000;
618
619 check_swap_src_0_1(pc, &src0, &src1);
620 set_dst(pc, dst, e);
621 set_src_0(pc, src0, e);
622 set_src_1(pc, src1, e);
623
624 emit(pc, e);
625 }
626
627 static void
628 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
629 struct nv50_reg *src0, struct nv50_reg *src1)
630 {
631 struct nv50_program_exec *e = exec(pc);
632
633 e->inst[0] |= 0xb0000000;
634
635 check_swap_src_0_1(pc, &src0, &src1);
636 set_dst(pc, dst, e);
637 set_src_0(pc, src0, e);
638 if (is_long(e))
639 set_src_2(pc, src1, e);
640 else
641 set_src_1(pc, src1, e);
642
643 emit(pc, e);
644 }
645
646 static void
647 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
648 struct nv50_reg *src0, struct nv50_reg *src1)
649 {
650 struct nv50_program_exec *e = exec(pc);
651
652 set_long(pc, e);
653 e->inst[0] |= 0xb0000000;
654 e->inst[1] |= (sub << 29);
655
656 check_swap_src_0_1(pc, &src0, &src1);
657 set_dst(pc, dst, e);
658 set_src_0(pc, src0, e);
659 set_src_1(pc, src1, e);
660
661 emit(pc, e);
662 }
663
664 static void
665 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
666 struct nv50_reg *src1)
667 {
668 struct nv50_program_exec *e = exec(pc);
669
670 e->inst[0] |= 0xb0000000;
671
672 set_long(pc, e);
673 if (check_swap_src_0_1(pc, &src0, &src1))
674 e->inst[1] |= 0x04000000;
675 else
676 e->inst[1] |= 0x08000000;
677
678 set_dst(pc, dst, e);
679 set_src_0(pc, src0, e);
680 set_src_2(pc, src1, e);
681
682 emit(pc, e);
683 }
684
685 static void
686 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
687 struct nv50_reg *src1, struct nv50_reg *src2)
688 {
689 struct nv50_program_exec *e = exec(pc);
690
691 e->inst[0] |= 0xe0000000;
692
693 check_swap_src_0_1(pc, &src0, &src1);
694 set_dst(pc, dst, e);
695 set_src_0(pc, src0, e);
696 set_src_1(pc, src1, e);
697 set_src_2(pc, src2, e);
698
699 emit(pc, e);
700 }
701
702 static void
703 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
704 struct nv50_reg *src1, struct nv50_reg *src2)
705 {
706 struct nv50_program_exec *e = exec(pc);
707
708 e->inst[0] |= 0xe0000000;
709 set_long(pc, e);
710 e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
711
712 check_swap_src_0_1(pc, &src0, &src1);
713 set_dst(pc, dst, e);
714 set_src_0(pc, src0, e);
715 set_src_1(pc, src1, e);
716 set_src_2(pc, src2, e);
717
718 emit(pc, e);
719 }
720
721 static void
722 emit_flop(struct nv50_pc *pc, unsigned sub,
723 struct nv50_reg *dst, struct nv50_reg *src)
724 {
725 struct nv50_program_exec *e = exec(pc);
726
727 e->inst[0] |= 0x90000000;
728 if (sub) {
729 set_long(pc, e);
730 e->inst[1] |= (sub << 29);
731 }
732
733 set_dst(pc, dst, e);
734 set_src_0(pc, src, e);
735
736 emit(pc, e);
737 }
738
739 static void
740 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
741 {
742 struct nv50_program_exec *e = exec(pc);
743
744 e->inst[0] |= 0xb0000000;
745
746 set_dst(pc, dst, e);
747 set_src_0(pc, src, e);
748 set_long(pc, e);
749 e->inst[1] |= (6 << 29) | 0x00004000;
750
751 emit(pc, e);
752 }
753
754 static void
755 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
756 {
757 struct nv50_program_exec *e = exec(pc);
758
759 e->inst[0] |= 0xb0000000;
760
761 set_dst(pc, dst, e);
762 set_src_0(pc, src, e);
763 set_long(pc, e);
764 e->inst[1] |= (6 << 29);
765
766 emit(pc, e);
767 }
768
769 static void
770 emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
771 struct nv50_reg *src0, struct nv50_reg *src1)
772 {
773 struct nv50_program_exec *e = exec(pc);
774 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
775 struct nv50_reg *rdst;
776
777 assert(c_op <= 7);
778 if (check_swap_src_0_1(pc, &src0, &src1))
779 c_op = inv_cop[c_op];
780
781 rdst = dst;
782 if (dst->type != P_TEMP)
783 dst = alloc_temp(pc, NULL);
784
785 /* set.u32 */
786 set_long(pc, e);
787 e->inst[0] |= 0xb0000000;
788 e->inst[1] |= (3 << 29);
789 e->inst[1] |= (c_op << 14);
790 /*XXX: breaks things, .u32 by default?
791 * decuda will disasm as .u16 and use .lo/.hi regs, but this
792 * doesn't seem to match what the hw actually does.
793 inst[1] |= 0x04000000; << breaks things.. .u32 by default?
794 */
795 set_dst(pc, dst, e);
796 set_src_0(pc, src0, e);
797 set_src_1(pc, src1, e);
798 emit(pc, e);
799
800 /* cvt.f32.u32 */
801 e = exec(pc);
802 e->inst[0] = 0xa0000001;
803 e->inst[1] = 0x64014780;
804 set_dst(pc, rdst, e);
805 set_src_0(pc, dst, e);
806 emit(pc, e);
807
808 if (dst != rdst)
809 free_temp(pc, dst);
810 }
811
812 static void
813 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
814 {
815 struct nv50_program_exec *e = exec(pc);
816
817 e->inst[0] = 0xa0000000; /* cvt */
818 set_long(pc, e);
819 e->inst[1] |= (6 << 29); /* cvt */
820 e->inst[1] |= 0x08000000; /* integer mode */
821 e->inst[1] |= 0x04000000; /* 32 bit */
822 e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
823 e->inst[1] |= (1 << 14); /* src .f32 */
824 set_dst(pc, dst, e);
825 set_src_0(pc, src, e);
826
827 emit(pc, e);
828 }
829
830 static void
831 emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
832 struct nv50_reg *v, struct nv50_reg *e)
833 {
834 struct nv50_reg *temp = alloc_temp(pc, NULL);
835
836 emit_flop(pc, 3, temp, v);
837 emit_mul(pc, temp, temp, e);
838 emit_preex2(pc, temp, temp);
839 emit_flop(pc, 6, dst, temp);
840
841 free_temp(pc, temp);
842 }
843
844 static void
845 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
846 {
847 struct nv50_program_exec *e = exec(pc);
848
849 e->inst[0] = 0xa0000000; /* cvt */
850 set_long(pc, e);
851 e->inst[1] |= (6 << 29); /* cvt */
852 e->inst[1] |= 0x04000000; /* 32 bit */
853 e->inst[1] |= (1 << 14); /* src .f32 */
854 e->inst[1] |= ((1 << 6) << 14); /* .abs */
855 set_dst(pc, dst, e);
856 set_src_0(pc, src, e);
857
858 emit(pc, e);
859 }
860
861 static void
862 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
863 struct nv50_reg **src)
864 {
865 struct nv50_reg *one = alloc_immd(pc, 1.0);
866 struct nv50_reg *zero = alloc_immd(pc, 0.0);
867 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
868 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
869 struct nv50_reg *tmp[4];
870
871 if (mask & (3 << 1)) {
872 tmp[0] = alloc_temp(pc, NULL);
873 emit_minmax(pc, 4, tmp[0], src[0], zero);
874 }
875
876 if (mask & (1 << 2)) {
877 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
878
879 tmp[1] = temp_temp(pc);
880 emit_minmax(pc, 4, tmp[1], src[1], zero);
881
882 tmp[3] = temp_temp(pc);
883 emit_minmax(pc, 4, tmp[3], src[3], neg128);
884 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
885
886 emit_pow(pc, dst[2], tmp[1], tmp[3]);
887 emit_mov(pc, dst[2], zero);
888 set_pred(pc, 3, 0, pc->p->exec_tail);
889 }
890
891 if (mask & (1 << 1))
892 assimilate_temp(pc, dst[1], tmp[0]);
893 else
894 if (mask & (1 << 2))
895 free_temp(pc, tmp[0]);
896
897 /* do this last, in case src[i,j] == dst[0,3] */
898 if (mask & (1 << 0))
899 emit_mov(pc, dst[0], one);
900
901 if (mask & (1 << 3))
902 emit_mov(pc, dst[3], one);
903
904 FREE(pos128);
905 FREE(neg128);
906 FREE(zero);
907 FREE(one);
908 }
909
910 static void
911 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
912 {
913 struct nv50_program_exec *e = exec(pc);
914
915 set_long(pc, e);
916 e->inst[0] |= 0xa0000000; /* delta */
917 e->inst[1] |= (7 << 29); /* delta */
918 e->inst[1] |= 0x04000000; /* negate arg0? probably not */
919 e->inst[1] |= (1 << 14); /* src .f32 */
920 set_dst(pc, dst, e);
921 set_src_0(pc, src, e);
922
923 emit(pc, e);
924 }
925
926 static void
927 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
928 {
929 struct nv50_program_exec *e;
930 const int r_pred = 1;
931
932 /* Sets predicate reg ? */
933 e = exec(pc);
934 e->inst[0] = 0xa00001fd;
935 e->inst[1] = 0xc4014788;
936 set_src_0(pc, src, e);
937 set_pred_wr(pc, 1, r_pred, e);
938 emit(pc, e);
939
940 /* This is probably KILP */
941 e = exec(pc);
942 e->inst[0] = 0x000001fe;
943 set_long(pc, e);
944 set_pred(pc, 1 /* LT? */, r_pred, e);
945 emit(pc, e);
946 }
947
948 static void
949 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
950 {
951 unsigned q = 0, m = ~0;
952
953 assert(!is_long(e));
954
955 switch (e->inst[0] >> 28) {
956 case 0x1:
957 /* MOV */
958 q = 0x0403c000;
959 m = 0xffff7fff;
960 break;
961 case 0x8:
962 /* INTERP */
963 m = ~0x02000000;
964 if (e->inst[0] & 0x02000000)
965 q = 0x00020000;
966 break;
967 case 0x9:
968 /* RCP */
969 break;
970 case 0xB:
971 /* ADD */
972 m = ~(127 << 16);
973 q = ((e->inst[0] & (~m)) >> 2);
974 break;
975 case 0xC:
976 /* MUL */
977 m = ~0x00008000;
978 q = ((e->inst[0] & (~m)) << 12);
979 break;
980 case 0xE:
981 /* MAD (if src2 == dst) */
982 q = ((e->inst[0] & 0x1fc) << 12);
983 break;
984 default:
985 assert(0);
986 break;
987 }
988
989 set_long(pc, e);
990 pc->p->exec_size++;
991
992 e->inst[0] &= m;
993 e->inst[1] |= q;
994 }
995
996 static struct nv50_reg *
997 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
998 {
999 switch (dst->DstRegister.File) {
1000 case TGSI_FILE_TEMPORARY:
1001 return &pc->temp[dst->DstRegister.Index * 4 + c];
1002 case TGSI_FILE_OUTPUT:
1003 return &pc->result[dst->DstRegister.Index * 4 + c];
1004 case TGSI_FILE_NULL:
1005 return NULL;
1006 default:
1007 break;
1008 }
1009
1010 return NULL;
1011 }
1012
1013 static struct nv50_reg *
1014 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
1015 {
1016 struct nv50_reg *r = NULL;
1017 struct nv50_reg *temp;
1018 unsigned sgn, c;
1019
1020 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1021
1022 c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1023 switch (c) {
1024 case TGSI_EXTSWIZZLE_X:
1025 case TGSI_EXTSWIZZLE_Y:
1026 case TGSI_EXTSWIZZLE_Z:
1027 case TGSI_EXTSWIZZLE_W:
1028 switch (src->SrcRegister.File) {
1029 case TGSI_FILE_INPUT:
1030 r = &pc->attr[src->SrcRegister.Index * 4 + c];
1031 break;
1032 case TGSI_FILE_TEMPORARY:
1033 r = &pc->temp[src->SrcRegister.Index * 4 + c];
1034 break;
1035 case TGSI_FILE_CONSTANT:
1036 r = &pc->param[src->SrcRegister.Index * 4 + c];
1037 break;
1038 case TGSI_FILE_IMMEDIATE:
1039 r = &pc->immd[src->SrcRegister.Index * 4 + c];
1040 break;
1041 case TGSI_FILE_SAMPLER:
1042 break;
1043 default:
1044 assert(0);
1045 break;
1046 }
1047 break;
1048 case TGSI_EXTSWIZZLE_ZERO:
1049 r = alloc_immd(pc, 0.0);
1050 return r;
1051 case TGSI_EXTSWIZZLE_ONE:
1052 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1053 return alloc_immd(pc, -1.0);
1054 return alloc_immd(pc, 1.0);
1055 default:
1056 assert(0);
1057 break;
1058 }
1059
1060 switch (sgn) {
1061 case TGSI_UTIL_SIGN_KEEP:
1062 break;
1063 case TGSI_UTIL_SIGN_CLEAR:
1064 temp = temp_temp(pc);
1065 emit_abs(pc, temp, r);
1066 r = temp;
1067 break;
1068 case TGSI_UTIL_SIGN_TOGGLE:
1069 temp = temp_temp(pc);
1070 emit_neg(pc, temp, r);
1071 r = temp;
1072 break;
1073 case TGSI_UTIL_SIGN_SET:
1074 temp = temp_temp(pc);
1075 emit_abs(pc, temp, r);
1076 emit_neg(pc, temp, temp);
1077 r = temp;
1078 break;
1079 default:
1080 assert(0);
1081 break;
1082 }
1083
1084 return r;
1085 }
1086
1087 /* returns TRUE if instruction can overwrite sources before they're read */
1088 static boolean
1089 direct2dest_op(const struct tgsi_full_instruction *insn)
1090 {
1091 if (insn->Instruction.Saturate)
1092 return FALSE;
1093
1094 switch (insn->Instruction.Opcode) {
1095 case TGSI_OPCODE_COS:
1096 case TGSI_OPCODE_DP3:
1097 case TGSI_OPCODE_DP4:
1098 case TGSI_OPCODE_DPH:
1099 case TGSI_OPCODE_KIL:
1100 case TGSI_OPCODE_LIT:
1101 case TGSI_OPCODE_POW:
1102 case TGSI_OPCODE_RCP:
1103 case TGSI_OPCODE_RSQ:
1104 case TGSI_OPCODE_SCS:
1105 case TGSI_OPCODE_SIN:
1106 case TGSI_OPCODE_TEX:
1107 case TGSI_OPCODE_TXP:
1108 return FALSE;
1109 default:
1110 return TRUE;
1111 }
1112 }
1113
1114 static boolean
1115 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1116 {
1117 const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1118 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1119 unsigned mask, sat, unit;
1120 boolean assimilate = FALSE;
1121 int i, c;
1122
1123 mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1124 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1125
1126 for (c = 0; c < 4; c++) {
1127 if (mask & (1 << c))
1128 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1129 else
1130 dst[c] = NULL;
1131 rdst[c] = NULL;
1132 src[0][c] = NULL;
1133 src[1][c] = NULL;
1134 src[2][c] = NULL;
1135 }
1136
1137 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1138 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1139
1140 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1141 unit = fs->SrcRegister.Index;
1142
1143 for (c = 0; c < 4; c++)
1144 src[i][c] = tgsi_src(pc, c, fs);
1145 }
1146
1147 if (sat) {
1148 for (c = 0; c < 4; c++) {
1149 rdst[c] = dst[c];
1150 dst[c] = temp_temp(pc);
1151 }
1152 } else
1153 if (direct2dest_op(inst)) {
1154 for (c = 0; c < 4; c++) {
1155 if (!dst[c] || dst[c]->type != P_TEMP)
1156 continue;
1157
1158 for (i = c + 1; i < 4; i++) {
1159 if (dst[c] == src[0][i] ||
1160 dst[c] == src[1][i] ||
1161 dst[c] == src[2][i])
1162 break;
1163 }
1164 if (i == 4)
1165 continue;
1166
1167 assimilate = TRUE;
1168 rdst[c] = dst[c];
1169 dst[c] = alloc_temp(pc, NULL);
1170 }
1171 }
1172
1173 switch (inst->Instruction.Opcode) {
1174 case TGSI_OPCODE_ABS:
1175 for (c = 0; c < 4; c++) {
1176 if (!(mask & (1 << c)))
1177 continue;
1178 emit_abs(pc, dst[c], src[0][c]);
1179 }
1180 break;
1181 case TGSI_OPCODE_ADD:
1182 for (c = 0; c < 4; c++) {
1183 if (!(mask & (1 << c)))
1184 continue;
1185 emit_add(pc, dst[c], src[0][c], src[1][c]);
1186 }
1187 break;
1188 case TGSI_OPCODE_COS:
1189 temp = temp_temp(pc);
1190 emit_precossin(pc, temp, src[0][0]);
1191 emit_flop(pc, 5, temp, temp);
1192 for (c = 0; c < 4; c++) {
1193 if (!(mask & (1 << c)))
1194 continue;
1195 emit_mov(pc, dst[c], temp);
1196 }
1197 break;
1198 case TGSI_OPCODE_DP3:
1199 temp = temp_temp(pc);
1200 emit_mul(pc, temp, src[0][0], src[1][0]);
1201 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1202 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1203 for (c = 0; c < 4; c++) {
1204 if (!(mask & (1 << c)))
1205 continue;
1206 emit_mov(pc, dst[c], temp);
1207 }
1208 break;
1209 case TGSI_OPCODE_DP4:
1210 temp = temp_temp(pc);
1211 emit_mul(pc, temp, src[0][0], src[1][0]);
1212 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1213 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1214 emit_mad(pc, temp, src[0][3], src[1][3], temp);
1215 for (c = 0; c < 4; c++) {
1216 if (!(mask & (1 << c)))
1217 continue;
1218 emit_mov(pc, dst[c], temp);
1219 }
1220 break;
1221 case TGSI_OPCODE_DPH:
1222 temp = temp_temp(pc);
1223 emit_mul(pc, temp, src[0][0], src[1][0]);
1224 emit_mad(pc, temp, src[0][1], src[1][1], temp);
1225 emit_mad(pc, temp, src[0][2], src[1][2], temp);
1226 emit_add(pc, temp, src[1][3], temp);
1227 for (c = 0; c < 4; c++) {
1228 if (!(mask & (1 << c)))
1229 continue;
1230 emit_mov(pc, dst[c], temp);
1231 }
1232 break;
1233 case TGSI_OPCODE_DST:
1234 {
1235 struct nv50_reg *one = alloc_immd(pc, 1.0);
1236 if (mask & (1 << 0))
1237 emit_mov(pc, dst[0], one);
1238 if (mask & (1 << 1))
1239 emit_mul(pc, dst[1], src[0][1], src[1][1]);
1240 if (mask & (1 << 2))
1241 emit_mov(pc, dst[2], src[0][2]);
1242 if (mask & (1 << 3))
1243 emit_mov(pc, dst[3], src[1][3]);
1244 FREE(one);
1245 }
1246 break;
1247 case TGSI_OPCODE_EX2:
1248 temp = temp_temp(pc);
1249 emit_preex2(pc, temp, src[0][0]);
1250 emit_flop(pc, 6, temp, temp);
1251 for (c = 0; c < 4; c++) {
1252 if (!(mask & (1 << c)))
1253 continue;
1254 emit_mov(pc, dst[c], temp);
1255 }
1256 break;
1257 case TGSI_OPCODE_FLR:
1258 for (c = 0; c < 4; c++) {
1259 if (!(mask & (1 << c)))
1260 continue;
1261 emit_flr(pc, dst[c], src[0][c]);
1262 }
1263 break;
1264 case TGSI_OPCODE_FRC:
1265 temp = temp_temp(pc);
1266 for (c = 0; c < 4; c++) {
1267 if (!(mask & (1 << c)))
1268 continue;
1269 emit_flr(pc, temp, src[0][c]);
1270 emit_sub(pc, dst[c], src[0][c], temp);
1271 }
1272 break;
1273 case TGSI_OPCODE_KIL:
1274 emit_kil(pc, src[0][0]);
1275 emit_kil(pc, src[0][1]);
1276 emit_kil(pc, src[0][2]);
1277 emit_kil(pc, src[0][3]);
1278 pc->p->cfg.fp.regs[2] |= 0x00100000;
1279 break;
1280 case TGSI_OPCODE_LIT:
1281 emit_lit(pc, &dst[0], mask, &src[0][0]);
1282 break;
1283 case TGSI_OPCODE_LG2:
1284 temp = temp_temp(pc);
1285 emit_flop(pc, 3, temp, src[0][0]);
1286 for (c = 0; c < 4; c++) {
1287 if (!(mask & (1 << c)))
1288 continue;
1289 emit_mov(pc, dst[c], temp);
1290 }
1291 break;
1292 case TGSI_OPCODE_LRP:
1293 temp = temp_temp(pc);
1294 for (c = 0; c < 4; c++) {
1295 if (!(mask & (1 << c)))
1296 continue;
1297 emit_sub(pc, temp, src[1][c], src[2][c]);
1298 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1299 }
1300 break;
1301 case TGSI_OPCODE_MAD:
1302 for (c = 0; c < 4; c++) {
1303 if (!(mask & (1 << c)))
1304 continue;
1305 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1306 }
1307 break;
1308 case TGSI_OPCODE_MAX:
1309 for (c = 0; c < 4; c++) {
1310 if (!(mask & (1 << c)))
1311 continue;
1312 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1313 }
1314 break;
1315 case TGSI_OPCODE_MIN:
1316 for (c = 0; c < 4; c++) {
1317 if (!(mask & (1 << c)))
1318 continue;
1319 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1320 }
1321 break;
1322 case TGSI_OPCODE_MOV:
1323 for (c = 0; c < 4; c++) {
1324 if (!(mask & (1 << c)))
1325 continue;
1326 emit_mov(pc, dst[c], src[0][c]);
1327 }
1328 break;
1329 case TGSI_OPCODE_MUL:
1330 for (c = 0; c < 4; c++) {
1331 if (!(mask & (1 << c)))
1332 continue;
1333 emit_mul(pc, dst[c], src[0][c], src[1][c]);
1334 }
1335 break;
1336 case TGSI_OPCODE_POW:
1337 temp = temp_temp(pc);
1338 emit_pow(pc, temp, src[0][0], src[1][0]);
1339 for (c = 0; c < 4; c++) {
1340 if (!(mask & (1 << c)))
1341 continue;
1342 emit_mov(pc, dst[c], temp);
1343 }
1344 break;
1345 case TGSI_OPCODE_RCP:
1346 for (c = 3; c >= 0; c--) {
1347 if (!(mask & (1 << c)))
1348 continue;
1349 emit_flop(pc, 0, dst[c], src[0][0]);
1350 }
1351 break;
1352 case TGSI_OPCODE_RSQ:
1353 for (c = 3; c >= 0; c--) {
1354 if (!(mask & (1 << c)))
1355 continue;
1356 emit_flop(pc, 2, dst[c], src[0][0]);
1357 }
1358 break;
1359 case TGSI_OPCODE_SCS:
1360 temp = temp_temp(pc);
1361 emit_precossin(pc, temp, src[0][0]);
1362 if (mask & (1 << 0))
1363 emit_flop(pc, 5, dst[0], temp);
1364 if (mask & (1 << 1))
1365 emit_flop(pc, 4, dst[1], temp);
1366 if (mask & (1 << 2))
1367 emit_mov_immdval(pc, dst[2], 0.0);
1368 if (mask & (1 << 3))
1369 emit_mov_immdval(pc, dst[3], 1.0);
1370 break;
1371 case TGSI_OPCODE_SGE:
1372 for (c = 0; c < 4; c++) {
1373 if (!(mask & (1 << c)))
1374 continue;
1375 emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1376 }
1377 break;
1378 case TGSI_OPCODE_SIN:
1379 temp = temp_temp(pc);
1380 emit_precossin(pc, temp, src[0][0]);
1381 emit_flop(pc, 4, temp, temp);
1382 for (c = 0; c < 4; c++) {
1383 if (!(mask & (1 << c)))
1384 continue;
1385 emit_mov(pc, dst[c], temp);
1386 }
1387 break;
1388 case TGSI_OPCODE_SLT:
1389 for (c = 0; c < 4; c++) {
1390 if (!(mask & (1 << c)))
1391 continue;
1392 emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1393 }
1394 break;
1395 case TGSI_OPCODE_SUB:
1396 for (c = 0; c < 4; c++) {
1397 if (!(mask & (1 << c)))
1398 continue;
1399 emit_sub(pc, dst[c], src[0][c], src[1][c]);
1400 }
1401 break;
1402 case TGSI_OPCODE_TEX:
1403 case TGSI_OPCODE_TXP:
1404 {
1405 struct nv50_reg *t[4];
1406 struct nv50_program_exec *e;
1407
1408 alloc_temp4(pc, t, 0);
1409 emit_mov(pc, t[0], src[0][0]);
1410 emit_mov(pc, t[1], src[0][1]);
1411
1412 e = exec(pc);
1413 e->inst[0] = 0xf6400000;
1414 e->inst[0] |= (unit << 9);
1415 set_long(pc, e);
1416 e->inst[1] |= 0x0000c004;
1417 set_dst(pc, t[0], e);
1418 emit(pc, e);
1419
1420 if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
1421 if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
1422 if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
1423 if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
1424
1425 free_temp4(pc, t);
1426 }
1427 break;
1428 case TGSI_OPCODE_XPD:
1429 temp = temp_temp(pc);
1430 if (mask & (1 << 0)) {
1431 emit_mul(pc, temp, src[0][2], src[1][1]);
1432 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1433 }
1434 if (mask & (1 << 1)) {
1435 emit_mul(pc, temp, src[0][0], src[1][2]);
1436 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1437 }
1438 if (mask & (1 << 2)) {
1439 emit_mul(pc, temp, src[0][1], src[1][0]);
1440 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1441 }
1442 if (mask & (1 << 3))
1443 emit_mov_immdval(pc, dst[3], 1.0);
1444 break;
1445 case TGSI_OPCODE_END:
1446 break;
1447 default:
1448 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1449 return FALSE;
1450 }
1451
1452 if (sat) {
1453 for (c = 0; c < 4; c++) {
1454 struct nv50_program_exec *e;
1455
1456 if (!(mask & (1 << c)))
1457 continue;
1458 e = exec(pc);
1459
1460 e->inst[0] = 0xa0000000; /* cvt */
1461 set_long(pc, e);
1462 e->inst[1] |= (6 << 29); /* cvt */
1463 e->inst[1] |= 0x04000000; /* 32 bit */
1464 e->inst[1] |= (1 << 14); /* src .f32 */
1465 e->inst[1] |= ((1 << 5) << 14); /* .sat */
1466 set_dst(pc, rdst[c], e);
1467 set_src_0(pc, dst[c], e);
1468 emit(pc, e);
1469 }
1470 } else if (assimilate) {
1471 for (c = 0; c < 4; c++)
1472 if (rdst[c])
1473 assimilate_temp(pc, rdst[c], dst[c]);
1474 }
1475
1476 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1477 for (c = 0; c < 4; c++) {
1478 if (!src[i][c])
1479 continue;
1480 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1481 FREE(src[i][c]);
1482 }
1483 }
1484
1485 kill_temp_temp(pc);
1486 return TRUE;
1487 }
1488
1489 /* Adjust a bitmask that indicates what components of a source are used,
1490 * we use this in tx_prep so we only load interpolants that are needed.
1491 */
1492 static void
1493 insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1494 {
1495 const struct tgsi_instruction_ext_texture *tex;
1496
1497 switch (insn->Instruction.Opcode) {
1498 case TGSI_OPCODE_DP3:
1499 *mask = 0x7;
1500 break;
1501 case TGSI_OPCODE_DP4:
1502 case TGSI_OPCODE_DPH:
1503 *mask = 0xF;
1504 break;
1505 case TGSI_OPCODE_LIT:
1506 *mask = 0xB;
1507 break;
1508 case TGSI_OPCODE_RCP:
1509 case TGSI_OPCODE_RSQ:
1510 *mask = 0x1;
1511 break;
1512 case TGSI_OPCODE_TEX:
1513 case TGSI_OPCODE_TXP:
1514 assert(insn->Instruction.Extended);
1515 tex = &insn->InstructionExtTexture;
1516
1517 *mask = 0x7;
1518 if (tex->Texture == TGSI_TEXTURE_1D)
1519 *mask = 0x1;
1520 else
1521 if (tex->Texture == TGSI_TEXTURE_2D)
1522 *mask = 0x3;
1523
1524 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1525 *mask |= 0x8;
1526 break;
1527 default:
1528 break;
1529 }
1530 }
1531
1532 static void
1533 prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1534 unsigned *r_usage[2])
1535 {
1536 const struct tgsi_full_instruction *insn;
1537 const struct tgsi_full_src_register *src;
1538 const struct tgsi_dst_register *dst;
1539
1540 unsigned i, c, k, n, mask, *acc_p;
1541
1542 insn = &tok->FullInstruction;
1543 dst = &insn->FullDstRegisters[0].DstRegister;
1544 mask = dst->WriteMask;
1545
1546 if (!r_usage[0])
1547 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1548 if (!r_usage[1])
1549 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1550
1551 if (dst->File == TGSI_FILE_TEMPORARY) {
1552 for (c = 0; c < 4; c++) {
1553 if (!(mask & (1 << c)))
1554 continue;
1555 r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1556 }
1557 }
1558
1559 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1560 src = &insn->FullSrcRegisters[i];
1561
1562 switch (src->SrcRegister.File) {
1563 case TGSI_FILE_TEMPORARY:
1564 acc_p = r_usage[0];
1565 break;
1566 case TGSI_FILE_INPUT:
1567 acc_p = r_usage[1];
1568 break;
1569 default:
1570 continue;
1571 }
1572
1573 insn_adjust_mask(insn, &mask);
1574
1575 for (c = 0; c < 4; c++) {
1576 if (!(mask & (1 << c)))
1577 continue;
1578
1579 k = tgsi_util_get_full_src_register_extswizzle(src, c);
1580 switch (k) {
1581 case TGSI_EXTSWIZZLE_X:
1582 case TGSI_EXTSWIZZLE_Y:
1583 case TGSI_EXTSWIZZLE_Z:
1584 case TGSI_EXTSWIZZLE_W:
1585 n = src->SrcRegister.Index * 4 + k;
1586 acc_p[n] = pc->insn_nr;
1587 break;
1588 default:
1589 break;
1590 }
1591 }
1592 }
1593 }
1594
1595 static unsigned
1596 load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1597 int *aid, int *p_oid)
1598 {
1599 struct nv50_reg *iv;
1600 int oid, c, n;
1601 unsigned mask = 0;
1602
1603 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1604
1605 for (c = 0, n = i * 4; c < 4; c++, n++) {
1606 oid = (*p_oid)++;
1607 pc->attr[n].type = P_TEMP;
1608 pc->attr[n].index = i;
1609
1610 if (pc->attr[n].acc == acc[n])
1611 continue;
1612 mask |= (1 << c);
1613
1614 pc->attr[n].acc = acc[n];
1615 pc->attr[n].rhw = pc->attr[n].hw = -1;
1616 alloc_reg(pc, &pc->attr[n]);
1617
1618 pc->attr[n].rhw = (*aid)++;
1619 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1620
1621 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1622 (*mid)++;
1623 pc->p->cfg.fp.regs[1] += 0x00010001;
1624 }
1625
1626 return mask;
1627 }
1628
1629 static boolean
1630 nv50_program_tx_prep(struct nv50_pc *pc)
1631 {
1632 struct tgsi_parse_context p;
1633 boolean ret = FALSE;
1634 unsigned i, c;
1635 unsigned fcol, bcol, fcrd, depr;
1636
1637 /* count (centroid) perspective interpolations */
1638 unsigned centroid_loads = 0;
1639 unsigned perspect_loads = 0;
1640
1641 /* track register access for temps and attrs */
1642 unsigned *r_usage[2];
1643 r_usage[0] = NULL;
1644 r_usage[1] = NULL;
1645
1646 depr = fcol = bcol = fcrd = 0xffff;
1647
1648 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1649 pc->p->cfg.fp.regs[0] = 0x01000404;
1650 pc->p->cfg.fp.regs[1] = 0x00000400;
1651 }
1652
1653 tgsi_parse_init(&p, pc->p->pipe.tokens);
1654 while (!tgsi_parse_end_of_tokens(&p)) {
1655 const union tgsi_full_token *tok = &p.FullToken;
1656
1657 tgsi_parse_token(&p);
1658 switch (tok->Token.Type) {
1659 case TGSI_TOKEN_TYPE_IMMEDIATE:
1660 {
1661 const struct tgsi_full_immediate *imm =
1662 &p.FullToken.FullImmediate;
1663
1664 ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1665 imm->u.ImmediateFloat32[1].Float,
1666 imm->u.ImmediateFloat32[2].Float,
1667 imm->u.ImmediateFloat32[3].Float);
1668 }
1669 break;
1670 case TGSI_TOKEN_TYPE_DECLARATION:
1671 {
1672 const struct tgsi_full_declaration *d;
1673 unsigned last, first, mode;
1674
1675 d = &p.FullToken.FullDeclaration;
1676 first = d->DeclarationRange.First;
1677 last = d->DeclarationRange.Last;
1678
1679 switch (d->Declaration.File) {
1680 case TGSI_FILE_TEMPORARY:
1681 if (pc->temp_nr < (last + 1))
1682 pc->temp_nr = last + 1;
1683 break;
1684 case TGSI_FILE_OUTPUT:
1685 if (pc->result_nr < (last + 1))
1686 pc->result_nr = last + 1;
1687
1688 if (!d->Declaration.Semantic)
1689 break;
1690
1691 switch (d->Semantic.SemanticName) {
1692 case TGSI_SEMANTIC_POSITION:
1693 depr = first;
1694 pc->p->cfg.fp.regs[2] |= 0x00000100;
1695 pc->p->cfg.fp.regs[3] |= 0x00000011;
1696 break;
1697 default:
1698 break;
1699 }
1700
1701 break;
1702 case TGSI_FILE_INPUT:
1703 {
1704 if (pc->attr_nr < (last + 1))
1705 pc->attr_nr = last + 1;
1706
1707 if (pc->p->type != PIPE_SHADER_FRAGMENT)
1708 break;
1709
1710 switch (d->Declaration.Interpolate) {
1711 case TGSI_INTERPOLATE_CONSTANT:
1712 mode = INTERP_FLAT;
1713 break;
1714 case TGSI_INTERPOLATE_PERSPECTIVE:
1715 mode = INTERP_PERSPECTIVE;
1716 break;
1717 default:
1718 mode = INTERP_LINEAR;
1719 break;
1720 }
1721
1722 if (d->Declaration.Semantic) {
1723 switch (d->Semantic.SemanticName) {
1724 case TGSI_SEMANTIC_POSITION:
1725 fcrd = first;
1726 break;
1727 case TGSI_SEMANTIC_COLOR:
1728 fcol = first;
1729 mode = INTERP_PERSPECTIVE;
1730 break;
1731 case TGSI_SEMANTIC_BCOLOR:
1732 bcol = first;
1733 mode = INTERP_PERSPECTIVE;
1734 break;
1735 }
1736 }
1737
1738 if (d->Declaration.Centroid) {
1739 mode |= INTERP_CENTROID;
1740 if (mode & INTERP_PERSPECTIVE)
1741 centroid_loads++;
1742 } else
1743 if (mode & INTERP_PERSPECTIVE)
1744 perspect_loads++;
1745
1746 assert(last < 32);
1747 for (i = first; i <= last; i++)
1748 pc->interp_mode[i] = mode;
1749 }
1750 break;
1751 case TGSI_FILE_CONSTANT:
1752 if (pc->param_nr < (last + 1))
1753 pc->param_nr = last + 1;
1754 break;
1755 case TGSI_FILE_SAMPLER:
1756 break;
1757 default:
1758 NOUVEAU_ERR("bad decl file %d\n",
1759 d->Declaration.File);
1760 goto out_err;
1761 }
1762 }
1763 break;
1764 case TGSI_TOKEN_TYPE_INSTRUCTION:
1765 pc->insn_nr++;
1766 prep_inspect_insn(pc, tok, r_usage);
1767 break;
1768 default:
1769 break;
1770 }
1771 }
1772
1773 if (pc->temp_nr) {
1774 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1775 if (!pc->temp)
1776 goto out_err;
1777
1778 for (i = 0; i < pc->temp_nr; i++) {
1779 for (c = 0; c < 4; c++) {
1780 pc->temp[i*4+c].type = P_TEMP;
1781 pc->temp[i*4+c].hw = -1;
1782 pc->temp[i*4+c].rhw = -1;
1783 pc->temp[i*4+c].index = i;
1784 pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1785 }
1786 }
1787 }
1788
1789 if (pc->attr_nr) {
1790 int oid = 4, mid = 4, aid = 0;
1791 /* oid = VP output id
1792 * aid = FP attribute/interpolant id
1793 * mid = VP output mapping field ID
1794 */
1795
1796 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1797 if (!pc->attr)
1798 goto out_err;
1799
1800 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1801 /* position should be loaded first */
1802 if (fcrd != 0xffff) {
1803 unsigned mask;
1804 mid = 0;
1805 mask = load_fp_attrib(pc, fcrd, r_usage[1],
1806 &mid, &aid, &oid);
1807 oid = 0;
1808 pc->p->cfg.fp.regs[1] |= (mask << 24);
1809 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1810 }
1811 pc->p->cfg.fp.map[0] += 0x03020100;
1812
1813 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1814
1815 if (perspect_loads) {
1816 pc->iv_p = alloc_temp(pc, NULL);
1817
1818 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1819 pc->p->cfg.fp.regs[1] |= 0x08000000;
1820 pc->iv_p->rhw = aid++;
1821 emit_interp(pc, pc->iv_p, NULL,
1822 INTERP_LINEAR);
1823 emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1824 } else {
1825 pc->iv_p->rhw = aid - 1;
1826 emit_flop(pc, 0, pc->iv_p,
1827 &pc->attr[fcrd * 4 + 3]);
1828 }
1829 }
1830
1831 if (centroid_loads) {
1832 pc->iv_c = alloc_temp(pc, NULL);
1833 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1834 emit_interp(pc, pc->iv_c, NULL,
1835 INTERP_CENTROID);
1836 emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1837 pc->p->cfg.fp.regs[1] |= 0x08000000;
1838 }
1839
1840 for (c = 0; c < 4; c++) {
1841 /* I don't know what these values do, but
1842 * let's set them like the blob does:
1843 */
1844 if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1845 pc->p->cfg.fp.regs[0] += 0x00010000;
1846 if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1847 pc->p->cfg.fp.regs[0] += 0x00010000;
1848 }
1849
1850 for (i = 0; i < pc->attr_nr; i++)
1851 load_fp_attrib(pc, i, r_usage[1],
1852 &mid, &aid, &oid);
1853
1854 if (pc->iv_p)
1855 free_temp(pc, pc->iv_p);
1856 if (pc->iv_c)
1857 free_temp(pc, pc->iv_c);
1858
1859 pc->p->cfg.fp.high_map = (mid / 4);
1860 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
1861 } else {
1862 /* vertex program */
1863 for (i = 0; i < pc->attr_nr * 4; i++) {
1864 pc->p->cfg.vp.attr[aid / 32] |=
1865 (1 << (aid % 32));
1866 pc->attr[i].type = P_ATTR;
1867 pc->attr[i].hw = aid++;
1868 pc->attr[i].index = i / 4;
1869 }
1870 }
1871 }
1872
1873 if (pc->result_nr) {
1874 int rid = 0;
1875
1876 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1877 if (!pc->result)
1878 goto out_err;
1879
1880 for (i = 0; i < pc->result_nr; i++) {
1881 for (c = 0; c < 4; c++) {
1882 if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1883 pc->result[i*4+c].type = P_TEMP;
1884 pc->result[i*4+c].hw = -1;
1885 pc->result[i*4+c].rhw = (i == depr) ?
1886 -1 : rid++;
1887 } else {
1888 pc->result[i*4+c].type = P_RESULT;
1889 pc->result[i*4+c].hw = rid++;
1890 }
1891 pc->result[i*4+c].index = i;
1892 }
1893
1894 if (pc->p->type == PIPE_SHADER_FRAGMENT &&
1895 depr != 0xffff) {
1896 pc->result[depr * 4 + 2].rhw =
1897 (pc->result_nr - 1) * 4;
1898 }
1899 }
1900 }
1901
1902 if (pc->param_nr) {
1903 int rid = 0;
1904
1905 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
1906 if (!pc->param)
1907 goto out_err;
1908
1909 for (i = 0; i < pc->param_nr; i++) {
1910 for (c = 0; c < 4; c++) {
1911 pc->param[i*4+c].type = P_CONST;
1912 pc->param[i*4+c].hw = rid++;
1913 pc->param[i*4+c].index = i;
1914 }
1915 }
1916 }
1917
1918 if (pc->immd_nr) {
1919 int rid = pc->param_nr * 4;
1920
1921 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
1922 if (!pc->immd)
1923 goto out_err;
1924
1925 for (i = 0; i < pc->immd_nr; i++) {
1926 for (c = 0; c < 4; c++) {
1927 pc->immd[i*4+c].type = P_IMMD;
1928 pc->immd[i*4+c].hw = rid++;
1929 pc->immd[i*4+c].index = i;
1930 }
1931 }
1932 }
1933
1934 ret = TRUE;
1935 out_err:
1936 if (r_usage[0])
1937 FREE(r_usage[0]);
1938 if (r_usage[1])
1939 FREE(r_usage[1]);
1940
1941 tgsi_parse_free(&p);
1942 return ret;
1943 }
1944
1945 static void
1946 free_nv50_pc(struct nv50_pc *pc)
1947 {
1948 unsigned i;
1949
1950 if (pc->immd)
1951 FREE(pc->immd);
1952 if (pc->param)
1953 FREE(pc->param);
1954 if (pc->result)
1955 FREE(pc->result);
1956 if (pc->attr)
1957 FREE(pc->attr);
1958 if (pc->temp)
1959 FREE(pc->temp);
1960
1961 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
1962 /* deallocate fragment program attributes */
1963 if (pc->r_temp[i] && pc->r_temp[i]->index == -1)
1964 FREE(pc->r_temp[i]);
1965 }
1966
1967 FREE(pc);
1968 }
1969
1970 static boolean
1971 nv50_program_tx(struct nv50_program *p)
1972 {
1973 struct tgsi_parse_context parse;
1974 struct nv50_pc *pc;
1975 unsigned k;
1976 boolean ret;
1977
1978 pc = CALLOC_STRUCT(nv50_pc);
1979 if (!pc)
1980 return FALSE;
1981 pc->p = p;
1982 pc->p->cfg.high_temp = 4;
1983
1984 ret = nv50_program_tx_prep(pc);
1985 if (ret == FALSE)
1986 goto out_cleanup;
1987
1988 tgsi_parse_init(&parse, pc->p->pipe.tokens);
1989 while (!tgsi_parse_end_of_tokens(&parse)) {
1990 const union tgsi_full_token *tok = &parse.FullToken;
1991
1992 /* don't allow half insn/immd on first and last instruction */
1993 pc->allow32 = TRUE;
1994 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
1995 pc->allow32 = FALSE;
1996
1997 tgsi_parse_token(&parse);
1998
1999 switch (tok->Token.Type) {
2000 case TGSI_TOKEN_TYPE_INSTRUCTION:
2001 ++pc->insn_cur;
2002 ret = nv50_program_tx_insn(pc, tok);
2003 if (ret == FALSE)
2004 goto out_err;
2005 break;
2006 default:
2007 break;
2008 }
2009 }
2010
2011 if (p->type == PIPE_SHADER_FRAGMENT) {
2012 struct nv50_reg out;
2013
2014 out.type = P_TEMP;
2015 for (k = 0; k < pc->result_nr * 4; k++) {
2016 if (pc->result[k].rhw == -1)
2017 continue;
2018 if (pc->result[k].hw != pc->result[k].rhw) {
2019 out.hw = pc->result[k].rhw;
2020 emit_mov(pc, &out, &pc->result[k]);
2021 }
2022 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2023 pc->p->cfg.high_result = pc->result[k].rhw + 1;
2024 }
2025 }
2026
2027 /* look for single half instructions and make them long */
2028 struct nv50_program_exec *e, *e_prev;
2029
2030 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2031 if (!is_long(e))
2032 k++;
2033
2034 if (!e->next || is_long(e->next)) {
2035 if (k & 1)
2036 convert_to_long(pc, e);
2037 k = 0;
2038 }
2039
2040 if (e->next)
2041 e_prev = e;
2042 }
2043
2044 if (!is_long(pc->p->exec_tail)) {
2045 /* this may occur if moving FP results */
2046 assert(e_prev && !is_long(e_prev));
2047 convert_to_long(pc, e_prev);
2048 convert_to_long(pc, pc->p->exec_tail);
2049 }
2050
2051 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2052 pc->p->exec_tail->inst[1] |= 0x00000001;
2053
2054 p->param_nr = pc->param_nr * 4;
2055 p->immd_nr = pc->immd_nr * 4;
2056 p->immd = pc->immd_buf;
2057
2058 out_err:
2059 tgsi_parse_free(&parse);
2060
2061 out_cleanup:
2062 free_nv50_pc(pc);
2063 return ret;
2064 }
2065
2066 static void
2067 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2068 {
2069 if (nv50_program_tx(p) == FALSE)
2070 assert(0);
2071 p->translated = TRUE;
2072 }
2073
2074 static void
2075 nv50_program_upload_data(struct nv50_context *nv50, float *map,
2076 unsigned start, unsigned count)
2077 {
2078 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2079 struct nouveau_grobj *tesla = nv50->screen->tesla;
2080
2081 while (count) {
2082 unsigned nr = count > 2047 ? 2047 : count;
2083
2084 BEGIN_RING(chan, tesla, 0x00000f00, 1);
2085 OUT_RING (chan, (NV50_CB_PMISC << 0) | (start << 8));
2086 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2087 OUT_RINGp (chan, map, nr);
2088
2089 map += nr;
2090 start += nr;
2091 count -= nr;
2092 }
2093 }
2094
2095 static void
2096 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2097 {
2098 struct nouveau_winsys *nvws = nv50->screen->nvws;
2099 struct pipe_winsys *ws = nv50->pipe.winsys;
2100 unsigned nr = p->param_nr + p->immd_nr;
2101
2102 if (!p->data && nr) {
2103 struct nouveau_resource *heap = nv50->screen->vp_data_heap;
2104
2105 if (nvws->res_alloc(heap, nr, p, &p->data)) {
2106 while (heap->next && heap->size < nr) {
2107 struct nv50_program *evict = heap->next->priv;
2108 nvws->res_free(&evict->data);
2109 }
2110
2111 if (nvws->res_alloc(heap, nr, p, &p->data))
2112 assert(0);
2113 }
2114 }
2115
2116 if (p->param_nr) {
2117 float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
2118 PIPE_BUFFER_USAGE_CPU_READ);
2119 nv50_program_upload_data(nv50, map, p->data->start,
2120 p->param_nr);
2121 ws->buffer_unmap(ws, nv50->constbuf[p->type]);
2122 }
2123
2124 if (p->immd_nr) {
2125 nv50_program_upload_data(nv50, p->immd,
2126 p->data->start + p->param_nr,
2127 p->immd_nr);
2128 }
2129 }
2130
2131 static void
2132 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2133 {
2134 struct nouveau_channel *chan = nv50->screen->nvws->channel;
2135 struct nouveau_grobj *tesla = nv50->screen->tesla;
2136 struct pipe_screen *screen = nv50->pipe.screen;
2137 struct nv50_program_exec *e;
2138 struct nouveau_stateobj *so;
2139 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2140 unsigned start, count, *up, *ptr;
2141 boolean upload = FALSE;
2142
2143 if (!p->buffer) {
2144 p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
2145 upload = TRUE;
2146 }
2147
2148 if (p->data && p->data->start != p->data_start) {
2149 for (e = p->exec_head; e; e = e->next) {
2150 unsigned ei, ci;
2151
2152 if (e->param.index < 0)
2153 continue;
2154 ei = e->param.shift >> 5;
2155 ci = e->param.index + p->data->start;
2156
2157 e->inst[ei] &= ~e->param.mask;
2158 e->inst[ei] |= (ci << e->param.shift);
2159 }
2160
2161 p->data_start = p->data->start;
2162 upload = TRUE;
2163 }
2164
2165 if (!upload)
2166 return;
2167
2168 #ifdef NV50_PROGRAM_DUMP
2169 NOUVEAU_ERR("-------\n");
2170 for (e = p->exec_head; e; e = e->next) {
2171 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2172 if (is_long(e))
2173 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2174 }
2175 #endif
2176
2177 up = ptr = MALLOC(p->exec_size * 4);
2178 for (e = p->exec_head; e; e = e->next) {
2179 *(ptr++) = e->inst[0];
2180 if (is_long(e))
2181 *(ptr++) = e->inst[1];
2182 }
2183
2184 so = so_new(4,2);
2185 so_method(so, nv50->screen->tesla, 0x1280, 3);
2186 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2187 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2188 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2189
2190 start = 0; count = p->exec_size;
2191 while (count) {
2192 struct nouveau_winsys *nvws = nv50->screen->nvws;
2193 unsigned nr;
2194
2195 so_emit(nvws, so);
2196
2197 nr = MIN2(count, 2047);
2198 nr = MIN2(nvws->channel->pushbuf->remaining, nr);
2199 if (nvws->channel->pushbuf->remaining < (nr + 3)) {
2200 FIRE_RING(chan);
2201 continue;
2202 }
2203
2204 BEGIN_RING(chan, tesla, 0x0f00, 1);
2205 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD);
2206 BEGIN_RING(chan, tesla, 0x40000f04, nr);
2207 OUT_RINGp (chan, up + start, nr);
2208
2209 start += nr;
2210 count -= nr;
2211 }
2212
2213 FREE(up);
2214 so_ref(NULL, &so);
2215 }
2216
2217 void
2218 nv50_vertprog_validate(struct nv50_context *nv50)
2219 {
2220 struct nouveau_grobj *tesla = nv50->screen->tesla;
2221 struct nv50_program *p = nv50->vertprog;
2222 struct nouveau_stateobj *so;
2223
2224 if (!p->translated) {
2225 nv50_program_validate(nv50, p);
2226 if (!p->translated)
2227 assert(0);
2228 }
2229
2230 nv50_program_validate_data(nv50, p);
2231 nv50_program_validate_code(nv50, p);
2232
2233 so = so_new(13, 2);
2234 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2235 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2236 NOUVEAU_BO_HIGH, 0, 0);
2237 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2238 NOUVEAU_BO_LOW, 0, 0);
2239 so_method(so, tesla, 0x1650, 2);
2240 so_data (so, p->cfg.vp.attr[0]);
2241 so_data (so, p->cfg.vp.attr[1]);
2242 so_method(so, tesla, 0x16b8, 1);
2243 so_data (so, p->cfg.high_result);
2244 so_method(so, tesla, 0x16ac, 2);
2245 so_data (so, p->cfg.high_result); //8);
2246 so_data (so, p->cfg.high_temp);
2247 so_method(so, tesla, 0x140c, 1);
2248 so_data (so, 0); /* program start offset */
2249 so_ref(so, &nv50->state.vertprog);
2250 so_ref(NULL, &so);
2251 }
2252
2253 void
2254 nv50_fragprog_validate(struct nv50_context *nv50)
2255 {
2256 struct nouveau_grobj *tesla = nv50->screen->tesla;
2257 struct nv50_program *p = nv50->fragprog;
2258 struct nouveau_stateobj *so;
2259 unsigned i;
2260
2261 if (!p->translated) {
2262 nv50_program_validate(nv50, p);
2263 if (!p->translated)
2264 assert(0);
2265 }
2266
2267 nv50_program_validate_data(nv50, p);
2268 nv50_program_validate_code(nv50, p);
2269
2270 so = so_new(64, 2);
2271 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2272 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2273 NOUVEAU_BO_HIGH, 0, 0);
2274 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2275 NOUVEAU_BO_LOW, 0, 0);
2276 so_method(so, tesla, 0x1904, 4);
2277 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2278 so_data (so, 0x00000004);
2279 so_data (so, 0x00000000);
2280 so_data (so, 0x00000000);
2281 so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
2282 for (i = 0; i < p->cfg.fp.high_map; i++)
2283 so_data(so, p->cfg.fp.map[i]);
2284 so_method(so, tesla, 0x1988, 2);
2285 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2286 so_data (so, p->cfg.high_temp);
2287 so_method(so, tesla, 0x1298, 1);
2288 so_data (so, p->cfg.high_result);
2289 so_method(so, tesla, 0x19a8, 1);
2290 so_data (so, p->cfg.fp.regs[2]);
2291 so_method(so, tesla, 0x196c, 1);
2292 so_data (so, p->cfg.fp.regs[3]);
2293 so_method(so, tesla, 0x1414, 1);
2294 so_data (so, 0); /* program start offset */
2295 so_ref(so, &nv50->state.fragprog);
2296 so_ref(NULL, &so);
2297 }
2298
2299 void
2300 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2301 {
2302 struct pipe_screen *pscreen = nv50->pipe.screen;
2303
2304 while (p->exec_head) {
2305 struct nv50_program_exec *e = p->exec_head;
2306
2307 p->exec_head = e->next;
2308 FREE(e);
2309 }
2310 p->exec_tail = NULL;
2311 p->exec_size = 0;
2312
2313 if (p->buffer)
2314 pipe_buffer_reference(&p->buffer, NULL);
2315
2316 nv50->screen->nvws->res_free(&p->data);
2317
2318 p->translated = 0;
2319 }
2320