Merge commit 'origin/master' into i965g-restart
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
/* hw limits used by the register allocator below */
#define NV50_SU_MAX_TEMP 127
#define NV50_SU_MAX_ADDR 4
//#define NV50_PROGRAM_DUMP
37
38 /* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40 /* ARL - gallium craps itself on progs/vp/arl.txt
41 *
 * MSB - Like MAD, but MUL+SUB
 * 	- Not implemented directly; instead, introduce a way to negate
 * 	  args for ops that support it.
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected - and force disable them where they
54 * don't work - MUL has it forcibly disabled atm as it fixes POW..
55 *
 * WARNING: watch dst==src vectors, we can overwrite components that are
 * still needed, e.g. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * FP attr/result assignment - how?
61 * attrib
62 * - 0x16bc maps vp output onto fp hpos
63 * - 0x16c0 maps vp output onto fp col0
64 * result
65 * - colr always 0-3
66 * - depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * - 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * - XX == FP high something
78 */
/* A value in one of the shader register files, together with the
 * hardware resource it has been (or will be) assigned to.
 */
struct nv50_reg {
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD,
		P_ADDR
	} type;
	int index; /* TGSI index, or -1 for internal temporaries */

	int hw;  /* assigned hw slot, -1 if not yet allocated */
	int mod; /* NV50_MOD_* source modifier flags */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
96
/* source/result modifier flags for nv50_reg::mod */
#define NV50_MOD_NEG 1
#define NV50_MOD_ABS 2
#define NV50_MOD_SAT 4

/* arbitrary limits */
#define MAX_IF_DEPTH 4
#define MAX_LOOP_DEPTH 4
104
/* Shader translation context: tracks hardware register allocation and
 * the nv50_reg arrays backing the TGSI register files while building p.
 */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf; /* immediates, 4 floats per vec4 entry */
	int immd_nr;
	struct nv50_reg **addr;
	int addr_nr;

	/* scratch temps, live only within one TGSI instruction */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* branch fixup records for structured control flow */
	struct nv50_program_exec *if_cond;
	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
	int if_lvl, loop_lvl;
	unsigned loop_pos[MAX_LOOP_DEPTH];

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* allow short (32-bit) instruction encodings */
};
152
153 static INLINE void
154 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
155 {
156 reg->type = type;
157 reg->index = index;
158 reg->hw = hw;
159 reg->mod = 0;
160 reg->rhw = -1;
161 reg->acc = 0;
162 }
163
static inline unsigned
popcnt4(uint32_t val)
{
	/* Population count of the low 4 bits (e.g. a write-mask),
	 * using Kernighan's clear-lowest-set-bit loop.
	 */
	unsigned n = 0;
	uint32_t v = val & 0xf;

	while (v) {
		v &= v - 1; /* clear lowest set bit */
		++n;
	}
	return n;
}
171
172 static void
173 terminate_mbb(struct nv50_pc *pc)
174 {
175 int i;
176
177 /* remove records of temporary address register values */
178 for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
179 if (pc->r_addr[i].index < 0)
180 pc->r_addr[i].rhw = -1;
181 }
182
/* Bind reg to a hardware slot and update the program's high-water
 * marks.  Only P_TEMP registers need actual allocation; P_RESULT just
 * bumps cfg.high_result.  A temp with rhw set prefers that slot (so
 * FP outputs land where required), falling back to a scan that skips
 * the result range to avoid circular placement.
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 * not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of hardware temporaries */
	assert(0);
}
231
/* XXX: For shaders that aren't executed linearly (e.g. shaders that
 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
 * lest we risk temp_temps overwriting regs alloc'd "later".
 */
static struct nv50_reg *
alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
{
	struct nv50_reg *r;
	int i;

	/* an unallocated TGSI temp destination can serve directly */
	if (dst && dst->type == P_TEMP && dst->hw == -1)
		return dst;

	/* otherwise create an internal temp (index == -1, caller frees
	 * via free_temp) in the first unoccupied hw slot */
	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
		if (!pc->r_temp[i]) {
			r = MALLOC_STRUCT(nv50_reg);
			ctor_reg(r, P_TEMP, -1, i);
			pc->r_temp[i] = r;
			return r;
		}
	}

	/* out of hardware temporaries */
	assert(0);
	return NULL;
}
257
/* Assign the hw of the discarded temporary register src
 * to the tgsi register dst and free src.
 */
static void
assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	assert(src->index == -1 && src->hw != -1);

	/* release dst's previous slot (if any), then take over src's */
	if (dst->hw != -1)
		pc->r_temp[dst->hw] = NULL;
	pc->r_temp[src->hw] = dst;
	dst->hw = src->hw;

	FREE(src);
}
273
/* release the hardware resource held by r */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
{
	assert(r->type == P_TEMP);
	if (r->hw == -1)
		return;

	assert(pc->r_temp[r->hw] == r);
	pc->r_temp[r->hw] = NULL;

	/* no longer live; internal temps (index == -1) are owned here */
	r->acc = 0;
	if (r->index == -1)
		FREE(r);
}
289
290 static void
291 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
292 {
293 if (r->index == -1) {
294 unsigned hw = r->hw;
295
296 FREE(pc->r_temp[hw]);
297 pc->r_temp[hw] = NULL;
298 }
299 }
300
301 static int
302 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
303 {
304 int i;
305
306 if ((idx + 4) >= NV50_SU_MAX_TEMP)
307 return 1;
308
309 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
310 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
311 return alloc_temp4(pc, dst, idx + 4);
312
313 for (i = 0; i < 4; i++) {
314 dst[i] = MALLOC_STRUCT(nv50_reg);
315 ctor_reg(dst[i], P_TEMP, -1, idx + i);
316 pc->r_temp[idx + i] = dst[i];
317 }
318
319 return 0;
320 }
321
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	/* release all four temporaries of a quad */
	int c;

	for (c = 0; c < 4; ++c)
		free_temp(pc, reg[c]);
}
330
/* Grab a scratch temporary that lives only until the end of the
 * current TGSI instruction (released by kill_temp_temp()).
 */
static struct nv50_reg *
temp_temp(struct nv50_pc *pc)
{
	/* NOTE(review): with NDEBUG the assert disappears and the store
	 * below would overflow temp_temp[16] - relies on no instruction
	 * ever needing more than 16 scratch regs */
	if (pc->temp_temp_nr >= 16)
		assert(0);

	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
	return pc->temp_temp[pc->temp_temp_nr++];
}
340
341 static void
342 kill_temp_temp(struct nv50_pc *pc)
343 {
344 int i;
345
346 for (i = 0; i < pc->temp_temp_nr; i++)
347 free_temp(pc, pc->temp_temp[i]);
348 pc->temp_temp_nr = 0;
349 }
350
/* Append an immediate vec4 (x, y, z, w) to the immediate buffer and
 * return its vec4 index.
 * NOTE(review): the REALLOC result is not checked for failure, like
 * the other allocations in this file.
 */
static int
ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
{
	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
			       (pc->immd_nr + 1) * 4 * sizeof(float));
	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;

	return pc->immd_nr++;
}
363
/* Return a P_IMMD register whose hw field indexes float f in the
 * immediate buffer, reusing any existing component equal to f.  New
 * values are stored as (f, -f, 0.5*f, 0) so commonly derived values
 * can be matched later.  The caller owns (and must FREE) the reg.
 */
static struct nv50_reg *
alloc_immd(struct nv50_pc *pc, float f)
{
	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
	unsigned hw;

	for (hw = 0; hw < pc->immd_nr * 4; hw++)
		if (pc->immd_buf[hw] == f)
			break;

	if (hw == pc->immd_nr * 4)
		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;

	ctor_reg(r, P_IMMD, -1, hw);
	return r;
}
380
381 static struct nv50_program_exec *
382 exec(struct nv50_pc *pc)
383 {
384 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
385
386 e->param.index = -1;
387 return e;
388 }
389
390 static void
391 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
392 {
393 struct nv50_program *p = pc->p;
394
395 if (p->exec_tail)
396 p->exec_tail->next = e;
397 if (!p->exec_head)
398 p->exec_head = e;
399 p->exec_tail = e;
400 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
401 }
402
403 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
404
405 static boolean
406 is_long(struct nv50_program_exec *e)
407 {
408 if (e->inst[0] & 1)
409 return TRUE;
410 return FALSE;
411 }
412
413 static boolean
414 is_immd(struct nv50_program_exec *e)
415 {
416 if (is_long(e) && (e->inst[1] & 3) == 3)
417 return TRUE;
418 return FALSE;
419 }
420
/* Set the execute predicate: e runs only when condition code 'pred'
 * holds in condition register 'idx'.  Forces the long encoding.
 */
static INLINE void
set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
	e->inst[1] |= (pred << 7) | (idx << 12);
}
429
/* Enable (on = 1) or disable writing condition register 'idx' from
 * e's result.  Forces the long encoding.
 */
static INLINE void
set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
	    struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
	e->inst[1] |= (idx << 4) | (on << 6);
}
438
/* Promote e to the 64-bit encoding; long instructions default to
 * "always execute" (pred 0xf) with no predicate write.
 */
static INLINE void
set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	if (is_long(e))
		return;

	e->inst[0] |= 1;
	set_pred(pc, 0xf, 0, e);
	set_pred_wr(pc, 0, 0, e);
}
449
/* Encode dst as e's destination.  Writing a shader output (P_RESULT)
 * or a register above $r63 requires the long encoding.
 */
static INLINE void
set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
{
	if (dst->type == P_RESULT) {
		set_long(pc, e);
		e->inst[1] |= 0x00000008; /* destination is an output */
	}

	alloc_reg(pc, dst);
	if (dst->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (dst->hw << 2);
}
463
/* Inline an immediate operand into e, applying imm's ABS/NEG
 * modifiers to the float value before encoding its bits.
 */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	unsigned val;
	float f = pc->immd_buf[imm->hw];

	if (imm->mod & NV50_MOD_ABS)
		f = fabsf(f);
	val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);

	set_long(pc, e);
	/*XXX: can't be predicated - bits overlap.. catch cases where both
	 * are required and avoid them. */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	/* low 6 bits of the value go into word 0, the rest into word 1 */
	e->inst[1] |= 0x00000002 | 0x00000001;
	e->inst[0] |= (val & 0x3f) << 16;
	e->inst[1] |= (val >> 6) << 2;
}
484
/* Encode address register a as e's indirect-addressing source; the
 * target bit fields must still be free.
 */
static INLINE void
set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
{
	assert(!(e->inst[0] & 0x0c000000));
	assert(!(e->inst[1] & 0x00000004));

	/* the $aX index is split across the two instruction words */
	e->inst[0] |= (a->hw & 3) << 26;
	e->inst[1] |= (a->hw >> 2) << 2;
}
494
/* Emit dst($aX) = src0($aY) + src1_val (16-bit immediate).  With
 * src0 == NULL the result is just src1_val, since $a0 reads as 0.
 */
static void
emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
		  struct nv50_reg *src0, uint16_t src1_val)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000 | (src1_val << 9);
	e->inst[1] = 0x20000000;
	set_long(pc, e);
	e->inst[0] |= dst->hw << 2;
	if (src0) /* otherwise will add to $a0, which is always 0 */
		set_addr(e, src0);

	emit(pc, e);
}
510
/* Return an address register usable to access 'ref'.
 *
 * With ref == NULL, reserve an $aX to back a TGSI ADDR register.
 * Otherwise reuse an $aX already holding a suitable base (within a
 * 128-slot window of ref->hw), or claim a free / not-currently-busy
 * one and load the required base offset into it.
 */
static struct nv50_reg *
alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
{
	int i;
	struct nv50_reg *a_tgsi = NULL, *a = NULL;

	if (!ref) {
		/* allocate for TGSI address reg */
		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
			if (pc->r_addr[i].index >= 0)
				continue;
			if (pc->r_addr[i].rhw >= 0 &&
			    pc->r_addr[i].acc == pc->insn_cur)
				continue;

			pc->r_addr[i].rhw = -1;
			pc->r_addr[i].index = i;
			return &pc->r_addr[i];
		}
		assert(0);
		return NULL;
	}

	/* Allocate and set an address reg so we can access 'ref'.
	 *
	 * If an r_addr has index < 0, it is not reserved for TGSI,
	 * and index will be the negative of the TGSI addr index the
	 * value in rhw is relative to, or -256 if rhw is an offset
	 * from 0. If rhw < 0, the reg has not been initialized.
	 */
	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
			continue;
		if (pc->r_addr[i].rhw < 0) { /* unused */
			a = &pc->r_addr[i];
			continue;
		}
		if (!a && pc->r_addr[i].acc != pc->insn_cur)
			a = &pc->r_addr[i];

		if (ref->hw - pc->r_addr[i].rhw >= 128)
			continue;

		/* base matches: either both absolute, or relative to
		 * the same TGSI addr reg */
		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
			pc->r_addr[i].acc = pc->insn_cur;
			return &pc->r_addr[i];
		}
	}
	assert(a);

	if (ref->acc < 0)
		a_tgsi = pc->addr[ref->index];

	/* load the 128-aligned base offset (scaled by 4) */
	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);

	a->rhw = ref->hw & ~0x7f;
	a->acc = pc->insn_cur;
	a->index = a_tgsi ? -ref->index : -256;
	return a;
}
572
/* interpolation modes; CENTROID is a flag combinable with the others */
#define INTERP_LINEAR 0
#define INTERP_FLAT 1
#define INTERP_PERSPECTIVE 2
#define INTERP_CENTROID 4
577
/* interpolant index has been stored in dst->rhw */
static void
emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
	    unsigned mode)
{
	assert(dst->rhw != -1);
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x80000000;
	set_dst(pc, dst, e);
	e->inst[0] |= (dst->rhw << 16);

	if (mode & INTERP_FLAT) {
		e->inst[0] |= (1 << 8);
	} else {
		if (mode & INTERP_PERSPECTIVE) {
			/* iv supplies the perspective divider register
			 * (presumably 1/w - see callers in emit_tex) */
			e->inst[0] |= (1 << 25);
			alloc_reg(pc, iv);
			e->inst[0] |= (iv->hw << 9);
		}

		if (mode & INTERP_CENTROID)
			e->inst[0] |= (1 << 24);
	}

	emit(pc, e);
}
605
/* Record that e reads parameter data (const buffer or immediate) from
 * src: store index/shift/mask for later patching, and set up indirect
 * addressing when the offset exceeds the 7-bit field.
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw & 127;
	e->param.shift = s;
	e->param.mask = m << (s % 32);

	if (src->hw > 127)
		set_addr(e, alloc_addr(pc, src));
	else
	if (src->acc < 0) {
		/* NOTE(review): acc < 0 appears to mark consts addressed
		 * relative to a TGSI ADDR reg - confirm against callers */
		assert(src->type == P_CONST);
		set_addr(e, pc->addr[src->index]);
	}

	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
626
/* Copy src into dst, picking the cheapest valid encoding: short with
 * inlined immediate, const-buffer read, or plain register move.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x10000000;
	if (!pc->allow32)
		set_long(pc, e);

	set_dst(pc, dst, e);

	if (!is_long(e) && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 * catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000;
		}

		alloc_reg(pc, src);
		if (src->hw > 63)
			set_long(pc, e);
		e->inst[0] |= (src->hw << 9);
	}

	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
670
671 static INLINE void
672 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
673 {
674 struct nv50_reg *imm = alloc_immd(pc, f);
675 emit_mov(pc, dst, imm);
676 FREE(imm);
677 }
678
679 static boolean
680 check_swap_src_0_1(struct nv50_pc *pc,
681 struct nv50_reg **s0, struct nv50_reg **s1)
682 {
683 struct nv50_reg *src0 = *s0, *src1 = *s1;
684
685 if (src0->type == P_CONST) {
686 if (src1->type != P_CONST) {
687 *s0 = src1;
688 *s1 = src0;
689 return TRUE;
690 }
691 } else
692 if (src1->type == P_ATTR) {
693 if (src0->type != P_ATTR) {
694 *s0 = src1;
695 *s1 = src0;
696 return TRUE;
697 }
698 }
699
700 return FALSE;
701 }
702
/* Encode src as operand 0 for instructions that only accept a temp
 * there (e.g. the scalar flops); any other register file is copied
 * through a scratch temp first.
 */
static void
set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
		     struct nv50_program_exec *e)
{
	struct nv50_reg *temp;

	if (src->type != P_TEMP) {
		temp = temp_temp(pc);
		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (src->hw << 9);
}
720
/* Encode src as operand 0.  Attributes are flagged and read directly;
 * const/immediate sources are staged through a scratch temp.
 */
static void
set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		set_long(pc, e);
		e->inst[1] |= 0x00200000;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (src->hw << 9);
}
740
/* Encode src as operand 1.  Const/immediate data can be referenced
 * directly only if source 2 doesn't already use the shared data slot;
 * otherwise (and for attributes) the value is staged through a
 * scratch temp.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			/* data slot taken by source 2 */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= ((src->hw & 127) << 16);
}
768
/* Encode src as operand 2 (always requires the long form).
 * Const/immediate data can be referenced directly only if source 1
 * doesn't already use the shared data slot; otherwise (and for
 * attributes) the value is staged through a scratch temp.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			/* data slot taken by source 1 */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= ((src->hw & 127) << 14);
}
796
/* dst = src0 * src1.  NEG modifiers fold into the encoding's sign
 * bit; an immediate src1 can be inlined into a short instruction.
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		if (src0->mod & NV50_MOD_NEG)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		/* negate the result iff exactly one source is negated */
		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
827
/* dst = src0 + src1, honouring NEG modifiers (which force the long
 * form, as do high registers).  Const/attr second operands use source
 * slot 2; immediates can be inlined into a short instruction.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xb0000000;

	alloc_reg(pc, src1);
	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
		set_long(pc, e);
		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
			      ((src1->mod & NV50_MOD_NEG) << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
857
/* Load an address register: dst($aX) = src << s. */
static void
emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 uint8_t s)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[1] |= 0xc0000000;

	e->inst[0] |= dst->hw << 2;
	e->inst[0] |= s << 16; /* shift left */
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}
873
/* dst = min/max(src0, src1); 'sub' selects the operation (callers in
 * this file use 4 for max and 5 for min).  ABS modifiers are encoded
 * directly.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	if (src0->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;
	if (src1->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00080000;

	emit(pc, e);
}
896
897 static INLINE void
898 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
899 struct nv50_reg *src1)
900 {
901 assert(src0 != src1);
902 src1->mod ^= NV50_MOD_NEG;
903 emit_add(pc, dst, src0, src1);
904 src1->mod ^= NV50_MOD_NEG;
905 }
906
/* dst = src0 <op> src1 for TGSI AND/OR/XOR.  An immediate src1 with a
 * temp src0 can use the short inlined-immediate encoding when 32-bit
 * instructions are allowed.
 */
static void
emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	    struct nv50_reg *src1, unsigned op)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000;
	set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
	    op != TGSI_OPCODE_XOR)
		assert(!"invalid bit op");

	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
		set_immd(pc, src1, e);
		if (op == TGSI_OPCODE_OR)
			e->inst[0] |= 0x0100;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[0] |= 0x8000;
	} else {
		set_src_1(pc, src1, e);
		e->inst[1] |= 0x04000000; /* 32 bit */
		if (op == TGSI_OPCODE_OR)
			e->inst[1] |= 0x4000;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[1] |= 0x8000;
	}

	emit(pc, e);
}
943
/* dst = src0 * src1 + src2.  NEG modifiers fold into the encoding:
 * the product is negated iff exactly one of src0/src1 is negated.
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src2->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}
965
966 static INLINE void
967 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
968 struct nv50_reg *src1, struct nv50_reg *src2)
969 {
970 assert(src2 != src0 && src2 != src1);
971 src2->mod ^= NV50_MOD_NEG;
972 emit_mad(pc, dst, src0, src1, src2);
973 src2->mod ^= NV50_MOD_NEG;
974 }
975
/* Emit a scalar special-function op; 'sub' selects the function
 * (callers here use 0 = reciprocal, 3 = lg2, 6 = ex2 - confirm others
 * against the hw docs).  sub 0 and 2 only accept a temp source.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);

	if (sub == 0 || sub == 2)
		set_src_0_restricted(pc, src, e);
	else
		set_src_0(pc, src, e);

	emit(pc, e);
}
997
/* Emit the ex2 pre-operation; applied to the argument before the ex2
 * flop (see emit_pow).
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, e);
}
1012
/* Emit the sin/cos pre-operation (presumably argument reduction -
 * TODO confirm); applied before the corresponding flop.
 */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	emit(pc, e);
}
1027
/* cvt rounding/modifier opcodes (combinable, see emit_cvt) */
#define CVTOP_RN 0x01
#define CVTOP_FLOOR 0x03
#define CVTOP_CEIL 0x05
#define CVTOP_TRUNC 0x07
#define CVTOP_SAT 0x08
#define CVTOP_ABS 0x10

/* 0x04 == 32 bit dst */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG 0x20
#define CVT_RI 0x08
1044
/* Emit a cvt: convert/round src according to (cvn, fmt).  The result
 * goes to dst if non-NULL (otherwise it is discarded) and can also
 * set condition register wp (pass wp < 0 for none).
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000; /* 32 bit src */
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination wanted: discard the value */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
1072
/* nv50 Condition codes:
 * 0x1 = LT
 * 0x2 = EQ
 * 0x3 = LE
 * 0x4 = GT
 * 0x5 = NE
 * 0x6 = GE
 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge)
 * 0x8 = unordered bit (allows NaN)
 */

/* Compare src0 with src1 under condition code 'ccode', writing
 * 1.0f/0.0f to dst (if non-NULL, converted from the integer set
 * result) and optionally condition reg wp.  The condition is mirrored
 * via cc_swapped when the operands had to be swapped.
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	/* the integer set result is staged in a temp unless the real
	 * destination already is one */
	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		/* predicate-only: discard the value */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}
1131
/* Map a TGSI SET* opcode to the hw condition code above; SNE maps to
 * 0xd (NE | unordered) so comparisons involving NaN yield true.
 */
static INLINE unsigned
map_tgsi_setop_cc(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_SLT: return 0x1;
	case TGSI_OPCODE_SGE: return 0x6;
	case TGSI_OPCODE_SEQ: return 0x2;
	case TGSI_OPCODE_SGT: return 0x4;
	case TGSI_OPCODE_SLE: return 0x3;
	case TGSI_OPCODE_SNE: return 0xd;
	default:
		assert(0);
		return 0;
	}
}
1147
1148 static INLINE void
1149 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1150 {
1151 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
1152 }
1153
/* dst = v^e, computed as ex2(e * lg2(v)) through a temporary. */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, 3, temp, v);   /* temp = lg2(v) */
	emit_mul(pc, temp, temp, e); /* temp = e * lg2(v) */
	emit_preex2(pc, temp, temp);
	emit_flop(pc, 6, dst, temp); /* dst = ex2(temp) */

	free_temp(pc, temp);
}
1167
1168 static INLINE void
1169 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1170 {
1171 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
1172 }
1173
1174 static INLINE void
1175 emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1176 {
1177 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
1178 }
1179
/* Emit TGSI LIT for the components selected by mask:
 * dst.x = 1.0, dst.y = max(src.x, 0), dst.w = 1.0,
 * dst.z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, +/-127.999999) : 0
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* have the max above also set condition reg 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		/* the mov runs only when src.x <= 0 (cc 3 = LE) */
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1233
1234 static INLINE void
1235 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1236 {
1237 emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1238 }
1239
/* Discard the fragment when src < 0: a cvt (with src's NEG modifier
 * folded in) writes condition reg 1, then the kil instruction is
 * predicated on LT.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;
	unsigned cvn = CVT_F32_F32;

	if (src->mod & NV50_MOD_NEG)
		cvn |= CVT_NEG;
	/* write predicate reg */
	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);

	/* conditional discard */
	e = exec(pc);
	e->inst[0] = 0x00000002;
	set_long(pc, e);
	set_pred(pc, 0x1 /* LT */, r_pred, e);
	emit(pc, e);
}
1259
/* Normalize cube-map coordinates: t[2] = 1 / max(|s|, |t|, |r|), then
 * t[0..2] = src[0..2] * t[2].  The callers' source modifiers are
 * saved and restored around the ABS-tagged max chain.
 */
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, boolean proj)
{
	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };

	src[0]->mod |= NV50_MOD_ABS;
	src[1]->mod |= NV50_MOD_ABS;
	src[2]->mod |= NV50_MOD_ABS;

	emit_minmax(pc, 4, t[2], src[0], src[1]);
	emit_minmax(pc, 4, t[2], src[2], t[2]);

	src[0]->mod = mod[0];
	src[1]->mod = mod[1];
	src[2]->mod = mod[2];

	if (proj && 0 /* looks more correct without this */)
		emit_mul(pc, t[2], t[2], src[3]);
	emit_flop(pc, 0, t[2], t[2]); /* reciprocal */

	emit_mul(pc, t[0], src[0], t[2]);
	emit_mul(pc, t[1], src[1], t[2]);
	emit_mul(pc, t[2], src[2], t[2]);
}
1285
/* Emit a texture fetch (TEX/TXP). Coordinates are first moved — and
 * for TXP divided by q, for cube maps normalized — into a contiguous
 * quad of TEMPs (t[0..3]), since the hardware TEX reads its operands
 * from consecutive registers starting at t[0]. Results are then moved
 * from the packed temps to the requested dst regs.
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
{
	struct nv50_reg *t[4];
	struct nv50_program_exec *e;

	unsigned c, mode, dim;

	/* number of coordinates the texture target consumes */
	switch (type) {
	case TGSI_TEXTURE_1D:
		dim = 1;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
	case TGSI_TEXTURE_RECT:
		dim = 2;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
		dim = 3;
		break;
	default:
		assert(0);
		break;
	}

	/* some cards need t[0]'s hw index to be a multiple of 4 */
	alloc_temp4(pc, t, 0);

	if (type == TGSI_TEXTURE_CUBE) {
		load_cube_tex_coords(pc, t, src, proj);
	} else
	if (proj) {
		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
			/* coords are still raw varyings: re-interpolate
			 * with perspective using 1/q instead of dividing
			 */
			mode = pc->interp_mode[src[0]->index];

			t[3]->rhw = src[3]->rhw;
			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
			emit_flop(pc, 0, t[3], t[3]);

			for (c = 0; c < dim; c++) {
				t[c]->rhw = src[c]->rhw;
				emit_interp(pc, t[c], t[3],
					    (mode | INTERP_PERSPECTIVE));
			}
		} else {
			/* explicit divide: t[c] = src[c] * (1 / q) */
			emit_flop(pc, 0, t[3], src[3]);
			for (c = 0; c < dim; c++)
				emit_mul(pc, t[c], src[c], t[3]);

			/* XXX: for some reason the blob sometimes uses MAD:
			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
			 */
		}
	} else {
		for (c = 0; c < dim; c++)
			emit_mov(pc, t[c], src[c]);
	}

	/* build the TEX instruction itself */
	e = exec(pc);
	set_long(pc, e);
	e->inst[0] |= 0xf0000000;
	e->inst[1] |= 0x00000004;
	set_dst(pc, t[0], e);
	e->inst[0] |= (unit << 9);

	/* dimensionality / cube flags */
	if (dim == 2)
		e->inst[0] |= 0x00400000;
	else
	if (dim == 3) {
		e->inst[0] |= 0x00800000;
		if (type == TGSI_TEXTURE_CUBE)
			e->inst[0] |= 0x08000000;
	}

	/* result write mask is split across the two instruction words */
	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	emit(pc, e);
#if 1
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}
1390
/* Emit a branch, optionally preceded by a JOIN_AT instruction.
 * pred < 0 emits an unconditional branch; otherwise the branch is
 * predicated with condition code cc on predicate reg pred.
 * If join is non-NULL, the JOIN_AT exec is returned through it so
 * the caller can patch its target (param.index) later.
 */
static void
emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
	    struct nv50_program_exec **join)
{
	struct nv50_program_exec *e = exec(pc);

	if (join) {
		set_long(pc, e);
		e->inst[0] |= 0xa0000002;
		emit(pc, e);
		*join = e;
		e = exec(pc);
	}

	set_long(pc, e);
	e->inst[0] |= 0x10000002;
	if (pred >= 0)
		set_pred(pc, cc, pred, e);
	emit(pc, e);
}
1411
/* Emit a long-form NOP; used e.g. as a patchable join point after
 * ENDIF (a join flag can be set on it without side effects).
 */
static void
emit_nop(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	e->inst[1] = 0xe0000000;
	emit(pc, e);
}
1422
/* Emit screen-space derivative along x (TGSI DDX).
 * The source must already reside in a TEMP because the quad-op
 * exchanges values between neighboring pixels of the quad.
 */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = 0xc0140000;
	e->inst[1] = 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}
1439
/* Emit screen-space derivative along y (TGSI DDY).
 * The hardware op effectively negates the operand, so the source is
 * negated into a fresh TEMP first — unless it already carries a NEG
 * modifier, in which case the two negations cancel and the source is
 * used directly.
 */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_reg *r = src;
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	if (!(src->mod & NV50_MOD_NEG)) { /* ! double negation */
		r = alloc_temp(pc, NULL);
		emit_neg(pc, r, src);
	}

	e->inst[0] = 0xc0150000;
	e->inst[1] = 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, r, e);
	set_src_2(pc, r, e);

	if (r != src)
		free_temp(pc, r);

	emit(pc, e);
}
1465
/* Convert an already-emitted short (32 bit) instruction to its long
 * (64 bit) encoding, e.g. so a predicate can be set on it afterwards.
 * Some operand/modifier bits live at different positions in the long
 * form: q collects bits to OR into inst[1], m masks bits to clear in
 * inst[0]. Only opcodes known to be convertible are handled.
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	/* dispatch on the 4 bit opcode field */
	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++; /* long instructions occupy two code words */

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1513
1514 /* Some operations support an optional negation flag. */
1515 static boolean
1516 negate_supported(const struct tgsi_full_instruction *insn, int i)
1517 {
1518 int s;
1519
1520 switch (insn->Instruction.Opcode) {
1521 case TGSI_OPCODE_DDY:
1522 case TGSI_OPCODE_DP3:
1523 case TGSI_OPCODE_DP4:
1524 case TGSI_OPCODE_MUL:
1525 case TGSI_OPCODE_KIL:
1526 case TGSI_OPCODE_ADD:
1527 case TGSI_OPCODE_SUB:
1528 case TGSI_OPCODE_MAD:
1529 break;
1530 case TGSI_OPCODE_POW:
1531 if (i == 1)
1532 break;
1533 return FALSE;
1534 default:
1535 return FALSE;
1536 }
1537
1538 /* Watch out for possible multiple uses of an nv50_reg, we
1539 * can't use nv50_reg::neg in these cases.
1540 */
1541 for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1542 if (s == i)
1543 continue;
1544 if ((insn->Src[s].Register.Index ==
1545 insn->Src[i].Register.Index) &&
1546 (insn->Src[s].Register.File ==
1547 insn->Src[i].Register.File))
1548 return FALSE;
1549 }
1550
1551 return TRUE;
1552 }
1553
/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->Dst[0].Register.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* reads .x for xyz results, .w only if .w is written */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 contributes y,z; src1 contributes y,w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_texture *tex;

		assert(insn->Instruction.Texture);
		tex = &insn->Texture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8; /* the projector q */

		/* drop coords the texture target does not use */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* each cross-product component reads the two others */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1617
1618 static struct nv50_reg *
1619 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1620 {
1621 switch (dst->Register.File) {
1622 case TGSI_FILE_TEMPORARY:
1623 return &pc->temp[dst->Register.Index * 4 + c];
1624 case TGSI_FILE_OUTPUT:
1625 return &pc->result[dst->Register.Index * 4 + c];
1626 case TGSI_FILE_ADDRESS:
1627 {
1628 struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
1629 if (!r) {
1630 r = alloc_addr(pc, NULL);
1631 pc->addr[dst->Register.Index * 4 + c] = r;
1632 }
1633 assert(r);
1634 return r;
1635 }
1636 case TGSI_FILE_NULL:
1637 return NULL;
1638 default:
1639 break;
1640 }
1641
1642 return NULL;
1643 }
1644
/* Resolve a TGSI source register (component chan, after swizzle) to an
 * nv50_reg, applying the TGSI sign mode. If neg is TRUE the caller can
 * honor NV50_MOD_NEG, so negation is recorded as a modifier instead of
 * being emitted; otherwise abs/neg/-abs go through a temp.
 * Indirect constants get a freshly MALLOCed reg with acc < 0 (freed by
 * the caller after instruction emission).
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c, swz;

	if (src->Register.File != TGSI_FILE_CONSTANT)
		assert(!src->Register.Indirect);

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_swizzle(src, chan);
	switch (c) {
	case TGSI_SWIZZLE_X:
	case TGSI_SWIZZLE_Y:
	case TGSI_SWIZZLE_Z:
	case TGSI_SWIZZLE_W:
		switch (src->Register.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			if (!src->Register.Indirect) {
				r = &pc->param[src->Register.Index * 4 + c];
				break;
			}
			/* Indicate indirection by setting r->acc < 0 and
			 * use the index field to select the address reg.
			 */
			r = MALLOC_STRUCT(nv50_reg);
			swz = tgsi_util_get_src_register_swizzle(
						 &src->Indirect, 0);
			ctor_reg(r, P_CONST,
				 src->Indirect.Index * 4 + swz,
				 src->Register.Index * 4 + c);
			r->acc = -1;
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			break; /* handled by the caller via unit index */
		case TGSI_FILE_ADDRESS:
			r = pc->addr[src->Register.Index * 4 + c];
			assert(r);
			break;
		default:
			assert(0);
			break;
		}
		break;
	default:
		assert(0);
		break;
	}

	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		/* |r| via a temp */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		if (neg)
			r->mod = NV50_MOD_NEG; /* caller clears mods later */
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		/* -|r| */
		temp = temp_temp(pc);
		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1735
1736 /* return TRUE for ops that produce only a single result */
1737 static boolean
1738 is_scalar_op(unsigned op)
1739 {
1740 switch (op) {
1741 case TGSI_OPCODE_COS:
1742 case TGSI_OPCODE_DP2:
1743 case TGSI_OPCODE_DP3:
1744 case TGSI_OPCODE_DP4:
1745 case TGSI_OPCODE_DPH:
1746 case TGSI_OPCODE_EX2:
1747 case TGSI_OPCODE_LG2:
1748 case TGSI_OPCODE_POW:
1749 case TGSI_OPCODE_RCP:
1750 case TGSI_OPCODE_RSQ:
1751 case TGSI_OPCODE_SIN:
1752 /*
1753 case TGSI_OPCODE_KIL:
1754 case TGSI_OPCODE_LIT:
1755 case TGSI_OPCODE_SCS:
1756 */
1757 return TRUE;
1758 default:
1759 return FALSE;
1760 }
1761 }
1762
/* Returns a bitmask indicating which dst components depend
 * on source s, component c (reverse of nv50_tgsi_src_mask).
 */
static unsigned
nv50_tgsi_dst_revdep(unsigned op, int s, int c)
{
	/* scalar results are broadcast; only one real computation */
	if (is_scalar_op(op))
		return 0x1;

	switch (op) {
	case TGSI_OPCODE_DST:
		/* src0 feeds y,z; src1 feeds y,w */
		return (1 << c) & (s ? 0xa : 0x6);
	case TGSI_OPCODE_XPD:
		/* each source component feeds the two other dst comps */
		switch (c) {
		case 0: return 0x6;
		case 1: return 0x5;
		case 2: return 0x3;
		case 3: return 0x0;
		default:
			assert(0);
			return 0x0;
		}
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
		/* these take care of dangerous swizzles themselves */
		return 0x0;
	case TGSI_OPCODE_IF:
	case TGSI_OPCODE_KIL:
		/* don't call this function for these ops */
		assert(0);
		return 0;
	default:
		/* linear vector instruction */
		return (1 << c);
	}
}
1801
/* Check whether instruction e is predicated with condition code cc.
 * Only long, non-immediate instructions carry predicate bits.
 */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	/* condition code field lives in bits 7..10 of inst[1] */
	return ((e->inst[1] & 0x780) == (cc << 7));
}
1809
/* on ENDIF see if we can do "@p0.neu single_op" instead of:
 * join_at ENDIF
 * @p0.eq bra ENDIF
 * single_op
 * ENDIF: nop.join
 *
 * i.e. if the IF body is a single instruction, predicate that
 * instruction directly and drop the JOIN_AT/BRA pair entirely.
 * Returns TRUE when the replacement was performed.
 */
static boolean
nv50_kill_branch(struct nv50_pc *pc)
{
	int lvl = pc->if_lvl;

	/* only applicable if exactly one insn follows the branch */
	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
		return FALSE;

	/* if ccode == 'true', the BRA is from an ELSE and the predicate
	 * reg may no longer be valid, since we currently always use $p0
	 */
	if (has_pred(pc->if_insn[lvl], 0xf))
		return FALSE;
	assert(pc->if_insn[lvl] && pc->br_join[lvl]);

	/* We'll use the exec allocated for JOIN_AT (as we can't easily
	 * update prev's next); if exec_tail is BRK, update the pointer.
	 */
	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];

	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */

	/* copy the single body insn over the JOIN_AT slot */
	*pc->br_join[lvl] = *pc->p->exec_tail;

	FREE(pc->if_insn[lvl]);
	FREE(pc->p->exec_tail);

	pc->p->exec_tail = pc->br_join[lvl];
	pc->p->exec_tail->next = NULL;
	/* predicate the remaining insn: execute if $p0 not equal */
	set_pred(pc, 0xd, 0, pc->p->exec_tail);

	return TRUE;
}
1850
/* Translate one TGSI instruction into nv50 code.
 * Resolves dst/src regs per component, handles saturation and the
 * broadcast register for scalar-result ops, dispatches per opcode,
 * then frees per-instruction resources (immediates, indirect consts,
 * temp temps) and clears source modifiers.
 * Returns FALSE on an unhandled opcode.
 */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->Dst[0].Register.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	/* resolve destinations; pc->r_dst overrides (set by
	 * nv50_tgsi_insn when writes must go to temps first)
	 */
	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	/* resolve sources; only components the opcode actually reads */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->Src[i];
		unsigned src_mask;
		boolean neg_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		neg_supp = negate_supported(inst, i);

		/* TEX/TXP always carry a sampler src, setting unit */
		if (fs->Register.File == TGSI_FILE_SAMPLER)
			unit = fs->Register.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
	}

	/* scalar ops write to brdc once, then broadcast to dst[] */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
				continue;
			/* rdst[c] = dst[c]; */ /* done above */
			dst[c] = temp_temp(pc);
		}
	}

	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_abs(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_ADD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_add(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_AND:
	case TGSI_OPCODE_XOR:
	case TGSI_OPCODE_OR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
				    inst->Instruction.Opcode);
		}
		break;
	case TGSI_OPCODE_ARL:
		assert(src[0][0]);
		temp = temp_temp(pc);
		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
		emit_arl(pc, dst[0], temp, 4);
		break;
	case TGSI_OPCODE_BGNLOOP:
		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_BRK:
		emit_branch(pc, -1, 0, NULL);
		assert(pc->loop_lvl > 0);
		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
		break;
	case TGSI_OPCODE_CEIL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_CMP:
		pc->allow32 = FALSE;
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
			emit_mov(pc, dst[c], src[1][c]);
			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
			emit_mov(pc, dst[c], src[2][c]);
			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
		}
		break;
	case TGSI_OPCODE_COS:
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 5, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 5, brdc, temp);
		break;
	case TGSI_OPCODE_DDX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddx(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DDY:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddy(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DP3:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
		break;
	case TGSI_OPCODE_DP4:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
		break;
	case TGSI_OPCODE_DPH:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_add(pc, brdc, src[1][3], temp);
		break;
	case TGSI_OPCODE_DST:
		if (mask & (1 << 1))
			emit_mul(pc, dst[1], src[0][1], src[1][1]);
		if (mask & (1 << 2))
			emit_mov(pc, dst[2], src[0][2]);
		if (mask & (1 << 3))
			emit_mov(pc, dst[3], src[1][3]);
		if (mask & (1 << 0))
			emit_mov_immdval(pc, dst[0], 1.0f);
		break;
	case TGSI_OPCODE_ELSE:
		emit_branch(pc, -1, 0, NULL);
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_ENDIF:
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;

		/* try to replace branch over 1 insn with a predicated insn */
		if (nv50_kill_branch(pc) == TRUE)
			break;

		if (pc->br_join[pc->if_lvl]) {
			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
			pc->br_join[pc->if_lvl] = NULL;
		}
		terminate_mbb(pc);
		/* emit a NOP as join point, we could set it on the next
		 * one, but would have to make sure it is long and !immd
		 */
		emit_nop(pc);
		pc->p->exec_tail->inst[1] |= 2;
		break;
	case TGSI_OPCODE_ENDLOOP:
		emit_branch(pc, -1, 0, NULL);
		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_EX2:
		emit_preex2(pc, temp, src[0][0]);
		emit_flop(pc, 6, brdc, temp);
		break;
	case TGSI_OPCODE_FLR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_FRC:
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, temp, src[0][c]);
			emit_sub(pc, dst[c], src[0][c], temp);
		}
		break;
	case TGSI_OPCODE_IF:
		/* emitting a join_at may not be necessary */
		assert(pc->if_lvl < MAX_IF_DEPTH);
		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
			 CVT_F32_F32);
		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_KIL:
		emit_kil(pc, src[0][0]);
		emit_kil(pc, src[0][1]);
		emit_kil(pc, src[0][2]);
		emit_kil(pc, src[0][3]);
		break;
	case TGSI_OPCODE_LIT:
		emit_lit(pc, &dst[0], mask, &src[0][0]);
		break;
	case TGSI_OPCODE_LG2:
		emit_flop(pc, 3, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_LRP:
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, temp, src[1][c], src[2][c]);
			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MIN:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MOV:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mov(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_MUL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mul(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_POW:
		emit_pow(pc, brdc, src[0][0], src[1][0]);
		break;
	case TGSI_OPCODE_RCP:
		emit_flop(pc, 0, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_RSQ:
		emit_flop(pc, 2, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_SCS:
		temp = temp_temp(pc);
		if (mask & 3)
			emit_precossin(pc, temp, src[0][0]);
		if (mask & (1 << 0))
			emit_flop(pc, 5, dst[0], temp);
		if (mask & (1 << 1))
			emit_flop(pc, 4, dst[1], temp);
		if (mask & (1 << 2))
			emit_mov_immdval(pc, dst[2], 0.0);
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_SIN:
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 4, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 4, brdc, temp);
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_SUB:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_TEX:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->Texture.Texture, FALSE);
		break;
	case TGSI_OPCODE_TXP:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->Texture.Texture, TRUE);
		break;
	case TGSI_OPCODE_TRUNC:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_XPD:
		temp = temp_temp(pc);
		if (mask & (1 << 0)) {
			emit_mul(pc, temp, src[0][2], src[1][1]);
			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
		}
		if (mask & (1 << 1)) {
			emit_mul(pc, temp, src[0][0], src[1][2]);
			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
		}
		if (mask & (1 << 2)) {
			emit_mul(pc, temp, src[0][1], src[1][0]);
			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
		}
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_END:
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
		return FALSE;
	}

	/* broadcast the scalar result, saturating first if requested */
	if (brdc) {
		if (sat)
			emit_sat(pc, brdc, brdc);
		for (c = 0; c < 4; c++)
			if ((mask & (1 << c)) && dst[c] != brdc)
				emit_mov(pc, dst[c], brdc);
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* In this case we saturate later, and dst[c] won't
			 * be another temp_temp (and thus lost), since rdst
			 * already is TEMP (see above). */
			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* clear src modifiers, free per-insn immediates and
	 * indirect-constant regs allocated by tgsi_src
	 */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		for (c = 0; c < 4; c++) {
			if (!src[i][c])
				continue;
			src[i][c]->mod = 0;
			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
				FREE(src[i][c]);
			else
			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
				FREE(src[i][c]); /* indirect constant */
		}
	}

	kill_temp_temp(pc);
	return TRUE;
}
2264
/* Pre-pass over one TGSI instruction: record the instruction number
 * (pc->insn_nr) as last-access time (.acc) for every TEMP/OUTPUT
 * written and every TEMP/INPUT component read, so register allocation
 * later knows value lifetimes.
 */
static void
prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
{
	struct nv50_reg *reg = NULL;
	const struct tgsi_full_src_register *src;
	const struct tgsi_dst_register *dst;
	unsigned i, c, k, mask;

	dst = &insn->Dst[0].Register;
	mask = dst->WriteMask;

	if (dst->File == TGSI_FILE_TEMPORARY)
		reg = pc->temp;
	else
	if (dst->File == TGSI_FILE_OUTPUT)
		reg = pc->result;

	if (reg) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			reg[dst->Index * 4 + c].acc = pc->insn_nr;
		}
	}

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		src = &insn->Src[i];

		if (src->Register.File == TGSI_FILE_TEMPORARY)
			reg = pc->temp;
		else
		if (src->Register.File == TGSI_FILE_INPUT)
			reg = pc->attr;
		else
			continue;

		/* only components the opcode actually reads */
		mask = nv50_tgsi_src_mask(insn, i);

		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* account for the source swizzle */
			k = tgsi_util_get_full_src_register_swizzle(src, c);

			reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
		}
	}
}
2312
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i] (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* BUGFIX: unsafe must start at 0 — it was previously read
	 * uninitialized by the |= accumulation below (undefined
	 * behavior, garbage bits returned for dependency-free insns).
	 */
	unsigned i, c, x, unsafe = 0;

	/* start with the identity write order */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2357
/* Select a suitable dst register for broadcasting scalar results,
 * or return NULL if we have to allocate an extra TEMP.
 *
 * If e.g. only 1 component is written, we may also emit the final
 * result to a write-only register.
 */
static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
		   const struct tgsi_full_dst_register *fd, unsigned mask)
{
	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
		/* pick a written component that is not also read (mask
		 * holds the dst comps occurring among the sources)
		 */
		int c = ffs(~mask & fd->Register.WriteMask);
		if (c)
			return tgsi_dst(pc, c - 1, fd);
	} else {
		/* non-TEMP files may be write-only: usable only if
		 * exactly one component is written
		 */
		int c = ffs(fd->Register.WriteMask) - 1;
		if ((1 << c) == fd->Register.WriteMask)
			return tgsi_dst(pc, c, fd);
	}

	return NULL;
}
2380
/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->Src[i];
		/* only a source aliasing the destination is dangerous */
		if (fs->Register.File != fd->Register.File ||
		    fs->Register.Index != fd->Register.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			if (!(fd->Register.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}
2428
/* Emit one TGSI instruction, handling src/dst aliasing hazards:
 * scalar-result ops get a broadcast reg; vector ops whose dst comps
 * are also sources are emitted component-wise in a safe order, with
 * unresolvable components redirected through temps (pc->r_dst) and
 * copied (or saturated) into place afterwards.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.Dst[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst component is read as a source: emit directly */
	if (!deqs)
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time in the reordered sequence */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.Dst[0].Register.WriteMask =
			fd->Register.WriteMask & (1 << m[i]);

		if (!insn.Dst[0].Register.WriteMask)
			continue;

		/* unresolved dependency: write to a temp instead */
		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* move the temp-redirected results to their real destinations */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
2483
/* Emit the interpolation instruction for fragment input @reg,
 * loading the shared reciprocal-w value first if the input uses
 * perspective interpolation and 1/w isn't loaded yet.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	/* centroid and non-centroid interpolation each cache their
	 * own 1/w value (iv_c resp. iv_p)
	 */
	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		iv = *ppiv = alloc_temp(pc, NULL);
		/* w input: last of the enabled position components
		 * (position interp mask lives in cfg.regs[1] bits 24+)
		 */
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		/* interpolate w, then take the reciprocal (flop 0) */
		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, 0, iv, iv);

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	emit_interp(pc, reg, iv, mode);
}
2507
/* The face input is always at v[255] (varying space), with a
 * value of 0 for back-facing, and 0xffffffff for front-facing.
 */
static void
load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
{
	/* ANDing the all-ones pattern with the bits of 1.0f yields
	 * 1.0f for front-facing and 0.0f for back-facing
	 */
	struct nv50_reg *one = alloc_immd(pc, 1.0f);

	assert(a->rhw == -1);
	alloc_reg(pc, a); /* do this before rhw is set */
	a->rhw = 255;
	load_interpolant(pc, a);
	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);

	FREE(one);
}
2524
/* First translation pass: walk the TGSI token stream once to record
 * immediates, interpolation modes and special output semantics, then
 * assign hardware register/varying indices for inputs and outputs
 * and fill in the shader-type specific parts of p->cfg.
 * Returns FALSE on unsupported declarations or allocation failure.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				  imm->u[1].Float,
				  imm->u[2].Float,
				  imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->Range.First;
			last = d->Range.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				/* record special VP outputs; they are
				 * moved behind the generic ones, so
				 * shrink io_nr accordingly
				 */
				si = d->Semantic.Index;
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_BCOLOR:
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					/* add position's w to the
					 * perspective interp mask
					 */
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_ADDRESS:
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack used attributes into consecutive hw slots and
		 * record them in the VP_ATTR_EN bitfields
		 */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* assign consecutive hw slots to used result components */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve the TGSI indices recorded above (< 0x40 means
		 * present) to the packed io entries
		 */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		pc->allow32 = TRUE;

		/* base == 0 iff input 0 is the position */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT)
				p->cfg.io[m++].id = i;
			else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n++].id = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* hand out interpolant ids and emit the loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id;

			if (p->info.input_semantic_name[n] ==
			    TGSI_SEMANTIC_FACE) {
				load_frontfacing(pc, &pc->attr[i * 4]);
				continue;
			}

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id];
			si = p->info.input_semantic_index[p->cfg.io[i].id];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;

		p->cfg.high_result = rid;

		/* separate/different colour results for MRTs ? */
		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
			p->cfg.regs[2] |= 1;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* release the cached 1/w temporaries allocated by
	 * load_interpolant during the fragment input loads
	 */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
2790
2791 static void
2792 free_nv50_pc(struct nv50_pc *pc)
2793 {
2794 if (pc->immd)
2795 FREE(pc->immd);
2796 if (pc->param)
2797 FREE(pc->param);
2798 if (pc->result)
2799 FREE(pc->result);
2800 if (pc->attr)
2801 FREE(pc->attr);
2802 if (pc->temp)
2803 FREE(pc->temp);
2804
2805 FREE(pc);
2806 }
2807
/* Initialize the translation context from the TGSI scan info:
 * derive register-file sizes, set shader-type specific defaults in
 * p->cfg and allocate the temp/attr/result/param/addr register
 * arrays.  Returns FALSE on allocation failure (caller frees pc).
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
	assert(pc->addr_nr <= 2);

	p->cfg.high_temp = 4;

	/* 0x40 is used throughout as "not present" marker */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		/* FP inputs are interpolated into and results kept in
		 * plain temporary registers
		 */
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	if (pc->addr_nr) {
		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
		if (!pc->addr)
			return FALSE;
	}
	/* hardware address regs $a1..$a4 ($a0 stays reserved) */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);

	return TRUE;
}
2899
2900 static void
2901 nv50_fp_move_results(struct nv50_pc *pc)
2902 {
2903 struct nv50_reg reg;
2904 unsigned i;
2905
2906 ctor_reg(&reg, P_TEMP, -1, -1);
2907
2908 for (i = 0; i < pc->result_nr * 4; ++i) {
2909 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2910 continue;
2911 if (pc->result[i].rhw != pc->result[i].hw) {
2912 reg.hw = pc->result[i].rhw;
2913 emit_mov(pc, &reg, &pc->result[i]);
2914 }
2915 }
2916 }
2917
/* Post-process the emitted instruction list: ensure the last
 * instruction is 64 bit and carries the exit flag, and ensure no
 * 32 bit instruction sits alone on an odd word boundary, adjusting
 * collected branch targets as instructions are lengthened.
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, **bra_list;
	unsigned i, n, pos;

	/* NOTE(review): CALLOC result is not checked; on allocation
	 * failure the stores below would dereference NULL.
	 */
	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust their offsets
	 * when converting 32 bit instructions to 64 bit ones
	 */
	for (n = 0, e = pc->p->exec_head; e; e = e->next)
		if (e->param.index >= 0 && !e->param.mask)
			bra_list[n++] = e;

	/* last instruction must be long so it can have the exit bit set */
	if (!is_long(pc->p->exec_tail))
		convert_to_long(pc, pc->p->exec_tail);
	/* set exit bit */
	pc->p->exec_tail->inst[1] |= 1;

	/* !immd on exit insn simultaneously means !join */
	assert(!is_immd(pc->p->exec_head));
	assert(!is_immd(pc->p->exec_tail));

	/* Make sure we don't have any single 32 bit instructions. */
	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
		pos += is_long(e) ? 2 : 1;

		/* a lone short insn ends on an odd word: lengthen it
		 * and shift all branch targets beyond it by one word
		 */
		if ((pos & 1) && (!e->next || is_long(e->next))) {
			for (i = 0; i < n; ++i)
				if (bra_list[i]->param.index >= pos)
					bra_list[i]->param.index += 1;
			convert_to_long(pc, e);
			++pos;
		}
	}

	FREE(bra_list);
}
2958
/* Main translation entry point: set up a translation context, run the
 * preparation pass, then translate every TGSI instruction into nv50
 * code, finally fixing up instruction sizes and transferring the
 * parameter/immediate data to the program.  Returns FALSE on failure.
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	/* pass 1: declarations, register assignment, interpolant loads */
	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	/* pass 2: translate the instructions */
	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* don't allow half insn/immd on first and last instruction */
		pc->allow32 = TRUE;
		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
			pc->allow32 = FALSE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	if (pc->p->type == PIPE_SHADER_FRAGMENT)
		nv50_fp_move_results(pc);

	nv50_program_fixup_insns(pc);

	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	/* ownership of the immediate buffer moves to the program */
	p->immd = pc->immd_buf;

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}
3017
3018 static void
3019 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
3020 {
3021 if (nv50_program_tx(p) == FALSE)
3022 assert(0);
3023 p->translated = TRUE;
3024 }
3025
3026 static void
3027 nv50_program_upload_data(struct nv50_context *nv50, float *map,
3028 unsigned start, unsigned count, unsigned cbuf)
3029 {
3030 struct nouveau_channel *chan = nv50->screen->base.channel;
3031 struct nouveau_grobj *tesla = nv50->screen->tesla;
3032
3033 while (count) {
3034 unsigned nr = count > 2047 ? 2047 : count;
3035
3036 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3037 OUT_RING (chan, (cbuf << 0) | (start << 8));
3038 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3039 OUT_RINGp (chan, map, nr);
3040
3041 map += nr;
3042 start += nr;
3043 count -= nr;
3044 }
3045 }
3046
/* Upload the program's immediates (only when they have no heap slot
 * yet) and its constant-buffer parameters to the hardware.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* heap full: evict other programs' immediates
			 * until the allocation can succeed
			 */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	assert(p->param_nr <= 512);

	if (p->param_nr) {
		unsigned cb;
		/* NOTE(review): map is not checked for NULL before use */
		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
					     PIPE_BUFFER_USAGE_CPU_READ);

		/* constants go to the per-shader-type constant buffer */
		if (p->type == PIPE_SHADER_VERTEX)
			cb = NV50_CB_PVP;
		else
			cb = NV50_CB_PFP;

		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}
3087
/* Patch instruction parameters (constant-buffer indices and branch
 * targets) into the raw opcodes and upload them to the program's
 * buffer object.  Upload happens on first use or when the immediate
 * data moved within its heap.
 */
static void
nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nv50_program_exec *e;
	uint32_t *up, i;
	boolean upload = FALSE;

	if (!p->bo) {
		/* NOTE(review): nouveau_bo_new return value unchecked */
		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
			       p->exec_size * 4, &p->bo);
		upload = TRUE;
	}

	/* immediates moved in the heap: cb indices must be re-patched */
	if (p->data[0] && p->data[0]->start != p->data_start[0])
		upload = TRUE;

	if (!upload)
		return;

	/* NOTE(review): MALLOC result is not checked before use */
	up = MALLOC(p->exec_size * 4);

	for (i = 0, e = p->exec_head; e; e = e->next) {
		unsigned ei, ci, bs;

		if (e->param.index >= 0 && e->param.mask) {
			/* non-zero mask: param is a constant index;
			 * bs selects the buffer from inst[1]
			 */
			bs = (e->inst[1] >> 22) & 0x07;
			assert(bs < 2);
			ei = e->param.shift >> 5;
			ci = e->param.index;
			if (bs == 0) /* relocate into the immd heap */
				ci += p->data[bs]->start;

			e->inst[ei] &= ~e->param.mask;
			e->inst[ei] |= (ci << e->param.shift);
		} else
		if (e->param.index >= 0) {
			/* zero mask means param is a jump/branch offset */
			assert(!(e->param.index & 1));
			/* seem to be 8 byte steps */
			ei = (e->param.index >> 1) + 0 /* START_ID */;

			e->inst[0] &= 0xf0000fff;
			e->inst[0] |= ei << 12;
		}

		up[i++] = e->inst[0];
		if (is_long(e))
			up[i++] = e->inst[1];
	}
	assert(i == p->exec_size);

	if (p->data[0])
		p->data_start[0] = p->data[0]->start;

#ifdef NV50_PROGRAM_DUMP
	NOUVEAU_ERR("-------\n");
	for (e = p->exec_head; e; e = e->next) {
		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
		if (is_long(e))
			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
	}
#endif
	/* push the code through the 2D engine's image-from-cpu path */
	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
			 0, 0, p->exec_size * 4, 1, 1);

	FREE(up);
}
3158
/* Translate the bound vertex program if necessary, upload its data
 * and code, and emit the state object binding it to the hardware.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]);
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
3194
/* Translate the bound fragment program if necessary, upload its data
 * and code, and emit the state object binding it to the hardware.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
3230
/* Build the POINT_COORD_REPLACE_MAP words: for each FP generic input
 * that either has no matching VP output (assumed PointCoord) or whose
 * semantic index has sprite coords enabled, record which coordinate
 * component (1..4) replaces each interpolant slot starting at @base.
 */
static void
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
	struct nv50_program *fp = nv50->fragprog;
	struct nv50_program *vp = nv50->vertprog;
	unsigned i, c, m = base;

	/* XXX: this might not work correctly in all cases yet - we'll
	 * just assume that an FP generic input that is not written in
	 * the VP is PointCoord.
	 */
	memset(pntc, 0, 8 * sizeof(uint32_t));

	for (i = 0; i < fp->cfg.io_nr; i++) {
		uint8_t sn, si;
		uint8_t j, k = fp->cfg.io[i].id;
		unsigned n = popcnt4(fp->cfg.io[i].mask);

		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
			m += n; /* not replaceable, skip its slots */
			continue;
		}

		/* look for the VP output feeding this FP input */
		for (j = 0; j < vp->info.num_outputs; ++j) {
			sn = vp->info.output_semantic_name[j];
			si = vp->info.output_semantic_index[j];

			if (sn == fp->info.input_semantic_name[k] &&
			    si == fp->info.input_semantic_index[k])
				break;
		}

		if (j < vp->info.num_outputs) {
			ubyte mode =
				nv50->rasterizer->pipe.sprite_coord_mode[si];

			if (mode == PIPE_SPRITE_COORD_NONE) {
				m += n; /* fed by VP, no replacement */
				continue;
			}
		}

		/* this is either PointCoord or replaced by sprite coords */
		for (c = 0; c < 4; c++) {
			if (!(fp->cfg.io[i].mask & (1 << c)))
				continue;
			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
			++m;
		}
	}
}
3282
3283 static int
3284 nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3285 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3286 {
3287 int c;
3288 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3289 uint8_t *map = (uint8_t *)p_map;
3290
3291 for (c = 0; c < 4; ++c) {
3292 if (mf & 1) {
3293 if (fpi->linear == TRUE)
3294 lin[mid / 32] |= 1 << (mid % 32);
3295 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3296 }
3297
3298 oid += mv & 1;
3299 mf >>= 1;
3300 mv >>= 1;
3301 }
3302
3303 return mid;
3304 }
3305
/* Build the state object linking VP outputs to FP inputs: the VP
 * result map, the semantic map registers, the FP interpolant control
 * word, the linear-interpolation flags and (with point sprites) the
 * point-coord replacement map.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5], pcrd[8];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	dummy.mask = 0x0;

	/* clip distances follow position in the result map */
	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		vpo = &vp->cfg.two_side[0];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* map the remaining FP inputs to the VP outputs feeding them */
	for (i = 0; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id];

		/* position must be mapped first */
		assert(i == 0 || sn != TGSI_SEMANTIC_POSITION);

		/* maybe even remove these from cfg.io */
		if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE)
			continue;

		/* VP outputs and vp->cfg.io are in the same order */
		for (n = 0; n < vp->info.num_outputs; ++n) {
			if (vp->info.output_semantic_name[n] == sn &&
			    vp->info.output_semantic_index[n] == si)
				break;
		}
		vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy;

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	if (nv50->rasterizer->pipe.point_size_per_vertex) {
		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
		reg[3] = (m++ << 4) | 1;
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4;
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	so_method(so, tesla, 0x1540, 4);
	so_datap (so, lin, 4);

	if (nv50->rasterizer->pipe.point_sprite) {
		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);

		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
		so_datap (so, pcrd, 8);
	}

	so_ref(so, &nv50->state.programs);
	so_ref(NULL, &so);
}
3406
3407 void
3408 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3409 {
3410 while (p->exec_head) {
3411 struct nv50_program_exec *e = p->exec_head;
3412
3413 p->exec_head = e->next;
3414 FREE(e);
3415 }
3416 p->exec_tail = NULL;
3417 p->exec_size = 0;
3418
3419 nouveau_bo_ref(NULL, &p->bo);
3420
3421 nouveau_resource_free(&p->data[0]);
3422
3423 p->translated = 0;
3424 }