gallium: try and update r300 and nv drivers for tgsi changes
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
34 #define NV50_SU_MAX_TEMP 127
35 #define NV50_SU_MAX_ADDR 4
36 //#define NV50_PROGRAM_DUMP
37
38 /* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40 /* ARL - gallium craps itself on progs/vp/arl.txt
41 *
42 * MSB - Like MAD, but MUL+SUB
 *	 - Drop this approach; instead introduce a way to negate args for
 *	   ops that support it.
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected - and force disable them where they
54 * don't work - MUL has it forcibly disabled atm as it fixes POW..
55 *
 * WARNING: watch dst==src vectors, can overwrite components that are needed.
57 * ie. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * FP attr/result assignment - how?
61 * attrib
62 * - 0x16bc maps vp output onto fp hpos
63 * - 0x16c0 maps vp output onto fp col0
64 * result
65 * - colr always 0-3
66 * - depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * - 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * - XX == FP high something
78 */
/* Descriptor for a value during shader translation: which hardware
 * resource class it lives in, its TGSI-side index, and the hardware
 * slot assigned to it (if any).
 */
struct nv50_reg {
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD,
		P_ADDR
	} type;
	int index; /* TGSI index; -1 for internal/scratch registers */

	int hw; /* allocated hardware register index, -1 if unassigned */
	int mod; /* NV50_MOD_* modifier flags for the current use */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
96
/* source/destination modifier flags stored in nv50_reg::mod */
#define NV50_MOD_NEG 1
#define NV50_MOD_ABS 2
#define NV50_MOD_SAT 4

/* arbitrary limits */
#define MAX_IF_DEPTH 4
#define MAX_LOOP_DEPTH 4
104
/* Per-shader translation context: tracks hardware register usage and
 * the nv50_reg arrays mirroring the TGSI declaration space while a
 * program is being converted to hardware instructions.
 */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; /* occupancy map of hw temps */
	struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; /* hw address registers */

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf; /* flat array of immediates, 4 floats per entry */
	int immd_nr;
	struct nv50_reg **addr;
	int addr_nr;

	/* short-lived scratch temporaries, freed after each instruction */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* records of branch instructions to be patched later */
	struct nv50_program_exec *if_cond;
	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
	int if_lvl, loop_lvl;
	unsigned loop_pos[MAX_LOOP_DEPTH];

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* whether short (32-bit) encodings may be emitted */
};
152
153 static INLINE void
154 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
155 {
156 reg->type = type;
157 reg->index = index;
158 reg->hw = hw;
159 reg->mod = 0;
160 reg->rhw = -1;
161 reg->acc = 0;
162 }
163
/* Count the set bits in the low nibble of val (bits above bit 3 are
 * ignored).
 */
static inline unsigned
popcnt4(uint32_t val)
{
	unsigned bits = val & 0xf;
	unsigned n = 0;

	while (bits) {
		n += bits & 1;
		bits >>= 1;
	}
	return n;
}
171
172 static void
173 terminate_mbb(struct nv50_pc *pc)
174 {
175 int i;
176
177 /* remove records of temporary address register values */
178 for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
179 if (pc->r_addr[i].index < 0)
180 pc->r_addr[i].rhw = -1;
181 }
182
/* Ensure reg has a hardware register index, allocating one if needed,
 * and keep the program's high-water marks (high_result/high_temp)
 * up to date.  Only P_TEMP regs are actually allocated here.
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 * not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first-fit search for a free hardware temp */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of hardware temporaries */
	assert(0);
}
231
/* XXX: For shaders that aren't executed linearly (e.g. shaders that
 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
 * lest we risk temp_temps overwriting regs alloc'd "later".
 */
/* Return a temporary register: if dst is an unallocated P_TEMP it is
 * returned directly (its hw slot will be bound on first use), otherwise
 * a fresh scratch reg bound to the first free hw temp is created.
 * Caller owns (and must free) a returned scratch reg.
 */
static struct nv50_reg *
alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
{
	struct nv50_reg *r;
	int i;

	if (dst && dst->type == P_TEMP && dst->hw == -1)
		return dst;

	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
		if (!pc->r_temp[i]) {
			r = MALLOC_STRUCT(nv50_reg);
			ctor_reg(r, P_TEMP, -1, i);
			pc->r_temp[i] = r;
			return r;
		}
	}

	/* no free hardware temporary left */
	assert(0);
	return NULL;
}
257
/* Assign the hw of the discarded temporary register src
 * to the tgsi register dst and free src.
 */
static void
assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	/* src must be an allocated scratch reg (index -1, hw assigned) */
	assert(src->index == -1 && src->hw != -1);

	if (dst->hw != -1)
		pc->r_temp[dst->hw] = NULL;
	pc->r_temp[src->hw] = dst;
	dst->hw = src->hw;

	FREE(src);
}
273
/* release the hardware resource held by r */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
{
	assert(r->type == P_TEMP);
	if (r->hw == -1)
		return;

	assert(pc->r_temp[r->hw] == r);
	pc->r_temp[r->hw] = NULL;

	r->acc = 0;
	/* scratch regs (index == -1) are owned by us; free them outright */
	if (r->index == -1)
		FREE(r);
}
289
/* Free r if it is a scratch temporary (index == -1) and release its
 * hw slot; TGSI-backed temps (index >= 0) are left untouched.
 */
static void
free_temp(struct nv50_pc *pc, struct nv50_reg *r)
{
	if (r->index == -1) {
		unsigned hw = r->hw;

		FREE(pc->r_temp[hw]);
		pc->r_temp[hw] = NULL;
	}
}
300
/* Allocate four consecutive scratch temps starting at the first
 * 4-aligned free slot at or after idx; fills dst[0..3].
 * Returns 0 on success, 1 if no suitable quad is available.
 * NOTE(review): the bound uses '>=' so idx+4 == NV50_SU_MAX_TEMP is
 * rejected even though indices idx..idx+3 would still be in range —
 * conservative by one quad, not a correctness problem.
 */
static int
alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
{
	int i;

	if ((idx + 4) >= NV50_SU_MAX_TEMP)
		return 1;

	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
		return alloc_temp4(pc, dst, idx + 4);

	for (i = 0; i < 4; i++) {
		dst[i] = MALLOC_STRUCT(nv50_reg);
		ctor_reg(dst[i], P_TEMP, -1, idx + i);
		pc->r_temp[idx + i] = dst[i];
	}

	return 0;
}
321
/* Release the four temporaries previously obtained via alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	unsigned c;

	for (c = 0; c < 4; ++c)
		free_temp(pc, reg[c]);
}
330
/* Get a short-lived scratch temporary; it is recorded in pc->temp_temp
 * and released collectively by kill_temp_temp() after the current TGSI
 * instruction has been translated.
 */
static struct nv50_reg *
temp_temp(struct nv50_pc *pc)
{
	/* fixed-size pool; more than 16 per instruction is a bug */
	if (pc->temp_temp_nr >= 16)
		assert(0);

	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
	return pc->temp_temp[pc->temp_temp_nr++];
}
340
341 static void
342 kill_temp_temp(struct nv50_pc *pc)
343 {
344 int i;
345
346 for (i = 0; i < pc->temp_temp_nr; i++)
347 free_temp(pc, pc->temp_temp[i]);
348 pc->temp_temp_nr = 0;
349 }
350
/* Append an immediate vec4 (x, y, z, w) to the immediate buffer and
 * return its entry index.
 * NOTE(review): the REALLOC result is not checked; an OOM here would
 * crash on the following store.
 */
static int
ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
{
	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
			       (pc->immd_nr + 1) * 4 * sizeof(float));
	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;

	return pc->immd_nr++;
}
363
/* Return a P_IMMD reg referencing float value f in the immediate
 * buffer, reusing an existing slot if f is already present; otherwise
 * a new vec4 entry (f, -f, 0.5f, 0) is created so related values are
 * available nearby.  Caller must FREE() the returned reg.
 */
static struct nv50_reg *
alloc_immd(struct nv50_pc *pc, float f)
{
	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
	unsigned hw;

	/* scan the flat buffer for an existing occurrence of f */
	for (hw = 0; hw < pc->immd_nr * 4; hw++)
		if (pc->immd_buf[hw] == f)
			break;

	if (hw == pc->immd_nr * 4)
		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;

	ctor_reg(r, P_IMMD, -1, hw);
	return r;
}
380
/* Allocate a new, zeroed instruction record with no parameter
 * reference (param.index == -1).
 */
static struct nv50_program_exec *
exec(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);

	e->param.index = -1;
	return e;
}
389
/* Append instruction e to the program's linked list and account for
 * its size: bit 0 of inst[0] set means a long (two-word) encoding.
 */
static void
emit(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	struct nv50_program *p = pc->p;

	if (p->exec_tail)
		p->exec_tail->next = e;
	if (!p->exec_head)
		p->exec_head = e;
	p->exec_tail = e;
	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
}
402
403 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
404
405 static boolean
406 is_long(struct nv50_program_exec *e)
407 {
408 if (e->inst[0] & 1)
409 return TRUE;
410 return FALSE;
411 }
412
413 static boolean
414 is_immd(struct nv50_program_exec *e)
415 {
416 if (is_long(e) && (e->inst[1] & 3) == 3)
417 return TRUE;
418 return FALSE;
419 }
420
/* Set the predication condition (5 bits at bit 7 of inst[1]) and the
 * predicate register index (2 bits at bit 12); forces long encoding.
 */
static INLINE void
set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
	e->inst[1] |= (pred << 7) | (idx << 12);
}
429
/* Select predicate register idx (bits 4-5 of inst[1]) and enable/disable
 * writing it (bit 6); forces long encoding.
 */
static INLINE void
set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
	    struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
	e->inst[1] |= (idx << 4) | (on << 6);
}
438
/* Promote e to the long (64-bit) encoding, initializing the predicate
 * fields to their defaults (condition 0xf = always, no predicate write).
 */
static INLINE void
set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	if (is_long(e))
		return;

	e->inst[0] |= 1;
	set_pred(pc, 0xf, 0, e);
	set_pred_wr(pc, 0, 0, e);
}
449
/* Encode dst as the destination of e (hw index at bit 2 of inst[0]).
 * Shader outputs set the output flag in inst[1]; hw regs above 63
 * only fit the long encoding.
 */
static INLINE void
set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
{
	if (dst->type == P_RESULT) {
		set_long(pc, e);
		e->inst[1] |= 0x00000008;
	}

	alloc_reg(pc, dst);
	if (dst->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (dst->hw << 2);
}
463
/* Inline the float immediate imm into e (applying ABS/NEG modifiers):
 * low 6 bits of the value go into inst[0] bits 16-21, the rest into
 * inst[1] starting at bit 2; the insn is marked as an immediate form.
 */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	unsigned val;
	float f = pc->immd_buf[imm->hw];

	if (imm->mod & NV50_MOD_ABS)
		f = fabsf(f);
	val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);

	set_long(pc, e);
	/*XXX: can't be predicated - bits overlap.. catch cases where both
	 *     are required and avoid them. */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	e->inst[1] |= 0x00000002 | 0x00000001;
	e->inst[0] |= (val & 0x3f) << 16;
	e->inst[1] |= (val >> 6) << 2;
}
484
/* Encode address register a into e: low 2 bits of its hw index at
 * inst[0] bit 26, remaining bits at inst[1] bit 2.  Asserts that no
 * address register was encoded before.
 */
static INLINE void
set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
{
	assert(!(e->inst[0] & 0x0c000000));
	assert(!(e->inst[1] & 0x00000004));

	e->inst[0] |= (a->hw & 3) << 26;
	e->inst[1] |= (a->hw >> 2) << 2;
}
494
/* Emit "dst = src0 + src1_val" on the address-register path
 * (opcode 0xd0000000 with the immediate at bit 9).
 */
static void
emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
		  struct nv50_reg *src0, uint16_t src1_val)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000 | (src1_val << 9);
	e->inst[1] = 0x20000000;
	set_long(pc, e);
	e->inst[0] |= dst->hw << 2;
	if (src0) /* otherwise will add to $a0, which is always 0 */
		set_addr(e, src0);

	emit(pc, e);
}
510
/* With ref == NULL: reserve an address register for a TGSI ADDR.
 * Otherwise: find (or set up, emitting an add if needed) an address
 * register through which 'ref' can be reached; constants more than
 * 127 slots from the cached base need a new base.
 */
static struct nv50_reg *
alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
{
	int i;
	struct nv50_reg *a_tgsi = NULL, *a = NULL;

	if (!ref) {
		/* allocate for TGSI address reg */
		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
			if (pc->r_addr[i].index >= 0)
				continue;
			if (pc->r_addr[i].rhw >= 0 &&
			    pc->r_addr[i].acc == pc->insn_cur)
				continue;

			pc->r_addr[i].rhw = -1;
			pc->r_addr[i].index = i;
			return &pc->r_addr[i];
		}
		assert(0);
		return NULL;
	}

	/* Allocate and set an address reg so we can access 'ref'.
	 *
	 * If an r_addr has index < 0, it is not reserved for TGSI,
	 * and index will be the negative of the TGSI addr index the
	 * value in rhw is relative to, or -256 if rhw is an offset
	 * from 0.  If rhw < 0, the reg has not been initialized.
	 */
	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
			continue;
		if (pc->r_addr[i].rhw < 0) { /* unused */
			a = &pc->r_addr[i];
			continue;
		}
		if (!a && pc->r_addr[i].acc != pc->insn_cur)
			a = &pc->r_addr[i];

		if (ref->hw - pc->r_addr[i].rhw >= 128)
			continue;

		/* cache hit: an existing base already covers ref */
		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
			pc->r_addr[i].acc = pc->insn_cur;
			return &pc->r_addr[i];
		}
	}
	assert(a);

	if (ref->acc < 0)
		a_tgsi = pc->addr[ref->index];

	/* establish a new base: a = a_tgsi + byte offset of ref's 128-block */
	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);

	a->rhw = ref->hw & ~0x7f;
	a->acc = pc->insn_cur;
	a->index = a_tgsi ? -ref->index : -256;
	return a;
}
572
/* interpolation mode flags */
#define INTERP_LINEAR		0
#define INTERP_FLAT		1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4

/* interpolant index has been stored in dst->rhw */
/* Emit an attribute interpolation instruction (op 0x80000000) writing
 * dst; iv holds the 1/w value for perspective interpolation.
 * NOTE(review): the assert precedes the declaration of e — fine in
 * C99, a constraint violation under strict C90.
 */
static void
emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
	    unsigned mode)
{
	assert(dst->rhw != -1);
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x80000000;
	set_dst(pc, dst, e);
	e->inst[0] |= (dst->rhw << 16);

	if (mode & INTERP_FLAT) {
		e->inst[0] |= (1 << 8);
	} else {
		if (mode & INTERP_PERSPECTIVE) {
			e->inst[0] |= (1 << 25);
			alloc_reg(pc, iv);
			e->inst[0] |= (iv->hw << 9);
		}

		if (mode & INTERP_CENTROID)
			e->inst[0] |= (1 << 24);
	}

	emit(pc, e);
}
605
/* Record a constant-buffer/immediate access for e: src's slot index,
 * the bit position s it is encoded at, and the mask m covering those
 * bits (s % 32 selects the position within the word).  Slots above 127
 * or relocated constants go through an address register.
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw & 127;
	e->param.shift = s;
	e->param.mask = m << (s % 32);

	if (src->hw > 127)
		set_addr(e, alloc_addr(pc, src));
	else
	if (src->acc < 0) {
		assert(src->type == P_CONST);
		set_addr(e, pc->addr[src->index]);
	}

	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
626
/* Emit a MOV (op 0x10000000) from src to dst, choosing among the
 * immediate-inline, constant-load and register-to-register forms.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x10000000;
	if (!pc->allow32)
		set_long(pc, e);

	set_dst(pc, dst, e);

	if (!is_long(e) && src->type == P_IMMD) {
		/* short form with the immediate inlined */
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		/* load from the constant buffer */
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000;
		}

		alloc_reg(pc, src);
		if (src->hw > 63)
			set_long(pc, e);
		e->inst[0] |= (src->hw << 9);
	}

	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
670
/* Move the float constant f into dst via a transient immediate reg. */
static INLINE void
emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
{
	struct nv50_reg *imm = alloc_immd(pc, f);
	emit_mov(pc, dst, imm);
	FREE(imm);
}
678
679 static boolean
680 check_swap_src_0_1(struct nv50_pc *pc,
681 struct nv50_reg **s0, struct nv50_reg **s1)
682 {
683 struct nv50_reg *src0 = *s0, *src1 = *s1;
684
685 if (src0->type == P_CONST) {
686 if (src1->type != P_CONST) {
687 *s0 = src1;
688 *s1 = src0;
689 return TRUE;
690 }
691 } else
692 if (src1->type == P_ATTR) {
693 if (src0->type != P_ATTR) {
694 *s0 = src1;
695 *s1 = src0;
696 return TRUE;
697 }
698 }
699
700 return FALSE;
701 }
702
/* Encode src into operand slot 0 for instructions that only accept a
 * GPR there: anything that is not already a temp is copied into a
 * scratch temp first.
 */
static void
set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
		     struct nv50_program_exec *e)
{
	struct nv50_reg *temp;

	if (src->type != P_TEMP) {
		temp = temp_temp(pc);
		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (src->hw << 9);
}
720
/* Encode src into operand slot 0 (hw index at bit 9 of inst[0]).
 * Attributes can be read directly; constants/immediates are first
 * moved to a scratch temp.
 */
static void
set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		set_long(pc, e);
		e->inst[1] |= 0x00200000;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (src->hw << 9);
}
740
/* Encode src into operand slot 1 (bits 16-22 of inst[0]).  A constant
 * can be read directly here (flag 0x00800000) unless slot 2 already
 * reads one (0x01000000) — the two share the const-access path — in
 * which case it is spilled to a scratch temp.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= ((src->hw & 127) << 16);
}
768
/* Encode src into operand slot 2 (bits 14-20 of inst[1]; long form
 * only).  A constant can be read directly (flag 0x01000000) unless
 * slot 1 already reads one (0x00800000); otherwise spill to a temp.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= ((src->hw & 127) << 14);
}
796
/* Emit dst = src0 * src1 (op 0xc0000000).  A single NEG on either
 * source flips the result sign (two NEGs cancel, hence the XOR).
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		if (src0->mod & NV50_MOD_NEG)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
827
/* Emit dst = src0 + src1 (op 0xb0000000), honoring NEG modifiers.
 * Source modifiers or a high src1 register force the long encoding;
 * the second operand is placed according to its type.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xb0000000;

	alloc_reg(pc, src1);
	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
		set_long(pc, e);
		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
			      ((src1->mod & NV50_MOD_NEG) << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
857
/* Emit an address-register load: dst ($aX) = src shifted left by s
 * (shift amount at bit 16).  src must be a GPR, hence the restricted
 * operand helper.
 */
static void
emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 uint8_t s)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[1] |= 0xc0000000;

	e->inst[0] |= dst->hw << 2;
	e->inst[0] |= s << 16; /* shift left */
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}
873
/* Emit a min/max instruction (op 0xb0000000 with subop at bit 29;
 * callers pass sub == 4 for max, 5 for min — see emit_lit).  ABS
 * modifiers on the sources are encoded; NEG is not handled here.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	if (src0->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;
	if (src1->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00080000;

	emit(pc, e);
}
896
897 static INLINE void
898 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
899 struct nv50_reg *src1)
900 {
901 assert(src0 != src1);
902 src1->mod ^= NV50_MOD_NEG;
903 emit_add(pc, dst, src0, src1);
904 src1->mod ^= NV50_MOD_NEG;
905 }
906
/* Emit a two-operand logic op (AND by default; OR/XOR selected by
 * extra bits).  The short immediate form is used when src1 is an
 * immediate, src0 a temp and 32-bit encodings are allowed.
 */
static void
emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	    struct nv50_reg *src1, unsigned op)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000;
	set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
	    op != TGSI_OPCODE_XOR)
		assert(!"invalid bit op");

	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
		set_immd(pc, src1, e);
		if (op == TGSI_OPCODE_OR)
			e->inst[0] |= 0x0100;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[0] |= 0x8000;
	} else {
		set_src_1(pc, src1, e);
		e->inst[1] |= 0x04000000; /* 32 bit */
		if (op == TGSI_OPCODE_OR)
			e->inst[1] |= 0x4000;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[1] |= 0x8000;
	}

	emit(pc, e);
}
943
/* Emit dst = src0 * src1 + src2 (op 0xe0000000).  A single NEG on
 * src0/src1 negates the product; NEG on src2 negates the addend.
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src2->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}
965
966 static INLINE void
967 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
968 struct nv50_reg *src1, struct nv50_reg *src2)
969 {
970 assert(src2 != src0 && src2 != src1);
971 src2->mod ^= NV50_MOD_NEG;
972 emit_mad(pc, dst, src0, src1, src2);
973 src2->mod ^= NV50_MOD_NEG;
974 }
975
/* Emit a scalar "function" op (0x90000000) with subop 'sub' selecting
 * the function (0 is used as reciprocal by callers, 3/6 as lg2/ex2 in
 * emit_pow — confirm against hw docs).  Subops 0 and 2 only accept a
 * GPR source.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);

	if (sub == 0 || sub == 2)
		set_src_0_restricted(pc, src, e);
	else
		set_src_0(pc, src, e);

	emit(pc, e);
}
997
/* Emit the pre-operation required before an EX2 (subop 6 with flag
 * 0x4000 on the 0xb0000000 op).
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, e);
}
1012
/* Emit the pre-operation (range reduction) required before SIN/COS
 * (subop 6 on the 0xb0000000 op).
 */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	emit(pc, e);
}
1027
/* rounding/operation selectors for the CVT instruction */
#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* 0x04 == 32 bit dst */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG     0x20
#define CVT_RI      0x08

/* Emit a CVT (op 0xa0000000): convert src per cvn/fmt, optionally
 * writing predicate register wp (wp >= 0), and write dst — or discard
 * the result when dst is NULL (predicate-only use, e.g. KIL/SET).
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000; /* 32 bit src */
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination: write to the bit-bucket register */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
1072
/* nv50 Condition codes:
 *  0x1 = LT
 *  0x2 = EQ
 *  0x3 = LE
 *  0x4 = GT
 *  0x5 = NE
 *  0x6 = GE
 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
 *  0x8 = unordered bit (allows NaN)
 */
/* Emit a SET comparison of src0 ccode src1, optionally writing
 * predicate wp, then convert the integer result to float for dst
 * (presumably 0/-1 -> 0.0/1.0 via the abs int->float convert —
 * confirm against hw docs).  If the operands are swapped for operand
 * placement, the condition is mirrored via cc_swapped.
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}
1131
/* Map a TGSI SET* opcode to the nv50 condition code.  SNE uses
 * NE | unordered (0xd) so NaN operands compare as "not equal".
 */
static INLINE unsigned
map_tgsi_setop_cc(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_SLT: return 0x1;
	case TGSI_OPCODE_SGE: return 0x6;
	case TGSI_OPCODE_SEQ: return 0x2;
	case TGSI_OPCODE_SGT: return 0x4;
	case TGSI_OPCODE_SLE: return 0x3;
	case TGSI_OPCODE_SNE: return 0xd;
	default:
		assert(0);
		return 0;
	}
}
1147
/* dst = floor(src), as a round-to-integer float convert */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
}
1153
/* dst = v^e, computed via the log/exp ladder: flop 3, multiply by the
 * exponent, pre-exp fixup, flop 6 (presumably lg2/ex2 — confirm
 * against hw docs).
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, 3, temp, v);
	emit_mul(pc, temp, temp, e);
	emit_preex2(pc, temp, temp);
	emit_flop(pc, 6, dst, temp);

	free_temp(pc, temp);
}
1167
/* dst = |src|, via a float convert with the ABS operation */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}
1173
/* dst = src clamped by the SAT convert operation */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}
1179
/* Expand TGSI LIT: dst.x = 1, dst.y = max(src.x, 0),
 * dst.z = (src.x > 0) ? src.y^clamp(src.w, -128, 128) : 0, dst.w = 1;
 * only the components selected in mask are written.  Uses predicated
 * execution for the z component.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent into the valid range for POW */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1233
/* dst = -src, via a float convert with the NEG flag */
static INLINE void
emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
}
1239
/* Emit TGSI KIL for one component: write predicate 1 from src (with
 * NEG applied), then emit a discard (op 0x00000002) predicated on LT,
 * i.e. the fragment is killed when src < 0.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;
	unsigned cvn = CVT_F32_F32;

	if (src->mod & NV50_MOD_NEG)
		cvn |= CVT_NEG;
	/* write predicate reg */
	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);

	/* conditional discard */
	e = exec(pc);
	e->inst[0] = 0x00000002;
	set_long(pc, e);
	set_pred(pc, 0x1 /* LT */, r_pred, e);
	emit(pc, e);
}
1259
/* Prepare cube-map texture coordinates in t[0..2]: divide each source
 * component by the largest absolute component (max |x|,|y|,|z|,
 * reciprocal via flop 0, then three multiplies).  Source modifiers are
 * saved and restored around the temporary ABS use.
 */
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, boolean proj)
{
	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };

	src[0]->mod |= NV50_MOD_ABS;
	src[1]->mod |= NV50_MOD_ABS;
	src[2]->mod |= NV50_MOD_ABS;

	emit_minmax(pc, 4, t[2], src[0], src[1]);
	emit_minmax(pc, 4, t[2], src[2], t[2]);

	src[0]->mod = mod[0];
	src[1]->mod = mod[1];
	src[2]->mod = mod[2];

	if (proj && 0 /* looks more correct without this */)
		emit_mul(pc, t[2], t[2], src[3]);
	emit_flop(pc, 0, t[2], t[2]);

	emit_mul(pc, t[0], src[0], t[2]);
	emit_mul(pc, t[1], src[1], t[2]);
	emit_mul(pc, t[2], src[2], t[2]);
}
1285
/* Emit a texture fetch (TEX/TXP): load the coordinates into a
 * contiguous group of four temps t[0..3], emit the hardware TEX
 * instruction on them, then move the fetched components into dst[].
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
{
	struct nv50_reg *t[4];
	struct nv50_program_exec *e;

	unsigned c, mode, dim;

	/* number of coordinates the texture target consumes */
	switch (type) {
	case TGSI_TEXTURE_1D:
		dim = 1;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
	case TGSI_TEXTURE_RECT:
		dim = 2;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
		dim = 3;
		break;
	default:
		assert(0);
		break;
	}

	/* some cards need t[0]'s hw index to be a multiple of 4 */
	alloc_temp4(pc, t, 0);

	if (type == TGSI_TEXTURE_CUBE) {
		load_cube_tex_coords(pc, t, src, proj);
	} else
	if (proj) {
		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
			/* sources are still interpolants: re-interpolate
			 * with the reciprocal of src[3] folded in
			 */
			mode = pc->interp_mode[src[0]->index];

			t[3]->rhw = src[3]->rhw;
			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
			emit_flop(pc, 0, t[3], t[3]); /* flop 0 == RCP */

			for (c = 0; c < dim; c++) {
				t[c]->rhw = src[c]->rhw;
				emit_interp(pc, t[c], t[3],
					    (mode | INTERP_PERSPECTIVE));
			}
		} else {
			/* plain projective divide: t[c] = src[c] * 1/src[3] */
			emit_flop(pc, 0, t[3], src[3]);
			for (c = 0; c < dim; c++)
				emit_mul(pc, t[c], src[c], t[3]);

			/* XXX: for some reason the blob sometimes uses MAD:
			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
			 */
		}
	} else {
		for (c = 0; c < dim; c++)
			emit_mov(pc, t[c], src[c]);
	}

	/* hardware TEX instruction (opcode/flag bit encodings) */
	e = exec(pc);
	set_long(pc, e);
	e->inst[0] |= 0xf0000000;
	e->inst[1] |= 0x00000004;
	set_dst(pc, t[0], e);
	e->inst[0] |= (unit << 9); /* sampler unit */

	if (dim == 2)
		e->inst[0] |= 0x00400000;
	else
	if (dim == 3) {
		e->inst[0] |= 0x00800000;
		if (type == TGSI_TEXTURE_CUBE)
			e->inst[0] |= 0x08000000;
	}

	/* write mask is split across the two instruction words */
	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	emit(pc, e);
#if 1
	/* fetched values come back packed into the low temps */
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}
1390
/* Emit a branch, predicated on (pred, cc) if pred >= 0.  If join is
 * non-NULL, a JOIN_AT instruction is emitted in front of the branch
 * and returned through *join; its target gets filled in later.
 */
static void
emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
	    struct nv50_program_exec **join)
{
	struct nv50_program_exec *e = exec(pc);

	if (join) {
		set_long(pc, e);
		e->inst[0] |= 0xa0000002; /* JOIN_AT encoding */
		emit(pc, e);
		*join = e;
		e = exec(pc);
	}

	set_long(pc, e);
	e->inst[0] |= 0x10000002; /* BRA encoding */
	if (pred >= 0)
		set_pred(pc, cc, pred, e);
	emit(pc, e);
}
1411
/* Emit a long-form NOP; used e.g. as a join point on ENDIF
 * (a join point must be a long, non-immediate instruction).
 */
static void
emit_nop(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	e->inst[1] = 0xe0000000;
	emit(pc, e);
}
1422
/* Emit a partial derivative along x (DDX) as a quad-op; the hardware
 * instruction reads src as both operand 0 and operand 2 (the exact
 * bit encodings are opaque here).
 */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP); /* quad-ops only work on temps */

	e->inst[0] = 0xc0140000;
	e->inst[1] = 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}
1439
/* Emit a partial derivative along y (DDY).  The source is negated
 * in place first (skipped when a NEG modifier is already pending, to
 * avoid double negation) — NOTE(review): this clobbers src's register
 * contents; presumably required by the encoding used here.
 */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	if (!(src->mod & NV50_MOD_NEG)) /* ! double negation */
		emit_neg(pc, src, src);

	e->inst[0] = 0xc0150000;
	e->inst[1] = 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}
1459
/* Grow a short (one word) instruction into its long (two word) form.
 * Per opcode (top nibble of inst[0]): m masks the bits to keep in
 * inst[0], q collects the bits to set in inst[1], since some fields
 * sit at different positions in the long encoding.
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++; /* long instructions take two words */

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1507
1508 /* Some operations support an optional negation flag. */
1509 static boolean
1510 negate_supported(const struct tgsi_full_instruction *insn, int i)
1511 {
1512 int s;
1513
1514 switch (insn->Instruction.Opcode) {
1515 case TGSI_OPCODE_DDY:
1516 case TGSI_OPCODE_DP3:
1517 case TGSI_OPCODE_DP4:
1518 case TGSI_OPCODE_MUL:
1519 case TGSI_OPCODE_KIL:
1520 case TGSI_OPCODE_ADD:
1521 case TGSI_OPCODE_SUB:
1522 case TGSI_OPCODE_MAD:
1523 break;
1524 case TGSI_OPCODE_POW:
1525 if (i == 1)
1526 break;
1527 return FALSE;
1528 default:
1529 return FALSE;
1530 }
1531
1532 /* Watch out for possible multiple uses of an nv50_reg, we
1533 * can't use nv50_reg::neg in these cases.
1534 */
1535 for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1536 if (s == i)
1537 continue;
1538 if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
1539 insn->FullSrcRegisters[i].SrcRegister.Index) &&
1540 (insn->FullSrcRegisters[s].SrcRegister.File ==
1541 insn->FullSrcRegisters[i].SrcRegister.File))
1542 return FALSE;
1543 }
1544
1545 return TRUE;
1546 }
1547
/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* x is read if any of xyz is written; w only for w */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 contributes y/z, src1 contributes y/w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1; /* scalar ops read only x */
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_texture *tex;

		assert(insn->Instruction.Texture);
		tex = &insn->InstructionTexture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8; /* TXP also reads w (the divisor) */

		/* restrict to the coords the target actually uses */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* cross product: each dst channel reads the two others */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1611
1612 static struct nv50_reg *
1613 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1614 {
1615 switch (dst->DstRegister.File) {
1616 case TGSI_FILE_TEMPORARY:
1617 return &pc->temp[dst->DstRegister.Index * 4 + c];
1618 case TGSI_FILE_OUTPUT:
1619 return &pc->result[dst->DstRegister.Index * 4 + c];
1620 case TGSI_FILE_ADDRESS:
1621 {
1622 struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c];
1623 if (!r) {
1624 r = alloc_addr(pc, NULL);
1625 pc->addr[dst->DstRegister.Index * 4 + c] = r;
1626 }
1627 assert(r);
1628 return r;
1629 }
1630 case TGSI_FILE_NULL:
1631 return NULL;
1632 default:
1633 break;
1634 }
1635
1636 return NULL;
1637 }
1638
/* Resolve one channel of a TGSI source operand to an nv50_reg,
 * applying the swizzle and the sign mode (keep/abs/negate/-abs).
 * May return a fresh temp (abs, forced negate) or, for indirectly
 * addressed constants, a MALLOC'd reg with acc < 0 that the caller
 * is expected to FREE.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c, swz;

	/* only constants support indirect addressing here */
	if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
		assert(!src->SrcRegister.Indirect);

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_swizzle(src, chan);
	switch (c) {
	case TGSI_SWIZZLE_X:
	case TGSI_SWIZZLE_Y:
	case TGSI_SWIZZLE_Z:
	case TGSI_SWIZZLE_W:
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			if (!src->SrcRegister.Indirect) {
				r = &pc->param[src->SrcRegister.Index * 4 + c];
				break;
			}
			/* Indicate indirection by setting r->acc < 0 and
			 * use the index field to select the address reg.
			 */
			r = MALLOC_STRUCT(nv50_reg);
			swz = tgsi_util_get_src_register_swizzle(
				&src->SrcRegisterInd, 0);
			ctor_reg(r, P_CONST,
				 src->SrcRegisterInd.Index * 4 + swz,
				 src->SrcRegister.Index * 4 + c);
			r->acc = -1;
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			break; /* sampler index handled by the caller */
		case TGSI_FILE_ADDRESS:
			r = pc->addr[src->SrcRegister.Index * 4 + c];
			assert(r);
			break;
		default:
			assert(0);
			break;
		}
		break;
	default:
		assert(0);
		break;
	}

	/* apply the sign mode */
	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR: /* |r| */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE: /* -r */
		if (neg)
			r->mod = NV50_MOD_NEG; /* op takes a free negate */
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET: /* -|r| */
		temp = temp_temp(pc);
		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1729
1730 /* return TRUE for ops that produce only a single result */
1731 static boolean
1732 is_scalar_op(unsigned op)
1733 {
1734 switch (op) {
1735 case TGSI_OPCODE_COS:
1736 case TGSI_OPCODE_DP2:
1737 case TGSI_OPCODE_DP3:
1738 case TGSI_OPCODE_DP4:
1739 case TGSI_OPCODE_DPH:
1740 case TGSI_OPCODE_EX2:
1741 case TGSI_OPCODE_LG2:
1742 case TGSI_OPCODE_POW:
1743 case TGSI_OPCODE_RCP:
1744 case TGSI_OPCODE_RSQ:
1745 case TGSI_OPCODE_SIN:
1746 /*
1747 case TGSI_OPCODE_KIL:
1748 case TGSI_OPCODE_LIT:
1749 case TGSI_OPCODE_SCS:
1750 */
1751 return TRUE;
1752 default:
1753 return FALSE;
1754 }
1755 }
1756
1757 /* Returns a bitmask indicating which dst components depend
1758 * on source s, component c (reverse of nv50_tgsi_src_mask).
1759 */
1760 static unsigned
1761 nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1762 {
1763 if (is_scalar_op(op))
1764 return 0x1;
1765
1766 switch (op) {
1767 case TGSI_OPCODE_DST:
1768 return (1 << c) & (s ? 0xa : 0x6);
1769 case TGSI_OPCODE_XPD:
1770 switch (c) {
1771 case 0: return 0x6;
1772 case 1: return 0x5;
1773 case 2: return 0x3;
1774 case 3: return 0x0;
1775 default:
1776 assert(0);
1777 return 0x0;
1778 }
1779 case TGSI_OPCODE_LIT:
1780 case TGSI_OPCODE_SCS:
1781 case TGSI_OPCODE_TEX:
1782 case TGSI_OPCODE_TXP:
1783 /* these take care of dangerous swizzles themselves */
1784 return 0x0;
1785 case TGSI_OPCODE_IF:
1786 case TGSI_OPCODE_KIL:
1787 /* don't call this function for these ops */
1788 assert(0);
1789 return 0;
1790 default:
1791 /* linear vector instruction */
1792 return (1 << c);
1793 }
1794 }
1795
/* Test whether exec e is predicated with condition code cc; the cc
 * field sits in bits 7..10 of inst[1], and only long, non-immediate
 * instructions carry a predicate at all.
 */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	return ((e->inst[1] & 0x780) == (cc << 7));
}
1803
/* on ENDIF see if we can do "@p0.neu single_op" instead of:
 *	join_at ENDIF
 *	@p0.eq bra ENDIF
 *	single_op
 *	ENDIF: nop.join
 */
static boolean
nv50_kill_branch(struct nv50_pc *pc)
{
	int lvl = pc->if_lvl;

	/* only applies when exactly one insn sits between BRA and here */
	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
		return FALSE;

	/* if ccode == 'true', the BRA is from an ELSE and the predicate
	 * reg may no longer be valid, since we currently always use $p0
	 */
	if (has_pred(pc->if_insn[lvl], 0xf))
		return FALSE;
	assert(pc->if_insn[lvl] && pc->br_join[lvl]);

	/* We'll use the exec allocated for JOIN_AT (as we can't easily
	 * update prev's next); if exec_tail is BRK, update the pointer.
	 */
	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];

	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */

	/* copy the single remaining insn into the JOIN_AT slot */
	*pc->br_join[lvl] = *pc->p->exec_tail;

	FREE(pc->if_insn[lvl]);
	FREE(pc->p->exec_tail);

	pc->p->exec_tail = pc->br_join[lvl];
	pc->p->exec_tail->next = NULL;
	/* 0xd: presumably the NEU condition, cf. comment above */
	set_pred(pc, 0xd, 0, pc->p->exec_tail);

	return TRUE;
}
1844
/* Translate one TGSI instruction into nv50 code.
 * Fetches sources through tgsi_src() (swizzle + sign mode applied),
 * resolves destinations (honouring per-component overrides in
 * pc->r_dst and the scalar broadcast reg pc->r_brdc), dispatches on
 * the opcode, then handles saturation, broadcast of scalar results,
 * and cleanup of temporaries / heap-allocated source regs.
 * Returns FALSE on an unhandled opcode.
 */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	/* resolve destinations; rdst[] keeps the real (final) regs */
	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	/* fetch only the source channels the opcode actually reads */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
		unsigned src_mask;
		boolean neg_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		neg_supp = negate_supported(inst, i);

		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
			unit = fs->SrcRegister.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
	}

	/* scalar ops compute into temp, then broadcast from brdc */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		/* saturate via a temp for non-TEMP dst regs (e.g. outputs) */
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
				continue;
			/* rdst[c] = dst[c]; */ /* done above */
			dst[c] = temp_temp(pc);
		}
	}

	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_abs(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_ADD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_add(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_AND:
	case TGSI_OPCODE_XOR:
	case TGSI_OPCODE_OR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
				    inst->Instruction.Opcode);
		}
		break;
	case TGSI_OPCODE_ARL:
		assert(src[0][0]);
		temp = temp_temp(pc);
		/* floor to s32, then shift into the address reg */
		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
		emit_arl(pc, dst[0], temp, 4);
		break;
	case TGSI_OPCODE_BGNLOOP:
		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_BRK:
		emit_branch(pc, -1, 0, NULL);
		assert(pc->loop_lvl > 0);
		/* target patched at ENDLOOP */
		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
		break;
	case TGSI_OPCODE_CEIL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_CMP:
		pc->allow32 = FALSE;
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* set flags from src0, then select src1/src2 */
			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
			emit_mov(pc, dst[c], src[1][c]);
			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
			emit_mov(pc, dst[c], src[2][c]);
			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
		}
		break;
	case TGSI_OPCODE_COS:
		if (mask & 8) {
			/* w reads its own channel, unlike x/y/z */
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 5, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 5, brdc, temp); /* flop 5 == COS */
		break;
	case TGSI_OPCODE_DDX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddx(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DDY:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddy(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DP3:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
		break;
	case TGSI_OPCODE_DP4:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
		break;
	case TGSI_OPCODE_DPH:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_add(pc, brdc, src[1][3], temp);
		break;
	case TGSI_OPCODE_DST:
		if (mask & (1 << 1))
			emit_mul(pc, dst[1], src[0][1], src[1][1]);
		if (mask & (1 << 2))
			emit_mov(pc, dst[2], src[0][2]);
		if (mask & (1 << 3))
			emit_mov(pc, dst[3], src[1][3]);
		if (mask & (1 << 0))
			emit_mov_immdval(pc, dst[0], 1.0f);
		break;
	case TGSI_OPCODE_ELSE:
		emit_branch(pc, -1, 0, NULL);
		/* patch the IF's branch to jump here */
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_ENDIF:
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;

		/* try to replace branch over 1 insn with a predicated insn */
		if (nv50_kill_branch(pc) == TRUE)
			break;

		if (pc->br_join[pc->if_lvl]) {
			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
			pc->br_join[pc->if_lvl] = NULL;
		}
		terminate_mbb(pc);
		/* emit a NOP as join point, we could set it on the next
		 * one, but would have to make sure it is long and !immd
		 */
		emit_nop(pc);
		pc->p->exec_tail->inst[1] |= 2;
		break;
	case TGSI_OPCODE_ENDLOOP:
		emit_branch(pc, -1, 0, NULL);
		/* back edge to BGNLOOP; BRK exits to here */
		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_EX2:
		emit_preex2(pc, temp, src[0][0]);
		emit_flop(pc, 6, brdc, temp); /* flop 6 == EX2 */
		break;
	case TGSI_OPCODE_FLR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_FRC:
		/* frc(x) = x - floor(x) */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, temp, src[0][c]);
			emit_sub(pc, dst[c], src[0][c], temp);
		}
		break;
	case TGSI_OPCODE_IF:
		/* emitting a join_at may not be necessary */
		assert(pc->if_lvl < MAX_IF_DEPTH);
		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
			 CVT_F32_F32);
		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		terminate_mbb(pc);
		break;
	case TGSI_OPCODE_KIL:
		emit_kil(pc, src[0][0]);
		emit_kil(pc, src[0][1]);
		emit_kil(pc, src[0][2]);
		emit_kil(pc, src[0][3]);
		break;
	case TGSI_OPCODE_LIT:
		emit_lit(pc, &dst[0], mask, &src[0][0]);
		break;
	case TGSI_OPCODE_LG2:
		emit_flop(pc, 3, brdc, src[0][0]); /* flop 3 == LG2 */
		break;
	case TGSI_OPCODE_LRP:
		/* lrp(a,b,c) = a*(b-c) + c */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, temp, src[1][c], src[2][c]);
			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MIN:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MOV:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mov(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_MUL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mul(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_POW:
		emit_pow(pc, brdc, src[0][0], src[1][0]);
		break;
	case TGSI_OPCODE_RCP:
		emit_flop(pc, 0, brdc, src[0][0]); /* flop 0 == RCP */
		break;
	case TGSI_OPCODE_RSQ:
		emit_flop(pc, 2, brdc, src[0][0]); /* flop 2 == RSQ */
		break;
	case TGSI_OPCODE_SCS:
		temp = temp_temp(pc);
		if (mask & 3)
			emit_precossin(pc, temp, src[0][0]);
		if (mask & (1 << 0))
			emit_flop(pc, 5, dst[0], temp); /* cos */
		if (mask & (1 << 1))
			emit_flop(pc, 4, dst[1], temp); /* sin */
		if (mask & (1 << 2))
			emit_mov_immdval(pc, dst[2], 0.0);
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_SIN:
		if (mask & 8) {
			/* w reads its own channel, unlike x/y/z */
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 4, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 4, brdc, temp); /* flop 4 == SIN */
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_SUB:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_TEX:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionTexture.Texture, FALSE);
		break;
	case TGSI_OPCODE_TXP:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionTexture.Texture, TRUE);
		break;
	case TGSI_OPCODE_TRUNC:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_XPD:
		/* cross product via MUL + MSB (mul-sub) per component */
		temp = temp_temp(pc);
		if (mask & (1 << 0)) {
			emit_mul(pc, temp, src[0][2], src[1][1]);
			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
		}
		if (mask & (1 << 1)) {
			emit_mul(pc, temp, src[0][0], src[1][2]);
			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
		}
		if (mask & (1 << 2)) {
			emit_mul(pc, temp, src[0][1], src[1][0]);
			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
		}
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_END:
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
		return FALSE;
	}

	/* broadcast a scalar result to all written components */
	if (brdc) {
		if (sat)
			emit_sat(pc, brdc, brdc);
		for (c = 0; c < 4; c++)
			if ((mask & (1 << c)) && dst[c] != brdc)
				emit_mov(pc, dst[c], brdc);
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* In this case we saturate later, and dst[c] won't
			 * be another temp_temp (and thus lost), since rdst
			 * already is TEMP (see above). */
			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* release heap-allocated source regs (see tgsi_src()) */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		for (c = 0; c < 4; c++) {
			if (!src[i][c])
				continue;
			src[i][c]->mod = 0;
			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
				FREE(src[i][c]);
			else
			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
				FREE(src[i][c]); /* indirect constant */
		}
	}

	kill_temp_temp(pc);
	return TRUE;
}
2258
/* Pre-pass over one instruction: stamp the current instruction number
 * (pc->insn_nr) into the 'acc' field of every temp/output register
 * component written and every temp/input component read — i.e. record
 * the last access (presumably for liveness analysis; not visible here).
 */
static void
prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
{
	struct nv50_reg *reg = NULL;
	const struct tgsi_full_src_register *src;
	const struct tgsi_dst_register *dst;
	unsigned i, c, k, mask;

	dst = &insn->FullDstRegisters[0].DstRegister;
	mask = dst->WriteMask;

	if (dst->File == TGSI_FILE_TEMPORARY)
		reg = pc->temp;
	else
	if (dst->File == TGSI_FILE_OUTPUT)
		reg = pc->result;

	if (reg) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			reg[dst->Index * 4 + c].acc = pc->insn_nr;
		}
	}

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		src = &insn->FullSrcRegisters[i];

		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
			reg = pc->temp;
		else
		if (src->SrcRegister.File == TGSI_FILE_INPUT)
			reg = pc->attr;
		else
			continue;

		/* only channels the opcode actually reads count */
		mask = nv50_tgsi_src_mask(insn, i);

		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			k = tgsi_util_get_full_src_register_swizzle(src, c);

			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
		}
	}
}
2306
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i] (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* 'unsafe' must start at 0: it is only ever OR'd into below,
	 * reading it uninitialized was undefined behavior
	 */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2351
/* Select a suitable dst register for broadcasting scalar results,
 * or return NULL if we have to allocate an extra TEMP.
 *
 * If e.g. only 1 component is written, we may also emit the final
 * result to a write-only register.
 */
static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
		   const struct tgsi_full_dst_register *fd, unsigned mask)
{
	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
		/* pick a written TEMP channel that is not among the
		 * dangerous components (mask, from the swizzle scan)
		 */
		int c = ffs(~mask & fd->DstRegister.WriteMask);
		if (c)
			return tgsi_dst(pc, c - 1, fd);
	} else {
		/* non-TEMP dst: only safe if it is the sole component */
		int c = ffs(fd->DstRegister.WriteMask) - 1;
		if ((1 << c) == fd->DstRegister.WriteMask)
			return tgsi_dst(pc, c, fd);
	}

	return NULL;
}
2374
/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->FullSrcRegisters[i];
		/* only the dst register itself can be corrupted */
		if (fs->SrcRegister.File != fd->DstRegister.File ||
		    fs->SrcRegister.Index != fd->DstRegister.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			/* the referenced component is never written */
			if (!(fd->DstRegister.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}
2422
/* Emit one TGSI instruction, taking care of scalar-result broadcast
 * (pc->r_brdc) and of dst components that also appear among the
 * sources: those are written in a safe order, or redirected to temps
 * (pc->r_dst) and copied back afterwards.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.FullDstRegisters[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst/src overlap: emit in one go */
	if (!deqs)
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit component-by-component in the safe order m[] */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.FullDstRegisters[0].DstRegister.WriteMask =
			fd->DstRegister.WriteMask & (1 << m[i]);

		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
			continue;

		/* unresolvable dependency: divert to a temp */
		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* copy diverted results to their real destinations */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
2477
2478 static void
2479 load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2480 {
2481 struct nv50_reg *iv, **ppiv;
2482 unsigned mode = pc->interp_mode[reg->index];
2483
2484 ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2485 iv = *ppiv;
2486
2487 if ((mode & INTERP_PERSPECTIVE) && !iv) {
2488 iv = *ppiv = alloc_temp(pc, NULL);
2489 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2490
2491 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2492 emit_flop(pc, 0, iv, iv);
2493
2494 /* XXX: when loading interpolants dynamically, move these
2495 * to the program head, or make sure it can't be skipped.
2496 */
2497 }
2498
2499 emit_interp(pc, reg, iv, mode);
2500 }
2501
2502 /* The face input is always at v[255] (varying space), with a
2503 * value of 0 for back-facing, and 0xffffffff for front-facing.
2504 */
2505 static void
2506 load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
2507 {
2508 struct nv50_reg *one = alloc_immd(pc, 1.0f);
2509
2510 assert(a->rhw == -1);
2511 alloc_reg(pc, a); /* do this before rhw is set */
2512 a->rhw = 255;
2513 load_interpolant(pc, a);
2514 emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);
2515
2516 FREE(one);
2517 }
2518
/* First pass over the TGSI token stream: collect immediates, record
 * declaration info (interpolation modes, special output semantics),
 * count instructions, and then assign hardware register/interpolant
 * slots for all shader inputs and outputs.
 *
 * Returns FALSE on bad declarations or allocation failure.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				      imm->u[1].Float,
				      imm->u[2].Float,
				      imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->DeclarationRange.First;
			last = d->DeclarationRange.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				/* only VP outputs with semantic info
				 * need special slot handling here
				 */
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.SemanticIndex;
				switch (d->Semantic.SemanticName) {
				case TGSI_SEMANTIC_BCOLOR:
					/* remember TGSI index for now,
					 * resolved to hw id further down;
					 * 0x40 means "not present"
					 */
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				/* record per-input interpolation mode for
				 * load_interpolant()
				 */
				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					/* need the w component of fcrd */
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_ADDRESS:
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack accessed input components into consecutive hw
		 * attribute slots and record them in the enable mask
		 */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* pack accessed output components; build the per-output
		 * io[] descriptors used later for VP->FP linkage
		 */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id_vp = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve BCOLOR/PSIZE from TGSI output index (stored
		 * above) to the packed hw descriptor/slot
		 */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		pc->allow32 = TRUE;

		/* base == 0 iff input 0 is the position (fcrd) */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT) {
				p->cfg.io[m].id_vp = i + base;
				p->cfg.io[m++].id_fp = i;
			} else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n].id_vp = i + base;
				p->cfg.io[n++].id_fp = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign interpolant hw ids and emit the interpolation
		 * instructions for every accessed input component
		 */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id_fp;

			if (p->info.input_semantic_name[n] ==
			    TGSI_SEMANTIC_FACE) {
				load_frontfacing(pc, &pc->attr[i * 4]);
				continue;
			}

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		/* m = number of position components interpolated */
		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;

		p->cfg.high_result = rid;

		/* separate/different colour results for MRTs ? */
		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
			p->cfg.regs[2] |= 1;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* the 1/w temps are only needed while emitting interpolants */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
2786
2787 static void
2788 free_nv50_pc(struct nv50_pc *pc)
2789 {
2790 if (pc->immd)
2791 FREE(pc->immd);
2792 if (pc->param)
2793 FREE(pc->param);
2794 if (pc->result)
2795 FREE(pc->result);
2796 if (pc->attr)
2797 FREE(pc->attr);
2798 if (pc->temp)
2799 FREE(pc->temp);
2800
2801 FREE(pc);
2802 }
2803
/* Initialize the translation context from the shader's TGSI info:
 * allocate per-file register arrays (temps, attrs, results, params,
 * address regs) and set initial hw config values for the program.
 *
 * Returns FALSE on allocation failure; the caller is expected to run
 * free_nv50_pc() in that case.
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	/* VP: inputs/outputs are real attr/result regs;
	 * FP: both are mapped onto temps (see below)
	 */
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
	/* NOTE(review): only 2 TGSI address regs supported here even
	 * though NV50_SU_MAX_ADDR is 4 — presumably the others are
	 * reserved for internal use; confirm against r_addr setup below
	 */
	assert(pc->addr_nr <= 2);

	p->cfg.high_temp = 4;

	/* 0x40 marks the two-side color outputs as "not present" */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	if (pc->addr_nr) {
		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
		if (!pc->addr)
			return FALSE;
	}
	/* hw address regs start at $a1; $a0 is not used here */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);

	return TRUE;
}
2895
2896 static void
2897 nv50_fp_move_results(struct nv50_pc *pc)
2898 {
2899 struct nv50_reg reg;
2900 unsigned i;
2901
2902 ctor_reg(&reg, P_TEMP, -1, -1);
2903
2904 for (i = 0; i < pc->result_nr * 4; ++i) {
2905 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2906 continue;
2907 if (pc->result[i].rhw != pc->result[i].hw) {
2908 reg.hw = pc->result[i].rhw;
2909 emit_mov(pc, &reg, &pc->result[i]);
2910 }
2911 }
2912 }
2913
/* Post-process the emitted instruction stream:
 * - eliminate isolated short (32 bit) instructions by widening them to
 *   64 bit, fixing up branch target offsets as instructions move
 * - make sure the final instruction is long so the program-end bit
 *   can be set on it
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, *prev = NULL, **bra_list;
	unsigned i, n, pos;

	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust their offsets
	 * when converting 32 bit instructions to 64 bit ones
	 */
	for (n = 0, e = pc->p->exec_head; e; e = e->next)
		if (e->param.index >= 0 && !e->param.mask)
			bra_list[n++] = e;

	/* Make sure we don't have any single 32 bit instructions. */
	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
		pos += is_long(e) ? 2 : 1;

		/* a short insn at an odd position with no short partner
		 * following must be widened; shift later branch targets
		 */
		if ((pos & 1) && (!e->next || is_long(e->next))) {
			for (i = 0; i < n; ++i)
				if (bra_list[i]->param.index >= pos)
					bra_list[i]->param.index += 1;
			convert_to_long(pc, e);
			++pos;
		}
		/* track the second-to-last instruction for the tail fixup */
		if (e->next)
			prev = e;
	}

	assert(!is_immd(pc->p->exec_head));
	assert(!is_immd(pc->p->exec_tail));

	/* last instruction must be long so it can have the end bit set */
	if (!is_long(pc->p->exec_tail)) {
		convert_to_long(pc, pc->p->exec_tail);
		if (prev)
			convert_to_long(pc, prev);
	}
	assert(!(pc->p->exec_tail->inst[1] & 2));
	/* set the end-bit */
	pc->p->exec_tail->inst[1] |= 1;

	FREE(bra_list);
}
2959
/* Translate a whole TGSI program into nv50 hw instructions.
 *
 * Runs the prep pass (register/interpolant assignment), then walks the
 * token stream emitting each instruction, and finally applies the
 * instruction-stream fixups. Ownership of the immediate buffer is
 * transferred from the translation context to the program.
 *
 * Returns FALSE on any failure; partial state is cleaned up.
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* don't allow half insn/immd on first and last instruction */
		pc->allow32 = TRUE;
		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
			pc->allow32 = FALSE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	if (pc->p->type == PIPE_SHADER_FRAGMENT)
		nv50_fp_move_results(pc);

	nv50_program_fixup_insns(pc);

	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	/* hand the immediate data over to the program; free_nv50_pc
	 * does not free immd_buf
	 */
	p->immd = pc->immd_buf;

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}
3018
3019 static void
3020 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
3021 {
3022 if (nv50_program_tx(p) == FALSE)
3023 assert(0);
3024 p->translated = TRUE;
3025 }
3026
3027 static void
3028 nv50_program_upload_data(struct nv50_context *nv50, float *map,
3029 unsigned start, unsigned count, unsigned cbuf)
3030 {
3031 struct nouveau_channel *chan = nv50->screen->base.channel;
3032 struct nouveau_grobj *tesla = nv50->screen->tesla;
3033
3034 while (count) {
3035 unsigned nr = count > 2047 ? 2047 : count;
3036
3037 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3038 OUT_RING (chan, (cbuf << 0) | (start << 8));
3039 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3040 OUT_RINGp (chan, map, nr);
3041
3042 map += nr;
3043 start += nr;
3044 count -= nr;
3045 }
3046 }
3047
/* Make sure the program's constant data is resident:
 * - immediates get a slot in the shared immediate heap (evicting other
 *   programs' immediates if necessary) and are uploaded once
 * - parameters are copied from the bound gallium constant buffer on
 *   every call
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* heap full: evict other programs' immediates
			 * until enough contiguous space is free
			 */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	assert(p->param_nr <= 512);

	if (p->param_nr) {
		unsigned cb;
		/* NOTE(review): map is used unchecked — pipe_buffer_map
		 * failure would crash here; confirm callers guarantee a
		 * mappable constant buffer
		 */
		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
					     PIPE_BUFFER_USAGE_CPU_READ);

		if (p->type == PIPE_SHADER_VERTEX)
			cb = NV50_CB_PVP;
		else
			cb = NV50_CB_PFP;

		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}
3088
/* Upload the program's machine code to VRAM if needed.
 *
 * Re-uploads when the code buffer doesn't exist yet or when the
 * immediate data moved in the heap (constant-buffer indices encoded in
 * the instructions must then be re-relocated). Branch target offsets
 * are also patched in here.
 */
static void
nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nv50_program_exec *e;
	uint32_t *up, i;
	boolean upload = FALSE;

	if (!p->bo) {
		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
			       p->exec_size * 4, &p->bo);
		upload = TRUE;
	}

	/* immediates were reallocated at a different heap offset */
	if (p->data[0] && p->data[0]->start != p->data_start[0])
		upload = TRUE;

	if (!upload)
		return;

	up = MALLOC(p->exec_size * 4);

	for (i = 0, e = p->exec_head; e; e = e->next) {
		unsigned ei, ci, bs;

		if (e->param.index >= 0 && e->param.mask) {
			/* patch a constant-buffer access: offset the
			 * index by the immediate heap start for CB 0
			 */
			bs = (e->inst[1] >> 22) & 0x07;
			assert(bs < 2);
			ei = e->param.shift >> 5;
			ci = e->param.index;
			if (bs == 0)
				ci += p->data[bs]->start;

			e->inst[ei] &= ~e->param.mask;
			e->inst[ei] |= (ci << e->param.shift);
		} else
		if (e->param.index >= 0) {
			/* zero mask means param is a jump/branch offset */
			assert(!(e->param.index & 1));
			/* seem to be 8 byte steps */
			ei = (e->param.index >> 1) + 0 /* START_ID */;

			e->inst[0] &= 0xf0000fff;
			e->inst[0] |= ei << 12;
		}

		up[i++] = e->inst[0];
		if (is_long(e))
			up[i++] = e->inst[1];
	}
	assert(i == p->exec_size);

	/* remember where the immediates were at upload time */
	if (p->data[0])
		p->data_start[0] = p->data[0]->start;

#ifdef NV50_PROGRAM_DUMP
	NOUVEAU_ERR("-------\n");
	for (e = p->exec_head; e; e = e->next) {
		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
		if (is_long(e))
			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
	}
#endif
	/* push the code bytes into the VRAM buffer via the 2D SIFC path */
	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
			 0, 0, p->exec_size * 4, 1, 1);

	FREE(up);
}
3159
/* Validate the bound vertex program: translate it if necessary, upload
 * constants and code, and emit the state object binding it to the hw.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]);
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
3195
/* Validate the bound fragment program: translate it if necessary,
 * upload constants and code, and emit the state object binding it.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
3231
/* Build the POINT_COORD_REPLACE_MAP: for each FP input component that
 * should receive point/sprite coordinates, write the coordinate
 * selector (1..4 = s/t/r/q, presumably) into the corresponding nibble;
 * all other inputs get 0 (no replacement). `base` is the hw id of the
 * first non-position interpolant.
 */
static void
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
	struct nv50_program *fp = nv50->fragprog;
	struct nv50_program *vp = nv50->vertprog;
	unsigned i, c, m = base;

	/* XXX: This can't work correctly in all cases yet, we either
	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
	 * to be per FP input instead of per VP output
	 */
	memset(pntc, 0, 8 * sizeof(uint32_t));

	for (i = 0; i < fp->cfg.io_nr; i++) {
		uint8_t sn, si;
		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
		unsigned n = popcnt4(fp->cfg.io[i].mask);

		/* only GENERIC FP inputs are candidates for replacement */
		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
			m += n;
			continue;
		}

		sn = vp->info.input_semantic_name[j];
		si = vp->info.input_semantic_index[j];

		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
			ubyte mode =
				nv50->rasterizer->pipe.sprite_coord_mode[si];

			if (mode == PIPE_SPRITE_COORD_NONE) {
				m += n;
				continue;
			}
		}

		/* this is either PointCoord or replaced by sprite coords */
		for (c = 0; c < 4; c++) {
			if (!(fp->cfg.io[i].mask & (1 << c)))
				continue;
			/* 4 bits per interpolant in the 8 map words */
			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
			++m;
		}
	}
}
3277
3278 static int
3279 nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3280 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3281 {
3282 int c;
3283 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3284 uint8_t *map = (uint8_t *)p_map;
3285
3286 for (c = 0; c < 4; ++c) {
3287 if (mf & 1) {
3288 if (fpi->linear == TRUE)
3289 lin[mid / 32] |= 1 << (mid % 32);
3290 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3291 }
3292
3293 oid += mv & 1;
3294 mf >>= 1;
3295 mv >>= 1;
3296 }
3297
3298 return mid;
3299 }
3300
/* Build the VP->FP linkage state: the VP result map routes each VP
 * output component to the interpolant slot the FP reads, plus the
 * semantic registers (clip distance, point size, colors) and the
 * linear-interpolation and point-sprite replacement masks.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5], pcrd[8];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	dummy.mask = 0x0;

	/* clip distances follow the position in the result map */
	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		vpo = &vp->cfg.two_side[0];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* map each remaining FP input to the VP output with matching
	 * semantic, or to the dummy (constant 0/1) if there is none
	 */
	i = 0;
	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
		i = 1;
	for (; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];

		n = fp->cfg.io[i].id_vp;
		if (n >= vp->cfg.io_nr ||
		    vp->info.output_semantic_name[n] != sn ||
		    vp->info.output_semantic_index[n] != si)
			vpo = &dummy;
		else
			vpo = &vp->cfg.io[n];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	if (nv50->rasterizer->pipe.point_size_per_vertex) {
		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
		reg[3] = (m++ << 4) | 1; /* bit 0 = enable */
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4; /* number of 32 bit map words used */
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	/* 0x1540: per-interpolant linear interpolation flags */
	so_method(so, tesla, 0x1540, 4);
	so_datap (so, lin, 4);

	if (nv50->rasterizer->pipe.point_sprite) {
		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);

		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
		so_datap (so, pcrd, 8);
	}

	so_ref(so, &nv50->state.programs);
	so_ref(NULL, &so);
}
3397
3398 void
3399 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3400 {
3401 while (p->exec_head) {
3402 struct nv50_program_exec *e = p->exec_head;
3403
3404 p->exec_head = e->next;
3405 FREE(e);
3406 }
3407 p->exec_tail = NULL;
3408 p->exec_size = 0;
3409
3410 nouveau_bo_ref(NULL, &p->bo);
3411
3412 nouveau_resource_free(&p->data[0]);
3413
3414 p->translated = 0;
3415 }