nv50: make address reg allocation a little less hacky
[mesa.git] / src / gallium / drivers / nv50 / nv50_program.c
1 /*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "pipe/p_context.h"
24 #include "pipe/p_defines.h"
25 #include "pipe/p_state.h"
26 #include "pipe/p_inlines.h"
27
28 #include "pipe/p_shader_tokens.h"
29 #include "tgsi/tgsi_parse.h"
30 #include "tgsi/tgsi_util.h"
31
32 #include "nv50_context.h"
33
/* Sizes of the per-program register tables kept in struct nv50_pc. */
#define NV50_SU_MAX_TEMP 127	/* entries in nv50_pc::r_temp[] */
#define NV50_SU_MAX_ADDR 4	/* entries in nv50_pc::r_addr[] */
/* Debug switch, disabled by default. */
//#define NV50_PROGRAM_DUMP
37
38 /* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40 /* ARL - gallium craps itself on progs/vp/arl.txt
41 *
42 * MSB - Like MAD, but MUL+SUB
43 * - Fuck it off, introduce a way to negate args for ops that
44 * support it.
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected - and force disable them where they
54 * don't work - MUL has it forcibly disabled atm as it fixes POW..
55 *
56 * FUCK! watch dst==src vectors, can overwrite components that are needed.
57 * ie. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * FP attr/result assignment - how?
61 * attrib
62 * - 0x16bc maps vp output onto fp hpos
63 * - 0x16c0 maps vp output onto fp col0
64 * result
65 * - colr always 0-3
66 * - depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * - 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * - XX == FP high something
78 */
/* One register operand as tracked by the code generator. */
struct nv50_reg {
	/* register file the operand lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD,
		P_ADDR
	} type;
	/* -1 for regs created by alloc_temp()/alloc_immd() */
	int index;

	/* assigned hw slot; alloc_reg() assigns one to P_TEMP regs when < 0 */
	int hw;
	/* mask of NV50_MOD_* bits */
	int mod;

	/* result hw for FP outputs, or interpolant index */
	int rhw;
	/* instruction where this reg is last read (first insn == 1) */
	int acc;
};

/* Modifier bits stored in nv50_reg::mod. */
#define NV50_MOD_NEG (1 << 0)
#define NV50_MOD_ABS (1 << 1)
#define NV50_MOD_SAT (1 << 2)
100
/* STACK: Conditionals and loops have to use the (per warp) stack.
 * Stack entries consist of an entry type (divergent path, join at),
 * a mask indicating the active threads of the warp, and an address.
 * MPs can store 12 stack entries internally, if we need more (and
 * we probably do), we have to create a stack buffer in VRAM.
 */
/* impose low limits for now */
#define NV50_MAX_COND_NESTING 4
#define NV50_MAX_LOOP_NESTING 3

/* Run the statement 'e' (normally an emit_*() call) and then set bit 1 in
 * the second word of the instruction at pc->p->exec_tail.
 *
 * Requires a 'struct nv50_pc *pc' in scope at the point of use.
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves as
 * a single statement; without it, 'if (c) JOIN_ON(x);' would apply the
 * '|= 2' unconditionally.
 */
#define JOIN_ON(e) do { e; pc->p->exec_tail->inst[1] |= 2; } while (0)
112
113 struct nv50_pc {
114 struct nv50_program *p;
115
116 /* hw resources */
117 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
118 struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
119
120 /* tgsi resources */
121 struct nv50_reg *temp;
122 int temp_nr;
123 struct nv50_reg *attr;
124 int attr_nr;
125 struct nv50_reg *result;
126 int result_nr;
127 struct nv50_reg *param;
128 int param_nr;
129 struct nv50_reg *immd;
130 uint32_t *immd_buf;
131 int immd_nr;
132 struct nv50_reg **addr;
133 int addr_nr;
134 uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */
135
136 struct nv50_reg *temp_temp[16];
137 unsigned temp_temp_nr;
138
139 /* broadcast and destination replacement regs */
140 struct nv50_reg *r_brdc;
141 struct nv50_reg *r_dst[4];
142
143 struct nv50_reg reg_instances[16];
144 unsigned reg_instance_nr;
145
146 unsigned interp_mode[32];
147 /* perspective interpolation registers */
148 struct nv50_reg *iv_p;
149 struct nv50_reg *iv_c;
150
151 struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
152 struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
153 struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
154 int if_lvl, loop_lvl;
155 unsigned loop_pos[NV50_MAX_LOOP_NESTING];
156
157 /* current instruction and total number of insns */
158 unsigned insn_cur;
159 unsigned insn_nr;
160
161 boolean allow32;
162 };
163
164 static INLINE struct nv50_reg *
165 reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
166 {
167 struct nv50_reg *ri;
168
169 assert(pc->reg_instance_nr < 16);
170 ri = &pc->reg_instances[pc->reg_instance_nr++];
171 if (reg) {
172 *ri = *reg;
173 reg->mod = 0;
174 }
175 return ri;
176 }
177
178 static INLINE void
179 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
180 {
181 reg->type = type;
182 reg->index = index;
183 reg->hw = hw;
184 reg->mod = 0;
185 reg->rhw = -1;
186 reg->acc = 0;
187 }
188
/* Count the set bits in the low nibble of val; higher bits are ignored. */
static INLINE unsigned
popcnt4(uint32_t val)
{
	const uint32_t nib = val & 0xf;

	return (nib & 1) + ((nib >> 1) & 1) + ((nib >> 2) & 1) + (nib >> 3);
}
196
197 static void
198 terminate_mbb(struct nv50_pc *pc)
199 {
200 int i;
201
202 /* remove records of temporary address register values */
203 for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
204 pc->r_addr[i].rhw = -1;
205 }
206
207 static void
208 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
209 {
210 int i = 0;
211
212 if (reg->type == P_RESULT) {
213 if (pc->p->cfg.high_result < (reg->hw + 1))
214 pc->p->cfg.high_result = reg->hw + 1;
215 }
216
217 if (reg->type != P_TEMP)
218 return;
219
220 if (reg->hw >= 0) {
221 /*XXX: do this here too to catch FP temp-as-attr usage..
222 * not clean, but works */
223 if (pc->p->cfg.high_temp < (reg->hw + 1))
224 pc->p->cfg.high_temp = reg->hw + 1;
225 return;
226 }
227
228 if (reg->rhw != -1) {
229 /* try to allocate temporary with index rhw first */
230 if (!(pc->r_temp[reg->rhw])) {
231 pc->r_temp[reg->rhw] = reg;
232 reg->hw = reg->rhw;
233 if (pc->p->cfg.high_temp < (reg->rhw + 1))
234 pc->p->cfg.high_temp = reg->rhw + 1;
235 return;
236 }
237 /* make sure we don't get things like $r0 needs to go
238 * in $r1 and $r1 in $r0
239 */
240 i = pc->result_nr * 4;
241 }
242
243 for (; i < NV50_SU_MAX_TEMP; i++) {
244 if (!(pc->r_temp[i])) {
245 pc->r_temp[i] = reg;
246 reg->hw = i;
247 if (pc->p->cfg.high_temp < (i + 1))
248 pc->p->cfg.high_temp = i + 1;
249 return;
250 }
251 }
252
253 assert(0);
254 }
255
256 /* XXX: For shaders that aren't executed linearly (e.g. shaders that
257 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
258 * lest we risk temp_temps overwriting regs alloc'd "later".
259 */
260 static struct nv50_reg *
261 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
262 {
263 struct nv50_reg *r;
264 int i;
265
266 if (dst && dst->type == P_TEMP && dst->hw == -1)
267 return dst;
268
269 for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
270 if (!pc->r_temp[i]) {
271 r = MALLOC_STRUCT(nv50_reg);
272 ctor_reg(r, P_TEMP, -1, i);
273 pc->r_temp[i] = r;
274 return r;
275 }
276 }
277
278 assert(0);
279 return NULL;
280 }
281
282 /* Assign the hw of the discarded temporary register src
283 * to the tgsi register dst and free src.
284 */
285 static void
286 assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
287 {
288 assert(src->index == -1 && src->hw != -1);
289
290 if (dst->hw != -1)
291 pc->r_temp[dst->hw] = NULL;
292 pc->r_temp[src->hw] = dst;
293 dst->hw = src->hw;
294
295 FREE(src);
296 }
297
298 /* release the hardware resource held by r */
299 static void
300 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
301 {
302 assert(r->type == P_TEMP);
303 if (r->hw == -1)
304 return;
305
306 assert(pc->r_temp[r->hw] == r);
307 pc->r_temp[r->hw] = NULL;
308
309 r->acc = 0;
310 if (r->index == -1)
311 FREE(r);
312 }
313
314 static void
315 free_temp(struct nv50_pc *pc, struct nv50_reg *r)
316 {
317 if (r->index == -1) {
318 unsigned hw = r->hw;
319
320 FREE(pc->r_temp[hw]);
321 pc->r_temp[hw] = NULL;
322 }
323 }
324
325 static int
326 alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
327 {
328 int i;
329
330 if ((idx + 4) >= NV50_SU_MAX_TEMP)
331 return 1;
332
333 if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
334 pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
335 return alloc_temp4(pc, dst, idx + 4);
336
337 for (i = 0; i < 4; i++) {
338 dst[i] = MALLOC_STRUCT(nv50_reg);
339 ctor_reg(dst[i], P_TEMP, -1, idx + i);
340 pc->r_temp[idx + i] = dst[i];
341 }
342
343 return 0;
344 }
345
/* Release the four regs of 'reg' through free_temp(), in index order. */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **cur = reg;
	struct nv50_reg **const end = reg + 4;

	while (cur != end)
		free_temp(pc, *cur++);
}
354
355 static struct nv50_reg *
356 temp_temp(struct nv50_pc *pc)
357 {
358 if (pc->temp_temp_nr >= 16)
359 assert(0);
360
361 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
362 return pc->temp_temp[pc->temp_temp_nr++];
363 }
364
365 static void
366 kill_temp_temp(struct nv50_pc *pc)
367 {
368 int i;
369
370 for (i = 0; i < pc->temp_temp_nr; i++)
371 free_temp(pc, pc->temp_temp[i]);
372 pc->temp_temp_nr = 0;
373 }
374
375 static int
376 ctor_immd_4u32(struct nv50_pc *pc,
377 uint32_t x, uint32_t y, uint32_t z, uint32_t w)
378 {
379 unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
380
381 pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
382
383 pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
384 pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
385 pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
386 pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
387
388 return pc->immd_nr++;
389 }
390
391 static INLINE int
392 ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
393 {
394 return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
395 }
396
397 static struct nv50_reg *
398 alloc_immd(struct nv50_pc *pc, float f)
399 {
400 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
401 unsigned hw;
402
403 for (hw = 0; hw < pc->immd_nr * 4; hw++)
404 if (pc->immd_buf[hw] == fui(f))
405 break;
406
407 if (hw == pc->immd_nr * 4)
408 hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
409
410 ctor_reg(r, P_IMMD, -1, hw);
411 return r;
412 }
413
414 static struct nv50_program_exec *
415 exec(struct nv50_pc *pc)
416 {
417 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
418
419 e->param.index = -1;
420 return e;
421 }
422
423 static void
424 emit(struct nv50_pc *pc, struct nv50_program_exec *e)
425 {
426 struct nv50_program *p = pc->p;
427
428 if (p->exec_tail)
429 p->exec_tail->next = e;
430 if (!p->exec_head)
431 p->exec_head = e;
432 p->exec_tail = e;
433 p->exec_size += (e->inst[0] & 1) ? 2 : 1;
434 }
435
436 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
437
438 static boolean
439 is_long(struct nv50_program_exec *e)
440 {
441 if (e->inst[0] & 1)
442 return TRUE;
443 return FALSE;
444 }
445
446 static boolean
447 is_immd(struct nv50_program_exec *e)
448 {
449 if (is_long(e) && (e->inst[1] & 3) == 3)
450 return TRUE;
451 return FALSE;
452 }
453
454 static INLINE void
455 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
456 struct nv50_program_exec *e)
457 {
458 set_long(pc, e);
459 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
460 e->inst[1] |= (pred << 7) | (idx << 12);
461 }
462
463 static INLINE void
464 set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
465 struct nv50_program_exec *e)
466 {
467 set_long(pc, e);
468 e->inst[1] &= ~((0x3 << 4) | (1 << 6));
469 e->inst[1] |= (idx << 4) | (on << 6);
470 }
471
472 static INLINE void
473 set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
474 {
475 if (is_long(e))
476 return;
477
478 e->inst[0] |= 1;
479 set_pred(pc, 0xf, 0, e);
480 set_pred_wr(pc, 0, 0, e);
481 }
482
483 static INLINE void
484 set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
485 {
486 if (dst->type == P_RESULT) {
487 set_long(pc, e);
488 e->inst[1] |= 0x00000008;
489 }
490
491 alloc_reg(pc, dst);
492 if (dst->hw > 63)
493 set_long(pc, e);
494 e->inst[0] |= (dst->hw << 2);
495 }
496
497 static INLINE void
498 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
499 {
500 union {
501 float f;
502 uint32_t ui;
503 } u;
504 u.ui = pc->immd_buf[imm->hw];
505
506 u.f = (imm->mod & NV50_MOD_ABS) ? fabsf(u.f) : u.f;
507 u.f = (imm->mod & NV50_MOD_NEG) ? -u.f : u.f;
508
509 set_long(pc, e);
510 /* XXX: can't be predicated - bits overlap; cases where both
511 * are required should be avoided by using pc->allow32 */
512 set_pred(pc, 0, 0, e);
513 set_pred_wr(pc, 0, 0, e);
514
515 e->inst[1] |= 0x00000002 | 0x00000001;
516 e->inst[0] |= (u.ui & 0x3f) << 16;
517 e->inst[1] |= (u.ui >> 6) << 2;
518 }
519
520 static INLINE void
521 set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
522 {
523 assert(!(e->inst[0] & 0x0c000000));
524 assert(!(e->inst[1] & 0x00000004));
525
526 e->inst[0] |= (a->hw & 3) << 26;
527 e->inst[1] |= (a->hw >> 2) << 2;
528 }
529
530 static void
531 emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
532 struct nv50_reg *src0, uint16_t src1_val)
533 {
534 struct nv50_program_exec *e = exec(pc);
535
536 e->inst[0] = 0xd0000000 | (src1_val << 9);
537 e->inst[1] = 0x20000000;
538 set_long(pc, e);
539 e->inst[0] |= dst->hw << 2;
540 if (src0) /* otherwise will add to $a0, which is always 0 */
541 set_addr(e, src0);
542
543 emit(pc, e);
544 }
545
546 static struct nv50_reg *
547 alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
548 {
549 struct nv50_reg *a_tgsi = NULL, *a = NULL;
550 int i;
551 uint8_t avail = ~pc->addr_alloc;
552
553 if (!ref) {
554 /* allocate for TGSI_FILE_ADDRESS */
555 while (avail) {
556 i = ffs(avail) - 1;
557
558 if (pc->r_addr[i].rhw < 0 ||
559 pc->r_addr[i].acc != pc->insn_cur) {
560 pc->addr_alloc |= (1 << i);
561
562 pc->r_addr[i].rhw = -1;
563 pc->r_addr[i].index = i;
564 return &pc->r_addr[i];
565 }
566 avail &= ~(1 << i);
567 }
568 assert(0);
569 return NULL;
570 }
571
572 /* Allocate and set an address reg so we can access 'ref'.
573 *
574 * If and r_addr->index will be -1 or the hw index the value
575 * value in rhw is relative to. If rhw < 0, the reg has not
576 * been initialized or is in use for TGSI_FILE_ADDRESS.
577 */
578 while (avail) { /* only consider regs that are not TGSI */
579 i = ffs(avail) - 1;
580 avail &= ~(1 << i);
581
582 if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) {
583 /* prefer an usused reg with low hw index */
584 a = &pc->r_addr[i];
585 continue;
586 }
587 if (!a && pc->r_addr[i].acc != pc->insn_cur)
588 a = &pc->r_addr[i];
589
590 if (ref->hw - pc->r_addr[i].rhw >= 128)
591 continue;
592
593 if ((ref->acc >= 0 && pc->r_addr[i].index < 0) ||
594 (ref->acc < 0 && pc->r_addr[i].index == ref->index)) {
595 pc->r_addr[i].acc = pc->insn_cur;
596 return &pc->r_addr[i];
597 }
598 }
599 assert(a);
600
601 if (ref->acc < 0)
602 a_tgsi = pc->addr[ref->index];
603
604 emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
605
606 a->rhw = ref->hw & ~0x7f;
607 a->acc = pc->insn_cur;
608 a->index = a_tgsi ? ref->index : -1;
609 return a;
610 }
611
/* Values for the 'mode' argument of emit_interp(). FLAT, PERSPECTIVE and
 * CENTROID are tested there as bit flags; LINEAR is the absence of all three.
 */
#define INTERP_LINEAR      0
#define INTERP_FLAT        1
#define INTERP_PERSPECTIVE 2
#define INTERP_CENTROID    4
616
617 /* interpolant index has been stored in dst->rhw */
618 static void
619 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
620 unsigned mode)
621 {
622 assert(dst->rhw != -1);
623 struct nv50_program_exec *e = exec(pc);
624
625 e->inst[0] |= 0x80000000;
626 set_dst(pc, dst, e);
627 e->inst[0] |= (dst->rhw << 16);
628
629 if (mode & INTERP_FLAT) {
630 e->inst[0] |= (1 << 8);
631 } else {
632 if (mode & INTERP_PERSPECTIVE) {
633 e->inst[0] |= (1 << 25);
634 alloc_reg(pc, iv);
635 e->inst[0] |= (iv->hw << 9);
636 }
637
638 if (mode & INTERP_CENTROID)
639 e->inst[0] |= (1 << 24);
640 }
641
642 emit(pc, e);
643 }
644
645 static void
646 set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
647 struct nv50_program_exec *e)
648 {
649 set_long(pc, e);
650
651 e->param.index = src->hw & 127;
652 e->param.shift = s;
653 e->param.mask = m << (s % 32);
654
655 if (src->hw > 127)
656 set_addr(e, alloc_addr(pc, src));
657 else
658 if (src->acc < 0) {
659 assert(src->type == P_CONST);
660 set_addr(e, pc->addr[src->index]);
661 }
662
663 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
664 }
665
666 static void
667 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
668 {
669 struct nv50_program_exec *e = exec(pc);
670
671 e->inst[0] = 0x10000000;
672 if (!pc->allow32)
673 set_long(pc, e);
674
675 set_dst(pc, dst, e);
676
677 if (!is_long(e) && src->type == P_IMMD) {
678 set_immd(pc, src, e);
679 /*XXX: 32-bit, but steals part of "half" reg space - need to
680 * catch and handle this case if/when we do half-regs
681 */
682 } else
683 if (src->type == P_IMMD || src->type == P_CONST) {
684 set_long(pc, e);
685 set_data(pc, src, 0x7f, 9, e);
686 e->inst[1] |= 0x20000000; /* mov from c[] */
687 } else {
688 if (src->type == P_ATTR) {
689 set_long(pc, e);
690 e->inst[1] |= 0x00200000;
691 }
692
693 alloc_reg(pc, src);
694 if (src->hw > 63)
695 set_long(pc, e);
696 e->inst[0] |= (src->hw << 9);
697 }
698
699 if (is_long(e) && !is_immd(e)) {
700 e->inst[1] |= 0x04000000; /* 32-bit */
701 e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
702 if (!(e->inst[1] & 0x20000000))
703 e->inst[1] |= 0x00030000; /* lane mask 2:3 */
704 } else
705 e->inst[0] |= 0x00008000;
706
707 emit(pc, e);
708 }
709
710 static INLINE void
711 emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
712 {
713 struct nv50_reg *imm = alloc_immd(pc, f);
714 emit_mov(pc, dst, imm);
715 FREE(imm);
716 }
717
718 static void
719 emit_nop(struct nv50_pc *pc)
720 {
721 struct nv50_program_exec *e = exec(pc);
722
723 e->inst[0] = 0xf0000000;
724 set_long(pc, e);
725 e->inst[1] = 0xe0000000;
726 emit(pc, e);
727 }
728
729 static boolean
730 check_swap_src_0_1(struct nv50_pc *pc,
731 struct nv50_reg **s0, struct nv50_reg **s1)
732 {
733 struct nv50_reg *src0 = *s0, *src1 = *s1;
734
735 if (src0->type == P_CONST) {
736 if (src1->type != P_CONST) {
737 *s0 = src1;
738 *s1 = src0;
739 return TRUE;
740 }
741 } else
742 if (src1->type == P_ATTR) {
743 if (src0->type != P_ATTR) {
744 *s0 = src1;
745 *s1 = src0;
746 return TRUE;
747 }
748 }
749
750 return FALSE;
751 }
752
753 static void
754 set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
755 struct nv50_program_exec *e)
756 {
757 struct nv50_reg *temp;
758
759 if (src->type != P_TEMP) {
760 temp = temp_temp(pc);
761 emit_mov(pc, temp, src);
762 src = temp;
763 }
764
765 alloc_reg(pc, src);
766 if (src->hw > 63)
767 set_long(pc, e);
768 e->inst[0] |= (src->hw << 9);
769 }
770
771 static void
772 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
773 {
774 if (src->type == P_ATTR) {
775 set_long(pc, e);
776 e->inst[1] |= 0x00200000;
777 } else
778 if (src->type == P_CONST || src->type == P_IMMD) {
779 struct nv50_reg *temp = temp_temp(pc);
780
781 emit_mov(pc, temp, src);
782 src = temp;
783 }
784
785 alloc_reg(pc, src);
786 if (src->hw > 63)
787 set_long(pc, e);
788 e->inst[0] |= (src->hw << 9);
789 }
790
791 static void
792 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
793 {
794 if (src->type == P_ATTR) {
795 struct nv50_reg *temp = temp_temp(pc);
796
797 emit_mov(pc, temp, src);
798 src = temp;
799 } else
800 if (src->type == P_CONST || src->type == P_IMMD) {
801 assert(!(e->inst[0] & 0x00800000));
802 if (e->inst[0] & 0x01000000) {
803 struct nv50_reg *temp = temp_temp(pc);
804
805 emit_mov(pc, temp, src);
806 src = temp;
807 } else {
808 set_data(pc, src, 0x7f, 16, e);
809 e->inst[0] |= 0x00800000;
810 }
811 }
812
813 alloc_reg(pc, src);
814 if (src->hw > 63)
815 set_long(pc, e);
816 e->inst[0] |= ((src->hw & 127) << 16);
817 }
818
819 static void
820 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
821 {
822 set_long(pc, e);
823
824 if (src->type == P_ATTR) {
825 struct nv50_reg *temp = temp_temp(pc);
826
827 emit_mov(pc, temp, src);
828 src = temp;
829 } else
830 if (src->type == P_CONST || src->type == P_IMMD) {
831 assert(!(e->inst[0] & 0x01000000));
832 if (e->inst[0] & 0x00800000) {
833 struct nv50_reg *temp = temp_temp(pc);
834
835 emit_mov(pc, temp, src);
836 src = temp;
837 } else {
838 set_data(pc, src, 0x7f, 32+14, e);
839 e->inst[0] |= 0x01000000;
840 }
841 }
842
843 alloc_reg(pc, src);
844 e->inst[1] |= ((src->hw & 127) << 14);
845 }
846
847 static void
848 emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
849 {
850 struct nv50_program_exec *e = exec(pc);
851
852 assert(dst->type == P_TEMP);
853 e->inst[1] = 0x20000000 | (pred << 12);
854 set_long(pc, e);
855 set_dst(pc, dst, e);
856
857 emit(pc, e);
858 }
859
860 static void
861 emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
862 {
863 struct nv50_program_exec *e = exec(pc);
864
865 e->inst[0] = 0x000001fc;
866 e->inst[1] = 0xa0000008;
867 set_long(pc, e);
868 set_pred_wr(pc, 1, pred, e);
869 set_src_0_restricted(pc, src, e);
870
871 emit(pc, e);
872 }
873
874 static void
875 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
876 struct nv50_reg *src1)
877 {
878 struct nv50_program_exec *e = exec(pc);
879
880 e->inst[0] |= 0xc0000000;
881
882 if (!pc->allow32)
883 set_long(pc, e);
884
885 check_swap_src_0_1(pc, &src0, &src1);
886 set_dst(pc, dst, e);
887 set_src_0(pc, src0, e);
888 if (src1->type == P_IMMD && !is_long(e)) {
889 if (src0->mod & NV50_MOD_NEG)
890 e->inst[0] |= 0x00008000;
891 set_immd(pc, src1, e);
892 } else {
893 set_src_1(pc, src1, e);
894 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
895 if (is_long(e))
896 e->inst[1] |= 0x08000000;
897 else
898 e->inst[0] |= 0x00008000;
899 }
900 }
901
902 emit(pc, e);
903 }
904
905 static void
906 emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
907 struct nv50_reg *src0, struct nv50_reg *src1)
908 {
909 struct nv50_program_exec *e = exec(pc);
910
911 e->inst[0] = 0xb0000000;
912
913 alloc_reg(pc, src1);
914 check_swap_src_0_1(pc, &src0, &src1);
915
916 if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
917 set_long(pc, e);
918 e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
919 ((src1->mod & NV50_MOD_NEG) << 27);
920 }
921
922 set_dst(pc, dst, e);
923 set_src_0(pc, src0, e);
924 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
925 set_src_2(pc, src1, e);
926 else
927 if (src1->type == P_IMMD)
928 set_immd(pc, src1, e);
929 else
930 set_src_1(pc, src1, e);
931
932 emit(pc, e);
933 }
934
935 static void
936 emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
937 uint8_t s)
938 {
939 struct nv50_program_exec *e = exec(pc);
940
941 set_long(pc, e);
942 e->inst[1] |= 0xc0000000;
943
944 e->inst[0] |= dst->hw << 2;
945 e->inst[0] |= s << 16; /* shift left */
946 set_src_0_restricted(pc, src, e);
947
948 emit(pc, e);
949 }
950
951 static void
952 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
953 struct nv50_reg *src0, struct nv50_reg *src1)
954 {
955 struct nv50_program_exec *e = exec(pc);
956
957 set_long(pc, e);
958 e->inst[0] |= 0xb0000000;
959 e->inst[1] |= (sub << 29);
960
961 check_swap_src_0_1(pc, &src0, &src1);
962 set_dst(pc, dst, e);
963 set_src_0(pc, src0, e);
964 set_src_1(pc, src1, e);
965
966 if (src0->mod & NV50_MOD_ABS)
967 e->inst[1] |= 0x00100000;
968 if (src1->mod & NV50_MOD_ABS)
969 e->inst[1] |= 0x00080000;
970
971 emit(pc, e);
972 }
973
974 static INLINE void
975 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
976 struct nv50_reg *src1)
977 {
978 src1->mod ^= NV50_MOD_NEG;
979 emit_add(pc, dst, src0, src1);
980 src1->mod ^= NV50_MOD_NEG;
981 }
982
983 static void
984 emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
985 struct nv50_reg *src1, unsigned op)
986 {
987 struct nv50_program_exec *e = exec(pc);
988
989 e->inst[0] = 0xd0000000;
990 set_long(pc, e);
991
992 check_swap_src_0_1(pc, &src0, &src1);
993 set_dst(pc, dst, e);
994 set_src_0(pc, src0, e);
995
996 if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
997 op != TGSI_OPCODE_XOR)
998 assert(!"invalid bit op");
999
1000 if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
1001 set_immd(pc, src1, e);
1002 if (op == TGSI_OPCODE_OR)
1003 e->inst[0] |= 0x0100;
1004 else
1005 if (op == TGSI_OPCODE_XOR)
1006 e->inst[0] |= 0x8000;
1007 } else {
1008 set_src_1(pc, src1, e);
1009 e->inst[1] |= 0x04000000; /* 32 bit */
1010 if (op == TGSI_OPCODE_OR)
1011 e->inst[1] |= 0x4000;
1012 else
1013 if (op == TGSI_OPCODE_XOR)
1014 e->inst[1] |= 0x8000;
1015 }
1016
1017 emit(pc, e);
1018 }
1019
1020 static void
1021 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
1022 struct nv50_reg *src1, struct nv50_reg *src2)
1023 {
1024 struct nv50_program_exec *e = exec(pc);
1025
1026 e->inst[0] |= 0xe0000000;
1027
1028 check_swap_src_0_1(pc, &src0, &src1);
1029 set_dst(pc, dst, e);
1030 set_src_0(pc, src0, e);
1031 set_src_1(pc, src1, e);
1032 set_src_2(pc, src2, e);
1033
1034 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
1035 e->inst[1] |= 0x04000000;
1036 if (src2->mod & NV50_MOD_NEG)
1037 e->inst[1] |= 0x08000000;
1038
1039 emit(pc, e);
1040 }
1041
1042 static INLINE void
1043 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
1044 struct nv50_reg *src1, struct nv50_reg *src2)
1045 {
1046 src2->mod ^= NV50_MOD_NEG;
1047 emit_mad(pc, dst, src0, src1, src2);
1048 src2->mod ^= NV50_MOD_NEG;
1049 }
1050
1051 static void
1052 emit_flop(struct nv50_pc *pc, unsigned sub,
1053 struct nv50_reg *dst, struct nv50_reg *src)
1054 {
1055 struct nv50_program_exec *e = exec(pc);
1056
1057 e->inst[0] |= 0x90000000;
1058 if (sub) {
1059 set_long(pc, e);
1060 e->inst[1] |= (sub << 29);
1061 }
1062
1063 set_dst(pc, dst, e);
1064
1065 if (sub == 0 || sub == 2)
1066 set_src_0_restricted(pc, src, e);
1067 else
1068 set_src_0(pc, src, e);
1069
1070 emit(pc, e);
1071 }
1072
1073 static void
1074 emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1075 {
1076 struct nv50_program_exec *e = exec(pc);
1077
1078 e->inst[0] |= 0xb0000000;
1079
1080 set_dst(pc, dst, e);
1081 set_src_0(pc, src, e);
1082 set_long(pc, e);
1083 e->inst[1] |= (6 << 29) | 0x00004000;
1084
1085 emit(pc, e);
1086 }
1087
1088 static void
1089 emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1090 {
1091 struct nv50_program_exec *e = exec(pc);
1092
1093 e->inst[0] |= 0xb0000000;
1094
1095 set_dst(pc, dst, e);
1096 set_src_0(pc, src, e);
1097 set_long(pc, e);
1098 e->inst[1] |= (6 << 29);
1099
1100 emit(pc, e);
1101 }
1102
/* Conversion operations: the 'cvn' argument of emit_cvt(), stored from
 * bit 16 of inst[1].
 */
#define CVTOP_RN    0x01
#define CVTOP_FLOOR 0x03
#define CVTOP_CEIL  0x05
#define CVTOP_TRUNC 0x07
#define CVTOP_SAT   0x08
#define CVTOP_ABS   0x10

/* Conversion formats: the 'fmt' argument of emit_cvt(), stored from
 * bit 24 of inst[1].
 *
 * 0x04 == 32 bit dst
 * 0x40 == dst is float
 * 0x80 == src is float
 */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG     0x20
#define CVT_RI      0x08
1119
1120 static void
1121 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
1122 int wp, unsigned cvn, unsigned fmt)
1123 {
1124 struct nv50_program_exec *e;
1125
1126 e = exec(pc);
1127 set_long(pc, e);
1128
1129 e->inst[0] |= 0xa0000000;
1130 e->inst[1] |= 0x00004000; /* 32 bit src */
1131 e->inst[1] |= (cvn << 16);
1132 e->inst[1] |= (fmt << 24);
1133 set_src_0(pc, src, e);
1134
1135 if (wp >= 0)
1136 set_pred_wr(pc, 1, wp, e);
1137
1138 if (dst)
1139 set_dst(pc, dst, e);
1140 else {
1141 e->inst[0] |= 0x000001fc;
1142 e->inst[1] |= 0x00000008;
1143 }
1144
1145 emit(pc, e);
1146 }
1147
1148 /* nv50 Condition codes:
1149 * 0x1 = LT
1150 * 0x2 = EQ
1151 * 0x3 = LE
1152 * 0x4 = GT
1153 * 0x5 = NE
1154 * 0x6 = GE
1155 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge)
1156 * 0x8 = unordered bit (allows NaN)
1157 */
1158 static void
1159 emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
1160 struct nv50_reg *src0, struct nv50_reg *src1)
1161 {
1162 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
1163
1164 struct nv50_program_exec *e = exec(pc);
1165 struct nv50_reg *rdst;
1166
1167 assert(ccode < 16);
1168 if (check_swap_src_0_1(pc, &src0, &src1))
1169 ccode = cc_swapped[ccode & 7] | (ccode & 8);
1170
1171 rdst = dst;
1172 if (dst && dst->type != P_TEMP)
1173 dst = alloc_temp(pc, NULL);
1174
1175 /* set.u32 */
1176 set_long(pc, e);
1177 e->inst[0] |= 0xb0000000;
1178 e->inst[1] |= 0x60000000 | (ccode << 14);
1179
1180 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
1181 * that doesn't seem to match what the hw actually does
1182 e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
1183 */
1184
1185 if (wp >= 0)
1186 set_pred_wr(pc, 1, wp, e);
1187 if (dst)
1188 set_dst(pc, dst, e);
1189 else {
1190 e->inst[0] |= 0x000001fc;
1191 e->inst[1] |= 0x00000008;
1192 }
1193
1194 set_src_0(pc, src0, e);
1195 set_src_1(pc, src1, e);
1196
1197 emit(pc, e);
1198
1199 /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
1200 if (rdst)
1201 emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
1202 if (rdst && rdst != dst)
1203 free_temp(pc, dst);
1204 }
1205
1206 static INLINE unsigned
1207 map_tgsi_setop_cc(unsigned op)
1208 {
1209 switch (op) {
1210 case TGSI_OPCODE_SLT: return 0x1;
1211 case TGSI_OPCODE_SGE: return 0x6;
1212 case TGSI_OPCODE_SEQ: return 0x2;
1213 case TGSI_OPCODE_SGT: return 0x4;
1214 case TGSI_OPCODE_SLE: return 0x3;
1215 case TGSI_OPCODE_SNE: return 0xd;
1216 default:
1217 assert(0);
1218 return 0;
1219 }
1220 }
1221
1222 static INLINE void
1223 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1224 {
1225 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
1226 }
1227
/* Emit dst = pow(v, e) as a four-instruction sequence that works on a
 * scratch temporary: flop 3 on v, multiply by e, preex2, flop 6 into dst.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *scratch = alloc_temp(pc, NULL);

	emit_flop(pc, 3, scratch, v);
	emit_mul(pc, scratch, scratch, e);
	emit_preex2(pc, scratch, scratch);
	emit_flop(pc, 6, dst, scratch);

	free_temp(pc, scratch);
}
1241
1242 static INLINE void
1243 emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1244 {
1245 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
1246 }
1247
1248 static INLINE void
1249 emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1250 {
1251 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
1252 }
1253
1254 static void
1255 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1256 struct nv50_reg **src)
1257 {
1258 struct nv50_reg *one = alloc_immd(pc, 1.0);
1259 struct nv50_reg *zero = alloc_immd(pc, 0.0);
1260 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
1261 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
1262 struct nv50_reg *tmp[4];
1263 boolean allow32 = pc->allow32;
1264
1265 pc->allow32 = FALSE;
1266
1267 if (mask & (3 << 1)) {
1268 tmp[0] = alloc_temp(pc, NULL);
1269 emit_minmax(pc, 4, tmp[0], src[0], zero);
1270 }
1271
1272 if (mask & (1 << 2)) {
1273 set_pred_wr(pc, 1, 0, pc->p->exec_tail);
1274
1275 tmp[1] = temp_temp(pc);
1276 emit_minmax(pc, 4, tmp[1], src[1], zero);
1277
1278 tmp[3] = temp_temp(pc);
1279 emit_minmax(pc, 4, tmp[3], src[3], neg128);
1280 emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
1281
1282 emit_pow(pc, dst[2], tmp[1], tmp[3]);
1283 emit_mov(pc, dst[2], zero);
1284 set_pred(pc, 3, 0, pc->p->exec_tail);
1285 }
1286
1287 if (mask & (1 << 1))
1288 assimilate_temp(pc, dst[1], tmp[0]);
1289 else
1290 if (mask & (1 << 2))
1291 free_temp(pc, tmp[0]);
1292
1293 pc->allow32 = allow32;
1294
1295 /* do this last, in case src[i,j] == dst[0,3] */
1296 if (mask & (1 << 0))
1297 emit_mov(pc, dst[0], one);
1298
1299 if (mask & (1 << 3))
1300 emit_mov(pc, dst[3], one);
1301
1302 FREE(pos128);
1303 FREE(neg128);
1304 FREE(zero);
1305 FREE(one);
1306 }
1307
1308 static INLINE void
1309 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1310 {
1311 emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1312 }
1313
1314 static void
1315 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1316 {
1317 struct nv50_program_exec *e;
1318 const int r_pred = 1;
1319
1320 e = exec(pc);
1321 e->inst[0] = 0x00000002; /* discard */
1322 set_long(pc, e); /* sets cond code to ALWAYS */
1323
1324 if (src) {
1325 unsigned cvn = CVT_F32_F32;
1326
1327 set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
1328
1329 if (src->mod & NV50_MOD_NEG)
1330 cvn |= CVT_NEG;
1331 /* write predicate reg */
1332 emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
1333 }
1334
1335 emit(pc, e);
1336 }
1337
1338 static struct nv50_program_exec *
1339 emit_breakaddr(struct nv50_pc *pc)
1340 {
1341 struct nv50_program_exec *e = exec(pc);
1342
1343 e->inst[0] = 0x40000002;
1344 set_long(pc, e);
1345
1346 emit(pc, e);
1347 return e;
1348 }
1349
1350 static void
1351 emit_break(struct nv50_pc *pc, int pred, unsigned cc)
1352 {
1353 struct nv50_program_exec *e = exec(pc);
1354
1355 e->inst[0] = 0x50000002;
1356 set_long(pc, e);
1357 if (pred >= 0)
1358 set_pred(pc, cc, pred, e);
1359
1360 emit(pc, e);
1361 }
1362
1363 static struct nv50_program_exec *
1364 emit_joinat(struct nv50_pc *pc)
1365 {
1366 struct nv50_program_exec *e = exec(pc);
1367
1368 e->inst[0] = 0xa0000002;
1369 set_long(pc, e);
1370
1371 emit(pc, e);
1372 return e;
1373 }
1374
1375 static struct nv50_program_exec *
1376 emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
1377 {
1378 struct nv50_program_exec *e = exec(pc);
1379
1380 e->inst[0] = 0x10000002;
1381 set_long(pc, e);
1382 if (pred >= 0)
1383 set_pred(pc, cc, pred, e);
1384 emit(pc, e);
1385 return pc->p->exec_tail;
1386 }
1387
1388 static void
1389 emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
1390 {
1391 struct nv50_program_exec *e = exec(pc);
1392
1393 e->inst[0] = 0x30000002;
1394 set_long(pc, e);
1395 if (pred >= 0)
1396 set_pred(pc, cc, pred, e);
1397
1398 emit(pc, e);
1399 }
1400
1401 #define QOP_ADD 0
1402 #define QOP_SUBR 1
1403 #define QOP_SUB 2
1404 #define QOP_MOV_SRC1 3
1405
1406 /* For a quad of threads / top left, top right, bottom left, bottom right
1407 * pixels, do a different operation, and take src0 from a specific thread.
1408 */
1409 static void
1410 emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
1411 struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
1412 {
1413 struct nv50_program_exec *e = exec(pc);
1414
1415 e->inst[0] = 0xc0000000;
1416 e->inst[1] = 0x80000000;
1417 set_long(pc, e);
1418 e->inst[0] |= lane_src0 << 16;
1419 set_src_0(pc, src0, e);
1420 set_src_2(pc, src1, e);
1421
1422 if (wp >= 0)
1423 set_pred_wr(pc, 1, wp, e);
1424
1425 if (dst)
1426 set_dst(pc, dst, e);
1427 else {
1428 e->inst[0] |= 0x000001fc;
1429 e->inst[1] |= 0x00000008;
1430 }
1431
1432 e->inst[0] |= (qop & 3) << 20;
1433 e->inst[1] |= (qop >> 2) << 22;
1434
1435 emit(pc, e);
1436 }
1437
1438 static void
1439 load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
1440 struct nv50_reg **src, unsigned arg, boolean proj)
1441 {
1442 int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };
1443
1444 src[0]->mod |= NV50_MOD_ABS;
1445 src[1]->mod |= NV50_MOD_ABS;
1446 src[2]->mod |= NV50_MOD_ABS;
1447
1448 emit_minmax(pc, 4, t[2], src[0], src[1]);
1449 emit_minmax(pc, 4, t[2], src[2], t[2]);
1450
1451 src[0]->mod = mod[0];
1452 src[1]->mod = mod[1];
1453 src[2]->mod = mod[2];
1454
1455 if (proj && 0 /* looks more correct without this */)
1456 emit_mul(pc, t[2], t[2], src[3]);
1457 else
1458 if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
1459 emit_mov(pc, t[3], src[3]);
1460
1461 emit_flop(pc, 0, t[2], t[2]);
1462
1463 emit_mul(pc, t[0], src[0], t[2]);
1464 emit_mul(pc, t[1], src[1], t[2]);
1465 emit_mul(pc, t[2], src[2], t[2]);
1466 }
1467
1468 static void
1469 load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
1470 struct nv50_reg **src, unsigned dim, unsigned arg)
1471 {
1472 unsigned c, mode;
1473
1474 if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1475 mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
1476
1477 t[3]->rhw = src[3]->rhw;
1478 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1479 emit_flop(pc, 0, t[3], t[3]);
1480
1481 for (c = 0; c < dim; ++c) {
1482 t[c]->rhw = src[c]->rhw;
1483 emit_interp(pc, t[c], t[3], mode);
1484 }
1485 if (arg != dim) { /* depth reference value */
1486 t[dim]->rhw = src[2]->rhw;
1487 emit_interp(pc, t[dim], t[3], mode);
1488 }
1489 } else {
1490 /* XXX: for some reason the blob sometimes uses MAD
1491 * (mad f32 $rX $rY $rZ neg $r63)
1492 */
1493 emit_flop(pc, 0, t[3], src[3]);
1494 for (c = 0; c < dim; ++c)
1495 emit_mul(pc, t[c], src[c], t[3]);
1496 if (arg != dim) /* depth reference value */
1497 emit_mul(pc, t[dim], src[2], t[3]);
1498 }
1499 }
1500
1501 static INLINE void
1502 get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
1503 {
1504 switch (type) {
1505 case TGSI_TEXTURE_1D:
1506 *arg = *dim = 1;
1507 break;
1508 case TGSI_TEXTURE_SHADOW1D:
1509 *dim = 1;
1510 *arg = 2;
1511 break;
1512 case TGSI_TEXTURE_UNKNOWN:
1513 case TGSI_TEXTURE_2D:
1514 case TGSI_TEXTURE_RECT:
1515 *arg = *dim = 2;
1516 break;
1517 case TGSI_TEXTURE_SHADOW2D:
1518 case TGSI_TEXTURE_SHADOWRECT:
1519 *dim = 2;
1520 *arg = 3;
1521 break;
1522 case TGSI_TEXTURE_3D:
1523 case TGSI_TEXTURE_CUBE:
1524 *dim = *arg = 3;
1525 break;
1526 default:
1527 assert(0);
1528 break;
1529 }
1530 }
1531
1532 /* We shouldn't execute TEXLOD if any of the pixels in a quad have
1533 * different LOD values, so branch off groups of equal LOD.
1534 */
1535 static void
1536 emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
1537 struct nv50_reg *src, struct nv50_program_exec *tex)
1538 {
1539 struct nv50_program_exec *join_at;
1540 unsigned i, target = pc->p->exec_size + 7 * 2;
1541
1542 /* Subtract lod of each pixel from lod of top left pixel, jump
1543 * texlod insn if result is 0, then repeat for 2 other pixels.
1544 */
1545 join_at = emit_joinat(pc);
1546 emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
1547 emit_branch(pc, 0, 2)->param.index = target;
1548
1549 for (i = 1; i < 4; ++i) {
1550 emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
1551 emit_branch(pc, 0, 2)->param.index = target;
1552 }
1553
1554 emit_mov(pc, tlod, src); /* target */
1555 emit(pc, tex); /* texlod */
1556
1557 join_at->param.index = target + 2 * 2;
1558 JOIN_ON(emit_nop(pc)); /* join _after_ tex */
1559 }
1560
1561 static void
1562 emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
1563 struct nv50_program_exec *tex)
1564 {
1565 struct nv50_program_exec *e;
1566 struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
1567 int r_pred = 0;
1568 unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
1569
1570 pc->allow32 = FALSE;
1571 ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
1572
1573 /* Subtract bias value of thread i from bias values of each thread,
1574 * store result in r_pred, and set bit i in r_bits if result was 0.
1575 */
1576 assert(arg < 4);
1577 for (i = 0; i < 4; ++i, ++imm_1248.hw) {
1578 emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
1579 emit_mov(pc, r_bits, &imm_1248);
1580 set_pred(pc, 2, r_pred, pc->p->exec_tail);
1581 }
1582 emit_mov_to_pred(pc, r_pred, r_bits);
1583
1584 /* The lanes of a quad are now grouped by the bit in r_pred they have
1585 * set. Put the input values for TEX into a new register set for each
1586 * group and execute TEX only for a specific group.
1587 * We cannot use the same register set for each group because we need
1588 * the derivatives, which are implicitly calculated, to be correct.
1589 */
1590 for (i = 1; i < 4; ++i) {
1591 alloc_temp4(pc, t123[i], 0);
1592
1593 for (c = 0; c <= arg; ++c)
1594 emit_mov(pc, t123[i][c], t[c]);
1595
1596 *(e = exec(pc)) = *(tex);
1597 e->inst[0] &= ~0x01fc;
1598 set_dst(pc, t123[i][0], e);
1599 set_pred(pc, cc[i], r_pred, e);
1600 emit(pc, e);
1601 }
1602 /* finally TEX on the original regs (where we kept the input) */
1603 set_pred(pc, cc[0], r_pred, tex);
1604 emit(pc, tex);
1605
1606 /* put the 3 * n other results into regs for lane 0 */
1607 n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
1608 for (i = 1; i < 4; ++i) {
1609 for (c = 0; c < n; ++c) {
1610 emit_mov(pc, t[c], t123[i][c]);
1611 set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
1612 }
1613 free_temp4(pc, t123[i]);
1614 }
1615
1616 emit_nop(pc);
1617 free_temp(pc, r_bits);
1618 }
1619
1620 static void
1621 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1622 struct nv50_reg **src, unsigned unit, unsigned type,
1623 boolean proj, int bias_lod)
1624 {
1625 struct nv50_reg *t[4];
1626 struct nv50_program_exec *e;
1627 unsigned c, dim, arg;
1628
1629 /* t[i] must be within a single 128 bit super-reg */
1630 alloc_temp4(pc, t, 0);
1631
1632 e = exec(pc);
1633 e->inst[0] = 0xf0000000;
1634 set_long(pc, e);
1635 set_dst(pc, t[0], e);
1636
1637 /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
1638 e->inst[0] |= (unit << 9) /* | (unit << 17) */;
1639
1640 /* live flag (don't set if TEX results affect input to another TEX): */
1641 /* e->inst[0] |= 0x00000004; */
1642
1643 get_tex_dim(type, &dim, &arg);
1644
1645 if (type == TGSI_TEXTURE_CUBE) {
1646 e->inst[0] |= 0x08000000;
1647 load_cube_tex_coords(pc, t, src, arg, proj);
1648 } else
1649 if (proj)
1650 load_proj_tex_coords(pc, t, src, dim, arg);
1651 else {
1652 for (c = 0; c < dim; c++)
1653 emit_mov(pc, t[c], src[c]);
1654 if (arg != dim) /* depth reference value (always src.z here) */
1655 emit_mov(pc, t[dim], src[2]);
1656 }
1657
1658 e->inst[0] |= (mask & 0x3) << 25;
1659 e->inst[1] |= (mask & 0xc) << 12;
1660
1661 if (!bias_lod) {
1662 e->inst[0] |= (arg - 1) << 22;
1663 emit(pc, e);
1664 } else
1665 if (bias_lod < 0) {
1666 e->inst[0] |= arg << 22;
1667 e->inst[1] |= 0x20000000; /* texbias */
1668 emit_mov(pc, t[arg], src[3]);
1669 emit_texbias_sequence(pc, t, arg, e);
1670 } else {
1671 e->inst[0] |= arg << 22;
1672 e->inst[1] |= 0x40000000; /* texlod */
1673 emit_mov(pc, t[arg], src[3]);
1674 emit_texlod_sequence(pc, t[arg], src[3], e);
1675 }
1676
1677 #if 1
1678 c = 0;
1679 if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1680 if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1681 if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1682 if (mask & 8) emit_mov(pc, dst[3], t[c]);
1683
1684 free_temp4(pc, t);
1685 #else
1686 /* XXX: if p.e. MUL is used directly after TEX, it would still use
1687 * the texture coordinates, not the fetched values: latency ? */
1688
1689 for (c = 0; c < 4; c++) {
1690 if (mask & (1 << c))
1691 assimilate_temp(pc, dst[c], t[c]);
1692 else
1693 free_temp(pc, t[c]);
1694 }
1695 #endif
1696 }
1697
1698 static void
1699 emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1700 {
1701 struct nv50_program_exec *e = exec(pc);
1702
1703 assert(src->type == P_TEMP);
1704
1705 e->inst[0] = 0xc0140000;
1706 e->inst[1] = 0x89800000;
1707 set_long(pc, e);
1708 set_dst(pc, dst, e);
1709 set_src_0(pc, src, e);
1710 set_src_2(pc, src, e);
1711
1712 emit(pc, e);
1713 }
1714
1715 static void
1716 emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1717 {
1718 struct nv50_reg *r = src;
1719 struct nv50_program_exec *e = exec(pc);
1720
1721 assert(src->type == P_TEMP);
1722
1723 if (!(src->mod & NV50_MOD_NEG)) { /* ! double negation */
1724 r = alloc_temp(pc, NULL);
1725 emit_neg(pc, r, src);
1726 }
1727
1728 e->inst[0] = 0xc0150000;
1729 e->inst[1] = 0x8a400000;
1730 set_long(pc, e);
1731 set_dst(pc, dst, e);
1732 set_src_0(pc, r, e);
1733 set_src_2(pc, r, e);
1734
1735 if (r != src)
1736 free_temp(pc, r);
1737
1738 emit(pc, e);
1739 }
1740
1741 static void
1742 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1743 {
1744 unsigned q = 0, m = ~0;
1745
1746 assert(!is_long(e));
1747
1748 switch (e->inst[0] >> 28) {
1749 case 0x1:
1750 /* MOV */
1751 q = 0x0403c000;
1752 m = 0xffff7fff;
1753 break;
1754 case 0x8:
1755 /* INTERP (move centroid, perspective and flat bits) */
1756 m = ~0x03000100;
1757 q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1758 q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1759 break;
1760 case 0x9:
1761 /* RCP */
1762 break;
1763 case 0xB:
1764 /* ADD */
1765 m = ~(127 << 16);
1766 q = ((e->inst[0] & (~m)) >> 2);
1767 break;
1768 case 0xC:
1769 /* MUL */
1770 m = ~0x00008000;
1771 q = ((e->inst[0] & (~m)) << 12);
1772 break;
1773 case 0xE:
1774 /* MAD (if src2 == dst) */
1775 q = ((e->inst[0] & 0x1fc) << 12);
1776 break;
1777 default:
1778 assert(0);
1779 break;
1780 }
1781
1782 set_long(pc, e);
1783 pc->p->exec_size++;
1784
1785 e->inst[0] &= m;
1786 e->inst[1] |= q;
1787 }
1788
1789 /* Some operations support an optional negation flag. */
1790 static boolean
1791 negate_supported(const struct tgsi_full_instruction *insn, int i)
1792 {
1793 switch (insn->Instruction.Opcode) {
1794 case TGSI_OPCODE_DDY:
1795 case TGSI_OPCODE_DP3:
1796 case TGSI_OPCODE_DP4:
1797 case TGSI_OPCODE_MUL:
1798 case TGSI_OPCODE_KIL:
1799 case TGSI_OPCODE_ADD:
1800 case TGSI_OPCODE_SUB:
1801 case TGSI_OPCODE_MAD:
1802 return TRUE;
1803 case TGSI_OPCODE_POW:
1804 if (i == 1)
1805 return TRUE;
1806 return FALSE;
1807 default:
1808 return FALSE;
1809 }
1810 }
1811
1812 /* Return a read mask for source registers deduced from opcode & write mask. */
1813 static unsigned
1814 nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1815 {
1816 unsigned x, mask = insn->Dst[0].Register.WriteMask;
1817
1818 switch (insn->Instruction.Opcode) {
1819 case TGSI_OPCODE_COS:
1820 case TGSI_OPCODE_SIN:
1821 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1822 case TGSI_OPCODE_DP3:
1823 return 0x7;
1824 case TGSI_OPCODE_DP4:
1825 case TGSI_OPCODE_DPH:
1826 case TGSI_OPCODE_KIL: /* WriteMask ignored */
1827 return 0xf;
1828 case TGSI_OPCODE_DST:
1829 return mask & (c ? 0xa : 0x6);
1830 case TGSI_OPCODE_EX2:
1831 case TGSI_OPCODE_LG2:
1832 case TGSI_OPCODE_POW:
1833 case TGSI_OPCODE_RCP:
1834 case TGSI_OPCODE_RSQ:
1835 case TGSI_OPCODE_SCS:
1836 return 0x1;
1837 case TGSI_OPCODE_IF:
1838 return 0x1;
1839 case TGSI_OPCODE_LIT:
1840 return 0xb;
1841 case TGSI_OPCODE_TEX:
1842 case TGSI_OPCODE_TXB:
1843 case TGSI_OPCODE_TXL:
1844 case TGSI_OPCODE_TXP:
1845 {
1846 const struct tgsi_instruction_texture *tex;
1847
1848 assert(insn->Instruction.Texture);
1849 tex = &insn->Texture;
1850
1851 mask = 0x7;
1852 if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
1853 insn->Instruction.Opcode != TGSI_OPCODE_TXD)
1854 mask |= 0x8; /* bias, lod or proj */
1855
1856 switch (tex->Texture) {
1857 case TGSI_TEXTURE_1D:
1858 mask &= 0x9;
1859 break;
1860 case TGSI_TEXTURE_SHADOW1D:
1861 mask &= 0x5;
1862 break;
1863 case TGSI_TEXTURE_2D:
1864 mask &= 0xb;
1865 break;
1866 default:
1867 break;
1868 }
1869 }
1870 return mask;
1871 case TGSI_OPCODE_XPD:
1872 x = 0;
1873 if (mask & 1) x |= 0x6;
1874 if (mask & 2) x |= 0x5;
1875 if (mask & 4) x |= 0x3;
1876 return x;
1877 default:
1878 break;
1879 }
1880
1881 return mask;
1882 }
1883
1884 static struct nv50_reg *
1885 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1886 {
1887 switch (dst->Register.File) {
1888 case TGSI_FILE_TEMPORARY:
1889 return &pc->temp[dst->Register.Index * 4 + c];
1890 case TGSI_FILE_OUTPUT:
1891 return &pc->result[dst->Register.Index * 4 + c];
1892 case TGSI_FILE_ADDRESS:
1893 {
1894 struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
1895 if (!r) {
1896 r = alloc_addr(pc, NULL);
1897 pc->addr[dst->Register.Index * 4 + c] = r;
1898 }
1899 assert(r);
1900 return r;
1901 }
1902 case TGSI_FILE_NULL:
1903 return NULL;
1904 default:
1905 break;
1906 }
1907
1908 return NULL;
1909 }
1910
1911 static struct nv50_reg *
1912 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1913 boolean neg)
1914 {
1915 struct nv50_reg *r = NULL;
1916 struct nv50_reg *temp;
1917 unsigned sgn, c, swz;
1918
1919 if (src->Register.File != TGSI_FILE_CONSTANT)
1920 assert(!src->Register.Indirect);
1921
1922 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1923
1924 c = tgsi_util_get_full_src_register_swizzle(src, chan);
1925 switch (c) {
1926 case TGSI_SWIZZLE_X:
1927 case TGSI_SWIZZLE_Y:
1928 case TGSI_SWIZZLE_Z:
1929 case TGSI_SWIZZLE_W:
1930 switch (src->Register.File) {
1931 case TGSI_FILE_INPUT:
1932 r = &pc->attr[src->Register.Index * 4 + c];
1933 break;
1934 case TGSI_FILE_TEMPORARY:
1935 r = &pc->temp[src->Register.Index * 4 + c];
1936 break;
1937 case TGSI_FILE_CONSTANT:
1938 if (!src->Register.Indirect) {
1939 r = &pc->param[src->Register.Index * 4 + c];
1940 break;
1941 }
1942 /* Indicate indirection by setting r->acc < 0 and
1943 * use the index field to select the address reg.
1944 */
1945 r = reg_instance(pc, NULL);
1946 swz = tgsi_util_get_src_register_swizzle(
1947 &src->Indirect, 0);
1948 ctor_reg(r, P_CONST,
1949 src->Indirect.Index * 4 + swz,
1950 src->Register.Index * 4 + c);
1951 r->acc = -1;
1952 break;
1953 case TGSI_FILE_IMMEDIATE:
1954 r = &pc->immd[src->Register.Index * 4 + c];
1955 break;
1956 case TGSI_FILE_SAMPLER:
1957 break;
1958 case TGSI_FILE_ADDRESS:
1959 r = pc->addr[src->Register.Index * 4 + c];
1960 assert(r);
1961 break;
1962 default:
1963 assert(0);
1964 break;
1965 }
1966 break;
1967 default:
1968 assert(0);
1969 break;
1970 }
1971
1972 switch (sgn) {
1973 case TGSI_UTIL_SIGN_KEEP:
1974 break;
1975 case TGSI_UTIL_SIGN_CLEAR:
1976 temp = temp_temp(pc);
1977 emit_abs(pc, temp, r);
1978 r = temp;
1979 break;
1980 case TGSI_UTIL_SIGN_TOGGLE:
1981 if (neg)
1982 r->mod = NV50_MOD_NEG;
1983 else {
1984 temp = temp_temp(pc);
1985 emit_neg(pc, temp, r);
1986 r = temp;
1987 }
1988 break;
1989 case TGSI_UTIL_SIGN_SET:
1990 temp = temp_temp(pc);
1991 emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
1992 r = temp;
1993 break;
1994 default:
1995 assert(0);
1996 break;
1997 }
1998
1999 if (r && r->acc >= 0 && r != temp)
2000 return reg_instance(pc, r);
2001 return r;
2002 }
2003
2004 /* return TRUE for ops that produce only a single result */
2005 static boolean
2006 is_scalar_op(unsigned op)
2007 {
2008 switch (op) {
2009 case TGSI_OPCODE_COS:
2010 case TGSI_OPCODE_DP2:
2011 case TGSI_OPCODE_DP3:
2012 case TGSI_OPCODE_DP4:
2013 case TGSI_OPCODE_DPH:
2014 case TGSI_OPCODE_EX2:
2015 case TGSI_OPCODE_LG2:
2016 case TGSI_OPCODE_POW:
2017 case TGSI_OPCODE_RCP:
2018 case TGSI_OPCODE_RSQ:
2019 case TGSI_OPCODE_SIN:
2020 /*
2021 case TGSI_OPCODE_KIL:
2022 case TGSI_OPCODE_LIT:
2023 case TGSI_OPCODE_SCS:
2024 */
2025 return TRUE;
2026 default:
2027 return FALSE;
2028 }
2029 }
2030
2031 /* Returns a bitmask indicating which dst components depend
2032 * on source s, component c (reverse of nv50_tgsi_src_mask).
2033 */
2034 static unsigned
2035 nv50_tgsi_dst_revdep(unsigned op, int s, int c)
2036 {
2037 if (is_scalar_op(op))
2038 return 0x1;
2039
2040 switch (op) {
2041 case TGSI_OPCODE_DST:
2042 return (1 << c) & (s ? 0xa : 0x6);
2043 case TGSI_OPCODE_XPD:
2044 switch (c) {
2045 case 0: return 0x6;
2046 case 1: return 0x5;
2047 case 2: return 0x3;
2048 case 3: return 0x0;
2049 default:
2050 assert(0);
2051 return 0x0;
2052 }
2053 case TGSI_OPCODE_LIT:
2054 case TGSI_OPCODE_SCS:
2055 case TGSI_OPCODE_TEX:
2056 case TGSI_OPCODE_TXB:
2057 case TGSI_OPCODE_TXL:
2058 case TGSI_OPCODE_TXP:
2059 /* these take care of dangerous swizzles themselves */
2060 return 0x0;
2061 case TGSI_OPCODE_IF:
2062 case TGSI_OPCODE_KIL:
2063 /* don't call this function for these ops */
2064 assert(0);
2065 return 0;
2066 default:
2067 /* linear vector instruction */
2068 return (1 << c);
2069 }
2070 }
2071
2072 static INLINE boolean
2073 has_pred(struct nv50_program_exec *e, unsigned cc)
2074 {
2075 if (!is_long(e) || is_immd(e))
2076 return FALSE;
2077 return ((e->inst[1] & 0x780) == (cc << 7));
2078 }
2079
2080 /* on ENDIF see if we can do "@p0.neu single_op" instead of:
2081 * join_at ENDIF
2082 * @p0.eq bra ENDIF
2083 * single_op
2084 * ENDIF: nop.join
2085 */
2086 static boolean
2087 nv50_kill_branch(struct nv50_pc *pc)
2088 {
2089 int lvl = pc->if_lvl;
2090
2091 if (pc->if_insn[lvl]->next != pc->p->exec_tail)
2092 return FALSE;
2093
2094 /* if ccode == 'true', the BRA is from an ELSE and the predicate
2095 * reg may no longer be valid, since we currently always use $p0
2096 */
2097 if (has_pred(pc->if_insn[lvl], 0xf))
2098 return FALSE;
2099 assert(pc->if_insn[lvl] && pc->if_join[lvl]);
2100
2101 /* We'll use the exec allocated for JOIN_AT (we can't easily
2102 * access nv50_program_exec's prev).
2103 */
2104 pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
2105
2106 *pc->if_join[lvl] = *pc->p->exec_tail;
2107
2108 FREE(pc->if_insn[lvl]);
2109 FREE(pc->p->exec_tail);
2110
2111 pc->p->exec_tail = pc->if_join[lvl];
2112 pc->p->exec_tail->next = NULL;
2113 set_pred(pc, 0xd, 0, pc->p->exec_tail);
2114
2115 return TRUE;
2116 }
2117
2118 static void
2119 nv50_fp_move_results(struct nv50_pc *pc)
2120 {
2121 struct nv50_reg reg;
2122 unsigned i;
2123
2124 ctor_reg(&reg, P_TEMP, -1, -1);
2125
2126 for (i = 0; i < pc->result_nr * 4; ++i) {
2127 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2128 continue;
2129 if (pc->result[i].rhw != pc->result[i].hw) {
2130 reg.hw = pc->result[i].rhw;
2131 emit_mov(pc, &reg, &pc->result[i]);
2132 }
2133 }
2134 }
2135
2136 static boolean
2137 nv50_program_tx_insn(struct nv50_pc *pc,
2138 const struct tgsi_full_instruction *inst)
2139 {
2140 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
2141 unsigned mask, sat, unit;
2142 int i, c;
2143
2144 mask = inst->Dst[0].Register.WriteMask;
2145 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
2146
2147 memset(src, 0, sizeof(src));
2148
2149 for (c = 0; c < 4; c++) {
2150 if ((mask & (1 << c)) && !pc->r_dst[c])
2151 dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
2152 else
2153 dst[c] = pc->r_dst[c];
2154 rdst[c] = dst[c];
2155 }
2156
2157 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2158 const struct tgsi_full_src_register *fs = &inst->Src[i];
2159 unsigned src_mask;
2160 boolean neg_supp;
2161
2162 src_mask = nv50_tgsi_src_mask(inst, i);
2163 neg_supp = negate_supported(inst, i);
2164
2165 if (fs->Register.File == TGSI_FILE_SAMPLER)
2166 unit = fs->Register.Index;
2167
2168 for (c = 0; c < 4; c++)
2169 if (src_mask & (1 << c))
2170 src[i][c] = tgsi_src(pc, c, fs, neg_supp);
2171 }
2172
2173 brdc = temp = pc->r_brdc;
2174 if (brdc && brdc->type != P_TEMP) {
2175 temp = temp_temp(pc);
2176 if (sat)
2177 brdc = temp;
2178 } else
2179 if (sat) {
2180 for (c = 0; c < 4; c++) {
2181 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
2182 continue;
2183 /* rdst[c] = dst[c]; */ /* done above */
2184 dst[c] = temp_temp(pc);
2185 }
2186 }
2187
2188 assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
2189
2190 switch (inst->Instruction.Opcode) {
2191 case TGSI_OPCODE_ABS:
2192 for (c = 0; c < 4; c++) {
2193 if (!(mask & (1 << c)))
2194 continue;
2195 emit_abs(pc, dst[c], src[0][c]);
2196 }
2197 break;
2198 case TGSI_OPCODE_ADD:
2199 for (c = 0; c < 4; c++) {
2200 if (!(mask & (1 << c)))
2201 continue;
2202 emit_add(pc, dst[c], src[0][c], src[1][c]);
2203 }
2204 break;
2205 case TGSI_OPCODE_AND:
2206 case TGSI_OPCODE_XOR:
2207 case TGSI_OPCODE_OR:
2208 for (c = 0; c < 4; c++) {
2209 if (!(mask & (1 << c)))
2210 continue;
2211 emit_bitop2(pc, dst[c], src[0][c], src[1][c],
2212 inst->Instruction.Opcode);
2213 }
2214 break;
2215 case TGSI_OPCODE_ARL:
2216 assert(src[0][0]);
2217 temp = temp_temp(pc);
2218 emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
2219 emit_arl(pc, dst[0], temp, 4);
2220 break;
2221 case TGSI_OPCODE_BGNLOOP:
2222 pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
2223 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
2224 terminate_mbb(pc);
2225 break;
2226 case TGSI_OPCODE_BRK:
2227 assert(pc->loop_lvl > 0);
2228 emit_break(pc, -1, 0);
2229 break;
2230 case TGSI_OPCODE_CEIL:
2231 for (c = 0; c < 4; c++) {
2232 if (!(mask & (1 << c)))
2233 continue;
2234 emit_cvt(pc, dst[c], src[0][c], -1,
2235 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
2236 }
2237 break;
2238 case TGSI_OPCODE_CMP:
2239 pc->allow32 = FALSE;
2240 for (c = 0; c < 4; c++) {
2241 if (!(mask & (1 << c)))
2242 continue;
2243 emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
2244 emit_mov(pc, dst[c], src[1][c]);
2245 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
2246 emit_mov(pc, dst[c], src[2][c]);
2247 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
2248 }
2249 break;
2250 case TGSI_OPCODE_COS:
2251 if (mask & 8) {
2252 emit_precossin(pc, temp, src[0][3]);
2253 emit_flop(pc, 5, dst[3], temp);
2254 if (!(mask &= 7))
2255 break;
2256 if (temp == dst[3])
2257 temp = brdc = temp_temp(pc);
2258 }
2259 emit_precossin(pc, temp, src[0][0]);
2260 emit_flop(pc, 5, brdc, temp);
2261 break;
2262 case TGSI_OPCODE_DDX:
2263 for (c = 0; c < 4; c++) {
2264 if (!(mask & (1 << c)))
2265 continue;
2266 emit_ddx(pc, dst[c], src[0][c]);
2267 }
2268 break;
2269 case TGSI_OPCODE_DDY:
2270 for (c = 0; c < 4; c++) {
2271 if (!(mask & (1 << c)))
2272 continue;
2273 emit_ddy(pc, dst[c], src[0][c]);
2274 }
2275 break;
2276 case TGSI_OPCODE_DP3:
2277 emit_mul(pc, temp, src[0][0], src[1][0]);
2278 emit_mad(pc, temp, src[0][1], src[1][1], temp);
2279 emit_mad(pc, brdc, src[0][2], src[1][2], temp);
2280 break;
2281 case TGSI_OPCODE_DP4:
2282 emit_mul(pc, temp, src[0][0], src[1][0]);
2283 emit_mad(pc, temp, src[0][1], src[1][1], temp);
2284 emit_mad(pc, temp, src[0][2], src[1][2], temp);
2285 emit_mad(pc, brdc, src[0][3], src[1][3], temp);
2286 break;
2287 case TGSI_OPCODE_DPH:
2288 emit_mul(pc, temp, src[0][0], src[1][0]);
2289 emit_mad(pc, temp, src[0][1], src[1][1], temp);
2290 emit_mad(pc, temp, src[0][2], src[1][2], temp);
2291 emit_add(pc, brdc, src[1][3], temp);
2292 break;
2293 case TGSI_OPCODE_DST:
2294 if (mask & (1 << 1))
2295 emit_mul(pc, dst[1], src[0][1], src[1][1]);
2296 if (mask & (1 << 2))
2297 emit_mov(pc, dst[2], src[0][2]);
2298 if (mask & (1 << 3))
2299 emit_mov(pc, dst[3], src[1][3]);
2300 if (mask & (1 << 0))
2301 emit_mov_immdval(pc, dst[0], 1.0f);
2302 break;
2303 case TGSI_OPCODE_ELSE:
2304 emit_branch(pc, -1, 0);
2305 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2306 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
2307 terminate_mbb(pc);
2308 break;
2309 case TGSI_OPCODE_ENDIF:
2310 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2311
2312 /* try to replace branch over 1 insn with a predicated insn */
2313 if (nv50_kill_branch(pc) == TRUE)
2314 break;
2315
2316 if (pc->if_join[pc->if_lvl]) {
2317 pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
2318 pc->if_join[pc->if_lvl] = NULL;
2319 }
2320 terminate_mbb(pc);
2321 /* emit a NOP as join point, we could set it on the next
2322 * one, but would have to make sure it is long and !immd
2323 */
2324 JOIN_ON(emit_nop(pc));
2325 break;
2326 case TGSI_OPCODE_ENDLOOP:
2327 emit_branch(pc, -1, 0)->param.index =
2328 pc->loop_pos[--pc->loop_lvl];
2329 pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
2330 terminate_mbb(pc);
2331 break;
2332 case TGSI_OPCODE_EX2:
2333 emit_preex2(pc, temp, src[0][0]);
2334 emit_flop(pc, 6, brdc, temp);
2335 break;
2336 case TGSI_OPCODE_FLR:
2337 for (c = 0; c < 4; c++) {
2338 if (!(mask & (1 << c)))
2339 continue;
2340 emit_flr(pc, dst[c], src[0][c]);
2341 }
2342 break;
2343 case TGSI_OPCODE_FRC:
2344 temp = temp_temp(pc);
2345 for (c = 0; c < 4; c++) {
2346 if (!(mask & (1 << c)))
2347 continue;
2348 emit_flr(pc, temp, src[0][c]);
2349 emit_sub(pc, dst[c], src[0][c], temp);
2350 }
2351 break;
2352 case TGSI_OPCODE_IF:
2353 assert(pc->if_lvl < NV50_MAX_COND_NESTING);
2354 emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
2355 CVT_F32_F32);
2356 pc->if_join[pc->if_lvl] = emit_joinat(pc);
2357 pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
2358 terminate_mbb(pc);
2359 break;
2360 case TGSI_OPCODE_KIL:
2361 assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
2362 emit_kil(pc, src[0][0]);
2363 emit_kil(pc, src[0][1]);
2364 emit_kil(pc, src[0][2]);
2365 emit_kil(pc, src[0][3]);
2366 break;
2367 case TGSI_OPCODE_KILP:
2368 emit_kil(pc, NULL);
2369 break;
2370 case TGSI_OPCODE_LIT:
2371 emit_lit(pc, &dst[0], mask, &src[0][0]);
2372 break;
2373 case TGSI_OPCODE_LG2:
2374 emit_flop(pc, 3, brdc, src[0][0]);
2375 break;
2376 case TGSI_OPCODE_LRP:
2377 temp = temp_temp(pc);
2378 for (c = 0; c < 4; c++) {
2379 if (!(mask & (1 << c)))
2380 continue;
2381 emit_sub(pc, temp, src[1][c], src[2][c]);
2382 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
2383 }
2384 break;
2385 case TGSI_OPCODE_MAD:
2386 for (c = 0; c < 4; c++) {
2387 if (!(mask & (1 << c)))
2388 continue;
2389 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
2390 }
2391 break;
2392 case TGSI_OPCODE_MAX:
2393 for (c = 0; c < 4; c++) {
2394 if (!(mask & (1 << c)))
2395 continue;
2396 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
2397 }
2398 break;
2399 case TGSI_OPCODE_MIN:
2400 for (c = 0; c < 4; c++) {
2401 if (!(mask & (1 << c)))
2402 continue;
2403 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
2404 }
2405 break;
2406 case TGSI_OPCODE_MOV:
2407 for (c = 0; c < 4; c++) {
2408 if (!(mask & (1 << c)))
2409 continue;
2410 emit_mov(pc, dst[c], src[0][c]);
2411 }
2412 break;
2413 case TGSI_OPCODE_MUL:
2414 for (c = 0; c < 4; c++) {
2415 if (!(mask & (1 << c)))
2416 continue;
2417 emit_mul(pc, dst[c], src[0][c], src[1][c]);
2418 }
2419 break;
2420 case TGSI_OPCODE_POW:
2421 emit_pow(pc, brdc, src[0][0], src[1][0]);
2422 break;
2423 case TGSI_OPCODE_RCP:
2424 emit_flop(pc, 0, brdc, src[0][0]);
2425 break;
2426 case TGSI_OPCODE_RET:
2427 if (pc->p->type == PIPE_SHADER_FRAGMENT)
2428 nv50_fp_move_results(pc);
2429 emit_ret(pc, -1, 0);
2430 break;
2431 case TGSI_OPCODE_RSQ:
2432 emit_flop(pc, 2, brdc, src[0][0]);
2433 break;
2434 case TGSI_OPCODE_SCS:
2435 temp = temp_temp(pc);
2436 if (mask & 3)
2437 emit_precossin(pc, temp, src[0][0]);
2438 if (mask & (1 << 0))
2439 emit_flop(pc, 5, dst[0], temp);
2440 if (mask & (1 << 1))
2441 emit_flop(pc, 4, dst[1], temp);
2442 if (mask & (1 << 2))
2443 emit_mov_immdval(pc, dst[2], 0.0);
2444 if (mask & (1 << 3))
2445 emit_mov_immdval(pc, dst[3], 1.0);
2446 break;
2447 case TGSI_OPCODE_SIN:
2448 if (mask & 8) {
2449 emit_precossin(pc, temp, src[0][3]);
2450 emit_flop(pc, 4, dst[3], temp);
2451 if (!(mask &= 7))
2452 break;
2453 if (temp == dst[3])
2454 temp = brdc = temp_temp(pc);
2455 }
2456 emit_precossin(pc, temp, src[0][0]);
2457 emit_flop(pc, 4, brdc, temp);
2458 break;
2459 case TGSI_OPCODE_SLT:
2460 case TGSI_OPCODE_SGE:
2461 case TGSI_OPCODE_SEQ:
2462 case TGSI_OPCODE_SGT:
2463 case TGSI_OPCODE_SLE:
2464 case TGSI_OPCODE_SNE:
2465 i = map_tgsi_setop_cc(inst->Instruction.Opcode);
2466 for (c = 0; c < 4; c++) {
2467 if (!(mask & (1 << c)))
2468 continue;
2469 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
2470 }
2471 break;
2472 case TGSI_OPCODE_SUB:
2473 for (c = 0; c < 4; c++) {
2474 if (!(mask & (1 << c)))
2475 continue;
2476 emit_sub(pc, dst[c], src[0][c], src[1][c]);
2477 }
2478 break;
2479 case TGSI_OPCODE_TEX:
2480 emit_tex(pc, dst, mask, src[0], unit,
2481 inst->Texture.Texture, FALSE, 0);
2482 break;
2483 case TGSI_OPCODE_TXB:
2484 emit_tex(pc, dst, mask, src[0], unit,
2485 inst->Texture.Texture, FALSE, -1);
2486 break;
2487 case TGSI_OPCODE_TXL:
2488 emit_tex(pc, dst, mask, src[0], unit,
2489 inst->Texture.Texture, FALSE, 1);
2490 break;
2491 case TGSI_OPCODE_TXP:
2492 emit_tex(pc, dst, mask, src[0], unit,
2493 inst->Texture.Texture, TRUE, 0);
2494 break;
2495 case TGSI_OPCODE_TRUNC:
2496 for (c = 0; c < 4; c++) {
2497 if (!(mask & (1 << c)))
2498 continue;
2499 emit_cvt(pc, dst[c], src[0][c], -1,
2500 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
2501 }
2502 break;
2503 case TGSI_OPCODE_XPD:
2504 temp = temp_temp(pc);
2505 if (mask & (1 << 0)) {
2506 emit_mul(pc, temp, src[0][2], src[1][1]);
2507 emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
2508 }
2509 if (mask & (1 << 1)) {
2510 emit_mul(pc, temp, src[0][0], src[1][2]);
2511 emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
2512 }
2513 if (mask & (1 << 2)) {
2514 emit_mul(pc, temp, src[0][1], src[1][0]);
2515 emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
2516 }
2517 if (mask & (1 << 3))
2518 emit_mov_immdval(pc, dst[3], 1.0);
2519 break;
2520 case TGSI_OPCODE_END:
2521 break;
2522 default:
2523 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
2524 return FALSE;
2525 }
2526
2527 if (brdc) {
2528 if (sat)
2529 emit_sat(pc, brdc, brdc);
2530 for (c = 0; c < 4; c++)
2531 if ((mask & (1 << c)) && dst[c] != brdc)
2532 emit_mov(pc, dst[c], brdc);
2533 } else
2534 if (sat) {
2535 for (c = 0; c < 4; c++) {
2536 if (!(mask & (1 << c)))
2537 continue;
2538 /* In this case we saturate later, and dst[c] won't
2539 * be another temp_temp (and thus lost), since rdst
2540 * already is TEMP (see above). */
2541 if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
2542 continue;
2543 emit_sat(pc, rdst[c], dst[c]);
2544 }
2545 }
2546
2547 kill_temp_temp(pc);
2548 pc->reg_instance_nr = 0;
2549
2550 return TRUE;
2551 }
2552
2553 static void
2554 prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2555 {
2556 struct nv50_reg *reg = NULL;
2557 const struct tgsi_full_src_register *src;
2558 const struct tgsi_dst_register *dst;
2559 unsigned i, c, k, mask;
2560
2561 dst = &insn->Dst[0].Register;
2562 mask = dst->WriteMask;
2563
2564 if (dst->File == TGSI_FILE_TEMPORARY)
2565 reg = pc->temp;
2566 else
2567 if (dst->File == TGSI_FILE_OUTPUT)
2568 reg = pc->result;
2569
2570 if (reg) {
2571 for (c = 0; c < 4; c++) {
2572 if (!(mask & (1 << c)))
2573 continue;
2574 reg[dst->Index * 4 + c].acc = pc->insn_nr;
2575 }
2576 }
2577
2578 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2579 src = &insn->Src[i];
2580
2581 if (src->Register.File == TGSI_FILE_TEMPORARY)
2582 reg = pc->temp;
2583 else
2584 if (src->Register.File == TGSI_FILE_INPUT)
2585 reg = pc->attr;
2586 else
2587 continue;
2588
2589 mask = nv50_tgsi_src_mask(insn, i);
2590
2591 for (c = 0; c < 4; c++) {
2592 if (!(mask & (1 << c)))
2593 continue;
2594 k = tgsi_util_get_full_src_register_swizzle(src, c);
2595
2596 reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
2597 }
2598 }
2599 }
2600
/* Choose an order in which to write the dst components so that, as far as
 * possible, no component is overwritten before a later-written component
 * has read it as a source.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of the dst[i] that require dst[c] as a source
 *
 * Returns a bitmask of the positions (indices into m[], NOT component
 * numbers) whose component is still needed as a source by a component
 * written later; those have to be written to a temporary first.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR-ed into below, so it must start out empty */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart; note that the loop increment still runs, so
		 * scanning resumes at position 1
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2645
2646 /* Select a suitable dst register for broadcasting scalar results,
2647 * or return NULL if we have to allocate an extra TEMP.
2648 *
2649 * If e.g. only 1 component is written, we may also emit the final
2650 * result to a write-only register.
2651 */
2652 static struct nv50_reg *
2653 tgsi_broadcast_dst(struct nv50_pc *pc,
2654 const struct tgsi_full_dst_register *fd, unsigned mask)
2655 {
2656 if (fd->Register.File == TGSI_FILE_TEMPORARY) {
2657 int c = ffs(~mask & fd->Register.WriteMask);
2658 if (c)
2659 return tgsi_dst(pc, c - 1, fd);
2660 } else {
2661 int c = ffs(fd->Register.WriteMask) - 1;
2662 if ((1 << c) == fd->Register.WriteMask)
2663 return tgsi_dst(pc, c, fd);
2664 }
2665
2666 return NULL;
2667 }
2668
2669 /* Scan source swizzles and return a bitmask indicating dst regs that
2670 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
2671 */
2672 static unsigned
2673 nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2674 unsigned rdep[4])
2675 {
2676 const struct tgsi_full_dst_register *fd = &insn->Dst[0];
2677 const struct tgsi_full_src_register *fs;
2678 unsigned i, deqs = 0;
2679
2680 for (i = 0; i < 4; ++i)
2681 rdep[i] = 0;
2682
2683 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2684 unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2685 boolean neg_supp = negate_supported(insn, i);
2686
2687 fs = &insn->Src[i];
2688 if (fs->Register.File != fd->Register.File ||
2689 fs->Register.Index != fd->Register.Index)
2690 continue;
2691
2692 for (chn = 0; chn < 4; ++chn) {
2693 unsigned s, c;
2694
2695 if (!(mask & (1 << chn))) /* src is not read */
2696 continue;
2697 c = tgsi_util_get_full_src_register_swizzle(fs, chn);
2698 s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2699
2700 if (!(fd->Register.WriteMask & (1 << c)))
2701 continue;
2702
2703 /* no danger if src is copied to TEMP first */
2704 if ((s != TGSI_UTIL_SIGN_KEEP) &&
2705 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2706 continue;
2707
2708 rdep[c] |= nv50_tgsi_dst_revdep(
2709 insn->Instruction.Opcode, i, chn);
2710 deqs |= (1 << c);
2711 }
2712 }
2713
2714 return deqs;
2715 }
2716
2717 static boolean
2718 nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2719 {
2720 struct tgsi_full_instruction insn = tok->FullInstruction;
2721 const struct tgsi_full_dst_register *fd;
2722 unsigned i, deqs, rdep[4], m[4];
2723
2724 fd = &tok->FullInstruction.Dst[0];
2725 deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2726
2727 if (is_scalar_op(insn.Instruction.Opcode)) {
2728 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2729 if (!pc->r_brdc)
2730 pc->r_brdc = temp_temp(pc);
2731 return nv50_program_tx_insn(pc, &insn);
2732 }
2733 pc->r_brdc = NULL;
2734
2735 if (!deqs)
2736 return nv50_program_tx_insn(pc, &insn);
2737
2738 deqs = nv50_revdep_reorder(m, rdep);
2739
2740 for (i = 0; i < 4; ++i) {
2741 assert(pc->r_dst[m[i]] == NULL);
2742
2743 insn.Dst[0].Register.WriteMask =
2744 fd->Register.WriteMask & (1 << m[i]);
2745
2746 if (!insn.Dst[0].Register.WriteMask)
2747 continue;
2748
2749 if (deqs & (1 << i))
2750 pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2751
2752 if (!nv50_program_tx_insn(pc, &insn))
2753 return FALSE;
2754 }
2755
2756 for (i = 0; i < 4; i++) {
2757 struct nv50_reg *reg = pc->r_dst[i];
2758 if (!reg)
2759 continue;
2760 pc->r_dst[i] = NULL;
2761
2762 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2763 emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2764 else
2765 emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2766 free_temp(pc, reg);
2767 }
2768
2769 return TRUE;
2770 }
2771
2772 static void
2773 load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2774 {
2775 struct nv50_reg *iv, **ppiv;
2776 unsigned mode = pc->interp_mode[reg->index];
2777
2778 ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2779 iv = *ppiv;
2780
2781 if ((mode & INTERP_PERSPECTIVE) && !iv) {
2782 iv = *ppiv = alloc_temp(pc, NULL);
2783 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2784
2785 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2786 emit_flop(pc, 0, iv, iv);
2787
2788 /* XXX: when loading interpolants dynamically, move these
2789 * to the program head, or make sure it can't be skipped.
2790 */
2791 }
2792
2793 emit_interp(pc, reg, iv, mode);
2794 }
2795
2796 /* The face input is always at v[255] (varying space), with a
2797 * value of 0 for back-facing, and 0xffffffff for front-facing.
2798 */
2799 static void
2800 load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
2801 {
2802 struct nv50_reg *one = alloc_immd(pc, 1.0f);
2803
2804 assert(a->rhw == -1);
2805 alloc_reg(pc, a); /* do this before rhw is set */
2806 a->rhw = 255;
2807 load_interpolant(pc, a);
2808 emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);
2809
2810 FREE(one);
2811 }
2812
/* Preparation pass over the program's TGSI tokens, run before any
 * instruction is translated.
 *
 * - immediates are handed to ctor_immd_4f32()
 * - declarations record special vertex outputs (back colour, point size)
 *   and the interpolation mode of fragment inputs
 * - every instruction bumps pc->insn_nr and is inspected by
 *   prep_inspect_insn(), which marks the register components it accesses
 *
 * Afterwards hardware ids are assigned to inputs and outputs depending on
 * the shader type, and the P_IMMD register array is created.
 *
 * Returns TRUE on success; FALSE if a declaration names an unhandled
 * register file or if allocating the immediate registers fails.
 */
2813 static boolean
2814 nv50_program_tx_prep(struct nv50_pc *pc)
2815 {
2816 struct tgsi_parse_context tp;
2817 struct nv50_program *p = pc->p;
2818 boolean ret = FALSE;
2819 unsigned i, c, flat_nr = 0;
2820
2821 tgsi_parse_init(&tp, pc->p->pipe.tokens);
2822 while (!tgsi_parse_end_of_tokens(&tp)) {
2823 const union tgsi_full_token *tok = &tp.FullToken;
2824
2825 tgsi_parse_token(&tp);
2826 switch (tok->Token.Type) {
2827 case TGSI_TOKEN_TYPE_IMMEDIATE:
2828 {
2829 const struct tgsi_full_immediate *imm =
2830 &tp.FullToken.FullImmediate;
2831
2832 ctor_immd_4f32(pc, imm->u[0].Float,
2833 imm->u[1].Float,
2834 imm->u[2].Float,
2835 imm->u[3].Float);
2836 }
2837 break;
2838 case TGSI_TOKEN_TYPE_DECLARATION:
2839 {
2840 const struct tgsi_full_declaration *d;
2841 unsigned si, last, first, mode;
2842
2843 d = &tp.FullToken.FullDeclaration;
2844 first = d->Range.First;
2845 last = d->Range.Last;
2846
2847 switch (d->Declaration.File) {
2848 case TGSI_FILE_TEMPORARY:
2849 break;
2850 case TGSI_FILE_OUTPUT:
/* only outputs with a semantic, and not those of fragment programs,
 * need special treatment here
 */
2851 if (!d->Declaration.Semantic ||
2852 p->type == PIPE_SHADER_FRAGMENT)
2853 break;
2854
2855 si = d->Semantic.Index;
2856 switch (d->Semantic.Name) {
2857 case TGSI_SEMANTIC_BCOLOR:
/* remember the TGSI output index for now; it is replaced by the
 * matching cfg.io entry once hw ids are known (see below).
 * cfg.io_nr is lowered to end before this output.
 */
2858 p->cfg.two_side[si].hw = first;
2859 if (p->cfg.io_nr > first)
2860 p->cfg.io_nr = first;
2861 break;
2862 case TGSI_SEMANTIC_PSIZE:
/* same scheme as for back colours */
2863 p->cfg.psiz = first;
2864 if (p->cfg.io_nr > first)
2865 p->cfg.io_nr = first;
2866 break;
2867 /*
2868 case TGSI_SEMANTIC_CLIP_DISTANCE:
2869 p->cfg.clpd = MIN2(p->cfg.clpd, first);
2870 break;
2871 */
2872 default:
2873 break;
2874 }
2875 break;
2876 case TGSI_FILE_INPUT:
2877 {
/* interpolation modes are only recorded for fragment programs */
2878 if (p->type != PIPE_SHADER_FRAGMENT)
2879 break;
2880
2881 switch (d->Declaration.Interpolate) {
2882 case TGSI_INTERPOLATE_CONSTANT:
2883 mode = INTERP_FLAT;
2884 flat_nr++;
2885 break;
2886 case TGSI_INTERPOLATE_PERSPECTIVE:
2887 mode = INTERP_PERSPECTIVE;
/* bit 3 of the top byte of regs[1]; that byte is used further down
 * as the w-coordinate mask
 */
2888 p->cfg.regs[1] |= 0x08 << 24;
2889 break;
2890 default:
2891 mode = INTERP_LINEAR;
2892 break;
2893 }
2894 if (d->Declaration.Centroid)
2895 mode |= INTERP_CENTROID;
2896
2897 assert(last < 32);
2898 for (i = first; i <= last; i++)
2899 pc->interp_mode[i] = mode;
2900 }
2901 break;
2902 case TGSI_FILE_ADDRESS:
2903 case TGSI_FILE_CONSTANT:
2904 case TGSI_FILE_SAMPLER:
2905 break;
2906 default:
2907 NOUVEAU_ERR("bad decl file %d\n",
2908 d->Declaration.File);
2909 goto out_err;
2910 }
2911 }
2912 break;
2913 case TGSI_TOKEN_TYPE_INSTRUCTION:
2914 pc->insn_nr++;
2915 prep_inspect_insn(pc, &tok->FullInstruction);
2916 break;
2917 default:
2918 break;
2919 }
2920 }
2921
2922 if (p->type == PIPE_SHADER_VERTEX) {
2923 int rid = 0;
2924
/* accessed attribute components get consecutive hw ids and have their
 * bit set in the cfg.attr[] bitmask
 */
2925 for (i = 0; i < pc->attr_nr * 4; ++i) {
2926 if (pc->attr[i].acc) {
2927 pc->attr[i].hw = rid++;
2928 p->cfg.attr[i / 32] |= 1 << (i % 32);
2929 }
2930 }
2931
/* same for results, numbering restarts at 0; cfg.io[i].hw is the hw id
 * the output's first accessed component receives
 */
2932 for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2933 p->cfg.io[i].hw = rid;
2934 p->cfg.io[i].id = i;
2935
2936 for (c = 0; c < 4; ++c) {
2937 int n = i * 4 + c;
2938 if (!pc->result[n].acc)
2939 continue;
2940 pc->result[n].hw = rid++;
2941 p->cfg.io[i].mask |= 1 << c;
2942 }
2943 }
2944
/* values below 0x40 are TGSI output indices stored while parsing the
 * declarations; turn them into the final io entry / hw id
 */
2945 for (c = 0; c < 2; ++c)
2946 if (p->cfg.two_side[c].hw < 0x40)
2947 p->cfg.two_side[c] = p->cfg.io[
2948 p->cfg.two_side[c].hw];
2949
2950 if (p->cfg.psiz < 0x40)
2951 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2952 } else
2953 if (p->type == PIPE_SHADER_FRAGMENT) {
2954 int rid, aid;
/* n indexes the non-flat inputs from 0, m the flat ones behind them */
2955 unsigned n = 0, m = pc->attr_nr - flat_nr;
2956
2957 pc->allow32 = TRUE;
2958
/* base is 0 if input 0 is the position, 1 otherwise */
2959 int base = (TGSI_SEMANTIC_POSITION ==
2960 p->info.input_semantic_name[0]) ? 0 : 1;
2961
2962 /* non-flat interpolants have to be mapped to
2963 * the lower hardware IDs, so sort them:
2964 */
2965 for (i = 0; i < pc->attr_nr; i++) {
2966 if (pc->interp_mode[i] == INTERP_FLAT)
2967 p->cfg.io[m++].id = i;
2968 else {
2969 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2970 p->cfg.io[n].linear = TRUE;
2971 p->cfg.io[n++].id = i;
2972 }
2973 }
2974
2975 if (!base) /* set w-coordinate mask from perspective interp */
2976 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2977
2978 aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2979 base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2980
/* walk the inputs in sorted order; each accessed component gets the
 * next id in rhw and is interpolated right away
 */
2981 for (n = 0; n < pc->attr_nr; ++n) {
2982 p->cfg.io[n].hw = rid = aid;
2983 i = p->cfg.io[n].id;
2984
2985 if (p->info.input_semantic_name[n] ==
2986 TGSI_SEMANTIC_FACE) {
2987 load_frontfacing(pc, &pc->attr[i * 4]);
2988 continue;
2989 }
2990
2991 for (c = 0; c < 4; ++c) {
2992 if (!pc->attr[i * 4 + c].acc)
2993 continue;
2994 pc->attr[i * 4 + c].rhw = rid++;
2995 p->cfg.io[n].mask |= 1 << c;
2996
2997 load_interpolant(pc, &pc->attr[i * 4 + c]);
2998 }
2999 aid += popcnt4(p->cfg.io[n].mask);
3000 }
3001
3002 if (!base)
3003 p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
3004
3005 m = popcnt4(p->cfg.regs[1] >> 24);
3006
3007 /* set count of non-position inputs and of non-flat
3008 * non-position inputs for FP_INTERPOLANT_CTRL
3009 */
3010 p->cfg.regs[1] |= aid - m;
3011
3012 if (flat_nr) {
3013 i = p->cfg.io[pc->attr_nr - flat_nr].hw;
3014 p->cfg.regs[1] |= (i - m) << 16;
3015 } else
3016 p->cfg.regs[1] |= p->cfg.regs[1] << 16;
3017
3018 /* mark color semantic for light-twoside */
3019 n = 0x40;
3020 for (i = 0; i < pc->attr_nr; i++) {
3021 ubyte si, sn;
3022
3023 sn = p->info.input_semantic_name[p->cfg.io[i].id];
3024 si = p->info.input_semantic_index[p->cfg.io[i].id];
3025
3026 if (sn == TGSI_SEMANTIC_COLOR) {
3027 p->cfg.two_side[si] = p->cfg.io[i];
3028
3029 /* increase colour count */
3030 p->cfg.regs[0] += popcnt4(
3031 p->cfg.two_side[si].mask) << 16;
3032
3033 n = MIN2(n, p->cfg.io[i].hw - m);
3034 }
3035 }
/* n is still 0x40 if no colour input was found */
3036 if (n < 0x40)
3037 p->cfg.regs[0] += n;
3038
3039 /* Initialize FP results:
3040 * FragDepth is always first TGSI and last hw output
3041 */
3042 i = p->info.writes_z ? 4 : 0;
3043 for (rid = 0; i < pc->result_nr * 4; i++)
3044 pc->result[i].rhw = rid++;
3045 if (p->info.writes_z)
3046 pc->result[2].rhw = rid;
3047
3048 p->cfg.high_result = rid;
3049
3050 /* separate/different colour results for MRTs ? */
3051 if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
3052 p->cfg.regs[2] |= 1;
3053 }
3054
/* one P_IMMD register per component of every immediate */
3055 if (pc->immd_nr) {
3056 int rid = 0;
3057
3058 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
3059 if (!pc->immd)
3060 goto out_err;
3061
3062 for (i = 0; i < pc->immd_nr; i++) {
3063 for (c = 0; c < 4; c++, rid++)
3064 ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
3065 }
3066 }
3067
3068 ret = TRUE;
/* the success path falls through here as well: the interpolation helper
 * temps are released in either case
 */
3069 out_err:
3070 if (pc->iv_p)
3071 free_temp(pc, pc->iv_p);
3072 if (pc->iv_c)
3073 free_temp(pc, pc->iv_c);
3074
3075 tgsi_parse_free(&tp);
3076 return ret;
3077 }
3078
3079 static void
3080 free_nv50_pc(struct nv50_pc *pc)
3081 {
3082 if (pc->immd)
3083 FREE(pc->immd);
3084 if (pc->param)
3085 FREE(pc->param);
3086 if (pc->result)
3087 FREE(pc->result);
3088 if (pc->attr)
3089 FREE(pc->attr);
3090 if (pc->temp)
3091 FREE(pc->temp);
3092
3093 FREE(pc);
3094 }
3095
3096 static boolean
3097 ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
3098 {
3099 int i, c;
3100 unsigned rtype[2] = { P_ATTR, P_RESULT };
3101
3102 pc->p = p;
3103 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
3104 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
3105 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
3106 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
3107 pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
3108 assert(pc->addr_nr <= 2);
3109
3110 p->cfg.high_temp = 4;
3111
3112 p->cfg.two_side[0].hw = 0x40;
3113 p->cfg.two_side[1].hw = 0x40;
3114
3115 switch (p->type) {
3116 case PIPE_SHADER_VERTEX:
3117 p->cfg.psiz = 0x40;
3118 p->cfg.clpd = 0x40;
3119 p->cfg.io_nr = pc->result_nr;
3120 break;
3121 case PIPE_SHADER_FRAGMENT:
3122 rtype[0] = rtype[1] = P_TEMP;
3123
3124 p->cfg.regs[0] = 0x01000004;
3125 p->cfg.io_nr = pc->attr_nr;
3126
3127 if (p->info.writes_z) {
3128 p->cfg.regs[2] |= 0x00000100;
3129 p->cfg.regs[3] |= 0x00000011;
3130 }
3131 if (p->info.uses_kill)
3132 p->cfg.regs[2] |= 0x00100000;
3133 break;
3134 }
3135
3136 if (pc->temp_nr) {
3137 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
3138 if (!pc->temp)
3139 return FALSE;
3140
3141 for (i = 0; i < pc->temp_nr * 4; ++i)
3142 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
3143 }
3144
3145 if (pc->attr_nr) {
3146 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
3147 if (!pc->attr)
3148 return FALSE;
3149
3150 for (i = 0; i < pc->attr_nr * 4; ++i)
3151 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
3152 }
3153
3154 if (pc->result_nr) {
3155 unsigned nr = pc->result_nr * 4;
3156
3157 pc->result = MALLOC(nr * sizeof(struct nv50_reg));
3158 if (!pc->result)
3159 return FALSE;
3160
3161 for (i = 0; i < nr; ++i)
3162 ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
3163 }
3164
3165 if (pc->param_nr) {
3166 int rid = 0;
3167
3168 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
3169 if (!pc->param)
3170 return FALSE;
3171
3172 for (i = 0; i < pc->param_nr; ++i)
3173 for (c = 0; c < 4; ++c, ++rid)
3174 ctor_reg(&pc->param[rid], P_CONST, i, rid);
3175 }
3176
3177 if (pc->addr_nr) {
3178 pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
3179 if (!pc->addr)
3180 return FALSE;
3181 }
3182 for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
3183 ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
3184
3185 return TRUE;
3186 }
3187
3188 static void
3189 nv50_program_fixup_insns(struct nv50_pc *pc)
3190 {
3191 struct nv50_program_exec *e, **bra_list;
3192 unsigned i, n, pos;
3193
3194 bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
3195
3196 /* Collect branch instructions, we need to adjust their offsets
3197 * when converting 32 bit instructions to 64 bit ones
3198 */
3199 for (n = 0, e = pc->p->exec_head; e; e = e->next)
3200 if (e->param.index >= 0 && !e->param.mask)
3201 bra_list[n++] = e;
3202
3203 /* last instruction must be long so it can have the exit bit set */
3204 if (!is_long(pc->p->exec_tail))
3205 convert_to_long(pc, pc->p->exec_tail);
3206 /* set exit bit */
3207 pc->p->exec_tail->inst[1] |= 1;
3208
3209 /* !immd on exit insn simultaneously means !join */
3210 assert(!is_immd(pc->p->exec_head));
3211 assert(!is_immd(pc->p->exec_tail));
3212
3213 /* Make sure we don't have any single 32 bit instructions. */
3214 for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
3215 pos += is_long(e) ? 2 : 1;
3216
3217 if ((pos & 1) && (!e->next || is_long(e->next))) {
3218 for (i = 0; i < n; ++i)
3219 if (bra_list[i]->param.index >= pos)
3220 bra_list[i]->param.index += 1;
3221 convert_to_long(pc, e);
3222 ++pos;
3223 }
3224 }
3225
3226 FREE(bra_list);
3227 }
3228
3229 static boolean
3230 nv50_program_tx(struct nv50_program *p)
3231 {
3232 struct tgsi_parse_context parse;
3233 struct nv50_pc *pc;
3234 boolean ret;
3235
3236 pc = CALLOC_STRUCT(nv50_pc);
3237 if (!pc)
3238 return FALSE;
3239
3240 ret = ctor_nv50_pc(pc, p);
3241 if (ret == FALSE)
3242 goto out_cleanup;
3243
3244 ret = nv50_program_tx_prep(pc);
3245 if (ret == FALSE)
3246 goto out_cleanup;
3247
3248 tgsi_parse_init(&parse, pc->p->pipe.tokens);
3249 while (!tgsi_parse_end_of_tokens(&parse)) {
3250 const union tgsi_full_token *tok = &parse.FullToken;
3251
3252 /* don't allow half insn/immd on first and last instruction */
3253 pc->allow32 = TRUE;
3254 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
3255 pc->allow32 = FALSE;
3256
3257 tgsi_parse_token(&parse);
3258
3259 switch (tok->Token.Type) {
3260 case TGSI_TOKEN_TYPE_INSTRUCTION:
3261 ++pc->insn_cur;
3262 ret = nv50_tgsi_insn(pc, tok);
3263 if (ret == FALSE)
3264 goto out_err;
3265 break;
3266 default:
3267 break;
3268 }
3269 }
3270
3271 if (pc->p->type == PIPE_SHADER_FRAGMENT)
3272 nv50_fp_move_results(pc);
3273
3274 nv50_program_fixup_insns(pc);
3275
3276 p->param_nr = pc->param_nr * 4;
3277 p->immd_nr = pc->immd_nr * 4;
3278 p->immd = pc->immd_buf;
3279
3280 out_err:
3281 tgsi_parse_free(&parse);
3282
3283 out_cleanup:
3284 free_nv50_pc(pc);
3285 return ret;
3286 }
3287
3288 static void
3289 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
3290 {
3291 if (nv50_program_tx(p) == FALSE)
3292 assert(0);
3293 p->translated = TRUE;
3294 }
3295
3296 static void
3297 nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
3298 unsigned start, unsigned count, unsigned cbuf)
3299 {
3300 struct nouveau_channel *chan = nv50->screen->base.channel;
3301 struct nouveau_grobj *tesla = nv50->screen->tesla;
3302
3303 while (count) {
3304 unsigned nr = count > 2047 ? 2047 : count;
3305
3306 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3307 OUT_RING (chan, (cbuf << 0) | (start << 8));
3308 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3309 OUT_RINGp (chan, map, nr);
3310
3311 map += nr;
3312 start += nr;
3313 count -= nr;
3314 }
3315 }
3316
3317 static void
3318 nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
3319 {
3320 struct pipe_screen *pscreen = nv50->pipe.screen;
3321
3322 if (!p->data[0] && p->immd_nr) {
3323 struct nouveau_resource *heap = nv50->screen->immd_heap[0];
3324
3325 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
3326 while (heap->next && heap->size < p->immd_nr) {
3327 struct nv50_program *evict = heap->next->priv;
3328 nouveau_resource_free(&evict->data[0]);
3329 }
3330
3331 if (nouveau_resource_alloc(heap, p->immd_nr, p,
3332 &p->data[0]))
3333 assert(0);
3334 }
3335
3336 /* immediates only need to be uploaded again when freed */
3337 nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
3338 p->immd_nr, NV50_CB_PMISC);
3339 }
3340
3341 assert(p->param_nr <= 512);
3342
3343 if (p->param_nr) {
3344 unsigned cb;
3345 uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
3346 PIPE_BUFFER_USAGE_CPU_READ);
3347
3348 if (p->type == PIPE_SHADER_VERTEX)
3349 cb = NV50_CB_PVP;
3350 else
3351 cb = NV50_CB_PFP;
3352
3353 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
3354 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
3355 }
3356 }
3357
3358 static void
3359 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
3360 {
3361 struct nouveau_channel *chan = nv50->screen->base.channel;
3362 struct nv50_program_exec *e;
3363 uint32_t *up, i;
3364 boolean upload = FALSE;
3365
3366 if (!p->bo) {
3367 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
3368 p->exec_size * 4, &p->bo);
3369 upload = TRUE;
3370 }
3371
3372 if (p->data[0] && p->data[0]->start != p->data_start[0])
3373 upload = TRUE;
3374
3375 if (!upload)
3376 return;
3377
3378 up = MALLOC(p->exec_size * 4);
3379
3380 for (i = 0, e = p->exec_head; e; e = e->next) {
3381 unsigned ei, ci, bs;
3382
3383 if (e->param.index >= 0 && e->param.mask) {
3384 bs = (e->inst[1] >> 22) & 0x07;
3385 assert(bs < 2);
3386 ei = e->param.shift >> 5;
3387 ci = e->param.index;
3388 if (bs == 0)
3389 ci += p->data[bs]->start;
3390
3391 e->inst[ei] &= ~e->param.mask;
3392 e->inst[ei] |= (ci << e->param.shift);
3393 } else
3394 if (e->param.index >= 0) {
3395 /* zero mask means param is a jump/branch offset */
3396 assert(!(e->param.index & 1));
3397 /* seem to be 8 byte steps */
3398 ei = (e->param.index >> 1) + 0 /* START_ID */;
3399
3400 e->inst[0] &= 0xf0000fff;
3401 e->inst[0] |= ei << 12;
3402 }
3403
3404 up[i++] = e->inst[0];
3405 if (is_long(e))
3406 up[i++] = e->inst[1];
3407 }
3408 assert(i == p->exec_size);
3409
3410 if (p->data[0])
3411 p->data_start[0] = p->data[0]->start;
3412
3413 #ifdef NV50_PROGRAM_DUMP
3414 NOUVEAU_ERR("-------\n");
3415 for (e = p->exec_head; e; e = e->next) {
3416 NOUVEAU_ERR("0x%08x\n", e->inst[0]);
3417 if (is_long(e))
3418 NOUVEAU_ERR("0x%08x\n", e->inst[1]);
3419 }
3420 #endif
3421 nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
3422 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
3423 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
3424 0, 0, p->exec_size * 4, 1, 1);
3425
3426 FREE(up);
3427 }
3428
3429 void
3430 nv50_vertprog_validate(struct nv50_context *nv50)
3431 {
3432 struct nouveau_grobj *tesla = nv50->screen->tesla;
3433 struct nv50_program *p = nv50->vertprog;
3434 struct nouveau_stateobj *so;
3435
3436 if (!p->translated) {
3437 nv50_program_validate(nv50, p);
3438 if (!p->translated)
3439 assert(0);
3440 }
3441
3442 nv50_program_validate_data(nv50, p);
3443 nv50_program_validate_code(nv50, p);
3444
3445 so = so_new(13, 2);
3446 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
3447 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3448 NOUVEAU_BO_HIGH, 0, 0);
3449 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3450 NOUVEAU_BO_LOW, 0, 0);
3451 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
3452 so_data (so, p->cfg.attr[0]);
3453 so_data (so, p->cfg.attr[1]);
3454 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
3455 so_data (so, p->cfg.high_result);
3456 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
3457 so_data (so, p->cfg.high_result); //8);
3458 so_data (so, p->cfg.high_temp);
3459 so_method(so, tesla, NV50TCL_VP_START_ID, 1);
3460 so_data (so, 0); /* program start offset */
3461 so_ref(so, &nv50->state.vertprog);
3462 so_ref(NULL, &so);
3463 }
3464
3465 void
3466 nv50_fragprog_validate(struct nv50_context *nv50)
3467 {
3468 struct nouveau_grobj *tesla = nv50->screen->tesla;
3469 struct nv50_program *p = nv50->fragprog;
3470 struct nouveau_stateobj *so;
3471
3472 if (!p->translated) {
3473 nv50_program_validate(nv50, p);
3474 if (!p->translated)
3475 assert(0);
3476 }
3477
3478 nv50_program_validate_data(nv50, p);
3479 nv50_program_validate_code(nv50, p);
3480
3481 so = so_new(64, 2);
3482 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
3483 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3484 NOUVEAU_BO_HIGH, 0, 0);
3485 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3486 NOUVEAU_BO_LOW, 0, 0);
3487 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
3488 so_data (so, p->cfg.high_temp);
3489 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
3490 so_data (so, p->cfg.high_result);
3491 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
3492 so_data (so, p->cfg.regs[2]);
3493 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
3494 so_data (so, p->cfg.regs[3]);
3495 so_method(so, tesla, NV50TCL_FP_START_ID, 1);
3496 so_data (so, 0); /* program start offset */
3497 so_ref(so, &nv50->state.fragprog);
3498 so_ref(NULL, &so);
3499 }
3500
3501 static void
3502 nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
3503 {
3504 struct nv50_program *fp = nv50->fragprog;
3505 struct nv50_program *vp = nv50->vertprog;
3506 unsigned i, c, m = base;
3507
3508 /* XXX: this might not work correctly in all cases yet - we'll
3509 * just assume that an FP generic input that is not written in
3510 * the VP is PointCoord.
3511 */
3512 memset(pntc, 0, 8 * sizeof(uint32_t));
3513
3514 for (i = 0; i < fp->cfg.io_nr; i++) {
3515 uint8_t sn, si;
3516 uint8_t j, k = fp->cfg.io[i].id;
3517 unsigned n = popcnt4(fp->cfg.io[i].mask);
3518
3519 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
3520 m += n;
3521 continue;
3522 }
3523
3524 for (j = 0; j < vp->info.num_outputs; ++j) {
3525 sn = vp->info.output_semantic_name[j];
3526 si = vp->info.output_semantic_index[j];
3527
3528 if (sn == fp->info.input_semantic_name[k] &&
3529 si == fp->info.input_semantic_index[k])
3530 break;
3531 }
3532
3533 if (j < vp->info.num_outputs) {
3534 ubyte mode =
3535 nv50->rasterizer->pipe.sprite_coord_mode[si];
3536
3537 if (mode == PIPE_SPRITE_COORD_NONE) {
3538 m += n;
3539 continue;
3540 }
3541 }
3542
3543 /* this is either PointCoord or replaced by sprite coords */
3544 for (c = 0; c < 4; c++) {
3545 if (!(fp->cfg.io[i].mask & (1 << c)))
3546 continue;
3547 pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
3548 ++m;
3549 }
3550 }
3551 }
3552
3553 static int
3554 nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3555 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3556 {
3557 int c;
3558 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3559 uint8_t *map = (uint8_t *)p_map;
3560
3561 for (c = 0; c < 4; ++c) {
3562 if (mf & 1) {
3563 if (fpi->linear == TRUE)
3564 lin[mid / 32] |= 1 << (mid % 32);
3565 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3566 }
3567
3568 oid += mv & 1;
3569 mf >>= 1;
3570 mv >>= 1;
3571 }
3572
3573 return mid;
3574 }
3575
/* Build the state object that links the vertex program's outputs to the
 * fragment program's inputs and store it in nv50->state.programs.
 *
 * It contains the VP result map (one byte per entry, filled through
 * nv50_sreg4_map()), the four semantic map registers, the interpolant
 * control word, the bitmask of linearly interpolated entries and, if
 * point sprites are enabled, the point coord replacement map.
 */
3576 void
3577 nv50_linkage_validate(struct nv50_context *nv50)
3578 {
3579 struct nouveau_grobj *tesla = nv50->screen->tesla;
3580 struct nv50_program *vp = nv50->vertprog;
3581 struct nv50_program *fp = nv50->fragprog;
3582 struct nouveau_stateobj *so;
3583 struct nv50_sreg4 dummy, *vpo;
/* m is the number of map entries used so far */
3584 int i, n, c, m = 0;
3585 uint32_t map[16], lin[4], reg[5], pcrd[8];
3586
3587 memset(map, 0, sizeof(map));
3588 memset(lin, 0, sizeof(lin));
3589
3590 reg[1] = 0x00000004; /* low and high clip distance map ids */
3591 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3592 reg[3] = 0x00000000; /* point size map id & enable */
3593 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3594 reg[4] = fp->cfg.regs[1]; /* interpolant info */
3595
/* here dummy plays the role of the FP input: all four components of
 * the first VP output are requested
 */
3596 dummy.linear = FALSE;
3597 dummy.mask = 0xf; /* map all components of HPOS */
3598 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3599
/* From now on dummy stands in for a VP output that does not exist: with
 * an empty mask nv50_sreg4_map() writes 0x40/0x41 for each requested
 * component.
 * NOTE(review): dummy.hw is never assigned; nv50_sreg4_map() reads it
 * but only stores it for components present in the mask.
 */
3600 dummy.mask = 0x0;
3601
3602 if (vp->cfg.clpd < 0x40) {
3603 for (c = 0; c < vp->cfg.clpd_nr; ++c)
3604 map[m++] = vp->cfg.clpd + c;
3605 reg[1] = (m << 8);
3606 }
3607
3608 reg[0] |= m << 8; /* adjust BFC0 id */
3609
3610 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3611 if (nv50->rasterizer->pipe.light_twoside) {
3612 vpo = &vp->cfg.two_side[0];
3613
3614 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3615 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3616 }
3617
3618 reg[0] += m - 4; /* adjust FFC0 id */
3619 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3620
3621 for (i = 0; i < fp->cfg.io_nr; i++) {
3622 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id];
3623 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id];
3624
3625 /* position must be mapped first */
3626 assert(i == 0 || sn != TGSI_SEMANTIC_POSITION);
3627
3628 /* maybe even remove these from cfg.io */
3629 if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE)
3630 continue;
3631
3632 /* VP outputs and vp->cfg.io are in the same order */
3633 for (n = 0; n < vp->info.num_outputs; ++n) {
3634 if (vp->info.output_semantic_name[n] == sn &&
3635 vp->info.output_semantic_index[n] == si)
3636 break;
3637 }
/* inputs without a matching VP output are mapped against dummy */
3638 vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy;
3639
3640 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3641 }
3642
/* the point size gets one more map entry; its entry number and an
 * enable bit go into reg[3]
 */
3643 if (nv50->rasterizer->pipe.point_size_per_vertex) {
3644 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3645 reg[3] = (m++ << 4) | 1;
3646 }
3647
3648 /* now fill the stateobj */
3649 so = so_new(64, 0);
3650
/* four byte-sized entries per word, so round up to whole words */
3651 n = (m + 3) / 4;
3652 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3653 so_data (so, m);
3654 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3655 so_datap (so, map, n);
3656
/* reg[0..3] go to the semantic map, reg[4] is sent separately below */
3657 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3658 so_datap (so, reg, 4);
3659
3660 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3661 so_data (so, reg[4]);
3662
3663 so_method(so, tesla, 0x1540, 4);
3664 so_datap (so, lin, 4);
3665
3666 if (nv50->rasterizer->pipe.point_sprite) {
/* bits 8..15 of reg[4] hold the entry where the FP inputs start */
3667 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3668
3669 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3670 so_datap (so, pcrd, 8);
3671 }
3672
/* store the new state object and drop the local reference */
3673 so_ref(so, &nv50->state.programs);
3674 so_ref(NULL, &so);
3675 }
3676
3677 void
3678 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3679 {
3680 while (p->exec_head) {
3681 struct nv50_program_exec *e = p->exec_head;
3682
3683 p->exec_head = e->next;
3684 FREE(e);
3685 }
3686 p->exec_tail = NULL;
3687 p->exec_size = 0;
3688
3689 nouveau_bo_ref(NULL, &p->bo);
3690
3691 nouveau_resource_free(&p->data[0]);
3692
3693 p->translated = 0;
3694 }