Merge remote branch 'upstream/gallium-0.2' into nouveau-gallium-0.2
[mesa.git] / src / gallium / drivers / nv40 / nv40_fragprog.c
1 #include "pipe/p_context.h"
2 #include "pipe/p_defines.h"
3 #include "pipe/p_state.h"
4
5 #include "pipe/p_shader_tokens.h"
6 #include "tgsi/tgsi_parse.h"
7 #include "tgsi/tgsi_util.h"
8
9 #include "nv40_context.h"
10
11 #define SWZ_X 0
12 #define SWZ_Y 1
13 #define SWZ_Z 2
14 #define SWZ_W 3
15 #define MASK_X 1
16 #define MASK_Y 2
17 #define MASK_Z 4
18 #define MASK_W 8
19 #define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
20 #define DEF_SCALE NV40_FP_OP_DST_SCALE_1X
21 #define DEF_CTEST NV40_FP_OP_COND_TR
22 #include "nv40_shader.h"
23
24 #define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
25 #define neg(s) nv40_sr_neg((s))
26 #define abs(s) nv40_sr_abs((s))
27 #define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v)
28
29 #define MAX_CONSTS 128
30 #define MAX_IMM 32
31 struct nv40_fpc {
32 struct nv40_fragment_program *fp;
33
34 uint attrib_map[PIPE_MAX_SHADER_INPUTS];
35
36 unsigned r_temps;
37 unsigned r_temps_discard;
38 struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
39 struct nv40_sreg *r_temp;
40
41 int num_regs;
42
43 unsigned inst_offset;
44 unsigned have_const;
45
46 struct {
47 int pipe;
48 float vals[4];
49 } consts[MAX_CONSTS];
50 int nr_consts;
51
52 struct nv40_sreg imm[MAX_IMM];
53 unsigned nr_imm;
54 };
55
56 static INLINE struct nv40_sreg
57 temp(struct nv40_fpc *fpc)
58 {
59 int idx = ffs(~fpc->r_temps) - 1;
60
61 if (idx < 0) {
62 NOUVEAU_ERR("out of temps!!\n");
63 assert(0);
64 return nv40_sr(NV40SR_TEMP, 0);
65 }
66
67 fpc->r_temps |= (1 << idx);
68 fpc->r_temps_discard |= (1 << idx);
69 return nv40_sr(NV40SR_TEMP, idx);
70 }
71
72 static INLINE void
73 release_temps(struct nv40_fpc *fpc)
74 {
75 fpc->r_temps &= ~fpc->r_temps_discard;
76 fpc->r_temps_discard = 0;
77 }
78
79 static INLINE struct nv40_sreg
80 constant(struct nv40_fpc *fpc, int pipe, float vals[4])
81 {
82 int idx;
83
84 if (fpc->nr_consts == MAX_CONSTS)
85 assert(0);
86 idx = fpc->nr_consts++;
87
88 fpc->consts[idx].pipe = pipe;
89 if (pipe == -1)
90 memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
91 return nv40_sr(NV40SR_CONST, idx);
92 }
93
/* Emit an arithmetic instruction; op is the NV40_FP_OP_OPCODE_ suffix. */
#define arith(pc, sat, op, dst, mask, a, b, c)                  \
	nv40_fp_arith((pc), (sat), NV40_FP_OP_OPCODE_##op,      \
		      (dst), (mask), (a), (b), (c))
/*
 * Emit a texture instruction on sampler unit u.  Only the first source
 * is forwarded; the other two are replaced by the caller's local "none".
 */
#define tex(pc, sat, op, u, dst, mask, a, b, c)                 \
	nv40_fp_tex((pc), (sat), NV40_FP_OP_OPCODE_##op, (u),   \
		    (dst), (mask), (a), none, none)
100
101 static void
102 grow_insns(struct nv40_fpc *fpc, int size)
103 {
104 struct nv40_fragment_program *fp = fpc->fp;
105
106 fp->insn_len += size;
107 fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
108 }
109
110 static void
111 emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src)
112 {
113 struct nv40_fragment_program *fp = fpc->fp;
114 uint32_t *hw = &fp->insn[fpc->inst_offset];
115 uint32_t sr = 0;
116
117 switch (src.type) {
118 case NV40SR_INPUT:
119 sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
120 hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT);
121 break;
122 case NV40SR_OUTPUT:
123 sr |= NV40_FP_REG_SRC_HALF;
124 /* fall-through */
125 case NV40SR_TEMP:
126 sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT);
127 sr |= (src.index << NV40_FP_REG_SRC_SHIFT);
128 break;
129 case NV40SR_CONST:
130 if (!fpc->have_const) {
131 grow_insns(fpc, 4);
132 fpc->have_const = 1;
133 }
134
135 hw = &fp->insn[fpc->inst_offset];
136 if (fpc->consts[src.index].pipe >= 0) {
137 struct nv40_fragment_program_data *fpd;
138
139 fp->consts = realloc(fp->consts, ++fp->nr_consts *
140 sizeof(*fpd));
141 fpd = &fp->consts[fp->nr_consts - 1];
142 fpd->offset = fpc->inst_offset + 4;
143 fpd->index = fpc->consts[src.index].pipe;
144 memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
145 } else {
146 memcpy(&fp->insn[fpc->inst_offset + 4],
147 fpc->consts[src.index].vals,
148 sizeof(uint32_t) * 4);
149 }
150
151 sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);
152 break;
153 case NV40SR_NONE:
154 sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
155 break;
156 default:
157 assert(0);
158 }
159
160 if (src.negate)
161 sr |= NV40_FP_REG_NEGATE;
162
163 if (src.abs)
164 hw[1] |= (1 << (29 + pos));
165
166 sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) |
167 (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) |
168 (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) |
169 (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT));
170
171 hw[pos + 1] |= sr;
172 }
173
174 static void
175 emit_dst(struct nv40_fpc *fpc, struct nv40_sreg dst)
176 {
177 struct nv40_fragment_program *fp = fpc->fp;
178 uint32_t *hw = &fp->insn[fpc->inst_offset];
179
180 switch (dst.type) {
181 case NV40SR_TEMP:
182 if (fpc->num_regs < (dst.index + 1))
183 fpc->num_regs = dst.index + 1;
184 break;
185 case NV40SR_OUTPUT:
186 if (dst.index == 1) {
187 fp->fp_control |= 0xe;
188 } else {
189 hw[0] |= NV40_FP_OP_OUT_REG_HALF;
190 }
191 break;
192 case NV40SR_NONE:
193 hw[0] |= (1 << 30);
194 break;
195 default:
196 assert(0);
197 }
198
199 hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT);
200 }
201
202 static void
203 nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op,
204 struct nv40_sreg dst, int mask,
205 struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
206 {
207 struct nv40_fragment_program *fp = fpc->fp;
208 uint32_t *hw;
209
210 fpc->inst_offset = fp->insn_len;
211 fpc->have_const = 0;
212 grow_insns(fpc, 4);
213 hw = &fp->insn[fpc->inst_offset];
214 memset(hw, 0, sizeof(uint32_t) * 4);
215
216 if (op == NV40_FP_OP_OPCODE_KIL)
217 fp->fp_control |= NV40TCL_FP_CONTROL_KIL;
218 hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT);
219 hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT);
220 hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT);
221
222 if (sat)
223 hw[0] |= NV40_FP_OP_OUT_SAT;
224
225 if (dst.cc_update)
226 hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE;
227 hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT);
228 hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) |
229 (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) |
230 (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) |
231 (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT));
232
233 emit_dst(fpc, dst);
234 emit_src(fpc, 0, s0);
235 emit_src(fpc, 1, s1);
236 emit_src(fpc, 2, s2);
237 }
238
239 static void
240 nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit,
241 struct nv40_sreg dst, int mask,
242 struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
243 {
244 struct nv40_fragment_program *fp = fpc->fp;
245
246 nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
247
248 fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT);
249 fp->samplers |= (1 << unit);
250 }
251
252 static INLINE struct nv40_sreg
253 tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc)
254 {
255 struct nv40_sreg src;
256
257 switch (fsrc->SrcRegister.File) {
258 case TGSI_FILE_INPUT:
259 src = nv40_sr(NV40SR_INPUT,
260 fpc->attrib_map[fsrc->SrcRegister.Index]);
261 break;
262 case TGSI_FILE_CONSTANT:
263 src = constant(fpc, fsrc->SrcRegister.Index, NULL);
264 break;
265 case TGSI_FILE_IMMEDIATE:
266 assert(fsrc->SrcRegister.Index < fpc->nr_imm);
267 src = fpc->imm[fsrc->SrcRegister.Index];
268 break;
269 case TGSI_FILE_TEMPORARY:
270 src = fpc->r_temp[fsrc->SrcRegister.Index];
271 break;
272 /* NV40 fragprog result regs are just temps, so this is simple */
273 case TGSI_FILE_OUTPUT:
274 src = fpc->r_result[fsrc->SrcRegister.Index];
275 break;
276 default:
277 NOUVEAU_ERR("bad src file\n");
278 break;
279 }
280
281 src.abs = fsrc->SrcRegisterExtMod.Absolute;
282 src.negate = fsrc->SrcRegister.Negate;
283 src.swz[0] = fsrc->SrcRegister.SwizzleX;
284 src.swz[1] = fsrc->SrcRegister.SwizzleY;
285 src.swz[2] = fsrc->SrcRegister.SwizzleZ;
286 src.swz[3] = fsrc->SrcRegister.SwizzleW;
287 return src;
288 }
289
290 static INLINE struct nv40_sreg
291 tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
292 switch (fdst->DstRegister.File) {
293 case TGSI_FILE_OUTPUT:
294 return fpc->r_result[fdst->DstRegister.Index];
295 case TGSI_FILE_TEMPORARY:
296 return fpc->r_temp[fdst->DstRegister.Index];
297 case TGSI_FILE_NULL:
298 return nv40_sr(NV40SR_NONE, 0);
299 default:
300 NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
301 return nv40_sr(NV40SR_NONE, 0);
302 }
303 }
304
305 static INLINE int
306 tgsi_mask(uint tgsi)
307 {
308 int mask = 0;
309
310 if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
311 if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
312 if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
313 if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
314 return mask;
315 }
316
317 static boolean
318 src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc,
319 struct nv40_sreg *src)
320 {
321 const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
322 struct nv40_sreg tgsi = tgsi_src(fpc, fsrc);
323 uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
324 uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
325 fsrc->SrcRegisterExtSwz.NegateY,
326 fsrc->SrcRegisterExtSwz.NegateZ,
327 fsrc->SrcRegisterExtSwz.NegateW };
328 uint c;
329
330 for (c = 0; c < 4; c++) {
331 switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
332 case TGSI_EXTSWIZZLE_X:
333 case TGSI_EXTSWIZZLE_Y:
334 case TGSI_EXTSWIZZLE_Z:
335 case TGSI_EXTSWIZZLE_W:
336 mask |= (1 << c);
337 break;
338 case TGSI_EXTSWIZZLE_ZERO:
339 zero_mask |= (1 << c);
340 tgsi.swz[c] = SWZ_X;
341 break;
342 case TGSI_EXTSWIZZLE_ONE:
343 one_mask |= (1 << c);
344 tgsi.swz[c] = SWZ_X;
345 break;
346 default:
347 assert(0);
348 }
349
350 if (!tgsi.negate && neg[c])
351 neg_mask |= (1 << c);
352 }
353
354 if (mask == MASK_ALL && !neg_mask)
355 return TRUE;
356
357 *src = temp(fpc);
358
359 if (mask)
360 arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
361
362 if (zero_mask)
363 arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
364
365 if (one_mask)
366 arith(fpc, 0, STR, *src, one_mask, *src, none, none);
367
368 if (neg_mask) {
369 struct nv40_sreg one = temp(fpc);
370 arith(fpc, 0, STR, one, neg_mask, one, none, none);
371 arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
372 }
373
374 return FALSE;
375 }
376
377 static boolean
378 nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
379 const struct tgsi_full_instruction *finst)
380 {
381 const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
382 struct nv40_sreg src[3], dst, tmp;
383 int mask, sat, unit;
384 int ai = -1, ci = -1, ii = -1;
385 int i;
386
387 if (finst->Instruction.Opcode == TGSI_OPCODE_END)
388 return TRUE;
389
390 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
391 const struct tgsi_full_src_register *fsrc;
392
393 fsrc = &finst->FullSrcRegisters[i];
394 if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
395 src[i] = tgsi_src(fpc, fsrc);
396 }
397 }
398
399 for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
400 const struct tgsi_full_src_register *fsrc;
401
402 fsrc = &finst->FullSrcRegisters[i];
403
404 switch (fsrc->SrcRegister.File) {
405 case TGSI_FILE_INPUT:
406 case TGSI_FILE_CONSTANT:
407 case TGSI_FILE_TEMPORARY:
408 if (!src_native_swz(fpc, fsrc, &src[i]))
409 continue;
410 break;
411 default:
412 break;
413 }
414
415 switch (fsrc->SrcRegister.File) {
416 case TGSI_FILE_INPUT:
417 if (ai == -1 || ai == fsrc->SrcRegister.Index) {
418 ai = fsrc->SrcRegister.Index;
419 src[i] = tgsi_src(fpc, fsrc);
420 } else {
421 src[i] = temp(fpc);
422 arith(fpc, 0, MOV, src[i], MASK_ALL,
423 tgsi_src(fpc, fsrc), none, none);
424 }
425 break;
426 case TGSI_FILE_CONSTANT:
427 if ((ci == -1 && ii == -1) ||
428 ci == fsrc->SrcRegister.Index) {
429 ci = fsrc->SrcRegister.Index;
430 src[i] = tgsi_src(fpc, fsrc);
431 } else {
432 src[i] = temp(fpc);
433 arith(fpc, 0, MOV, src[i], MASK_ALL,
434 tgsi_src(fpc, fsrc), none, none);
435 }
436 break;
437 case TGSI_FILE_IMMEDIATE:
438 if ((ci == -1 && ii == -1) ||
439 ii == fsrc->SrcRegister.Index) {
440 ii = fsrc->SrcRegister.Index;
441 src[i] = tgsi_src(fpc, fsrc);
442 } else {
443 src[i] = temp(fpc);
444 arith(fpc, 0, MOV, src[i], MASK_ALL,
445 tgsi_src(fpc, fsrc), none, none);
446 }
447 break;
448 case TGSI_FILE_TEMPORARY:
449 /* handled above */
450 break;
451 case TGSI_FILE_SAMPLER:
452 unit = fsrc->SrcRegister.Index;
453 break;
454 case TGSI_FILE_OUTPUT:
455 break;
456 default:
457 NOUVEAU_ERR("bad src file\n");
458 return FALSE;
459 }
460 }
461
462 dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
463 mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
464 sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
465
466 switch (finst->Instruction.Opcode) {
467 case TGSI_OPCODE_ABS:
468 arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
469 break;
470 case TGSI_OPCODE_ADD:
471 arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
472 break;
473 case TGSI_OPCODE_CMP:
474 tmp = temp(fpc);
475 arith(fpc, sat, MOV, dst, mask, src[2], none, none);
476 tmp.cc_update = 1;
477 arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
478 dst.cc_test = NV40_VP_INST_COND_LT;
479 arith(fpc, sat, MOV, dst, mask, src[1], none, none);
480 break;
481 case TGSI_OPCODE_COS:
482 arith(fpc, sat, COS, dst, mask, src[0], none, none);
483 break;
484 case TGSI_OPCODE_DDX:
485 if (mask & (MASK_Z | MASK_W)) {
486 tmp = temp(fpc);
487 arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y,
488 swz(src[0], Z, W, Z, W), none, none);
489 arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
490 swz(tmp, X, Y, X, Y), none, none);
491 arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0],
492 none, none);
493 arith(fpc, 0, MOV, dst, mask, tmp, none, none);
494 } else {
495 arith(fpc, sat, DDX, dst, mask, src[0], none, none);
496 }
497 break;
498 case TGSI_OPCODE_DDY:
499 if (mask & (MASK_Z | MASK_W)) {
500 tmp = temp(fpc);
501 arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y,
502 swz(src[0], Z, W, Z, W), none, none);
503 arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
504 swz(tmp, X, Y, X, Y), none, none);
505 arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0],
506 none, none);
507 arith(fpc, 0, MOV, dst, mask, tmp, none, none);
508 } else {
509 arith(fpc, sat, DDY, dst, mask, src[0], none, none);
510 }
511 break;
512 case TGSI_OPCODE_DP3:
513 arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
514 break;
515 case TGSI_OPCODE_DP4:
516 arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
517 break;
518 case TGSI_OPCODE_DPH:
519 tmp = temp(fpc);
520 arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
521 arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
522 swz(src[1], W, W, W, W), none);
523 break;
524 case TGSI_OPCODE_DST:
525 arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
526 break;
527 case TGSI_OPCODE_EX2:
528 arith(fpc, sat, EX2, dst, mask, src[0], none, none);
529 break;
530 case TGSI_OPCODE_FLR:
531 arith(fpc, sat, FLR, dst, mask, src[0], none, none);
532 break;
533 case TGSI_OPCODE_FRC:
534 arith(fpc, sat, FRC, dst, mask, src[0], none, none);
535 break;
536 case TGSI_OPCODE_KILP:
537 arith(fpc, 0, KIL, none, 0, none, none, none);
538 break;
539 case TGSI_OPCODE_KIL:
540 dst = nv40_sr(NV40SR_NONE, 0);
541 dst.cc_update = 1;
542 arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
543 dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT;
544 arith(fpc, 0, KIL, dst, 0, none, none, none);
545 break;
546 case TGSI_OPCODE_LG2:
547 arith(fpc, sat, LG2, dst, mask, src[0], none, none);
548 break;
549 // case TGSI_OPCODE_LIT:
550 case TGSI_OPCODE_LRP:
551 tmp = temp(fpc);
552 arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
553 arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
554 break;
555 case TGSI_OPCODE_MAD:
556 arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
557 break;
558 case TGSI_OPCODE_MAX:
559 arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
560 break;
561 case TGSI_OPCODE_MIN:
562 arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
563 break;
564 case TGSI_OPCODE_MOV:
565 arith(fpc, sat, MOV, dst, mask, src[0], none, none);
566 break;
567 case TGSI_OPCODE_MUL:
568 arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
569 break;
570 case TGSI_OPCODE_NOISE1:
571 case TGSI_OPCODE_NOISE2:
572 case TGSI_OPCODE_NOISE3:
573 case TGSI_OPCODE_NOISE4:
574 arith(fpc, sat, SFL, dst, mask, none, none, none);
575 break;
576 case TGSI_OPCODE_POW:
577 tmp = temp(fpc);
578 arith(fpc, 0, LG2, tmp, MASK_X,
579 swz(src[0], X, X, X, X), none, none);
580 arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
581 swz(src[1], X, X, X, X), none);
582 arith(fpc, sat, EX2, dst, mask,
583 swz(tmp, X, X, X, X), none, none);
584 break;
585 case TGSI_OPCODE_RCP:
586 arith(fpc, sat, RCP, dst, mask, src[0], none, none);
587 break;
588 case TGSI_OPCODE_RET:
589 assert(0);
590 break;
591 case TGSI_OPCODE_RFL:
592 tmp = temp(fpc);
593 arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none);
594 arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none);
595 arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z,
596 swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
597 arith(fpc, sat, MAD, dst, mask,
598 swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
599 break;
600 case TGSI_OPCODE_RSQ:
601 tmp = temp(fpc);
602 arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X,
603 abs(swz(src[0], X, X, X, X)), none, none);
604 arith(fpc, sat, EX2, dst, mask,
605 neg(swz(tmp, X, X, X, X)), none, none);
606 break;
607 case TGSI_OPCODE_SCS:
608 if (mask & MASK_X) {
609 arith(fpc, sat, COS, dst, MASK_X,
610 swz(src[0], X, X, X, X), none, none);
611 }
612 if (mask & MASK_Y) {
613 arith(fpc, sat, SIN, dst, MASK_Y,
614 swz(src[0], X, X, X, X), none, none);
615 }
616 break;
617 case TGSI_OPCODE_SEQ:
618 arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
619 break;
620 case TGSI_OPCODE_SFL:
621 arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
622 break;
623 case TGSI_OPCODE_SGE:
624 arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
625 break;
626 case TGSI_OPCODE_SGT:
627 arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
628 break;
629 case TGSI_OPCODE_SIN:
630 arith(fpc, sat, SIN, dst, mask, src[0], none, none);
631 break;
632 case TGSI_OPCODE_SLE:
633 arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
634 break;
635 case TGSI_OPCODE_SLT:
636 arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
637 break;
638 case TGSI_OPCODE_SNE:
639 arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
640 break;
641 case TGSI_OPCODE_STR:
642 arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
643 break;
644 case TGSI_OPCODE_SUB:
645 arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
646 break;
647 case TGSI_OPCODE_TEX:
648 tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
649 break;
650 case TGSI_OPCODE_TXB:
651 tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
652 break;
653 case TGSI_OPCODE_TXP:
654 tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
655 break;
656 case TGSI_OPCODE_XPD:
657 tmp = temp(fpc);
658 arith(fpc, 0, MUL, tmp, mask,
659 swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
660 arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
661 swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
662 neg(tmp));
663 break;
664 default:
665 NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
666 return FALSE;
667 }
668
669 release_temps(fpc);
670 return TRUE;
671 }
672
673 static boolean
674 nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
675 const struct tgsi_full_declaration *fdec)
676 {
677 int hw;
678
679 switch (fdec->Semantic.SemanticName) {
680 case TGSI_SEMANTIC_POSITION:
681 hw = NV40_FP_OP_INPUT_SRC_POSITION;
682 break;
683 case TGSI_SEMANTIC_COLOR:
684 if (fdec->Semantic.SemanticIndex == 0) {
685 hw = NV40_FP_OP_INPUT_SRC_COL0;
686 } else
687 if (fdec->Semantic.SemanticIndex == 1) {
688 hw = NV40_FP_OP_INPUT_SRC_COL1;
689 } else {
690 NOUVEAU_ERR("bad colour semantic index\n");
691 return FALSE;
692 }
693 break;
694 case TGSI_SEMANTIC_FOG:
695 hw = NV40_FP_OP_INPUT_SRC_FOGC;
696 break;
697 case TGSI_SEMANTIC_GENERIC:
698 if (fdec->Semantic.SemanticIndex <= 7) {
699 hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic.
700 SemanticIndex);
701 } else {
702 NOUVEAU_ERR("bad generic semantic index\n");
703 return FALSE;
704 }
705 break;
706 default:
707 NOUVEAU_ERR("bad input semantic\n");
708 return FALSE;
709 }
710
711 fpc->attrib_map[fdec->DeclarationRange.First] = hw;
712 return TRUE;
713 }
714
715 static boolean
716 nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc,
717 const struct tgsi_full_declaration *fdec)
718 {
719 unsigned idx = fdec->DeclarationRange.First;
720 unsigned hw;
721
722 switch (fdec->Semantic.SemanticName) {
723 case TGSI_SEMANTIC_POSITION:
724 hw = 1;
725 break;
726 case TGSI_SEMANTIC_COLOR:
727 switch (fdec->Semantic.SemanticIndex) {
728 case 0: hw = 0; break;
729 case 1: hw = 2; break;
730 case 2: hw = 3; break;
731 case 3: hw = 4; break;
732 default:
733 NOUVEAU_ERR("bad rcol index\n");
734 return FALSE;
735 }
736 break;
737 default:
738 NOUVEAU_ERR("bad output semantic\n");
739 return FALSE;
740 }
741
742 fpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
743 fpc->r_temps |= (1 << hw);
744 return TRUE;
745 }
746
747 static boolean
748 nv40_fragprog_prepare(struct nv40_fpc *fpc)
749 {
750 struct tgsi_parse_context p;
751 int high_temp = -1, i;
752
753 tgsi_parse_init(&p, fpc->fp->pipe.tokens);
754 while (!tgsi_parse_end_of_tokens(&p)) {
755 const union tgsi_full_token *tok = &p.FullToken;
756
757 tgsi_parse_token(&p);
758 switch(tok->Token.Type) {
759 case TGSI_TOKEN_TYPE_DECLARATION:
760 {
761 const struct tgsi_full_declaration *fdec;
762 fdec = &p.FullToken.FullDeclaration;
763 switch (fdec->Declaration.File) {
764 case TGSI_FILE_INPUT:
765 if (!nv40_fragprog_parse_decl_attrib(fpc, fdec))
766 goto out_err;
767 break;
768 case TGSI_FILE_OUTPUT:
769 if (!nv40_fragprog_parse_decl_output(fpc, fdec))
770 goto out_err;
771 break;
772 case TGSI_FILE_TEMPORARY:
773 if (fdec->DeclarationRange.Last > high_temp) {
774 high_temp =
775 fdec->DeclarationRange.Last;
776 }
777 break;
778 default:
779 break;
780 }
781 }
782 break;
783 case TGSI_TOKEN_TYPE_IMMEDIATE:
784 {
785 struct tgsi_full_immediate *imm;
786 float vals[4];
787
788 imm = &p.FullToken.FullImmediate;
789 assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
790 assert(fpc->nr_imm < MAX_IMM);
791
792 vals[0] = imm->u.ImmediateFloat32[0].Float;
793 vals[1] = imm->u.ImmediateFloat32[1].Float;
794 vals[2] = imm->u.ImmediateFloat32[2].Float;
795 vals[3] = imm->u.ImmediateFloat32[3].Float;
796 fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
797 }
798 break;
799 default:
800 break;
801 }
802 }
803 tgsi_parse_free(&p);
804
805 if (++high_temp) {
806 fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
807 for (i = 0; i < high_temp; i++)
808 fpc->r_temp[i] = temp(fpc);
809 fpc->r_temps_discard = 0;
810 }
811
812 return TRUE;
813
814 out_err:
815 if (fpc->r_temp)
816 FREE(fpc->r_temp);
817 tgsi_parse_free(&p);
818 return FALSE;
819 }
820
821 static void
822 nv40_fragprog_translate(struct nv40_context *nv40,
823 struct nv40_fragment_program *fp)
824 {
825 struct tgsi_parse_context parse;
826 struct nv40_fpc *fpc = NULL;
827
828 fpc = CALLOC(1, sizeof(struct nv40_fpc));
829 if (!fpc)
830 return;
831 fpc->fp = fp;
832 fpc->num_regs = 2;
833
834 if (!nv40_fragprog_prepare(fpc)) {
835 FREE(fpc);
836 return;
837 }
838
839 tgsi_parse_init(&parse, fp->pipe.tokens);
840
841 while (!tgsi_parse_end_of_tokens(&parse)) {
842 tgsi_parse_token(&parse);
843
844 switch (parse.FullToken.Token.Type) {
845 case TGSI_TOKEN_TYPE_INSTRUCTION:
846 {
847 const struct tgsi_full_instruction *finst;
848
849 finst = &parse.FullToken.FullInstruction;
850 if (!nv40_fragprog_parse_instruction(fpc, finst))
851 goto out_err;
852 }
853 break;
854 default:
855 break;
856 }
857 }
858
859 fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
860
861 /* Terminate final instruction */
862 fp->insn[fpc->inst_offset] |= 0x00000001;
863
864 /* Append NOP + END instruction, may or may not be necessary. */
865 fpc->inst_offset = fp->insn_len;
866 grow_insns(fpc, 4);
867 fp->insn[fpc->inst_offset + 0] = 0x00000001;
868 fp->insn[fpc->inst_offset + 1] = 0x00000000;
869 fp->insn[fpc->inst_offset + 2] = 0x00000000;
870 fp->insn[fpc->inst_offset + 3] = 0x00000000;
871
872 fp->translated = TRUE;
873 out_err:
874 tgsi_parse_free(&parse);
875 if (fpc->r_temp)
876 FREE(fpc->r_temp);
877 FREE(fpc);
878 }
879
880 static void
881 nv40_fragprog_upload(struct nv40_context *nv40,
882 struct nv40_fragment_program *fp)
883 {
884 struct pipe_winsys *ws = nv40->pipe.winsys;
885 const uint32_t le = 1;
886 uint32_t *map;
887 int i;
888
889 map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
890
891 #if 0
892 for (i = 0; i < fp->insn_len; i++) {
893 fflush(stdout); fflush(stderr);
894 NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
895 fflush(stdout); fflush(stderr);
896 }
897 #endif
898
899 if ((*(const uint8_t *)&le)) {
900 for (i = 0; i < fp->insn_len; i++) {
901 map[i] = fp->insn[i];
902 }
903 } else {
904 /* Weird swapping for big-endian chips */
905 for (i = 0; i < fp->insn_len; i++) {
906 map[i] = ((fp->insn[i] & 0xffff) << 16) |
907 ((fp->insn[i] >> 16) & 0xffff);
908 }
909 }
910
911 ws->buffer_unmap(ws, fp->buffer);
912 }
913
914 static boolean
915 nv40_fragprog_validate(struct nv40_context *nv40)
916 {
917 struct nv40_fragment_program *fp = nv40->fragprog;
918 struct pipe_buffer *constbuf =
919 nv40->constbuf[PIPE_SHADER_FRAGMENT];
920 struct pipe_winsys *ws = nv40->pipe.winsys;
921 struct nouveau_stateobj *so;
922 boolean new_consts = FALSE;
923 int i;
924
925 if (fp->translated)
926 goto update_constants;
927
928 nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG;
929 nv40_fragprog_translate(nv40, fp);
930 if (!fp->translated) {
931 nv40->fallback_swrast |= NV40_NEW_FRAGPROG;
932 return FALSE;
933 }
934
935 fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4);
936 nv40_fragprog_upload(nv40, fp);
937
938 so = so_new(4, 1);
939 so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
940 so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
941 NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
942 NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
943 so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1);
944 so_data (so, fp->fp_control);
945 so_ref(so, &fp->so);
946
947 update_constants:
948 if (fp->nr_consts) {
949 float *map;
950
951 map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
952 for (i = 0; i < fp->nr_consts; i++) {
953 struct nv40_fragment_program_data *fpd = &fp->consts[i];
954 uint32_t *p = &fp->insn[fpd->offset];
955 uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
956
957 if (!memcmp(p, cb, 4 * sizeof(float)))
958 continue;
959 memcpy(p, cb, 4 * sizeof(float));
960 new_consts = TRUE;
961 }
962 ws->buffer_unmap(ws, constbuf);
963
964 if (new_consts)
965 nv40_fragprog_upload(nv40, fp);
966 }
967
968 if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) {
969 so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]);
970 return TRUE;
971 }
972
973 return FALSE;
974 }
975
976 void
977 nv40_fragprog_destroy(struct nv40_context *nv40,
978 struct nv40_fragment_program *fp)
979 {
980 if (fp->insn_len)
981 FREE(fp->insn);
982 }
983
984 struct nv40_state_entry nv40_state_fragprog = {
985 .validate = nv40_fragprog_validate,
986 .dirty = {
987 .pipe = NV40_NEW_FRAGPROG,
988 .hw = NV40_STATE_FRAGPROG
989 }
990 };
991