i915g: Improve SIN/COS a bit.
[mesa.git] / src / gallium / drivers / i915 / i915_fpc_translate.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include <stdarg.h>
30
31 #include "i915_reg.h"
32 #include "i915_context.h"
33 #include "i915_fpc.h"
34
35 #include "pipe/p_shader_tokens.h"
36 #include "util/u_math.h"
37 #include "util/u_memory.h"
38 #include "util/u_string.h"
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "draw/draw_vertex.h"
43
44
45 /**
46 * Simple pass-through fragment shader to use when we don't have
47 * a real shader (or it fails to compile for some reason).
48 */
49 static unsigned passthrough[] =
50 {
51 _3DSTATE_PIXEL_SHADER_PROGRAM | ((2*3)-1),
52
53 /* declare input color:
54 */
55 (D0_DCL |
56 (REG_TYPE_T << D0_TYPE_SHIFT) |
57 (T_DIFFUSE << D0_NR_SHIFT) |
58 D0_CHANNEL_ALL),
59 0,
60 0,
61
62 /* move to output color:
63 */
64 (A0_MOV |
65 (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) |
66 A0_DEST_CHANNEL_ALL |
67 (REG_TYPE_T << A0_SRC0_TYPE_SHIFT) |
68 (T_DIFFUSE << A0_SRC0_NR_SHIFT)),
69 0x01230000, /* .xyzw */
70 0
71 };
72
73
/* Taylor coefficients of sin(x) for x, x^3, x^5, x^7:
 * 1, -1/3!, 1/5!, -1/7!
 */
static const float scs_sin_constants[4] = {
   1.0,
   -1.0f / 6,      /* 3! */
   1.0f / 120,     /* 5! */
   -1.0f / 5040    /* 7! */
};
80
/* Taylor coefficients of cos(x) for 1, x^2, x^4, x^6:
 * 1, -1/2!, 1/4!, -1/6!
 */
static const float scs_cos_constants[4] = {
   1.0,
   -1.0f / 2,      /* 2! */
   1.0f / 24,      /* 4! */
   -1.0f / 720     /* 6! */
};
87
/* M_PI is not part of ISO C; provide a fallback before its first use so
 * the tables below do not depend on what the included headers define.
 */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Taylor coefficients of sin(2*pi*t) in powers of t (t, t^3, t^5, t^7):
 * 2*pi, -(2*pi)^3/3!, (2*pi)^5/5!, -(2*pi)^7/7!
 *
 * The SIN translation feeds these a value that has already been
 * multiplied by 1/(2*pi), hence the extra powers of 2*pi compared with
 * scs_sin_constants.
 */
static const float sin_constants[4] = { 2.0 * M_PI,
   -8.0f * M_PI * M_PI * M_PI / (3 * 2 * 1),
   32.0f * M_PI * M_PI * M_PI * M_PI * M_PI / (5 * 4 * 3 * 2 * 1),
   -128.0f * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI / (7 * 6 * 5 * 4 * 3 * 2 * 1)
};

/* Taylor coefficients of cos(2*pi*t) in powers of t (1, t^2, t^4, t^6):
 * 1, -(2*pi)^2/2!, (2*pi)^4/4!, -(2*pi)^6/6!
 */
static const float cos_constants[4] = { 1.0,
   -4.0f * M_PI * M_PI / (2 * 1),
   16.0f * M_PI * M_PI * M_PI * M_PI / (4 * 3 * 2 * 1),
   -64.0f * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI / (6 * 5 * 4 * 3 * 2 * 1)
};
101
102
103
104 /**
105 * component-wise negation of ureg
106 */
107 static INLINE int
108 negate(int reg, int x, int y, int z, int w)
109 {
110 /* Another neat thing about the UREG representation */
111 return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) |
112 ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) |
113 ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) |
114 ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT));
115 }
116
117
118 /**
119 * In the event of a translation failure, we'll generate a simple color
120 * pass-through program.
121 */
122 static void
123 i915_use_passthrough_shader(struct i915_fragment_shader *fs)
124 {
125 fs->program = (uint *) MALLOC(sizeof(passthrough));
126 if (fs->program) {
127 memcpy(fs->program, passthrough, sizeof(passthrough));
128 fs->program_len = Elements(passthrough);
129 }
130 fs->num_constants = 0;
131 }
132
133
134 void
135 i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
136 {
137 va_list args;
138 char buffer[1024];
139
140 debug_printf("i915_program_error: ");
141 va_start( args, msg );
142 util_vsnprintf( buffer, sizeof(buffer), msg, args );
143 va_end( args );
144 debug_printf("%s", buffer);
145 debug_printf("\n");
146
147 p->error = 1;
148 }
149
150 static uint get_mapping(struct i915_fragment_shader* fs, int unit)
151 {
152 int i;
153 for (i = 0; i < I915_TEX_UNITS; i++)
154 {
155 if (fs->generic_mapping[i] == -1) {
156 fs->generic_mapping[i] = unit;
157 return i;
158 }
159 if (fs->generic_mapping[i] == unit)
160 return i;
161 }
162 debug_printf("Exceeded max generics\n");
163 return 0;
164 }
165
166 /**
167 * Construct a ureg for the given source register. Will emit
168 * constants, apply swizzling and negation as needed.
169 */
170 static uint
171 src_vector(struct i915_fp_compile *p,
172 const struct tgsi_full_src_register *source,
173 struct i915_fragment_shader* fs)
174 {
175 uint index = source->Register.Index;
176 uint src = 0, sem_name, sem_ind;
177
178 switch (source->Register.File) {
179 case TGSI_FILE_TEMPORARY:
180 if (source->Register.Index >= I915_MAX_TEMPORARY) {
181 i915_program_error(p, "Exceeded max temporary reg");
182 return 0;
183 }
184 src = UREG(REG_TYPE_R, index);
185 break;
186 case TGSI_FILE_INPUT:
187 /* XXX: Packing COL1, FOGC into a single attribute works for
188 * texenv programs, but will fail for real fragment programs
189 * that use these attributes and expect them to be a full 4
190 * components wide. Could use a texcoord to pass these
191 * attributes if necessary, but that won't work in the general
192 * case.
193 *
194 * We also use a texture coordinate to pass wpos when possible.
195 */
196
197 sem_name = p->shader->info.input_semantic_name[index];
198 sem_ind = p->shader->info.input_semantic_index[index];
199
200 switch (sem_name) {
201 case TGSI_SEMANTIC_POSITION:
202 {
203 /* for fragcoord */
204 int real_tex_unit = get_mapping(fs, I915_SEMANTIC_POS);
205 src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_ALL);
206 break;
207 }
208 case TGSI_SEMANTIC_COLOR:
209 if (sem_ind == 0) {
210 src = i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL);
211 }
212 else {
213 /* secondary color */
214 assert(sem_ind == 1);
215 src = i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ);
216 src = swizzle(src, X, Y, Z, ONE);
217 }
218 break;
219 case TGSI_SEMANTIC_FOG:
220 src = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W);
221 src = swizzle(src, W, W, W, W);
222 break;
223 case TGSI_SEMANTIC_GENERIC:
224 {
225 int real_tex_unit = get_mapping(fs, sem_ind);
226 src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_ALL);
227 break;
228 }
229 case TGSI_SEMANTIC_FACE:
230 {
231 /* for back/front faces */
232 int real_tex_unit = get_mapping(fs, I915_SEMANTIC_FACE);
233 src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_X);
234 break;
235 }
236 default:
237 i915_program_error(p, "Bad source->Index");
238 return 0;
239 }
240 break;
241
242 case TGSI_FILE_IMMEDIATE:
243 assert(index < p->num_immediates);
244 index = p->immediates_map[index];
245 /* fall-through */
246 case TGSI_FILE_CONSTANT:
247 src = UREG(REG_TYPE_CONST, index);
248 break;
249
250 default:
251 i915_program_error(p, "Bad source->File");
252 return 0;
253 }
254
255 src = swizzle(src,
256 source->Register.SwizzleX,
257 source->Register.SwizzleY,
258 source->Register.SwizzleZ,
259 source->Register.SwizzleW);
260
261 /* There's both negate-all-components and per-component negation.
262 * Try to handle both here.
263 */
264 {
265 int n = source->Register.Negate;
266 src = negate(src, n, n, n, n);
267 }
268
269 /* no abs() */
270 #if 0
271 /* XXX assertions disabled to allow arbfplight.c to run */
272 /* XXX enable these assertions, or fix things */
273 assert(!source->Register.Absolute);
274 #endif
275 return src;
276 }
277
278
279 /**
280 * Construct a ureg for a destination register.
281 */
282 static uint
283 get_result_vector(struct i915_fp_compile *p,
284 const struct tgsi_full_dst_register *dest)
285 {
286 switch (dest->Register.File) {
287 case TGSI_FILE_OUTPUT:
288 {
289 uint sem_name = p->shader->info.output_semantic_name[dest->Register.Index];
290 switch (sem_name) {
291 case TGSI_SEMANTIC_POSITION:
292 return UREG(REG_TYPE_OD, 0);
293 case TGSI_SEMANTIC_COLOR:
294 return UREG(REG_TYPE_OC, 0);
295 default:
296 i915_program_error(p, "Bad inst->DstReg.Index/semantics");
297 return 0;
298 }
299 }
300 case TGSI_FILE_TEMPORARY:
301 return UREG(REG_TYPE_R, dest->Register.Index);
302 default:
303 i915_program_error(p, "Bad inst->DstReg.File");
304 return 0;
305 }
306 }
307
308
309 /**
310 * Compute flags for saturation and writemask.
311 */
312 static uint
313 get_result_flags(const struct tgsi_full_instruction *inst)
314 {
315 const uint writeMask
316 = inst->Dst[0].Register.WriteMask;
317 uint flags = 0x0;
318
319 if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE)
320 flags |= A0_DEST_SATURATE;
321
322 if (writeMask & TGSI_WRITEMASK_X)
323 flags |= A0_DEST_CHANNEL_X;
324 if (writeMask & TGSI_WRITEMASK_Y)
325 flags |= A0_DEST_CHANNEL_Y;
326 if (writeMask & TGSI_WRITEMASK_Z)
327 flags |= A0_DEST_CHANNEL_Z;
328 if (writeMask & TGSI_WRITEMASK_W)
329 flags |= A0_DEST_CHANNEL_W;
330
331 return flags;
332 }
333
334
335 /**
336 * Convert TGSI_TEXTURE_x token to DO_SAMPLE_TYPE_x token
337 */
338 static uint
339 translate_tex_src_target(struct i915_fp_compile *p, uint tex)
340 {
341 switch (tex) {
342 case TGSI_TEXTURE_SHADOW1D:
343 /* fall-through */
344 case TGSI_TEXTURE_1D:
345 return D0_SAMPLE_TYPE_2D;
346
347 case TGSI_TEXTURE_SHADOW2D:
348 /* fall-through */
349 case TGSI_TEXTURE_2D:
350 return D0_SAMPLE_TYPE_2D;
351
352 case TGSI_TEXTURE_SHADOWRECT:
353 /* fall-through */
354 case TGSI_TEXTURE_RECT:
355 return D0_SAMPLE_TYPE_2D;
356
357 case TGSI_TEXTURE_3D:
358 return D0_SAMPLE_TYPE_VOLUME;
359
360 case TGSI_TEXTURE_CUBE:
361 return D0_SAMPLE_TYPE_CUBE;
362
363 default:
364 i915_program_error(p, "TexSrc type");
365 return 0;
366 }
367 }
368
369
370 /**
371 * Generate texel lookup instruction.
372 */
373 static void
374 emit_tex(struct i915_fp_compile *p,
375 const struct tgsi_full_instruction *inst,
376 uint opcode,
377 struct i915_fragment_shader* fs)
378 {
379 uint texture = inst->Texture.Texture;
380 uint unit = inst->Src[1].Register.Index;
381 uint tex = translate_tex_src_target( p, texture );
382 uint sampler = i915_emit_decl(p, REG_TYPE_S, unit, tex);
383 uint coord = src_vector( p, &inst->Src[0], fs);
384
385 i915_emit_texld( p,
386 get_result_vector( p, &inst->Dst[0] ),
387 get_result_flags( inst ),
388 sampler,
389 coord,
390 opcode);
391 }
392
393
394 /**
395 * Generate a simple arithmetic instruction
396 * \param opcode the i915 opcode
397 * \param numArgs the number of input/src arguments
398 */
399 static void
400 emit_simple_arith(struct i915_fp_compile *p,
401 const struct tgsi_full_instruction *inst,
402 uint opcode, uint numArgs,
403 struct i915_fragment_shader* fs)
404 {
405 uint arg1, arg2, arg3;
406
407 assert(numArgs <= 3);
408
409 arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->Src[0], fs );
410 arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->Src[1], fs );
411 arg3 = (numArgs < 3) ? 0 : src_vector( p, &inst->Src[2], fs );
412
413 i915_emit_arith( p,
414 opcode,
415 get_result_vector( p, &inst->Dst[0]),
416 get_result_flags( inst ), 0,
417 arg1,
418 arg2,
419 arg3 );
420 }
421
422
423 /** As above, but swap the first two src regs */
424 static void
425 emit_simple_arith_swap2(struct i915_fp_compile *p,
426 const struct tgsi_full_instruction *inst,
427 uint opcode, uint numArgs,
428 struct i915_fragment_shader* fs)
429 {
430 struct tgsi_full_instruction inst2;
431
432 assert(numArgs == 2);
433
434 /* transpose first two registers */
435 inst2 = *inst;
436 inst2.Src[0] = inst->Src[1];
437 inst2.Src[1] = inst->Src[0];
438
439 emit_simple_arith(p, &inst2, opcode, numArgs, fs);
440 }
441
442
443 #ifndef M_PI
444 #define M_PI 3.14159265358979323846
445 #endif
446
447 /*
448 * Translate TGSI instruction to i915 instruction.
449 *
450 * Possible concerns:
451 *
452 * SIN, COS -- could use another taylor step?
453 * LIT -- results seem a little different to sw mesa
454 * LOG -- different to mesa on negative numbers, but this is conformant.
455 */
456 static void
457 i915_translate_instruction(struct i915_fp_compile *p,
458 const struct tgsi_full_instruction *inst,
459 struct i915_fragment_shader *fs)
460 {
461 uint writemask;
462 uint src0, src1, src2, flags;
463 uint tmp = 0;
464
465 switch (inst->Instruction.Opcode) {
466 case TGSI_OPCODE_ABS:
467 src0 = src_vector(p, &inst->Src[0], fs);
468 i915_emit_arith(p,
469 A0_MAX,
470 get_result_vector(p, &inst->Dst[0]),
471 get_result_flags(inst), 0,
472 src0, negate(src0, 1, 1, 1, 1), 0);
473 break;
474
475 case TGSI_OPCODE_ADD:
476 emit_simple_arith(p, inst, A0_ADD, 2, fs);
477 break;
478
479 case TGSI_OPCODE_CMP:
480 src0 = src_vector(p, &inst->Src[0], fs);
481 src1 = src_vector(p, &inst->Src[1], fs);
482 src2 = src_vector(p, &inst->Src[2], fs);
483 i915_emit_arith(p, A0_CMP,
484 get_result_vector(p, &inst->Dst[0]),
485 get_result_flags(inst),
486 0, src0, src2, src1); /* NOTE: order of src2, src1 */
487 break;
488
489 case TGSI_OPCODE_COS:
490 src0 = src_vector(p, &inst->Src[0], fs);
491 tmp = i915_get_utemp(p);
492
493 i915_emit_arith(p,
494 A0_MUL,
495 tmp, A0_DEST_CHANNEL_X, 0,
496 src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0);
497
498 i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
499
500 /*
501 * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1
502 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, 1
503 * t0 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1
504 * result = DP4 t0, cos_constants
505 */
506 i915_emit_arith(p,
507 A0_MUL,
508 tmp, A0_DEST_CHANNEL_XY, 0,
509 swizzle(tmp, X, X, ONE, ONE),
510 swizzle(tmp, X, ONE, ONE, ONE), 0);
511
512 i915_emit_arith(p,
513 A0_MUL,
514 tmp, A0_DEST_CHANNEL_XYZ, 0,
515 swizzle(tmp, X, Y, X, ONE),
516 swizzle(tmp, X, X, ONE, ONE), 0);
517
518 i915_emit_arith(p,
519 A0_MUL,
520 tmp, A0_DEST_CHANNEL_XYZ, 0,
521 swizzle(tmp, X, X, Z, ONE),
522 swizzle(tmp, Z, ONE, ONE, ONE), 0);
523
524 i915_emit_arith(p,
525 A0_DP4,
526 get_result_vector(p, &inst->Dst[0]),
527 get_result_flags(inst), 0,
528 swizzle(tmp, ONE, Z, Y, X),
529 i915_emit_const4fv(p, cos_constants), 0);
530 break;
531
532 case TGSI_OPCODE_DDX:
533 case TGSI_OPCODE_DDY:
534 /* XXX We just output 0 here */
535 debug_printf("Punting DDX/DDX\n");
536 src0 = get_result_vector(p, &inst->Dst[0]);
537 i915_emit_arith(p,
538 A0_MOV,
539 get_result_vector(p, &inst->Dst[0]),
540 get_result_flags(inst), 0,
541 swizzle(src0, ZERO, ZERO, ZERO, ZERO), 0, 0);
542 break;
543
544 case TGSI_OPCODE_DP2:
545 src0 = src_vector(p, &inst->Src[0], fs);
546 src1 = src_vector(p, &inst->Src[1], fs);
547
548 i915_emit_arith(p,
549 A0_DP3,
550 get_result_vector(p, &inst->Dst[0]),
551 get_result_flags(inst), 0,
552 swizzle(src0, X, Y, ZERO, ZERO), src1, 0);
553 break;
554
555 case TGSI_OPCODE_DP3:
556 emit_simple_arith(p, inst, A0_DP3, 2, fs);
557 break;
558
559 case TGSI_OPCODE_DP4:
560 emit_simple_arith(p, inst, A0_DP4, 2, fs);
561 break;
562
563 case TGSI_OPCODE_DPH:
564 src0 = src_vector(p, &inst->Src[0], fs);
565 src1 = src_vector(p, &inst->Src[1], fs);
566
567 i915_emit_arith(p,
568 A0_DP4,
569 get_result_vector(p, &inst->Dst[0]),
570 get_result_flags(inst), 0,
571 swizzle(src0, X, Y, Z, ONE), src1, 0);
572 break;
573
574 case TGSI_OPCODE_DST:
575 src0 = src_vector(p, &inst->Src[0], fs);
576 src1 = src_vector(p, &inst->Src[1], fs);
577
578 /* result[0] = 1 * 1;
579 * result[1] = a[1] * b[1];
580 * result[2] = a[2] * 1;
581 * result[3] = 1 * b[3];
582 */
583 i915_emit_arith(p,
584 A0_MUL,
585 get_result_vector(p, &inst->Dst[0]),
586 get_result_flags(inst), 0,
587 swizzle(src0, ONE, Y, Z, ONE),
588 swizzle(src1, ONE, Y, ONE, W), 0);
589 break;
590
591 case TGSI_OPCODE_END:
592 /* no-op */
593 break;
594
595 case TGSI_OPCODE_EX2:
596 src0 = src_vector(p, &inst->Src[0], fs);
597
598 i915_emit_arith(p,
599 A0_EXP,
600 get_result_vector(p, &inst->Dst[0]),
601 get_result_flags(inst), 0,
602 swizzle(src0, X, X, X, X), 0, 0);
603 break;
604
605 case TGSI_OPCODE_FLR:
606 emit_simple_arith(p, inst, A0_FLR, 1, fs);
607 break;
608
609 case TGSI_OPCODE_FRC:
610 emit_simple_arith(p, inst, A0_FRC, 1, fs);
611 break;
612
613 case TGSI_OPCODE_KIL:
614 /* kill if src[0].x < 0 || src[0].y < 0 ... */
615 src0 = src_vector(p, &inst->Src[0], fs);
616 tmp = i915_get_utemp(p);
617
618 i915_emit_texld(p,
619 tmp, /* dest reg: a dummy reg */
620 A0_DEST_CHANNEL_ALL, /* dest writemask */
621 0, /* sampler */
622 src0, /* coord*/
623 T0_TEXKILL); /* opcode */
624 break;
625
626 case TGSI_OPCODE_KILP:
627 assert(0); /* not tested yet */
628 break;
629
630 case TGSI_OPCODE_LG2:
631 src0 = src_vector(p, &inst->Src[0], fs);
632
633 i915_emit_arith(p,
634 A0_LOG,
635 get_result_vector(p, &inst->Dst[0]),
636 get_result_flags(inst), 0,
637 swizzle(src0, X, X, X, X), 0, 0);
638 break;
639
640 case TGSI_OPCODE_LIT:
641 src0 = src_vector(p, &inst->Src[0], fs);
642 tmp = i915_get_utemp(p);
643
644 /* tmp = max( a.xyzw, a.00zw )
645 * XXX: Clamp tmp.w to -128..128
646 * tmp.y = log(tmp.y)
647 * tmp.y = tmp.w * tmp.y
648 * tmp.y = exp(tmp.y)
649 * result = cmp (a.11-x1, a.1x01, a.1xy1 )
650 */
651 i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
652 src0, swizzle(src0, ZERO, ZERO, Z, W), 0);
653
654 i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_Y, 0,
655 swizzle(tmp, Y, Y, Y, Y), 0, 0);
656
657 i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_Y, 0,
658 swizzle(tmp, ZERO, Y, ZERO, ZERO),
659 swizzle(tmp, ZERO, W, ZERO, ZERO), 0);
660
661 i915_emit_arith(p, A0_EXP, tmp, A0_DEST_CHANNEL_Y, 0,
662 swizzle(tmp, Y, Y, Y, Y), 0, 0);
663
664 i915_emit_arith(p, A0_CMP,
665 get_result_vector(p, &inst->Dst[0]),
666 get_result_flags(inst), 0,
667 negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0),
668 swizzle(tmp, ONE, X, ZERO, ONE),
669 swizzle(tmp, ONE, X, Y, ONE));
670
671 break;
672
673 case TGSI_OPCODE_LRP:
674 src0 = src_vector(p, &inst->Src[0], fs);
675 src1 = src_vector(p, &inst->Src[1], fs);
676 src2 = src_vector(p, &inst->Src[2], fs);
677 flags = get_result_flags(inst);
678 tmp = i915_get_utemp(p);
679
680 /* b*a + c*(1-a)
681 *
682 * b*a + c - ca
683 *
684 * tmp = b*a + c,
685 * result = (-c)*a + tmp
686 */
687 i915_emit_arith(p, A0_MAD, tmp,
688 flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2);
689
690 i915_emit_arith(p, A0_MAD,
691 get_result_vector(p, &inst->Dst[0]),
692 flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp);
693 break;
694
695 case TGSI_OPCODE_MAD:
696 emit_simple_arith(p, inst, A0_MAD, 3, fs);
697 break;
698
699 case TGSI_OPCODE_MAX:
700 emit_simple_arith(p, inst, A0_MAX, 2, fs);
701 break;
702
703 case TGSI_OPCODE_MIN:
704 src0 = src_vector(p, &inst->Src[0], fs);
705 src1 = src_vector(p, &inst->Src[1], fs);
706 tmp = i915_get_utemp(p);
707 flags = get_result_flags(inst);
708
709 i915_emit_arith(p,
710 A0_MAX,
711 tmp, flags & A0_DEST_CHANNEL_ALL, 0,
712 negate(src0, 1, 1, 1, 1),
713 negate(src1, 1, 1, 1, 1), 0);
714
715 i915_emit_arith(p,
716 A0_MOV,
717 get_result_vector(p, &inst->Dst[0]),
718 flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0);
719 break;
720
721 case TGSI_OPCODE_MOV:
722 emit_simple_arith(p, inst, A0_MOV, 1, fs);
723 break;
724
725 case TGSI_OPCODE_MUL:
726 emit_simple_arith(p, inst, A0_MUL, 2, fs);
727 break;
728
729 case TGSI_OPCODE_POW:
730 src0 = src_vector(p, &inst->Src[0], fs);
731 src1 = src_vector(p, &inst->Src[1], fs);
732 tmp = i915_get_utemp(p);
733 flags = get_result_flags(inst);
734
735 /* XXX: masking on intermediate values, here and elsewhere.
736 */
737 i915_emit_arith(p,
738 A0_LOG,
739 tmp, A0_DEST_CHANNEL_X, 0,
740 swizzle(src0, X, X, X, X), 0, 0);
741
742 i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0);
743
744 i915_emit_arith(p,
745 A0_EXP,
746 get_result_vector(p, &inst->Dst[0]),
747 flags, 0, swizzle(tmp, X, X, X, X), 0, 0);
748 break;
749
750 case TGSI_OPCODE_RET:
751 /* XXX: no-op? */
752 break;
753
754 case TGSI_OPCODE_RCP:
755 src0 = src_vector(p, &inst->Src[0], fs);
756
757 i915_emit_arith(p,
758 A0_RCP,
759 get_result_vector(p, &inst->Dst[0]),
760 get_result_flags(inst), 0,
761 swizzle(src0, X, X, X, X), 0, 0);
762 break;
763
764 case TGSI_OPCODE_RSQ:
765 src0 = src_vector(p, &inst->Src[0], fs);
766
767 i915_emit_arith(p,
768 A0_RSQ,
769 get_result_vector(p, &inst->Dst[0]),
770 get_result_flags(inst), 0,
771 swizzle(src0, X, X, X, X), 0, 0);
772 break;
773
774 case TGSI_OPCODE_SCS:
775 src0 = src_vector(p, &inst->Src[0], fs);
776 tmp = i915_get_utemp(p);
777
778 /*
779 * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1
780 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x
781 * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x
782 * scs.x = DP4 t1, scs_sin_constants
783 * t1 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1
784 * scs.y = DP4 t1, scs_cos_constants
785 */
786 i915_emit_arith(p,
787 A0_MUL,
788 tmp, A0_DEST_CHANNEL_XY, 0,
789 swizzle(src0, X, X, ONE, ONE),
790 swizzle(src0, X, ONE, ONE, ONE), 0);
791
792 i915_emit_arith(p,
793 A0_MUL,
794 tmp, A0_DEST_CHANNEL_ALL, 0,
795 swizzle(tmp, X, Y, X, Y),
796 swizzle(tmp, X, X, ONE, ONE), 0);
797
798 writemask = inst->Dst[0].Register.WriteMask;
799
800 if (writemask & TGSI_WRITEMASK_Y) {
801 uint tmp1;
802
803 if (writemask & TGSI_WRITEMASK_X)
804 tmp1 = i915_get_utemp(p);
805 else
806 tmp1 = tmp;
807
808 i915_emit_arith(p,
809 A0_MUL,
810 tmp1, A0_DEST_CHANNEL_ALL, 0,
811 swizzle(tmp, X, Y, Y, W),
812 swizzle(tmp, X, Z, ONE, ONE), 0);
813
814 i915_emit_arith(p,
815 A0_DP4,
816 get_result_vector(p, &inst->Dst[0]),
817 A0_DEST_CHANNEL_Y, 0,
818 swizzle(tmp1, W, Z, Y, X),
819 i915_emit_const4fv(p, scs_sin_constants), 0);
820 }
821
822 if (writemask & TGSI_WRITEMASK_X) {
823 i915_emit_arith(p,
824 A0_MUL,
825 tmp, A0_DEST_CHANNEL_XYZ, 0,
826 swizzle(tmp, X, X, Z, ONE),
827 swizzle(tmp, Z, ONE, ONE, ONE), 0);
828
829 i915_emit_arith(p,
830 A0_DP4,
831 get_result_vector(p, &inst->Dst[0]),
832 A0_DEST_CHANNEL_X, 0,
833 swizzle(tmp, ONE, Z, Y, X),
834 i915_emit_const4fv(p, scs_cos_constants), 0);
835 }
836 break;
837
838 case TGSI_OPCODE_SEQ:
839 /* if we're both >= and <= then we're == */
840 src0 = src_vector(p, &inst->Src[0], fs);
841 src1 = src_vector(p, &inst->Src[1], fs);
842 tmp = i915_get_utemp(p);
843
844 i915_emit_arith(p,
845 A0_SGE,
846 tmp, A0_DEST_CHANNEL_ALL, 0,
847 src0,
848 src1, 0);
849
850 i915_emit_arith(p,
851 A0_SGE,
852 get_result_vector(p, &inst->Dst[0]),
853 A0_DEST_CHANNEL_ALL, 0,
854 src1,
855 src0, 0);
856
857 i915_emit_arith(p,
858 A0_MUL,
859 get_result_vector(p, &inst->Dst[0]),
860 A0_DEST_CHANNEL_ALL, 0,
861 get_result_vector(p, &inst->Dst[0]),
862 tmp, 0);
863
864 break;
865
866 case TGSI_OPCODE_SGE:
867 emit_simple_arith(p, inst, A0_SGE, 2, fs);
868 break;
869
870 case TGSI_OPCODE_SIN:
871 src0 = src_vector(p, &inst->Src[0], fs);
872 tmp = i915_get_utemp(p);
873
874 i915_emit_arith(p,
875 A0_MUL,
876 tmp, A0_DEST_CHANNEL_X, 0,
877 src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0);
878
879 i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
880
881 /*
882 * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1
883 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x
884 * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x
885 * result = DP4 t1.wzyx, sin_constants
886 */
887 i915_emit_arith(p,
888 A0_MUL,
889 tmp, A0_DEST_CHANNEL_XY, 0,
890 swizzle(tmp, X, X, ONE, ONE),
891 swizzle(tmp, X, ONE, ONE, ONE), 0);
892
893 i915_emit_arith(p,
894 A0_MUL,
895 tmp, A0_DEST_CHANNEL_ALL, 0,
896 swizzle(tmp, X, Y, X, Y),
897 swizzle(tmp, X, X, ONE, ONE), 0);
898
899 i915_emit_arith(p,
900 A0_MUL,
901 tmp, A0_DEST_CHANNEL_ALL, 0,
902 swizzle(tmp, X, Y, Y, W),
903 swizzle(tmp, X, Z, ONE, ONE), 0);
904
905 i915_emit_arith(p,
906 A0_DP4,
907 get_result_vector(p, &inst->Dst[0]),
908 get_result_flags(inst), 0,
909 swizzle(tmp, W, Z, Y, X),
910 i915_emit_const4fv(p, sin_constants), 0);
911 break;
912
913 case TGSI_OPCODE_SLE:
914 /* like SGE, but swap reg0, reg1 */
915 emit_simple_arith_swap2(p, inst, A0_SGE, 2, fs);
916 break;
917
918 case TGSI_OPCODE_SLT:
919 emit_simple_arith(p, inst, A0_SLT, 2, fs);
920 break;
921
922 case TGSI_OPCODE_SGT:
923 /* like SLT, but swap reg0, reg1 */
924 emit_simple_arith_swap2(p, inst, A0_SLT, 2, fs);
925 break;
926
927 case TGSI_OPCODE_SNE:
928 /* if we're < or > then we're != */
929 src0 = src_vector(p, &inst->Src[0], fs);
930 src1 = src_vector(p, &inst->Src[1], fs);
931 tmp = i915_get_utemp(p);
932
933 i915_emit_arith(p,
934 A0_SLT,
935 tmp,
936 A0_DEST_CHANNEL_ALL, 0,
937 src0,
938 src1, 0);
939
940 i915_emit_arith(p,
941 A0_SLT,
942 get_result_vector(p, &inst->Dst[0]),
943 A0_DEST_CHANNEL_ALL, 0,
944 src1,
945 src0, 0);
946
947 i915_emit_arith(p,
948 A0_ADD,
949 get_result_vector(p, &inst->Dst[0]),
950 A0_DEST_CHANNEL_ALL, 0,
951 get_result_vector(p, &inst->Dst[0]),
952 tmp, 0);
953 break;
954
955 case TGSI_OPCODE_SSG:
956 /* compute (src>0) - (src<0) */
957 src0 = src_vector(p, &inst->Src[0], fs);
958 tmp = i915_get_utemp(p);
959
960 i915_emit_arith(p,
961 A0_SLT,
962 tmp,
963 A0_DEST_CHANNEL_ALL, 0,
964 src0,
965 swizzle(src0, ZERO, ZERO, ZERO, ZERO), 0);
966
967 i915_emit_arith(p,
968 A0_SLT,
969 get_result_vector(p, &inst->Dst[0]),
970 A0_DEST_CHANNEL_ALL, 0,
971 swizzle(src0, ZERO, ZERO, ZERO, ZERO),
972 src0, 0);
973
974 i915_emit_arith(p,
975 A0_ADD,
976 get_result_vector(p, &inst->Dst[0]),
977 A0_DEST_CHANNEL_ALL, 0,
978 get_result_vector(p, &inst->Dst[0]),
979 negate(tmp, 1, 1, 1, 1), 0);
980 break;
981
982 case TGSI_OPCODE_SUB:
983 src0 = src_vector(p, &inst->Src[0], fs);
984 src1 = src_vector(p, &inst->Src[1], fs);
985
986 i915_emit_arith(p,
987 A0_ADD,
988 get_result_vector(p, &inst->Dst[0]),
989 get_result_flags(inst), 0,
990 src0, negate(src1, 1, 1, 1, 1), 0);
991 break;
992
993 case TGSI_OPCODE_TEX:
994 emit_tex(p, inst, T0_TEXLD, fs);
995 break;
996
997 case TGSI_OPCODE_TRUNC:
998 emit_simple_arith(p, inst, A0_TRC, 1, fs);
999 break;
1000
1001 case TGSI_OPCODE_TXB:
1002 emit_tex(p, inst, T0_TEXLDB, fs);
1003 break;
1004
1005 case TGSI_OPCODE_TXP:
1006 emit_tex(p, inst, T0_TEXLDP, fs);
1007 break;
1008
1009 case TGSI_OPCODE_XPD:
1010 /* Cross product:
1011 * result.x = src0.y * src1.z - src0.z * src1.y;
1012 * result.y = src0.z * src1.x - src0.x * src1.z;
1013 * result.z = src0.x * src1.y - src0.y * src1.x;
1014 * result.w = undef;
1015 */
1016 src0 = src_vector(p, &inst->Src[0], fs);
1017 src1 = src_vector(p, &inst->Src[1], fs);
1018 tmp = i915_get_utemp(p);
1019
1020 i915_emit_arith(p,
1021 A0_MUL,
1022 tmp, A0_DEST_CHANNEL_ALL, 0,
1023 swizzle(src0, Z, X, Y, ONE),
1024 swizzle(src1, Y, Z, X, ONE), 0);
1025
1026 i915_emit_arith(p,
1027 A0_MAD,
1028 get_result_vector(p, &inst->Dst[0]),
1029 get_result_flags(inst), 0,
1030 swizzle(src0, Y, Z, X, ONE),
1031 swizzle(src1, Z, X, Y, ONE),
1032 negate(tmp, 1, 1, 1, 0));
1033 break;
1034
1035 default:
1036 i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode);
1037 p->error = 1;
1038 return;
1039 }
1040
1041 i915_release_utemps(p);
1042 }
1043
1044
1045 /**
1046 * Translate TGSI fragment shader into i915 hardware instructions.
1047 * \param p the translation state
1048 * \param tokens the TGSI token array
1049 */
1050 static void
1051 i915_translate_instructions(struct i915_fp_compile *p,
1052 const struct tgsi_token *tokens,
1053 struct i915_fragment_shader *fs)
1054 {
1055 struct i915_fragment_shader *ifs = p->shader;
1056 struct tgsi_parse_context parse;
1057
1058 tgsi_parse_init( &parse, tokens );
1059
1060 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1061
1062 tgsi_parse_token( &parse );
1063
1064 switch( parse.FullToken.Token.Type ) {
1065 case TGSI_TOKEN_TYPE_PROPERTY:
1066 /*
1067 * We only support one cbuf, but we still need to ignore the property
1068 * correctly so we don't hit the assert at the end of the switch case.
1069 */
1070 assert(parse.FullToken.FullProperty.Property.PropertyName ==
1071 TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS);
1072 break;
1073 case TGSI_TOKEN_TYPE_DECLARATION:
1074 if (parse.FullToken.FullDeclaration.Declaration.File
1075 == TGSI_FILE_CONSTANT) {
1076 uint i;
1077 for (i = parse.FullToken.FullDeclaration.Range.First;
1078 i <= parse.FullToken.FullDeclaration.Range.Last;
1079 i++) {
1080 assert(ifs->constant_flags[i] == 0x0);
1081 ifs->constant_flags[i] = I915_CONSTFLAG_USER;
1082 ifs->num_constants = MAX2(ifs->num_constants, i + 1);
1083 }
1084 }
1085 else if (parse.FullToken.FullDeclaration.Declaration.File
1086 == TGSI_FILE_TEMPORARY) {
1087 uint i;
1088 for (i = parse.FullToken.FullDeclaration.Range.First;
1089 i <= parse.FullToken.FullDeclaration.Range.Last;
1090 i++) {
1091 assert(i < I915_MAX_TEMPORARY);
1092 /* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */
1093 p->temp_flag |= (1 << i); /* mark temp as used */
1094 }
1095 }
1096 break;
1097
1098 case TGSI_TOKEN_TYPE_IMMEDIATE:
1099 {
1100 const struct tgsi_full_immediate *imm
1101 = &parse.FullToken.FullImmediate;
1102 const uint pos = p->num_immediates++;
1103 uint j;
1104 assert( imm->Immediate.NrTokens <= 4 + 1 );
1105 for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
1106 p->immediates[pos][j] = imm->u[j].Float;
1107 }
1108 }
1109 break;
1110
1111 case TGSI_TOKEN_TYPE_INSTRUCTION:
1112 if (p->first_instruction) {
1113 /* resolve location of immediates */
1114 uint i, j;
1115 for (i = 0; i < p->num_immediates; i++) {
1116 /* find constant slot for this immediate */
1117 for (j = 0; j < I915_MAX_CONSTANT; j++) {
1118 if (ifs->constant_flags[j] == 0x0) {
1119 memcpy(ifs->constants[j],
1120 p->immediates[i],
1121 4 * sizeof(float));
1122 /*printf("immediate %d maps to const %d\n", i, j);*/
1123 ifs->constant_flags[j] = 0xf; /* all four comps used */
1124 p->immediates_map[i] = j;
1125 ifs->num_constants = MAX2(ifs->num_constants, j + 1);
1126 break;
1127 }
1128 }
1129 }
1130
1131 p->first_instruction = FALSE;
1132 }
1133
1134 i915_translate_instruction(p, &parse.FullToken.FullInstruction, fs);
1135 break;
1136
1137 default:
1138 assert( 0 );
1139 }
1140
1141 } /* while */
1142
1143 tgsi_parse_free (&parse);
1144 }
1145
1146
1147 static struct i915_fp_compile *
1148 i915_init_compile(struct i915_context *i915,
1149 struct i915_fragment_shader *ifs)
1150 {
1151 struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile);
1152 int i;
1153
1154 p->shader = ifs;
1155
1156 /* Put new constants at end of const buffer, growing downward.
1157 * The problem is we don't know how many user-defined constants might
1158 * be specified with pipe->set_constant_buffer().
1159 * Should pre-scan the user's program to determine the highest-numbered
1160 * constant referenced.
1161 */
1162 ifs->num_constants = 0;
1163 memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags));
1164
1165 for (i = 0; i < I915_TEX_UNITS; i++)
1166 ifs->generic_mapping[i] = -1;
1167
1168 p->first_instruction = TRUE;
1169
1170 p->nr_tex_indirect = 1; /* correct? */
1171 p->nr_tex_insn = 0;
1172 p->nr_alu_insn = 0;
1173 p->nr_decl_insn = 0;
1174
1175 p->csr = p->program;
1176 p->decl = p->declarations;
1177 p->decl_s = 0;
1178 p->decl_t = 0;
1179 p->temp_flag = ~0x0 << I915_MAX_TEMPORARY;
1180 p->utemp_flag = ~0x7;
1181
1182 /* initialize the first program word */
1183 *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM;
1184
1185 return p;
1186 }
1187
1188
1189 /* Copy compile results to the fragment program struct and destroy the
1190 * compilation context.
1191 */
1192 static void
1193 i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
1194 {
1195 struct i915_fragment_shader *ifs = p->shader;
1196 unsigned long program_size = (unsigned long) (p->csr - p->program);
1197 unsigned long decl_size = (unsigned long) (p->decl - p->declarations);
1198
1199 if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT)
1200 i915_program_error(p, "Exceeded max nr indirect texture lookups");
1201
1202 if (p->nr_tex_insn > I915_MAX_TEX_INSN)
1203 i915_program_error(p, "Exceeded max TEX instructions");
1204
1205 if (p->nr_alu_insn > I915_MAX_ALU_INSN)
1206 i915_program_error(p, "Exceeded max ALU instructions");
1207
1208 if (p->nr_decl_insn > I915_MAX_DECL_INSN)
1209 i915_program_error(p, "Exceeded max DECL instructions");
1210
1211 if (p->error) {
1212 p->NumNativeInstructions = 0;
1213 p->NumNativeAluInstructions = 0;
1214 p->NumNativeTexInstructions = 0;
1215 p->NumNativeTexIndirections = 0;
1216
1217 i915_use_passthrough_shader(ifs);
1218 }
1219 else {
1220 p->NumNativeInstructions
1221 = p->nr_alu_insn + p->nr_tex_insn + p->nr_decl_insn;
1222 p->NumNativeAluInstructions = p->nr_alu_insn;
1223 p->NumNativeTexInstructions = p->nr_tex_insn;
1224 p->NumNativeTexIndirections = p->nr_tex_indirect;
1225
1226 /* patch in the program length */
1227 p->declarations[0] |= program_size + decl_size - 2;
1228
1229 /* Copy compilation results to fragment program struct:
1230 */
1231 assert(!ifs->program);
1232 ifs->program
1233 = (uint *) MALLOC((program_size + decl_size) * sizeof(uint));
1234 if (ifs->program) {
1235 ifs->program_len = program_size + decl_size;
1236
1237 memcpy(ifs->program,
1238 p->declarations,
1239 decl_size * sizeof(uint));
1240
1241 memcpy(ifs->program + decl_size,
1242 p->program,
1243 program_size * sizeof(uint));
1244 }
1245 }
1246
1247 /* Release the compilation struct:
1248 */
1249 FREE(p);
1250 }
1251
1252
1253
1254
1255
1256 /**
1257 * Rather than trying to intercept and jiggle depth writes during
1258 * emit, just move the value into its correct position at the end of
1259 * the program:
1260 */
1261 static void
1262 i915_fixup_depth_write(struct i915_fp_compile *p)
1263 {
1264 /* XXX assuming pos/depth is always in output[0] */
1265 if (p->shader->info.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
1266 const uint depth = UREG(REG_TYPE_OD, 0);
1267
1268 i915_emit_arith(p,
1269 A0_MOV, /* opcode */
1270 depth, /* dest reg */
1271 A0_DEST_CHANNEL_W, /* write mask */
1272 0, /* saturate? */
1273 swizzle(depth, X, Y, Z, Z), /* src0 */
1274 0, 0 /* src1, src2 */);
1275 }
1276 }
1277
1278
1279 void
1280 i915_translate_fragment_program( struct i915_context *i915,
1281 struct i915_fragment_shader *fs)
1282 {
1283 struct i915_fp_compile *p;
1284 const struct tgsi_token *tokens = fs->state.tokens;
1285
1286 #if 0
1287 tgsi_dump(tokens, 0);
1288 #endif
1289
1290 /* hw doesn't seem to like empty frag programs, even when the depth write
1291 * fixup gets emitted below - may that one is fishy, too? */
1292 if (fs->info.num_instructions == 1) {
1293 i915_use_passthrough_shader(fs);
1294
1295 return;
1296 }
1297
1298 p = i915_init_compile(i915, fs);
1299
1300 i915_translate_instructions(p, tokens, fs);
1301 i915_fixup_depth_write(p);
1302
1303 i915_fini_compile(i915, p);
1304 }