i965g: use Elements in loops over arrays
[mesa.git] / src / gallium / drivers / i965 / brw_wm_fp.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "pipe/p_shader_tokens.h"
34
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_dump.h"
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_util.h"
42
43 #include "brw_wm.h"
44 #include "brw_util.h"
45 #include "brw_debug.h"
46
47
48
49
/* Printable names for the driver-internal WM opcodes.
 * NOTE(review): presumably listed in the same order as the WM_* opcode
 * enumeration declared elsewhere -- confirm before indexing.
 */
static const char *wm_opcode_strings[] = {
   "PIXELXY", "DELTAXY", "PIXELW",
   "LINTERP", "PINTERP", "CINTERP",
   "WPOSXY", "FB_WRITE", "FRONTFACING",
};
61
62 /***********************************************************************
63 * Source regs
64 */
65
66 static struct brw_fp_src src_reg(GLuint file, GLuint idx)
67 {
68 struct brw_fp_src reg;
69 reg.file = file;
70 reg.index = idx;
71 reg.swizzle = BRW_SWIZZLE_XYZW;
72 reg.indirect = 0;
73 reg.negate = 0;
74 reg.abs = 0;
75 return reg;
76 }
77
78 static struct brw_fp_src src_reg_from_dst(struct brw_fp_dst dst)
79 {
80 return src_reg(dst.file, dst.index);
81 }
82
83 static struct brw_fp_src src_undef( void )
84 {
85 return src_reg(TGSI_FILE_NULL, 0);
86 }
87
88 static GLboolean src_is_undef(struct brw_fp_src src)
89 {
90 return src.file == TGSI_FILE_NULL;
91 }
92
93 static struct brw_fp_src src_swizzle( struct brw_fp_src reg, int x, int y, int z, int w )
94 {
95 unsigned swz = reg.swizzle;
96
97 reg.swizzle = ( GET_SWZ(swz, x) << 0 |
98 GET_SWZ(swz, y) << 2 |
99 GET_SWZ(swz, z) << 4 |
100 GET_SWZ(swz, w) << 6 );
101
102 return reg;
103 }
104
105 static struct brw_fp_src src_scalar( struct brw_fp_src reg, int x )
106 {
107 return src_swizzle(reg, x, x, x, x);
108 }
109
110 static struct brw_fp_src src_abs( struct brw_fp_src src )
111 {
112 src.negate = 0;
113 src.abs = 1;
114 return src;
115 }
116
117 static struct brw_fp_src src_negate( struct brw_fp_src src )
118 {
119 src.negate = 1;
120 src.abs = 0;
121 return src;
122 }
123
124
/**
 * Try to express the \p nr floats in \p v as a swizzle of the immediate
 * vector \p v2, appending values to \p v2 where necessary.
 *
 * \param v        values that must be representable
 * \param nr       number of entries in \p v
 * \param v2       immediate storage; must have room for four floats
 * \param nr2      in: number of valid entries in \p v2;
 *                 out: updated count, written ONLY on success
 * \param swizzle  out: two bits per entry of \p v selecting the \p v2 slot
 *                 (zero on failure)
 * \return non-zero on success, zero if \p v2 cannot hold all the values.
 *
 * The expansion is committed only when every value fits.  On failure
 * *nr2 is left untouched so a non-matching immediate does not lose free
 * slots; entries of \p v2 at or beyond *nr2 may have been scribbled on,
 * but they are not considered valid by anyone honouring *nr2.
 */
static int match_or_expand_immediate( const float *v,
                                      unsigned nr,
                                      float *v2,
                                      unsigned *nr2,
                                      unsigned *swizzle )
{
   unsigned count = *nr2;
   unsigned swz = 0;
   unsigned i;

   *swizzle = 0;

   for (i = 0; i < nr; i++) {
      unsigned slot;

      /* Look for an existing (or tentatively appended) equal value. */
      for (slot = 0; slot < count; slot++) {
         if (v[i] == v2[slot])
            break;
      }

      if (slot == count) {
         /* Not present: append, provided the vector is not full. */
         if (count >= 4)
            return 0;

         v2[count++] = v[i];
      }

      swz |= slot << (i * 2);
   }

   *nr2 = count;
   *swizzle = swz;
   return 1;
}
157
158
159
160 /* Internally generated immediates: overkill...
161 */
162 static struct brw_fp_src src_imm( struct brw_wm_compile *c,
163 const GLfloat *v,
164 unsigned nr)
165 {
166 unsigned i, j;
167 unsigned swizzle;
168
169 /* Could do a first pass where we examine all existing immediates
170 * without expanding.
171 */
172
173 for (i = 0; i < c->nr_immediates; i++) {
174 if (match_or_expand_immediate( v,
175 nr,
176 c->immediate[i].v,
177 &c->immediate[i].nr,
178 &swizzle ))
179 goto out;
180 }
181
182 if (c->nr_immediates < Elements(c->immediate)) {
183 i = c->nr_immediates++;
184 if (match_or_expand_immediate( v,
185 nr,
186 c->immediate[i].v,
187 &c->immediate[i].nr,
188 &swizzle ))
189 goto out;
190 }
191
192 c->error = 1;
193 return src_undef();
194
195 out:
196 /* Make sure that all referenced elements are from this immediate.
197 * Has the effect of making size-one immediates into scalars.
198 */
199 for (j = nr; j < 4; j++)
200 swizzle |= (swizzle & 0x3) << (j * 2);
201
202 return src_swizzle( src_reg( TGSI_FILE_IMMEDIATE, i ),
203 GET_SWZ(swizzle, X),
204 GET_SWZ(swizzle, Y),
205 GET_SWZ(swizzle, Z),
206 GET_SWZ(swizzle, W) );
207 }
208
209
210
211 static struct brw_fp_src src_imm1f( struct brw_wm_compile *c,
212 GLfloat f )
213 {
214 return src_imm(c, &f, 1);
215 }
216
217 static struct brw_fp_src src_imm4f( struct brw_wm_compile *c,
218 GLfloat x,
219 GLfloat y,
220 GLfloat z,
221 GLfloat w)
222 {
223 GLfloat f[4] = {x,y,z,w};
224 return src_imm(c, f, 4);
225 }
226
227
228
229 /***********************************************************************
230 * Dest regs
231 */
232
233 static struct brw_fp_dst dst_reg(GLuint file, GLuint idx)
234 {
235 struct brw_fp_dst reg;
236 reg.file = file;
237 reg.index = idx;
238 reg.writemask = BRW_WRITEMASK_XYZW;
239 reg.indirect = 0;
240 return reg;
241 }
242
243 static struct brw_fp_dst dst_mask( struct brw_fp_dst reg, int mask )
244 {
245 reg.writemask &= mask;
246 return reg;
247 }
248
249 static struct brw_fp_dst dst_undef( void )
250 {
251 return dst_reg(TGSI_FILE_NULL, 0);
252 }
253
254 static boolean dst_is_undef( struct brw_fp_dst dst )
255 {
256 return dst.file == TGSI_FILE_NULL;
257 }
258
259 static struct brw_fp_dst dst_saturate( struct brw_fp_dst reg, boolean flag )
260 {
261 reg.saturate = flag;
262 return reg;
263 }
264
265 static struct brw_fp_dst get_temp( struct brw_wm_compile *c )
266 {
267 int bit = ffs( ~c->fp_temp );
268
269 if (!bit) {
270 debug_printf("%s: out of temporaries\n", __FILE__);
271 }
272
273 c->fp_temp |= 1<<(bit-1);
274 return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp+(bit-1));
275 }
276
277
278 static void release_temp( struct brw_wm_compile *c, struct brw_fp_dst temp )
279 {
280 c->fp_temp &= ~(1 << (temp.index - c->fp_first_internal_temp));
281 }
282
283
284 /***********************************************************************
285 * Instructions
286 */
287
288 static struct brw_fp_instruction *get_fp_inst(struct brw_wm_compile *c)
289 {
290 return &c->fp_instructions[c->nr_fp_insns++];
291 }
292
293 static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
294 GLuint op,
295 struct brw_fp_dst dest,
296 GLuint tex_src_unit,
297 GLuint tex_src_target,
298 struct brw_fp_src src0,
299 struct brw_fp_src src1,
300 struct brw_fp_src src2 )
301 {
302 struct brw_fp_instruction *inst = get_fp_inst(c);
303
304 inst->opcode = op;
305 inst->dst = dest;
306 inst->tex_unit = tex_src_unit;
307 inst->tex_target = tex_src_target;
308 inst->src[0] = src0;
309 inst->src[1] = src1;
310 inst->src[2] = src2;
311
312 return inst;
313 }
314
315
316 static INLINE void emit_op3(struct brw_wm_compile *c,
317 GLuint op,
318 struct brw_fp_dst dest,
319 struct brw_fp_src src0,
320 struct brw_fp_src src1,
321 struct brw_fp_src src2 )
322 {
323 emit_tex_op(c, op, dest, 0, 0, src0, src1, src2);
324 }
325
326
327 static INLINE void emit_op2(struct brw_wm_compile *c,
328 GLuint op,
329 struct brw_fp_dst dest,
330 struct brw_fp_src src0,
331 struct brw_fp_src src1)
332 {
333 emit_tex_op(c, op, dest, 0, 0, src0, src1, src_undef());
334 }
335
336 static INLINE void emit_op1(struct brw_wm_compile *c,
337 GLuint op,
338 struct brw_fp_dst dest,
339 struct brw_fp_src src0)
340 {
341 emit_tex_op(c, op, dest, 0, 0, src0, src_undef(), src_undef());
342 }
343
344 static INLINE void emit_op0(struct brw_wm_compile *c,
345 GLuint op,
346 struct brw_fp_dst dest)
347 {
348 emit_tex_op(c, op, dest, 0, 0, src_undef(), src_undef(), src_undef());
349 }
350
351
352
353 /* Many opcodes produce the same value across all the result channels.
354 * We'd rather not have to support that splatting in the opcode implementations,
355 * and brw_wm_pass*.c wants to optimize them out by shuffling references around
356 * anyway. We can easily get both by emitting the opcode to one channel, and
357 * then MOVing it to the others, which brw_wm_pass*.c already understands.
358 */
359 static void emit_scalar_insn(struct brw_wm_compile *c,
360 unsigned opcode,
361 struct brw_fp_dst dst,
362 struct brw_fp_src src0,
363 struct brw_fp_src src1,
364 struct brw_fp_src src2 )
365 {
366 unsigned first_chan = ffs(dst.writemask) - 1;
367 unsigned first_mask = 1 << first_chan;
368
369 if (dst.writemask == 0)
370 return;
371
372 emit_op3( c, opcode,
373 dst_mask(dst, first_mask),
374 src0, src1, src2 );
375
376 if (dst.writemask != first_mask) {
377 emit_op1(c, TGSI_OPCODE_MOV,
378 dst_mask(dst, ~first_mask),
379 src_scalar(src_reg_from_dst(dst), first_chan));
380 }
381 }
382
383
384 /***********************************************************************
385 * Special instructions for interpolation and other tasks
386 */
387
388 static struct brw_fp_src get_pixel_xy( struct brw_wm_compile *c )
389 {
390 if (src_is_undef(c->fp_pixel_xy)) {
391 struct brw_fp_dst pixel_xy = get_temp(c);
392 struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
393
394
395 /* Emit the out calculations, and hold onto the results. Use
396 * two instructions as a temporary is required.
397 */
398 /* pixel_xy.xy = PIXELXY payload[0];
399 */
400 emit_op1(c,
401 WM_PIXELXY,
402 dst_mask(pixel_xy, BRW_WRITEMASK_XY),
403 payload_r0_depth);
404
405 c->fp_pixel_xy = src_reg_from_dst(pixel_xy);
406 }
407
408 return c->fp_pixel_xy;
409 }
410
411 static struct brw_fp_src get_delta_xy( struct brw_wm_compile *c )
412 {
413 if (src_is_undef(c->fp_delta_xy)) {
414 struct brw_fp_dst delta_xy = get_temp(c);
415 struct brw_fp_src pixel_xy = get_pixel_xy(c);
416 struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
417
418 /* deltas.xy = DELTAXY pixel_xy, payload[0]
419 */
420 emit_op3(c,
421 WM_DELTAXY,
422 dst_mask(delta_xy, BRW_WRITEMASK_XY),
423 pixel_xy,
424 payload_r0_depth,
425 src_undef());
426
427 c->fp_delta_xy = src_reg_from_dst(delta_xy);
428 }
429
430 return c->fp_delta_xy;
431 }
432
433 static struct brw_fp_src get_pixel_w( struct brw_wm_compile *c )
434 {
435 if (src_is_undef(c->fp_pixel_w)) {
436 struct brw_fp_dst pixel_w = get_temp(c);
437 struct brw_fp_src deltas = get_delta_xy(c);
438
439 /* XXX: assuming position is always first -- valid?
440 */
441 struct brw_fp_src interp_wpos = src_reg(BRW_FILE_PAYLOAD, 0);
442
443 /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
444 */
445 emit_op3(c,
446 WM_PIXELW,
447 dst_mask(pixel_w, BRW_WRITEMASK_W),
448 interp_wpos,
449 deltas,
450 src_undef());
451
452
453 c->fp_pixel_w = src_reg_from_dst(pixel_w);
454 }
455
456 return c->fp_pixel_w;
457 }
458
459
460 /***********************************************************************
461 * Emit INTERP instructions ahead of first use of each attrib.
462 */
463
464 static void emit_interp( struct brw_wm_compile *c,
465 GLuint idx,
466 GLuint semantic,
467 GLuint interp_mode )
468 {
469 struct brw_fp_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
470 struct brw_fp_src interp = src_reg(BRW_FILE_PAYLOAD, idx);
471 struct brw_fp_src deltas = get_delta_xy(c);
472
473 /* Need to use PINTERP on attributes which have been
474 * multiplied by 1/W in the SF program, and LINTERP on those
475 * which have not:
476 */
477 switch (semantic) {
478 case TGSI_SEMANTIC_POSITION:
479 /* Have to treat wpos.xy specially:
480 */
481 emit_op1(c,
482 WM_WPOSXY,
483 dst_mask(dst, BRW_WRITEMASK_XY),
484 get_pixel_xy(c));
485
486 /* TGSI_FILE_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
487 */
488 emit_op2(c,
489 WM_LINTERP,
490 dst_mask(dst, BRW_WRITEMASK_ZW),
491 interp,
492 deltas);
493 break;
494
495 case TGSI_SEMANTIC_COLOR:
496 if (c->key.flat_shade) {
497 emit_op1(c,
498 WM_CINTERP,
499 dst,
500 interp);
501 }
502 else if (interp_mode == TGSI_INTERPOLATE_LINEAR) {
503 emit_op2(c,
504 WM_LINTERP,
505 dst,
506 interp,
507 deltas);
508 }
509 else {
510 emit_op3(c,
511 WM_PINTERP,
512 dst,
513 interp,
514 deltas,
515 get_pixel_w(c));
516 }
517
518 break;
519
520 case TGSI_SEMANTIC_FOG:
521 /* Interpolate the fog coordinate */
522 emit_op3(c,
523 WM_PINTERP,
524 dst_mask(dst, BRW_WRITEMASK_X),
525 interp,
526 deltas,
527 get_pixel_w(c));
528
529 emit_op1(c,
530 TGSI_OPCODE_MOV,
531 dst_mask(dst, BRW_WRITEMASK_YZ),
532 src_imm1f(c, 0.0));
533
534 emit_op1(c,
535 TGSI_OPCODE_MOV,
536 dst_mask(dst, BRW_WRITEMASK_W),
537 src_imm1f(c, 1.0));
538 break;
539
540 case TGSI_SEMANTIC_FACE:
541 /* XXX review/test this case */
542 emit_op0(c,
543 WM_FRONTFACING,
544 dst_mask(dst, BRW_WRITEMASK_X));
545
546 emit_op1(c,
547 TGSI_OPCODE_MOV,
548 dst_mask(dst, BRW_WRITEMASK_YZ),
549 src_imm1f(c, 0.0));
550
551 emit_op1(c,
552 TGSI_OPCODE_MOV,
553 dst_mask(dst, BRW_WRITEMASK_W),
554 src_imm1f(c, 1.0));
555 break;
556
557 case TGSI_SEMANTIC_PSIZE:
558 /* XXX review/test this case */
559 emit_op3(c,
560 WM_PINTERP,
561 dst_mask(dst, BRW_WRITEMASK_XY),
562 interp,
563 deltas,
564 get_pixel_w(c));
565
566 emit_op1(c,
567 TGSI_OPCODE_MOV,
568 dst_mask(dst, BRW_WRITEMASK_Z),
569 src_imm1f(c, 0.0f));
570
571 emit_op1(c,
572 TGSI_OPCODE_MOV,
573 dst_mask(dst, BRW_WRITEMASK_W),
574 src_imm1f(c, 1.0f));
575 break;
576
577 default:
578 switch (interp_mode) {
579 case TGSI_INTERPOLATE_CONSTANT:
580 emit_op1(c,
581 WM_CINTERP,
582 dst,
583 interp);
584 break;
585
586 case TGSI_INTERPOLATE_LINEAR:
587 emit_op2(c,
588 WM_LINTERP,
589 dst,
590 interp,
591 deltas);
592 break;
593
594 case TGSI_INTERPOLATE_PERSPECTIVE:
595 emit_op3(c,
596 WM_PINTERP,
597 dst,
598 interp,
599 deltas,
600 get_pixel_w(c));
601 break;
602 }
603 break;
604 }
605 }
606
607
608 /***********************************************************************
609 * Expand various instructions here to simpler forms.
610 */
611 static void precalc_dst( struct brw_wm_compile *c,
612 struct brw_fp_dst dst,
613 struct brw_fp_src src0,
614 struct brw_fp_src src1 )
615 {
616 if (dst.writemask & BRW_WRITEMASK_Y) {
617 /* dst.y = mul src0.y, src1.y
618 */
619 emit_op2(c,
620 TGSI_OPCODE_MUL,
621 dst_mask(dst, BRW_WRITEMASK_Y),
622 src0,
623 src1);
624 }
625
626 if (dst.writemask & BRW_WRITEMASK_XZ) {
627 /* dst.z = mov src0.zzzz
628 */
629 emit_op1(c,
630 TGSI_OPCODE_MOV,
631 dst_mask(dst, BRW_WRITEMASK_Z),
632 src_scalar(src0, Z));
633
634 /* dst.x = imm1f(1.0)
635 */
636 emit_op1(c,
637 TGSI_OPCODE_MOV,
638 dst_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
639 src_imm1f(c, 1.0));
640 }
641 if (dst.writemask & BRW_WRITEMASK_W) {
642 /* dst.w = mov src1.w
643 */
644 emit_op1(c,
645 TGSI_OPCODE_MOV,
646 dst_mask(dst, BRW_WRITEMASK_W),
647 src1);
648 }
649 }
650
651
652 static void precalc_lit( struct brw_wm_compile *c,
653 struct brw_fp_dst dst,
654 struct brw_fp_src src0 )
655 {
656 if (dst.writemask & BRW_WRITEMASK_XW) {
657 /* dst.xw = imm(1.0f)
658 */
659 emit_op1(c,
660 TGSI_OPCODE_MOV,
661 dst_saturate(dst_mask(dst, BRW_WRITEMASK_XW), 0),
662 src_imm1f(c, 1.0f));
663 }
664
665 if (dst.writemask & BRW_WRITEMASK_YZ) {
666 emit_op1(c,
667 TGSI_OPCODE_LIT,
668 dst_mask(dst, BRW_WRITEMASK_YZ),
669 src0);
670 }
671 }
672
673
674 /**
675 * Some TEX instructions require extra code, cube map coordinate
676 * normalization, or coordinate scaling for RECT textures, etc.
677 * This function emits those extra instructions and the TEX
678 * instruction itself.
679 */
680 static void precalc_tex( struct brw_wm_compile *c,
681 struct brw_fp_dst dst,
682 unsigned target,
683 unsigned unit,
684 struct brw_fp_src src0 )
685 {
686 struct brw_fp_src coord = src_undef();
687 struct brw_fp_dst tmp = dst_undef();
688
689 assert(unit < BRW_MAX_TEX_UNIT);
690
691 /* Cubemap: find longest component of coord vector and normalize
692 * it.
693 */
694 if (target == TGSI_TEXTURE_CUBE) {
695 struct brw_fp_src tmpsrc;
696
697 tmp = get_temp(c);
698 tmpsrc = src_reg_from_dst(tmp);
699
700 /* tmp = abs(src0) */
701 emit_op1(c,
702 TGSI_OPCODE_MOV,
703 tmp,
704 src_abs(src0));
705
706 /* tmp.X = MAX(tmp.X, tmp.Y) */
707 emit_op2(c, TGSI_OPCODE_MAX,
708 dst_mask(tmp, BRW_WRITEMASK_X),
709 src_scalar(tmpsrc, X),
710 src_scalar(tmpsrc, Y));
711
712 /* tmp.X = MAX(tmp.X, tmp.Z) */
713 emit_op2(c, TGSI_OPCODE_MAX,
714 dst_mask(tmp, BRW_WRITEMASK_X),
715 tmpsrc,
716 src_scalar(tmpsrc, Z));
717
718 /* tmp.X = 1 / tmp.X */
719 emit_op1(c, TGSI_OPCODE_RCP,
720 dst_mask(tmp, BRW_WRITEMASK_X),
721 tmpsrc);
722
723 /* tmp = src0 * tmp.xxxx */
724 emit_op2(c, TGSI_OPCODE_MUL,
725 tmp,
726 src0,
727 src_scalar(tmpsrc, X));
728
729 coord = tmpsrc;
730 }
731 else if (target == TGSI_TEXTURE_RECT ||
732 target == TGSI_TEXTURE_SHADOWRECT) {
733 /* XXX: need a mechanism for internally generated constants.
734 */
735 coord = src0;
736 }
737 else {
738 coord = src0;
739 }
740
741 /* Need to emit YUV texture conversions by hand. Probably need to
742 * do this here - the alternative is in brw_wm_emit.c, but the
743 * conversion requires allocating a temporary variable which we
744 * don't have the facility to do that late in the compilation.
745 */
746 if (c->key.yuvtex_mask & (1 << unit)) {
747 /* convert ycbcr to RGBA */
748 GLboolean swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
749 struct brw_fp_dst tmp = get_temp(c);
750 struct brw_fp_src tmpsrc = src_reg_from_dst(tmp);
751 struct brw_fp_src C0 = src_imm4f( c, -.5, -.0625, -.5, 1.164 );
752 struct brw_fp_src C1 = src_imm4f( c, 1.596, -0.813, 2.018, -.391 );
753
754 /* tmp = TEX ...
755 */
756 emit_tex_op(c,
757 TGSI_OPCODE_TEX,
758 dst_saturate(tmp, dst.saturate),
759 unit,
760 target,
761 coord,
762 src_undef(),
763 src_undef());
764
765 /* tmp.xyz = ADD TMP, C0
766 */
767 emit_op2(c, TGSI_OPCODE_ADD,
768 dst_mask(tmp, BRW_WRITEMASK_XYZ),
769 tmpsrc,
770 C0);
771
772 /* YUV.y = MUL YUV.y, C0.w
773 */
774 emit_op2(c, TGSI_OPCODE_MUL,
775 dst_mask(tmp, BRW_WRITEMASK_Y),
776 tmpsrc,
777 src_scalar(C0, W));
778
779 /*
780 * if (UV swaped)
781 * RGB.xyz = MAD YUV.zzx, C1, YUV.y
782 * else
783 * RGB.xyz = MAD YUV.xxz, C1, YUV.y
784 */
785
786 emit_op3(c, TGSI_OPCODE_MAD,
787 dst_mask(dst, BRW_WRITEMASK_XYZ),
788 ( swap_uv ?
789 src_swizzle(tmpsrc, Z,Z,X,X) :
790 src_swizzle(tmpsrc, X,X,Z,Z)),
791 C1,
792 src_scalar(tmpsrc, Y));
793
794 /* RGB.y = MAD YUV.z, C1.w, RGB.y
795 */
796 emit_op3(c,
797 TGSI_OPCODE_MAD,
798 dst_mask(dst, BRW_WRITEMASK_Y),
799 src_scalar(tmpsrc, Z),
800 src_scalar(C1, W),
801 src_scalar(src_reg_from_dst(dst), Y));
802
803 release_temp(c, tmp);
804 }
805 else {
806 /* ordinary RGBA tex instruction */
807 emit_tex_op(c,
808 TGSI_OPCODE_TEX,
809 dst,
810 unit,
811 target,
812 coord,
813 src_undef(),
814 src_undef());
815 }
816
817 /* XXX: add GL_EXT_texture_swizzle support to gallium -- by
818 * generating shader varients in mesa state tracker.
819 */
820
821 /* Release this temp if we ended up allocating it:
822 */
823 if (!dst_is_undef(tmp))
824 release_temp(c, tmp);
825 }
826
827
828 /**
829 * Check if the given TXP instruction really needs the divide-by-W step.
830 */
831 static GLboolean projtex( struct brw_wm_compile *c,
832 unsigned target,
833 struct brw_fp_src src )
834 {
835 /* Only try to detect the simplest cases. Could detect (later)
836 * cases where we are trying to emit code like RCP {1.0}, MUL x,
837 * {1.0}, and so on.
838 *
839 * More complex cases than this typically only arise from
840 * user-provided fragment programs anyway:
841 */
842 if (target == TGSI_TEXTURE_CUBE)
843 return GL_FALSE; /* ut2004 gun rendering !?! */
844
845 if (src.file == TGSI_FILE_INPUT &&
846 GET_SWZ(src.swizzle, W) == W &&
847 c->fp->info.input_interpolate[src.index] != TGSI_INTERPOLATE_PERSPECTIVE)
848 return GL_FALSE;
849
850 return GL_TRUE;
851 }
852
853
854 /**
855 * Emit code for TXP.
856 */
857 static void precalc_txp( struct brw_wm_compile *c,
858 struct brw_fp_dst dst,
859 unsigned target,
860 unsigned unit,
861 struct brw_fp_src src0 )
862 {
863 if (projtex(c, target, src0)) {
864 struct brw_fp_dst tmp = get_temp(c);
865
866 /* tmp0.w = RCP inst.arg[0][3]
867 */
868 emit_op1(c,
869 TGSI_OPCODE_RCP,
870 dst_mask(tmp, BRW_WRITEMASK_W),
871 src_scalar(src0, W));
872
873 /* tmp0.xyz = MUL inst.arg[0], tmp0.wwww
874 */
875 emit_op2(c,
876 TGSI_OPCODE_MUL,
877 dst_mask(tmp, BRW_WRITEMASK_XYZ),
878 src0,
879 src_scalar(src_reg_from_dst(tmp), W));
880
881 /* dst = TEX tmp0
882 */
883 precalc_tex(c,
884 dst,
885 target,
886 unit,
887 src_reg_from_dst(tmp));
888
889 release_temp(c, tmp);
890 }
891 else
892 {
893 /* dst = TEX src0
894 */
895 precalc_tex(c, dst, target, unit, src0);
896 }
897 }
898
899
900 /* XXX: note this returns a src_reg.
901 */
902 static struct brw_fp_src
903 find_output_by_semantic( struct brw_wm_compile *c,
904 unsigned semantic,
905 unsigned index )
906 {
907 const struct tgsi_shader_info *info = &c->fp->info;
908 unsigned i;
909
910 for (i = 0; i < info->num_outputs; i++)
911 if (info->output_semantic_name[i] == semantic &&
912 info->output_semantic_index[i] == index)
913 return src_reg( TGSI_FILE_OUTPUT, i );
914
915 /* If not found, return some arbitrary immediate value:
916 */
917 return src_imm1f(c, 1.0);
918 }
919
920
921 static void emit_fb_write( struct brw_wm_compile *c )
922 {
923 struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
924 struct brw_fp_src outdepth = find_output_by_semantic(c, TGSI_SEMANTIC_POSITION, 0);
925 GLuint i;
926
927
928 outdepth = src_scalar(outdepth, Z);
929
930 for (i = 0 ; i < c->key.nr_cbufs; i++) {
931 struct brw_fp_src outcolor;
932 unsigned target = 1<<i;
933
934 /* Set EOT flag on last inst:
935 */
936 if (i == c->key.nr_cbufs - 1)
937 target |= 1;
938
939 outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
940
941 /* Use emit_tex_op so that we can specify the inst->tex_target
942 * field, which is abused to contain the FB write target and the
943 * EOT marker
944 */
945 emit_tex_op(c, WM_FB_WRITE,
946 dst_undef(),
947 target,
948 0,
949 outcolor,
950 payload_r0_depth,
951 outdepth);
952 }
953 }
954
955
956 static struct brw_fp_dst translate_dst( struct brw_wm_compile *c,
957 const struct tgsi_full_dst_register *dst,
958 unsigned saturate )
959 {
960 struct brw_fp_dst out;
961
962 out.file = dst->DstRegister.File;
963 out.index = dst->DstRegister.Index;
964 out.writemask = dst->DstRegister.WriteMask;
965 out.indirect = dst->DstRegister.Indirect;
966 out.saturate = (saturate == TGSI_SAT_ZERO_ONE);
967
968 if (out.indirect) {
969 assert(dst->DstRegisterInd.File == TGSI_FILE_ADDRESS);
970 assert(dst->DstRegisterInd.Index == 0);
971 }
972
973 return out;
974 }
975
976
977 static struct brw_fp_src translate_src( struct brw_wm_compile *c,
978 const struct tgsi_full_src_register *src )
979 {
980 struct brw_fp_src out;
981
982 out.file = src->SrcRegister.File;
983 out.index = src->SrcRegister.Index;
984 out.indirect = src->SrcRegister.Indirect;
985
986 out.swizzle = ((src->SrcRegister.SwizzleX << 0) |
987 (src->SrcRegister.SwizzleY << 2) |
988 (src->SrcRegister.SwizzleZ << 4) |
989 (src->SrcRegister.SwizzleW << 6));
990
991 switch (tgsi_util_get_full_src_register_sign_mode( src, 0 )) {
992 case TGSI_UTIL_SIGN_CLEAR:
993 out.abs = 1;
994 out.negate = 0;
995 break;
996
997 case TGSI_UTIL_SIGN_SET:
998 out.abs = 1;
999 out.negate = 1;
1000 break;
1001
1002 case TGSI_UTIL_SIGN_TOGGLE:
1003 out.abs = 0;
1004 out.negate = 1;
1005 break;
1006
1007 case TGSI_UTIL_SIGN_KEEP:
1008 default:
1009 out.abs = 0;
1010 out.negate = 0;
1011 break;
1012 }
1013
1014 if (out.indirect) {
1015 assert(src->SrcRegisterInd.File == TGSI_FILE_ADDRESS);
1016 assert(src->SrcRegisterInd.Index == 0);
1017 }
1018
1019 return out;
1020 }
1021
1022
1023
1024 static void emit_insn( struct brw_wm_compile *c,
1025 const struct tgsi_full_instruction *inst )
1026 {
1027 unsigned opcode = inst->Instruction.Opcode;
1028 struct brw_fp_dst dst;
1029 struct brw_fp_src src[3];
1030 int i;
1031
1032 dst = translate_dst( c, &inst->FullDstRegisters[0],
1033 inst->Instruction.Saturate );
1034
1035 for (i = 0; i < inst->Instruction.NumSrcRegs; i++)
1036 src[i] = translate_src( c, &inst->FullSrcRegisters[0] );
1037
1038 switch (opcode) {
1039 case TGSI_OPCODE_ABS:
1040 emit_op1(c, TGSI_OPCODE_MOV,
1041 dst,
1042 src_abs(src[0]));
1043 break;
1044
1045 case TGSI_OPCODE_SUB:
1046 emit_op2(c, TGSI_OPCODE_ADD,
1047 dst,
1048 src[0],
1049 src_negate(src[1]));
1050 break;
1051
1052 case TGSI_OPCODE_SCS:
1053 emit_op1(c, TGSI_OPCODE_SCS,
1054 dst_mask(dst, BRW_WRITEMASK_XY),
1055 src[0]);
1056 break;
1057
1058 case TGSI_OPCODE_DST:
1059 precalc_dst(c, dst, src[0], src[1]);
1060 break;
1061
1062 case TGSI_OPCODE_LIT:
1063 precalc_lit(c, dst, src[0]);
1064 break;
1065
1066 case TGSI_OPCODE_TEX:
1067 precalc_tex(c, dst,
1068 inst->InstructionExtTexture.Texture,
1069 src[0].file, /* sampler unit */
1070 src[1] );
1071 break;
1072
1073 case TGSI_OPCODE_TXP:
1074 precalc_txp(c, dst,
1075 inst->InstructionExtTexture.Texture,
1076 src[0].file, /* sampler unit */
1077 src[1] );
1078 break;
1079
1080 case TGSI_OPCODE_TXB:
1081 /* XXX: TXB not done
1082 */
1083 precalc_tex(c, dst,
1084 inst->InstructionExtTexture.Texture,
1085 src[0].file, /* sampler unit */
1086 src[1] );
1087 break;
1088
1089 case TGSI_OPCODE_XPD:
1090 emit_op2(c, TGSI_OPCODE_XPD,
1091 dst_mask(dst, BRW_WRITEMASK_XYZ),
1092 src[0],
1093 src[1]);
1094 break;
1095
1096 case TGSI_OPCODE_KIL:
1097 emit_op1(c, TGSI_OPCODE_KIL,
1098 dst_mask(dst_undef(), 0),
1099 src[0]);
1100 break;
1101
1102 case TGSI_OPCODE_END:
1103 emit_fb_write(c);
1104 break;
1105 default:
1106 if (!c->key.has_flow_control &&
1107 brw_wm_is_scalar_result(opcode))
1108 emit_scalar_insn(c, opcode, dst, src[0], src[1], src[2]);
1109 else
1110 emit_op3(c, opcode, dst, src[0], src[1], src[2]);
1111 break;
1112 }
1113 }
1114
1115 /**
1116 * Initial pass for fragment program code generation.
1117 * This function is used by both the GLSL and non-GLSL paths.
1118 */
1119 int brw_wm_pass_fp( struct brw_wm_compile *c )
1120 {
1121 struct brw_fragment_shader *fs = c->fp;
1122 struct tgsi_parse_context parse;
1123 struct tgsi_full_instruction *inst;
1124 struct tgsi_full_declaration *decl;
1125 const float *imm;
1126 GLuint size;
1127 GLuint i;
1128
1129 if (BRW_DEBUG & DEBUG_WM) {
1130 debug_printf("pre-fp:\n");
1131 tgsi_dump(fs->tokens, 0);
1132 }
1133
1134 c->fp_pixel_xy = src_undef();
1135 c->fp_delta_xy = src_undef();
1136 c->fp_pixel_w = src_undef();
1137 c->nr_fp_insns = 0;
1138 c->nr_immediates = 0;
1139
1140
1141 /* Loop over all instructions doing assorted simplifications and
1142 * transformations.
1143 */
1144 tgsi_parse_init( &parse, fs->tokens );
1145 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1146 tgsi_parse_token( &parse );
1147
1148 switch( parse.FullToken.Token.Type ) {
1149 case TGSI_TOKEN_TYPE_DECLARATION:
1150 /* Turn intput declarations into special WM_* instructions.
1151 *
1152 * XXX: For non-branching shaders, consider deferring variable
1153 * initialization as late as possible to minimize register
1154 * usage. This is how the original BRW driver worked.
1155 *
1156 * In a branching shader, must preamble instructions at decl
1157 * time, as instruction order in the shader does not
1158 * correspond to the order instructions are executed in the
1159 * wild.
1160 *
1161 * This is where special instructions such as WM_CINTERP,
1162 * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
1163 * compute shader inputs from the payload registers and pixel
1164 * position.
1165 */
1166 decl = &parse.FullToken.FullDeclaration;
1167 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1168 unsigned first, last, mask;
1169 unsigned attrib;
1170
1171 first = decl->DeclarationRange.First;
1172 last = decl->DeclarationRange.Last;
1173 mask = decl->Declaration.UsageMask;
1174
1175 for (attrib = first; attrib <= last; attrib++) {
1176 emit_interp(c,
1177 attrib,
1178 decl->Semantic.SemanticName,
1179 decl->Declaration.Interpolate );
1180 }
1181 }
1182
1183 break;
1184
1185 case TGSI_TOKEN_TYPE_IMMEDIATE:
1186 /* Unlike VS programs we can probably manage fine encoding
1187 * immediate values directly into the emitted EU
1188 * instructions, as we probably only need to reference one
1189 * float value per instruction. Just save the data for now
1190 * and use directly later.
1191 */
1192 i = c->nr_immediates++;
1193 imm = &parse.FullToken.FullImmediate.u[i].Float;
1194 size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1195
1196 if (c->nr_immediates >= BRW_WM_MAX_CONST)
1197 return PIPE_ERROR_OUT_OF_MEMORY;
1198
1199 for (i = 0; i < size; i++)
1200 c->immediate[c->nr_immediates].v[i] = imm[i];
1201
1202 for (; i < 4; i++)
1203 c->immediate[c->nr_immediates].v[i] = 0.0;
1204
1205 c->immediate[c->nr_immediates].nr = size;
1206 c->nr_immediates++;
1207 break;
1208
1209 case TGSI_TOKEN_TYPE_INSTRUCTION:
1210 inst = &parse.FullToken.FullInstruction;
1211 emit_insn(c, inst);
1212 break;
1213 }
1214 }
1215
1216 if (BRW_DEBUG & DEBUG_WM) {
1217 debug_printf("pass_fp:\n");
1218 //brw_print_program( c->fp_brw_program );
1219 debug_printf("\n");
1220 }
1221
1222 return c->error;
1223 }
1224