2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
32 #include "util/u_math.h"
33 #include "tgsi/tgsi_info.h"
35 #include "brw_context.h"
37 #include "brw_debug.h"
38 #include "brw_disasm.h"
40 /* Not quite sure how correct this is - need to understand horiz
41 * vs. vertical strides a little better.
43 static INLINE
struct brw_reg
sechalf( struct brw_reg reg
)
52 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 quads,
53 * corresponding to each of the 16 execution channels.
55 * R1.0 -- triangle vertex 0.X
56 * R1.1 -- triangle vertex 0.Y
57 * R1.2 -- quad 0 x,y coords (2 packed uwords)
58 * R1.3 -- quad 1 x,y coords (2 packed uwords)
59 * R1.4 -- quad 2 x,y coords (2 packed uwords)
60 * R1.5 -- quad 3 x,y coords (2 packed uwords)
67 static void emit_pixel_xy(struct brw_compile
*p
,
68 const struct brw_reg
*dst
,
71 struct brw_reg r1
= brw_vec1_grf(1, 0);
72 struct brw_reg r1_uw
= retype(r1
, BRW_REGISTER_TYPE_UW
);
74 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
76 /* Calculate pixel centers by adding 1 or 0 to each of the
77 * micro-tile coordinates passed in r1.
79 if (mask
& BRW_WRITEMASK_X
) {
81 vec16(retype(dst
[0], BRW_REGISTER_TYPE_UW
)),
82 stride(suboffset(r1_uw
, 4), 2, 4, 0),
83 brw_imm_v(0x10101010));
86 if (mask
& BRW_WRITEMASK_Y
) {
88 vec16(retype(dst
[1], BRW_REGISTER_TYPE_UW
)),
89 stride(suboffset(r1_uw
,5), 2, 4, 0),
90 brw_imm_v(0x11001100));
93 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
98 static void emit_delta_xy(struct brw_compile
*p
,
99 const struct brw_reg
*dst
,
101 const struct brw_reg
*arg0
)
103 struct brw_reg r1
= brw_vec1_grf(1, 0);
105 /* Calc delta X,Y by subtracting origin in r1 from the pixel
108 if (mask
& BRW_WRITEMASK_X
) {
111 retype(arg0
[0], BRW_REGISTER_TYPE_UW
),
115 if (mask
& BRW_WRITEMASK_Y
) {
118 retype(arg0
[1], BRW_REGISTER_TYPE_UW
),
119 negate(suboffset(r1
,1)));
124 static void emit_wpos_xy(struct brw_wm_compile
*c
,
125 const struct brw_reg
*dst
,
127 const struct brw_reg
*arg0
)
129 struct brw_compile
*p
= &c
->func
;
131 if (mask
& BRW_WRITEMASK_X
) {
135 retype(arg0
[0], BRW_REGISTER_TYPE_W
));
138 /* XXX: is this needed any more, or is this a NOOP?
140 if (mask
& BRW_WRITEMASK_Y
) {
142 /* Y' = height - 1 - Y */
145 negate(retype(arg0
[1], BRW_REGISTER_TYPE_W
)),
146 brw_imm_d(c
->key
.drawable_height
- 1));
150 retype(arg0
[0], BRW_REGISTER_TYPE_W
));
156 static void emit_pixel_w( struct brw_compile
*p
,
157 const struct brw_reg
*dst
,
159 const struct brw_reg
*arg0
,
160 const struct brw_reg
*deltas
)
162 /* Don't need this if all you are doing is interpolating color, for
165 if (mask
& BRW_WRITEMASK_W
) {
166 struct brw_reg interp3
= brw_vec1_grf(arg0
[0].nr
+1, 4);
168 /* Calc 1/w - just linterp wpos[3] optimized by putting the
169 * result straight into a message reg.
171 brw_LINE(p
, brw_null_reg(), interp3
, deltas
[0]);
172 brw_MAC(p
, brw_message_reg(2), suboffset(interp3
, 1), deltas
[1]);
175 brw_math_16( p
, dst
[3],
176 BRW_MATH_FUNCTION_INV
,
177 BRW_MATH_SATURATE_NONE
,
179 BRW_MATH_PRECISION_FULL
);
185 static void emit_linterp( struct brw_compile
*p
,
186 const struct brw_reg
*dst
,
188 const struct brw_reg
*arg0
,
189 const struct brw_reg
*deltas
)
191 struct brw_reg interp
[4];
192 GLuint nr
= arg0
[0].nr
;
195 interp
[0] = brw_vec1_grf(nr
, 0);
196 interp
[1] = brw_vec1_grf(nr
, 4);
197 interp
[2] = brw_vec1_grf(nr
+1, 0);
198 interp
[3] = brw_vec1_grf(nr
+1, 4);
200 for (i
= 0; i
< 4; i
++) {
202 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
203 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
209 static void emit_pinterp( struct brw_compile
*p
,
210 const struct brw_reg
*dst
,
212 const struct brw_reg
*arg0
,
213 const struct brw_reg
*deltas
,
214 const struct brw_reg
*w
)
216 struct brw_reg interp
[4];
217 GLuint nr
= arg0
[0].nr
;
220 interp
[0] = brw_vec1_grf(nr
, 0);
221 interp
[1] = brw_vec1_grf(nr
, 4);
222 interp
[2] = brw_vec1_grf(nr
+1, 0);
223 interp
[3] = brw_vec1_grf(nr
+1, 4);
225 for (i
= 0; i
< 4; i
++) {
227 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
228 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
231 for (i
= 0; i
< 4; i
++) {
233 brw_MUL(p
, dst
[i
], dst
[i
], w
[3]);
239 static void emit_cinterp( struct brw_compile
*p
,
240 const struct brw_reg
*dst
,
242 const struct brw_reg
*arg0
)
244 struct brw_reg interp
[4];
245 GLuint nr
= arg0
[0].nr
;
248 interp
[0] = brw_vec1_grf(nr
, 0);
249 interp
[1] = brw_vec1_grf(nr
, 4);
250 interp
[2] = brw_vec1_grf(nr
+1, 0);
251 interp
[3] = brw_vec1_grf(nr
+1, 4);
253 for (i
= 0; i
< 4; i
++) {
255 brw_MOV(p
, dst
[i
], suboffset(interp
[i
],3)); /* TODO: optimize away like other moves */
260 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
261 static void emit_frontfacing( struct brw_compile
*p
,
262 const struct brw_reg
*dst
,
265 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
268 if (!(mask
& BRW_WRITEMASK_XYZW
))
271 for (i
= 0; i
< 4; i
++) {
273 brw_MOV(p
, dst
[i
], brw_imm_f(0.0));
277 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
280 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, r1_6ud
, brw_imm_ud(1 << 31));
281 for (i
= 0; i
< 4; i
++) {
283 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
286 brw_set_predicate_control_flag_value(p
, 0xff);
289 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
292 * arg0: q0.tl q0.tr q0.bl q0.br q1.tl q1.tr q1.bl q1.br
294 * and we're trying to produce:
297 * dst: (q0.tr - q0.tl) (q0.tl - q0.bl)
298 * (q0.tr - q0.tl) (q0.tr - q0.br)
299 * (q0.br - q0.bl) (q0.tl - q0.bl)
300 * (q0.br - q0.bl) (q0.tr - q0.br)
301 * (q1.tr - q1.tl) (q1.tl - q1.bl)
302 * (q1.tr - q1.tl) (q1.tr - q1.br)
303 * (q1.br - q1.bl) (q1.tl - q1.bl)
304 * (q1.br - q1.bl) (q1.tr - q1.br)
306 * and add two more quads if in 16-pixel dispatch mode.
308 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
309 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
310 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
311 * between each other. We could probably do it like ddx and swizzle the right
312 * order later, but bail for now and just produce
313 * ((q0.tl - q0.bl)x4 (q1.tl - q1.bl)x4)
315 void emit_ddxy(struct brw_compile
*p
,
316 const struct brw_reg
*dst
,
319 const struct brw_reg
*arg0
)
322 struct brw_reg src0
, src1
;
325 brw_set_saturate(p
, 1);
326 for (i
= 0; i
< 4; i
++ ) {
329 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 1,
331 BRW_VERTICAL_STRIDE_2
,
333 BRW_HORIZONTAL_STRIDE_0
,
334 BRW_SWIZZLE_XYZW
, BRW_WRITEMASK_XYZW
);
335 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
337 BRW_VERTICAL_STRIDE_2
,
339 BRW_HORIZONTAL_STRIDE_0
,
340 BRW_SWIZZLE_XYZW
, BRW_WRITEMASK_XYZW
);
342 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
344 BRW_VERTICAL_STRIDE_4
,
346 BRW_HORIZONTAL_STRIDE_0
,
347 BRW_SWIZZLE_XYZW
, BRW_WRITEMASK_XYZW
);
348 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 2,
350 BRW_VERTICAL_STRIDE_4
,
352 BRW_HORIZONTAL_STRIDE_0
,
353 BRW_SWIZZLE_XYZW
, BRW_WRITEMASK_XYZW
);
355 brw_ADD(p
, dst
[i
], src0
, negate(src1
));
359 brw_set_saturate(p
, 0);
362 static void emit_alu1( struct brw_compile
*p
,
363 struct brw_instruction
*(*func
)(struct brw_compile
*,
366 const struct brw_reg
*dst
,
368 const struct brw_reg
*arg0
)
373 brw_set_saturate(p
, 1);
375 for (i
= 0; i
< 4; i
++) {
377 func(p
, dst
[i
], arg0
[i
]);
382 brw_set_saturate(p
, 0);
386 static void emit_alu2( struct brw_compile
*p
,
387 struct brw_instruction
*(*func
)(struct brw_compile
*,
391 const struct brw_reg
*dst
,
393 const struct brw_reg
*arg0
,
394 const struct brw_reg
*arg1
)
399 brw_set_saturate(p
, 1);
401 for (i
= 0; i
< 4; i
++) {
403 func(p
, dst
[i
], arg0
[i
], arg1
[i
]);
408 brw_set_saturate(p
, 0);
412 static void emit_mad( struct brw_compile
*p
,
413 const struct brw_reg
*dst
,
415 const struct brw_reg
*arg0
,
416 const struct brw_reg
*arg1
,
417 const struct brw_reg
*arg2
)
421 for (i
= 0; i
< 4; i
++) {
423 brw_MUL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
425 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
426 brw_ADD(p
, dst
[i
], dst
[i
], arg2
[i
]);
427 brw_set_saturate(p
, 0);
432 static void emit_trunc( struct brw_compile
*p
,
433 const struct brw_reg
*dst
,
435 const struct brw_reg
*arg0
)
439 for (i
= 0; i
< 4; i
++) {
441 brw_RNDZ(p
, dst
[i
], arg0
[i
]);
446 static void emit_lrp( struct brw_compile
*p
,
447 const struct brw_reg
*dst
,
449 const struct brw_reg
*arg0
,
450 const struct brw_reg
*arg1
,
451 const struct brw_reg
*arg2
)
455 /* Uses dst as a temporary:
457 for (i
= 0; i
< 4; i
++) {
459 /* Can I use the LINE instruction for this?
461 brw_ADD(p
, dst
[i
], negate(arg0
[i
]), brw_imm_f(1.0));
462 brw_MUL(p
, brw_null_reg(), dst
[i
], arg2
[i
]);
464 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
465 brw_MAC(p
, dst
[i
], arg0
[i
], arg1
[i
]);
466 brw_set_saturate(p
, 0);
471 static void emit_sop( struct brw_compile
*p
,
472 const struct brw_reg
*dst
,
475 const struct brw_reg
*arg0
,
476 const struct brw_reg
*arg1
)
480 for (i
= 0; i
< 4; i
++) {
482 brw_MOV(p
, dst
[i
], brw_imm_f(0));
483 brw_CMP(p
, brw_null_reg(), cond
, arg0
[i
], arg1
[i
]);
484 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
485 brw_set_predicate_control_flag_value(p
, 0xff);
490 static void emit_slt( struct brw_compile
*p
,
491 const struct brw_reg
*dst
,
493 const struct brw_reg
*arg0
,
494 const struct brw_reg
*arg1
)
496 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_L
, arg0
, arg1
);
499 static void emit_sle( struct brw_compile
*p
,
500 const struct brw_reg
*dst
,
502 const struct brw_reg
*arg0
,
503 const struct brw_reg
*arg1
)
505 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_LE
, arg0
, arg1
);
508 static void emit_sgt( struct brw_compile
*p
,
509 const struct brw_reg
*dst
,
511 const struct brw_reg
*arg0
,
512 const struct brw_reg
*arg1
)
514 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_G
, arg0
, arg1
);
517 static void emit_sge( struct brw_compile
*p
,
518 const struct brw_reg
*dst
,
520 const struct brw_reg
*arg0
,
521 const struct brw_reg
*arg1
)
523 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_GE
, arg0
, arg1
);
526 static void emit_seq( struct brw_compile
*p
,
527 const struct brw_reg
*dst
,
529 const struct brw_reg
*arg0
,
530 const struct brw_reg
*arg1
)
532 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_EQ
, arg0
, arg1
);
535 static void emit_sne( struct brw_compile
*p
,
536 const struct brw_reg
*dst
,
538 const struct brw_reg
*arg0
,
539 const struct brw_reg
*arg1
)
541 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_NEQ
, arg0
, arg1
);
544 static void emit_cmp( struct brw_compile
*p
,
545 const struct brw_reg
*dst
,
547 const struct brw_reg
*arg0
,
548 const struct brw_reg
*arg1
,
549 const struct brw_reg
*arg2
)
553 for (i
= 0; i
< 4; i
++) {
555 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
556 brw_MOV(p
, dst
[i
], arg2
[i
]);
557 brw_set_saturate(p
, 0);
559 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], brw_imm_f(0));
561 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
562 brw_MOV(p
, dst
[i
], arg1
[i
]);
563 brw_set_saturate(p
, 0);
564 brw_set_predicate_control_flag_value(p
, 0xff);
569 static void emit_max( struct brw_compile
*p
,
570 const struct brw_reg
*dst
,
572 const struct brw_reg
*arg0
,
573 const struct brw_reg
*arg1
)
577 for (i
= 0; i
< 4; i
++) {
579 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
580 brw_MOV(p
, dst
[i
], arg0
[i
]);
581 brw_set_saturate(p
, 0);
583 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], arg1
[i
]);
585 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
586 brw_MOV(p
, dst
[i
], arg1
[i
]);
587 brw_set_saturate(p
, 0);
588 brw_set_predicate_control_flag_value(p
, 0xff);
593 static void emit_min( struct brw_compile
*p
,
594 const struct brw_reg
*dst
,
596 const struct brw_reg
*arg0
,
597 const struct brw_reg
*arg1
)
601 for (i
= 0; i
< 4; i
++) {
603 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
604 brw_MOV(p
, dst
[i
], arg1
[i
]);
605 brw_set_saturate(p
, 0);
607 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], arg1
[i
]);
609 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
610 brw_MOV(p
, dst
[i
], arg0
[i
]);
611 brw_set_saturate(p
, 0);
612 brw_set_predicate_control_flag_value(p
, 0xff);
618 static void emit_dp3( struct brw_compile
*p
,
619 const struct brw_reg
*dst
,
621 const struct brw_reg
*arg0
,
622 const struct brw_reg
*arg1
)
624 int dst_chan
= ffs(mask
& BRW_WRITEMASK_XYZW
) - 1;
626 if (!(mask
& BRW_WRITEMASK_XYZW
))
627 return; /* Do not emit dead code */
629 assert(util_is_power_of_two(mask
& BRW_WRITEMASK_XYZW
));
631 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
632 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
634 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
635 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
636 brw_set_saturate(p
, 0);
640 static void emit_dp4( struct brw_compile
*p
,
641 const struct brw_reg
*dst
,
643 const struct brw_reg
*arg0
,
644 const struct brw_reg
*arg1
)
646 int dst_chan
= ffs(mask
& BRW_WRITEMASK_XYZW
) - 1;
648 if (!(mask
& BRW_WRITEMASK_XYZW
))
649 return; /* Do not emit dead code */
651 assert(util_is_power_of_two(mask
& BRW_WRITEMASK_XYZW
));
653 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
654 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
655 brw_MAC(p
, brw_null_reg(), arg0
[2], arg1
[2]);
657 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
658 brw_MAC(p
, dst
[dst_chan
], arg0
[3], arg1
[3]);
659 brw_set_saturate(p
, 0);
663 static void emit_dph( struct brw_compile
*p
,
664 const struct brw_reg
*dst
,
666 const struct brw_reg
*arg0
,
667 const struct brw_reg
*arg1
)
669 const int dst_chan
= ffs(mask
& BRW_WRITEMASK_XYZW
) - 1;
671 if (!(mask
& BRW_WRITEMASK_XYZW
))
672 return; /* Do not emit dead code */
674 assert(util_is_power_of_two(mask
& BRW_WRITEMASK_XYZW
));
676 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
677 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
678 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
680 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
681 brw_ADD(p
, dst
[dst_chan
], dst
[dst_chan
], arg1
[3]);
682 brw_set_saturate(p
, 0);
686 static void emit_xpd( struct brw_compile
*p
,
687 const struct brw_reg
*dst
,
689 const struct brw_reg
*arg0
,
690 const struct brw_reg
*arg1
)
694 assert(!(mask
& BRW_WRITEMASK_W
) == BRW_WRITEMASK_W
);
696 for (i
= 0 ; i
< 3; i
++) {
701 brw_MUL(p
, brw_null_reg(), negate(arg0
[i2
]), arg1
[i1
]);
703 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
704 brw_MAC(p
, dst
[i
], arg0
[i1
], arg1
[i2
]);
705 brw_set_saturate(p
, 0);
711 static void emit_math1( struct brw_compile
*p
,
713 const struct brw_reg
*dst
,
715 const struct brw_reg
*arg0
)
717 int dst_chan
= ffs(mask
& BRW_WRITEMASK_XYZW
) - 1;
719 if (!(mask
& BRW_WRITEMASK_XYZW
))
720 return; /* Do not emit dead code */
722 assert(util_is_power_of_two(mask
& BRW_WRITEMASK_XYZW
));
724 brw_MOV(p
, brw_message_reg(2), arg0
[0]);
726 /* Send two messages to perform all 16 operations:
731 (mask
& SATURATE
) ? BRW_MATH_SATURATE_SATURATE
: BRW_MATH_SATURATE_NONE
,
734 BRW_MATH_PRECISION_FULL
);
738 static void emit_math2( struct brw_compile
*p
,
740 const struct brw_reg
*dst
,
742 const struct brw_reg
*arg0
,
743 const struct brw_reg
*arg1
)
745 int dst_chan
= ffs(mask
& BRW_WRITEMASK_XYZW
) - 1;
747 if (!(mask
& BRW_WRITEMASK_XYZW
))
748 return; /* Do not emit dead code */
750 assert(util_is_power_of_two(mask
& BRW_WRITEMASK_XYZW
));
752 brw_push_insn_state(p
);
754 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
755 brw_MOV(p
, brw_message_reg(2), arg0
[0]);
756 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
757 brw_MOV(p
, brw_message_reg(4), sechalf(arg0
[0]));
759 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
760 brw_MOV(p
, brw_message_reg(3), arg1
[0]);
761 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
762 brw_MOV(p
, brw_message_reg(5), sechalf(arg1
[0]));
765 /* Send two messages to perform all 16 operations:
767 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
771 (mask
& SATURATE
) ? BRW_MATH_SATURATE_SATURATE
: BRW_MATH_SATURATE_NONE
,
774 BRW_MATH_DATA_VECTOR
,
775 BRW_MATH_PRECISION_FULL
);
777 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
779 offset(dst
[dst_chan
],1),
781 (mask
& SATURATE
) ? BRW_MATH_SATURATE_SATURATE
: BRW_MATH_SATURATE_NONE
,
784 BRW_MATH_DATA_VECTOR
,
785 BRW_MATH_PRECISION_FULL
);
787 brw_pop_insn_state(p
);
792 static void emit_tex( struct brw_wm_compile
*c
,
793 const struct brw_wm_instruction
*inst
,
796 struct brw_reg
*coord
,
799 struct brw_compile
*p
= &c
->func
;
800 GLuint msgLength
, responseLength
;
804 GLboolean shadow
= FALSE
;
806 /* How many input regs are there?
808 switch (inst
->target
) {
809 case TGSI_TEXTURE_1D
:
810 emit
= BRW_WRITEMASK_X
;
813 case TGSI_TEXTURE_SHADOW1D
:
814 emit
= BRW_WRITEMASK_XW
;
818 case TGSI_TEXTURE_2D
:
819 emit
= BRW_WRITEMASK_XY
;
822 case TGSI_TEXTURE_SHADOW2D
:
823 case TGSI_TEXTURE_SHADOWRECT
:
824 emit
= BRW_WRITEMASK_XYW
;
828 case TGSI_TEXTURE_3D
:
829 case TGSI_TEXTURE_CUBE
:
830 emit
= BRW_WRITEMASK_XYZ
;
834 /* unexpected target */
840 for (i
= 0; i
< nr
; i
++) {
841 static const GLuint swz
[4] = {0,1,2,2};
843 brw_MOV(p
, brw_message_reg(msgLength
+1), coord
[swz
[i
]]);
845 brw_MOV(p
, brw_message_reg(msgLength
+1), brw_imm_f(0));
849 responseLength
= 8; /* always */
851 if (BRW_IS_IGDNG(p
->brw
)) {
853 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG
;
855 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG
;
858 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE
;
860 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE
;
864 retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
),
866 retype(c
->payload
.depth
[0].hw_reg
, BRW_REGISTER_TYPE_UW
),
867 BTI_TEXTURE(inst
->tex_unit
),
868 sampler
, /* sampler index */
875 BRW_SAMPLER_SIMD_MODE_SIMD16
);
879 static void emit_txb( struct brw_wm_compile
*c
,
880 const struct brw_wm_instruction
*inst
,
883 struct brw_reg
*coord
,
886 struct brw_compile
*p
= &c
->func
;
889 /* Shadow ignored for txb.
891 switch (inst
->target
) {
892 case TGSI_TEXTURE_1D
:
893 case TGSI_TEXTURE_SHADOW1D
:
894 brw_MOV(p
, brw_message_reg(2), coord
[0]);
895 brw_MOV(p
, brw_message_reg(4), brw_imm_f(0));
896 brw_MOV(p
, brw_message_reg(6), brw_imm_f(0));
898 case TGSI_TEXTURE_2D
:
899 case TGSI_TEXTURE_RECT
:
900 case TGSI_TEXTURE_SHADOW2D
:
901 case TGSI_TEXTURE_SHADOWRECT
:
902 brw_MOV(p
, brw_message_reg(2), coord
[0]);
903 brw_MOV(p
, brw_message_reg(4), coord
[1]);
904 brw_MOV(p
, brw_message_reg(6), brw_imm_f(0));
906 case TGSI_TEXTURE_3D
:
907 case TGSI_TEXTURE_CUBE
:
908 brw_MOV(p
, brw_message_reg(2), coord
[0]);
909 brw_MOV(p
, brw_message_reg(4), coord
[1]);
910 brw_MOV(p
, brw_message_reg(6), coord
[2]);
913 /* unexpected target */
917 brw_MOV(p
, brw_message_reg(8), coord
[3]);
920 if (BRW_IS_IGDNG(p
->brw
))
921 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG
;
923 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
926 retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
),
928 retype(c
->payload
.depth
[0].hw_reg
, BRW_REGISTER_TYPE_UW
),
929 BTI_TEXTURE(inst
->tex_unit
),
930 sampler
, /* sampler index */
933 8, /* responseLength */
937 BRW_SAMPLER_SIMD_MODE_SIMD16
);
941 static void emit_lit( struct brw_compile
*p
,
942 const struct brw_reg
*dst
,
944 const struct brw_reg
*arg0
)
946 assert((mask
& BRW_WRITEMASK_XW
) == 0);
948 if (mask
& BRW_WRITEMASK_Y
) {
949 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
950 brw_MOV(p
, dst
[1], arg0
[0]);
951 brw_set_saturate(p
, 0);
954 if (mask
& BRW_WRITEMASK_Z
) {
955 emit_math2(p
, BRW_MATH_FUNCTION_POW
,
957 BRW_WRITEMASK_X
| (mask
& SATURATE
),
962 /* Ordinarily you'd use an iff statement to skip or shortcircuit
963 * some of the POW calculations above, but 16-wide iff statements
964 * seem to lock c1 hardware, so this is a nasty workaround:
966 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_LE
, arg0
[0], brw_imm_f(0));
968 if (mask
& BRW_WRITEMASK_Y
)
969 brw_MOV(p
, dst
[1], brw_imm_f(0));
971 if (mask
& BRW_WRITEMASK_Z
)
972 brw_MOV(p
, dst
[2], brw_imm_f(0));
974 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
978 /* Kill pixel - set execution mask to zero for those pixels which
981 static void emit_kil( struct brw_wm_compile
*c
,
982 struct brw_reg
*arg0
)
984 struct brw_compile
*p
= &c
->func
;
985 struct brw_reg r0uw
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
988 /* XXX - usually won't need 4 compares!
990 for (i
= 0; i
< 4; i
++) {
991 brw_push_insn_state(p
);
992 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
[i
], brw_imm_f(0));
993 brw_set_predicate_control_flag_value(p
, 0xff);
994 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
995 brw_AND(p
, r0uw
, brw_flag_reg(), r0uw
);
996 brw_pop_insn_state(p
);
1000 /* KILLP kills the pixels that are currently executing, not based on a test
1003 static void emit_killp( struct brw_wm_compile
*c
)
1005 struct brw_compile
*p
= &c
->func
;
1006 struct brw_reg r0uw
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
1008 brw_push_insn_state(p
);
1009 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
1010 brw_NOT(p
, c
->emit_mask_reg
, brw_mask_reg(1)); /* IMASK */
1011 brw_AND(p
, r0uw
, c
->emit_mask_reg
, r0uw
);
1012 brw_pop_insn_state(p
);
1015 static void fire_fb_write( struct brw_wm_compile
*c
,
1021 struct brw_compile
*p
= &c
->func
;
1023 /* Pass through control information:
1025 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1027 brw_push_insn_state(p
);
1028 brw_set_mask_control(p
, BRW_MASK_DISABLE
); /* ? */
1029 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1031 brw_message_reg(base_reg
+ 1),
1032 brw_vec8_grf(1, 0));
1033 brw_pop_insn_state(p
);
1036 /* Send framebuffer write message: */
1037 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1039 retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
),
1041 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
),
1049 static void emit_aa( struct brw_wm_compile
*c
,
1050 struct brw_reg
*arg1
,
1053 struct brw_compile
*p
= &c
->func
;
1054 GLuint comp
= c
->key
.aa_dest_stencil_reg
/ 2;
1055 GLuint off
= c
->key
.aa_dest_stencil_reg
% 2;
1056 struct brw_reg aa
= offset(arg1
[comp
], off
);
1058 brw_push_insn_state(p
);
1059 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
); /* ?? */
1060 brw_MOV(p
, brw_message_reg(reg
), aa
);
1061 brw_pop_insn_state(p
);
1065 /* Post-fragment-program processing. Send the results to the
1067 * \param arg0 the fragment color
1068 * \param arg1 the pass-through depth value
1069 * \param arg2 the shader-computed depth value
1071 static void emit_fb_write( struct brw_wm_compile
*c
,
1072 struct brw_reg
*arg0
,
1073 struct brw_reg
*arg1
,
1074 struct brw_reg
*arg2
,
1078 struct brw_compile
*p
= &c
->func
;
1082 /* Reserve a space for AA - may not be needed:
1084 if (c
->key
.aa_dest_stencil_reg
)
1087 /* I don't really understand how this achieves the color interleave
1088 * (ie RGBARGBA) in the result: [Do the saturation here]
1091 brw_push_insn_state(p
);
1093 for (channel
= 0; channel
< 4; channel
++) {
1094 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1095 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1097 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1099 brw_message_reg(nr
+ channel
),
1102 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1104 brw_message_reg(nr
+ channel
+ 4),
1105 sechalf(arg0
[channel
]));
1108 /* skip over the regs populated above:
1112 brw_pop_insn_state(p
);
1115 if (c
->key
.source_depth_to_render_target
)
1117 if (c
->key
.computes_depth
)
1118 brw_MOV(p
, brw_message_reg(nr
), arg2
[2]);
1120 brw_MOV(p
, brw_message_reg(nr
), arg1
[1]); /* ? */
1125 if (c
->key
.dest_depth_reg
)
1127 GLuint comp
= c
->key
.dest_depth_reg
/ 2;
1128 GLuint off
= c
->key
.dest_depth_reg
% 2;
1131 brw_push_insn_state(p
);
1132 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1134 brw_MOV(p
, brw_message_reg(nr
), offset(arg1
[comp
],1));
1136 brw_MOV(p
, brw_message_reg(nr
+1), arg1
[comp
+1]);
1137 brw_pop_insn_state(p
);
1140 brw_MOV(p
, brw_message_reg(nr
), arg1
[comp
]);
1145 if (!c
->key
.runtime_check_aads_emit
) {
1146 if (c
->key
.aa_dest_stencil_reg
)
1147 emit_aa(c
, arg1
, 2);
1149 fire_fb_write(c
, 0, nr
, target
, eot
);
1152 struct brw_reg v1_null_ud
= vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
));
1153 struct brw_reg ip
= brw_ip_reg();
1154 struct brw_instruction
*jmp
;
1156 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1157 brw_set_conditionalmod(p
, BRW_CONDITIONAL_Z
);
1160 get_element_ud(brw_vec8_grf(1,0), 6),
1163 jmp
= brw_JMPI(p
, ip
, ip
, brw_imm_d(0));
1165 emit_aa(c
, arg1
, 2);
1166 fire_fb_write(c
, 0, nr
, target
, eot
);
1167 /* note - thread killed in subroutine */
1169 brw_land_fwd_jump(p
, jmp
);
1171 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1173 fire_fb_write(c
, 1, nr
-1, target
, eot
);
1179 * Move a GPR to scratch memory.
1181 static void emit_spill( struct brw_wm_compile
*c
,
1185 struct brw_compile
*p
= &c
->func
;
1188 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1190 brw_MOV(p
, brw_message_reg(2), reg
);
1193 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1194 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1197 retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW
),
1203 * Load a GPR from scratch memory.
1205 static void emit_unspill( struct brw_wm_compile
*c
,
1209 struct brw_compile
*p
= &c
->func
;
1211 /* Slot 0 is the undef value.
1214 brw_MOV(p
, reg
, brw_imm_f(0));
1219 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1220 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1224 retype(vec16(reg
), BRW_REGISTER_TYPE_UW
),
1230 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1231 * Args with unspill_reg != 0 will be loaded from scratch memory.
1233 static void get_argument_regs( struct brw_wm_compile
*c
,
1234 struct brw_wm_ref
*arg
[],
1235 struct brw_reg
*regs
)
1239 for (i
= 0; i
< 4; i
++) {
1241 if (arg
[i
]->unspill_reg
)
1243 brw_vec8_grf(arg
[i
]->unspill_reg
, 0),
1244 arg
[i
]->value
->spill_slot
);
1246 regs
[i
] = arg
[i
]->hw_reg
;
1249 regs
[i
] = brw_null_reg();
1256 * For values that have a spill_slot!=0, write those regs to scratch memory.
1258 static void spill_values( struct brw_wm_compile
*c
,
1259 struct brw_wm_value
*values
,
1264 for (i
= 0; i
< nr
; i
++)
1265 if (values
[i
].spill_slot
)
1266 emit_spill(c
, values
[i
].hw_reg
, values
[i
].spill_slot
);
1270 /* Emit the fragment program instructions here.
1272 void brw_wm_emit( struct brw_wm_compile
*c
)
1274 struct brw_compile
*p
= &c
->func
;
1277 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
1279 /* Check if any of the payload regs need to be spilled:
1281 spill_values(c
, c
->payload
.depth
, 4);
1282 spill_values(c
, c
->creg
, c
->nr_creg
);
1283 spill_values(c
, c
->payload
.input_interp
, PIPE_MAX_SHADER_INPUTS
);
1286 for (insn
= 0; insn
< c
->nr_insns
; insn
++) {
1288 struct brw_wm_instruction
*inst
= &c
->instruction
[insn
];
1289 struct brw_reg args
[3][4], dst
[4];
1290 GLuint i
, dst_flags
;
1292 /* Get argument regs:
1294 for (i
= 0; i
< 3; i
++)
1295 get_argument_regs(c
, inst
->src
[i
], args
[i
]);
1299 for (i
= 0; i
< 4; i
++)
1301 dst
[i
] = inst
->dst
[i
]->hw_reg
;
1303 dst
[i
] = brw_null_reg();
1307 dst_flags
= inst
->writemask
;
1309 dst_flags
|= SATURATE
;
1311 switch (inst
->opcode
) {
1312 /* Generated instructions for calculating triangle interpolants:
1315 emit_pixel_xy(p
, dst
, dst_flags
);
1319 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
1323 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
1327 emit_pixel_w(p
, dst
, dst_flags
, args
[0], args
[1]);
1331 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
1335 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1339 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
1343 emit_fb_write(c
, args
[0], args
[1], args
[2], inst
->target
, inst
->eot
);
1346 case WM_FRONTFACING
:
1347 emit_frontfacing(p
, dst
, dst_flags
);
1350 /* Straightforward arithmetic:
1352 case TGSI_OPCODE_ADD
:
1353 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
1356 case TGSI_OPCODE_FRC
:
1357 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
1360 case TGSI_OPCODE_FLR
:
1361 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
1364 case TGSI_OPCODE_DDX
:
1365 emit_ddxy(p
, dst
, dst_flags
, GL_TRUE
, args
[0]);
1368 case TGSI_OPCODE_DDY
:
1369 emit_ddxy(p
, dst
, dst_flags
, GL_FALSE
, args
[0]);
1372 case TGSI_OPCODE_DP3
:
1373 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
1376 case TGSI_OPCODE_DP4
:
1377 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
1380 case TGSI_OPCODE_DPH
:
1381 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
1384 case TGSI_OPCODE_TRUNC
:
1385 emit_trunc(p
, dst
, dst_flags
, args
[0]);
1388 case TGSI_OPCODE_LRP
:
1389 emit_lrp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1392 case TGSI_OPCODE_MAD
:
1393 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1396 case TGSI_OPCODE_MOV
:
1397 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
1400 case TGSI_OPCODE_MUL
:
1401 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
1404 case TGSI_OPCODE_XPD
:
1405 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
1408 /* Higher math functions:
1410 case TGSI_OPCODE_RCP
:
1411 emit_math1(p
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
1414 case TGSI_OPCODE_RSQ
:
1415 emit_math1(p
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
1418 case TGSI_OPCODE_SIN
:
1419 emit_math1(p
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
1422 case TGSI_OPCODE_COS
:
1423 emit_math1(p
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
1426 case TGSI_OPCODE_EX2
:
1427 emit_math1(p
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
1430 case TGSI_OPCODE_LG2
:
1431 emit_math1(p
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
1434 case TGSI_OPCODE_SCS
:
1435 /* There is an scs math function, but it would need some
1436 * fixup for 16-element execution.
1438 if (dst_flags
& BRW_WRITEMASK_X
)
1439 emit_math1(p
, BRW_MATH_FUNCTION_COS
, dst
, (dst_flags
&SATURATE
)|BRW_WRITEMASK_X
, args
[0]);
1440 if (dst_flags
& BRW_WRITEMASK_Y
)
1441 emit_math1(p
, BRW_MATH_FUNCTION_SIN
, dst
+1, (dst_flags
&SATURATE
)|BRW_WRITEMASK_X
, args
[0]);
1444 case TGSI_OPCODE_POW
:
1445 emit_math2(p
, BRW_MATH_FUNCTION_POW
, dst
, dst_flags
, args
[0], args
[1]);
1450 case TGSI_OPCODE_CMP
:
1451 emit_cmp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1454 case TGSI_OPCODE_MAX
:
1455 emit_max(p
, dst
, dst_flags
, args
[0], args
[1]);
1458 case TGSI_OPCODE_MIN
:
1459 emit_min(p
, dst
, dst_flags
, args
[0], args
[1]);
1462 case TGSI_OPCODE_SLT
:
1463 emit_slt(p
, dst
, dst_flags
, args
[0], args
[1]);
1466 case TGSI_OPCODE_SLE
:
1467 emit_sle(p
, dst
, dst_flags
, args
[0], args
[1]);
1469 case TGSI_OPCODE_SGT
:
1470 emit_sgt(p
, dst
, dst_flags
, args
[0], args
[1]);
1472 case TGSI_OPCODE_SGE
:
1473 emit_sge(p
, dst
, dst_flags
, args
[0], args
[1]);
1475 case TGSI_OPCODE_SEQ
:
1476 emit_seq(p
, dst
, dst_flags
, args
[0], args
[1]);
1478 case TGSI_OPCODE_SNE
:
1479 emit_sne(p
, dst
, dst_flags
, args
[0], args
[1]);
1482 case TGSI_OPCODE_LIT
:
1483 emit_lit(p
, dst
, dst_flags
, args
[0]);
1486 /* Texturing operations:
1488 case TGSI_OPCODE_TEX
:
1489 emit_tex(c
, inst
, dst
, dst_flags
, args
[0], inst
->sampler
);
1492 case TGSI_OPCODE_TXB
:
1493 emit_txb(c
, inst
, dst
, dst_flags
, args
[0], inst
->sampler
);
1496 case TGSI_OPCODE_KIL
:
1497 emit_kil(c
, args
[0]);
1500 case TGSI_OPCODE_KILP
:
1505 debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
1507 tgsi_get_opcode_info(inst
->opcode
)->mnemonic
);
1510 for (i
= 0; i
< 4; i
++)
1511 if (inst
->dst
[i
] && inst
->dst
[i
]->spill_slot
)
1513 inst
->dst
[i
]->hw_reg
,
1514 inst
->dst
[i
]->spill_slot
);
1517 if (BRW_DEBUG
& DEBUG_WM
) {
1518 debug_printf("wm-native:\n");
1519 brw_disasm(stderr
, p
->store
, p
->nr_insn
);