2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "brw_context.h"
38 can_do_pln(struct intel_context
*intel
, const struct brw_reg
*deltas
)
40 struct brw_context
*brw
= brw_context(&intel
->ctx
);
45 if (deltas
[1].nr
!= deltas
[0].nr
+ 1)
48 if (intel
->gen
< 6 && ((deltas
[0].nr
& 1) != 0))
54 /* Return the SrcReg index of the channels that can be immediate float operands
55 * instead of usage of PROGRAM_CONSTANT values through push/pull.
58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
60 int opcode_array
[] = {
80 /* These opcodes get broken down in a way that allow two
81 * args to be immediates.
83 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
84 if (arg
== 1 || arg
== 2)
88 if (opcode
> ARRAY_SIZE(opcode_array
))
91 return arg
== opcode_array
[opcode
] - 1;
95 * Computes the screen-space x,y position of the pixels.
97 * This will be used by emit_delta_xy() or emit_wpos_xy() for
98 * interpolation of attributes..
102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
103 * corresponding to each of the 16 execution channels.
105 * R1.0 -- triangle vertex 0.X
106 * R1.1 -- triangle vertex 0.Y
107 * R1.2 -- tile 0 x,y coords (2 packed uwords)
108 * R1.3 -- tile 1 x,y coords (2 packed uwords)
109 * R1.4 -- tile 2 x,y coords (2 packed uwords)
110 * R1.5 -- tile 3 x,y coords (2 packed uwords)
115 void emit_pixel_xy(struct brw_wm_compile
*c
,
116 const struct brw_reg
*dst
,
119 struct brw_compile
*p
= &c
->func
;
120 struct brw_reg r1
= brw_vec1_grf(1, 0);
121 struct brw_reg r1_uw
= retype(r1
, BRW_REGISTER_TYPE_UW
);
122 struct brw_reg dst0_uw
, dst1_uw
;
124 brw_push_insn_state(p
);
125 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
127 if (c
->dispatch_width
== 16) {
128 dst0_uw
= vec16(retype(dst
[0], BRW_REGISTER_TYPE_UW
));
129 dst1_uw
= vec16(retype(dst
[1], BRW_REGISTER_TYPE_UW
));
131 dst0_uw
= vec8(retype(dst
[0], BRW_REGISTER_TYPE_UW
));
132 dst1_uw
= vec8(retype(dst
[1], BRW_REGISTER_TYPE_UW
));
135 /* Calculate pixel centers by adding 1 or 0 to each of the
136 * micro-tile coordinates passed in r1.
138 if (mask
& WRITEMASK_X
) {
141 stride(suboffset(r1_uw
, 4), 2, 4, 0),
142 brw_imm_v(0x10101010));
145 if (mask
& WRITEMASK_Y
) {
148 stride(suboffset(r1_uw
,5), 2, 4, 0),
149 brw_imm_v(0x11001100));
151 brw_pop_insn_state(p
);
155 * Computes the screen-space x,y distance of the pixels from the start
158 * This will be used in linterp or pinterp with the start vertex value
159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
160 * to produce interpolated attribute values.
162 void emit_delta_xy(struct brw_compile
*p
,
163 const struct brw_reg
*dst
,
165 const struct brw_reg
*arg0
)
167 struct intel_context
*intel
= &p
->brw
->intel
;
168 struct brw_reg r1
= brw_vec1_grf(1, 0);
173 assert(mask
== WRITEMASK_XY
);
175 if (intel
->gen
>= 6) {
176 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
177 Just add them with 0.0 for dst reg.. */
178 r1
= brw_imm_v(0x00000000);
181 retype(arg0
[0], BRW_REGISTER_TYPE_UW
),
185 retype(arg0
[1], BRW_REGISTER_TYPE_UW
),
190 /* Calc delta X,Y by subtracting origin in r1 from the pixel
191 * centers produced by emit_pixel_xy().
195 retype(arg0
[0], BRW_REGISTER_TYPE_UW
),
199 retype(arg0
[1], BRW_REGISTER_TYPE_UW
),
200 negate(suboffset(r1
,1)));
204 * Computes the pixel offset from the window origin for gl_FragCoord().
206 void emit_wpos_xy(struct brw_wm_compile
*c
,
207 const struct brw_reg
*dst
,
209 const struct brw_reg
*arg0
)
211 struct brw_compile
*p
= &c
->func
;
212 struct intel_context
*intel
= &p
->brw
->intel
;
213 struct brw_reg delta_x
= retype(arg0
[0], BRW_REGISTER_TYPE_W
);
214 struct brw_reg delta_y
= retype(arg0
[1], BRW_REGISTER_TYPE_W
);
216 if (mask
& WRITEMASK_X
) {
217 if (intel
->gen
>= 6) {
218 struct brw_reg delta_x_f
= retype(delta_x
, BRW_REGISTER_TYPE_F
);
219 brw_MOV(p
, delta_x_f
, delta_x
);
223 if (c
->fp
->program
.PixelCenterInteger
) {
225 brw_MOV(p
, dst
[0], delta_x
);
228 brw_ADD(p
, dst
[0], delta_x
, brw_imm_f(0.5));
232 if (mask
& WRITEMASK_Y
) {
233 if (intel
->gen
>= 6) {
234 struct brw_reg delta_y_f
= retype(delta_y
, BRW_REGISTER_TYPE_F
);
235 brw_MOV(p
, delta_y_f
, delta_y
);
239 if (c
->fp
->program
.OriginUpperLeft
) {
240 if (c
->fp
->program
.PixelCenterInteger
) {
242 brw_MOV(p
, dst
[1], delta_y
);
244 brw_ADD(p
, dst
[1], delta_y
, brw_imm_f(0.5));
247 float center_offset
= c
->fp
->program
.PixelCenterInteger
? 0.0 : 0.5;
249 /* Y' = (height - 1) - Y + center */
250 brw_ADD(p
, dst
[1], negate(delta_y
),
251 brw_imm_f(c
->key
.drawable_height
- 1 + center_offset
));
257 void emit_pixel_w(struct brw_wm_compile
*c
,
258 const struct brw_reg
*dst
,
260 const struct brw_reg
*arg0
,
261 const struct brw_reg
*deltas
)
263 struct brw_compile
*p
= &c
->func
;
264 struct intel_context
*intel
= &p
->brw
->intel
;
266 struct brw_reg temp_dst
;
271 temp_dst
= brw_message_reg(2);
273 assert(intel
->gen
< 6);
275 /* Don't need this if all you are doing is interpolating color, for
278 if (mask
& WRITEMASK_W
) {
279 struct brw_reg interp3
= brw_vec1_grf(arg0
[0].nr
+1, 4);
281 /* Calc 1/w - just linterp wpos[3] optimized by putting the
282 * result straight into a message reg.
284 if (can_do_pln(intel
, deltas
)) {
285 brw_PLN(p
, temp_dst
, interp3
, deltas
[0]);
287 brw_LINE(p
, brw_null_reg(), interp3
, deltas
[0]);
288 brw_MAC(p
, temp_dst
, suboffset(interp3
, 1), deltas
[1]);
295 src
= brw_null_reg();
297 if (c
->dispatch_width
== 16) {
298 brw_math_16(p
, dst
[3],
299 BRW_MATH_FUNCTION_INV
,
300 BRW_MATH_SATURATE_NONE
,
302 BRW_MATH_PRECISION_FULL
);
305 BRW_MATH_FUNCTION_INV
,
306 BRW_MATH_SATURATE_NONE
,
308 BRW_MATH_DATA_VECTOR
,
309 BRW_MATH_PRECISION_FULL
);
314 void emit_linterp(struct brw_compile
*p
,
315 const struct brw_reg
*dst
,
317 const struct brw_reg
*arg0
,
318 const struct brw_reg
*deltas
)
320 struct intel_context
*intel
= &p
->brw
->intel
;
321 struct brw_reg interp
[4];
322 GLuint nr
= arg0
[0].nr
;
325 interp
[0] = brw_vec1_grf(nr
, 0);
326 interp
[1] = brw_vec1_grf(nr
, 4);
327 interp
[2] = brw_vec1_grf(nr
+1, 0);
328 interp
[3] = brw_vec1_grf(nr
+1, 4);
330 for (i
= 0; i
< 4; i
++) {
332 if (intel
->gen
>= 6) {
333 brw_PLN(p
, dst
[i
], interp
[i
], brw_vec8_grf(2, 0));
334 } else if (can_do_pln(intel
, deltas
)) {
335 brw_PLN(p
, dst
[i
], interp
[i
], deltas
[0]);
337 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
338 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
345 void emit_pinterp(struct brw_compile
*p
,
346 const struct brw_reg
*dst
,
348 const struct brw_reg
*arg0
,
349 const struct brw_reg
*deltas
,
350 const struct brw_reg
*w
)
352 struct intel_context
*intel
= &p
->brw
->intel
;
353 struct brw_reg interp
[4];
354 GLuint nr
= arg0
[0].nr
;
357 if (intel
->gen
>= 6) {
358 emit_linterp(p
, dst
, mask
, arg0
, interp
);
362 interp
[0] = brw_vec1_grf(nr
, 0);
363 interp
[1] = brw_vec1_grf(nr
, 4);
364 interp
[2] = brw_vec1_grf(nr
+1, 0);
365 interp
[3] = brw_vec1_grf(nr
+1, 4);
367 for (i
= 0; i
< 4; i
++) {
369 if (can_do_pln(intel
, deltas
)) {
370 brw_PLN(p
, dst
[i
], interp
[i
], deltas
[0]);
372 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
373 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
377 for (i
= 0; i
< 4; i
++) {
379 brw_MUL(p
, dst
[i
], dst
[i
], w
[3]);
385 void emit_cinterp(struct brw_compile
*p
,
386 const struct brw_reg
*dst
,
388 const struct brw_reg
*arg0
)
390 struct brw_reg interp
[4];
391 GLuint nr
= arg0
[0].nr
;
394 interp
[0] = brw_vec1_grf(nr
, 0);
395 interp
[1] = brw_vec1_grf(nr
, 4);
396 interp
[2] = brw_vec1_grf(nr
+1, 0);
397 interp
[3] = brw_vec1_grf(nr
+1, 4);
399 for (i
= 0; i
< 4; i
++) {
401 brw_MOV(p
, dst
[i
], suboffset(interp
[i
],3)); /* TODO: optimize away like other moves */
406 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
407 void emit_frontfacing(struct brw_compile
*p
,
408 const struct brw_reg
*dst
,
411 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
414 if (!(mask
& WRITEMASK_XYZW
))
417 for (i
= 0; i
< 4; i
++) {
419 brw_MOV(p
, dst
[i
], brw_imm_f(0.0));
423 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
426 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, r1_6ud
, brw_imm_ud(1 << 31));
427 for (i
= 0; i
< 4; i
++) {
429 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
432 brw_set_predicate_control_flag_value(p
, 0xff);
435 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
438 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
440 * and we're trying to produce:
443 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
444 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
445 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
446 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
447 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
448 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
449 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
450 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
452 * and add another set of two more subspans if in 16-pixel dispatch mode.
454 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
455 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
456 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
457 * between each other. We could probably do it like ddx and swizzle the right
458 * order later, but bail for now and just produce
459 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
461 void emit_ddxy(struct brw_compile
*p
,
462 const struct brw_reg
*dst
,
465 const struct brw_reg
*arg0
)
468 struct brw_reg src0
, src1
;
471 brw_set_saturate(p
, 1);
472 for (i
= 0; i
< 4; i
++ ) {
475 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 1,
477 BRW_VERTICAL_STRIDE_2
,
479 BRW_HORIZONTAL_STRIDE_0
,
480 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
481 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
483 BRW_VERTICAL_STRIDE_2
,
485 BRW_HORIZONTAL_STRIDE_0
,
486 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
488 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
490 BRW_VERTICAL_STRIDE_4
,
492 BRW_HORIZONTAL_STRIDE_0
,
493 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
494 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 2,
496 BRW_VERTICAL_STRIDE_4
,
498 BRW_HORIZONTAL_STRIDE_0
,
499 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
501 brw_ADD(p
, dst
[i
], src0
, negate(src1
));
505 brw_set_saturate(p
, 0);
508 void emit_alu1(struct brw_compile
*p
,
509 struct brw_instruction
*(*func
)(struct brw_compile
*,
512 const struct brw_reg
*dst
,
514 const struct brw_reg
*arg0
)
519 brw_set_saturate(p
, 1);
521 for (i
= 0; i
< 4; i
++) {
523 func(p
, dst
[i
], arg0
[i
]);
528 brw_set_saturate(p
, 0);
532 void emit_alu2(struct brw_compile
*p
,
533 struct brw_instruction
*(*func
)(struct brw_compile
*,
537 const struct brw_reg
*dst
,
539 const struct brw_reg
*arg0
,
540 const struct brw_reg
*arg1
)
545 brw_set_saturate(p
, 1);
547 for (i
= 0; i
< 4; i
++) {
549 func(p
, dst
[i
], arg0
[i
], arg1
[i
]);
554 brw_set_saturate(p
, 0);
558 void emit_mad(struct brw_compile
*p
,
559 const struct brw_reg
*dst
,
561 const struct brw_reg
*arg0
,
562 const struct brw_reg
*arg1
,
563 const struct brw_reg
*arg2
)
567 for (i
= 0; i
< 4; i
++) {
569 brw_MUL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
571 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
572 brw_ADD(p
, dst
[i
], dst
[i
], arg2
[i
]);
573 brw_set_saturate(p
, 0);
578 void emit_lrp(struct brw_compile
*p
,
579 const struct brw_reg
*dst
,
581 const struct brw_reg
*arg0
,
582 const struct brw_reg
*arg1
,
583 const struct brw_reg
*arg2
)
587 /* Uses dst as a temporary:
589 for (i
= 0; i
< 4; i
++) {
591 /* Can I use the LINE instruction for this?
593 brw_ADD(p
, dst
[i
], negate(arg0
[i
]), brw_imm_f(1.0));
594 brw_MUL(p
, brw_null_reg(), dst
[i
], arg2
[i
]);
596 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
597 brw_MAC(p
, dst
[i
], arg0
[i
], arg1
[i
]);
598 brw_set_saturate(p
, 0);
603 void emit_sop(struct brw_compile
*p
,
604 const struct brw_reg
*dst
,
607 const struct brw_reg
*arg0
,
608 const struct brw_reg
*arg1
)
612 for (i
= 0; i
< 4; i
++) {
614 brw_push_insn_state(p
);
615 brw_CMP(p
, brw_null_reg(), cond
, arg0
[i
], arg1
[i
]);
616 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
617 brw_MOV(p
, dst
[i
], brw_imm_f(0));
618 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
619 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
620 brw_pop_insn_state(p
);
625 static void emit_slt( struct brw_compile
*p
,
626 const struct brw_reg
*dst
,
628 const struct brw_reg
*arg0
,
629 const struct brw_reg
*arg1
)
631 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_L
, arg0
, arg1
);
634 static void emit_sle( struct brw_compile
*p
,
635 const struct brw_reg
*dst
,
637 const struct brw_reg
*arg0
,
638 const struct brw_reg
*arg1
)
640 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_LE
, arg0
, arg1
);
643 static void emit_sgt( struct brw_compile
*p
,
644 const struct brw_reg
*dst
,
646 const struct brw_reg
*arg0
,
647 const struct brw_reg
*arg1
)
649 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_G
, arg0
, arg1
);
652 static void emit_sge( struct brw_compile
*p
,
653 const struct brw_reg
*dst
,
655 const struct brw_reg
*arg0
,
656 const struct brw_reg
*arg1
)
658 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_GE
, arg0
, arg1
);
661 static void emit_seq( struct brw_compile
*p
,
662 const struct brw_reg
*dst
,
664 const struct brw_reg
*arg0
,
665 const struct brw_reg
*arg1
)
667 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_EQ
, arg0
, arg1
);
670 static void emit_sne( struct brw_compile
*p
,
671 const struct brw_reg
*dst
,
673 const struct brw_reg
*arg0
,
674 const struct brw_reg
*arg1
)
676 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_NEQ
, arg0
, arg1
);
679 void emit_cmp(struct brw_compile
*p
,
680 const struct brw_reg
*dst
,
682 const struct brw_reg
*arg0
,
683 const struct brw_reg
*arg1
,
684 const struct brw_reg
*arg2
)
688 for (i
= 0; i
< 4; i
++) {
690 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], brw_imm_f(0));
692 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
693 brw_SEL(p
, dst
[i
], arg1
[i
], arg2
[i
]);
694 brw_set_saturate(p
, 0);
695 brw_set_predicate_control_flag_value(p
, 0xff);
700 void emit_sign(struct brw_compile
*p
,
701 const struct brw_reg
*dst
,
703 const struct brw_reg
*arg0
)
707 for (i
= 0; i
< 4; i
++) {
709 brw_MOV(p
, dst
[i
], brw_imm_f(0.0));
711 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], brw_imm_f(0));
712 brw_MOV(p
, dst
[i
], brw_imm_f(-1.0));
713 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
715 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
[i
], brw_imm_f(0));
716 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
717 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
722 void emit_max(struct brw_compile
*p
,
723 const struct brw_reg
*dst
,
725 const struct brw_reg
*arg0
,
726 const struct brw_reg
*arg1
)
730 for (i
= 0; i
< 4; i
++) {
732 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
[i
], arg1
[i
]);
734 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
735 brw_SEL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
736 brw_set_saturate(p
, 0);
737 brw_set_predicate_control_flag_value(p
, 0xff);
742 void emit_min(struct brw_compile
*p
,
743 const struct brw_reg
*dst
,
745 const struct brw_reg
*arg0
,
746 const struct brw_reg
*arg1
)
750 for (i
= 0; i
< 4; i
++) {
752 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], arg1
[i
]);
754 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
755 brw_SEL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
756 brw_set_saturate(p
, 0);
757 brw_set_predicate_control_flag_value(p
, 0xff);
763 void emit_dp2(struct brw_compile
*p
,
764 const struct brw_reg
*dst
,
766 const struct brw_reg
*arg0
,
767 const struct brw_reg
*arg1
)
769 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
771 if (!(mask
& WRITEMASK_XYZW
))
772 return; /* Do not emit dead code */
774 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
776 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
778 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
779 brw_MAC(p
, dst
[dst_chan
], arg0
[1], arg1
[1]);
780 brw_set_saturate(p
, 0);
784 void emit_dp3(struct brw_compile
*p
,
785 const struct brw_reg
*dst
,
787 const struct brw_reg
*arg0
,
788 const struct brw_reg
*arg1
)
790 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
792 if (!(mask
& WRITEMASK_XYZW
))
793 return; /* Do not emit dead code */
795 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
797 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
798 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
800 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
801 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
802 brw_set_saturate(p
, 0);
806 void emit_dp4(struct brw_compile
*p
,
807 const struct brw_reg
*dst
,
809 const struct brw_reg
*arg0
,
810 const struct brw_reg
*arg1
)
812 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
814 if (!(mask
& WRITEMASK_XYZW
))
815 return; /* Do not emit dead code */
817 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
819 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
820 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
821 brw_MAC(p
, brw_null_reg(), arg0
[2], arg1
[2]);
823 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
824 brw_MAC(p
, dst
[dst_chan
], arg0
[3], arg1
[3]);
825 brw_set_saturate(p
, 0);
829 void emit_dph(struct brw_compile
*p
,
830 const struct brw_reg
*dst
,
832 const struct brw_reg
*arg0
,
833 const struct brw_reg
*arg1
)
835 const int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
837 if (!(mask
& WRITEMASK_XYZW
))
838 return; /* Do not emit dead code */
840 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
842 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
843 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
844 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
846 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
847 brw_ADD(p
, dst
[dst_chan
], dst
[dst_chan
], arg1
[3]);
848 brw_set_saturate(p
, 0);
852 void emit_xpd(struct brw_compile
*p
,
853 const struct brw_reg
*dst
,
855 const struct brw_reg
*arg0
,
856 const struct brw_reg
*arg1
)
860 assert((mask
& WRITEMASK_W
) != WRITEMASK_W
);
862 for (i
= 0 ; i
< 3; i
++) {
867 brw_MUL(p
, brw_null_reg(), negate(arg0
[i2
]), arg1
[i1
]);
869 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
870 brw_MAC(p
, dst
[i
], arg0
[i1
], arg1
[i2
]);
871 brw_set_saturate(p
, 0);
877 void emit_math1(struct brw_wm_compile
*c
,
879 const struct brw_reg
*dst
,
881 const struct brw_reg
*arg0
)
883 struct brw_compile
*p
= &c
->func
;
884 struct intel_context
*intel
= &p
->brw
->intel
;
885 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
886 GLuint saturate
= ((mask
& SATURATE
) ?
887 BRW_MATH_SATURATE_SATURATE
:
888 BRW_MATH_SATURATE_NONE
);
891 if (!(mask
& WRITEMASK_XYZW
))
892 return; /* Do not emit dead code */
894 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
896 if (intel
->gen
>= 6 && ((arg0
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
||
897 arg0
[0].file
!= BRW_GENERAL_REGISTER_FILE
) ||
898 arg0
[0].negate
|| arg0
[0].abs
)) {
899 /* Gen6 math requires that source and dst horizontal stride be 1,
900 * and that the argument be in the GRF.
902 * The hardware ignores source modifiers (negate and abs) on math
903 * instructions, so we also move to a temp to set those up.
906 brw_MOV(p
, src
, arg0
[0]);
911 /* Send two messages to perform all 16 operations:
913 brw_push_insn_state(p
);
914 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
921 BRW_MATH_DATA_VECTOR
,
922 BRW_MATH_PRECISION_FULL
);
924 if (c
->dispatch_width
== 16) {
925 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
927 offset(dst
[dst_chan
],1),
932 BRW_MATH_DATA_VECTOR
,
933 BRW_MATH_PRECISION_FULL
);
935 brw_pop_insn_state(p
);
939 void emit_math2(struct brw_wm_compile
*c
,
941 const struct brw_reg
*dst
,
943 const struct brw_reg
*arg0
,
944 const struct brw_reg
*arg1
)
946 struct brw_compile
*p
= &c
->func
;
947 struct intel_context
*intel
= &p
->brw
->intel
;
948 int dst_chan
= _mesa_ffs(mask
& WRITEMASK_XYZW
) - 1;
950 if (!(mask
& WRITEMASK_XYZW
))
951 return; /* Do not emit dead code */
953 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
955 brw_push_insn_state(p
);
957 /* math can only operate on up to a vec8 at a time, so in
958 * dispatch_width==16 we have to do the second half manually.
960 if (intel
->gen
>= 6) {
961 struct brw_reg src0
= arg0
[0];
962 struct brw_reg src1
= arg1
[0];
963 struct brw_reg temp_dst
= dst
[dst_chan
];
965 if (arg0
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
) {
966 brw_MOV(p
, temp_dst
, src0
);
970 if (arg1
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
) {
971 /* This is a heinous hack to get a temporary register for use
972 * in case both arg0 and arg1 are constants. Why you're
973 * doing exponentiation on constant values in the shader, we
976 * max_wm_grf is almost surely less than the maximum GRF, and
977 * gen6 doesn't care about the number of GRFs used in a
978 * shader like pre-gen6 did.
980 struct brw_reg temp
= brw_vec8_grf(c
->max_wm_grf
, 0);
981 brw_MOV(p
, temp
, src1
);
985 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
986 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
992 if (c
->dispatch_width
== 16) {
993 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1001 GLuint saturate
= ((mask
& SATURATE
) ?
1002 BRW_MATH_SATURATE_SATURATE
:
1003 BRW_MATH_SATURATE_NONE
);
1005 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1006 brw_MOV(p
, brw_message_reg(3), arg1
[0]);
1007 if (c
->dispatch_width
== 16) {
1008 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1009 brw_MOV(p
, brw_message_reg(5), sechalf(arg1
[0]));
1012 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1019 BRW_MATH_DATA_VECTOR
,
1020 BRW_MATH_PRECISION_FULL
);
1022 /* Send two messages to perform all 16 operations:
1024 if (c
->dispatch_width
== 16) {
1025 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1027 offset(dst
[dst_chan
],1),
1032 BRW_MATH_DATA_VECTOR
,
1033 BRW_MATH_PRECISION_FULL
);
1036 brw_pop_insn_state(p
);
1040 void emit_tex(struct brw_wm_compile
*c
,
1041 struct brw_reg
*dst
,
1043 struct brw_reg
*arg
,
1044 struct brw_reg depth_payload
,
1049 struct brw_compile
*p
= &c
->func
;
1050 struct intel_context
*intel
= &p
->brw
->intel
;
1051 struct brw_reg dst_retyped
;
1052 GLuint cur_mrf
= 2, response_length
;
1053 GLuint i
, nr_texcoords
;
1056 GLuint mrf_per_channel
;
1059 if (c
->dispatch_width
== 16) {
1060 mrf_per_channel
= 2;
1061 response_length
= 8;
1062 dst_retyped
= retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
);
1063 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
1065 mrf_per_channel
= 1;
1066 response_length
= 4;
1067 dst_retyped
= retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
);
1068 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
1071 /* How many input regs are there?
1074 case TEXTURE_1D_INDEX
:
1078 case TEXTURE_2D_INDEX
:
1079 case TEXTURE_1D_ARRAY_INDEX
:
1080 case TEXTURE_RECT_INDEX
:
1081 case TEXTURE_EXTERNAL_INDEX
:
1082 emit
= WRITEMASK_XY
;
1085 case TEXTURE_3D_INDEX
:
1086 case TEXTURE_2D_ARRAY_INDEX
:
1087 case TEXTURE_CUBE_INDEX
:
1088 emit
= WRITEMASK_XYZ
;
1092 /* unexpected target */
1096 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1097 if (intel
->gen
< 5 && c
->dispatch_width
== 8)
1101 if (intel
->gen
< 7) {
1102 /* For shadow comparisons, we have to supply u,v,r. */
1105 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
1106 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[2]);
1107 cur_mrf
+= mrf_per_channel
;
1111 /* Emit the texcoords. */
1112 for (i
= 0; i
< nr_texcoords
; i
++) {
1113 if (c
->key
.tex
.gl_clamp_mask
[i
] & (1 << sampler
))
1114 brw_set_saturate(p
, true);
1117 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[i
]);
1119 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1120 cur_mrf
+= mrf_per_channel
;
1122 brw_set_saturate(p
, false);
1125 /* Fill in the shadow comparison reference value. */
1126 if (shadow
&& intel
->gen
< 7) {
1127 if (intel
->gen
>= 5) {
1128 /* Fill in the cube map array index value. */
1129 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1130 cur_mrf
+= mrf_per_channel
;
1131 } else if (c
->dispatch_width
== 8) {
1132 /* Fill in the LOD bias value. */
1133 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1134 cur_mrf
+= mrf_per_channel
;
1136 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[2]);
1137 cur_mrf
+= mrf_per_channel
;
1140 if (intel
->gen
>= 5) {
1142 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
1144 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
1146 /* Note that G45 and older determines shadow compare and dispatch width
1147 * from message length for most messages.
1149 if (c
->dispatch_width
== 16 && shadow
)
1150 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE
;
1152 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE
;
1158 retype(depth_payload
, BRW_REGISTER_TYPE_UW
),
1159 SURF_INDEX_TEXTURE(sampler
),
1161 dst_flags
& WRITEMASK_XYZW
,
1167 BRW_SAMPLER_RETURN_FORMAT_FLOAT32
);
1171 void emit_txb(struct brw_wm_compile
*c
,
1172 struct brw_reg
*dst
,
1174 struct brw_reg
*arg
,
1175 struct brw_reg depth_payload
,
1179 struct brw_compile
*p
= &c
->func
;
1180 struct intel_context
*intel
= &p
->brw
->intel
;
1183 GLuint mrf_per_channel
;
1184 GLuint response_length
;
1185 struct brw_reg dst_retyped
;
1187 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1188 * samples, so we'll use the 16-wide instruction, leave the second halves
1189 * undefined, and trust the execution mask to keep the undefined pixels
1192 if (c
->dispatch_width
== 16 || intel
->gen
< 5) {
1193 if (intel
->gen
>= 5)
1194 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
1196 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
1197 mrf_per_channel
= 2;
1198 dst_retyped
= retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
);
1199 response_length
= 8;
1201 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
1202 mrf_per_channel
= 1;
1203 dst_retyped
= retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
);
1204 response_length
= 4;
1207 /* Shadow ignored for txb. */
1209 case TEXTURE_1D_INDEX
:
1210 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1211 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), brw_imm_f(0));
1212 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), brw_imm_f(0));
1214 case TEXTURE_2D_INDEX
:
1215 case TEXTURE_RECT_INDEX
:
1216 case TEXTURE_EXTERNAL_INDEX
:
1217 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1218 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), arg
[1]);
1219 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), brw_imm_f(0));
1221 case TEXTURE_3D_INDEX
:
1222 case TEXTURE_CUBE_INDEX
:
1223 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1224 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), arg
[1]);
1225 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), arg
[2]);
1228 /* unexpected target */
1232 brw_MOV(p
, brw_message_reg(2 + 3 * mrf_per_channel
), arg
[3]);
1233 msgLength
= 2 + 4 * mrf_per_channel
- 1;
1238 retype(depth_payload
, BRW_REGISTER_TYPE_UW
),
1239 SURF_INDEX_TEXTURE(sampler
),
1241 dst_flags
& WRITEMASK_XYZW
,
1246 BRW_SAMPLER_SIMD_MODE_SIMD16
,
1247 BRW_SAMPLER_RETURN_FORMAT_FLOAT32
);
1251 static void emit_lit(struct brw_wm_compile
*c
,
1252 const struct brw_reg
*dst
,
1254 const struct brw_reg
*arg0
)
1256 struct brw_compile
*p
= &c
->func
;
1258 assert((mask
& WRITEMASK_XW
) == 0);
1260 if (mask
& WRITEMASK_Y
) {
1261 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
1262 brw_MOV(p
, dst
[1], arg0
[0]);
1263 brw_set_saturate(p
, 0);
1266 if (mask
& WRITEMASK_Z
) {
1267 emit_math2(c
, BRW_MATH_FUNCTION_POW
,
1269 WRITEMASK_X
| (mask
& SATURATE
),
1274 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1275 * some of the POW calculations above, but 16-wide iff statements
1276 * seem to lock c1 hardware, so this is a nasty workaround:
1278 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_LE
, arg0
[0], brw_imm_f(0));
1280 if (mask
& WRITEMASK_Y
)
1281 brw_MOV(p
, dst
[1], brw_imm_f(0));
1283 if (mask
& WRITEMASK_Z
)
1284 brw_MOV(p
, dst
[2], brw_imm_f(0));
1286 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1290 /* Kill pixel - set execution mask to zero for those pixels which
1293 static void emit_kil( struct brw_wm_compile
*c
,
1294 struct brw_reg
*arg0
)
1296 struct brw_compile
*p
= &c
->func
;
1297 struct intel_context
*intel
= &p
->brw
->intel
;
1298 struct brw_reg pixelmask
;
1301 if (intel
->gen
>= 6)
1302 pixelmask
= retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
1304 pixelmask
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
1306 for (i
= 0; i
< 4; i
++) {
1307 /* Check if we've already done the comparison for this reg
1308 * -- common when someone does KIL TEMP.wwww.
1310 for (j
= 0; j
< i
; j
++) {
1311 if (memcmp(&arg0
[j
], &arg0
[i
], sizeof(arg0
[0])) == 0)
1317 brw_push_insn_state(p
);
1318 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
[i
], brw_imm_f(0));
1319 brw_set_predicate_control_flag_value(p
, 0xff);
1320 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1321 brw_AND(p
, pixelmask
, brw_flag_reg(), pixelmask
);
1322 brw_pop_insn_state(p
);
/* Sends the framebuffer write message for the WM program.
 *
 * From what is visible here: r1 (brw_vec8_grf(1, 0)) is handed to message
 * register base_reg + 1 inside a pushed instruction state that disables
 * mask control and compression, and the framebuffer write is then issued
 * with r0 retyped to UW as one of its operands.
 *
 * NOTE(review): several lines of this function (the remaining parameters,
 * the condition guarding the m1 copy and most arguments of the final write
 * call) are not present in this listing - confirm against the full source
 * before relying on this description.
 */
1326 static void fire_fb_write( struct brw_wm_compile
*c
,
1332 struct brw_compile
*p
= &c
->func
;
1333 struct intel_context
*intel
= &p
->brw
->intel
;
1335 /* Pass through control information:
1337 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1339 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
/* Save the instruction state so the overrides below stay local. */
1342 brw_push_insn_state(p
);
1343 brw_set_mask_control(p
, BRW_MASK_DISABLE
); /* ? */
1344 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
/* Destination m(base_reg + 1), source r1. */
1346 brw_message_reg(base_reg
+ 1),
1347 brw_vec8_grf(1, 0));
1348 brw_pop_insn_state(p
);
1351 /* Send framebuffer write message: */
1352 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
/* r0 viewed as unsigned words is passed as an operand of the write. */
1356 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
),
1365 static void emit_aa( struct brw_wm_compile
*c
,
1366 struct brw_reg
*arg1
,
1369 struct brw_compile
*p
= &c
->func
;
1370 GLuint comp
= c
->aa_dest_stencil_reg
/ 2;
1371 GLuint off
= c
->aa_dest_stencil_reg
% 2;
1372 struct brw_reg aa
= offset(arg1
[comp
], off
);
1374 brw_push_insn_state(p
);
1375 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
); /* ?? */
1376 brw_MOV(p
, brw_message_reg(reg
), aa
);
1377 brw_pop_insn_state(p
);
1381 /* Post-fragment-program processing. Send the results to the
1383 * \param arg0 the fragment color
1384 * \param arg1 the pass-through depth value
1385 * \param arg2 the shader-computed depth value
/**
 * Build the render-target write payload in the message registers and
 * issue the write through fire_fb_write().
 *
 * Steps that can be read from the visible statements, in order:
 *
 *  - Colour: inside a push/pop of the instruction state, saturation is
 *    switched on when c->key.clamp_fragment_color is set, and the four
 *    channels of arg0 are moved to message registers:
 *      gen >= 6, dispatch_width == 16:  m[nr + channel * 2]
 *      gen >= 6, other branch:          m[nr + channel]
 *      pre-gen6, SIMD16 + has_compr4:   m[nr + channel + BRW_MRF_COMPR4]
 *      remaining case:                  m[nr + channel] uncompressed,
 *        plus, for dispatch_width == 16, sechalf(arg0[channel]) to
 *        m[nr + channel + 4] with BRW_COMPRESSION_2NDHALF.
 *    Saturation is switched off again afterwards.
 *
 *  - Depth: arg2[2] is written when c->computes_depth is set and
 *    arg1[1] on the other branch (presumably both under the
 *    c->source_depth_to_render_target test; the braces are not
 *    visible).
 *
 *  - When c->dest_depth_reg is set, it is split into an arg1[] index
 *    (value / 2) and an offset (value % 2) and the selected register
 *    contents are written to the next message register(s).
 *
 *  - For gen >= 6, g0 is copied to m0 with mask control disabled
 *    because, per the in-line comment, there is no implied move from
 *    src0 to the base MRF.  `target` is also written as a UD immediate
 *    to a single message-register element (the enclosing condition is
 *    not visible).
 *
 *  - When c->runtime_check_aads_emit is not set, emit_aa(c, arg1, 2)
 *    is called if c->aa_dest_stencil_reg is set and the write is fired
 *    with base register 0.  On the other path a test involving element
 *    6 of g1 feeds a forward JMPI: one side calls emit_aa() and
 *    fire_fb_write(c, 0, nr, ...), the other fire_fb_write(c, 1, nr-1,
 *    ...), which per the in-line comment shuffles up one register to
 *    fill the hole left for AA.
 *
 * NOTE(review): the declarations of `target` and `eot`, of the locals
 * nr, channel and jmp, and a number of statements and braces are not
 * visible in the reviewed text; confirm against the complete source
 * before relying on the exact nesting described above.
 */
1387 void emit_fb_write(struct brw_wm_compile
*c
,
1388 struct brw_reg
*arg0
,
1389 struct brw_reg
*arg1
,
1390 struct brw_reg
*arg2
,
1394 struct brw_compile
*p
= &c
->func
;
1395 struct brw_context
*brw
= p
->brw
;
1396 struct intel_context
*intel
= &brw
->intel
;
1400 /* Reserve a space for AA - may not be needed:
1402 if (c
->aa_dest_stencil_reg
)
1405 /* I don't really understand how this achieves the color interleave
1406 * (ie RGBARGBA) in the result: [Do the saturation here]
1408 brw_push_insn_state(p
);
1410 if (c
->key
.clamp_fragment_color
)
1411 brw_set_saturate(p
, 1);
1413 for (channel
= 0; channel
< 4; channel
++) {
1414 if (intel
->gen
>= 6) {
1415 /* gen6 SIMD16 single source DP write looks like:
1425 if (c
->dispatch_width
== 16) {
1426 brw_MOV(p
, brw_message_reg(nr
+ channel
* 2), arg0
[channel
]);
1428 brw_MOV(p
, brw_message_reg(nr
+ channel
), arg0
[channel
]);
1430 } else if (c
->dispatch_width
== 16 && brw
->has_compr4
) {
1431 /* pre-gen6 SIMD16 single source DP write looks like:
1441 * By setting the high bit of the MRF register number, we indicate
1442 * that we want COMPR4 mode - instead of doing the usual destination
1443 * + 1 for the second half we get destination + 4.
1446 brw_message_reg(nr
+ channel
+ BRW_MRF_COMPR4
),
1449 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1450 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1451 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1453 brw_message_reg(nr
+ channel
),
1456 if (c
->dispatch_width
== 16) {
1457 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1459 brw_message_reg(nr
+ channel
+ 4),
1460 sechalf(arg0
[channel
]));
1465 brw_set_saturate(p
, 0);
1467 /* skip over the regs populated above:
1469 if (c
->dispatch_width
== 16)
1474 brw_pop_insn_state(p
);
1476 if (c
->source_depth_to_render_target
)
1478 if (c
->computes_depth
)
1479 brw_MOV(p
, brw_message_reg(nr
), arg2
[2]);
1481 brw_MOV(p
, brw_message_reg(nr
), arg1
[1]); /* ? */
1486 if (c
->dest_depth_reg
)
1488 GLuint comp
= c
->dest_depth_reg
/ 2;
1489 GLuint off
= c
->dest_depth_reg
% 2;
1492 brw_push_insn_state(p
);
1493 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1495 brw_MOV(p
, brw_message_reg(nr
), offset(arg1
[comp
],1));
1497 brw_MOV(p
, brw_message_reg(nr
+1), arg1
[comp
+1]);
1498 brw_pop_insn_state(p
);
1501 brw_MOV(p
, brw_message_reg(nr
), arg1
[comp
]);
1506 if (intel
->gen
>= 6) {
1507 /* Load the message header. There's no implied move from src0
1508 * to the base mrf on gen6.
1510 brw_push_insn_state(p
);
1511 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
1512 brw_MOV(p
, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD
),
1513 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
1514 brw_pop_insn_state(p
);
1517 brw_MOV(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
1519 2), BRW_REGISTER_TYPE_UD
),
1520 brw_imm_ud(target
));
1524 if (!c
->runtime_check_aads_emit
) {
1525 if (c
->aa_dest_stencil_reg
)
1526 emit_aa(c
, arg1
, 2);
1528 fire_fb_write(c
, 0, nr
, target
, eot
);
1531 struct brw_reg v1_null_ud
= vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
));
1532 struct brw_reg ip
= brw_ip_reg();
1535 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1536 brw_set_conditionalmod(p
, BRW_CONDITIONAL_Z
);
1539 get_element_ud(brw_vec8_grf(1,0), 6),
1542 jmp
= brw_JMPI(p
, ip
, ip
, brw_imm_w(0)) - p
->store
;
1544 emit_aa(c
, arg1
, 2);
1545 fire_fb_write(c
, 0, nr
, target
, eot
);
1546 /* note - thread killed in subroutine */
1548 brw_land_fwd_jump(p
, jmp
);
1550 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1552 fire_fb_write(c
, 1, nr
-1, target
, eot
);
1557 * Move a GPR to scratch memory.
/**
 * Write one register to a scratch slot.
 *
 * Visible steps: `reg` is moved into message register 2, then
 * brw_oword_block_write_scratch() is called with message register 1,
 * the literal 2 and `slot`.
 *
 * NOTE(review): the declarations of `reg` and `slot` are not visible in
 * the reviewed text; the visible callers pass a value's hw_reg and
 * spill_slot.
 */
1559 static void emit_spill( struct brw_wm_compile
*c
,
1563 struct brw_compile
*p
= &c
->func
;
1566 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1568 brw_MOV(p
, brw_message_reg(2), reg
);
1571 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1572 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1574 brw_oword_block_write_scratch(p
, brw_message_reg(1), 2, slot
);
1579 * Load a GPR from scratch memory.
/**
 * Fill one register from a scratch slot.
 *
 * Two statements are visible: a MOV of the float immediate 0 into
 * `reg`, placed under the in-line comment "Slot 0 is the undef value",
 * and a brw_oword_block_read() into vec16(reg) using message register
 * 1, the literal 2 and `slot`.
 *
 * NOTE(review): the declarations of `reg` and `slot` and whatever
 * selects between the two statements (or disables the first) are not
 * visible in the reviewed text; confirm against the complete source.
 */
1581 static void emit_unspill( struct brw_wm_compile
*c
,
1585 struct brw_compile
*p
= &c
->func
;
1587 /* Slot 0 is the undef value.
1590 brw_MOV(p
, reg
, brw_imm_f(0));
1595 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1596 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1599 brw_oword_block_read(p
, vec16(reg
), brw_message_reg(1), 2, slot
);
1604 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1605 * Args with unspill_reg != 0 will be loaded from scratch memory.
/**
 * Fill regs[0..3] with the hardware registers of one instruction
 * source.
 *
 * For each of the four entries: when arg[i]->unspill_reg is non-zero,
 * brw_vec8_grf(arg[i]->unspill_reg, 0) and arg[i]->value->spill_slot
 * are passed to a reload call (presumably emit_unspill(); the call line
 * itself is not visible).  regs[i] is then set to arg[i]->hw_reg.  On
 * the other branch regs[i] is set to the null register -- presumably
 * when arg[i] is NULL, but that test is not visible in the reviewed
 * text.
 *
 * \param c     WM compile state
 * \param arg   source references, indexed 0..3
 * \param regs  output array receiving four registers
 */
1607 static void get_argument_regs( struct brw_wm_compile
*c
,
1608 struct brw_wm_ref
*arg
[],
1609 struct brw_reg
*regs
)
1613 for (i
= 0; i
< 4; i
++) {
1615 if (arg
[i
]->unspill_reg
)
1617 brw_vec8_grf(arg
[i
]->unspill_reg
, 0),
1618 arg
[i
]->value
->spill_slot
);
1620 regs
[i
] = arg
[i
]->hw_reg
;
1623 regs
[i
] = brw_null_reg();
1630 * For values that have a spill_slot!=0, write those regs to scratch memory.
/**
 * Spill every value in an array that has a scratch slot assigned.
 *
 * Walks values[0] .. values[nr - 1] and calls emit_spill() with the
 * value's hw_reg and spill_slot whenever spill_slot is non-zero.
 *
 * \param c       WM compile state, forwarded to emit_spill()
 * \param values  array of values to examine
 *
 * NOTE(review): the declarations of `nr` and `i` are not visible in the
 * reviewed text; the visible callers pass 4, c->nr_creg and
 * FRAG_ATTRIB_MAX as the count.
 */
1632 static void spill_values( struct brw_wm_compile
*c
,
1633 struct brw_wm_value
*values
,
1638 for (i
= 0; i
< nr
; i
++)
1639 if (values
[i
].spill_slot
)
1640 emit_spill(c
, values
[i
].hw_reg
, values
[i
].spill_slot
);
1644 /* Emit the fragment program instructions here.
1646 void brw_wm_emit( struct brw_wm_compile
*c
)
1648 struct brw_compile
*p
= &c
->func
;
1649 struct intel_context
*intel
= &p
->brw
->intel
;
1652 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
1653 if (intel
->gen
>= 6)
1654 brw_set_acc_write_control(p
, 1);
1656 /* Check if any of the payload regs need to be spilled:
1658 spill_values(c
, c
->payload
.depth
, 4);
1659 spill_values(c
, c
->creg
, c
->nr_creg
);
1660 spill_values(c
, c
->payload
.input_interp
, FRAG_ATTRIB_MAX
);
1663 for (insn
= 0; insn
< c
->nr_insns
; insn
++) {
1665 struct brw_wm_instruction
*inst
= &c
->instruction
[insn
];
1666 struct brw_reg args
[3][4], dst
[4];
1667 GLuint i
, dst_flags
;
1669 /* Get argument regs:
1671 for (i
= 0; i
< 3; i
++)
1672 get_argument_regs(c
, inst
->src
[i
], args
[i
]);
1676 for (i
= 0; i
< 4; i
++)
1678 dst
[i
] = inst
->dst
[i
]->hw_reg
;
1680 dst
[i
] = brw_null_reg();
1684 dst_flags
= inst
->writemask
;
1686 dst_flags
|= SATURATE
;
1688 switch (inst
->opcode
) {
1689 /* Generated instructions for calculating triangle interpolants:
1692 emit_pixel_xy(c
, dst
, dst_flags
);
1696 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
1700 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
1704 emit_pixel_w(c
, dst
, dst_flags
, args
[0], args
[1]);
1708 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
1712 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1716 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
1720 emit_fb_write(c
, args
[0], args
[1], args
[2], inst
->target
, inst
->eot
);
1723 case WM_FRONTFACING
:
1724 emit_frontfacing(p
, dst
, dst_flags
);
1727 /* Straightforward arithmetic:
1730 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
1734 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
1738 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
1742 emit_ddxy(p
, dst
, dst_flags
, true, args
[0]);
1746 emit_ddxy(p
, dst
, dst_flags
, false, args
[0]);
1750 emit_dp2(p
, dst
, dst_flags
, args
[0], args
[1]);
1754 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
1758 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
1762 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
1766 for (i
= 0; i
< 4; i
++) {
1767 if (dst_flags
& (1<<i
)) {
1768 brw_RNDZ(p
, dst
[i
], args
[0][i
]);
1774 emit_lrp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1778 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1783 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
1787 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
1791 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
1794 /* Higher math functions:
1797 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
1801 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
1805 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
1809 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
1813 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
1817 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
1821 /* There is an scs math function, but it would need some
1822 * fixup for 16-element execution.
1824 if (dst_flags
& WRITEMASK_X
)
1825 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, (dst_flags
&SATURATE
)|WRITEMASK_X
, args
[0]);
1826 if (dst_flags
& WRITEMASK_Y
)
1827 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
+1, (dst_flags
&SATURATE
)|WRITEMASK_X
, args
[0]);
1831 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, dst_flags
, args
[0], args
[1]);
1837 emit_cmp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1841 emit_max(p
, dst
, dst_flags
, args
[0], args
[1]);
1845 emit_min(p
, dst
, dst_flags
, args
[0], args
[1]);
1849 emit_slt(p
, dst
, dst_flags
, args
[0], args
[1]);
1853 emit_sle(p
, dst
, dst_flags
, args
[0], args
[1]);
1856 emit_sgt(p
, dst
, dst_flags
, args
[0], args
[1]);
1859 emit_sge(p
, dst
, dst_flags
, args
[0], args
[1]);
1862 emit_seq(p
, dst
, dst_flags
, args
[0], args
[1]);
1865 emit_sne(p
, dst
, dst_flags
, args
[0], args
[1]);
1869 emit_sign(p
, dst
, dst_flags
, args
[0]);
1873 emit_lit(c
, dst
, dst_flags
, args
[0]);
1876 /* Texturing operations:
1879 emit_tex(c
, dst
, dst_flags
, args
[0], c
->payload
.depth
[0].hw_reg
,
1880 inst
->tex_idx
, inst
->tex_unit
,
1885 emit_txb(c
, dst
, dst_flags
, args
[0], c
->payload
.depth
[0].hw_reg
,
1886 inst
->tex_idx
, inst
->tex_unit
);
1890 emit_kil(c
, args
[0]);
1894 printf("Unsupported opcode %i (%s) in fragment shader\n",
1895 inst
->opcode
, inst
->opcode
< MAX_OPCODE
?
1896 _mesa_opcode_string(inst
->opcode
) :
1900 for (i
= 0; i
< 4; i
++)
1901 if (inst
->dst
[i
] && inst
->dst
[i
]->spill_slot
)
1903 inst
->dst
[i
]->hw_reg
,
1904 inst
->dst
[i
]->spill_slot
);
1907 /* Only properly tested on ILK */
1908 if (p
->brw
->intel
.gen
== 5) {
1909 brw_remove_duplicate_mrf_moves(p
);
1910 if (c
->dispatch_width
== 16)
1911 brw_remove_grf_to_mrf_moves(p
);
1914 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1917 printf("wm-native:\n");
1918 for (i
= 0; i
< p
->nr_insn
; i
++)
1919 brw_disasm(stdout
, &p
->store
[i
], p
->brw
->intel
.gen
);