2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "brw_context.h"
// can_do_pln: tests whether the delta registers are laid out so that a PLN
// instruction may be used (callers in this file fall back to LINE + MAC otherwise).
// Visible conditions: deltas[1] must be the register directly after deltas[0],
// and on gen < 6 the register number of deltas[0] must be even.
// NOTE(review): the statements guarded by these tests, the final return and any
// use of `brw` are not visible in this view; presumably each failed test
// returns false -- confirm against the full file.
38 can_do_pln(struct intel_context
*intel
, const struct brw_reg
*deltas
)
40 struct brw_context
*brw
= brw_context(&intel
->ctx
);
// The two delta registers must be consecutive.
45 if (deltas
[1].nr
!= deltas
[0].nr
+ 1)
// Before gen6 the first delta register number must be even.
48 if (intel
->gen
< 6 && ((deltas
[0].nr
& 1) != 0))
54 /* Return the SrcReg index of the channels that can be immediate float operands
55 * instead of usage of PROGRAM_CONSTANT values through push/pull.
// brw_wm_arg_can_be_immediate: reports whether source argument `arg` of `opcode`
// may be an immediate float operand.
// The result of the general path is `arg == opcode_array[opcode] - 1`, i.e. the
// table appears to hold a 1-based argument index per opcode.
// NOTE(review): the table initializer is not visible in this view.
58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
60 int opcode_array
[] = {
80 /* These opcodes get broken down in a way that allow two
81 * args to be immediates.
83 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
84 if (arg
== 1 || arg
== 2)
// NOTE(review): this bound uses '>' so opcode == ARRAY_SIZE(opcode_array) still
// reaches opcode_array[opcode] below, one element past the end; '>=' looks
// intended -- confirm (the guarded statement is not visible here).
88 if (opcode
> ARRAY_SIZE(opcode_array
))
91 return arg
== opcode_array
[opcode
] - 1;
95 * Computes the screen-space x,y position of the pixels.
97 * This will be used by emit_delta_xy() or emit_wpos_xy() for
98 * interpolation of attributes.
102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
103 * corresponding to each of the 16 execution channels.
105 * R1.0 -- triangle vertex 0.X
106 * R1.1 -- triangle vertex 0.Y
107 * R1.2 -- tile 0 x,y coords (2 packed uwords)
108 * R1.3 -- tile 1 x,y coords (2 packed uwords)
109 * R1.4 -- tile 2 x,y coords (2 packed uwords)
110 * R1.5 -- tile 3 x,y coords (2 packed uwords)
// emit_pixel_xy: produces per-pixel X (dst[0]) and Y (dst[1]) values as unsigned
// words from the tile coordinates held in r1.
// The destinations are retyped to UW and widened to vec16 or vec8 depending on
// c->dispatch_width; compression is switched off between push/pop of the
// instruction state.
// NOTE(review): the `mask` parameter and the instruction heads that consume the
// operands below are not visible in this view (presumably brw_ADD, per the
// existing comment about adding 1 or 0) -- confirm against the full file.
115 void emit_pixel_xy(struct brw_wm_compile
*c
,
116 const struct brw_reg
*dst
,
119 struct brw_compile
*p
= &c
->func
;
120 struct brw_reg r1
= brw_vec1_grf(1, 0);
121 struct brw_reg r1_uw
= retype(r1
, BRW_REGISTER_TYPE_UW
);
122 struct brw_reg dst0_uw
, dst1_uw
;
124 brw_push_insn_state(p
);
125 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
// Pick the destination width that matches the dispatch width.
127 if (c
->dispatch_width
== 16) {
128 dst0_uw
= vec16(retype(dst
[0], BRW_REGISTER_TYPE_UW
));
129 dst1_uw
= vec16(retype(dst
[1], BRW_REGISTER_TYPE_UW
));
131 dst0_uw
= vec8(retype(dst
[0], BRW_REGISTER_TYPE_UW
));
132 dst1_uw
= vec8(retype(dst
[1], BRW_REGISTER_TYPE_UW
));
135 /* Calculate pixel centers by adding 1 or 0 to each of the
136 * micro-tile coordinates passed in r1.
// X: UW sub-register 4 of r1 combined with the packed immediate 0x10101010.
138 if (mask
& WRITEMASK_X
) {
141 stride(suboffset(r1_uw
, 4), 2, 4, 0),
142 brw_imm_v(0x10101010));
// Y: UW sub-register 5 of r1 combined with the packed immediate 0x11001100.
145 if (mask
& WRITEMASK_Y
) {
148 stride(suboffset(r1_uw
,5), 2, 4, 0),
149 brw_imm_v(0x11001100));
151 brw_pop_insn_state(p
);
155 * Computes the screen-space x,y distance of the pixels from the start
158 * This will be used in linterp or pinterp with the start vertex value
159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
160 * to produce interpolated attribute values.
// emit_delta_xy: derives the X/Y deltas from the pixel positions in arg0[0] and
// arg0[1] (read as UW) relative to the origin in r1.
// Requires mask == WRITEMASK_XY (asserted).
// On gen >= 6 r1 is replaced by a zero vector immediate, so the visible operands
// add zero; on older parts the Y operand is negate(suboffset(r1, 1)).
// NOTE(review): the `mask` parameter and the instruction heads using these
// operands are not visible in this view.
162 void emit_delta_xy(struct brw_compile
*p
,
163 const struct brw_reg
*dst
,
165 const struct brw_reg
*arg0
)
167 struct intel_context
*intel
= &p
->brw
->intel
;
168 struct brw_reg r1
= brw_vec1_grf(1, 0);
173 assert(mask
== WRITEMASK_XY
);
175 if (intel
->gen
>= 6) {
176 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
177 Just add them with 0.0 for dst reg.. */
178 r1
= brw_imm_v(0x00000000);
181 retype(arg0
[0], BRW_REGISTER_TYPE_UW
),
185 retype(arg0
[1], BRW_REGISTER_TYPE_UW
),
190 /* Calc delta X,Y by subtracting origin in r1 from the pixel
191 * centers produced by emit_pixel_xy().
195 retype(arg0
[0], BRW_REGISTER_TYPE_UW
),
199 retype(arg0
[1], BRW_REGISTER_TYPE_UW
),
200 negate(suboffset(r1
,1)));
204 * Computes the pixel offset from the window origin for gl_FragCoord().
// emit_wpos_xy: writes the window-space X (dst[0]) and Y (dst[1]) position.
// arg0[0] and arg0[1] are read as signed words.
// X: MOV when the program uses integer pixel centers, otherwise ADD 0.5.
// Y: same as X when the origin is upper-left; otherwise the value is flipped
// using drawable_height - 1 plus the center offset (0.0 or 0.5).
// On gen >= 6 each delta is first converted with a MOV into the same register
// retyped as float.
// NOTE(review): the `mask` parameter, the else keywords and whatever follows the
// gen >= 6 MOVs are not visible in this view.
206 void emit_wpos_xy(struct brw_wm_compile
*c
,
207 const struct brw_reg
*dst
,
209 const struct brw_reg
*arg0
)
211 struct brw_compile
*p
= &c
->func
;
212 struct intel_context
*intel
= &p
->brw
->intel
;
213 struct brw_reg delta_x
= retype(arg0
[0], BRW_REGISTER_TYPE_W
);
214 struct brw_reg delta_y
= retype(arg0
[1], BRW_REGISTER_TYPE_W
);
216 if (mask
& WRITEMASK_X
) {
217 if (intel
->gen
>= 6) {
218 struct brw_reg delta_x_f
= retype(delta_x
, BRW_REGISTER_TYPE_F
);
219 brw_MOV(p
, delta_x_f
, delta_x
);
223 if (c
->fp
->program
.PixelCenterInteger
) {
225 brw_MOV(p
, dst
[0], delta_x
);
228 brw_ADD(p
, dst
[0], delta_x
, brw_imm_f(0.5));
232 if (mask
& WRITEMASK_Y
) {
233 if (intel
->gen
>= 6) {
234 struct brw_reg delta_y_f
= retype(delta_y
, BRW_REGISTER_TYPE_F
);
235 brw_MOV(p
, delta_y_f
, delta_y
);
239 if (c
->fp
->program
.OriginUpperLeft
) {
240 if (c
->fp
->program
.PixelCenterInteger
) {
242 brw_MOV(p
, dst
[1], delta_y
);
244 brw_ADD(p
, dst
[1], delta_y
, brw_imm_f(0.5));
247 float center_offset
= c
->fp
->program
.PixelCenterInteger
? 0.0 : 0.5;
249 /* Y' = (height - 1) - Y + center */
250 brw_ADD(p
, dst
[1], negate(delta_y
),
251 brw_imm_f(c
->key
.drawable_height
- 1 + center_offset
));
// emit_pixel_w: when mask includes W, interpolates 1/w into message register 2
// and then writes its reciprocal (BRW_MATH_FUNCTION_INV) to dst[3].
// Only valid before gen6 (asserted).
// The coefficients come from GRF arg0[0].nr + 1, sub-register 4; interpolation is
// one PLN when can_do_pln() allows it, otherwise LINE followed by MAC.
// NOTE(review): the `mask` parameter, the declaration of `src` and several
// arguments of the math calls are not visible in this view.
257 void emit_pixel_w(struct brw_wm_compile
*c
,
258 const struct brw_reg
*dst
,
260 const struct brw_reg
*arg0
,
261 const struct brw_reg
*deltas
)
263 struct brw_compile
*p
= &c
->func
;
264 struct intel_context
*intel
= &p
->brw
->intel
;
266 struct brw_reg temp_dst
;
271 temp_dst
= brw_message_reg(2);
273 assert(intel
->gen
< 6);
275 /* Don't need this if all you are doing is interpolating color, for
278 if (mask
& WRITEMASK_W
) {
279 struct brw_reg interp3
= brw_vec1_grf(arg0
[0].nr
+1, 4);
281 /* Calc 1/w - just linterp wpos[3] optimized by putting the
282 * result straight into a message reg.
284 if (can_do_pln(intel
, deltas
)) {
285 brw_PLN(p
, temp_dst
, interp3
, deltas
[0]);
287 brw_LINE(p
, brw_null_reg(), interp3
, deltas
[0]);
288 brw_MAC(p
, temp_dst
, suboffset(interp3
, 1), deltas
[1]);
295 src
= brw_null_reg();
// 16-wide dispatch uses brw_math_16; the other call head is not visible.
297 if (c
->dispatch_width
== 16) {
298 brw_math_16(p
, dst
[3],
299 BRW_MATH_FUNCTION_INV
,
301 BRW_MATH_PRECISION_FULL
);
304 BRW_MATH_FUNCTION_INV
,
306 BRW_MATH_DATA_VECTOR
,
307 BRW_MATH_PRECISION_FULL
);
// emit_linterp: linear interpolation of up to four channels into dst[0..3].
// The per-channel coefficient registers are GRF nr / nr + 1 (nr = arg0[0].nr) at
// sub-registers 0 and 4.
// gen >= 6: PLN against brw_vec8_grf(2, 0); the `deltas` argument is not read on
// this path in the visible code.
// Older parts: PLN with deltas[0] when can_do_pln() allows, else LINE + MAC.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
312 void emit_linterp(struct brw_compile
*p
,
313 const struct brw_reg
*dst
,
315 const struct brw_reg
*arg0
,
316 const struct brw_reg
*deltas
)
318 struct intel_context
*intel
= &p
->brw
->intel
;
319 struct brw_reg interp
[4];
320 GLuint nr
= arg0
[0].nr
;
323 interp
[0] = brw_vec1_grf(nr
, 0);
324 interp
[1] = brw_vec1_grf(nr
, 4);
325 interp
[2] = brw_vec1_grf(nr
+1, 0);
326 interp
[3] = brw_vec1_grf(nr
+1, 4);
328 for (i
= 0; i
< 4; i
++) {
330 if (intel
->gen
>= 6) {
331 brw_PLN(p
, dst
[i
], interp
[i
], brw_vec8_grf(2, 0));
332 } else if (can_do_pln(intel
, deltas
)) {
333 brw_PLN(p
, dst
[i
], interp
[i
], deltas
[0]);
335 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
336 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
// emit_pinterp: perspective-correct interpolation of up to four channels.
// gen >= 6: delegates to emit_linterp().
// Older parts: interpolates each channel like emit_linterp() (PLN, or LINE + MAC)
// and then multiplies every dst[i] by w[3] in a second loop.
// NOTE(review): the `mask` parameter, the declaration of `i`, any per-channel
// mask tests and whatever follows the emit_linterp() call (presumably a return)
// are not visible in this view.
343 void emit_pinterp(struct brw_compile
*p
,
344 const struct brw_reg
*dst
,
346 const struct brw_reg
*arg0
,
347 const struct brw_reg
*deltas
,
348 const struct brw_reg
*w
)
350 struct intel_context
*intel
= &p
->brw
->intel
;
351 struct brw_reg interp
[4];
352 GLuint nr
= arg0
[0].nr
;
// NOTE(review): the last argument here is the local `interp` array, which has not
// been filled in yet, where emit_linterp() expects `deltas`. The visible gen >= 6
// path of emit_linterp() never reads that argument, so this looks harmless, but
// passing `deltas` would be clearer -- confirm.
355 if (intel
->gen
>= 6) {
356 emit_linterp(p
, dst
, mask
, arg0
, interp
);
360 interp
[0] = brw_vec1_grf(nr
, 0);
361 interp
[1] = brw_vec1_grf(nr
, 4);
362 interp
[2] = brw_vec1_grf(nr
+1, 0);
363 interp
[3] = brw_vec1_grf(nr
+1, 4);
365 for (i
= 0; i
< 4; i
++) {
367 if (can_do_pln(intel
, deltas
)) {
368 brw_PLN(p
, dst
[i
], interp
[i
], deltas
[0]);
370 brw_LINE(p
, brw_null_reg(), interp
[i
], deltas
[0]);
371 brw_MAC(p
, dst
[i
], suboffset(interp
[i
],1), deltas
[1]);
// Second pass: scale the interpolated values by w[3].
375 for (i
= 0; i
< 4; i
++) {
377 brw_MUL(p
, dst
[i
], dst
[i
], w
[3]);
// emit_cinterp: constant (flat) interpolation -- copies element 3 of each
// channel's coefficient group (suboffset(interp[i], 3)) into dst[i] with a MOV.
// The coefficient registers are GRF nr / nr + 1 (nr = arg0[0].nr) at
// sub-registers 0 and 4.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
383 void emit_cinterp(struct brw_compile
*p
,
384 const struct brw_reg
*dst
,
386 const struct brw_reg
*arg0
)
388 struct brw_reg interp
[4];
389 GLuint nr
= arg0
[0].nr
;
392 interp
[0] = brw_vec1_grf(nr
, 0);
393 interp
[1] = brw_vec1_grf(nr
, 4);
394 interp
[2] = brw_vec1_grf(nr
+1, 0);
395 interp
[3] = brw_vec1_grf(nr
+1, 4);
397 for (i
= 0; i
< 4; i
++) {
399 brw_MOV(p
, dst
[i
], suboffset(interp
[i
],3)); /* TODO: optimize away like other moves */
404 /* Sets the destination channels to 1.0 or 0.0 according to gl_FrontFacing. */
// emit_frontfacing: fills the destination channels with 0.0, compares r1.6 (as
// an unsigned dword) against bit 31, then writes 1.0 in a second loop and resets
// the predicate flag value to 0xff.
// Nothing is needed when mask selects none of X/Y/Z/W (the guarded statement,
// presumably a return, is not visible).
// NOTE(review): the `mask` parameter, the declaration of `i` and the per-channel
// mask tests inside both loops are not visible in this view.
405 void emit_frontfacing(struct brw_compile
*p
,
406 const struct brw_reg
*dst
,
409 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
412 if (!(mask
& WRITEMASK_XYZW
))
415 for (i
= 0; i
< 4; i
++) {
417 brw_MOV(p
, dst
[i
], brw_imm_f(0.0));
421 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
// NOTE(review): `1 << 31` shifts a signed int into its sign bit, which is
// undefined behaviour in C; `1u << 31` expresses the same constant safely.
424 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, r1_6ud
, brw_imm_ud(1 << 31));
425 for (i
= 0; i
< 4; i
++) {
427 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
430 brw_set_predicate_control_flag_value(p
, 0xff);
433 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
436 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
438 * and we're trying to produce:
441 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
442 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
443 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
444 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
445 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
446 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
447 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
448 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
450 * and add another set of two more subspans if in 16-pixel dispatch mode.
452 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
453 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
454 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
455 * between each other. We could probably do it like ddx and swizzle the right
456 * order later, but bail for now and just produce
457 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
459 * The negate_value boolean is used to negate the d/dy computation for FBOs,
460 * since they place the origin at the upper left instead of the lower left.
// emit_ddxy: emits one ADD per channel that subtracts two register regions of
// arg0[i] to form a screen-space derivative.
// Two region layouts are visible: vertical stride 2 with sub-registers 1 and 0,
// and vertical stride 4 with sub-registers 0 and 2; both use horizontal stride 0.
// The subtraction is emitted either as src1 - src0 or as src0 - src1.
// NOTE(review): the remaining parameters (mask, the DDX/DDY selector and the
// negate flag mentioned in the preceding comment), the conditions choosing
// between the layouts and subtraction orders, the condition on the saturate
// call, and the width arguments of brw_reg() are not visible in this view.
462 void emit_ddxy(struct brw_compile
*p
,
463 const struct brw_reg
*dst
,
466 const struct brw_reg
*arg0
,
470 struct brw_reg src0
, src1
;
473 brw_set_saturate(p
, 1);
474 for (i
= 0; i
< 4; i
++ ) {
477 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 1,
479 BRW_VERTICAL_STRIDE_2
,
481 BRW_HORIZONTAL_STRIDE_0
,
482 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
483 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
485 BRW_VERTICAL_STRIDE_2
,
487 BRW_HORIZONTAL_STRIDE_0
,
488 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
490 src0
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 0,
492 BRW_VERTICAL_STRIDE_4
,
494 BRW_HORIZONTAL_STRIDE_0
,
495 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
496 src1
= brw_reg(arg0
[i
].file
, arg0
[i
].nr
, 2,
498 BRW_VERTICAL_STRIDE_4
,
500 BRW_HORIZONTAL_STRIDE_0
,
501 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
504 brw_ADD(p
, dst
[i
], src1
, negate(src0
));
506 brw_ADD(p
, dst
[i
], src0
, negate(src1
));
510 brw_set_saturate(p
, 0);
// emit_alu1: applies the one-source instruction emitter `func` to each of the
// four channels: func(p, dst[i], arg0[i]).
// Saturation is switched on before the loop and off after it.
// NOTE(review): the `mask` parameter, the remaining parameter types of `func`,
// the conditions on the saturate calls and any per-channel mask test are not
// visible in this view.
513 void emit_alu1(struct brw_compile
*p
,
514 struct brw_instruction
*(*func
)(struct brw_compile
*,
517 const struct brw_reg
*dst
,
519 const struct brw_reg
*arg0
)
524 brw_set_saturate(p
, 1);
526 for (i
= 0; i
< 4; i
++) {
528 func(p
, dst
[i
], arg0
[i
]);
533 brw_set_saturate(p
, 0);
// emit_alu2: applies the two-source instruction emitter `func` to each of the
// four channels: func(p, dst[i], arg0[i], arg1[i]).
// Saturation is switched on before the loop and off after it.
// NOTE(review): the `mask` parameter, the remaining parameter types of `func`,
// the conditions on the saturate calls and any per-channel mask test are not
// visible in this view.
537 void emit_alu2(struct brw_compile
*p
,
538 struct brw_instruction
*(*func
)(struct brw_compile
*,
542 const struct brw_reg
*dst
,
544 const struct brw_reg
*arg0
,
545 const struct brw_reg
*arg1
)
550 brw_set_saturate(p
, 1);
552 for (i
= 0; i
< 4; i
++) {
554 func(p
, dst
[i
], arg0
[i
], arg1
[i
]);
559 brw_set_saturate(p
, 0);
// emit_mad: per channel, dst = arg0 * arg1 + arg2, emitted as MUL then ADD.
// Only the ADD is saturated (when mask carries SATURATE); saturation is cleared
// again straight after it.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
563 void emit_mad(struct brw_compile
*p
,
564 const struct brw_reg
*dst
,
566 const struct brw_reg
*arg0
,
567 const struct brw_reg
*arg1
,
568 const struct brw_reg
*arg2
)
572 for (i
= 0; i
< 4; i
++) {
574 brw_MUL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
576 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
577 brw_ADD(p
, dst
[i
], dst
[i
], arg2
[i
]);
578 brw_set_saturate(p
, 0);
// emit_lrp: per channel linear blend using dst as scratch:
//   ADD dst = 1.0 - arg0
//   MUL null = dst * arg2
//   MAC dst = arg0 * arg1 (saturated when mask carries SATURATE)
// NOTE(review): the MUL targets the null register, so the MAC presumably picks
// the product up from the accumulator, giving arg0*arg1 + (1-arg0)*arg2 --
// confirm against the ISA documentation.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
583 void emit_lrp(struct brw_compile
*p
,
584 const struct brw_reg
*dst
,
586 const struct brw_reg
*arg0
,
587 const struct brw_reg
*arg1
,
588 const struct brw_reg
*arg2
)
592 /* Uses dst as a temporary:
594 for (i
= 0; i
< 4; i
++) {
596 /* Can I use the LINE instruction for this?
598 brw_ADD(p
, dst
[i
], negate(arg0
[i
]), brw_imm_f(1.0));
599 brw_MUL(p
, brw_null_reg(), dst
[i
], arg2
[i
]);
601 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
602 brw_MAC(p
, dst
[i
], arg0
[i
], arg1
[i
]);
603 brw_set_saturate(p
, 0);
// emit_sop: shared body of the "set on <condition>" opcodes.
// Per channel, inside push/pop of the instruction state: CMP arg0[i] against
// arg1[i] with `cond`, an unpredicated MOV of 0 to dst[i], then a MOV of 1.0
// under normal predication, so channels that pass the comparison end up as 1.0.
// NOTE(review): the `mask` and `cond` parameters, the declaration of `i` and any
// per-channel mask test inside the loop are not visible in this view.
608 void emit_sop(struct brw_compile
*p
,
609 const struct brw_reg
*dst
,
612 const struct brw_reg
*arg0
,
613 const struct brw_reg
*arg1
)
617 for (i
= 0; i
< 4; i
++) {
619 brw_push_insn_state(p
);
620 brw_CMP(p
, brw_null_reg(), cond
, arg0
[i
], arg1
[i
]);
621 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
622 brw_MOV(p
, dst
[i
], brw_imm_f(0));
623 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
624 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
625 brw_pop_insn_state(p
);
// emit_slt: set-on-less-than; forwards to emit_sop() with BRW_CONDITIONAL_L.
// NOTE(review): the `mask` parameter is not visible in this view.
630 static void emit_slt( struct brw_compile
*p
,
631 const struct brw_reg
*dst
,
633 const struct brw_reg
*arg0
,
634 const struct brw_reg
*arg1
)
636 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_L
, arg0
, arg1
);
// emit_sle: set-on-less-or-equal; forwards to emit_sop() with BRW_CONDITIONAL_LE.
// NOTE(review): the `mask` parameter is not visible in this view.
639 static void emit_sle( struct brw_compile
*p
,
640 const struct brw_reg
*dst
,
642 const struct brw_reg
*arg0
,
643 const struct brw_reg
*arg1
)
645 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_LE
, arg0
, arg1
);
// emit_sgt: set-on-greater-than; forwards to emit_sop() with BRW_CONDITIONAL_G.
// NOTE(review): the `mask` parameter is not visible in this view.
648 static void emit_sgt( struct brw_compile
*p
,
649 const struct brw_reg
*dst
,
651 const struct brw_reg
*arg0
,
652 const struct brw_reg
*arg1
)
654 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_G
, arg0
, arg1
);
// emit_sge: set-on-greater-or-equal; forwards to emit_sop() with
// BRW_CONDITIONAL_GE.
// NOTE(review): the `mask` parameter is not visible in this view.
657 static void emit_sge( struct brw_compile
*p
,
658 const struct brw_reg
*dst
,
660 const struct brw_reg
*arg0
,
661 const struct brw_reg
*arg1
)
663 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_GE
, arg0
, arg1
);
// emit_seq: set-on-equal; forwards to emit_sop() with BRW_CONDITIONAL_EQ.
// NOTE(review): the `mask` parameter is not visible in this view.
666 static void emit_seq( struct brw_compile
*p
,
667 const struct brw_reg
*dst
,
669 const struct brw_reg
*arg0
,
670 const struct brw_reg
*arg1
)
672 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_EQ
, arg0
, arg1
);
// emit_sne: set-on-not-equal; forwards to emit_sop() with BRW_CONDITIONAL_NEQ.
// NOTE(review): the `mask` parameter is not visible in this view.
675 static void emit_sne( struct brw_compile
*p
,
676 const struct brw_reg
*dst
,
678 const struct brw_reg
*arg0
,
679 const struct brw_reg
*arg1
)
681 emit_sop(p
, dst
, mask
, BRW_CONDITIONAL_NEQ
, arg0
, arg1
);
// emit_cmp: per channel, compares arg0[i] < 0 and then emits SEL between
// arg1[i] and arg2[i]; the SEL is saturated when mask carries SATURATE.
// The predicate flag value is reset to 0xff after each channel.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
684 void emit_cmp(struct brw_compile
*p
,
685 const struct brw_reg
*dst
,
687 const struct brw_reg
*arg0
,
688 const struct brw_reg
*arg1
,
689 const struct brw_reg
*arg2
)
693 for (i
= 0; i
< 4; i
++) {
695 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], brw_imm_f(0));
697 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
698 brw_SEL(p
, dst
[i
], arg1
[i
], arg2
[i
]);
699 brw_set_saturate(p
, 0);
700 brw_set_predicate_control_flag_value(p
, 0xff);
// emit_sign: per channel, writes 0.0, then -1.0 after a "less than 0" compare
// and 1.0 after a "greater than 0" compare, clearing predication after each
// conditional MOV.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
705 void emit_sign(struct brw_compile
*p
,
706 const struct brw_reg
*dst
,
708 const struct brw_reg
*arg0
)
712 for (i
= 0; i
< 4; i
++) {
714 brw_MOV(p
, dst
[i
], brw_imm_f(0.0));
716 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], brw_imm_f(0));
717 brw_MOV(p
, dst
[i
], brw_imm_f(-1.0));
718 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
720 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
[i
], brw_imm_f(0));
721 brw_MOV(p
, dst
[i
], brw_imm_f(1.0));
722 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
// emit_max: per channel maximum -- CMP arg0[i] >= arg1[i], then SEL between
// arg0[i] and arg1[i]; the SEL is saturated when mask carries SATURATE.
// The predicate flag value is reset to 0xff after each channel.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
727 void emit_max(struct brw_compile
*p
,
728 const struct brw_reg
*dst
,
730 const struct brw_reg
*arg0
,
731 const struct brw_reg
*arg1
)
735 for (i
= 0; i
< 4; i
++) {
737 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
[i
], arg1
[i
]);
739 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
740 brw_SEL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
741 brw_set_saturate(p
, 0);
742 brw_set_predicate_control_flag_value(p
, 0xff);
// emit_min: per channel minimum -- CMP arg0[i] < arg1[i], then SEL between
// arg0[i] and arg1[i]; the SEL is saturated when mask carries SATURATE.
// The predicate flag value is reset to 0xff after each channel.
// NOTE(review): the `mask` parameter, the declaration of `i` and any per-channel
// mask test inside the loop are not visible in this view.
747 void emit_min(struct brw_compile
*p
,
748 const struct brw_reg
*dst
,
750 const struct brw_reg
*arg0
,
751 const struct brw_reg
*arg1
)
755 for (i
= 0; i
< 4; i
++) {
757 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
[i
], arg1
[i
]);
759 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
760 brw_SEL(p
, dst
[i
], arg0
[i
], arg1
[i
]);
761 brw_set_saturate(p
, 0);
762 brw_set_predicate_control_flag_value(p
, 0xff);
// emit_dp2: two-component dot product of arg0 and arg1, written to the single
// destination channel selected by mask (asserted to be exactly one of X/Y/Z/W).
// Emitted as MUL into the null register followed by a MAC into dst[dst_chan];
// only the final MAC is saturated when mask carries SATURATE.
// dst_chan is -1 when no channel is selected, but the early return below runs
// before it is used.
// NOTE(review): the `mask` parameter is not visible in this view.
768 void emit_dp2(struct brw_compile
*p
,
769 const struct brw_reg
*dst
,
771 const struct brw_reg
*arg0
,
772 const struct brw_reg
*arg1
)
774 int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
776 if (!(mask
& WRITEMASK_XYZW
))
777 return; /* Do not emit dead code */
779 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
781 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
783 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
784 brw_MAC(p
, dst
[dst_chan
], arg0
[1], arg1
[1]);
785 brw_set_saturate(p
, 0);
// emit_dp3: three-component dot product of arg0 and arg1, written to the single
// destination channel selected by mask (asserted to be exactly one of X/Y/Z/W).
// Emitted as MUL and MAC into the null register, then a final MAC into
// dst[dst_chan]; only the final MAC is saturated when mask carries SATURATE.
// NOTE(review): the `mask` parameter is not visible in this view.
789 void emit_dp3(struct brw_compile
*p
,
790 const struct brw_reg
*dst
,
792 const struct brw_reg
*arg0
,
793 const struct brw_reg
*arg1
)
795 int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
797 if (!(mask
& WRITEMASK_XYZW
))
798 return; /* Do not emit dead code */
800 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
802 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
803 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
805 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
806 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
807 brw_set_saturate(p
, 0);
// emit_dp4: four-component dot product of arg0 and arg1, written to the single
// destination channel selected by mask (asserted to be exactly one of X/Y/Z/W).
// Emitted as MUL plus two MACs into the null register, then a final MAC into
// dst[dst_chan]; only the final MAC is saturated when mask carries SATURATE.
// NOTE(review): the `mask` parameter is not visible in this view.
811 void emit_dp4(struct brw_compile
*p
,
812 const struct brw_reg
*dst
,
814 const struct brw_reg
*arg0
,
815 const struct brw_reg
*arg1
)
817 int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
819 if (!(mask
& WRITEMASK_XYZW
))
820 return; /* Do not emit dead code */
822 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
824 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
825 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
826 brw_MAC(p
, brw_null_reg(), arg0
[2], arg1
[2]);
828 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
829 brw_MAC(p
, dst
[dst_chan
], arg0
[3], arg1
[3]);
830 brw_set_saturate(p
, 0);
// emit_dph: homogeneous dot product -- the three-component dot product of arg0
// and arg1 plus arg1[3], written to the single destination channel selected by
// mask (asserted to be exactly one of X/Y/Z/W).
// The third MAC already lands in dst[dst_chan]; the closing ADD of arg1[3] is
// the only saturated instruction (when mask carries SATURATE).
// NOTE(review): the `mask` parameter is not visible in this view.
834 void emit_dph(struct brw_compile
*p
,
835 const struct brw_reg
*dst
,
837 const struct brw_reg
*arg0
,
838 const struct brw_reg
*arg1
)
840 const int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
842 if (!(mask
& WRITEMASK_XYZW
))
843 return; /* Do not emit dead code */
845 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
847 brw_MUL(p
, brw_null_reg(), arg0
[0], arg1
[0]);
848 brw_MAC(p
, brw_null_reg(), arg0
[1], arg1
[1]);
849 brw_MAC(p
, dst
[dst_chan
], arg0
[2], arg1
[2]);
851 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
852 brw_ADD(p
, dst
[dst_chan
], dst
[dst_chan
], arg1
[3]);
853 brw_set_saturate(p
, 0);
// emit_xpd: cross product over the first three channels; W must not be
// requested (asserted).
// Per channel: MUL null = -arg0[i2] * arg1[i1], then
// MAC dst[i] = arg0[i1] * arg1[i2], saturated when mask carries SATURATE.
// NOTE(review): the `mask` parameter, the declaration of `i`, the computation of
// i1 and i2 and any per-channel mask test are not visible in this view.
857 void emit_xpd(struct brw_compile
*p
,
858 const struct brw_reg
*dst
,
860 const struct brw_reg
*arg0
,
861 const struct brw_reg
*arg1
)
865 assert((mask
& WRITEMASK_W
) != WRITEMASK_W
);
867 for (i
= 0 ; i
< 3; i
++) {
872 brw_MUL(p
, brw_null_reg(), negate(arg0
[i2
]), arg1
[i1
]);
874 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
875 brw_MAC(p
, dst
[i
], arg0
[i1
], arg1
[i2
]);
876 brw_set_saturate(p
, 0);
// emit_math1: emits a one-operand math operation on arg0[0], writing the single
// destination channel selected by mask (asserted to be exactly one of X/Y/Z/W).
// On gen >= 6 the operand is first copied with a MOV into `src` when it has
// horizontal stride 0, is not in the GRF file, or carries negate/abs modifiers.
// The math operation is issued with compression disabled; for 16-wide dispatch a
// second one is issued in 2NDHALF mode targeting offset(dst[dst_chan], 1).
// NOTE(review): the `function` and `mask` parameters, the declaration and setup
// of `src`, and the heads of both math calls are not visible in this view.
882 void emit_math1(struct brw_wm_compile
*c
,
884 const struct brw_reg
*dst
,
886 const struct brw_reg
*arg0
)
888 struct brw_compile
*p
= &c
->func
;
889 struct intel_context
*intel
= &p
->brw
->intel
;
890 int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
893 if (!(mask
& WRITEMASK_XYZW
))
894 return; /* Do not emit dead code */
896 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
898 if (intel
->gen
>= 6 && ((arg0
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
||
899 arg0
[0].file
!= BRW_GENERAL_REGISTER_FILE
) ||
900 arg0
[0].negate
|| arg0
[0].abs
)) {
901 /* Gen6 math requires that source and dst horizontal stride be 1,
902 * and that the argument be in the GRF.
904 * The hardware ignores source modifiers (negate and abs) on math
905 * instructions, so we also move to a temp to set those up.
908 brw_MOV(p
, src
, arg0
[0]);
913 /* Send two messages to perform all 16 operations:
915 brw_push_insn_state(p
);
916 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
917 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
923 BRW_MATH_DATA_VECTOR
,
924 BRW_MATH_PRECISION_FULL
);
926 if (c
->dispatch_width
== 16) {
927 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
929 offset(dst
[dst_chan
],1),
933 BRW_MATH_DATA_VECTOR
,
934 BRW_MATH_PRECISION_FULL
);
936 brw_pop_insn_state(p
);
// emit_math2: emits a two-operand math operation on arg0[0] and arg1[0],
// writing the single destination channel selected by mask (asserted to be
// exactly one of X/Y/Z/W). All work happens between push/pop of the
// instruction state.
// gen >= 6: an operand with horizontal stride 0 is first copied with a MOV --
// arg0 into the destination register, arg1 into the GRF numbered
// c->max_wm_grf (see the "heinous hack" comment below).
// Older parts: arg1 is copied into message register 3 (and its second half into
// message register 5 for 16-wide dispatch) before the math message is sent.
// In both paths 16-wide dispatch issues a second operation in 2NDHALF mode.
// NOTE(review): the `function` and `mask` parameters, the else keyword
// separating the two paths, the reassignment of src0/src1 after the MOVs and
// the heads of the math calls are not visible in this view.
940 void emit_math2(struct brw_wm_compile
*c
,
942 const struct brw_reg
*dst
,
944 const struct brw_reg
*arg0
,
945 const struct brw_reg
*arg1
)
947 struct brw_compile
*p
= &c
->func
;
948 struct intel_context
*intel
= &p
->brw
->intel
;
949 int dst_chan
= ffs(mask
& WRITEMASK_XYZW
) - 1;
951 if (!(mask
& WRITEMASK_XYZW
))
952 return; /* Do not emit dead code */
954 assert(is_power_of_two(mask
& WRITEMASK_XYZW
));
956 brw_push_insn_state(p
);
958 /* math can only operate on up to a vec8 at a time, so in
959 * dispatch_width==16 we have to do the second half manually.
961 if (intel
->gen
>= 6) {
962 struct brw_reg src0
= arg0
[0];
963 struct brw_reg src1
= arg1
[0];
964 struct brw_reg temp_dst
= dst
[dst_chan
];
966 if (arg0
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
) {
967 brw_MOV(p
, temp_dst
, src0
);
971 if (arg1
[0].hstride
== BRW_HORIZONTAL_STRIDE_0
) {
972 /* This is a heinous hack to get a temporary register for use
973 * in case both arg0 and arg1 are constants. Why you're
974 * doing exponentiation on constant values in the shader, we
977 * max_wm_grf is almost surely less than the maximum GRF, and
978 * gen6 doesn't care about the number of GRFs used in a
979 * shader like pre-gen6 did.
981 struct brw_reg temp
= brw_vec8_grf(c
->max_wm_grf
, 0);
982 brw_MOV(p
, temp
, src1
);
986 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
987 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
993 if (c
->dispatch_width
== 16) {
994 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1002 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1003 brw_MOV(p
, brw_message_reg(3), arg1
[0]);
1004 if (c
->dispatch_width
== 16) {
1005 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1006 brw_MOV(p
, brw_message_reg(5), sechalf(arg1
[0]));
1009 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
1010 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1016 BRW_MATH_DATA_VECTOR
,
1017 BRW_MATH_PRECISION_FULL
);
1019 /* Send two messages to perform all 16 operations:
1021 if (c
->dispatch_width
== 16) {
1022 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1024 offset(dst
[dst_chan
],1),
1028 BRW_MATH_DATA_VECTOR
,
1029 BRW_MATH_PRECISION_FULL
);
1032 brw_pop_insn_state(p
);
// emit_tex: builds and sends a texture sample message.
// Outline of the visible steps:
//  1. Pick message-register spacing, response length, destination width and SIMD
//     mode from c->dispatch_width (2 / 8 / vec16 / SIMD16, else 1 / 4 / vec8 / SIMD8).
//  2. Choose which coordinates to emit from the texture target (XY or XYZ).
//  3. Copy the coordinates into consecutive message registers starting at
//     cur_mrf = 2, saturating a coordinate when its bit for this sampler is set
//     in c->key.tex.gl_clamp_mask[i]; a zero is written where no coordinate is
//     emitted.
//  4. For shadow lookups before gen7, append padding values and the reference
//     value arg[2]; per the Ivybridge comment the reference value goes first
//     there.
//  5. Select msg_type by generation, dispatch width and shadow, then issue the
//     sample with depth_payload retyped to UW and SURF_INDEX_TEXTURE(sampler).
// NOTE(review): the remaining parameters (dst_flags, tex_idx, sampler, shadow),
// several locals (emit, simd_mode, msg_type), the switch head, the code that
// sets nr_texcoords and the head of the sample call are not visible in this
// view; the steps above are inferred from the visible statements only.
1036 void emit_tex(struct brw_wm_compile
*c
,
1037 struct brw_reg
*dst
,
1039 struct brw_reg
*arg
,
1040 struct brw_reg depth_payload
,
1045 struct brw_compile
*p
= &c
->func
;
1046 struct intel_context
*intel
= &p
->brw
->intel
;
1047 struct brw_reg dst_retyped
;
1048 GLuint cur_mrf
= 2, response_length
;
1049 GLuint i
, nr_texcoords
;
1052 GLuint mrf_per_channel
;
1055 if (c
->dispatch_width
== 16) {
1056 mrf_per_channel
= 2;
1057 response_length
= 8;
1058 dst_retyped
= retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
);
1059 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
1061 mrf_per_channel
= 1;
1062 response_length
= 4;
1063 dst_retyped
= retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
);
1064 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
1067 /* How many input regs are there?
1070 case TEXTURE_1D_INDEX
:
1074 case TEXTURE_2D_INDEX
:
1075 case TEXTURE_1D_ARRAY_INDEX
:
1076 case TEXTURE_RECT_INDEX
:
1077 case TEXTURE_EXTERNAL_INDEX
:
1078 emit
= WRITEMASK_XY
;
1081 case TEXTURE_3D_INDEX
:
1082 case TEXTURE_2D_ARRAY_INDEX
:
1083 case TEXTURE_CUBE_INDEX
:
1084 emit
= WRITEMASK_XYZ
;
1088 /* unexpected target */
1092 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1093 if (intel
->gen
< 5 && c
->dispatch_width
== 8)
1097 if (intel
->gen
< 7) {
1098 /* For shadow comparisons, we have to supply u,v,r. */
1101 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
1102 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[2]);
1103 cur_mrf
+= mrf_per_channel
;
1107 /* Emit the texcoords. */
1108 for (i
= 0; i
< nr_texcoords
; i
++) {
1109 if (c
->key
.tex
.gl_clamp_mask
[i
] & (1 << sampler
))
1110 brw_set_saturate(p
, true);
1113 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[i
]);
1115 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1116 cur_mrf
+= mrf_per_channel
;
1118 brw_set_saturate(p
, false);
1121 /* Fill in the shadow comparison reference value. */
1122 if (shadow
&& intel
->gen
< 7) {
1123 if (intel
->gen
>= 5) {
1124 /* Fill in the cube map array index value. */
1125 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1126 cur_mrf
+= mrf_per_channel
;
1127 } else if (c
->dispatch_width
== 8) {
1128 /* Fill in the LOD bias value. */
1129 brw_MOV(p
, brw_message_reg(cur_mrf
), brw_imm_f(0));
1130 cur_mrf
+= mrf_per_channel
;
1132 brw_MOV(p
, brw_message_reg(cur_mrf
), arg
[2]);
1133 cur_mrf
+= mrf_per_channel
;
1136 if (intel
->gen
>= 5) {
1138 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
1140 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
1142 /* Note that G45 and older determines shadow compare and dispatch width
1143 * from message length for most messages.
1145 if (c
->dispatch_width
== 16 && shadow
)
1146 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE
;
1148 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE
;
1154 retype(depth_payload
, BRW_REGISTER_TYPE_UW
),
1155 SURF_INDEX_TEXTURE(sampler
),
1157 dst_flags
& WRITEMASK_XYZW
,
1163 BRW_SAMPLER_RETURN_FORMAT_FLOAT32
);
// emit_txb: builds and sends a texture sample-with-LOD-bias message.
// Message-register spacing, destination width and response length are 2 / vec16
// / 8 for 16-wide dispatch or any gen < 5 part, else 1 / vec8 / 4.
// The three coordinate slots start at message register 2 and are filled from
// arg[0..2] according to the texture target, with zeros for unused coordinates;
// arg[3] (presumably the bias) goes in the fourth slot.
// msgLength is 2 + 4 * mrf_per_channel - 1.
// NOTE(review): the remaining parameters (dst_flags, tex_idx, sampler), the
// locals msg_type and msgLength, the switch head and the head of the sample call
// are not visible in this view.
1167 void emit_txb(struct brw_wm_compile
*c
,
1168 struct brw_reg
*dst
,
1170 struct brw_reg
*arg
,
1171 struct brw_reg depth_payload
,
1175 struct brw_compile
*p
= &c
->func
;
1176 struct intel_context
*intel
= &p
->brw
->intel
;
1179 GLuint mrf_per_channel
;
1180 GLuint response_length
;
1181 struct brw_reg dst_retyped
;
1183 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1184 * samples, so we'll use the 16-wide instruction, leave the second halves
1185 * undefined, and trust the execution mask to keep the undefined pixels
1188 if (c
->dispatch_width
== 16 || intel
->gen
< 5) {
1189 if (intel
->gen
>= 5)
1190 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
1192 msg_type
= BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS
;
1193 mrf_per_channel
= 2;
1194 dst_retyped
= retype(vec16(dst
[0]), BRW_REGISTER_TYPE_UW
);
1195 response_length
= 8;
1197 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
1198 mrf_per_channel
= 1;
1199 dst_retyped
= retype(vec8(dst
[0]), BRW_REGISTER_TYPE_UW
);
1200 response_length
= 4;
1203 /* Shadow ignored for txb. */
1205 case TEXTURE_1D_INDEX
:
1206 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1207 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), brw_imm_f(0));
1208 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), brw_imm_f(0));
1210 case TEXTURE_2D_INDEX
:
1211 case TEXTURE_RECT_INDEX
:
1212 case TEXTURE_EXTERNAL_INDEX
:
1213 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1214 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), arg
[1]);
1215 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), brw_imm_f(0));
1217 case TEXTURE_3D_INDEX
:
1218 case TEXTURE_CUBE_INDEX
:
1219 brw_MOV(p
, brw_message_reg(2 + 0 * mrf_per_channel
), arg
[0]);
1220 brw_MOV(p
, brw_message_reg(2 + 1 * mrf_per_channel
), arg
[1]);
1221 brw_MOV(p
, brw_message_reg(2 + 2 * mrf_per_channel
), arg
[2]);
1224 /* unexpected target */
1228 brw_MOV(p
, brw_message_reg(2 + 3 * mrf_per_channel
), arg
[3]);
1229 msgLength
= 2 + 4 * mrf_per_channel
- 1;
1234 retype(depth_payload
, BRW_REGISTER_TYPE_UW
),
1235 SURF_INDEX_TEXTURE(sampler
),
1237 dst_flags
& WRITEMASK_XYZW
,
// NOTE(review): SIMD16 sampler mode is passed unconditionally, including on the
// path above that selected vec8 / response_length 4; emit_tex() picks the mode
// from the dispatch width instead -- confirm whether SIMD8 is intended there.
1242 BRW_SAMPLER_SIMD_MODE_SIMD16
,
1243 BRW_SAMPLER_RETURN_FORMAT_FLOAT32
);
// emit_lit: emits the Y and Z results of the LIT opcode; X and W must not be
// requested (asserted).
// Y: MOV of arg0[0] into dst[1], saturated when mask carries SATURATE.
// Z: a POW computed through emit_math2().
// Afterwards a CMP arg0[0] <= 0 precedes MOVs of 0 into dst[1] and dst[2], and
// predication is then cleared.
// NOTE(review): the `mask` parameter and the remaining emit_math2() arguments
// are not visible in this view.
1247 static void emit_lit(struct brw_wm_compile
*c
,
1248 const struct brw_reg
*dst
,
1250 const struct brw_reg
*arg0
)
1252 struct brw_compile
*p
= &c
->func
;
1254 assert((mask
& WRITEMASK_XW
) == 0);
1256 if (mask
& WRITEMASK_Y
) {
1257 brw_set_saturate(p
, (mask
& SATURATE
) ? 1 : 0);
1258 brw_MOV(p
, dst
[1], arg0
[0]);
1259 brw_set_saturate(p
, 0);
1262 if (mask
& WRITEMASK_Z
) {
1263 emit_math2(c
, BRW_MATH_FUNCTION_POW
,
1265 WRITEMASK_X
| (mask
& SATURATE
),
1270 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1271 * some of the POW calculations above, but 16-wide iff statements
1272 * seem to lock c1 hardware, so this is a nasty workaround:
1274 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_LE
, arg0
[0], brw_imm_f(0));
1276 if (mask
& WRITEMASK_Y
)
1277 brw_MOV(p
, dst
[1], brw_imm_f(0));
1279 if (mask
& WRITEMASK_Z
)
1280 brw_MOV(p
, dst
[2], brw_imm_f(0));
1282 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1286 /* Kill pixel - set execution mask to zero for those pixels which
// emit_kil: updates the pixel mask from the four components of arg0.
// The mask register is r1.7 on gen >= 6 and r0.0 otherwise, read as UW.
// For each component, an inner loop memcmp()s it against the earlier ones so
// a repeated register can be recognised. Then, with compression disabled and
// the flag value reset to 0xff, CMP arg0[i] >= 0 is emitted and the flag
// register is ANDed into the mask.
// NOTE(review): the declarations of `i` and `j` and the statements that act on a
// memcmp() match are not visible in this view.
1289 static void emit_kil( struct brw_wm_compile
*c
,
1290 struct brw_reg
*arg0
)
1292 struct brw_compile
*p
= &c
->func
;
1293 struct intel_context
*intel
= &p
->brw
->intel
;
1294 struct brw_reg pixelmask
;
1297 if (intel
->gen
>= 6)
1298 pixelmask
= retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
1300 pixelmask
= retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW
);
1302 for (i
= 0; i
< 4; i
++) {
1303 /* Check if we've already done the comparison for this reg
1304 * -- common when someone does KIL TEMP.wwww.
1306 for (j
= 0; j
< i
; j
++) {
1307 if (memcmp(&arg0
[j
], &arg0
[i
], sizeof(arg0
[0])) == 0)
1313 brw_push_insn_state(p
);
1314 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
[i
], brw_imm_f(0));
1315 brw_set_predicate_control_flag_value(p
, 0xff);
1316 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1317 brw_AND(p
, pixelmask
, brw_flag_reg(), pixelmask
);
1318 brw_pop_insn_state(p
);
// fire_fb_write: sends the framebuffer (render target) write message.
// First copies r1 into message register base_reg + 1 with the execution mask
// disabled and compression off, then selects the SIMD16 or SIMD8 single-source
// message control from c->dispatch_width and issues the write with r0 (as UW)
// as a source.
// NOTE(review): the remaining parameters (base_reg and others), the condition
// around the r1 copy (the existing comment says gen6 has already done it) and
// the heads of the MOV and write calls are not visible in this view.
1322 static void fire_fb_write( struct brw_wm_compile
*c
,
1328 struct brw_compile
*p
= &c
->func
;
1329 struct intel_context
*intel
= &p
->brw
->intel
;
1330 uint32_t msg_control
;
1332 /* Pass through control information:
1334 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1336 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1339 brw_push_insn_state(p
);
1340 brw_set_mask_control(p
, BRW_MASK_DISABLE
); /* ? */
1341 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1343 brw_message_reg(base_reg
+ 1),
1344 brw_vec8_grf(1, 0));
1345 brw_pop_insn_state(p
);
1348 if (c
->dispatch_width
== 16)
1349 msg_control
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
1351 msg_control
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
1353 /* Send framebuffer write message: */
1354 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1358 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW
),
// emit_aa: copies the antialiasing value into message register `reg`.
// The source is arg1[comp] offset by `off` registers, where comp and off are
// c->aa_dest_stencil_reg divided by 2 and its remainder modulo 2.
// The MOV is emitted with compression disabled between push/pop of the
// instruction state.
// NOTE(review): the `reg` parameter is not visible in this view.
1368 static void emit_aa( struct brw_wm_compile
*c
,
1369 struct brw_reg
*arg1
,
1372 struct brw_compile
*p
= &c
->func
;
1373 GLuint comp
= c
->aa_dest_stencil_reg
/ 2;
1374 GLuint off
= c
->aa_dest_stencil_reg
% 2;
1375 struct brw_reg aa
= offset(arg1
[comp
], off
);
1377 brw_push_insn_state(p
);
1378 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
); /* ?? */
1379 brw_MOV(p
, brw_message_reg(reg
), aa
);
1380 brw_pop_insn_state(p
);
1384 /* Post-fragment-program processing. Send the results to the
 * framebuffer; the write itself is issued through fire_fb_write().
 *
 * Outline of the visible code: the four color channels of arg0 are
 * moved into consecutive message registers starting at `nr` (with
 * saturation when c->key.clamp_fragment_color is set), optional depth
 * values are appended, a gen6+ message header is loaded from r0, and
 * fire_fb_write() is then called -- directly, or on either side of a
 * runtime JMPI test when c->runtime_check_aads_emit is set.
 *
 * \param c the WM compile state (c->func is the instruction store)
1386 * \param arg0 the fragment color
1387 * \param arg1 the pass-through depth value
1388 * \param arg2 the shader-computed depth value
 * target and eot are forwarded unchanged to fire_fb_write(); target is
 * also written as an immediate into element 2 of a message register.
 * NOTE(review): their declarations are not visible in this view.
1390 void emit_fb_write(struct brw_wm_compile
*c
,
1391 struct brw_reg
*arg0
,
1392 struct brw_reg
*arg1
,
1393 struct brw_reg
*arg2
,
1397 struct brw_compile
*p
= &c
->func
;
1398 struct brw_context
*brw
= p
->brw
;
1399 struct intel_context
*intel
= &brw
->intel
;
1403 /* Reserve a space for AA - may not be needed:
1405 if (c
->aa_dest_stencil_reg
)
1408 /* I don't really understand how this achieves the color interleave
1409 * (ie RGBARGBA) in the result: [Do the saturation here]
1411 brw_push_insn_state(p
);
1413 if (c
->key
.clamp_fragment_color
)
1414 brw_set_saturate(p
, 1);
1416 for (channel
= 0; channel
< 4; channel
++) {
1417 if (intel
->gen
>= 6) {
1418 /* gen6 SIMD16 single source DP write looks like:
1428 if (c
->dispatch_width
== 16) {
1429 brw_MOV(p
, brw_message_reg(nr
+ channel
* 2), arg0
[channel
]);
1431 brw_MOV(p
, brw_message_reg(nr
+ channel
), arg0
[channel
]);
1433 } else if (c
->dispatch_width
== 16 && brw
->has_compr4
) {
1434 /* pre-gen6 SIMD16 single source DP write looks like:
1444 * By setting the high bit of the MRF register number, we indicate
1445 * that we want COMPR4 mode - instead of doing the usual destination
1446 * + 1 for the second half we get destination + 4.
1449 brw_message_reg(nr
+ channel
+ BRW_MRF_COMPR4
),
1452 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1453 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1454 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1456 brw_message_reg(nr
+ channel
),
1459 if (c
->dispatch_width
== 16) {
1460 brw_set_compression_control(p
, BRW_COMPRESSION_2NDHALF
);
1462 brw_message_reg(nr
+ channel
+ 4),
1463 sechalf(arg0
[channel
]));
1468 brw_set_saturate(p
, 0);
1470 /* skip over the regs populated above:
1472 if (c
->dispatch_width
== 16)
1477 brw_pop_insn_state(p
);
1479 if (c
->source_depth_to_render_target
)
1481 if (c
->computes_depth
)
1482 brw_MOV(p
, brw_message_reg(nr
), arg2
[2]);
1484 brw_MOV(p
, brw_message_reg(nr
), arg1
[1]); /* ? */
/* Destination depth: c->dest_depth_reg is split into an arg1 element
 * (index / 2) and an offset (index % 2), the same split emit_aa()
 * applies to c->aa_dest_stencil_reg.
 */
1489 if (c
->dest_depth_reg
)
1491 GLuint comp
= c
->dest_depth_reg
/ 2;
1492 GLuint off
= c
->dest_depth_reg
% 2;
1495 brw_push_insn_state(p
);
1496 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1498 brw_MOV(p
, brw_message_reg(nr
), offset(arg1
[comp
],1));
1500 brw_MOV(p
, brw_message_reg(nr
+1), arg1
[comp
+1]);
1501 brw_pop_insn_state(p
);
1504 brw_MOV(p
, brw_message_reg(nr
), arg1
[comp
]);
1509 if (intel
->gen
>= 6) {
1510 /* Load the message header. There's no implied move from src0
1511 * to the base mrf on gen6.
1513 brw_push_insn_state(p
);
1514 brw_set_mask_control(p
, BRW_MASK_DISABLE
);
1515 brw_MOV(p
, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD
),
1516 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
1517 brw_pop_insn_state(p
);
1520 brw_MOV(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
1522 2), BRW_REGISTER_TYPE_UD
),
1523 brw_imm_ud(target
));
1527 if (!c
->runtime_check_aads_emit
) {
1528 if (c
->aa_dest_stencil_reg
)
1529 emit_aa(c
, arg1
, 2);
1531 fire_fb_write(c
, 0, nr
, target
, eot
);
1534 struct brw_reg v1_null_ud
= vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
));
1535 struct brw_reg ip
= brw_ip_reg();
1538 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1539 brw_set_conditionalmod(p
, BRW_CONDITIONAL_Z
);
1542 get_element_ud(brw_vec8_grf(1,0), 6),
1545 jmp
= brw_JMPI(p
, ip
, ip
, brw_imm_w(0)) - p
->store
;
1547 emit_aa(c
, arg1
, 2);
1548 fire_fb_write(c
, 0, nr
, target
, eot
);
1549 /* note - thread killed in subroutine */
1551 brw_land_fwd_jump(p
, jmp
);
1553 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1555 fire_fb_write(c
, 1, nr
-1, target
, eot
);
1560 * Move a GPR to scratch memory.
/* Spill one register to scratch memory.
 *
 * `reg` is first copied into message register 2, then
 * brw_oword_block_write_scratch() is issued with message register 1,
 * the literal 2, and `slot`.  The assembly listings below are the
 * original author's illustration of the emitted instructions.
 *
 * NOTE(review): the declarations of `reg` and `slot` are not visible in
 * this view; spill_values() passes a value's hw_reg and spill_slot for
 * them.  The meaning of the literal 2 (presumably a register count) must
 * be confirmed against brw_oword_block_write_scratch().
 */
1562 static void emit_spill( struct brw_wm_compile
*c
,
1566 struct brw_compile
*p
= &c
->func
;
1569 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1571 brw_MOV(p
, brw_message_reg(2), reg
);
1574 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1575 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1577 brw_oword_block_write_scratch(p
, brw_message_reg(1), 2, slot
);
1582 * Load a GPR from scratch memory.
/* Reload one register from scratch memory.
 *
 * Two paths are visible: a MOV of the immediate 0.0f into `reg` (the
 * comment below ties this to slot 0, "the undef value"), and a
 * brw_oword_block_read() into vec16(reg) using message register 1, the
 * literal 2, and `slot`.
 *
 * NOTE(review): the condition selecting between the two paths and the
 * declarations of `reg` and `slot` are not visible in this view --
 * presumably the MOV is taken only when slot == 0; confirm.
 */
1584 static void emit_unspill( struct brw_wm_compile
*c
,
1588 struct brw_compile
*p
= &c
->func
;
1590 /* Slot 0 is the undef value.
1593 brw_MOV(p
, reg
, brw_imm_f(0));
1598 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1599 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1602 brw_oword_block_read(p
, vec16(reg
), brw_message_reg(1), 2, slot
);
1607 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1608 * Args with unspill_reg != 0 will be loaded from scratch memory.
/* Resolve the four channel references of one instruction argument into
 * hardware registers.
 *
 * For i in 0..3, regs[i] receives either arg[i]->hw_reg or
 * brw_null_reg().  When arg[i]->unspill_reg is non-zero, the value is
 * first reloaded: brw_vec8_grf(unspill_reg, 0) and
 * arg[i]->value->spill_slot are passed to a call whose head is not
 * visible here (presumably emit_unspill()).
 *
 * \param c     the WM compile state
 * \param arg   array of (at least) four reference pointers
 * \param regs  output array of (at least) four registers
 *
 * NOTE(review): the test that selects brw_null_reg() is not visible in
 * this view -- presumably it is taken when arg[i] is NULL; confirm.
 */
1610 static void get_argument_regs( struct brw_wm_compile
*c
,
1611 struct brw_wm_ref
*arg
[],
1612 struct brw_reg
*regs
)
1616 for (i
= 0; i
< 4; i
++) {
1618 if (arg
[i
]->unspill_reg
)
1620 brw_vec8_grf(arg
[i
]->unspill_reg
, 0),
1621 arg
[i
]->value
->spill_slot
);
1623 regs
[i
] = arg
[i
]->hw_reg
;
1626 regs
[i
] = brw_null_reg();
1633 * For values that have a spill_slot!=0, write those regs to scratch memory.
/* Walk values[0 .. nr-1] and call emit_spill() for every entry whose
 * spill_slot is non-zero, passing that entry's hw_reg and spill_slot.
 * Entries with spill_slot == 0 are left untouched.
 *
 * \param c       the WM compile state, forwarded to emit_spill()
 * \param values  array of values to inspect
 *
 * NOTE(review): the declaration of the element count `nr` is not visible
 * in this view; brw_wm_emit() passes 4, c->nr_creg and FRAG_ATTRIB_MAX.
 */
1635 static void spill_values( struct brw_wm_compile
*c
,
1636 struct brw_wm_value
*values
,
1641 for (i
= 0; i
< nr
; i
++)
1642 if (values
[i
].spill_slot
)
1643 emit_spill(c
, values
[i
].hw_reg
, values
[i
].spill_slot
);
1647 /* Emit the fragment program instructions here.
1649 void brw_wm_emit( struct brw_wm_compile
*c
)
1651 struct brw_compile
*p
= &c
->func
;
1652 struct intel_context
*intel
= &p
->brw
->intel
;
1655 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
1656 if (intel
->gen
>= 6)
1657 brw_set_acc_write_control(p
, 1);
1659 /* Check if any of the payload regs need to be spilled:
1661 spill_values(c
, c
->payload
.depth
, 4);
1662 spill_values(c
, c
->creg
, c
->nr_creg
);
1663 spill_values(c
, c
->payload
.input_interp
, FRAG_ATTRIB_MAX
);
1666 for (insn
= 0; insn
< c
->nr_insns
; insn
++) {
1668 struct brw_wm_instruction
*inst
= &c
->instruction
[insn
];
1669 struct brw_reg args
[3][4], dst
[4];
1670 GLuint i
, dst_flags
;
1672 /* Get argument regs:
1674 for (i
= 0; i
< 3; i
++)
1675 get_argument_regs(c
, inst
->src
[i
], args
[i
]);
1679 for (i
= 0; i
< 4; i
++)
1681 dst
[i
] = inst
->dst
[i
]->hw_reg
;
1683 dst
[i
] = brw_null_reg();
1687 dst_flags
= inst
->writemask
;
1689 dst_flags
|= SATURATE
;
1691 switch (inst
->opcode
) {
1692 /* Generated instructions for calculating triangle interpolants:
1695 emit_pixel_xy(c
, dst
, dst_flags
);
1699 emit_delta_xy(p
, dst
, dst_flags
, args
[0]);
1703 emit_wpos_xy(c
, dst
, dst_flags
, args
[0]);
1707 emit_pixel_w(c
, dst
, dst_flags
, args
[0], args
[1]);
1711 emit_linterp(p
, dst
, dst_flags
, args
[0], args
[1]);
1715 emit_pinterp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1719 emit_cinterp(p
, dst
, dst_flags
, args
[0]);
1723 emit_fb_write(c
, args
[0], args
[1], args
[2], inst
->target
, inst
->eot
);
1726 case WM_FRONTFACING
:
1727 emit_frontfacing(p
, dst
, dst_flags
);
1730 /* Straightforward arithmetic:
1733 emit_alu2(p
, brw_ADD
, dst
, dst_flags
, args
[0], args
[1]);
1737 emit_alu1(p
, brw_FRC
, dst
, dst_flags
, args
[0]);
1741 emit_alu1(p
, brw_RNDD
, dst
, dst_flags
, args
[0]);
1745 emit_ddxy(p
, dst
, dst_flags
, true, args
[0], false);
1749 /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no
1750 * guarantee that c->key.render_to_fbo is set).
1752 assert(c
->fp
->program
.UsesDFdy
);
1753 emit_ddxy(p
, dst
, dst_flags
, false, args
[0], c
->key
.render_to_fbo
);
1757 emit_dp2(p
, dst
, dst_flags
, args
[0], args
[1]);
1761 emit_dp3(p
, dst
, dst_flags
, args
[0], args
[1]);
1765 emit_dp4(p
, dst
, dst_flags
, args
[0], args
[1]);
1769 emit_dph(p
, dst
, dst_flags
, args
[0], args
[1]);
1773 for (i
= 0; i
< 4; i
++) {
1774 if (dst_flags
& (1<<i
)) {
1775 brw_RNDZ(p
, dst
[i
], args
[0][i
]);
1781 emit_lrp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1785 emit_mad(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1790 emit_alu1(p
, brw_MOV
, dst
, dst_flags
, args
[0]);
1794 emit_alu2(p
, brw_MUL
, dst
, dst_flags
, args
[0], args
[1]);
1798 emit_xpd(p
, dst
, dst_flags
, args
[0], args
[1]);
1801 /* Higher math functions:
1804 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, dst_flags
, args
[0]);
1808 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, dst_flags
, args
[0]);
1812 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, dst_flags
, args
[0]);
1816 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, dst_flags
, args
[0]);
1820 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, dst_flags
, args
[0]);
1824 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, dst_flags
, args
[0]);
1828 /* There is an scs math function, but it would need some
1829 * fixup for 16-element execution.
1831 if (dst_flags
& WRITEMASK_X
)
1832 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, (dst_flags
&SATURATE
)|WRITEMASK_X
, args
[0]);
1833 if (dst_flags
& WRITEMASK_Y
)
1834 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
+1, (dst_flags
&SATURATE
)|WRITEMASK_X
, args
[0]);
1838 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, dst_flags
, args
[0], args
[1]);
1844 emit_cmp(p
, dst
, dst_flags
, args
[0], args
[1], args
[2]);
1848 emit_max(p
, dst
, dst_flags
, args
[0], args
[1]);
1852 emit_min(p
, dst
, dst_flags
, args
[0], args
[1]);
1856 emit_slt(p
, dst
, dst_flags
, args
[0], args
[1]);
1860 emit_sle(p
, dst
, dst_flags
, args
[0], args
[1]);
1863 emit_sgt(p
, dst
, dst_flags
, args
[0], args
[1]);
1866 emit_sge(p
, dst
, dst_flags
, args
[0], args
[1]);
1869 emit_seq(p
, dst
, dst_flags
, args
[0], args
[1]);
1872 emit_sne(p
, dst
, dst_flags
, args
[0], args
[1]);
1876 emit_sign(p
, dst
, dst_flags
, args
[0]);
1880 emit_lit(c
, dst
, dst_flags
, args
[0]);
1883 /* Texturing operations:
1886 emit_tex(c
, dst
, dst_flags
, args
[0], c
->payload
.depth
[0].hw_reg
,
1887 inst
->tex_idx
, inst
->tex_unit
,
1892 emit_txb(c
, dst
, dst_flags
, args
[0], c
->payload
.depth
[0].hw_reg
,
1893 inst
->tex_idx
, inst
->tex_unit
);
1897 emit_kil(c
, args
[0]);
1901 printf("Unsupported opcode %i (%s) in fragment shader\n",
1902 inst
->opcode
, inst
->opcode
< MAX_OPCODE
?
1903 _mesa_opcode_string(inst
->opcode
) :
1907 for (i
= 0; i
< 4; i
++)
1908 if (inst
->dst
[i
] && inst
->dst
[i
]->spill_slot
)
1910 inst
->dst
[i
]->hw_reg
,
1911 inst
->dst
[i
]->spill_slot
);
1914 /* Only properly tested on ILK */
1915 if (p
->brw
->intel
.gen
== 5) {
1916 brw_remove_duplicate_mrf_moves(p
);
1917 if (c
->dispatch_width
== 16)
1918 brw_remove_grf_to_mrf_moves(p
);
1921 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1922 printf("wm-native:\n");
1923 brw_dump_compile(p
, stdout
, 0, p
->next_insn_offset
);