Merge remote branch 'origin/master' into pipe-video
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_SWZ] = 1,
87 [OPCODE_XPD] = 2,
88 };
89
90 /* These opcodes get broken down in a way that allow two
91 * args to be immediates.
92 */
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
95 return GL_TRUE;
96 }
97
98 if (opcode > ARRAY_SIZE(opcode_array))
99 return GL_FALSE;
100
101 return arg == opcode_array[opcode] - 1;
102 }
103
104 /**
105 * Computes the screen-space x,y position of the pixels.
106 *
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes..
109 *
110 * Payload R0:
111 *
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
114 * R0.1..8 -- ?
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
121 * R1.6 -- ?
122 * R1.7 -- ?
123 * R1.8 -- ?
124 */
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
127 GLuint mask)
128 {
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
133
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
136
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
140 } else {
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
143 }
144
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
147 */
148 if (mask & WRITEMASK_X) {
149 brw_ADD(p,
150 dst0_uw,
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
153 }
154
155 if (mask & WRITEMASK_Y) {
156 brw_ADD(p,
157 dst1_uw,
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
160 }
161 brw_pop_insn_state(p);
162 }
163
164 /**
165 * Computes the screen-space x,y distance of the pixels from the start
166 * vertex.
167 *
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
171 */
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
174 GLuint mask,
175 const struct brw_reg *arg0)
176 {
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
179
180 if (mask == 0)
181 return;
182
183 assert(mask == WRITEMASK_XY);
184
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187 Just add them with 0.0 for dst reg.. */
188 r1 = brw_imm_v(0x00000000);
189 brw_ADD(p,
190 dst[0],
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
192 r1);
193 brw_ADD(p,
194 dst[1],
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
196 r1);
197 return;
198 }
199
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
202 */
203 brw_ADD(p,
204 dst[0],
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
206 negate(r1));
207 brw_ADD(p,
208 dst[1],
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
211 }
212
213 /**
214 * Computes the pixel offset from the window origin for gl_FragCoord().
215 */
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
218 GLuint mask,
219 const struct brw_reg *arg0)
220 {
221 struct brw_compile *p = &c->func;
222
223 if (mask & WRITEMASK_X) {
224 if (c->fp->program.PixelCenterInteger) {
225 /* X' = X */
226 brw_MOV(p,
227 dst[0],
228 retype(arg0[0], BRW_REGISTER_TYPE_W));
229 } else {
230 /* X' = X + 0.5 */
231 brw_ADD(p,
232 dst[0],
233 retype(arg0[0], BRW_REGISTER_TYPE_W),
234 brw_imm_f(0.5));
235 }
236 }
237
238 if (mask & WRITEMASK_Y) {
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
241 /* Y' = Y */
242 brw_MOV(p,
243 dst[1],
244 retype(arg0[1], BRW_REGISTER_TYPE_W));
245 } else {
246 /* Y' = Y + 0.5 */
247 brw_ADD(p,
248 dst[1],
249 retype(arg0[1], BRW_REGISTER_TYPE_W),
250 brw_imm_f(0.5));
251 }
252 } else {
253 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
254
255 /* Y' = (height - 1) - Y + center */
256 brw_ADD(p,
257 dst[1],
258 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
259 brw_imm_f(c->key.drawable_height - 1 + center_offset));
260 }
261 }
262 }
263
264
265 void emit_pixel_w(struct brw_wm_compile *c,
266 const struct brw_reg *dst,
267 GLuint mask,
268 const struct brw_reg *arg0,
269 const struct brw_reg *deltas)
270 {
271 struct brw_compile *p = &c->func;
272 struct intel_context *intel = &p->brw->intel;
273 struct brw_reg src;
274 struct brw_reg temp_dst;
275
276 if (intel->gen >= 6)
277 temp_dst = dst[3];
278 else
279 temp_dst = brw_message_reg(2);
280
281 assert(intel->gen < 6);
282
283 /* Don't need this if all you are doing is interpolating color, for
284 * instance.
285 */
286 if (mask & WRITEMASK_W) {
287 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
288
289 /* Calc 1/w - just linterp wpos[3] optimized by putting the
290 * result straight into a message reg.
291 */
292 if (can_do_pln(intel, deltas)) {
293 brw_PLN(p, temp_dst, interp3, deltas[0]);
294 } else {
295 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
296 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
297 }
298
299 /* Calc w */
300 if (intel->gen >= 6)
301 src = temp_dst;
302 else
303 src = brw_null_reg();
304
305 if (c->dispatch_width == 16) {
306 brw_math_16(p, dst[3],
307 BRW_MATH_FUNCTION_INV,
308 BRW_MATH_SATURATE_NONE,
309 2, src,
310 BRW_MATH_PRECISION_FULL);
311 } else {
312 brw_math(p, dst[3],
313 BRW_MATH_FUNCTION_INV,
314 BRW_MATH_SATURATE_NONE,
315 2, src,
316 BRW_MATH_DATA_VECTOR,
317 BRW_MATH_PRECISION_FULL);
318 }
319 }
320 }
321
322 void emit_linterp(struct brw_compile *p,
323 const struct brw_reg *dst,
324 GLuint mask,
325 const struct brw_reg *arg0,
326 const struct brw_reg *deltas)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (intel->gen >= 6) {
341 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
342 } else if (can_do_pln(intel, deltas)) {
343 brw_PLN(p, dst[i], interp[i], deltas[0]);
344 } else {
345 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
346 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
347 }
348 }
349 }
350 }
351
352
353 void emit_pinterp(struct brw_compile *p,
354 const struct brw_reg *dst,
355 GLuint mask,
356 const struct brw_reg *arg0,
357 const struct brw_reg *deltas,
358 const struct brw_reg *w)
359 {
360 struct intel_context *intel = &p->brw->intel;
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 if (intel->gen >= 6) {
366 emit_linterp(p, dst, mask, arg0, interp);
367 return;
368 }
369
370 interp[0] = brw_vec1_grf(nr, 0);
371 interp[1] = brw_vec1_grf(nr, 4);
372 interp[2] = brw_vec1_grf(nr+1, 0);
373 interp[3] = brw_vec1_grf(nr+1, 4);
374
375 for (i = 0; i < 4; i++) {
376 if (mask & (1<<i)) {
377 if (can_do_pln(intel, deltas)) {
378 brw_PLN(p, dst[i], interp[i], deltas[0]);
379 } else {
380 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
381 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
382 }
383 }
384 }
385 for (i = 0; i < 4; i++) {
386 if (mask & (1<<i)) {
387 brw_MUL(p, dst[i], dst[i], w[3]);
388 }
389 }
390 }
391
392
393 void emit_cinterp(struct brw_compile *p,
394 const struct brw_reg *dst,
395 GLuint mask,
396 const struct brw_reg *arg0)
397 {
398 struct brw_reg interp[4];
399 GLuint nr = arg0[0].nr;
400 GLuint i;
401
402 interp[0] = brw_vec1_grf(nr, 0);
403 interp[1] = brw_vec1_grf(nr, 4);
404 interp[2] = brw_vec1_grf(nr+1, 0);
405 interp[3] = brw_vec1_grf(nr+1, 4);
406
407 for (i = 0; i < 4; i++) {
408 if (mask & (1<<i)) {
409 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
410 }
411 }
412 }
413
414 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
415 void emit_frontfacing(struct brw_compile *p,
416 const struct brw_reg *dst,
417 GLuint mask)
418 {
419 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
420 GLuint i;
421
422 if (!(mask & WRITEMASK_XYZW))
423 return;
424
425 for (i = 0; i < 4; i++) {
426 if (mask & (1<<i)) {
427 brw_MOV(p, dst[i], brw_imm_f(0.0));
428 }
429 }
430
431 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
432 * us front face
433 */
434 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
435 for (i = 0; i < 4; i++) {
436 if (mask & (1<<i)) {
437 brw_MOV(p, dst[i], brw_imm_f(1.0));
438 }
439 }
440 brw_set_predicate_control_flag_value(p, 0xff);
441 }
442
443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
444 * looking like:
445 *
446 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
447 *
448 * and we're trying to produce:
449 *
450 * DDX DDY
451 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
452 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
453 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
454 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
455 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
456 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
457 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
458 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
459 *
460 * and add another set of two more subspans if in 16-pixel dispatch mode.
461 *
462 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
463 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
464 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
465 * between each other. We could probably do it like ddx and swizzle the right
466 * order later, but bail for now and just produce
467 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
468 */
469 void emit_ddxy(struct brw_compile *p,
470 const struct brw_reg *dst,
471 GLuint mask,
472 GLboolean is_ddx,
473 const struct brw_reg *arg0)
474 {
475 int i;
476 struct brw_reg src0, src1;
477
478 if (mask & SATURATE)
479 brw_set_saturate(p, 1);
480 for (i = 0; i < 4; i++ ) {
481 if (mask & (1<<i)) {
482 if (is_ddx) {
483 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
484 BRW_REGISTER_TYPE_F,
485 BRW_VERTICAL_STRIDE_2,
486 BRW_WIDTH_2,
487 BRW_HORIZONTAL_STRIDE_0,
488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
490 BRW_REGISTER_TYPE_F,
491 BRW_VERTICAL_STRIDE_2,
492 BRW_WIDTH_2,
493 BRW_HORIZONTAL_STRIDE_0,
494 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
495 } else {
496 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
497 BRW_REGISTER_TYPE_F,
498 BRW_VERTICAL_STRIDE_4,
499 BRW_WIDTH_4,
500 BRW_HORIZONTAL_STRIDE_0,
501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
503 BRW_REGISTER_TYPE_F,
504 BRW_VERTICAL_STRIDE_4,
505 BRW_WIDTH_4,
506 BRW_HORIZONTAL_STRIDE_0,
507 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
508 }
509 brw_ADD(p, dst[i], src0, negate(src1));
510 }
511 }
512 if (mask & SATURATE)
513 brw_set_saturate(p, 0);
514 }
515
516 void emit_alu1(struct brw_compile *p,
517 struct brw_instruction *(*func)(struct brw_compile *,
518 struct brw_reg,
519 struct brw_reg),
520 const struct brw_reg *dst,
521 GLuint mask,
522 const struct brw_reg *arg0)
523 {
524 GLuint i;
525
526 if (mask & SATURATE)
527 brw_set_saturate(p, 1);
528
529 for (i = 0; i < 4; i++) {
530 if (mask & (1<<i)) {
531 func(p, dst[i], arg0[i]);
532 }
533 }
534
535 if (mask & SATURATE)
536 brw_set_saturate(p, 0);
537 }
538
539
540 void emit_alu2(struct brw_compile *p,
541 struct brw_instruction *(*func)(struct brw_compile *,
542 struct brw_reg,
543 struct brw_reg,
544 struct brw_reg),
545 const struct brw_reg *dst,
546 GLuint mask,
547 const struct brw_reg *arg0,
548 const struct brw_reg *arg1)
549 {
550 GLuint i;
551
552 if (mask & SATURATE)
553 brw_set_saturate(p, 1);
554
555 for (i = 0; i < 4; i++) {
556 if (mask & (1<<i)) {
557 func(p, dst[i], arg0[i], arg1[i]);
558 }
559 }
560
561 if (mask & SATURATE)
562 brw_set_saturate(p, 0);
563 }
564
565
566 void emit_mad(struct brw_compile *p,
567 const struct brw_reg *dst,
568 GLuint mask,
569 const struct brw_reg *arg0,
570 const struct brw_reg *arg1,
571 const struct brw_reg *arg2)
572 {
573 GLuint i;
574
575 for (i = 0; i < 4; i++) {
576 if (mask & (1<<i)) {
577 brw_MUL(p, dst[i], arg0[i], arg1[i]);
578
579 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
580 brw_ADD(p, dst[i], dst[i], arg2[i]);
581 brw_set_saturate(p, 0);
582 }
583 }
584 }
585
586 void emit_lrp(struct brw_compile *p,
587 const struct brw_reg *dst,
588 GLuint mask,
589 const struct brw_reg *arg0,
590 const struct brw_reg *arg1,
591 const struct brw_reg *arg2)
592 {
593 GLuint i;
594
595 /* Uses dst as a temporary:
596 */
597 for (i = 0; i < 4; i++) {
598 if (mask & (1<<i)) {
599 /* Can I use the LINE instruction for this?
600 */
601 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
602 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
603
604 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
605 brw_MAC(p, dst[i], arg0[i], arg1[i]);
606 brw_set_saturate(p, 0);
607 }
608 }
609 }
610
611 void emit_sop(struct brw_compile *p,
612 const struct brw_reg *dst,
613 GLuint mask,
614 GLuint cond,
615 const struct brw_reg *arg0,
616 const struct brw_reg *arg1)
617 {
618 GLuint i;
619
620 for (i = 0; i < 4; i++) {
621 if (mask & (1<<i)) {
622 brw_push_insn_state(p);
623 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
624 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
625 brw_MOV(p, dst[i], brw_imm_f(0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
627 brw_MOV(p, dst[i], brw_imm_f(1.0));
628 brw_pop_insn_state(p);
629 }
630 }
631 }
632
633 static void emit_slt( struct brw_compile *p,
634 const struct brw_reg *dst,
635 GLuint mask,
636 const struct brw_reg *arg0,
637 const struct brw_reg *arg1 )
638 {
639 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
640 }
641
642 static void emit_sle( struct brw_compile *p,
643 const struct brw_reg *dst,
644 GLuint mask,
645 const struct brw_reg *arg0,
646 const struct brw_reg *arg1 )
647 {
648 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
649 }
650
651 static void emit_sgt( struct brw_compile *p,
652 const struct brw_reg *dst,
653 GLuint mask,
654 const struct brw_reg *arg0,
655 const struct brw_reg *arg1 )
656 {
657 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
658 }
659
660 static void emit_sge( struct brw_compile *p,
661 const struct brw_reg *dst,
662 GLuint mask,
663 const struct brw_reg *arg0,
664 const struct brw_reg *arg1 )
665 {
666 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
667 }
668
669 static void emit_seq( struct brw_compile *p,
670 const struct brw_reg *dst,
671 GLuint mask,
672 const struct brw_reg *arg0,
673 const struct brw_reg *arg1 )
674 {
675 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
676 }
677
678 static void emit_sne( struct brw_compile *p,
679 const struct brw_reg *dst,
680 GLuint mask,
681 const struct brw_reg *arg0,
682 const struct brw_reg *arg1 )
683 {
684 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
685 }
686
687 void emit_cmp(struct brw_compile *p,
688 const struct brw_reg *dst,
689 GLuint mask,
690 const struct brw_reg *arg0,
691 const struct brw_reg *arg1,
692 const struct brw_reg *arg2)
693 {
694 GLuint i;
695
696 for (i = 0; i < 4; i++) {
697 if (mask & (1<<i)) {
698 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
699
700 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
701 brw_SEL(p, dst[i], arg1[i], arg2[i]);
702 brw_set_saturate(p, 0);
703 brw_set_predicate_control_flag_value(p, 0xff);
704 }
705 }
706 }
707
708 void emit_sign(struct brw_compile *p,
709 const struct brw_reg *dst,
710 GLuint mask,
711 const struct brw_reg *arg0)
712 {
713 GLuint i;
714
715 for (i = 0; i < 4; i++) {
716 if (mask & (1<<i)) {
717 brw_MOV(p, dst[i], brw_imm_f(0.0));
718
719 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
720 brw_MOV(p, dst[i], brw_imm_f(-1.0));
721 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
722
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
724 brw_MOV(p, dst[i], brw_imm_f(1.0));
725 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
726 }
727 }
728 }
729
730 void emit_max(struct brw_compile *p,
731 const struct brw_reg *dst,
732 GLuint mask,
733 const struct brw_reg *arg0,
734 const struct brw_reg *arg1)
735 {
736 GLuint i;
737
738 for (i = 0; i < 4; i++) {
739 if (mask & (1<<i)) {
740 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
741
742 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
743 brw_SEL(p, dst[i], arg0[i], arg1[i]);
744 brw_set_saturate(p, 0);
745 brw_set_predicate_control_flag_value(p, 0xff);
746 }
747 }
748 }
749
750 void emit_min(struct brw_compile *p,
751 const struct brw_reg *dst,
752 GLuint mask,
753 const struct brw_reg *arg0,
754 const struct brw_reg *arg1)
755 {
756 GLuint i;
757
758 for (i = 0; i < 4; i++) {
759 if (mask & (1<<i)) {
760 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
761
762 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
763 brw_SEL(p, dst[i], arg0[i], arg1[i]);
764 brw_set_saturate(p, 0);
765 brw_set_predicate_control_flag_value(p, 0xff);
766 }
767 }
768 }
769
770
771 void emit_dp2(struct brw_compile *p,
772 const struct brw_reg *dst,
773 GLuint mask,
774 const struct brw_reg *arg0,
775 const struct brw_reg *arg1)
776 {
777 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
778
779 if (!(mask & WRITEMASK_XYZW))
780 return; /* Do not emit dead code */
781
782 assert(is_power_of_two(mask & WRITEMASK_XYZW));
783
784 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
785
786 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
787 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
788 brw_set_saturate(p, 0);
789 }
790
791
792 void emit_dp3(struct brw_compile *p,
793 const struct brw_reg *dst,
794 GLuint mask,
795 const struct brw_reg *arg0,
796 const struct brw_reg *arg1)
797 {
798 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
799
800 if (!(mask & WRITEMASK_XYZW))
801 return; /* Do not emit dead code */
802
803 assert(is_power_of_two(mask & WRITEMASK_XYZW));
804
805 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
806 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
807
808 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
809 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
810 brw_set_saturate(p, 0);
811 }
812
813
814 void emit_dp4(struct brw_compile *p,
815 const struct brw_reg *dst,
816 GLuint mask,
817 const struct brw_reg *arg0,
818 const struct brw_reg *arg1)
819 {
820 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
821
822 if (!(mask & WRITEMASK_XYZW))
823 return; /* Do not emit dead code */
824
825 assert(is_power_of_two(mask & WRITEMASK_XYZW));
826
827 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
828 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
829 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
830
831 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
832 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
833 brw_set_saturate(p, 0);
834 }
835
836
837 void emit_dph(struct brw_compile *p,
838 const struct brw_reg *dst,
839 GLuint mask,
840 const struct brw_reg *arg0,
841 const struct brw_reg *arg1)
842 {
843 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
844
845 if (!(mask & WRITEMASK_XYZW))
846 return; /* Do not emit dead code */
847
848 assert(is_power_of_two(mask & WRITEMASK_XYZW));
849
850 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
851 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
852 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
853
854 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
855 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
856 brw_set_saturate(p, 0);
857 }
858
859
860 void emit_xpd(struct brw_compile *p,
861 const struct brw_reg *dst,
862 GLuint mask,
863 const struct brw_reg *arg0,
864 const struct brw_reg *arg1)
865 {
866 GLuint i;
867
868 assert((mask & WRITEMASK_W) != WRITEMASK_W);
869
870 for (i = 0 ; i < 3; i++) {
871 if (mask & (1<<i)) {
872 GLuint i2 = (i+2)%3;
873 GLuint i1 = (i+1)%3;
874
875 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
876
877 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
878 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
879 brw_set_saturate(p, 0);
880 }
881 }
882 }
883
884
885 void emit_math1(struct brw_wm_compile *c,
886 GLuint function,
887 const struct brw_reg *dst,
888 GLuint mask,
889 const struct brw_reg *arg0)
890 {
891 struct brw_compile *p = &c->func;
892 struct intel_context *intel = &p->brw->intel;
893 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
894 GLuint saturate = ((mask & SATURATE) ?
895 BRW_MATH_SATURATE_SATURATE :
896 BRW_MATH_SATURATE_NONE);
897 struct brw_reg src;
898
899 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
900 arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
901 arg0[0].negate || arg0[0].abs)) {
902 /* Gen6 math requires that source and dst horizontal stride be 1,
903 * and that the argument be in the GRF.
904 *
905 * The hardware ignores source modifiers (negate and abs) on math
906 * instructions, so we also move to a temp to set those up.
907 */
908 src = dst[dst_chan];
909 brw_MOV(p, src, arg0[0]);
910 } else {
911 src = arg0[0];
912 }
913
914 if (!(mask & WRITEMASK_XYZW))
915 return; /* Do not emit dead code */
916
917 assert(is_power_of_two(mask & WRITEMASK_XYZW));
918
919 /* Send two messages to perform all 16 operations:
920 */
921 brw_push_insn_state(p);
922 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
923 brw_math(p,
924 dst[dst_chan],
925 function,
926 saturate,
927 2,
928 src,
929 BRW_MATH_DATA_VECTOR,
930 BRW_MATH_PRECISION_FULL);
931
932 if (c->dispatch_width == 16) {
933 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
934 brw_math(p,
935 offset(dst[dst_chan],1),
936 function,
937 saturate,
938 3,
939 sechalf(src),
940 BRW_MATH_DATA_VECTOR,
941 BRW_MATH_PRECISION_FULL);
942 }
943 brw_pop_insn_state(p);
944 }
945
946
947 void emit_math2(struct brw_wm_compile *c,
948 GLuint function,
949 const struct brw_reg *dst,
950 GLuint mask,
951 const struct brw_reg *arg0,
952 const struct brw_reg *arg1)
953 {
954 struct brw_compile *p = &c->func;
955 struct intel_context *intel = &p->brw->intel;
956 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
957
958 if (!(mask & WRITEMASK_XYZW))
959 return; /* Do not emit dead code */
960
961 assert(is_power_of_two(mask & WRITEMASK_XYZW));
962
963 brw_push_insn_state(p);
964
965 /* math can only operate on up to a vec8 at a time, so in
966 * dispatch_width==16 we have to do the second half manually.
967 */
968 if (intel->gen >= 6) {
969 struct brw_reg src0 = arg0[0];
970 struct brw_reg src1 = arg1[0];
971 struct brw_reg temp_dst = dst[dst_chan];
972
973 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
974 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
975 /* Both scalar arguments. Do scalar calc. */
976 src0.hstride = BRW_HORIZONTAL_STRIDE_1;
977 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
978 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
979 temp_dst.width = BRW_WIDTH_1;
980
981 if (arg0[0].subnr != 0) {
982 brw_MOV(p, temp_dst, src0);
983 src0 = temp_dst;
984
985 /* Ouch. We've used the temp as a dst, and we still
986 * need a temp to store arg1 in, because src and dst
987 * offsets have to be equal. Leaving this up to
988 * glsl2-965 to handle correctly.
989 */
990 assert(arg1[0].subnr == 0);
991 } else if (arg1[0].subnr != 0) {
992 brw_MOV(p, temp_dst, src1);
993 src1 = temp_dst;
994 }
995 } else {
996 brw_MOV(p, temp_dst, src0);
997 src0 = temp_dst;
998 }
999 } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1000 brw_MOV(p, temp_dst, src1);
1001 src1 = temp_dst;
1002 }
1003
1004 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1005 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1006 brw_math2(p,
1007 temp_dst,
1008 function,
1009 src0,
1010 src1);
1011 if (c->dispatch_width == 16) {
1012 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1013 brw_math2(p,
1014 sechalf(temp_dst),
1015 function,
1016 sechalf(src0),
1017 sechalf(src1));
1018 }
1019
1020 /* Splat a scalar result into all the channels. */
1021 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1022 arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1023 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1024 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1025 brw_MOV(p, dst[dst_chan], temp_dst);
1026 }
1027 } else {
1028 GLuint saturate = ((mask & SATURATE) ?
1029 BRW_MATH_SATURATE_SATURATE :
1030 BRW_MATH_SATURATE_NONE);
1031
1032 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1033 brw_MOV(p, brw_message_reg(3), arg1[0]);
1034 if (c->dispatch_width == 16) {
1035 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1036 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1037 }
1038
1039 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1040 brw_math(p,
1041 dst[dst_chan],
1042 function,
1043 saturate,
1044 2,
1045 arg0[0],
1046 BRW_MATH_DATA_VECTOR,
1047 BRW_MATH_PRECISION_FULL);
1048
1049 /* Send two messages to perform all 16 operations:
1050 */
1051 if (c->dispatch_width == 16) {
1052 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1053 brw_math(p,
1054 offset(dst[dst_chan],1),
1055 function,
1056 saturate,
1057 4,
1058 sechalf(arg0[0]),
1059 BRW_MATH_DATA_VECTOR,
1060 BRW_MATH_PRECISION_FULL);
1061 }
1062 }
1063 brw_pop_insn_state(p);
1064 }
1065
1066
1067 void emit_tex(struct brw_wm_compile *c,
1068 struct brw_reg *dst,
1069 GLuint dst_flags,
1070 struct brw_reg *arg,
1071 struct brw_reg depth_payload,
1072 GLuint tex_idx,
1073 GLuint sampler,
1074 GLboolean shadow)
1075 {
1076 struct brw_compile *p = &c->func;
1077 struct intel_context *intel = &p->brw->intel;
1078 struct brw_reg dst_retyped;
1079 GLuint cur_mrf = 2, response_length;
1080 GLuint i, nr_texcoords;
1081 GLuint emit;
1082 GLuint msg_type;
1083 GLuint mrf_per_channel;
1084 GLuint simd_mode;
1085
1086 if (c->dispatch_width == 16) {
1087 mrf_per_channel = 2;
1088 response_length = 8;
1089 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1090 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1091 } else {
1092 mrf_per_channel = 1;
1093 response_length = 4;
1094 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1095 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1096 }
1097
1098 /* How many input regs are there?
1099 */
1100 switch (tex_idx) {
1101 case TEXTURE_1D_INDEX:
1102 emit = WRITEMASK_X;
1103 nr_texcoords = 1;
1104 break;
1105 case TEXTURE_2D_INDEX:
1106 case TEXTURE_RECT_INDEX:
1107 emit = WRITEMASK_XY;
1108 nr_texcoords = 2;
1109 break;
1110 case TEXTURE_3D_INDEX:
1111 case TEXTURE_CUBE_INDEX:
1112 emit = WRITEMASK_XYZ;
1113 nr_texcoords = 3;
1114 break;
1115 default:
1116 /* unexpected target */
1117 abort();
1118 }
1119
1120 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1121 if (intel->gen < 5 && c->dispatch_width == 8)
1122 nr_texcoords = 3;
1123
1124 /* For shadow comparisons, we have to supply u,v,r. */
1125 if (shadow)
1126 nr_texcoords = 3;
1127
1128 /* Emit the texcoords. */
1129 for (i = 0; i < nr_texcoords; i++) {
1130 if (emit & (1<<i))
1131 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1132 else
1133 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1134 cur_mrf += mrf_per_channel;
1135 }
1136
1137 /* Fill in the shadow comparison reference value. */
1138 if (shadow) {
1139 if (intel->gen >= 5) {
1140 /* Fill in the cube map array index value. */
1141 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1142 cur_mrf += mrf_per_channel;
1143 } else if (c->dispatch_width == 8) {
1144 /* Fill in the LOD bias value. */
1145 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1146 cur_mrf += mrf_per_channel;
1147 }
1148 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1149 cur_mrf += mrf_per_channel;
1150 }
1151
1152 if (intel->gen >= 5) {
1153 if (shadow)
1154 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1155 else
1156 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1157 } else {
1158 /* Note that G45 and older determines shadow compare and dispatch width
1159 * from message length for most messages.
1160 */
1161 if (c->dispatch_width == 16 && shadow)
1162 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1163 else
1164 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1165 }
1166
1167 brw_SAMPLE(p,
1168 dst_retyped,
1169 1,
1170 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1171 SURF_INDEX_TEXTURE(sampler),
1172 sampler,
1173 dst_flags & WRITEMASK_XYZW,
1174 msg_type,
1175 response_length,
1176 cur_mrf - 1,
1177 0,
1178 1,
1179 simd_mode);
1180 }
1181
1182
1183 void emit_txb(struct brw_wm_compile *c,
1184 struct brw_reg *dst,
1185 GLuint dst_flags,
1186 struct brw_reg *arg,
1187 struct brw_reg depth_payload,
1188 GLuint tex_idx,
1189 GLuint sampler)
1190 {
1191 struct brw_compile *p = &c->func;
1192 struct intel_context *intel = &p->brw->intel;
1193 GLuint msgLength;
1194 GLuint msg_type;
1195 GLuint mrf_per_channel;
1196 GLuint response_length;
1197 struct brw_reg dst_retyped;
1198
1199 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1200 * samples, so we'll use the 16-wide instruction, leave the second halves
1201 * undefined, and trust the execution mask to keep the undefined pixels
1202 * from mattering.
1203 */
1204 if (c->dispatch_width == 16 || intel->gen < 5) {
1205 if (intel->gen >= 5)
1206 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1207 else
1208 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1209 mrf_per_channel = 2;
1210 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1211 response_length = 8;
1212 } else {
1213 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1214 mrf_per_channel = 1;
1215 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1216 response_length = 4;
1217 }
1218
1219 /* Shadow ignored for txb. */
1220 switch (tex_idx) {
1221 case TEXTURE_1D_INDEX:
1222 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1223 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1224 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1225 break;
1226 case TEXTURE_2D_INDEX:
1227 case TEXTURE_RECT_INDEX:
1228 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1229 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1230 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1231 break;
1232 case TEXTURE_3D_INDEX:
1233 case TEXTURE_CUBE_INDEX:
1234 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1235 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1236 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1237 break;
1238 default:
1239 /* unexpected target */
1240 abort();
1241 }
1242
1243 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1244 msgLength = 2 + 4 * mrf_per_channel - 1;
1245
1246 brw_SAMPLE(p,
1247 dst_retyped,
1248 1,
1249 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1250 SURF_INDEX_TEXTURE(sampler),
1251 sampler,
1252 dst_flags & WRITEMASK_XYZW,
1253 msg_type,
1254 response_length,
1255 msgLength,
1256 0,
1257 1,
1258 BRW_SAMPLER_SIMD_MODE_SIMD16);
1259 }
1260
1261
1262 static void emit_lit(struct brw_wm_compile *c,
1263 const struct brw_reg *dst,
1264 GLuint mask,
1265 const struct brw_reg *arg0)
1266 {
1267 struct brw_compile *p = &c->func;
1268
1269 assert((mask & WRITEMASK_XW) == 0);
1270
1271 if (mask & WRITEMASK_Y) {
1272 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1273 brw_MOV(p, dst[1], arg0[0]);
1274 brw_set_saturate(p, 0);
1275 }
1276
1277 if (mask & WRITEMASK_Z) {
1278 emit_math2(c, BRW_MATH_FUNCTION_POW,
1279 &dst[2],
1280 WRITEMASK_X | (mask & SATURATE),
1281 &arg0[1],
1282 &arg0[3]);
1283 }
1284
1285 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1286 * some of the POW calculations above, but 16-wide iff statements
1287 * seem to lock c1 hardware, so this is a nasty workaround:
1288 */
1289 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1290 {
1291 if (mask & WRITEMASK_Y)
1292 brw_MOV(p, dst[1], brw_imm_f(0));
1293
1294 if (mask & WRITEMASK_Z)
1295 brw_MOV(p, dst[2], brw_imm_f(0));
1296 }
1297 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1298 }
1299
1300
1301 /* Kill pixel - set execution mask to zero for those pixels which
1302 * fail.
1303 */
1304 static void emit_kil( struct brw_wm_compile *c,
1305 struct brw_reg *arg0)
1306 {
1307 struct brw_compile *p = &c->func;
1308 struct intel_context *intel = &p->brw->intel;
1309 struct brw_reg pixelmask;
1310 GLuint i, j;
1311
1312 if (intel->gen >= 6)
1313 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1314 else
1315 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1316
1317 for (i = 0; i < 4; i++) {
1318 /* Check if we've already done the comparison for this reg
1319 * -- common when someone does KIL TEMP.wwww.
1320 */
1321 for (j = 0; j < i; j++) {
1322 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1323 break;
1324 }
1325 if (j != i)
1326 continue;
1327
1328 brw_push_insn_state(p);
1329 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1330 brw_set_predicate_control_flag_value(p, 0xff);
1331 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1332 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1333 brw_pop_insn_state(p);
1334 }
1335 }
1336
1337 static void fire_fb_write( struct brw_wm_compile *c,
1338 GLuint base_reg,
1339 GLuint nr,
1340 GLuint target,
1341 GLuint eot )
1342 {
1343 struct brw_compile *p = &c->func;
1344 struct intel_context *intel = &p->brw->intel;
1345 struct brw_reg dst;
1346
1347 if (c->dispatch_width == 16)
1348 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1349 else
1350 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1351
1352 /* Pass through control information:
1353 */
1354 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1355 if (intel->gen < 6) /* gen6, use headerless for fb write */
1356 {
1357 brw_push_insn_state(p);
1358 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1359 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1360 brw_MOV(p,
1361 brw_message_reg(base_reg + 1),
1362 brw_vec8_grf(1, 0));
1363 brw_pop_insn_state(p);
1364 }
1365
1366 /* Send framebuffer write message: */
1367 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1368 brw_fb_WRITE(p,
1369 c->dispatch_width,
1370 dst,
1371 base_reg,
1372 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1373 target,
1374 nr,
1375 0,
1376 eot);
1377 }
1378
1379
1380 static void emit_aa( struct brw_wm_compile *c,
1381 struct brw_reg *arg1,
1382 GLuint reg )
1383 {
1384 struct brw_compile *p = &c->func;
1385 GLuint comp = c->aa_dest_stencil_reg / 2;
1386 GLuint off = c->aa_dest_stencil_reg % 2;
1387 struct brw_reg aa = offset(arg1[comp], off);
1388
1389 brw_push_insn_state(p);
1390 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1391 brw_MOV(p, brw_message_reg(reg), aa);
1392 brw_pop_insn_state(p);
1393 }
1394
1395
1396 /* Post-fragment-program processing. Send the results to the
1397 * framebuffer.
1398 * \param arg0 the fragment color
1399 * \param arg1 the pass-through depth value
1400 * \param arg2 the shader-computed depth value
1401 */
1402 void emit_fb_write(struct brw_wm_compile *c,
1403 struct brw_reg *arg0,
1404 struct brw_reg *arg1,
1405 struct brw_reg *arg2,
1406 GLuint target,
1407 GLuint eot)
1408 {
1409 struct brw_compile *p = &c->func;
1410 struct brw_context *brw = p->brw;
1411 struct intel_context *intel = &brw->intel;
1412 GLuint nr = 2;
1413 GLuint channel;
1414
1415 /* Reserve a space for AA - may not be needed:
1416 */
1417 if (c->aa_dest_stencil_reg)
1418 nr += 1;
1419
1420 /* I don't really understand how this achieves the color interleave
1421 * (ie RGBARGBA) in the result: [Do the saturation here]
1422 */
1423 brw_push_insn_state(p);
1424
1425 for (channel = 0; channel < 4; channel++) {
1426 if (intel->gen >= 6) {
1427 /* gen6 SIMD16 single source DP write looks like:
1428 * m + 0: r0
1429 * m + 1: r1
1430 * m + 2: g0
1431 * m + 3: g1
1432 * m + 4: b0
1433 * m + 5: b1
1434 * m + 6: a0
1435 * m + 7: a1
1436 */
1437 if (c->dispatch_width == 16) {
1438 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1439 } else {
1440 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1441 }
1442 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1443 /* pre-gen6 SIMD16 single source DP write looks like:
1444 * m + 0: r0
1445 * m + 1: g0
1446 * m + 2: b0
1447 * m + 3: a0
1448 * m + 4: r1
1449 * m + 5: g1
1450 * m + 6: b1
1451 * m + 7: a1
1452 *
1453 * By setting the high bit of the MRF register number, we indicate
1454 * that we want COMPR4 mode - instead of doing the usual destination
1455 * + 1 for the second half we get destination + 4.
1456 */
1457 brw_MOV(p,
1458 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1459 arg0[channel]);
1460 } else {
1461 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1462 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1463 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1464 brw_MOV(p,
1465 brw_message_reg(nr + channel),
1466 arg0[channel]);
1467
1468 if (c->dispatch_width == 16) {
1469 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1470 brw_MOV(p,
1471 brw_message_reg(nr + channel + 4),
1472 sechalf(arg0[channel]));
1473 }
1474 }
1475 }
1476 /* skip over the regs populated above:
1477 */
1478 if (c->dispatch_width == 16)
1479 nr += 8;
1480 else
1481 nr += 4;
1482
1483 brw_pop_insn_state(p);
1484
1485 if (c->source_depth_to_render_target)
1486 {
1487 if (c->computes_depth)
1488 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1489 else
1490 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1491
1492 nr += 2;
1493 }
1494
1495 if (c->dest_depth_reg)
1496 {
1497 GLuint comp = c->dest_depth_reg / 2;
1498 GLuint off = c->dest_depth_reg % 2;
1499
1500 if (off != 0) {
1501 brw_push_insn_state(p);
1502 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1503
1504 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1505 /* 2nd half? */
1506 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1507 brw_pop_insn_state(p);
1508 }
1509 else {
1510 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1511 }
1512 nr += 2;
1513 }
1514
1515 if (intel->gen >= 6) {
1516 /* Load the message header. There's no implied move from src0
1517 * to the base mrf on gen6.
1518 */
1519 brw_push_insn_state(p);
1520 brw_set_mask_control(p, BRW_MASK_DISABLE);
1521 brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0));
1522 brw_pop_insn_state(p);
1523
1524 if (target != 0) {
1525 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1526 0,
1527 2), BRW_REGISTER_TYPE_UD),
1528 brw_imm_ud(target));
1529 }
1530 }
1531
1532 if (!c->runtime_check_aads_emit) {
1533 if (c->aa_dest_stencil_reg)
1534 emit_aa(c, arg1, 2);
1535
1536 fire_fb_write(c, 0, nr, target, eot);
1537 }
1538 else {
1539 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1540 struct brw_reg ip = brw_ip_reg();
1541 struct brw_instruction *jmp;
1542
1543 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1544 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1545 brw_AND(p,
1546 v1_null_ud,
1547 get_element_ud(brw_vec8_grf(1,0), 6),
1548 brw_imm_ud(1<<26));
1549
1550 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1551 {
1552 emit_aa(c, arg1, 2);
1553 fire_fb_write(c, 0, nr, target, eot);
1554 /* note - thread killed in subroutine */
1555 }
1556 brw_land_fwd_jump(p, jmp);
1557
1558 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1559 */
1560 fire_fb_write(c, 1, nr-1, target, eot);
1561 }
1562 }
1563
1564 /**
1565 * Move a GPR to scratch memory.
1566 */
1567 static void emit_spill( struct brw_wm_compile *c,
1568 struct brw_reg reg,
1569 GLuint slot )
1570 {
1571 struct brw_compile *p = &c->func;
1572
1573 /*
1574 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1575 */
1576 brw_MOV(p, brw_message_reg(2), reg);
1577
1578 /*
1579 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1580 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1581 */
1582 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1583 }
1584
1585
1586 /**
1587 * Load a GPR from scratch memory.
1588 */
1589 static void emit_unspill( struct brw_wm_compile *c,
1590 struct brw_reg reg,
1591 GLuint slot )
1592 {
1593 struct brw_compile *p = &c->func;
1594
1595 /* Slot 0 is the undef value.
1596 */
1597 if (slot == 0) {
1598 brw_MOV(p, reg, brw_imm_f(0));
1599 return;
1600 }
1601
1602 /*
1603 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1604 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1605 */
1606
1607 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1608 }
1609
1610
1611 /**
1612 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1613 * Args with unspill_reg != 0 will be loaded from scratch memory.
1614 */
1615 static void get_argument_regs( struct brw_wm_compile *c,
1616 struct brw_wm_ref *arg[],
1617 struct brw_reg *regs )
1618 {
1619 GLuint i;
1620
1621 for (i = 0; i < 4; i++) {
1622 if (arg[i]) {
1623 if (arg[i]->unspill_reg)
1624 emit_unspill(c,
1625 brw_vec8_grf(arg[i]->unspill_reg, 0),
1626 arg[i]->value->spill_slot);
1627
1628 regs[i] = arg[i]->hw_reg;
1629 }
1630 else {
1631 regs[i] = brw_null_reg();
1632 }
1633 }
1634 }
1635
1636
1637 /**
1638 * For values that have a spill_slot!=0, write those regs to scratch memory.
1639 */
1640 static void spill_values( struct brw_wm_compile *c,
1641 struct brw_wm_value *values,
1642 GLuint nr )
1643 {
1644 GLuint i;
1645
1646 for (i = 0; i < nr; i++)
1647 if (values[i].spill_slot)
1648 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1649 }
1650
1651
1652 /* Emit the fragment program instructions here.
1653 */
1654 void brw_wm_emit( struct brw_wm_compile *c )
1655 {
1656 struct brw_compile *p = &c->func;
1657 struct intel_context *intel = &p->brw->intel;
1658 GLuint insn;
1659
1660 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1661 if (intel->gen >= 6)
1662 brw_set_acc_write_control(p, 1);
1663
1664 /* Check if any of the payload regs need to be spilled:
1665 */
1666 spill_values(c, c->payload.depth, 4);
1667 spill_values(c, c->creg, c->nr_creg);
1668 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1669
1670
1671 for (insn = 0; insn < c->nr_insns; insn++) {
1672
1673 struct brw_wm_instruction *inst = &c->instruction[insn];
1674 struct brw_reg args[3][4], dst[4];
1675 GLuint i, dst_flags;
1676
1677 /* Get argument regs:
1678 */
1679 for (i = 0; i < 3; i++)
1680 get_argument_regs(c, inst->src[i], args[i]);
1681
1682 /* Get dest regs:
1683 */
1684 for (i = 0; i < 4; i++)
1685 if (inst->dst[i])
1686 dst[i] = inst->dst[i]->hw_reg;
1687 else
1688 dst[i] = brw_null_reg();
1689
1690 /* Flags
1691 */
1692 dst_flags = inst->writemask;
1693 if (inst->saturate)
1694 dst_flags |= SATURATE;
1695
1696 switch (inst->opcode) {
1697 /* Generated instructions for calculating triangle interpolants:
1698 */
1699 case WM_PIXELXY:
1700 emit_pixel_xy(c, dst, dst_flags);
1701 break;
1702
1703 case WM_DELTAXY:
1704 emit_delta_xy(p, dst, dst_flags, args[0]);
1705 break;
1706
1707 case WM_WPOSXY:
1708 emit_wpos_xy(c, dst, dst_flags, args[0]);
1709 break;
1710
1711 case WM_PIXELW:
1712 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1713 break;
1714
1715 case WM_LINTERP:
1716 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1717 break;
1718
1719 case WM_PINTERP:
1720 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1721 break;
1722
1723 case WM_CINTERP:
1724 emit_cinterp(p, dst, dst_flags, args[0]);
1725 break;
1726
1727 case WM_FB_WRITE:
1728 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1729 break;
1730
1731 case WM_FRONTFACING:
1732 emit_frontfacing(p, dst, dst_flags);
1733 break;
1734
1735 /* Straightforward arithmetic:
1736 */
1737 case OPCODE_ADD:
1738 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1739 break;
1740
1741 case OPCODE_FRC:
1742 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1743 break;
1744
1745 case OPCODE_FLR:
1746 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1747 break;
1748
1749 case OPCODE_DDX:
1750 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1751 break;
1752
1753 case OPCODE_DDY:
1754 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1755 break;
1756
1757 case OPCODE_DP2:
1758 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1759 break;
1760
1761 case OPCODE_DP3:
1762 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1763 break;
1764
1765 case OPCODE_DP4:
1766 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1767 break;
1768
1769 case OPCODE_DPH:
1770 emit_dph(p, dst, dst_flags, args[0], args[1]);
1771 break;
1772
1773 case OPCODE_TRUNC:
1774 for (i = 0; i < 4; i++) {
1775 if (dst_flags & (1<<i)) {
1776 brw_RNDZ(p, dst[i], args[0][i]);
1777 }
1778 }
1779 break;
1780
1781 case OPCODE_LRP:
1782 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1783 break;
1784
1785 case OPCODE_MAD:
1786 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1787 break;
1788
1789 case OPCODE_MOV:
1790 case OPCODE_SWZ:
1791 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1792 break;
1793
1794 case OPCODE_MUL:
1795 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1796 break;
1797
1798 case OPCODE_XPD:
1799 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1800 break;
1801
1802 /* Higher math functions:
1803 */
1804 case OPCODE_RCP:
1805 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1806 break;
1807
1808 case OPCODE_RSQ:
1809 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1810 break;
1811
1812 case OPCODE_SIN:
1813 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1814 break;
1815
1816 case OPCODE_COS:
1817 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1818 break;
1819
1820 case OPCODE_EX2:
1821 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1822 break;
1823
1824 case OPCODE_LG2:
1825 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1826 break;
1827
1828 case OPCODE_SCS:
1829 /* There is an scs math function, but it would need some
1830 * fixup for 16-element execution.
1831 */
1832 if (dst_flags & WRITEMASK_X)
1833 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1834 if (dst_flags & WRITEMASK_Y)
1835 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1836 break;
1837
1838 case OPCODE_POW:
1839 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1840 break;
1841
1842 /* Comparisons:
1843 */
1844 case OPCODE_CMP:
1845 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1846 break;
1847
1848 case OPCODE_MAX:
1849 emit_max(p, dst, dst_flags, args[0], args[1]);
1850 break;
1851
1852 case OPCODE_MIN:
1853 emit_min(p, dst, dst_flags, args[0], args[1]);
1854 break;
1855
1856 case OPCODE_SLT:
1857 emit_slt(p, dst, dst_flags, args[0], args[1]);
1858 break;
1859
1860 case OPCODE_SLE:
1861 emit_sle(p, dst, dst_flags, args[0], args[1]);
1862 break;
1863 case OPCODE_SGT:
1864 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1865 break;
1866 case OPCODE_SGE:
1867 emit_sge(p, dst, dst_flags, args[0], args[1]);
1868 break;
1869 case OPCODE_SEQ:
1870 emit_seq(p, dst, dst_flags, args[0], args[1]);
1871 break;
1872 case OPCODE_SNE:
1873 emit_sne(p, dst, dst_flags, args[0], args[1]);
1874 break;
1875
1876 case OPCODE_SSG:
1877 emit_sign(p, dst, dst_flags, args[0]);
1878 break;
1879
1880 case OPCODE_LIT:
1881 emit_lit(c, dst, dst_flags, args[0]);
1882 break;
1883
1884 /* Texturing operations:
1885 */
1886 case OPCODE_TEX:
1887 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1888 inst->tex_idx, inst->tex_unit,
1889 inst->tex_shadow);
1890 break;
1891
1892 case OPCODE_TXB:
1893 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1894 inst->tex_idx, inst->tex_unit);
1895 break;
1896
1897 case OPCODE_KIL:
1898 emit_kil(c, args[0]);
1899 break;
1900
1901 default:
1902 printf("Unsupported opcode %i (%s) in fragment shader\n",
1903 inst->opcode, inst->opcode < MAX_OPCODE ?
1904 _mesa_opcode_string(inst->opcode) :
1905 "unknown");
1906 }
1907
1908 for (i = 0; i < 4; i++)
1909 if (inst->dst[i] && inst->dst[i]->spill_slot)
1910 emit_spill(c,
1911 inst->dst[i]->hw_reg,
1912 inst->dst[i]->spill_slot);
1913 }
1914
1915 /* Only properly tested on ILK */
1916 if (p->brw->intel.gen == 5) {
1917 brw_remove_duplicate_mrf_moves(p);
1918 if (c->dispatch_width == 16)
1919 brw_remove_grf_to_mrf_moves(p);
1920 }
1921
1922 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1923 int i;
1924
1925 printf("wm-native:\n");
1926 for (i = 0; i < p->nr_insn; i++)
1927 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1928 printf("\n");
1929 }
1930 }
1931