intel: Clarify the depthRb == stencilRb logic.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Return the SrcReg index of the channels that can be immediate float operands
55 * instead of usage of PROGRAM_CONSTANT values through push/pull.
56 */
57 GLboolean
58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
59 {
60 int opcode_array[] = {
61 [OPCODE_ADD] = 2,
62 [OPCODE_CMP] = 3,
63 [OPCODE_DP3] = 2,
64 [OPCODE_DP4] = 2,
65 [OPCODE_DPH] = 2,
66 [OPCODE_MAX] = 2,
67 [OPCODE_MIN] = 2,
68 [OPCODE_MOV] = 1,
69 [OPCODE_MUL] = 2,
70 [OPCODE_SEQ] = 2,
71 [OPCODE_SGE] = 2,
72 [OPCODE_SGT] = 2,
73 [OPCODE_SLE] = 2,
74 [OPCODE_SLT] = 2,
75 [OPCODE_SNE] = 2,
76 [OPCODE_SWZ] = 1,
77 [OPCODE_XPD] = 2,
78 };
79
80 /* These opcodes get broken down in a way that allow two
81 * args to be immediates.
82 */
83 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
84 if (arg == 1 || arg == 2)
85 return GL_TRUE;
86 }
87
88 if (opcode > ARRAY_SIZE(opcode_array))
89 return GL_FALSE;
90
91 return arg == opcode_array[opcode] - 1;
92 }
93
94 /**
95 * Computes the screen-space x,y position of the pixels.
96 *
97 * This will be used by emit_delta_xy() or emit_wpos_xy() for
98 * interpolation of attributes..
99 *
100 * Payload R0:
101 *
102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
103 * corresponding to each of the 16 execution channels.
104 * R0.1..8 -- ?
105 * R1.0 -- triangle vertex 0.X
106 * R1.1 -- triangle vertex 0.Y
107 * R1.2 -- tile 0 x,y coords (2 packed uwords)
108 * R1.3 -- tile 1 x,y coords (2 packed uwords)
109 * R1.4 -- tile 2 x,y coords (2 packed uwords)
110 * R1.5 -- tile 3 x,y coords (2 packed uwords)
111 * R1.6 -- ?
112 * R1.7 -- ?
113 * R1.8 -- ?
114 */
115 void emit_pixel_xy(struct brw_wm_compile *c,
116 const struct brw_reg *dst,
117 GLuint mask)
118 {
119 struct brw_compile *p = &c->func;
120 struct brw_reg r1 = brw_vec1_grf(1, 0);
121 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
122 struct brw_reg dst0_uw, dst1_uw;
123
124 brw_push_insn_state(p);
125 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
126
127 if (c->dispatch_width == 16) {
128 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
129 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
130 } else {
131 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
132 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
133 }
134
135 /* Calculate pixel centers by adding 1 or 0 to each of the
136 * micro-tile coordinates passed in r1.
137 */
138 if (mask & WRITEMASK_X) {
139 brw_ADD(p,
140 dst0_uw,
141 stride(suboffset(r1_uw, 4), 2, 4, 0),
142 brw_imm_v(0x10101010));
143 }
144
145 if (mask & WRITEMASK_Y) {
146 brw_ADD(p,
147 dst1_uw,
148 stride(suboffset(r1_uw,5), 2, 4, 0),
149 brw_imm_v(0x11001100));
150 }
151 brw_pop_insn_state(p);
152 }
153
154 /**
155 * Computes the screen-space x,y distance of the pixels from the start
156 * vertex.
157 *
158 * This will be used in linterp or pinterp with the start vertex value
159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
160 * to produce interpolated attribute values.
161 */
162 void emit_delta_xy(struct brw_compile *p,
163 const struct brw_reg *dst,
164 GLuint mask,
165 const struct brw_reg *arg0)
166 {
167 struct intel_context *intel = &p->brw->intel;
168 struct brw_reg r1 = brw_vec1_grf(1, 0);
169
170 if (mask == 0)
171 return;
172
173 assert(mask == WRITEMASK_XY);
174
175 if (intel->gen >= 6) {
176 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
177 Just add them with 0.0 for dst reg.. */
178 r1 = brw_imm_v(0x00000000);
179 brw_ADD(p,
180 dst[0],
181 retype(arg0[0], BRW_REGISTER_TYPE_UW),
182 r1);
183 brw_ADD(p,
184 dst[1],
185 retype(arg0[1], BRW_REGISTER_TYPE_UW),
186 r1);
187 return;
188 }
189
190 /* Calc delta X,Y by subtracting origin in r1 from the pixel
191 * centers produced by emit_pixel_xy().
192 */
193 brw_ADD(p,
194 dst[0],
195 retype(arg0[0], BRW_REGISTER_TYPE_UW),
196 negate(r1));
197 brw_ADD(p,
198 dst[1],
199 retype(arg0[1], BRW_REGISTER_TYPE_UW),
200 negate(suboffset(r1,1)));
201 }
202
203 /**
204 * Computes the pixel offset from the window origin for gl_FragCoord().
205 */
206 void emit_wpos_xy(struct brw_wm_compile *c,
207 const struct brw_reg *dst,
208 GLuint mask,
209 const struct brw_reg *arg0)
210 {
211 struct brw_compile *p = &c->func;
212 struct intel_context *intel = &p->brw->intel;
213 struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
214 struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
215
216 if (mask & WRITEMASK_X) {
217 if (intel->gen >= 6) {
218 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
219 brw_MOV(p, delta_x_f, delta_x);
220 delta_x = delta_x_f;
221 }
222
223 if (c->fp->program.PixelCenterInteger) {
224 /* X' = X */
225 brw_MOV(p, dst[0], delta_x);
226 } else {
227 /* X' = X + 0.5 */
228 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
229 }
230 }
231
232 if (mask & WRITEMASK_Y) {
233 if (intel->gen >= 6) {
234 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
235 brw_MOV(p, delta_y_f, delta_y);
236 delta_y = delta_y_f;
237 }
238
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
241 /* Y' = Y */
242 brw_MOV(p, dst[1], delta_y);
243 } else {
244 brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
245 }
246 } else {
247 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
248
249 /* Y' = (height - 1) - Y + center */
250 brw_ADD(p, dst[1], negate(delta_y),
251 brw_imm_f(c->key.drawable_height - 1 + center_offset));
252 }
253 }
254 }
255
256
257 void emit_pixel_w(struct brw_wm_compile *c,
258 const struct brw_reg *dst,
259 GLuint mask,
260 const struct brw_reg *arg0,
261 const struct brw_reg *deltas)
262 {
263 struct brw_compile *p = &c->func;
264 struct intel_context *intel = &p->brw->intel;
265 struct brw_reg src;
266 struct brw_reg temp_dst;
267
268 if (intel->gen >= 6)
269 temp_dst = dst[3];
270 else
271 temp_dst = brw_message_reg(2);
272
273 assert(intel->gen < 6);
274
275 /* Don't need this if all you are doing is interpolating color, for
276 * instance.
277 */
278 if (mask & WRITEMASK_W) {
279 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
280
281 /* Calc 1/w - just linterp wpos[3] optimized by putting the
282 * result straight into a message reg.
283 */
284 if (can_do_pln(intel, deltas)) {
285 brw_PLN(p, temp_dst, interp3, deltas[0]);
286 } else {
287 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
288 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
289 }
290
291 /* Calc w */
292 if (intel->gen >= 6)
293 src = temp_dst;
294 else
295 src = brw_null_reg();
296
297 if (c->dispatch_width == 16) {
298 brw_math_16(p, dst[3],
299 BRW_MATH_FUNCTION_INV,
300 BRW_MATH_SATURATE_NONE,
301 2, src,
302 BRW_MATH_PRECISION_FULL);
303 } else {
304 brw_math(p, dst[3],
305 BRW_MATH_FUNCTION_INV,
306 BRW_MATH_SATURATE_NONE,
307 2, src,
308 BRW_MATH_DATA_VECTOR,
309 BRW_MATH_PRECISION_FULL);
310 }
311 }
312 }
313
314 void emit_linterp(struct brw_compile *p,
315 const struct brw_reg *dst,
316 GLuint mask,
317 const struct brw_reg *arg0,
318 const struct brw_reg *deltas)
319 {
320 struct intel_context *intel = &p->brw->intel;
321 struct brw_reg interp[4];
322 GLuint nr = arg0[0].nr;
323 GLuint i;
324
325 interp[0] = brw_vec1_grf(nr, 0);
326 interp[1] = brw_vec1_grf(nr, 4);
327 interp[2] = brw_vec1_grf(nr+1, 0);
328 interp[3] = brw_vec1_grf(nr+1, 4);
329
330 for (i = 0; i < 4; i++) {
331 if (mask & (1<<i)) {
332 if (intel->gen >= 6) {
333 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
334 } else if (can_do_pln(intel, deltas)) {
335 brw_PLN(p, dst[i], interp[i], deltas[0]);
336 } else {
337 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
338 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
339 }
340 }
341 }
342 }
343
344
345 void emit_pinterp(struct brw_compile *p,
346 const struct brw_reg *dst,
347 GLuint mask,
348 const struct brw_reg *arg0,
349 const struct brw_reg *deltas,
350 const struct brw_reg *w)
351 {
352 struct intel_context *intel = &p->brw->intel;
353 struct brw_reg interp[4];
354 GLuint nr = arg0[0].nr;
355 GLuint i;
356
357 if (intel->gen >= 6) {
358 emit_linterp(p, dst, mask, arg0, interp);
359 return;
360 }
361
362 interp[0] = brw_vec1_grf(nr, 0);
363 interp[1] = brw_vec1_grf(nr, 4);
364 interp[2] = brw_vec1_grf(nr+1, 0);
365 interp[3] = brw_vec1_grf(nr+1, 4);
366
367 for (i = 0; i < 4; i++) {
368 if (mask & (1<<i)) {
369 if (can_do_pln(intel, deltas)) {
370 brw_PLN(p, dst[i], interp[i], deltas[0]);
371 } else {
372 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
373 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
374 }
375 }
376 }
377 for (i = 0; i < 4; i++) {
378 if (mask & (1<<i)) {
379 brw_MUL(p, dst[i], dst[i], w[3]);
380 }
381 }
382 }
383
384
385 void emit_cinterp(struct brw_compile *p,
386 const struct brw_reg *dst,
387 GLuint mask,
388 const struct brw_reg *arg0)
389 {
390 struct brw_reg interp[4];
391 GLuint nr = arg0[0].nr;
392 GLuint i;
393
394 interp[0] = brw_vec1_grf(nr, 0);
395 interp[1] = brw_vec1_grf(nr, 4);
396 interp[2] = brw_vec1_grf(nr+1, 0);
397 interp[3] = brw_vec1_grf(nr+1, 4);
398
399 for (i = 0; i < 4; i++) {
400 if (mask & (1<<i)) {
401 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
402 }
403 }
404 }
405
406 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
407 void emit_frontfacing(struct brw_compile *p,
408 const struct brw_reg *dst,
409 GLuint mask)
410 {
411 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
412 GLuint i;
413
414 if (!(mask & WRITEMASK_XYZW))
415 return;
416
417 for (i = 0; i < 4; i++) {
418 if (mask & (1<<i)) {
419 brw_MOV(p, dst[i], brw_imm_f(0.0));
420 }
421 }
422
423 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
424 * us front face
425 */
426 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
427 for (i = 0; i < 4; i++) {
428 if (mask & (1<<i)) {
429 brw_MOV(p, dst[i], brw_imm_f(1.0));
430 }
431 }
432 brw_set_predicate_control_flag_value(p, 0xff);
433 }
434
435 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
436 * looking like:
437 *
438 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
439 *
440 * and we're trying to produce:
441 *
442 * DDX DDY
443 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
444 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
445 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
446 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
447 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
448 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
449 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
450 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
451 *
452 * and add another set of two more subspans if in 16-pixel dispatch mode.
453 *
454 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
455 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
456 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
457 * between each other. We could probably do it like ddx and swizzle the right
458 * order later, but bail for now and just produce
459 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
460 */
461 void emit_ddxy(struct brw_compile *p,
462 const struct brw_reg *dst,
463 GLuint mask,
464 GLboolean is_ddx,
465 const struct brw_reg *arg0)
466 {
467 int i;
468 struct brw_reg src0, src1;
469
470 if (mask & SATURATE)
471 brw_set_saturate(p, 1);
472 for (i = 0; i < 4; i++ ) {
473 if (mask & (1<<i)) {
474 if (is_ddx) {
475 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
476 BRW_REGISTER_TYPE_F,
477 BRW_VERTICAL_STRIDE_2,
478 BRW_WIDTH_2,
479 BRW_HORIZONTAL_STRIDE_0,
480 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
481 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
482 BRW_REGISTER_TYPE_F,
483 BRW_VERTICAL_STRIDE_2,
484 BRW_WIDTH_2,
485 BRW_HORIZONTAL_STRIDE_0,
486 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
487 } else {
488 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
489 BRW_REGISTER_TYPE_F,
490 BRW_VERTICAL_STRIDE_4,
491 BRW_WIDTH_4,
492 BRW_HORIZONTAL_STRIDE_0,
493 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
494 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
495 BRW_REGISTER_TYPE_F,
496 BRW_VERTICAL_STRIDE_4,
497 BRW_WIDTH_4,
498 BRW_HORIZONTAL_STRIDE_0,
499 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
500 }
501 brw_ADD(p, dst[i], src0, negate(src1));
502 }
503 }
504 if (mask & SATURATE)
505 brw_set_saturate(p, 0);
506 }
507
508 void emit_alu1(struct brw_compile *p,
509 struct brw_instruction *(*func)(struct brw_compile *,
510 struct brw_reg,
511 struct brw_reg),
512 const struct brw_reg *dst,
513 GLuint mask,
514 const struct brw_reg *arg0)
515 {
516 GLuint i;
517
518 if (mask & SATURATE)
519 brw_set_saturate(p, 1);
520
521 for (i = 0; i < 4; i++) {
522 if (mask & (1<<i)) {
523 func(p, dst[i], arg0[i]);
524 }
525 }
526
527 if (mask & SATURATE)
528 brw_set_saturate(p, 0);
529 }
530
531
532 void emit_alu2(struct brw_compile *p,
533 struct brw_instruction *(*func)(struct brw_compile *,
534 struct brw_reg,
535 struct brw_reg,
536 struct brw_reg),
537 const struct brw_reg *dst,
538 GLuint mask,
539 const struct brw_reg *arg0,
540 const struct brw_reg *arg1)
541 {
542 GLuint i;
543
544 if (mask & SATURATE)
545 brw_set_saturate(p, 1);
546
547 for (i = 0; i < 4; i++) {
548 if (mask & (1<<i)) {
549 func(p, dst[i], arg0[i], arg1[i]);
550 }
551 }
552
553 if (mask & SATURATE)
554 brw_set_saturate(p, 0);
555 }
556
557
558 void emit_mad(struct brw_compile *p,
559 const struct brw_reg *dst,
560 GLuint mask,
561 const struct brw_reg *arg0,
562 const struct brw_reg *arg1,
563 const struct brw_reg *arg2)
564 {
565 GLuint i;
566
567 for (i = 0; i < 4; i++) {
568 if (mask & (1<<i)) {
569 brw_MUL(p, dst[i], arg0[i], arg1[i]);
570
571 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
572 brw_ADD(p, dst[i], dst[i], arg2[i]);
573 brw_set_saturate(p, 0);
574 }
575 }
576 }
577
578 void emit_lrp(struct brw_compile *p,
579 const struct brw_reg *dst,
580 GLuint mask,
581 const struct brw_reg *arg0,
582 const struct brw_reg *arg1,
583 const struct brw_reg *arg2)
584 {
585 GLuint i;
586
587 /* Uses dst as a temporary:
588 */
589 for (i = 0; i < 4; i++) {
590 if (mask & (1<<i)) {
591 /* Can I use the LINE instruction for this?
592 */
593 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
594 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
595
596 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
597 brw_MAC(p, dst[i], arg0[i], arg1[i]);
598 brw_set_saturate(p, 0);
599 }
600 }
601 }
602
603 void emit_sop(struct brw_compile *p,
604 const struct brw_reg *dst,
605 GLuint mask,
606 GLuint cond,
607 const struct brw_reg *arg0,
608 const struct brw_reg *arg1)
609 {
610 GLuint i;
611
612 for (i = 0; i < 4; i++) {
613 if (mask & (1<<i)) {
614 brw_push_insn_state(p);
615 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
616 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
617 brw_MOV(p, dst[i], brw_imm_f(0));
618 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
619 brw_MOV(p, dst[i], brw_imm_f(1.0));
620 brw_pop_insn_state(p);
621 }
622 }
623 }
624
625 static void emit_slt( struct brw_compile *p,
626 const struct brw_reg *dst,
627 GLuint mask,
628 const struct brw_reg *arg0,
629 const struct brw_reg *arg1 )
630 {
631 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
632 }
633
634 static void emit_sle( struct brw_compile *p,
635 const struct brw_reg *dst,
636 GLuint mask,
637 const struct brw_reg *arg0,
638 const struct brw_reg *arg1 )
639 {
640 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
641 }
642
643 static void emit_sgt( struct brw_compile *p,
644 const struct brw_reg *dst,
645 GLuint mask,
646 const struct brw_reg *arg0,
647 const struct brw_reg *arg1 )
648 {
649 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
650 }
651
652 static void emit_sge( struct brw_compile *p,
653 const struct brw_reg *dst,
654 GLuint mask,
655 const struct brw_reg *arg0,
656 const struct brw_reg *arg1 )
657 {
658 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
659 }
660
661 static void emit_seq( struct brw_compile *p,
662 const struct brw_reg *dst,
663 GLuint mask,
664 const struct brw_reg *arg0,
665 const struct brw_reg *arg1 )
666 {
667 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
668 }
669
670 static void emit_sne( struct brw_compile *p,
671 const struct brw_reg *dst,
672 GLuint mask,
673 const struct brw_reg *arg0,
674 const struct brw_reg *arg1 )
675 {
676 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
677 }
678
679 void emit_cmp(struct brw_compile *p,
680 const struct brw_reg *dst,
681 GLuint mask,
682 const struct brw_reg *arg0,
683 const struct brw_reg *arg1,
684 const struct brw_reg *arg2)
685 {
686 GLuint i;
687
688 for (i = 0; i < 4; i++) {
689 if (mask & (1<<i)) {
690 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
691
692 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
693 brw_SEL(p, dst[i], arg1[i], arg2[i]);
694 brw_set_saturate(p, 0);
695 brw_set_predicate_control_flag_value(p, 0xff);
696 }
697 }
698 }
699
700 void emit_sign(struct brw_compile *p,
701 const struct brw_reg *dst,
702 GLuint mask,
703 const struct brw_reg *arg0)
704 {
705 GLuint i;
706
707 for (i = 0; i < 4; i++) {
708 if (mask & (1<<i)) {
709 brw_MOV(p, dst[i], brw_imm_f(0.0));
710
711 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
712 brw_MOV(p, dst[i], brw_imm_f(-1.0));
713 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
714
715 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
716 brw_MOV(p, dst[i], brw_imm_f(1.0));
717 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
718 }
719 }
720 }
721
722 void emit_max(struct brw_compile *p,
723 const struct brw_reg *dst,
724 GLuint mask,
725 const struct brw_reg *arg0,
726 const struct brw_reg *arg1)
727 {
728 GLuint i;
729
730 for (i = 0; i < 4; i++) {
731 if (mask & (1<<i)) {
732 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
733
734 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
735 brw_SEL(p, dst[i], arg0[i], arg1[i]);
736 brw_set_saturate(p, 0);
737 brw_set_predicate_control_flag_value(p, 0xff);
738 }
739 }
740 }
741
742 void emit_min(struct brw_compile *p,
743 const struct brw_reg *dst,
744 GLuint mask,
745 const struct brw_reg *arg0,
746 const struct brw_reg *arg1)
747 {
748 GLuint i;
749
750 for (i = 0; i < 4; i++) {
751 if (mask & (1<<i)) {
752 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
753
754 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
755 brw_SEL(p, dst[i], arg0[i], arg1[i]);
756 brw_set_saturate(p, 0);
757 brw_set_predicate_control_flag_value(p, 0xff);
758 }
759 }
760 }
761
762
763 void emit_dp2(struct brw_compile *p,
764 const struct brw_reg *dst,
765 GLuint mask,
766 const struct brw_reg *arg0,
767 const struct brw_reg *arg1)
768 {
769 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
770
771 if (!(mask & WRITEMASK_XYZW))
772 return; /* Do not emit dead code */
773
774 assert(is_power_of_two(mask & WRITEMASK_XYZW));
775
776 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
777
778 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
779 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
780 brw_set_saturate(p, 0);
781 }
782
783
784 void emit_dp3(struct brw_compile *p,
785 const struct brw_reg *dst,
786 GLuint mask,
787 const struct brw_reg *arg0,
788 const struct brw_reg *arg1)
789 {
790 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
791
792 if (!(mask & WRITEMASK_XYZW))
793 return; /* Do not emit dead code */
794
795 assert(is_power_of_two(mask & WRITEMASK_XYZW));
796
797 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
798 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
799
800 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
801 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
802 brw_set_saturate(p, 0);
803 }
804
805
806 void emit_dp4(struct brw_compile *p,
807 const struct brw_reg *dst,
808 GLuint mask,
809 const struct brw_reg *arg0,
810 const struct brw_reg *arg1)
811 {
812 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
813
814 if (!(mask & WRITEMASK_XYZW))
815 return; /* Do not emit dead code */
816
817 assert(is_power_of_two(mask & WRITEMASK_XYZW));
818
819 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
820 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
821 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
822
823 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
824 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
825 brw_set_saturate(p, 0);
826 }
827
828
829 void emit_dph(struct brw_compile *p,
830 const struct brw_reg *dst,
831 GLuint mask,
832 const struct brw_reg *arg0,
833 const struct brw_reg *arg1)
834 {
835 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
836
837 if (!(mask & WRITEMASK_XYZW))
838 return; /* Do not emit dead code */
839
840 assert(is_power_of_two(mask & WRITEMASK_XYZW));
841
842 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
843 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
844 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
845
846 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
847 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
848 brw_set_saturate(p, 0);
849 }
850
851
852 void emit_xpd(struct brw_compile *p,
853 const struct brw_reg *dst,
854 GLuint mask,
855 const struct brw_reg *arg0,
856 const struct brw_reg *arg1)
857 {
858 GLuint i;
859
860 assert((mask & WRITEMASK_W) != WRITEMASK_W);
861
862 for (i = 0 ; i < 3; i++) {
863 if (mask & (1<<i)) {
864 GLuint i2 = (i+2)%3;
865 GLuint i1 = (i+1)%3;
866
867 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
868
869 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
870 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
871 brw_set_saturate(p, 0);
872 }
873 }
874 }
875
876
877 void emit_math1(struct brw_wm_compile *c,
878 GLuint function,
879 const struct brw_reg *dst,
880 GLuint mask,
881 const struct brw_reg *arg0)
882 {
883 struct brw_compile *p = &c->func;
884 struct intel_context *intel = &p->brw->intel;
885 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
886 GLuint saturate = ((mask & SATURATE) ?
887 BRW_MATH_SATURATE_SATURATE :
888 BRW_MATH_SATURATE_NONE);
889 struct brw_reg src;
890
891 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
892 arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
893 arg0[0].negate || arg0[0].abs)) {
894 /* Gen6 math requires that source and dst horizontal stride be 1,
895 * and that the argument be in the GRF.
896 *
897 * The hardware ignores source modifiers (negate and abs) on math
898 * instructions, so we also move to a temp to set those up.
899 */
900 src = dst[dst_chan];
901 brw_MOV(p, src, arg0[0]);
902 } else {
903 src = arg0[0];
904 }
905
906 if (!(mask & WRITEMASK_XYZW))
907 return; /* Do not emit dead code */
908
909 assert(is_power_of_two(mask & WRITEMASK_XYZW));
910
911 /* Send two messages to perform all 16 operations:
912 */
913 brw_push_insn_state(p);
914 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
915 brw_math(p,
916 dst[dst_chan],
917 function,
918 saturate,
919 2,
920 src,
921 BRW_MATH_DATA_VECTOR,
922 BRW_MATH_PRECISION_FULL);
923
924 if (c->dispatch_width == 16) {
925 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
926 brw_math(p,
927 offset(dst[dst_chan],1),
928 function,
929 saturate,
930 3,
931 sechalf(src),
932 BRW_MATH_DATA_VECTOR,
933 BRW_MATH_PRECISION_FULL);
934 }
935 brw_pop_insn_state(p);
936 }
937
938
939 void emit_math2(struct brw_wm_compile *c,
940 GLuint function,
941 const struct brw_reg *dst,
942 GLuint mask,
943 const struct brw_reg *arg0,
944 const struct brw_reg *arg1)
945 {
946 struct brw_compile *p = &c->func;
947 struct intel_context *intel = &p->brw->intel;
948 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
949
950 if (!(mask & WRITEMASK_XYZW))
951 return; /* Do not emit dead code */
952
953 assert(is_power_of_two(mask & WRITEMASK_XYZW));
954
955 brw_push_insn_state(p);
956
957 /* math can only operate on up to a vec8 at a time, so in
958 * dispatch_width==16 we have to do the second half manually.
959 */
960 if (intel->gen >= 6) {
961 struct brw_reg src0 = arg0[0];
962 struct brw_reg src1 = arg1[0];
963 struct brw_reg temp_dst = dst[dst_chan];
964
965 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
966 brw_MOV(p, temp_dst, src0);
967 src0 = temp_dst;
968 }
969
970 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
971 /* This is a heinous hack to get a temporary register for use
972 * in case both arg0 and arg1 are constants. Why you're
973 * doing exponentiation on constant values in the shader, we
974 * don't know.
975 *
976 * max_wm_grf is almost surely less than the maximum GRF, and
977 * gen6 doesn't care about the number of GRFs used in a
978 * shader like pre-gen6 did.
979 */
980 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
981 brw_MOV(p, temp, src1);
982 src1 = temp;
983 }
984
985 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
986 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
987 brw_math2(p,
988 temp_dst,
989 function,
990 src0,
991 src1);
992 if (c->dispatch_width == 16) {
993 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
994 brw_math2(p,
995 sechalf(temp_dst),
996 function,
997 sechalf(src0),
998 sechalf(src1));
999 }
1000 } else {
1001 GLuint saturate = ((mask & SATURATE) ?
1002 BRW_MATH_SATURATE_SATURATE :
1003 BRW_MATH_SATURATE_NONE);
1004
1005 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1006 brw_MOV(p, brw_message_reg(3), arg1[0]);
1007 if (c->dispatch_width == 16) {
1008 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1009 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1010 }
1011
1012 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1013 brw_math(p,
1014 dst[dst_chan],
1015 function,
1016 saturate,
1017 2,
1018 arg0[0],
1019 BRW_MATH_DATA_VECTOR,
1020 BRW_MATH_PRECISION_FULL);
1021
1022 /* Send two messages to perform all 16 operations:
1023 */
1024 if (c->dispatch_width == 16) {
1025 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1026 brw_math(p,
1027 offset(dst[dst_chan],1),
1028 function,
1029 saturate,
1030 4,
1031 sechalf(arg0[0]),
1032 BRW_MATH_DATA_VECTOR,
1033 BRW_MATH_PRECISION_FULL);
1034 }
1035 }
1036 brw_pop_insn_state(p);
1037 }
1038
1039
1040 void emit_tex(struct brw_wm_compile *c,
1041 struct brw_reg *dst,
1042 GLuint dst_flags,
1043 struct brw_reg *arg,
1044 struct brw_reg depth_payload,
1045 GLuint tex_idx,
1046 GLuint sampler,
1047 GLboolean shadow)
1048 {
1049 struct brw_compile *p = &c->func;
1050 struct intel_context *intel = &p->brw->intel;
1051 struct brw_reg dst_retyped;
1052 GLuint cur_mrf = 2, response_length;
1053 GLuint i, nr_texcoords;
1054 GLuint emit;
1055 GLuint msg_type;
1056 GLuint mrf_per_channel;
1057 GLuint simd_mode;
1058
1059 if (c->dispatch_width == 16) {
1060 mrf_per_channel = 2;
1061 response_length = 8;
1062 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1063 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1064 } else {
1065 mrf_per_channel = 1;
1066 response_length = 4;
1067 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1068 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1069 }
1070
1071 /* How many input regs are there?
1072 */
1073 switch (tex_idx) {
1074 case TEXTURE_1D_INDEX:
1075 emit = WRITEMASK_X;
1076 nr_texcoords = 1;
1077 break;
1078 case TEXTURE_2D_INDEX:
1079 case TEXTURE_RECT_INDEX:
1080 emit = WRITEMASK_XY;
1081 nr_texcoords = 2;
1082 break;
1083 case TEXTURE_3D_INDEX:
1084 case TEXTURE_CUBE_INDEX:
1085 emit = WRITEMASK_XYZ;
1086 nr_texcoords = 3;
1087 break;
1088 default:
1089 /* unexpected target */
1090 abort();
1091 }
1092
1093 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1094 if (intel->gen < 5 && c->dispatch_width == 8)
1095 nr_texcoords = 3;
1096
1097 /* For shadow comparisons, we have to supply u,v,r. */
1098 if (shadow)
1099 nr_texcoords = 3;
1100
1101 /* Emit the texcoords. */
1102 for (i = 0; i < nr_texcoords; i++) {
1103 if (c->key.gl_clamp_mask[i] & (1 << sampler))
1104 brw_set_saturate(p, true);
1105
1106 if (emit & (1<<i))
1107 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1108 else
1109 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1110 cur_mrf += mrf_per_channel;
1111
1112 brw_set_saturate(p, false);
1113 }
1114
1115 /* Fill in the shadow comparison reference value. */
1116 if (shadow) {
1117 if (intel->gen >= 5) {
1118 /* Fill in the cube map array index value. */
1119 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1120 cur_mrf += mrf_per_channel;
1121 } else if (c->dispatch_width == 8) {
1122 /* Fill in the LOD bias value. */
1123 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1124 cur_mrf += mrf_per_channel;
1125 }
1126 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1127 cur_mrf += mrf_per_channel;
1128 }
1129
1130 if (intel->gen >= 5) {
1131 if (shadow)
1132 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1133 else
1134 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1135 } else {
1136 /* Note that G45 and older determines shadow compare and dispatch width
1137 * from message length for most messages.
1138 */
1139 if (c->dispatch_width == 16 && shadow)
1140 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1141 else
1142 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1143 }
1144
1145 brw_SAMPLE(p,
1146 dst_retyped,
1147 1,
1148 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1149 SURF_INDEX_TEXTURE(sampler),
1150 sampler,
1151 dst_flags & WRITEMASK_XYZW,
1152 msg_type,
1153 response_length,
1154 cur_mrf - 1,
1155 0,
1156 1,
1157 simd_mode);
1158 }
1159
1160
1161 void emit_txb(struct brw_wm_compile *c,
1162 struct brw_reg *dst,
1163 GLuint dst_flags,
1164 struct brw_reg *arg,
1165 struct brw_reg depth_payload,
1166 GLuint tex_idx,
1167 GLuint sampler)
1168 {
1169 struct brw_compile *p = &c->func;
1170 struct intel_context *intel = &p->brw->intel;
1171 GLuint msgLength;
1172 GLuint msg_type;
1173 GLuint mrf_per_channel;
1174 GLuint response_length;
1175 struct brw_reg dst_retyped;
1176
1177 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1178 * samples, so we'll use the 16-wide instruction, leave the second halves
1179 * undefined, and trust the execution mask to keep the undefined pixels
1180 * from mattering.
1181 */
1182 if (c->dispatch_width == 16 || intel->gen < 5) {
1183 if (intel->gen >= 5)
1184 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1185 else
1186 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1187 mrf_per_channel = 2;
1188 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1189 response_length = 8;
1190 } else {
1191 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1192 mrf_per_channel = 1;
1193 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1194 response_length = 4;
1195 }
1196
1197 /* Shadow ignored for txb. */
1198 switch (tex_idx) {
1199 case TEXTURE_1D_INDEX:
1200 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1201 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1202 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1203 break;
1204 case TEXTURE_2D_INDEX:
1205 case TEXTURE_RECT_INDEX:
1206 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1207 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1208 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1209 break;
1210 case TEXTURE_3D_INDEX:
1211 case TEXTURE_CUBE_INDEX:
1212 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1213 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1214 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1215 break;
1216 default:
1217 /* unexpected target */
1218 abort();
1219 }
1220
1221 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1222 msgLength = 2 + 4 * mrf_per_channel - 1;
1223
1224 brw_SAMPLE(p,
1225 dst_retyped,
1226 1,
1227 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1228 SURF_INDEX_TEXTURE(sampler),
1229 sampler,
1230 dst_flags & WRITEMASK_XYZW,
1231 msg_type,
1232 response_length,
1233 msgLength,
1234 0,
1235 1,
1236 BRW_SAMPLER_SIMD_MODE_SIMD16);
1237 }
1238
1239
1240 static void emit_lit(struct brw_wm_compile *c,
1241 const struct brw_reg *dst,
1242 GLuint mask,
1243 const struct brw_reg *arg0)
1244 {
1245 struct brw_compile *p = &c->func;
1246
1247 assert((mask & WRITEMASK_XW) == 0);
1248
1249 if (mask & WRITEMASK_Y) {
1250 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1251 brw_MOV(p, dst[1], arg0[0]);
1252 brw_set_saturate(p, 0);
1253 }
1254
1255 if (mask & WRITEMASK_Z) {
1256 emit_math2(c, BRW_MATH_FUNCTION_POW,
1257 &dst[2],
1258 WRITEMASK_X | (mask & SATURATE),
1259 &arg0[1],
1260 &arg0[3]);
1261 }
1262
1263 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1264 * some of the POW calculations above, but 16-wide iff statements
1265 * seem to lock c1 hardware, so this is a nasty workaround:
1266 */
1267 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1268 {
1269 if (mask & WRITEMASK_Y)
1270 brw_MOV(p, dst[1], brw_imm_f(0));
1271
1272 if (mask & WRITEMASK_Z)
1273 brw_MOV(p, dst[2], brw_imm_f(0));
1274 }
1275 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1276 }
1277
1278
1279 /* Kill pixel - set execution mask to zero for those pixels which
1280 * fail.
1281 */
1282 static void emit_kil( struct brw_wm_compile *c,
1283 struct brw_reg *arg0)
1284 {
1285 struct brw_compile *p = &c->func;
1286 struct intel_context *intel = &p->brw->intel;
1287 struct brw_reg pixelmask;
1288 GLuint i, j;
1289
1290 if (intel->gen >= 6)
1291 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1292 else
1293 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1294
1295 for (i = 0; i < 4; i++) {
1296 /* Check if we've already done the comparison for this reg
1297 * -- common when someone does KIL TEMP.wwww.
1298 */
1299 for (j = 0; j < i; j++) {
1300 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1301 break;
1302 }
1303 if (j != i)
1304 continue;
1305
1306 brw_push_insn_state(p);
1307 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1308 brw_set_predicate_control_flag_value(p, 0xff);
1309 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1310 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1311 brw_pop_insn_state(p);
1312 }
1313 }
1314
1315 static void fire_fb_write( struct brw_wm_compile *c,
1316 GLuint base_reg,
1317 GLuint nr,
1318 GLuint target,
1319 GLuint eot )
1320 {
1321 struct brw_compile *p = &c->func;
1322 struct intel_context *intel = &p->brw->intel;
1323
1324 /* Pass through control information:
1325 *
1326 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1327 */
1328 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1329 if (intel->gen < 6)
1330 {
1331 brw_push_insn_state(p);
1332 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1333 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1334 brw_MOV(p,
1335 brw_message_reg(base_reg + 1),
1336 brw_vec8_grf(1, 0));
1337 brw_pop_insn_state(p);
1338 }
1339
1340 /* Send framebuffer write message: */
1341 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1342 brw_fb_WRITE(p,
1343 c->dispatch_width,
1344 base_reg,
1345 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1346 target,
1347 nr,
1348 0,
1349 eot,
1350 GL_TRUE);
1351 }
1352
1353
1354 static void emit_aa( struct brw_wm_compile *c,
1355 struct brw_reg *arg1,
1356 GLuint reg )
1357 {
1358 struct brw_compile *p = &c->func;
1359 GLuint comp = c->aa_dest_stencil_reg / 2;
1360 GLuint off = c->aa_dest_stencil_reg % 2;
1361 struct brw_reg aa = offset(arg1[comp], off);
1362
1363 brw_push_insn_state(p);
1364 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1365 brw_MOV(p, brw_message_reg(reg), aa);
1366 brw_pop_insn_state(p);
1367 }
1368
1369
1370 /* Post-fragment-program processing. Send the results to the
1371 * framebuffer.
1372 * \param arg0 the fragment color
1373 * \param arg1 the pass-through depth value
1374 * \param arg2 the shader-computed depth value
1375 */
1376 void emit_fb_write(struct brw_wm_compile *c,
1377 struct brw_reg *arg0,
1378 struct brw_reg *arg1,
1379 struct brw_reg *arg2,
1380 GLuint target,
1381 GLuint eot)
1382 {
1383 struct brw_compile *p = &c->func;
1384 struct brw_context *brw = p->brw;
1385 struct intel_context *intel = &brw->intel;
1386 GLuint nr = 2;
1387 GLuint channel;
1388
1389 /* Reserve a space for AA - may not be needed:
1390 */
1391 if (c->aa_dest_stencil_reg)
1392 nr += 1;
1393
1394 /* I don't really understand how this achieves the color interleave
1395 * (ie RGBARGBA) in the result: [Do the saturation here]
1396 */
1397 brw_push_insn_state(p);
1398
1399 if (c->key.clamp_fragment_color)
1400 brw_set_saturate(p, 1);
1401
1402 for (channel = 0; channel < 4; channel++) {
1403 if (intel->gen >= 6) {
1404 /* gen6 SIMD16 single source DP write looks like:
1405 * m + 0: r0
1406 * m + 1: r1
1407 * m + 2: g0
1408 * m + 3: g1
1409 * m + 4: b0
1410 * m + 5: b1
1411 * m + 6: a0
1412 * m + 7: a1
1413 */
1414 if (c->dispatch_width == 16) {
1415 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1416 } else {
1417 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1418 }
1419 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1420 /* pre-gen6 SIMD16 single source DP write looks like:
1421 * m + 0: r0
1422 * m + 1: g0
1423 * m + 2: b0
1424 * m + 3: a0
1425 * m + 4: r1
1426 * m + 5: g1
1427 * m + 6: b1
1428 * m + 7: a1
1429 *
1430 * By setting the high bit of the MRF register number, we indicate
1431 * that we want COMPR4 mode - instead of doing the usual destination
1432 * + 1 for the second half we get destination + 4.
1433 */
1434 brw_MOV(p,
1435 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1436 arg0[channel]);
1437 } else {
1438 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1439 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1440 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1441 brw_MOV(p,
1442 brw_message_reg(nr + channel),
1443 arg0[channel]);
1444
1445 if (c->dispatch_width == 16) {
1446 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1447 brw_MOV(p,
1448 brw_message_reg(nr + channel + 4),
1449 sechalf(arg0[channel]));
1450 }
1451 }
1452 }
1453
1454 brw_set_saturate(p, 0);
1455
1456 /* skip over the regs populated above:
1457 */
1458 if (c->dispatch_width == 16)
1459 nr += 8;
1460 else
1461 nr += 4;
1462
1463 brw_pop_insn_state(p);
1464
1465 if (c->source_depth_to_render_target)
1466 {
1467 if (c->computes_depth)
1468 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1469 else
1470 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1471
1472 nr += 2;
1473 }
1474
1475 if (c->dest_depth_reg)
1476 {
1477 GLuint comp = c->dest_depth_reg / 2;
1478 GLuint off = c->dest_depth_reg % 2;
1479
1480 if (off != 0) {
1481 brw_push_insn_state(p);
1482 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1483
1484 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1485 /* 2nd half? */
1486 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1487 brw_pop_insn_state(p);
1488 }
1489 else {
1490 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1491 }
1492 nr += 2;
1493 }
1494
1495 if (intel->gen >= 6) {
1496 /* Load the message header. There's no implied move from src0
1497 * to the base mrf on gen6.
1498 */
1499 brw_push_insn_state(p);
1500 brw_set_mask_control(p, BRW_MASK_DISABLE);
1501 brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1502 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1503 brw_pop_insn_state(p);
1504
1505 if (target != 0) {
1506 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1507 0,
1508 2), BRW_REGISTER_TYPE_UD),
1509 brw_imm_ud(target));
1510 }
1511 }
1512
1513 if (!c->runtime_check_aads_emit) {
1514 if (c->aa_dest_stencil_reg)
1515 emit_aa(c, arg1, 2);
1516
1517 fire_fb_write(c, 0, nr, target, eot);
1518 }
1519 else {
1520 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1521 struct brw_reg ip = brw_ip_reg();
1522 struct brw_instruction *jmp;
1523
1524 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1525 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1526 brw_AND(p,
1527 v1_null_ud,
1528 get_element_ud(brw_vec8_grf(1,0), 6),
1529 brw_imm_ud(1<<26));
1530
1531 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1532 {
1533 emit_aa(c, arg1, 2);
1534 fire_fb_write(c, 0, nr, target, eot);
1535 /* note - thread killed in subroutine */
1536 }
1537 brw_land_fwd_jump(p, jmp);
1538
1539 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1540 */
1541 fire_fb_write(c, 1, nr-1, target, eot);
1542 }
1543 }
1544
1545 /**
1546 * Move a GPR to scratch memory.
1547 */
1548 static void emit_spill( struct brw_wm_compile *c,
1549 struct brw_reg reg,
1550 GLuint slot )
1551 {
1552 struct brw_compile *p = &c->func;
1553
1554 /*
1555 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1556 */
1557 brw_MOV(p, brw_message_reg(2), reg);
1558
1559 /*
1560 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1561 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1562 */
1563 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1564 }
1565
1566
1567 /**
1568 * Load a GPR from scratch memory.
1569 */
1570 static void emit_unspill( struct brw_wm_compile *c,
1571 struct brw_reg reg,
1572 GLuint slot )
1573 {
1574 struct brw_compile *p = &c->func;
1575
1576 /* Slot 0 is the undef value.
1577 */
1578 if (slot == 0) {
1579 brw_MOV(p, reg, brw_imm_f(0));
1580 return;
1581 }
1582
1583 /*
1584 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1585 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1586 */
1587
1588 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1589 }
1590
1591
1592 /**
1593 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1594 * Args with unspill_reg != 0 will be loaded from scratch memory.
1595 */
1596 static void get_argument_regs( struct brw_wm_compile *c,
1597 struct brw_wm_ref *arg[],
1598 struct brw_reg *regs )
1599 {
1600 GLuint i;
1601
1602 for (i = 0; i < 4; i++) {
1603 if (arg[i]) {
1604 if (arg[i]->unspill_reg)
1605 emit_unspill(c,
1606 brw_vec8_grf(arg[i]->unspill_reg, 0),
1607 arg[i]->value->spill_slot);
1608
1609 regs[i] = arg[i]->hw_reg;
1610 }
1611 else {
1612 regs[i] = brw_null_reg();
1613 }
1614 }
1615 }
1616
1617
1618 /**
1619 * For values that have a spill_slot!=0, write those regs to scratch memory.
1620 */
1621 static void spill_values( struct brw_wm_compile *c,
1622 struct brw_wm_value *values,
1623 GLuint nr )
1624 {
1625 GLuint i;
1626
1627 for (i = 0; i < nr; i++)
1628 if (values[i].spill_slot)
1629 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1630 }
1631
1632
1633 /* Emit the fragment program instructions here.
1634 */
1635 void brw_wm_emit( struct brw_wm_compile *c )
1636 {
1637 struct brw_compile *p = &c->func;
1638 struct intel_context *intel = &p->brw->intel;
1639 GLuint insn;
1640
1641 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1642 if (intel->gen >= 6)
1643 brw_set_acc_write_control(p, 1);
1644
1645 /* Check if any of the payload regs need to be spilled:
1646 */
1647 spill_values(c, c->payload.depth, 4);
1648 spill_values(c, c->creg, c->nr_creg);
1649 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1650
1651
1652 for (insn = 0; insn < c->nr_insns; insn++) {
1653
1654 struct brw_wm_instruction *inst = &c->instruction[insn];
1655 struct brw_reg args[3][4], dst[4];
1656 GLuint i, dst_flags;
1657
1658 /* Get argument regs:
1659 */
1660 for (i = 0; i < 3; i++)
1661 get_argument_regs(c, inst->src[i], args[i]);
1662
1663 /* Get dest regs:
1664 */
1665 for (i = 0; i < 4; i++)
1666 if (inst->dst[i])
1667 dst[i] = inst->dst[i]->hw_reg;
1668 else
1669 dst[i] = brw_null_reg();
1670
1671 /* Flags
1672 */
1673 dst_flags = inst->writemask;
1674 if (inst->saturate)
1675 dst_flags |= SATURATE;
1676
1677 switch (inst->opcode) {
1678 /* Generated instructions for calculating triangle interpolants:
1679 */
1680 case WM_PIXELXY:
1681 emit_pixel_xy(c, dst, dst_flags);
1682 break;
1683
1684 case WM_DELTAXY:
1685 emit_delta_xy(p, dst, dst_flags, args[0]);
1686 break;
1687
1688 case WM_WPOSXY:
1689 emit_wpos_xy(c, dst, dst_flags, args[0]);
1690 break;
1691
1692 case WM_PIXELW:
1693 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1694 break;
1695
1696 case WM_LINTERP:
1697 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1698 break;
1699
1700 case WM_PINTERP:
1701 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1702 break;
1703
1704 case WM_CINTERP:
1705 emit_cinterp(p, dst, dst_flags, args[0]);
1706 break;
1707
1708 case WM_FB_WRITE:
1709 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1710 break;
1711
1712 case WM_FRONTFACING:
1713 emit_frontfacing(p, dst, dst_flags);
1714 break;
1715
1716 /* Straightforward arithmetic:
1717 */
1718 case OPCODE_ADD:
1719 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1720 break;
1721
1722 case OPCODE_FRC:
1723 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1724 break;
1725
1726 case OPCODE_FLR:
1727 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1728 break;
1729
1730 case OPCODE_DDX:
1731 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1732 break;
1733
1734 case OPCODE_DDY:
1735 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1736 break;
1737
1738 case OPCODE_DP2:
1739 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1740 break;
1741
1742 case OPCODE_DP3:
1743 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1744 break;
1745
1746 case OPCODE_DP4:
1747 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1748 break;
1749
1750 case OPCODE_DPH:
1751 emit_dph(p, dst, dst_flags, args[0], args[1]);
1752 break;
1753
1754 case OPCODE_TRUNC:
1755 for (i = 0; i < 4; i++) {
1756 if (dst_flags & (1<<i)) {
1757 brw_RNDZ(p, dst[i], args[0][i]);
1758 }
1759 }
1760 break;
1761
1762 case OPCODE_LRP:
1763 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1764 break;
1765
1766 case OPCODE_MAD:
1767 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1768 break;
1769
1770 case OPCODE_MOV:
1771 case OPCODE_SWZ:
1772 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1773 break;
1774
1775 case OPCODE_MUL:
1776 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1777 break;
1778
1779 case OPCODE_XPD:
1780 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1781 break;
1782
1783 /* Higher math functions:
1784 */
1785 case OPCODE_RCP:
1786 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1787 break;
1788
1789 case OPCODE_RSQ:
1790 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1791 break;
1792
1793 case OPCODE_SIN:
1794 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1795 break;
1796
1797 case OPCODE_COS:
1798 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1799 break;
1800
1801 case OPCODE_EX2:
1802 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1803 break;
1804
1805 case OPCODE_LG2:
1806 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1807 break;
1808
1809 case OPCODE_SCS:
1810 /* There is an scs math function, but it would need some
1811 * fixup for 16-element execution.
1812 */
1813 if (dst_flags & WRITEMASK_X)
1814 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1815 if (dst_flags & WRITEMASK_Y)
1816 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1817 break;
1818
1819 case OPCODE_POW:
1820 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1821 break;
1822
1823 /* Comparisons:
1824 */
1825 case OPCODE_CMP:
1826 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1827 break;
1828
1829 case OPCODE_MAX:
1830 emit_max(p, dst, dst_flags, args[0], args[1]);
1831 break;
1832
1833 case OPCODE_MIN:
1834 emit_min(p, dst, dst_flags, args[0], args[1]);
1835 break;
1836
1837 case OPCODE_SLT:
1838 emit_slt(p, dst, dst_flags, args[0], args[1]);
1839 break;
1840
1841 case OPCODE_SLE:
1842 emit_sle(p, dst, dst_flags, args[0], args[1]);
1843 break;
1844 case OPCODE_SGT:
1845 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1846 break;
1847 case OPCODE_SGE:
1848 emit_sge(p, dst, dst_flags, args[0], args[1]);
1849 break;
1850 case OPCODE_SEQ:
1851 emit_seq(p, dst, dst_flags, args[0], args[1]);
1852 break;
1853 case OPCODE_SNE:
1854 emit_sne(p, dst, dst_flags, args[0], args[1]);
1855 break;
1856
1857 case OPCODE_SSG:
1858 emit_sign(p, dst, dst_flags, args[0]);
1859 break;
1860
1861 case OPCODE_LIT:
1862 emit_lit(c, dst, dst_flags, args[0]);
1863 break;
1864
1865 /* Texturing operations:
1866 */
1867 case OPCODE_TEX:
1868 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1869 inst->tex_idx, inst->tex_unit,
1870 inst->tex_shadow);
1871 break;
1872
1873 case OPCODE_TXB:
1874 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1875 inst->tex_idx, inst->tex_unit);
1876 break;
1877
1878 case OPCODE_KIL:
1879 emit_kil(c, args[0]);
1880 break;
1881
1882 default:
1883 printf("Unsupported opcode %i (%s) in fragment shader\n",
1884 inst->opcode, inst->opcode < MAX_OPCODE ?
1885 _mesa_opcode_string(inst->opcode) :
1886 "unknown");
1887 }
1888
1889 for (i = 0; i < 4; i++)
1890 if (inst->dst[i] && inst->dst[i]->spill_slot)
1891 emit_spill(c,
1892 inst->dst[i]->hw_reg,
1893 inst->dst[i]->spill_slot);
1894 }
1895
1896 /* Only properly tested on ILK */
1897 if (p->brw->intel.gen == 5) {
1898 brw_remove_duplicate_mrf_moves(p);
1899 if (c->dispatch_width == 16)
1900 brw_remove_grf_to_mrf_moves(p);
1901 }
1902
1903 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1904 int i;
1905
1906 printf("wm-native:\n");
1907 for (i = 0; i < p->nr_insn; i++)
1908 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1909 printf("\n");
1910 }
1911 }
1912