intel: Annotate debug printout checks with unlikely().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_XPD] = 2,
87 };
88
89 /* These opcodes get broken down in a way that allow two
90 * args to be immediates.
91 */
92 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
93 if (arg == 1 || arg == 2)
94 return GL_TRUE;
95 }
96
97 if (opcode > ARRAY_SIZE(opcode_array))
98 return GL_FALSE;
99
100 return arg == opcode_array[opcode] - 1;
101 }
102
103 /**
104 * Computes the screen-space x,y position of the pixels.
105 *
106 * This will be used by emit_delta_xy() or emit_wpos_xy() for
107 * interpolation of attributes..
108 *
109 * Payload R0:
110 *
111 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
112 * corresponding to each of the 16 execution channels.
113 * R0.1..8 -- ?
114 * R1.0 -- triangle vertex 0.X
115 * R1.1 -- triangle vertex 0.Y
116 * R1.2 -- tile 0 x,y coords (2 packed uwords)
117 * R1.3 -- tile 1 x,y coords (2 packed uwords)
118 * R1.4 -- tile 2 x,y coords (2 packed uwords)
119 * R1.5 -- tile 3 x,y coords (2 packed uwords)
120 * R1.6 -- ?
121 * R1.7 -- ?
122 * R1.8 -- ?
123 */
124 void emit_pixel_xy(struct brw_wm_compile *c,
125 const struct brw_reg *dst,
126 GLuint mask)
127 {
128 struct brw_compile *p = &c->func;
129 struct brw_reg r1 = brw_vec1_grf(1, 0);
130 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
131 struct brw_reg dst0_uw, dst1_uw;
132
133 brw_push_insn_state(p);
134 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
135
136 if (c->dispatch_width == 16) {
137 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
138 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
139 } else {
140 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
141 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
142 }
143
144 /* Calculate pixel centers by adding 1 or 0 to each of the
145 * micro-tile coordinates passed in r1.
146 */
147 if (mask & WRITEMASK_X) {
148 brw_ADD(p,
149 dst0_uw,
150 stride(suboffset(r1_uw, 4), 2, 4, 0),
151 brw_imm_v(0x10101010));
152 }
153
154 if (mask & WRITEMASK_Y) {
155 brw_ADD(p,
156 dst1_uw,
157 stride(suboffset(r1_uw,5), 2, 4, 0),
158 brw_imm_v(0x11001100));
159 }
160 brw_pop_insn_state(p);
161 }
162
163 /**
164 * Computes the screen-space x,y distance of the pixels from the start
165 * vertex.
166 *
167 * This will be used in linterp or pinterp with the start vertex value
168 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
169 * to produce interpolated attribute values.
170 */
171 void emit_delta_xy(struct brw_compile *p,
172 const struct brw_reg *dst,
173 GLuint mask,
174 const struct brw_reg *arg0)
175 {
176 struct intel_context *intel = &p->brw->intel;
177 struct brw_reg r1 = brw_vec1_grf(1, 0);
178
179 if (mask == 0)
180 return;
181
182 assert(mask == WRITEMASK_XY);
183
184 if (intel->gen >= 6) {
185 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
186 Just add them with 0.0 for dst reg.. */
187 r1 = brw_imm_v(0x00000000);
188 brw_ADD(p,
189 dst[0],
190 retype(arg0[0], BRW_REGISTER_TYPE_UW),
191 r1);
192 brw_ADD(p,
193 dst[1],
194 retype(arg0[1], BRW_REGISTER_TYPE_UW),
195 r1);
196 return;
197 }
198
199 /* Calc delta X,Y by subtracting origin in r1 from the pixel
200 * centers produced by emit_pixel_xy().
201 */
202 brw_ADD(p,
203 dst[0],
204 retype(arg0[0], BRW_REGISTER_TYPE_UW),
205 negate(r1));
206 brw_ADD(p,
207 dst[1],
208 retype(arg0[1], BRW_REGISTER_TYPE_UW),
209 negate(suboffset(r1,1)));
210 }
211
212 /**
213 * Computes the pixel offset from the window origin for gl_FragCoord().
214 */
215 void emit_wpos_xy(struct brw_wm_compile *c,
216 const struct brw_reg *dst,
217 GLuint mask,
218 const struct brw_reg *arg0)
219 {
220 struct brw_compile *p = &c->func;
221
222 if (mask & WRITEMASK_X) {
223 if (c->fp->program.PixelCenterInteger) {
224 /* X' = X */
225 brw_MOV(p,
226 dst[0],
227 retype(arg0[0], BRW_REGISTER_TYPE_W));
228 } else {
229 /* X' = X + 0.5 */
230 brw_ADD(p,
231 dst[0],
232 retype(arg0[0], BRW_REGISTER_TYPE_W),
233 brw_imm_f(0.5));
234 }
235 }
236
237 if (mask & WRITEMASK_Y) {
238 if (c->fp->program.OriginUpperLeft) {
239 if (c->fp->program.PixelCenterInteger) {
240 /* Y' = Y */
241 brw_MOV(p,
242 dst[1],
243 retype(arg0[1], BRW_REGISTER_TYPE_W));
244 } else {
245 /* Y' = Y + 0.5 */
246 brw_ADD(p,
247 dst[1],
248 retype(arg0[1], BRW_REGISTER_TYPE_W),
249 brw_imm_f(0.5));
250 }
251 } else {
252 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
253
254 /* Y' = (height - 1) - Y + center */
255 brw_ADD(p,
256 dst[1],
257 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
258 brw_imm_f(c->key.drawable_height - 1 + center_offset));
259 }
260 }
261 }
262
263
264 void emit_pixel_w(struct brw_wm_compile *c,
265 const struct brw_reg *dst,
266 GLuint mask,
267 const struct brw_reg *arg0,
268 const struct brw_reg *deltas)
269 {
270 struct brw_compile *p = &c->func;
271 struct intel_context *intel = &p->brw->intel;
272 struct brw_reg src;
273 struct brw_reg temp_dst;
274
275 if (intel->gen >= 6)
276 temp_dst = dst[3];
277 else
278 temp_dst = brw_message_reg(2);
279
280 assert(intel->gen < 6);
281
282 /* Don't need this if all you are doing is interpolating color, for
283 * instance.
284 */
285 if (mask & WRITEMASK_W) {
286 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
287
288 /* Calc 1/w - just linterp wpos[3] optimized by putting the
289 * result straight into a message reg.
290 */
291 if (can_do_pln(intel, deltas)) {
292 brw_PLN(p, temp_dst, interp3, deltas[0]);
293 } else {
294 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
295 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
296 }
297
298 /* Calc w */
299 if (intel->gen >= 6)
300 src = temp_dst;
301 else
302 src = brw_null_reg();
303
304 if (c->dispatch_width == 16) {
305 brw_math_16(p, dst[3],
306 BRW_MATH_FUNCTION_INV,
307 BRW_MATH_SATURATE_NONE,
308 2, src,
309 BRW_MATH_PRECISION_FULL);
310 } else {
311 brw_math(p, dst[3],
312 BRW_MATH_FUNCTION_INV,
313 BRW_MATH_SATURATE_NONE,
314 2, src,
315 BRW_MATH_DATA_VECTOR,
316 BRW_MATH_PRECISION_FULL);
317 }
318 }
319 }
320
321 void emit_linterp(struct brw_compile *p,
322 const struct brw_reg *dst,
323 GLuint mask,
324 const struct brw_reg *arg0,
325 const struct brw_reg *deltas)
326 {
327 struct intel_context *intel = &p->brw->intel;
328 struct brw_reg interp[4];
329 GLuint nr = arg0[0].nr;
330 GLuint i;
331
332 interp[0] = brw_vec1_grf(nr, 0);
333 interp[1] = brw_vec1_grf(nr, 4);
334 interp[2] = brw_vec1_grf(nr+1, 0);
335 interp[3] = brw_vec1_grf(nr+1, 4);
336
337 for (i = 0; i < 4; i++) {
338 if (mask & (1<<i)) {
339 if (intel->gen >= 6) {
340 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
341 } else if (can_do_pln(intel, deltas)) {
342 brw_PLN(p, dst[i], interp[i], deltas[0]);
343 } else {
344 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
345 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
346 }
347 }
348 }
349 }
350
351
352 void emit_pinterp(struct brw_compile *p,
353 const struct brw_reg *dst,
354 GLuint mask,
355 const struct brw_reg *arg0,
356 const struct brw_reg *deltas,
357 const struct brw_reg *w)
358 {
359 struct intel_context *intel = &p->brw->intel;
360 struct brw_reg interp[4];
361 GLuint nr = arg0[0].nr;
362 GLuint i;
363
364 if (intel->gen >= 6) {
365 emit_linterp(p, dst, mask, arg0, interp);
366 return;
367 }
368
369 interp[0] = brw_vec1_grf(nr, 0);
370 interp[1] = brw_vec1_grf(nr, 4);
371 interp[2] = brw_vec1_grf(nr+1, 0);
372 interp[3] = brw_vec1_grf(nr+1, 4);
373
374 for (i = 0; i < 4; i++) {
375 if (mask & (1<<i)) {
376 if (can_do_pln(intel, deltas)) {
377 brw_PLN(p, dst[i], interp[i], deltas[0]);
378 } else {
379 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
380 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
381 }
382 }
383 }
384 for (i = 0; i < 4; i++) {
385 if (mask & (1<<i)) {
386 brw_MUL(p, dst[i], dst[i], w[3]);
387 }
388 }
389 }
390
391
392 void emit_cinterp(struct brw_compile *p,
393 const struct brw_reg *dst,
394 GLuint mask,
395 const struct brw_reg *arg0)
396 {
397 struct brw_reg interp[4];
398 GLuint nr = arg0[0].nr;
399 GLuint i;
400
401 interp[0] = brw_vec1_grf(nr, 0);
402 interp[1] = brw_vec1_grf(nr, 4);
403 interp[2] = brw_vec1_grf(nr+1, 0);
404 interp[3] = brw_vec1_grf(nr+1, 4);
405
406 for (i = 0; i < 4; i++) {
407 if (mask & (1<<i)) {
408 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
409 }
410 }
411 }
412
413 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
414 void emit_frontfacing(struct brw_compile *p,
415 const struct brw_reg *dst,
416 GLuint mask)
417 {
418 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
419 GLuint i;
420
421 if (!(mask & WRITEMASK_XYZW))
422 return;
423
424 for (i = 0; i < 4; i++) {
425 if (mask & (1<<i)) {
426 brw_MOV(p, dst[i], brw_imm_f(0.0));
427 }
428 }
429
430 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
431 * us front face
432 */
433 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
434 for (i = 0; i < 4; i++) {
435 if (mask & (1<<i)) {
436 brw_MOV(p, dst[i], brw_imm_f(1.0));
437 }
438 }
439 brw_set_predicate_control_flag_value(p, 0xff);
440 }
441
442 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
443 * looking like:
444 *
445 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
446 *
447 * and we're trying to produce:
448 *
449 * DDX DDY
450 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
451 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
452 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
453 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
454 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
455 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
456 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
457 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
458 *
459 * and add another set of two more subspans if in 16-pixel dispatch mode.
460 *
461 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
462 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
463 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
464 * between each other. We could probably do it like ddx and swizzle the right
465 * order later, but bail for now and just produce
466 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
467 */
468 void emit_ddxy(struct brw_compile *p,
469 const struct brw_reg *dst,
470 GLuint mask,
471 GLboolean is_ddx,
472 const struct brw_reg *arg0)
473 {
474 int i;
475 struct brw_reg src0, src1;
476
477 if (mask & SATURATE)
478 brw_set_saturate(p, 1);
479 for (i = 0; i < 4; i++ ) {
480 if (mask & (1<<i)) {
481 if (is_ddx) {
482 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
483 BRW_REGISTER_TYPE_F,
484 BRW_VERTICAL_STRIDE_2,
485 BRW_WIDTH_2,
486 BRW_HORIZONTAL_STRIDE_0,
487 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
488 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
489 BRW_REGISTER_TYPE_F,
490 BRW_VERTICAL_STRIDE_2,
491 BRW_WIDTH_2,
492 BRW_HORIZONTAL_STRIDE_0,
493 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
494 } else {
495 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
496 BRW_REGISTER_TYPE_F,
497 BRW_VERTICAL_STRIDE_4,
498 BRW_WIDTH_4,
499 BRW_HORIZONTAL_STRIDE_0,
500 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
501 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
502 BRW_REGISTER_TYPE_F,
503 BRW_VERTICAL_STRIDE_4,
504 BRW_WIDTH_4,
505 BRW_HORIZONTAL_STRIDE_0,
506 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
507 }
508 brw_ADD(p, dst[i], src0, negate(src1));
509 }
510 }
511 if (mask & SATURATE)
512 brw_set_saturate(p, 0);
513 }
514
515 void emit_alu1(struct brw_compile *p,
516 struct brw_instruction *(*func)(struct brw_compile *,
517 struct brw_reg,
518 struct brw_reg),
519 const struct brw_reg *dst,
520 GLuint mask,
521 const struct brw_reg *arg0)
522 {
523 GLuint i;
524
525 if (mask & SATURATE)
526 brw_set_saturate(p, 1);
527
528 for (i = 0; i < 4; i++) {
529 if (mask & (1<<i)) {
530 func(p, dst[i], arg0[i]);
531 }
532 }
533
534 if (mask & SATURATE)
535 brw_set_saturate(p, 0);
536 }
537
538
539 void emit_alu2(struct brw_compile *p,
540 struct brw_instruction *(*func)(struct brw_compile *,
541 struct brw_reg,
542 struct brw_reg,
543 struct brw_reg),
544 const struct brw_reg *dst,
545 GLuint mask,
546 const struct brw_reg *arg0,
547 const struct brw_reg *arg1)
548 {
549 GLuint i;
550
551 if (mask & SATURATE)
552 brw_set_saturate(p, 1);
553
554 for (i = 0; i < 4; i++) {
555 if (mask & (1<<i)) {
556 func(p, dst[i], arg0[i], arg1[i]);
557 }
558 }
559
560 if (mask & SATURATE)
561 brw_set_saturate(p, 0);
562 }
563
564
565 void emit_mad(struct brw_compile *p,
566 const struct brw_reg *dst,
567 GLuint mask,
568 const struct brw_reg *arg0,
569 const struct brw_reg *arg1,
570 const struct brw_reg *arg2)
571 {
572 GLuint i;
573
574 for (i = 0; i < 4; i++) {
575 if (mask & (1<<i)) {
576 brw_MUL(p, dst[i], arg0[i], arg1[i]);
577
578 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
579 brw_ADD(p, dst[i], dst[i], arg2[i]);
580 brw_set_saturate(p, 0);
581 }
582 }
583 }
584
585 void emit_lrp(struct brw_compile *p,
586 const struct brw_reg *dst,
587 GLuint mask,
588 const struct brw_reg *arg0,
589 const struct brw_reg *arg1,
590 const struct brw_reg *arg2)
591 {
592 GLuint i;
593
594 /* Uses dst as a temporary:
595 */
596 for (i = 0; i < 4; i++) {
597 if (mask & (1<<i)) {
598 /* Can I use the LINE instruction for this?
599 */
600 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
601 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
602
603 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
604 brw_MAC(p, dst[i], arg0[i], arg1[i]);
605 brw_set_saturate(p, 0);
606 }
607 }
608 }
609
610 void emit_sop(struct brw_compile *p,
611 const struct brw_reg *dst,
612 GLuint mask,
613 GLuint cond,
614 const struct brw_reg *arg0,
615 const struct brw_reg *arg1)
616 {
617 GLuint i;
618
619 for (i = 0; i < 4; i++) {
620 if (mask & (1<<i)) {
621 brw_push_insn_state(p);
622 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
623 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
624 brw_MOV(p, dst[i], brw_imm_f(0));
625 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
626 brw_MOV(p, dst[i], brw_imm_f(1.0));
627 brw_pop_insn_state(p);
628 }
629 }
630 }
631
/* Set-on-less-than: emit_sop() with the BRW_CONDITIONAL_L condition. */
static void emit_slt( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
}
640
/* Set-on-less-or-equal: emit_sop() with the BRW_CONDITIONAL_LE condition. */
static void emit_sle( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
}
649
/* Set-on-greater-than: emit_sop() with the BRW_CONDITIONAL_G condition. */
static void emit_sgt( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
}
658
/* Set-on-greater-or-equal: emit_sop() with the BRW_CONDITIONAL_GE condition. */
static void emit_sge( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
}
667
/* Set-on-equal: emit_sop() with the BRW_CONDITIONAL_EQ condition. */
static void emit_seq( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
}
676
/* Set-on-not-equal: emit_sop() with the BRW_CONDITIONAL_NEQ condition. */
static void emit_sne( struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask,
                      const struct brw_reg *arg0,
                      const struct brw_reg *arg1 )
{
   emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
}
685
686 void emit_cmp(struct brw_compile *p,
687 const struct brw_reg *dst,
688 GLuint mask,
689 const struct brw_reg *arg0,
690 const struct brw_reg *arg1,
691 const struct brw_reg *arg2)
692 {
693 GLuint i;
694
695 for (i = 0; i < 4; i++) {
696 if (mask & (1<<i)) {
697 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
698
699 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
700 brw_SEL(p, dst[i], arg1[i], arg2[i]);
701 brw_set_saturate(p, 0);
702 brw_set_predicate_control_flag_value(p, 0xff);
703 }
704 }
705 }
706
707 void emit_sign(struct brw_compile *p,
708 const struct brw_reg *dst,
709 GLuint mask,
710 const struct brw_reg *arg0)
711 {
712 GLuint i;
713
714 for (i = 0; i < 4; i++) {
715 if (mask & (1<<i)) {
716 brw_MOV(p, dst[i], brw_imm_f(0.0));
717
718 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
719 brw_MOV(p, dst[i], brw_imm_f(-1.0));
720 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
721
722 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
723 brw_MOV(p, dst[i], brw_imm_f(1.0));
724 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
725 }
726 }
727 }
728
729 void emit_max(struct brw_compile *p,
730 const struct brw_reg *dst,
731 GLuint mask,
732 const struct brw_reg *arg0,
733 const struct brw_reg *arg1)
734 {
735 GLuint i;
736
737 for (i = 0; i < 4; i++) {
738 if (mask & (1<<i)) {
739 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
740
741 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
742 brw_SEL(p, dst[i], arg0[i], arg1[i]);
743 brw_set_saturate(p, 0);
744 brw_set_predicate_control_flag_value(p, 0xff);
745 }
746 }
747 }
748
749 void emit_min(struct brw_compile *p,
750 const struct brw_reg *dst,
751 GLuint mask,
752 const struct brw_reg *arg0,
753 const struct brw_reg *arg1)
754 {
755 GLuint i;
756
757 for (i = 0; i < 4; i++) {
758 if (mask & (1<<i)) {
759 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
760
761 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
762 brw_SEL(p, dst[i], arg0[i], arg1[i]);
763 brw_set_saturate(p, 0);
764 brw_set_predicate_control_flag_value(p, 0xff);
765 }
766 }
767 }
768
769
770 void emit_dp2(struct brw_compile *p,
771 const struct brw_reg *dst,
772 GLuint mask,
773 const struct brw_reg *arg0,
774 const struct brw_reg *arg1)
775 {
776 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
777
778 if (!(mask & WRITEMASK_XYZW))
779 return; /* Do not emit dead code */
780
781 assert(is_power_of_two(mask & WRITEMASK_XYZW));
782
783 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
784
785 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
786 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
787 brw_set_saturate(p, 0);
788 }
789
790
791 void emit_dp3(struct brw_compile *p,
792 const struct brw_reg *dst,
793 GLuint mask,
794 const struct brw_reg *arg0,
795 const struct brw_reg *arg1)
796 {
797 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
798
799 if (!(mask & WRITEMASK_XYZW))
800 return; /* Do not emit dead code */
801
802 assert(is_power_of_two(mask & WRITEMASK_XYZW));
803
804 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
805 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
806
807 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
808 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
809 brw_set_saturate(p, 0);
810 }
811
812
813 void emit_dp4(struct brw_compile *p,
814 const struct brw_reg *dst,
815 GLuint mask,
816 const struct brw_reg *arg0,
817 const struct brw_reg *arg1)
818 {
819 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
820
821 if (!(mask & WRITEMASK_XYZW))
822 return; /* Do not emit dead code */
823
824 assert(is_power_of_two(mask & WRITEMASK_XYZW));
825
826 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
827 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
828 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
829
830 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
831 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
832 brw_set_saturate(p, 0);
833 }
834
835
836 void emit_dph(struct brw_compile *p,
837 const struct brw_reg *dst,
838 GLuint mask,
839 const struct brw_reg *arg0,
840 const struct brw_reg *arg1)
841 {
842 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
843
844 if (!(mask & WRITEMASK_XYZW))
845 return; /* Do not emit dead code */
846
847 assert(is_power_of_two(mask & WRITEMASK_XYZW));
848
849 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
850 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
851 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
852
853 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
854 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
855 brw_set_saturate(p, 0);
856 }
857
858
859 void emit_xpd(struct brw_compile *p,
860 const struct brw_reg *dst,
861 GLuint mask,
862 const struct brw_reg *arg0,
863 const struct brw_reg *arg1)
864 {
865 GLuint i;
866
867 assert((mask & WRITEMASK_W) != WRITEMASK_W);
868
869 for (i = 0 ; i < 3; i++) {
870 if (mask & (1<<i)) {
871 GLuint i2 = (i+2)%3;
872 GLuint i1 = (i+1)%3;
873
874 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
875
876 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
877 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
878 brw_set_saturate(p, 0);
879 }
880 }
881 }
882
883
884 void emit_math1(struct brw_wm_compile *c,
885 GLuint function,
886 const struct brw_reg *dst,
887 GLuint mask,
888 const struct brw_reg *arg0)
889 {
890 struct brw_compile *p = &c->func;
891 struct intel_context *intel = &p->brw->intel;
892 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
893 GLuint saturate = ((mask & SATURATE) ?
894 BRW_MATH_SATURATE_SATURATE :
895 BRW_MATH_SATURATE_NONE);
896 struct brw_reg src;
897
898 if (intel->gen >= 6 && arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
899 /* Gen6 math requires that source and dst horizontal stride be 1.
900 *
901 */
902 src = *dst;
903 brw_MOV(p, src, arg0[0]);
904 } else {
905 src = arg0[0];
906 }
907
908 if (!(mask & WRITEMASK_XYZW))
909 return; /* Do not emit dead code */
910
911 assert(is_power_of_two(mask & WRITEMASK_XYZW));
912
913 /* Send two messages to perform all 16 operations:
914 */
915 brw_push_insn_state(p);
916 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
917 brw_math(p,
918 dst[dst_chan],
919 function,
920 saturate,
921 2,
922 src,
923 BRW_MATH_DATA_VECTOR,
924 BRW_MATH_PRECISION_FULL);
925
926 if (c->dispatch_width == 16) {
927 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
928 brw_math(p,
929 offset(dst[dst_chan],1),
930 function,
931 saturate,
932 3,
933 sechalf(src),
934 BRW_MATH_DATA_VECTOR,
935 BRW_MATH_PRECISION_FULL);
936 }
937 brw_pop_insn_state(p);
938 }
939
940
/**
 * Emits a two-operand math operation on arg0[0] and arg1[0] into the single
 * destination channel selected by mask (exactly one XYZW bit; nothing is
 * emitted when none is set).
 *
 * On gen6+ the operation is emitted with brw_math2(), after copying any
 * operand with a zero horizontal stride into the destination register
 * first.  On earlier generations arg1 is moved into message registers and
 * the operation is emitted with brw_math().  In both cases a second
 * instruction covers the second half when dispatch_width is 16.
 */
void emit_math2(struct brw_wm_compile *c,
                GLuint function,
                const struct brw_reg *dst,
                GLuint mask,
                const struct brw_reg *arg0,
                const struct brw_reg *arg1)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;

   if (!(mask & WRITEMASK_XYZW))
      return; /* Do not emit dead code */

   assert(is_power_of_two(mask & WRITEMASK_XYZW));

   brw_push_insn_state(p);

   /* math can only operate on up to a vec8 at a time, so in
    * dispatch_width==16 we have to do the second half manually.
    */
   if (intel->gen >= 6) {
      struct brw_reg src0 = arg0[0];
      struct brw_reg src1 = arg1[0];
      struct brw_reg temp_dst = dst[dst_chan];

      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
         if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
            /* Both scalar arguments.  Do scalar calc. */
            src0.hstride = BRW_HORIZONTAL_STRIDE_1;
            src1.hstride = BRW_HORIZONTAL_STRIDE_1;
            temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
            temp_dst.width = BRW_WIDTH_1;

            if (arg0[0].subnr != 0) {
               brw_MOV(p, temp_dst, src0);
               src0 = temp_dst;

               /* Ouch.  We've used the temp as a dst, and we still
                * need a temp to store arg1 in, because src and dst
                * offsets have to be equal.  Leaving this up to
                * glsl2-965 to handle correctly.
                */
               assert(arg1[0].subnr == 0);
            } else if (arg1[0].subnr != 0) {
               brw_MOV(p, temp_dst, src1);
               src1 = temp_dst;
            }
         } else {
            /* Only arg0 is scalar: copy it into the destination. */
            brw_MOV(p, temp_dst, src0);
            src0 = temp_dst;
         }
      } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
         /* Only arg1 is scalar: copy it into the destination. */
         brw_MOV(p, temp_dst, src1);
         src1 = temp_dst;
      }

      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_math2(p,
                temp_dst,
                function,
                src0,
                src1);
      if (c->dispatch_width == 16) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
         brw_math2(p,
                   sechalf(temp_dst),
                   function,
                   sechalf(src0),
                   sechalf(src1));
      }

      /* Splat a scalar result into all the channels. */
      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
          arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
         temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
         temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
         brw_MOV(p, dst[dst_chan], temp_dst);
      }
   } else {
      GLuint saturate = ((mask & SATURATE) ?
                         BRW_MATH_SATURATE_SATURATE :
                         BRW_MATH_SATURATE_NONE);

      /* Second operand goes into message register 3 (and 5 for the
       * second half when dispatching 16-wide).
       */
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, brw_message_reg(3), arg1[0]);
      if (c->dispatch_width == 16) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
         brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
      }

      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_math(p,
               dst[dst_chan],
               function,
               saturate,
               2,
               arg0[0],
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);

      /* Send two messages to perform all 16 operations:
       */
      if (c->dispatch_width == 16) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
         brw_math(p,
                  offset(dst[dst_chan],1),
                  function,
                  saturate,
                  4,
                  sechalf(arg0[0]),
                  BRW_MATH_DATA_VECTOR,
                  BRW_MATH_PRECISION_FULL);
      }
   }
   brw_pop_insn_state(p);
}
1059
1060
/**
 * Emits a texture sample: loads the texture coordinates (and, for shadow
 * lookups, the comparison reference taken from arg[2]) into consecutive
 * message registers starting at m2, then emits the SAMPLE instruction
 * writing to dst[0].
 *
 * The number of message registers used per coordinate, the response
 * length and the SIMD mode all follow c->dispatch_width.  tex_idx selects
 * how many coordinates are taken from arg; remaining slots are filled with
 * 0.  An unexpected tex_idx aborts.
 */
void emit_tex(struct brw_wm_compile *c,
              struct brw_reg *dst,
              GLuint dst_flags,
              struct brw_reg *arg,
              struct brw_reg depth_payload,
              GLuint tex_idx,
              GLuint sampler,
              GLboolean shadow)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg dst_retyped;
   GLuint cur_mrf = 2, response_length;
   GLuint i, nr_texcoords;
   GLuint emit;
   GLuint msg_type;
   GLuint mrf_per_channel;
   GLuint simd_mode;

   if (c->dispatch_width == 16) {
      mrf_per_channel = 2;
      response_length = 8;
      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mrf_per_channel = 1;
      response_length = 4;
      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   /* How many input regs are there?
    */
   switch (tex_idx) {
   case TEXTURE_1D_INDEX:
      emit = WRITEMASK_X;
      nr_texcoords = 1;
      break;
   case TEXTURE_2D_INDEX:
   case TEXTURE_RECT_INDEX:
      emit = WRITEMASK_XY;
      nr_texcoords = 2;
      break;
   case TEXTURE_3D_INDEX:
   case TEXTURE_CUBE_INDEX:
      emit = WRITEMASK_XYZ;
      nr_texcoords = 3;
      break;
   default:
      /* unexpected target */
      abort();
   }

   /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
   if (intel->gen < 5 && c->dispatch_width == 8)
      nr_texcoords = 3;

   /* For shadow comparisons, we have to supply u,v,r. */
   if (shadow)
      nr_texcoords = 3;

   /* Emit the texcoords. */
   for (i = 0; i < nr_texcoords; i++) {
      /* Coordinates the target does not use are sent as 0. */
      if (emit & (1<<i))
         brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
      else
         brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
      cur_mrf += mrf_per_channel;
   }

   /* Fill in the shadow comparison reference value. */
   if (shadow) {
      if (intel->gen >= 5) {
         /* Fill in the cube map array index value. */
         brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
         cur_mrf += mrf_per_channel;
      } else if (c->dispatch_width == 8) {
         /* Fill in the LOD bias value. */
         brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
         cur_mrf += mrf_per_channel;
      }
      brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
      cur_mrf += mrf_per_channel;
   }

   if (intel->gen >= 5) {
      if (shadow)
         msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
      else
         msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
   } else {
      /* Note that G45 and older determines shadow compare and dispatch width
       * from message length for most messages.
       */
      if (c->dispatch_width == 16 && shadow)
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
      else
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
   }

   /* cur_mrf - 1 is passed as the message length: the registers filled
    * above starting at m2, plus one more.
    */
   brw_SAMPLE(p,
              dst_retyped,
              1,
              retype(depth_payload, BRW_REGISTER_TYPE_UW),
              SURF_INDEX_TEXTURE(sampler),
              sampler,
              dst_flags & WRITEMASK_XYZW,
              msg_type,
              response_length,
              cur_mrf - 1,
              0,
              1,
              simd_mode);
}
1175
1176
/**
 * Emits a LOD-biased texture sample: loads three coordinate slots and the
 * bias value from arg[3] into message registers starting at m2, then emits
 * the SAMPLE instruction writing to dst[0].
 *
 * tex_idx selects how many coordinates are taken from arg; the remaining
 * coordinate slots are filled with 0.  An unexpected tex_idx aborts.
 */
void emit_txb(struct brw_wm_compile *c,
              struct brw_reg *dst,
              GLuint dst_flags,
              struct brw_reg *arg,
              struct brw_reg depth_payload,
              GLuint tex_idx,
              GLuint sampler)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   GLuint msgLength;
   GLuint msg_type;
   GLuint mrf_per_channel;
   GLuint response_length;
   struct brw_reg dst_retyped;

   /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
    * samples, so we'll use the 16-wide instruction, leave the second halves
    * undefined, and trust the execution mask to keep the undefined pixels
    * from mattering.
    */
   if (c->dispatch_width == 16 || intel->gen < 5) {
      if (intel->gen >= 5)
         msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
      else
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
      mrf_per_channel = 2;
      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
      response_length = 8;
   } else {
      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
      mrf_per_channel = 1;
      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
      response_length = 4;
   }

   /* Shadow ignored for txb. */
   switch (tex_idx) {
   case TEXTURE_1D_INDEX:
      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
      break;
   case TEXTURE_2D_INDEX:
   case TEXTURE_RECT_INDEX:
      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
      break;
   case TEXTURE_3D_INDEX:
   case TEXTURE_CUBE_INDEX:
      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
      break;
   default:
      /* unexpected target */
      abort();
   }

   /* The fourth slot carries arg[3]. */
   brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
   msgLength = 2 + 4 * mrf_per_channel - 1;

   /* NOTE(review): the SIMD mode is hard-coded to SIMD16 even when the
    * 8-wide branch above was taken - confirm this is intended.
    */
   brw_SAMPLE(p,
              dst_retyped,
              1,
              retype(depth_payload, BRW_REGISTER_TYPE_UW),
              SURF_INDEX_TEXTURE(sampler),
              sampler,
              dst_flags & WRITEMASK_XYZW,
              msg_type,
              response_length,
              msgLength,
              0,
              1,
              BRW_SAMPLER_SIMD_MODE_SIMD16);
}
1254
1255
1256 static void emit_lit(struct brw_wm_compile *c,
1257 const struct brw_reg *dst,
1258 GLuint mask,
1259 const struct brw_reg *arg0)
1260 {
1261 struct brw_compile *p = &c->func;
1262
1263 assert((mask & WRITEMASK_XW) == 0);
1264
1265 if (mask & WRITEMASK_Y) {
1266 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1267 brw_MOV(p, dst[1], arg0[0]);
1268 brw_set_saturate(p, 0);
1269 }
1270
1271 if (mask & WRITEMASK_Z) {
1272 emit_math2(c, BRW_MATH_FUNCTION_POW,
1273 &dst[2],
1274 WRITEMASK_X | (mask & SATURATE),
1275 &arg0[1],
1276 &arg0[3]);
1277 }
1278
1279 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1280 * some of the POW calculations above, but 16-wide iff statements
1281 * seem to lock c1 hardware, so this is a nasty workaround:
1282 */
1283 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1284 {
1285 if (mask & WRITEMASK_Y)
1286 brw_MOV(p, dst[1], brw_imm_f(0));
1287
1288 if (mask & WRITEMASK_Z)
1289 brw_MOV(p, dst[2], brw_imm_f(0));
1290 }
1291 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1292 }
1293
1294
1295 /* Kill pixel - set execution mask to zero for those pixels which
1296 * fail.
1297 */
1298 static void emit_kil( struct brw_wm_compile *c,
1299 struct brw_reg *arg0)
1300 {
1301 struct brw_compile *p = &c->func;
1302 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1303 GLuint i, j;
1304
1305 for (i = 0; i < 4; i++) {
1306 /* Check if we've already done the comparison for this reg
1307 * -- common when someone does KIL TEMP.wwww.
1308 */
1309 for (j = 0; j < i; j++) {
1310 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1311 break;
1312 }
1313 if (j != i)
1314 continue;
1315
1316 brw_push_insn_state(p);
1317 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1318 brw_set_predicate_control_flag_value(p, 0xff);
1319 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1320 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1321 brw_pop_insn_state(p);
1322 }
1323 }
1324
1325 /* KIL_NV kills the pixels that are currently executing, not based on a test
1326 * of the arguments.
1327 */
1328 void emit_kil_nv( struct brw_wm_compile *c )
1329 {
1330 struct brw_compile *p = &c->func;
1331 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1332
1333 brw_push_insn_state(p);
1334 brw_set_mask_control(p, BRW_MASK_DISABLE);
1335 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1336 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1337 brw_pop_insn_state(p);
1338 }
1339
1340 static void fire_fb_write( struct brw_wm_compile *c,
1341 GLuint base_reg,
1342 GLuint nr,
1343 GLuint target,
1344 GLuint eot )
1345 {
1346 struct brw_compile *p = &c->func;
1347 struct intel_context *intel = &p->brw->intel;
1348 struct brw_reg dst;
1349
1350 if (c->dispatch_width == 16)
1351 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1352 else
1353 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1354
1355 /* Pass through control information:
1356 */
1357 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1358 if (intel->gen < 6) /* gen6, use headerless for fb write */
1359 {
1360 brw_push_insn_state(p);
1361 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1362 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1363 brw_MOV(p,
1364 brw_message_reg(base_reg + 1),
1365 brw_vec8_grf(1, 0));
1366 brw_pop_insn_state(p);
1367 }
1368
1369 /* Send framebuffer write message: */
1370 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1371 brw_fb_WRITE(p,
1372 c->dispatch_width,
1373 dst,
1374 base_reg,
1375 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1376 target,
1377 nr,
1378 0,
1379 eot);
1380 }
1381
1382
1383 static void emit_aa( struct brw_wm_compile *c,
1384 struct brw_reg *arg1,
1385 GLuint reg )
1386 {
1387 struct brw_compile *p = &c->func;
1388 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1389 GLuint off = c->key.aa_dest_stencil_reg % 2;
1390 struct brw_reg aa = offset(arg1[comp], off);
1391
1392 brw_push_insn_state(p);
1393 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1394 brw_MOV(p, brw_message_reg(reg), aa);
1395 brw_pop_insn_state(p);
1396 }
1397
1398
1399 /* Post-fragment-program processing. Send the results to the
1400 * framebuffer.
1401 * \param arg0 the fragment color
1402 * \param arg1 the pass-through depth value
1403 * \param arg2 the shader-computed depth value
1404 */
1405 void emit_fb_write(struct brw_wm_compile *c,
1406 struct brw_reg *arg0,
1407 struct brw_reg *arg1,
1408 struct brw_reg *arg2,
1409 GLuint target,
1410 GLuint eot)
1411 {
1412 struct brw_compile *p = &c->func;
1413 struct brw_context *brw = p->brw;
1414 struct intel_context *intel = &brw->intel;
1415 GLuint nr = 2;
1416 GLuint channel;
1417 int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
1418
1419 /* Reserve a space for AA - may not be needed:
1420 */
1421 if (c->key.aa_dest_stencil_reg)
1422 nr += 1;
1423
1424 /* I don't really understand how this achieves the color interleave
1425 * (ie RGBARGBA) in the result: [Do the saturation here]
1426 */
1427 brw_push_insn_state(p);
1428
1429 if (intel->gen >= 6)
1430 base_reg = nr;
1431 else
1432 base_reg = 0;
1433
1434 for (channel = 0; channel < 4; channel++) {
1435 if (intel->gen >= 6) {
1436 /* gen6 SIMD16 single source DP write looks like:
1437 * m + 0: r0
1438 * m + 1: r1
1439 * m + 2: g0
1440 * m + 3: g1
1441 * m + 4: b0
1442 * m + 5: b1
1443 * m + 6: a0
1444 * m + 7: a1
1445 */
1446 if (c->dispatch_width == 16) {
1447 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1448 } else {
1449 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1450 }
1451 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1452 /* pre-gen6 SIMD16 single source DP write looks like:
1453 * m + 0: r0
1454 * m + 1: g0
1455 * m + 2: b0
1456 * m + 3: a0
1457 * m + 4: r1
1458 * m + 5: g1
1459 * m + 6: b1
1460 * m + 7: a1
1461 *
1462 * By setting the high bit of the MRF register number, we indicate
1463 * that we want COMPR4 mode - instead of doing the usual destination
1464 * + 1 for the second half we get destination + 4.
1465 */
1466 brw_MOV(p,
1467 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1468 arg0[channel]);
1469 } else {
1470 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1471 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1472 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1473 brw_MOV(p,
1474 brw_message_reg(nr + channel),
1475 arg0[channel]);
1476
1477 if (c->dispatch_width == 16) {
1478 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1479 brw_MOV(p,
1480 brw_message_reg(nr + channel + 4),
1481 sechalf(arg0[channel]));
1482 }
1483 }
1484 }
1485 /* skip over the regs populated above:
1486 */
1487 if (c->dispatch_width == 16)
1488 nr += 8;
1489 else
1490 nr += 4;
1491
1492 brw_pop_insn_state(p);
1493
1494 if (c->key.source_depth_to_render_target)
1495 {
1496 if (c->key.computes_depth)
1497 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1498 else
1499 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1500
1501 nr += 2;
1502 }
1503
1504 if (c->key.dest_depth_reg)
1505 {
1506 GLuint comp = c->key.dest_depth_reg / 2;
1507 GLuint off = c->key.dest_depth_reg % 2;
1508
1509 if (off != 0) {
1510 brw_push_insn_state(p);
1511 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1512
1513 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1514 /* 2nd half? */
1515 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1516 brw_pop_insn_state(p);
1517 }
1518 else {
1519 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1520 }
1521 nr += 2;
1522 }
1523
1524 if (intel->gen >= 6) {
1525 /* Subtract off the message header, since we send headerless. */
1526 nr -= 2;
1527 }
1528
1529 if (!c->key.runtime_check_aads_emit) {
1530 if (c->key.aa_dest_stencil_reg)
1531 emit_aa(c, arg1, 2);
1532
1533 fire_fb_write(c, base_reg, nr, target, eot);
1534 }
1535 else {
1536 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1537 struct brw_reg ip = brw_ip_reg();
1538 struct brw_instruction *jmp;
1539
1540 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1541 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1542 brw_AND(p,
1543 v1_null_ud,
1544 get_element_ud(brw_vec8_grf(1,0), 6),
1545 brw_imm_ud(1<<26));
1546
1547 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1548 {
1549 emit_aa(c, arg1, 2);
1550 fire_fb_write(c, 0, nr, target, eot);
1551 /* note - thread killed in subroutine */
1552 }
1553 brw_land_fwd_jump(p, jmp);
1554
1555 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1556 */
1557 fire_fb_write(c, 1, nr-1, target, eot);
1558 }
1559 }
1560
1561 /**
1562 * Move a GPR to scratch memory.
1563 */
1564 static void emit_spill( struct brw_wm_compile *c,
1565 struct brw_reg reg,
1566 GLuint slot )
1567 {
1568 struct brw_compile *p = &c->func;
1569
1570 /*
1571 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1572 */
1573 brw_MOV(p, brw_message_reg(2), reg);
1574
1575 /*
1576 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1577 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1578 */
1579 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1580 }
1581
1582
1583 /**
1584 * Load a GPR from scratch memory.
1585 */
1586 static void emit_unspill( struct brw_wm_compile *c,
1587 struct brw_reg reg,
1588 GLuint slot )
1589 {
1590 struct brw_compile *p = &c->func;
1591
1592 /* Slot 0 is the undef value.
1593 */
1594 if (slot == 0) {
1595 brw_MOV(p, reg, brw_imm_f(0));
1596 return;
1597 }
1598
1599 /*
1600 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1601 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1602 */
1603
1604 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1605 }
1606
1607
1608 /**
1609 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1610 * Args with unspill_reg != 0 will be loaded from scratch memory.
1611 */
1612 static void get_argument_regs( struct brw_wm_compile *c,
1613 struct brw_wm_ref *arg[],
1614 struct brw_reg *regs )
1615 {
1616 GLuint i;
1617
1618 for (i = 0; i < 4; i++) {
1619 if (arg[i]) {
1620 if (arg[i]->unspill_reg)
1621 emit_unspill(c,
1622 brw_vec8_grf(arg[i]->unspill_reg, 0),
1623 arg[i]->value->spill_slot);
1624
1625 regs[i] = arg[i]->hw_reg;
1626 }
1627 else {
1628 regs[i] = brw_null_reg();
1629 }
1630 }
1631 }
1632
1633
1634 /**
1635 * For values that have a spill_slot!=0, write those regs to scratch memory.
1636 */
1637 static void spill_values( struct brw_wm_compile *c,
1638 struct brw_wm_value *values,
1639 GLuint nr )
1640 {
1641 GLuint i;
1642
1643 for (i = 0; i < nr; i++)
1644 if (values[i].spill_slot)
1645 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1646 }
1647
1648
1649 /* Emit the fragment program instructions here.
1650 */
1651 void brw_wm_emit( struct brw_wm_compile *c )
1652 {
1653 struct brw_compile *p = &c->func;
1654 struct intel_context *intel = &p->brw->intel;
1655 GLuint insn;
1656
1657 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1658 if (intel->gen >= 6)
1659 brw_set_acc_write_control(p, 1);
1660
1661 /* Check if any of the payload regs need to be spilled:
1662 */
1663 spill_values(c, c->payload.depth, 4);
1664 spill_values(c, c->creg, c->nr_creg);
1665 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1666
1667
1668 for (insn = 0; insn < c->nr_insns; insn++) {
1669
1670 struct brw_wm_instruction *inst = &c->instruction[insn];
1671 struct brw_reg args[3][4], dst[4];
1672 GLuint i, dst_flags;
1673
1674 /* Get argument regs:
1675 */
1676 for (i = 0; i < 3; i++)
1677 get_argument_regs(c, inst->src[i], args[i]);
1678
1679 /* Get dest regs:
1680 */
1681 for (i = 0; i < 4; i++)
1682 if (inst->dst[i])
1683 dst[i] = inst->dst[i]->hw_reg;
1684 else
1685 dst[i] = brw_null_reg();
1686
1687 /* Flags
1688 */
1689 dst_flags = inst->writemask;
1690 if (inst->saturate)
1691 dst_flags |= SATURATE;
1692
1693 switch (inst->opcode) {
1694 /* Generated instructions for calculating triangle interpolants:
1695 */
1696 case WM_PIXELXY:
1697 emit_pixel_xy(c, dst, dst_flags);
1698 break;
1699
1700 case WM_DELTAXY:
1701 emit_delta_xy(p, dst, dst_flags, args[0]);
1702 break;
1703
1704 case WM_WPOSXY:
1705 emit_wpos_xy(c, dst, dst_flags, args[0]);
1706 break;
1707
1708 case WM_PIXELW:
1709 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1710 break;
1711
1712 case WM_LINTERP:
1713 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1714 break;
1715
1716 case WM_PINTERP:
1717 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1718 break;
1719
1720 case WM_CINTERP:
1721 emit_cinterp(p, dst, dst_flags, args[0]);
1722 break;
1723
1724 case WM_FB_WRITE:
1725 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1726 break;
1727
1728 case WM_FRONTFACING:
1729 emit_frontfacing(p, dst, dst_flags);
1730 break;
1731
1732 /* Straightforward arithmetic:
1733 */
1734 case OPCODE_ADD:
1735 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1736 break;
1737
1738 case OPCODE_FRC:
1739 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1740 break;
1741
1742 case OPCODE_FLR:
1743 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1744 break;
1745
1746 case OPCODE_DDX:
1747 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1748 break;
1749
1750 case OPCODE_DDY:
1751 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1752 break;
1753
1754 case OPCODE_DP2:
1755 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1756 break;
1757
1758 case OPCODE_DP3:
1759 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1760 break;
1761
1762 case OPCODE_DP4:
1763 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1764 break;
1765
1766 case OPCODE_DPH:
1767 emit_dph(p, dst, dst_flags, args[0], args[1]);
1768 break;
1769
1770 case OPCODE_TRUNC:
1771 for (i = 0; i < 4; i++) {
1772 if (dst_flags & (1<<i)) {
1773 brw_RNDZ(p, dst[i], args[0][i]);
1774 }
1775 }
1776 break;
1777
1778 case OPCODE_LRP:
1779 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1780 break;
1781
1782 case OPCODE_MAD:
1783 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1784 break;
1785
1786 case OPCODE_MOV:
1787 case OPCODE_SWZ:
1788 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1789 break;
1790
1791 case OPCODE_MUL:
1792 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1793 break;
1794
1795 case OPCODE_XPD:
1796 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1797 break;
1798
1799 /* Higher math functions:
1800 */
1801 case OPCODE_RCP:
1802 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1803 break;
1804
1805 case OPCODE_RSQ:
1806 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1807 break;
1808
1809 case OPCODE_SIN:
1810 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1811 break;
1812
1813 case OPCODE_COS:
1814 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1815 break;
1816
1817 case OPCODE_EX2:
1818 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1819 break;
1820
1821 case OPCODE_LG2:
1822 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1823 break;
1824
1825 case OPCODE_SCS:
1826 /* There is an scs math function, but it would need some
1827 * fixup for 16-element execution.
1828 */
1829 if (dst_flags & WRITEMASK_X)
1830 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1831 if (dst_flags & WRITEMASK_Y)
1832 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833 break;
1834
1835 case OPCODE_POW:
1836 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1837 break;
1838
1839 /* Comparisons:
1840 */
1841 case OPCODE_CMP:
1842 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1843 break;
1844
1845 case OPCODE_MAX:
1846 emit_max(p, dst, dst_flags, args[0], args[1]);
1847 break;
1848
1849 case OPCODE_MIN:
1850 emit_min(p, dst, dst_flags, args[0], args[1]);
1851 break;
1852
1853 case OPCODE_SLT:
1854 emit_slt(p, dst, dst_flags, args[0], args[1]);
1855 break;
1856
1857 case OPCODE_SLE:
1858 emit_sle(p, dst, dst_flags, args[0], args[1]);
1859 break;
1860 case OPCODE_SGT:
1861 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1862 break;
1863 case OPCODE_SGE:
1864 emit_sge(p, dst, dst_flags, args[0], args[1]);
1865 break;
1866 case OPCODE_SEQ:
1867 emit_seq(p, dst, dst_flags, args[0], args[1]);
1868 break;
1869 case OPCODE_SNE:
1870 emit_sne(p, dst, dst_flags, args[0], args[1]);
1871 break;
1872
1873 case OPCODE_SSG:
1874 emit_sign(p, dst, dst_flags, args[0]);
1875 break;
1876
1877 case OPCODE_LIT:
1878 emit_lit(c, dst, dst_flags, args[0]);
1879 break;
1880
1881 /* Texturing operations:
1882 */
1883 case OPCODE_TEX:
1884 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1885 inst->tex_idx, inst->tex_unit,
1886 inst->tex_shadow);
1887 break;
1888
1889 case OPCODE_TXB:
1890 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1891 inst->tex_idx, inst->tex_unit);
1892 break;
1893
1894 case OPCODE_KIL:
1895 emit_kil(c, args[0]);
1896 break;
1897
1898 case OPCODE_KIL_NV:
1899 emit_kil_nv(c);
1900 break;
1901
1902 default:
1903 printf("Unsupported opcode %i (%s) in fragment shader\n",
1904 inst->opcode, inst->opcode < MAX_OPCODE ?
1905 _mesa_opcode_string(inst->opcode) :
1906 "unknown");
1907 }
1908
1909 for (i = 0; i < 4; i++)
1910 if (inst->dst[i] && inst->dst[i]->spill_slot)
1911 emit_spill(c,
1912 inst->dst[i]->hw_reg,
1913 inst->dst[i]->spill_slot);
1914 }
1915
1916 /* Only properly tested on ILK */
1917 if (p->brw->intel.gen == 5) {
1918 brw_remove_duplicate_mrf_moves(p);
1919 if (c->dispatch_width == 16)
1920 brw_remove_grf_to_mrf_moves(p);
1921 }
1922
1923 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1924 int i;
1925
1926 printf("wm-native:\n");
1927 for (i = 0; i < p->nr_insn; i++)
1928 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1929 printf("\n");
1930 }
1931 }
1932