Merge remote branch 'vdpau/pipe-video' into pipe-video
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return the SrcReg index of the channels that can be immediate float operands
65 * instead of usage of PROGRAM_CONSTANT values through push/pull.
66 */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_SWZ] = 1,
87 [OPCODE_XPD] = 2,
88 };
89
90 /* These opcodes get broken down in a way that allow two
91 * args to be immediates.
92 */
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
95 return GL_TRUE;
96 }
97
98 if (opcode > ARRAY_SIZE(opcode_array))
99 return GL_FALSE;
100
101 return arg == opcode_array[opcode] - 1;
102 }
103
104 /**
105 * Computes the screen-space x,y position of the pixels.
106 *
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes..
109 *
110 * Payload R0:
111 *
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
114 * R0.1..8 -- ?
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
121 * R1.6 -- ?
122 * R1.7 -- ?
123 * R1.8 -- ?
124 */
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
127 GLuint mask)
128 {
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
133
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
136
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
140 } else {
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
143 }
144
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
147 */
148 if (mask & WRITEMASK_X) {
149 brw_ADD(p,
150 dst0_uw,
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
153 }
154
155 if (mask & WRITEMASK_Y) {
156 brw_ADD(p,
157 dst1_uw,
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
160 }
161 brw_pop_insn_state(p);
162 }
163
164 /**
165 * Computes the screen-space x,y distance of the pixels from the start
166 * vertex.
167 *
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
171 */
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
174 GLuint mask,
175 const struct brw_reg *arg0)
176 {
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
179
180 if (mask == 0)
181 return;
182
183 assert(mask == WRITEMASK_XY);
184
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187 Just add them with 0.0 for dst reg.. */
188 r1 = brw_imm_v(0x00000000);
189 brw_ADD(p,
190 dst[0],
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
192 r1);
193 brw_ADD(p,
194 dst[1],
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
196 r1);
197 return;
198 }
199
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
202 */
203 brw_ADD(p,
204 dst[0],
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
206 negate(r1));
207 brw_ADD(p,
208 dst[1],
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
211 }
212
213 /**
214 * Computes the pixel offset from the window origin for gl_FragCoord().
215 */
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
218 GLuint mask,
219 const struct brw_reg *arg0)
220 {
221 struct brw_compile *p = &c->func;
222 struct intel_context *intel = &p->brw->intel;
223 struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
224 struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
225
226 if (mask & WRITEMASK_X) {
227 if (intel->gen >= 6) {
228 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
229 brw_MOV(p, delta_x_f, delta_x);
230 delta_x = delta_x_f;
231 }
232
233 if (c->fp->program.PixelCenterInteger) {
234 /* X' = X */
235 brw_MOV(p, dst[0], delta_x);
236 } else {
237 /* X' = X + 0.5 */
238 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
239 }
240 }
241
242 if (mask & WRITEMASK_Y) {
243 if (intel->gen >= 6) {
244 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
245 brw_MOV(p, delta_y_f, delta_y);
246 delta_y = delta_y_f;
247 }
248
249 if (c->fp->program.OriginUpperLeft) {
250 if (c->fp->program.PixelCenterInteger) {
251 /* Y' = Y */
252 brw_MOV(p, dst[1], delta_y);
253 } else {
254 brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
255 }
256 } else {
257 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
258
259 /* Y' = (height - 1) - Y + center */
260 brw_ADD(p, dst[1], negate(delta_y),
261 brw_imm_f(c->key.drawable_height - 1 + center_offset));
262 }
263 }
264 }
265
266
267 void emit_pixel_w(struct brw_wm_compile *c,
268 const struct brw_reg *dst,
269 GLuint mask,
270 const struct brw_reg *arg0,
271 const struct brw_reg *deltas)
272 {
273 struct brw_compile *p = &c->func;
274 struct intel_context *intel = &p->brw->intel;
275 struct brw_reg src;
276 struct brw_reg temp_dst;
277
278 if (intel->gen >= 6)
279 temp_dst = dst[3];
280 else
281 temp_dst = brw_message_reg(2);
282
283 assert(intel->gen < 6);
284
285 /* Don't need this if all you are doing is interpolating color, for
286 * instance.
287 */
288 if (mask & WRITEMASK_W) {
289 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
290
291 /* Calc 1/w - just linterp wpos[3] optimized by putting the
292 * result straight into a message reg.
293 */
294 if (can_do_pln(intel, deltas)) {
295 brw_PLN(p, temp_dst, interp3, deltas[0]);
296 } else {
297 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
298 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
299 }
300
301 /* Calc w */
302 if (intel->gen >= 6)
303 src = temp_dst;
304 else
305 src = brw_null_reg();
306
307 if (c->dispatch_width == 16) {
308 brw_math_16(p, dst[3],
309 BRW_MATH_FUNCTION_INV,
310 BRW_MATH_SATURATE_NONE,
311 2, src,
312 BRW_MATH_PRECISION_FULL);
313 } else {
314 brw_math(p, dst[3],
315 BRW_MATH_FUNCTION_INV,
316 BRW_MATH_SATURATE_NONE,
317 2, src,
318 BRW_MATH_DATA_VECTOR,
319 BRW_MATH_PRECISION_FULL);
320 }
321 }
322 }
323
324 void emit_linterp(struct brw_compile *p,
325 const struct brw_reg *dst,
326 GLuint mask,
327 const struct brw_reg *arg0,
328 const struct brw_reg *deltas)
329 {
330 struct intel_context *intel = &p->brw->intel;
331 struct brw_reg interp[4];
332 GLuint nr = arg0[0].nr;
333 GLuint i;
334
335 interp[0] = brw_vec1_grf(nr, 0);
336 interp[1] = brw_vec1_grf(nr, 4);
337 interp[2] = brw_vec1_grf(nr+1, 0);
338 interp[3] = brw_vec1_grf(nr+1, 4);
339
340 for (i = 0; i < 4; i++) {
341 if (mask & (1<<i)) {
342 if (intel->gen >= 6) {
343 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
344 } else if (can_do_pln(intel, deltas)) {
345 brw_PLN(p, dst[i], interp[i], deltas[0]);
346 } else {
347 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
348 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
349 }
350 }
351 }
352 }
353
354
355 void emit_pinterp(struct brw_compile *p,
356 const struct brw_reg *dst,
357 GLuint mask,
358 const struct brw_reg *arg0,
359 const struct brw_reg *deltas,
360 const struct brw_reg *w)
361 {
362 struct intel_context *intel = &p->brw->intel;
363 struct brw_reg interp[4];
364 GLuint nr = arg0[0].nr;
365 GLuint i;
366
367 if (intel->gen >= 6) {
368 emit_linterp(p, dst, mask, arg0, interp);
369 return;
370 }
371
372 interp[0] = brw_vec1_grf(nr, 0);
373 interp[1] = brw_vec1_grf(nr, 4);
374 interp[2] = brw_vec1_grf(nr+1, 0);
375 interp[3] = brw_vec1_grf(nr+1, 4);
376
377 for (i = 0; i < 4; i++) {
378 if (mask & (1<<i)) {
379 if (can_do_pln(intel, deltas)) {
380 brw_PLN(p, dst[i], interp[i], deltas[0]);
381 } else {
382 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
383 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
384 }
385 }
386 }
387 for (i = 0; i < 4; i++) {
388 if (mask & (1<<i)) {
389 brw_MUL(p, dst[i], dst[i], w[3]);
390 }
391 }
392 }
393
394
395 void emit_cinterp(struct brw_compile *p,
396 const struct brw_reg *dst,
397 GLuint mask,
398 const struct brw_reg *arg0)
399 {
400 struct brw_reg interp[4];
401 GLuint nr = arg0[0].nr;
402 GLuint i;
403
404 interp[0] = brw_vec1_grf(nr, 0);
405 interp[1] = brw_vec1_grf(nr, 4);
406 interp[2] = brw_vec1_grf(nr+1, 0);
407 interp[3] = brw_vec1_grf(nr+1, 4);
408
409 for (i = 0; i < 4; i++) {
410 if (mask & (1<<i)) {
411 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
412 }
413 }
414 }
415
416 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
417 void emit_frontfacing(struct brw_compile *p,
418 const struct brw_reg *dst,
419 GLuint mask)
420 {
421 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
422 GLuint i;
423
424 if (!(mask & WRITEMASK_XYZW))
425 return;
426
427 for (i = 0; i < 4; i++) {
428 if (mask & (1<<i)) {
429 brw_MOV(p, dst[i], brw_imm_f(0.0));
430 }
431 }
432
433 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
434 * us front face
435 */
436 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
437 for (i = 0; i < 4; i++) {
438 if (mask & (1<<i)) {
439 brw_MOV(p, dst[i], brw_imm_f(1.0));
440 }
441 }
442 brw_set_predicate_control_flag_value(p, 0xff);
443 }
444
445 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
446 * looking like:
447 *
448 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
449 *
450 * and we're trying to produce:
451 *
452 * DDX DDY
453 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
454 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
455 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
456 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
457 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
458 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
459 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
460 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
461 *
462 * and add another set of two more subspans if in 16-pixel dispatch mode.
463 *
464 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
465 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
466 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
467 * between each other. We could probably do it like ddx and swizzle the right
468 * order later, but bail for now and just produce
469 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
470 */
471 void emit_ddxy(struct brw_compile *p,
472 const struct brw_reg *dst,
473 GLuint mask,
474 GLboolean is_ddx,
475 const struct brw_reg *arg0)
476 {
477 int i;
478 struct brw_reg src0, src1;
479
480 if (mask & SATURATE)
481 brw_set_saturate(p, 1);
482 for (i = 0; i < 4; i++ ) {
483 if (mask & (1<<i)) {
484 if (is_ddx) {
485 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
486 BRW_REGISTER_TYPE_F,
487 BRW_VERTICAL_STRIDE_2,
488 BRW_WIDTH_2,
489 BRW_HORIZONTAL_STRIDE_0,
490 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
491 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
492 BRW_REGISTER_TYPE_F,
493 BRW_VERTICAL_STRIDE_2,
494 BRW_WIDTH_2,
495 BRW_HORIZONTAL_STRIDE_0,
496 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
497 } else {
498 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
499 BRW_REGISTER_TYPE_F,
500 BRW_VERTICAL_STRIDE_4,
501 BRW_WIDTH_4,
502 BRW_HORIZONTAL_STRIDE_0,
503 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
504 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
505 BRW_REGISTER_TYPE_F,
506 BRW_VERTICAL_STRIDE_4,
507 BRW_WIDTH_4,
508 BRW_HORIZONTAL_STRIDE_0,
509 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
510 }
511 brw_ADD(p, dst[i], src0, negate(src1));
512 }
513 }
514 if (mask & SATURATE)
515 brw_set_saturate(p, 0);
516 }
517
518 void emit_alu1(struct brw_compile *p,
519 struct brw_instruction *(*func)(struct brw_compile *,
520 struct brw_reg,
521 struct brw_reg),
522 const struct brw_reg *dst,
523 GLuint mask,
524 const struct brw_reg *arg0)
525 {
526 GLuint i;
527
528 if (mask & SATURATE)
529 brw_set_saturate(p, 1);
530
531 for (i = 0; i < 4; i++) {
532 if (mask & (1<<i)) {
533 func(p, dst[i], arg0[i]);
534 }
535 }
536
537 if (mask & SATURATE)
538 brw_set_saturate(p, 0);
539 }
540
541
542 void emit_alu2(struct brw_compile *p,
543 struct brw_instruction *(*func)(struct brw_compile *,
544 struct brw_reg,
545 struct brw_reg,
546 struct brw_reg),
547 const struct brw_reg *dst,
548 GLuint mask,
549 const struct brw_reg *arg0,
550 const struct brw_reg *arg1)
551 {
552 GLuint i;
553
554 if (mask & SATURATE)
555 brw_set_saturate(p, 1);
556
557 for (i = 0; i < 4; i++) {
558 if (mask & (1<<i)) {
559 func(p, dst[i], arg0[i], arg1[i]);
560 }
561 }
562
563 if (mask & SATURATE)
564 brw_set_saturate(p, 0);
565 }
566
567
568 void emit_mad(struct brw_compile *p,
569 const struct brw_reg *dst,
570 GLuint mask,
571 const struct brw_reg *arg0,
572 const struct brw_reg *arg1,
573 const struct brw_reg *arg2)
574 {
575 GLuint i;
576
577 for (i = 0; i < 4; i++) {
578 if (mask & (1<<i)) {
579 brw_MUL(p, dst[i], arg0[i], arg1[i]);
580
581 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
582 brw_ADD(p, dst[i], dst[i], arg2[i]);
583 brw_set_saturate(p, 0);
584 }
585 }
586 }
587
588 void emit_lrp(struct brw_compile *p,
589 const struct brw_reg *dst,
590 GLuint mask,
591 const struct brw_reg *arg0,
592 const struct brw_reg *arg1,
593 const struct brw_reg *arg2)
594 {
595 GLuint i;
596
597 /* Uses dst as a temporary:
598 */
599 for (i = 0; i < 4; i++) {
600 if (mask & (1<<i)) {
601 /* Can I use the LINE instruction for this?
602 */
603 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
604 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
605
606 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
607 brw_MAC(p, dst[i], arg0[i], arg1[i]);
608 brw_set_saturate(p, 0);
609 }
610 }
611 }
612
613 void emit_sop(struct brw_compile *p,
614 const struct brw_reg *dst,
615 GLuint mask,
616 GLuint cond,
617 const struct brw_reg *arg0,
618 const struct brw_reg *arg1)
619 {
620 GLuint i;
621
622 for (i = 0; i < 4; i++) {
623 if (mask & (1<<i)) {
624 brw_push_insn_state(p);
625 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
626 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
627 brw_MOV(p, dst[i], brw_imm_f(0));
628 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
629 brw_MOV(p, dst[i], brw_imm_f(1.0));
630 brw_pop_insn_state(p);
631 }
632 }
633 }
634
635 static void emit_slt( struct brw_compile *p,
636 const struct brw_reg *dst,
637 GLuint mask,
638 const struct brw_reg *arg0,
639 const struct brw_reg *arg1 )
640 {
641 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
642 }
643
644 static void emit_sle( struct brw_compile *p,
645 const struct brw_reg *dst,
646 GLuint mask,
647 const struct brw_reg *arg0,
648 const struct brw_reg *arg1 )
649 {
650 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
651 }
652
653 static void emit_sgt( struct brw_compile *p,
654 const struct brw_reg *dst,
655 GLuint mask,
656 const struct brw_reg *arg0,
657 const struct brw_reg *arg1 )
658 {
659 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
660 }
661
662 static void emit_sge( struct brw_compile *p,
663 const struct brw_reg *dst,
664 GLuint mask,
665 const struct brw_reg *arg0,
666 const struct brw_reg *arg1 )
667 {
668 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
669 }
670
671 static void emit_seq( struct brw_compile *p,
672 const struct brw_reg *dst,
673 GLuint mask,
674 const struct brw_reg *arg0,
675 const struct brw_reg *arg1 )
676 {
677 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
678 }
679
680 static void emit_sne( struct brw_compile *p,
681 const struct brw_reg *dst,
682 GLuint mask,
683 const struct brw_reg *arg0,
684 const struct brw_reg *arg1 )
685 {
686 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
687 }
688
689 void emit_cmp(struct brw_compile *p,
690 const struct brw_reg *dst,
691 GLuint mask,
692 const struct brw_reg *arg0,
693 const struct brw_reg *arg1,
694 const struct brw_reg *arg2)
695 {
696 GLuint i;
697
698 for (i = 0; i < 4; i++) {
699 if (mask & (1<<i)) {
700 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
701
702 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
703 brw_SEL(p, dst[i], arg1[i], arg2[i]);
704 brw_set_saturate(p, 0);
705 brw_set_predicate_control_flag_value(p, 0xff);
706 }
707 }
708 }
709
710 void emit_sign(struct brw_compile *p,
711 const struct brw_reg *dst,
712 GLuint mask,
713 const struct brw_reg *arg0)
714 {
715 GLuint i;
716
717 for (i = 0; i < 4; i++) {
718 if (mask & (1<<i)) {
719 brw_MOV(p, dst[i], brw_imm_f(0.0));
720
721 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
722 brw_MOV(p, dst[i], brw_imm_f(-1.0));
723 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
724
725 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
726 brw_MOV(p, dst[i], brw_imm_f(1.0));
727 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
728 }
729 }
730 }
731
732 void emit_max(struct brw_compile *p,
733 const struct brw_reg *dst,
734 GLuint mask,
735 const struct brw_reg *arg0,
736 const struct brw_reg *arg1)
737 {
738 GLuint i;
739
740 for (i = 0; i < 4; i++) {
741 if (mask & (1<<i)) {
742 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
743
744 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
745 brw_SEL(p, dst[i], arg0[i], arg1[i]);
746 brw_set_saturate(p, 0);
747 brw_set_predicate_control_flag_value(p, 0xff);
748 }
749 }
750 }
751
752 void emit_min(struct brw_compile *p,
753 const struct brw_reg *dst,
754 GLuint mask,
755 const struct brw_reg *arg0,
756 const struct brw_reg *arg1)
757 {
758 GLuint i;
759
760 for (i = 0; i < 4; i++) {
761 if (mask & (1<<i)) {
762 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
763
764 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
765 brw_SEL(p, dst[i], arg0[i], arg1[i]);
766 brw_set_saturate(p, 0);
767 brw_set_predicate_control_flag_value(p, 0xff);
768 }
769 }
770 }
771
772
773 void emit_dp2(struct brw_compile *p,
774 const struct brw_reg *dst,
775 GLuint mask,
776 const struct brw_reg *arg0,
777 const struct brw_reg *arg1)
778 {
779 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
780
781 if (!(mask & WRITEMASK_XYZW))
782 return; /* Do not emit dead code */
783
784 assert(is_power_of_two(mask & WRITEMASK_XYZW));
785
786 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
787
788 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
789 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
790 brw_set_saturate(p, 0);
791 }
792
793
794 void emit_dp3(struct brw_compile *p,
795 const struct brw_reg *dst,
796 GLuint mask,
797 const struct brw_reg *arg0,
798 const struct brw_reg *arg1)
799 {
800 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
801
802 if (!(mask & WRITEMASK_XYZW))
803 return; /* Do not emit dead code */
804
805 assert(is_power_of_two(mask & WRITEMASK_XYZW));
806
807 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
808 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
809
810 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
811 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
812 brw_set_saturate(p, 0);
813 }
814
815
816 void emit_dp4(struct brw_compile *p,
817 const struct brw_reg *dst,
818 GLuint mask,
819 const struct brw_reg *arg0,
820 const struct brw_reg *arg1)
821 {
822 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
823
824 if (!(mask & WRITEMASK_XYZW))
825 return; /* Do not emit dead code */
826
827 assert(is_power_of_two(mask & WRITEMASK_XYZW));
828
829 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
830 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
831 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
832
833 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
834 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
835 brw_set_saturate(p, 0);
836 }
837
838
839 void emit_dph(struct brw_compile *p,
840 const struct brw_reg *dst,
841 GLuint mask,
842 const struct brw_reg *arg0,
843 const struct brw_reg *arg1)
844 {
845 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
846
847 if (!(mask & WRITEMASK_XYZW))
848 return; /* Do not emit dead code */
849
850 assert(is_power_of_two(mask & WRITEMASK_XYZW));
851
852 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
853 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
854 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
855
856 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
857 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
858 brw_set_saturate(p, 0);
859 }
860
861
862 void emit_xpd(struct brw_compile *p,
863 const struct brw_reg *dst,
864 GLuint mask,
865 const struct brw_reg *arg0,
866 const struct brw_reg *arg1)
867 {
868 GLuint i;
869
870 assert((mask & WRITEMASK_W) != WRITEMASK_W);
871
872 for (i = 0 ; i < 3; i++) {
873 if (mask & (1<<i)) {
874 GLuint i2 = (i+2)%3;
875 GLuint i1 = (i+1)%3;
876
877 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
878
879 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
880 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
881 brw_set_saturate(p, 0);
882 }
883 }
884 }
885
886
887 void emit_math1(struct brw_wm_compile *c,
888 GLuint function,
889 const struct brw_reg *dst,
890 GLuint mask,
891 const struct brw_reg *arg0)
892 {
893 struct brw_compile *p = &c->func;
894 struct intel_context *intel = &p->brw->intel;
895 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
896 GLuint saturate = ((mask & SATURATE) ?
897 BRW_MATH_SATURATE_SATURATE :
898 BRW_MATH_SATURATE_NONE);
899 struct brw_reg src;
900
901 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
902 arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
903 arg0[0].negate || arg0[0].abs)) {
904 /* Gen6 math requires that source and dst horizontal stride be 1,
905 * and that the argument be in the GRF.
906 *
907 * The hardware ignores source modifiers (negate and abs) on math
908 * instructions, so we also move to a temp to set those up.
909 */
910 src = dst[dst_chan];
911 brw_MOV(p, src, arg0[0]);
912 } else {
913 src = arg0[0];
914 }
915
916 if (!(mask & WRITEMASK_XYZW))
917 return; /* Do not emit dead code */
918
919 assert(is_power_of_two(mask & WRITEMASK_XYZW));
920
921 /* Send two messages to perform all 16 operations:
922 */
923 brw_push_insn_state(p);
924 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
925 brw_math(p,
926 dst[dst_chan],
927 function,
928 saturate,
929 2,
930 src,
931 BRW_MATH_DATA_VECTOR,
932 BRW_MATH_PRECISION_FULL);
933
934 if (c->dispatch_width == 16) {
935 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
936 brw_math(p,
937 offset(dst[dst_chan],1),
938 function,
939 saturate,
940 3,
941 sechalf(src),
942 BRW_MATH_DATA_VECTOR,
943 BRW_MATH_PRECISION_FULL);
944 }
945 brw_pop_insn_state(p);
946 }
947
948
949 void emit_math2(struct brw_wm_compile *c,
950 GLuint function,
951 const struct brw_reg *dst,
952 GLuint mask,
953 const struct brw_reg *arg0,
954 const struct brw_reg *arg1)
955 {
956 struct brw_compile *p = &c->func;
957 struct intel_context *intel = &p->brw->intel;
958 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
959
960 if (!(mask & WRITEMASK_XYZW))
961 return; /* Do not emit dead code */
962
963 assert(is_power_of_two(mask & WRITEMASK_XYZW));
964
965 brw_push_insn_state(p);
966
967 /* math can only operate on up to a vec8 at a time, so in
968 * dispatch_width==16 we have to do the second half manually.
969 */
970 if (intel->gen >= 6) {
971 struct brw_reg src0 = arg0[0];
972 struct brw_reg src1 = arg1[0];
973 struct brw_reg temp_dst = dst[dst_chan];
974
975 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
976 brw_MOV(p, temp_dst, src0);
977 src0 = temp_dst;
978 }
979
980 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
981 /* This is a heinous hack to get a temporary register for use
982 * in case both arg0 and arg1 are constants. Why you're
983 * doing exponentiation on constant values in the shader, we
984 * don't know.
985 *
986 * max_wm_grf is almost surely less than the maximum GRF, and
987 * gen6 doesn't care about the number of GRFs used in a
988 * shader like pre-gen6 did.
989 */
990 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
991 brw_MOV(p, temp, src1);
992 src1 = temp;
993 }
994
995 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
996 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
997 brw_math2(p,
998 temp_dst,
999 function,
1000 src0,
1001 src1);
1002 if (c->dispatch_width == 16) {
1003 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1004 brw_math2(p,
1005 sechalf(temp_dst),
1006 function,
1007 sechalf(src0),
1008 sechalf(src1));
1009 }
1010 } else {
1011 GLuint saturate = ((mask & SATURATE) ?
1012 BRW_MATH_SATURATE_SATURATE :
1013 BRW_MATH_SATURATE_NONE);
1014
1015 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1016 brw_MOV(p, brw_message_reg(3), arg1[0]);
1017 if (c->dispatch_width == 16) {
1018 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1019 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1020 }
1021
1022 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1023 brw_math(p,
1024 dst[dst_chan],
1025 function,
1026 saturate,
1027 2,
1028 arg0[0],
1029 BRW_MATH_DATA_VECTOR,
1030 BRW_MATH_PRECISION_FULL);
1031
1032 /* Send two messages to perform all 16 operations:
1033 */
1034 if (c->dispatch_width == 16) {
1035 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1036 brw_math(p,
1037 offset(dst[dst_chan],1),
1038 function,
1039 saturate,
1040 4,
1041 sechalf(arg0[0]),
1042 BRW_MATH_DATA_VECTOR,
1043 BRW_MATH_PRECISION_FULL);
1044 }
1045 }
1046 brw_pop_insn_state(p);
1047 }
1048
1049
1050 void emit_tex(struct brw_wm_compile *c,
1051 struct brw_reg *dst,
1052 GLuint dst_flags,
1053 struct brw_reg *arg,
1054 struct brw_reg depth_payload,
1055 GLuint tex_idx,
1056 GLuint sampler,
1057 GLboolean shadow)
1058 {
1059 struct brw_compile *p = &c->func;
1060 struct intel_context *intel = &p->brw->intel;
1061 struct brw_reg dst_retyped;
1062 GLuint cur_mrf = 2, response_length;
1063 GLuint i, nr_texcoords;
1064 GLuint emit;
1065 GLuint msg_type;
1066 GLuint mrf_per_channel;
1067 GLuint simd_mode;
1068
1069 if (c->dispatch_width == 16) {
1070 mrf_per_channel = 2;
1071 response_length = 8;
1072 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1073 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1074 } else {
1075 mrf_per_channel = 1;
1076 response_length = 4;
1077 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1078 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1079 }
1080
1081 /* How many input regs are there?
1082 */
1083 switch (tex_idx) {
1084 case TEXTURE_1D_INDEX:
1085 emit = WRITEMASK_X;
1086 nr_texcoords = 1;
1087 break;
1088 case TEXTURE_2D_INDEX:
1089 case TEXTURE_RECT_INDEX:
1090 emit = WRITEMASK_XY;
1091 nr_texcoords = 2;
1092 break;
1093 case TEXTURE_3D_INDEX:
1094 case TEXTURE_CUBE_INDEX:
1095 emit = WRITEMASK_XYZ;
1096 nr_texcoords = 3;
1097 break;
1098 default:
1099 /* unexpected target */
1100 abort();
1101 }
1102
1103 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1104 if (intel->gen < 5 && c->dispatch_width == 8)
1105 nr_texcoords = 3;
1106
1107 /* For shadow comparisons, we have to supply u,v,r. */
1108 if (shadow)
1109 nr_texcoords = 3;
1110
1111 /* Emit the texcoords. */
1112 for (i = 0; i < nr_texcoords; i++) {
1113 if (emit & (1<<i))
1114 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1115 else
1116 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1117 cur_mrf += mrf_per_channel;
1118 }
1119
1120 /* Fill in the shadow comparison reference value. */
1121 if (shadow) {
1122 if (intel->gen >= 5) {
1123 /* Fill in the cube map array index value. */
1124 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1125 cur_mrf += mrf_per_channel;
1126 } else if (c->dispatch_width == 8) {
1127 /* Fill in the LOD bias value. */
1128 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1129 cur_mrf += mrf_per_channel;
1130 }
1131 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1132 cur_mrf += mrf_per_channel;
1133 }
1134
1135 if (intel->gen >= 5) {
1136 if (shadow)
1137 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1138 else
1139 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1140 } else {
1141 /* Note that G45 and older determines shadow compare and dispatch width
1142 * from message length for most messages.
1143 */
1144 if (c->dispatch_width == 16 && shadow)
1145 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1146 else
1147 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1148 }
1149
1150 brw_SAMPLE(p,
1151 dst_retyped,
1152 1,
1153 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1154 SURF_INDEX_TEXTURE(sampler),
1155 sampler,
1156 dst_flags & WRITEMASK_XYZW,
1157 msg_type,
1158 response_length,
1159 cur_mrf - 1,
1160 0,
1161 1,
1162 simd_mode);
1163 }
1164
1165
1166 void emit_txb(struct brw_wm_compile *c,
1167 struct brw_reg *dst,
1168 GLuint dst_flags,
1169 struct brw_reg *arg,
1170 struct brw_reg depth_payload,
1171 GLuint tex_idx,
1172 GLuint sampler)
1173 {
1174 struct brw_compile *p = &c->func;
1175 struct intel_context *intel = &p->brw->intel;
1176 GLuint msgLength;
1177 GLuint msg_type;
1178 GLuint mrf_per_channel;
1179 GLuint response_length;
1180 struct brw_reg dst_retyped;
1181
1182 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1183 * samples, so we'll use the 16-wide instruction, leave the second halves
1184 * undefined, and trust the execution mask to keep the undefined pixels
1185 * from mattering.
1186 */
1187 if (c->dispatch_width == 16 || intel->gen < 5) {
1188 if (intel->gen >= 5)
1189 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1190 else
1191 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1192 mrf_per_channel = 2;
1193 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1194 response_length = 8;
1195 } else {
1196 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1197 mrf_per_channel = 1;
1198 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1199 response_length = 4;
1200 }
1201
1202 /* Shadow ignored for txb. */
1203 switch (tex_idx) {
1204 case TEXTURE_1D_INDEX:
1205 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1206 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1207 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1208 break;
1209 case TEXTURE_2D_INDEX:
1210 case TEXTURE_RECT_INDEX:
1211 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1212 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1213 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1214 break;
1215 case TEXTURE_3D_INDEX:
1216 case TEXTURE_CUBE_INDEX:
1217 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1218 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1219 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1220 break;
1221 default:
1222 /* unexpected target */
1223 abort();
1224 }
1225
1226 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1227 msgLength = 2 + 4 * mrf_per_channel - 1;
1228
1229 brw_SAMPLE(p,
1230 dst_retyped,
1231 1,
1232 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1233 SURF_INDEX_TEXTURE(sampler),
1234 sampler,
1235 dst_flags & WRITEMASK_XYZW,
1236 msg_type,
1237 response_length,
1238 msgLength,
1239 0,
1240 1,
1241 BRW_SAMPLER_SIMD_MODE_SIMD16);
1242 }
1243
1244
1245 static void emit_lit(struct brw_wm_compile *c,
1246 const struct brw_reg *dst,
1247 GLuint mask,
1248 const struct brw_reg *arg0)
1249 {
1250 struct brw_compile *p = &c->func;
1251
1252 assert((mask & WRITEMASK_XW) == 0);
1253
1254 if (mask & WRITEMASK_Y) {
1255 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1256 brw_MOV(p, dst[1], arg0[0]);
1257 brw_set_saturate(p, 0);
1258 }
1259
1260 if (mask & WRITEMASK_Z) {
1261 emit_math2(c, BRW_MATH_FUNCTION_POW,
1262 &dst[2],
1263 WRITEMASK_X | (mask & SATURATE),
1264 &arg0[1],
1265 &arg0[3]);
1266 }
1267
1268 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1269 * some of the POW calculations above, but 16-wide iff statements
1270 * seem to lock c1 hardware, so this is a nasty workaround:
1271 */
1272 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1273 {
1274 if (mask & WRITEMASK_Y)
1275 brw_MOV(p, dst[1], brw_imm_f(0));
1276
1277 if (mask & WRITEMASK_Z)
1278 brw_MOV(p, dst[2], brw_imm_f(0));
1279 }
1280 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1281 }
1282
1283
1284 /* Kill pixel - set execution mask to zero for those pixels which
1285 * fail.
1286 */
1287 static void emit_kil( struct brw_wm_compile *c,
1288 struct brw_reg *arg0)
1289 {
1290 struct brw_compile *p = &c->func;
1291 struct intel_context *intel = &p->brw->intel;
1292 struct brw_reg pixelmask;
1293 GLuint i, j;
1294
1295 if (intel->gen >= 6)
1296 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1297 else
1298 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1299
1300 for (i = 0; i < 4; i++) {
1301 /* Check if we've already done the comparison for this reg
1302 * -- common when someone does KIL TEMP.wwww.
1303 */
1304 for (j = 0; j < i; j++) {
1305 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1306 break;
1307 }
1308 if (j != i)
1309 continue;
1310
1311 brw_push_insn_state(p);
1312 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1313 brw_set_predicate_control_flag_value(p, 0xff);
1314 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1315 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1316 brw_pop_insn_state(p);
1317 }
1318 }
1319
1320 static void fire_fb_write( struct brw_wm_compile *c,
1321 GLuint base_reg,
1322 GLuint nr,
1323 GLuint target,
1324 GLuint eot )
1325 {
1326 struct brw_compile *p = &c->func;
1327 struct intel_context *intel = &p->brw->intel;
1328 struct brw_reg dst;
1329
1330 if (c->dispatch_width == 16)
1331 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1332 else
1333 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1334
1335 /* Pass through control information:
1336 *
1337 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1338 */
1339 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1340 if (intel->gen < 6)
1341 {
1342 brw_push_insn_state(p);
1343 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1344 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1345 brw_MOV(p,
1346 brw_message_reg(base_reg + 1),
1347 brw_vec8_grf(1, 0));
1348 brw_pop_insn_state(p);
1349 }
1350
1351 /* Send framebuffer write message: */
1352 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1353 brw_fb_WRITE(p,
1354 c->dispatch_width,
1355 dst,
1356 base_reg,
1357 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1358 target,
1359 nr,
1360 0,
1361 eot,
1362 GL_TRUE);
1363 }
1364
1365
1366 static void emit_aa( struct brw_wm_compile *c,
1367 struct brw_reg *arg1,
1368 GLuint reg )
1369 {
1370 struct brw_compile *p = &c->func;
1371 GLuint comp = c->aa_dest_stencil_reg / 2;
1372 GLuint off = c->aa_dest_stencil_reg % 2;
1373 struct brw_reg aa = offset(arg1[comp], off);
1374
1375 brw_push_insn_state(p);
1376 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1377 brw_MOV(p, brw_message_reg(reg), aa);
1378 brw_pop_insn_state(p);
1379 }
1380
1381
1382 /* Post-fragment-program processing. Send the results to the
1383 * framebuffer.
1384 * \param arg0 the fragment color
1385 * \param arg1 the pass-through depth value
1386 * \param arg2 the shader-computed depth value
1387 */
1388 void emit_fb_write(struct brw_wm_compile *c,
1389 struct brw_reg *arg0,
1390 struct brw_reg *arg1,
1391 struct brw_reg *arg2,
1392 GLuint target,
1393 GLuint eot)
1394 {
1395 struct brw_compile *p = &c->func;
1396 struct brw_context *brw = p->brw;
1397 struct intel_context *intel = &brw->intel;
1398 GLuint nr = 2;
1399 GLuint channel;
1400
1401 /* Reserve a space for AA - may not be needed:
1402 */
1403 if (c->aa_dest_stencil_reg)
1404 nr += 1;
1405
1406 /* I don't really understand how this achieves the color interleave
1407 * (ie RGBARGBA) in the result: [Do the saturation here]
1408 */
1409 brw_push_insn_state(p);
1410
1411 for (channel = 0; channel < 4; channel++) {
1412 if (intel->gen >= 6) {
1413 /* gen6 SIMD16 single source DP write looks like:
1414 * m + 0: r0
1415 * m + 1: r1
1416 * m + 2: g0
1417 * m + 3: g1
1418 * m + 4: b0
1419 * m + 5: b1
1420 * m + 6: a0
1421 * m + 7: a1
1422 */
1423 if (c->dispatch_width == 16) {
1424 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1425 } else {
1426 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1427 }
1428 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1429 /* pre-gen6 SIMD16 single source DP write looks like:
1430 * m + 0: r0
1431 * m + 1: g0
1432 * m + 2: b0
1433 * m + 3: a0
1434 * m + 4: r1
1435 * m + 5: g1
1436 * m + 6: b1
1437 * m + 7: a1
1438 *
1439 * By setting the high bit of the MRF register number, we indicate
1440 * that we want COMPR4 mode - instead of doing the usual destination
1441 * + 1 for the second half we get destination + 4.
1442 */
1443 brw_MOV(p,
1444 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1445 arg0[channel]);
1446 } else {
1447 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1448 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1449 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1450 brw_MOV(p,
1451 brw_message_reg(nr + channel),
1452 arg0[channel]);
1453
1454 if (c->dispatch_width == 16) {
1455 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1456 brw_MOV(p,
1457 brw_message_reg(nr + channel + 4),
1458 sechalf(arg0[channel]));
1459 }
1460 }
1461 }
1462 /* skip over the regs populated above:
1463 */
1464 if (c->dispatch_width == 16)
1465 nr += 8;
1466 else
1467 nr += 4;
1468
1469 brw_pop_insn_state(p);
1470
1471 if (c->source_depth_to_render_target)
1472 {
1473 if (c->computes_depth)
1474 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1475 else
1476 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1477
1478 nr += 2;
1479 }
1480
1481 if (c->dest_depth_reg)
1482 {
1483 GLuint comp = c->dest_depth_reg / 2;
1484 GLuint off = c->dest_depth_reg % 2;
1485
1486 if (off != 0) {
1487 brw_push_insn_state(p);
1488 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1489
1490 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1491 /* 2nd half? */
1492 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1493 brw_pop_insn_state(p);
1494 }
1495 else {
1496 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1497 }
1498 nr += 2;
1499 }
1500
1501 if (intel->gen >= 6) {
1502 /* Load the message header. There's no implied move from src0
1503 * to the base mrf on gen6.
1504 */
1505 brw_push_insn_state(p);
1506 brw_set_mask_control(p, BRW_MASK_DISABLE);
1507 brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1508 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1509 brw_pop_insn_state(p);
1510
1511 if (target != 0) {
1512 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1513 0,
1514 2), BRW_REGISTER_TYPE_UD),
1515 brw_imm_ud(target));
1516 }
1517 }
1518
1519 if (!c->runtime_check_aads_emit) {
1520 if (c->aa_dest_stencil_reg)
1521 emit_aa(c, arg1, 2);
1522
1523 fire_fb_write(c, 0, nr, target, eot);
1524 }
1525 else {
1526 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1527 struct brw_reg ip = brw_ip_reg();
1528 struct brw_instruction *jmp;
1529
1530 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1531 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1532 brw_AND(p,
1533 v1_null_ud,
1534 get_element_ud(brw_vec8_grf(1,0), 6),
1535 brw_imm_ud(1<<26));
1536
1537 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1538 {
1539 emit_aa(c, arg1, 2);
1540 fire_fb_write(c, 0, nr, target, eot);
1541 /* note - thread killed in subroutine */
1542 }
1543 brw_land_fwd_jump(p, jmp);
1544
1545 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1546 */
1547 fire_fb_write(c, 1, nr-1, target, eot);
1548 }
1549 }
1550
1551 /**
1552 * Move a GPR to scratch memory.
1553 */
1554 static void emit_spill( struct brw_wm_compile *c,
1555 struct brw_reg reg,
1556 GLuint slot )
1557 {
1558 struct brw_compile *p = &c->func;
1559
1560 /*
1561 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1562 */
1563 brw_MOV(p, brw_message_reg(2), reg);
1564
1565 /*
1566 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1567 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1568 */
1569 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1570 }
1571
1572
1573 /**
1574 * Load a GPR from scratch memory.
1575 */
1576 static void emit_unspill( struct brw_wm_compile *c,
1577 struct brw_reg reg,
1578 GLuint slot )
1579 {
1580 struct brw_compile *p = &c->func;
1581
1582 /* Slot 0 is the undef value.
1583 */
1584 if (slot == 0) {
1585 brw_MOV(p, reg, brw_imm_f(0));
1586 return;
1587 }
1588
1589 /*
1590 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1591 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1592 */
1593
1594 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1595 }
1596
1597
1598 /**
1599 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1600 * Args with unspill_reg != 0 will be loaded from scratch memory.
1601 */
1602 static void get_argument_regs( struct brw_wm_compile *c,
1603 struct brw_wm_ref *arg[],
1604 struct brw_reg *regs )
1605 {
1606 GLuint i;
1607
1608 for (i = 0; i < 4; i++) {
1609 if (arg[i]) {
1610 if (arg[i]->unspill_reg)
1611 emit_unspill(c,
1612 brw_vec8_grf(arg[i]->unspill_reg, 0),
1613 arg[i]->value->spill_slot);
1614
1615 regs[i] = arg[i]->hw_reg;
1616 }
1617 else {
1618 regs[i] = brw_null_reg();
1619 }
1620 }
1621 }
1622
1623
1624 /**
1625 * For values that have a spill_slot!=0, write those regs to scratch memory.
1626 */
1627 static void spill_values( struct brw_wm_compile *c,
1628 struct brw_wm_value *values,
1629 GLuint nr )
1630 {
1631 GLuint i;
1632
1633 for (i = 0; i < nr; i++)
1634 if (values[i].spill_slot)
1635 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1636 }
1637
1638
1639 /* Emit the fragment program instructions here.
1640 */
1641 void brw_wm_emit( struct brw_wm_compile *c )
1642 {
1643 struct brw_compile *p = &c->func;
1644 struct intel_context *intel = &p->brw->intel;
1645 GLuint insn;
1646
1647 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1648 if (intel->gen >= 6)
1649 brw_set_acc_write_control(p, 1);
1650
1651 /* Check if any of the payload regs need to be spilled:
1652 */
1653 spill_values(c, c->payload.depth, 4);
1654 spill_values(c, c->creg, c->nr_creg);
1655 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1656
1657
1658 for (insn = 0; insn < c->nr_insns; insn++) {
1659
1660 struct brw_wm_instruction *inst = &c->instruction[insn];
1661 struct brw_reg args[3][4], dst[4];
1662 GLuint i, dst_flags;
1663
1664 /* Get argument regs:
1665 */
1666 for (i = 0; i < 3; i++)
1667 get_argument_regs(c, inst->src[i], args[i]);
1668
1669 /* Get dest regs:
1670 */
1671 for (i = 0; i < 4; i++)
1672 if (inst->dst[i])
1673 dst[i] = inst->dst[i]->hw_reg;
1674 else
1675 dst[i] = brw_null_reg();
1676
1677 /* Flags
1678 */
1679 dst_flags = inst->writemask;
1680 if (inst->saturate)
1681 dst_flags |= SATURATE;
1682
1683 switch (inst->opcode) {
1684 /* Generated instructions for calculating triangle interpolants:
1685 */
1686 case WM_PIXELXY:
1687 emit_pixel_xy(c, dst, dst_flags);
1688 break;
1689
1690 case WM_DELTAXY:
1691 emit_delta_xy(p, dst, dst_flags, args[0]);
1692 break;
1693
1694 case WM_WPOSXY:
1695 emit_wpos_xy(c, dst, dst_flags, args[0]);
1696 break;
1697
1698 case WM_PIXELW:
1699 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1700 break;
1701
1702 case WM_LINTERP:
1703 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1704 break;
1705
1706 case WM_PINTERP:
1707 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1708 break;
1709
1710 case WM_CINTERP:
1711 emit_cinterp(p, dst, dst_flags, args[0]);
1712 break;
1713
1714 case WM_FB_WRITE:
1715 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1716 break;
1717
1718 case WM_FRONTFACING:
1719 emit_frontfacing(p, dst, dst_flags);
1720 break;
1721
1722 /* Straightforward arithmetic:
1723 */
1724 case OPCODE_ADD:
1725 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1726 break;
1727
1728 case OPCODE_FRC:
1729 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1730 break;
1731
1732 case OPCODE_FLR:
1733 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1734 break;
1735
1736 case OPCODE_DDX:
1737 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1738 break;
1739
1740 case OPCODE_DDY:
1741 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1742 break;
1743
1744 case OPCODE_DP2:
1745 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1746 break;
1747
1748 case OPCODE_DP3:
1749 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1750 break;
1751
1752 case OPCODE_DP4:
1753 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1754 break;
1755
1756 case OPCODE_DPH:
1757 emit_dph(p, dst, dst_flags, args[0], args[1]);
1758 break;
1759
1760 case OPCODE_TRUNC:
1761 for (i = 0; i < 4; i++) {
1762 if (dst_flags & (1<<i)) {
1763 brw_RNDZ(p, dst[i], args[0][i]);
1764 }
1765 }
1766 break;
1767
1768 case OPCODE_LRP:
1769 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1770 break;
1771
1772 case OPCODE_MAD:
1773 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1774 break;
1775
1776 case OPCODE_MOV:
1777 case OPCODE_SWZ:
1778 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1779 break;
1780
1781 case OPCODE_MUL:
1782 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1783 break;
1784
1785 case OPCODE_XPD:
1786 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1787 break;
1788
1789 /* Higher math functions:
1790 */
1791 case OPCODE_RCP:
1792 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1793 break;
1794
1795 case OPCODE_RSQ:
1796 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1797 break;
1798
1799 case OPCODE_SIN:
1800 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1801 break;
1802
1803 case OPCODE_COS:
1804 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1805 break;
1806
1807 case OPCODE_EX2:
1808 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1809 break;
1810
1811 case OPCODE_LG2:
1812 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1813 break;
1814
1815 case OPCODE_SCS:
1816 /* There is an scs math function, but it would need some
1817 * fixup for 16-element execution.
1818 */
1819 if (dst_flags & WRITEMASK_X)
1820 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1821 if (dst_flags & WRITEMASK_Y)
1822 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1823 break;
1824
1825 case OPCODE_POW:
1826 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1827 break;
1828
1829 /* Comparisons:
1830 */
1831 case OPCODE_CMP:
1832 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1833 break;
1834
1835 case OPCODE_MAX:
1836 emit_max(p, dst, dst_flags, args[0], args[1]);
1837 break;
1838
1839 case OPCODE_MIN:
1840 emit_min(p, dst, dst_flags, args[0], args[1]);
1841 break;
1842
1843 case OPCODE_SLT:
1844 emit_slt(p, dst, dst_flags, args[0], args[1]);
1845 break;
1846
1847 case OPCODE_SLE:
1848 emit_sle(p, dst, dst_flags, args[0], args[1]);
1849 break;
1850 case OPCODE_SGT:
1851 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1852 break;
1853 case OPCODE_SGE:
1854 emit_sge(p, dst, dst_flags, args[0], args[1]);
1855 break;
1856 case OPCODE_SEQ:
1857 emit_seq(p, dst, dst_flags, args[0], args[1]);
1858 break;
1859 case OPCODE_SNE:
1860 emit_sne(p, dst, dst_flags, args[0], args[1]);
1861 break;
1862
1863 case OPCODE_SSG:
1864 emit_sign(p, dst, dst_flags, args[0]);
1865 break;
1866
1867 case OPCODE_LIT:
1868 emit_lit(c, dst, dst_flags, args[0]);
1869 break;
1870
1871 /* Texturing operations:
1872 */
1873 case OPCODE_TEX:
1874 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1875 inst->tex_idx, inst->tex_unit,
1876 inst->tex_shadow);
1877 break;
1878
1879 case OPCODE_TXB:
1880 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1881 inst->tex_idx, inst->tex_unit);
1882 break;
1883
1884 case OPCODE_KIL:
1885 emit_kil(c, args[0]);
1886 break;
1887
1888 default:
1889 printf("Unsupported opcode %i (%s) in fragment shader\n",
1890 inst->opcode, inst->opcode < MAX_OPCODE ?
1891 _mesa_opcode_string(inst->opcode) :
1892 "unknown");
1893 }
1894
1895 for (i = 0; i < 4; i++)
1896 if (inst->dst[i] && inst->dst[i]->spill_slot)
1897 emit_spill(c,
1898 inst->dst[i]->hw_reg,
1899 inst->dst[i]->spill_slot);
1900 }
1901
1902 /* Only properly tested on ILK */
1903 if (p->brw->intel.gen == 5) {
1904 brw_remove_duplicate_mrf_moves(p);
1905 if (c->dispatch_width == 16)
1906 brw_remove_grf_to_mrf_moves(p);
1907 }
1908
1909 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1910 int i;
1911
1912 printf("wm-native:\n");
1913 for (i = 0; i < p->nr_insn; i++)
1914 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1915 printf("\n");
1916 }
1917 }
1918