src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_SWZ] = 1,
  87       [OPCODE_XPD] = 2,
  88    };
  89
  90    /* These opcodes get broken down in a way that allow two
  91     * args to be immediates.
  92     */
  93    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  94       if (arg == 1 || arg == 2)
  95          return GL_TRUE;
  96    }
  97
  98    if (opcode > ARRAY_SIZE(opcode_array))
  99       return GL_FALSE;
 100
 101    return arg == opcode_array[opcode] - 1;
 102 }
 103
 104 /**
 105  * Computes the screen-space x,y position of the pixels.
 106  *
 107  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 108  * interpolation of attributes..
 109  *
 110  * Payload R0:
 111  *
 112  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 113  *         corresponding to each of the 16 execution channels.
 114  * R0.1..8 -- ?
 115  * R1.0 -- triangle vertex 0.X
 116  * R1.1 -- triangle vertex 0.Y
 117  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 118  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 119  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 120  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 121  * R1.6 -- ?
 122  * R1.7 -- ?
 123  * R1.8 -- ?
 124  */
 125 void emit_pixel_xy(struct brw_wm_compile *c,
 126                    const struct brw_reg *dst,
 127                    GLuint mask)
 128 {
 129    struct brw_compile *p = &c->func;
 130    struct brw_reg r1 = brw_vec1_grf(1, 0);
 131    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 132    struct brw_reg dst0_uw, dst1_uw;
 133
 134    brw_push_insn_state(p);
 135    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 136
 137    if (c->dispatch_width == 16) {
 138       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 139       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 140    } else {
 141       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 142       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 143    }
 144
 145    /* Calculate pixel centers by adding 1 or 0 to each of the
 146     * micro-tile coordinates passed in r1.
 147     */
 148    if (mask & WRITEMASK_X) {
 149       brw_ADD(p,
 150               dst0_uw,
 151               stride(suboffset(r1_uw, 4), 2, 4, 0),
 152               brw_imm_v(0x10101010));
 153    }
 154
 155    if (mask & WRITEMASK_Y) {
 156       brw_ADD(p,
 157               dst1_uw,
 158               stride(suboffset(r1_uw,5), 2, 4, 0),
 159               brw_imm_v(0x11001100));
 160    }
 161    brw_pop_insn_state(p);
 162 }
 163
 164 /**
 165  * Computes the screen-space x,y distance of the pixels from the start
 166  * vertex.
 167  *
 168  * This will be used in linterp or pinterp with the start vertex value
 169  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 170  * to produce interpolated attribute values.
 171  */
 172 void emit_delta_xy(struct brw_compile *p,
 173                    const struct brw_reg *dst,
 174                    GLuint mask,
 175                    const struct brw_reg *arg0)
 176 {
 177    struct intel_context *intel = &p->brw->intel;
 178    struct brw_reg r1 = brw_vec1_grf(1, 0);
 179
 180    if (mask == 0)
 181       return;
 182
 183    assert(mask == WRITEMASK_XY);
 184
 185    if (intel->gen >= 6) {
 186        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 187           Just add them with 0.0 for dst reg.. */
 188        r1 = brw_imm_v(0x00000000);
 189        brw_ADD(p,
 190                dst[0],
 191                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 192                r1);
 193        brw_ADD(p,
 194                dst[1],
 195                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 196                r1);
 197        return;
 198    }
 199
 200    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 201     * centers produced by emit_pixel_xy().
 202     */
 203    brw_ADD(p,
 204            dst[0],
 205            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 206            negate(r1));
 207    brw_ADD(p,
 208            dst[1],
 209            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 210            negate(suboffset(r1,1)));
 211 }
 212
 213 /**
 214  * Computes the pixel offset from the window origin for gl_FragCoord().
 215  */
 216 void emit_wpos_xy(struct brw_wm_compile *c,
 217                   const struct brw_reg *dst,
 218                   GLuint mask,
 219                   const struct brw_reg *arg0)
 220 {
 221    struct brw_compile *p = &c->func;
 222
 223    if (mask & WRITEMASK_X) {
 224       if (c->fp->program.PixelCenterInteger) {
 225          /* X' = X */
 226          brw_MOV(p,
 227                  dst[0],
 228                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 229       } else {
 230          /* X' = X + 0.5 */
 231          brw_ADD(p,
 232                  dst[0],
 233                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 234                  brw_imm_f(0.5));
 235       }
 236    }
 237
 238    if (mask & WRITEMASK_Y) {
 239       if (c->fp->program.OriginUpperLeft) {
 240          if (c->fp->program.PixelCenterInteger) {
 241             /* Y' = Y */
 242             brw_MOV(p,
 243                     dst[1],
 244                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 245          } else {
 246             /* Y' = Y + 0.5 */
 247             brw_ADD(p,
 248                     dst[1],
 249                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 250                     brw_imm_f(0.5));
 251          }
 252       } else {
 253          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 254
 255          /* Y' = (height - 1) - Y + center */
 256          brw_ADD(p,
 257                  dst[1],
 258                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 259                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 260       }
 261    }
 262 }
 263
 264
 265 void emit_pixel_w(struct brw_wm_compile *c,
 266                   const struct brw_reg *dst,
 267                   GLuint mask,
 268                   const struct brw_reg *arg0,
 269                   const struct brw_reg *deltas)
 270 {
 271    struct brw_compile *p = &c->func;
 272    struct intel_context *intel = &p->brw->intel;
 273    struct brw_reg src;
 274    struct brw_reg temp_dst;
 275
 276    if (intel->gen >= 6)
 277         temp_dst = dst[3];
 278    else
 279         temp_dst = brw_message_reg(2);
 280
 281    assert(intel->gen < 6);
 282
 283    /* Don't need this if all you are doing is interpolating color, for
 284     * instance.
 285     */
 286    if (mask & WRITEMASK_W) {
 287       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 288
 289       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 290        * result straight into a message reg.
 291        */
 292       if (can_do_pln(intel, deltas)) {
 293          brw_PLN(p, temp_dst, interp3, deltas[0]);
 294       } else {
 295          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 296          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 297       }
 298
 299       /* Calc w */
 300       if (intel->gen >= 6)
 301          src = temp_dst;
 302       else
 303          src = brw_null_reg();
 304
 305       if (c->dispatch_width == 16) {
 306          brw_math_16(p, dst[3],
 307                      BRW_MATH_FUNCTION_INV,
 308                      BRW_MATH_SATURATE_NONE,
 309                      2, src,
 310                      BRW_MATH_PRECISION_FULL);
 311       } else {
 312          brw_math(p, dst[3],
 313                   BRW_MATH_FUNCTION_INV,
 314                   BRW_MATH_SATURATE_NONE,
 315                   2, src,
 316                   BRW_MATH_DATA_VECTOR,
 317                   BRW_MATH_PRECISION_FULL);
 318       }
 319    }
 320 }
 321
 322 void emit_linterp(struct brw_compile *p,
 323                   const struct brw_reg *dst,
 324                   GLuint mask,
 325                   const struct brw_reg *arg0,
 326                   const struct brw_reg *deltas)
 327 {
 328    struct intel_context *intel = &p->brw->intel;
 329    struct brw_reg interp[4];
 330    GLuint nr = arg0[0].nr;
 331    GLuint i;
 332
 333    interp[0] = brw_vec1_grf(nr, 0);
 334    interp[1] = brw_vec1_grf(nr, 4);
 335    interp[2] = brw_vec1_grf(nr+1, 0);
 336    interp[3] = brw_vec1_grf(nr+1, 4);
 337
 338    for (i = 0; i < 4; i++) {
 339       if (mask & (1<<i)) {
 340          if (intel->gen >= 6) {
 341             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 342          } else if (can_do_pln(intel, deltas)) {
 343             brw_PLN(p, dst[i], interp[i], deltas[0]);
 344          } else {
 345             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 346             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 347          }
 348       }
 349    }
 350 }
 351
 352
 353 void emit_pinterp(struct brw_compile *p,
 354                   const struct brw_reg *dst,
 355                   GLuint mask,
 356                   const struct brw_reg *arg0,
 357                   const struct brw_reg *deltas,
 358                   const struct brw_reg *w)
 359 {
 360    struct intel_context *intel = &p->brw->intel;
 361    struct brw_reg interp[4];
 362    GLuint nr = arg0[0].nr;
 363    GLuint i;
 364
 365    if (intel->gen >= 6) {
 366       emit_linterp(p, dst, mask, arg0, interp);
 367       return;
 368    }
 369
 370    interp[0] = brw_vec1_grf(nr, 0);
 371    interp[1] = brw_vec1_grf(nr, 4);
 372    interp[2] = brw_vec1_grf(nr+1, 0);
 373    interp[3] = brw_vec1_grf(nr+1, 4);
 374
 375    for (i = 0; i < 4; i++) {
 376       if (mask & (1<<i)) {
 377          if (can_do_pln(intel, deltas)) {
 378             brw_PLN(p, dst[i], interp[i], deltas[0]);
 379          } else {
 380             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 381             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 382          }
 383       }
 384    }
 385    for (i = 0; i < 4; i++) {
 386       if (mask & (1<<i)) {
 387          brw_MUL(p, dst[i], dst[i], w[3]);
 388       }
 389    }
 390 }
 391
 392
 393 void emit_cinterp(struct brw_compile *p,
 394                   const struct brw_reg *dst,
 395                   GLuint mask,
 396                   const struct brw_reg *arg0)
 397 {
 398    struct brw_reg interp[4];
 399    GLuint nr = arg0[0].nr;
 400    GLuint i;
 401
 402    interp[0] = brw_vec1_grf(nr, 0);
 403    interp[1] = brw_vec1_grf(nr, 4);
 404    interp[2] = brw_vec1_grf(nr+1, 0);
 405    interp[3] = brw_vec1_grf(nr+1, 4);
 406
 407    for (i = 0; i < 4; i++) {
 408       if (mask & (1<<i)) {
 409          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 410       }
 411    }
 412 }
 413
 414 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 415 void emit_frontfacing(struct brw_compile *p,
 416                       const struct brw_reg *dst,
 417                       GLuint mask)
 418 {
 419    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 420    GLuint i;
 421
 422    if (!(mask & WRITEMASK_XYZW))
 423       return;
 424
 425    for (i = 0; i < 4; i++) {
 426       if (mask & (1<<i)) {
 427          brw_MOV(p, dst[i], brw_imm_f(0.0));
 428       }
 429    }
 430
 431    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 432     * us front face
 433     */
 434    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 435    for (i = 0; i < 4; i++) {
 436       if (mask & (1<<i)) {
 437          brw_MOV(p, dst[i], brw_imm_f(1.0));
 438       }
 439    }
 440    brw_set_predicate_control_flag_value(p, 0xff);
 441 }
 442
 443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 444  * looking like:
 445  *
 446  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 447  *
 448  * and we're trying to produce:
 449  *
 450  *           DDX                     DDY
 451  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 452  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 453  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 454  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 455  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 456  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 457  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 458  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 459  *
 460  * and add another set of two more subspans if in 16-pixel dispatch mode.
 461  *
 462  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 463  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 464  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 465  * between each other.  We could probably do it like ddx and swizzle the right
 466  * order later, but bail for now and just produce
 467  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 468  */
 469 void emit_ddxy(struct brw_compile *p,
 470                const struct brw_reg *dst,
 471                GLuint mask,
 472                GLboolean is_ddx,
 473                const struct brw_reg *arg0)
 474 {
 475    int i;
 476    struct brw_reg src0, src1;
 477
 478    if (mask & SATURATE)
 479       brw_set_saturate(p, 1);
 480    for (i = 0; i < 4; i++ ) {
 481       if (mask & (1<<i)) {
 482          if (is_ddx) {
 483             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 484                            BRW_REGISTER_TYPE_F,
 485                            BRW_VERTICAL_STRIDE_2,
 486                            BRW_WIDTH_2,
 487                            BRW_HORIZONTAL_STRIDE_0,
 488                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 489             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 490                            BRW_REGISTER_TYPE_F,
 491                            BRW_VERTICAL_STRIDE_2,
 492                            BRW_WIDTH_2,
 493                            BRW_HORIZONTAL_STRIDE_0,
 494                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 495          } else {
 496             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 497                            BRW_REGISTER_TYPE_F,
 498                            BRW_VERTICAL_STRIDE_4,
 499                            BRW_WIDTH_4,
 500                            BRW_HORIZONTAL_STRIDE_0,
 501                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 502             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 503                            BRW_REGISTER_TYPE_F,
 504                            BRW_VERTICAL_STRIDE_4,
 505                            BRW_WIDTH_4,
 506                            BRW_HORIZONTAL_STRIDE_0,
 507                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 508          }
 509          brw_ADD(p, dst[i], src0, negate(src1));
 510       }
 511    }
 512    if (mask & SATURATE)
 513       brw_set_saturate(p, 0);
 514 }
 515
 516 void emit_alu1(struct brw_compile *p,
 517                struct brw_instruction *(*func)(struct brw_compile *,
 518                                                struct brw_reg,
 519                                                struct brw_reg),
 520                const struct brw_reg *dst,
 521                GLuint mask,
 522                const struct brw_reg *arg0)
 523 {
 524    GLuint i;
 525
 526    if (mask & SATURATE)
 527       brw_set_saturate(p, 1);
 528
 529    for (i = 0; i < 4; i++) {
 530       if (mask & (1<<i)) {
 531          func(p, dst[i], arg0[i]);
 532       }
 533    }
 534
 535    if (mask & SATURATE)
 536       brw_set_saturate(p, 0);
 537 }
 538
 539
 540 void emit_alu2(struct brw_compile *p,
 541                struct brw_instruction *(*func)(struct brw_compile *,
 542                                                struct brw_reg,
 543                                                struct brw_reg,
 544                                                struct brw_reg),
 545                const struct brw_reg *dst,
 546                GLuint mask,
 547                const struct brw_reg *arg0,
 548                const struct brw_reg *arg1)
 549 {
 550    GLuint i;
 551
 552    if (mask & SATURATE)
 553       brw_set_saturate(p, 1);
 554
 555    for (i = 0; i < 4; i++) {
 556       if (mask & (1<<i)) {
 557          func(p, dst[i], arg0[i], arg1[i]);
 558       }
 559    }
 560
 561    if (mask & SATURATE)
 562       brw_set_saturate(p, 0);
 563 }
 564
 565
 566 void emit_mad(struct brw_compile *p,
 567               const struct brw_reg *dst,
 568               GLuint mask,
 569               const struct brw_reg *arg0,
 570               const struct brw_reg *arg1,
 571               const struct brw_reg *arg2)
 572 {
 573    GLuint i;
 574
 575    for (i = 0; i < 4; i++) {
 576       if (mask & (1<<i)) {
 577          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 578
 579          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 580          brw_ADD(p, dst[i], dst[i], arg2[i]);
 581          brw_set_saturate(p, 0);
 582       }
 583    }
 584 }
 585
 586 void emit_lrp(struct brw_compile *p,
 587               const struct brw_reg *dst,
 588               GLuint mask,
 589               const struct brw_reg *arg0,
 590               const struct brw_reg *arg1,
 591               const struct brw_reg *arg2)
 592 {
 593    GLuint i;
 594
 595    /* Uses dst as a temporary:
 596     */
 597    for (i = 0; i < 4; i++) {
 598       if (mask & (1<<i)) {
 599          /* Can I use the LINE instruction for this?
 600           */
 601          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 602          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 603
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 606          brw_set_saturate(p, 0);
 607       }
 608    }
 609 }
 610
 611 void emit_sop(struct brw_compile *p,
 612               const struct brw_reg *dst,
 613               GLuint mask,
 614               GLuint cond,
 615               const struct brw_reg *arg0,
 616               const struct brw_reg *arg1)
 617 {
 618    GLuint i;
 619
 620    for (i = 0; i < 4; i++) {
 621       if (mask & (1<<i)) {
 622          brw_push_insn_state(p);
 623          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 624          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 625          brw_MOV(p, dst[i], brw_imm_f(0));
 626          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 627          brw_MOV(p, dst[i], brw_imm_f(1.0));
 628          brw_pop_insn_state(p);
 629       }
 630    }
 631 }
 632
 633 static void emit_slt( struct brw_compile *p,
 634                       const struct brw_reg *dst,
 635                       GLuint mask,
 636                       const struct brw_reg *arg0,
 637                       const struct brw_reg *arg1 )
 638 {
 639    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 640 }
 641
 642 static void emit_sle( struct brw_compile *p,
 643                       const struct brw_reg *dst,
 644                       GLuint mask,
 645                       const struct brw_reg *arg0,
 646                       const struct brw_reg *arg1 )
 647 {
 648    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 649 }
 650
 651 static void emit_sgt( struct brw_compile *p,
 652                       const struct brw_reg *dst,
 653                       GLuint mask,
 654                       const struct brw_reg *arg0,
 655                       const struct brw_reg *arg1 )
 656 {
 657    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 658 }
 659
 660 static void emit_sge( struct brw_compile *p,
 661                       const struct brw_reg *dst,
 662                       GLuint mask,
 663                       const struct brw_reg *arg0,
 664                       const struct brw_reg *arg1 )
 665 {
 666    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 667 }
 668
 669 static void emit_seq( struct brw_compile *p,
 670                       const struct brw_reg *dst,
 671                       GLuint mask,
 672                       const struct brw_reg *arg0,
 673                       const struct brw_reg *arg1 )
 674 {
 675    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 676 }
 677
 678 static void emit_sne( struct brw_compile *p,
 679                       const struct brw_reg *dst,
 680                       GLuint mask,
 681                       const struct brw_reg *arg0,
 682                       const struct brw_reg *arg1 )
 683 {
 684    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 685 }
 686
 687 void emit_cmp(struct brw_compile *p,
 688               const struct brw_reg *dst,
 689               GLuint mask,
 690               const struct brw_reg *arg0,
 691               const struct brw_reg *arg1,
 692               const struct brw_reg *arg2)
 693 {
 694    GLuint i;
 695
 696    for (i = 0; i < 4; i++) {
 697       if (mask & (1<<i)) {
 698          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 699
 700          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 701          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 702          brw_set_saturate(p, 0);
 703          brw_set_predicate_control_flag_value(p, 0xff);
 704       }
 705    }
 706 }
 707
 708 void emit_sign(struct brw_compile *p,
 709                const struct brw_reg *dst,
 710                GLuint mask,
 711                const struct brw_reg *arg0)
 712 {
 713    GLuint i;
 714
 715    for (i = 0; i < 4; i++) {
 716       if (mask & (1<<i)) {
 717          brw_MOV(p, dst[i], brw_imm_f(0.0));
 718
 719          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 720          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 721          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 722
 723          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 724          brw_MOV(p, dst[i], brw_imm_f(1.0));
 725          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 726       }
 727    }
 728 }
 729
 730 void emit_max(struct brw_compile *p,
 731               const struct brw_reg *dst,
 732               GLuint mask,
 733               const struct brw_reg *arg0,
 734               const struct brw_reg *arg1)
 735 {
 736    GLuint i;
 737
 738    for (i = 0; i < 4; i++) {
 739       if (mask & (1<<i)) {
 740          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 741
 742          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 743          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 744          brw_set_saturate(p, 0);
 745          brw_set_predicate_control_flag_value(p, 0xff);
 746       }
 747    }
 748 }
 749
 750 void emit_min(struct brw_compile *p,
 751               const struct brw_reg *dst,
 752               GLuint mask,
 753               const struct brw_reg *arg0,
 754               const struct brw_reg *arg1)
 755 {
 756    GLuint i;
 757
 758    for (i = 0; i < 4; i++) {
 759       if (mask & (1<<i)) {
 760          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 761
 762          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 763          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 764          brw_set_saturate(p, 0);
 765          brw_set_predicate_control_flag_value(p, 0xff);
 766       }
 767    }
 768 }
 769
 770
 771 void emit_dp2(struct brw_compile *p,
 772               const struct brw_reg *dst,
 773               GLuint mask,
 774               const struct brw_reg *arg0,
 775               const struct brw_reg *arg1)
 776 {
 777    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 778
 779    if (!(mask & WRITEMASK_XYZW))
 780       return; /* Do not emit dead code */
 781
 782    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 783
 784    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 785
 786    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 787    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 788    brw_set_saturate(p, 0);
 789 }
 790
 791
 792 void emit_dp3(struct brw_compile *p,
 793               const struct brw_reg *dst,
 794               GLuint mask,
 795               const struct brw_reg *arg0,
 796               const struct brw_reg *arg1)
 797 {
 798    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 799
 800    if (!(mask & WRITEMASK_XYZW))
 801       return; /* Do not emit dead code */
 802
 803    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 804
 805    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 806    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 807
 808    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 809    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 810    brw_set_saturate(p, 0);
 811 }
 812
 813
 814 void emit_dp4(struct brw_compile *p,
 815               const struct brw_reg *dst,
 816               GLuint mask,
 817               const struct brw_reg *arg0,
 818               const struct brw_reg *arg1)
 819 {
 820    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 821
 822    if (!(mask & WRITEMASK_XYZW))
 823       return; /* Do not emit dead code */
 824
 825    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 826
 827    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 828    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 829    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 830
 831    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 832    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 833    brw_set_saturate(p, 0);
 834 }
 835
 836
 837 void emit_dph(struct brw_compile *p,
 838               const struct brw_reg *dst,
 839               GLuint mask,
 840               const struct brw_reg *arg0,
 841               const struct brw_reg *arg1)
 842 {
 843    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 844
 845    if (!(mask & WRITEMASK_XYZW))
 846       return; /* Do not emit dead code */
 847
 848    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 849
 850    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 851    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 852    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 853
 854    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 855    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 856    brw_set_saturate(p, 0);
 857 }
 858
 859
 860 void emit_xpd(struct brw_compile *p,
 861               const struct brw_reg *dst,
 862               GLuint mask,
 863               const struct brw_reg *arg0,
 864               const struct brw_reg *arg1)
 865 {
 866    GLuint i;
 867
 868    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 869
 870    for (i = 0 ; i < 3; i++) {
 871       if (mask & (1<<i)) {
 872          GLuint i2 = (i+2)%3;
 873          GLuint i1 = (i+1)%3;
 874
 875          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 876
 877          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 878          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 879          brw_set_saturate(p, 0);
 880       }
 881    }
 882 }
 883
 884
 885 void emit_math1(struct brw_wm_compile *c,
 886                 GLuint function,
 887                 const struct brw_reg *dst,
 888                 GLuint mask,
 889                 const struct brw_reg *arg0)
 890 {
 891    struct brw_compile *p = &c->func;
 892    struct intel_context *intel = &p->brw->intel;
 893    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 894    GLuint saturate = ((mask & SATURATE) ?
 895                       BRW_MATH_SATURATE_SATURATE :
 896                       BRW_MATH_SATURATE_NONE);
 897    struct brw_reg src;
 898
 899    if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
 900                             arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
 901                            arg0[0].negate || arg0[0].abs)) {
 902       /* Gen6 math requires that source and dst horizontal stride be 1,
 903        * and that the argument be in the GRF.
 904        *
 905        * The hardware ignores source modifiers (negate and abs) on math
 906        * instructions, so we also move to a temp to set those up.
 907        */
 908       src = dst[dst_chan];
 909       brw_MOV(p, src, arg0[0]);
 910    } else {
 911       src = arg0[0];
 912    }
 913
 914    if (!(mask & WRITEMASK_XYZW))
 915       return; /* Do not emit dead code */
 916
 917    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 918
 919    /* Send two messages to perform all 16 operations:
 920     */
 921    brw_push_insn_state(p);
 922    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 923    brw_math(p,
 924             dst[dst_chan],
 925             function,
 926             saturate,
 927             2,
 928             src,
 929             BRW_MATH_DATA_VECTOR,
 930             BRW_MATH_PRECISION_FULL);
 931
 932    if (c->dispatch_width == 16) {
 933       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 934       brw_math(p,
 935                offset(dst[dst_chan],1),
 936                function,
 937                saturate,
 938                3,
 939                sechalf(src),
 940                BRW_MATH_DATA_VECTOR,
 941                BRW_MATH_PRECISION_FULL);
 942    }
 943    brw_pop_insn_state(p);
 944 }
 945
 946
 947 void emit_math2(struct brw_wm_compile *c,
 948                 GLuint function,
 949                 const struct brw_reg *dst,
 950                 GLuint mask,
 951                 const struct brw_reg *arg0,
 952                 const struct brw_reg *arg1)
 953 {
 954    struct brw_compile *p = &c->func;
 955    struct intel_context *intel = &p->brw->intel;
 956    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 957
 958    if (!(mask & WRITEMASK_XYZW))
 959       return; /* Do not emit dead code */
 960
 961    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 962
 963    brw_push_insn_state(p);
 964
 965    /* math can only operate on up to a vec8 at a time, so in
 966     * dispatch_width==16 we have to do the second half manually.
 967     */
 968    if (intel->gen >= 6) {
 969       struct brw_reg src0 = arg0[0];
 970       struct brw_reg src1 = arg1[0];
 971       struct brw_reg temp_dst = dst[dst_chan];
 972
 973       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 974          if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 975             /* Both scalar arguments.  Do scalar calc. */
 976             src0.hstride = BRW_HORIZONTAL_STRIDE_1;
 977             src1.hstride = BRW_HORIZONTAL_STRIDE_1;
 978             temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
 979             temp_dst.width = BRW_WIDTH_1;
 980
 981             if (arg0[0].subnr != 0) {
 982                brw_MOV(p, temp_dst, src0);
 983                src0 = temp_dst;
 984
 985                /* Ouch.  We've used the temp as a dst, and we still
 986                 * need a temp to store arg1 in, because src and dst
 987                 * offsets have to be equal.  Leaving this up to
 988                 * glsl2-965 to handle correctly.
 989                 */
 990                assert(arg1[0].subnr == 0);
 991             } else if (arg1[0].subnr != 0) {
 992                brw_MOV(p, temp_dst, src1);
 993                src1 = temp_dst;
 994             }
 995          } else {
 996             brw_MOV(p, temp_dst, src0);
 997             src0 = temp_dst;
 998          }
 999       } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1000          brw_MOV(p, temp_dst, src1);
1001          src1 = temp_dst;
1002       }
1003
1004       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1005       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1006       brw_math2(p,
1007                 temp_dst,
1008                 function,
1009                 src0,
1010                 src1);
1011       if (c->dispatch_width == 16) {
1012          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1013          brw_math2(p,
1014                    sechalf(temp_dst),
1015                    function,
1016                    sechalf(src0),
1017                    sechalf(src1));
1018       }
1019
1020       /* Splat a scalar result into all the channels. */
1021       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1022           arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1023          temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1024          temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1025          brw_MOV(p, dst[dst_chan], temp_dst);
1026       }
1027    } else {
1028       GLuint saturate = ((mask & SATURATE) ?
1029                          BRW_MATH_SATURATE_SATURATE :
1030                          BRW_MATH_SATURATE_NONE);
1031
1032       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1033       brw_MOV(p, brw_message_reg(3), arg1[0]);
1034       if (c->dispatch_width == 16) {
1035          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1036          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1037       }
1038
1039       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1040       brw_math(p,
1041                dst[dst_chan],
1042                function,
1043                saturate,
1044                2,
1045                arg0[0],
1046                BRW_MATH_DATA_VECTOR,
1047                BRW_MATH_PRECISION_FULL);
1048
1049       /* Send two messages to perform all 16 operations:
1050        */
1051       if (c->dispatch_width == 16) {
1052          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1053          brw_math(p,
1054                   offset(dst[dst_chan],1),
1055                   function,
1056                   saturate,
1057                   4,
1058                   sechalf(arg0[0]),
1059                   BRW_MATH_DATA_VECTOR,
1060                   BRW_MATH_PRECISION_FULL);
1061       }
1062    }
1063    brw_pop_insn_state(p);
1064 }
1065
1066
1067 void emit_tex(struct brw_wm_compile *c,
1068               struct brw_reg *dst,
1069               GLuint dst_flags,
1070               struct brw_reg *arg,
1071               struct brw_reg depth_payload,
1072               GLuint tex_idx,
1073               GLuint sampler,
1074               GLboolean shadow)
1075 {
1076    struct brw_compile *p = &c->func;
1077    struct intel_context *intel = &p->brw->intel;
1078    struct brw_reg dst_retyped;
1079    GLuint cur_mrf = 2, response_length;
1080    GLuint i, nr_texcoords;
1081    GLuint emit;
1082    GLuint msg_type;
1083    GLuint mrf_per_channel;
1084    GLuint simd_mode;
1085
1086    if (c->dispatch_width == 16) {
1087       mrf_per_channel = 2;
1088       response_length = 8;
1089       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1090       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1091    } else {
1092       mrf_per_channel = 1;
1093       response_length = 4;
1094       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1095       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1096    }
1097
1098    /* How many input regs are there?
1099     */
1100    switch (tex_idx) {
1101    case TEXTURE_1D_INDEX:
1102       emit = WRITEMASK_X;
1103       nr_texcoords = 1;
1104       break;
1105    case TEXTURE_2D_INDEX:
1106    case TEXTURE_RECT_INDEX:
1107       emit = WRITEMASK_XY;
1108       nr_texcoords = 2;
1109       break;
1110    case TEXTURE_3D_INDEX:
1111    case TEXTURE_CUBE_INDEX:
1112       emit = WRITEMASK_XYZ;
1113       nr_texcoords = 3;
1114       break;
1115    default:
1116       /* unexpected target */
1117       abort();
1118    }
1119
1120    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1121    if (intel->gen < 5 && c->dispatch_width == 8)
1122       nr_texcoords = 3;
1123
1124    /* For shadow comparisons, we have to supply u,v,r. */
1125    if (shadow)
1126       nr_texcoords = 3;
1127
1128    /* Emit the texcoords. */
1129    for (i = 0; i < nr_texcoords; i++) {
1130       if (emit & (1<<i))
1131          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1132       else
1133          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1134       cur_mrf += mrf_per_channel;
1135    }
1136
1137    /* Fill in the shadow comparison reference value. */
1138    if (shadow) {
1139       if (intel->gen >= 5) {
1140          /* Fill in the cube map array index value. */
1141          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1142          cur_mrf += mrf_per_channel;
1143       } else if (c->dispatch_width == 8) {
1144          /* Fill in the LOD bias value. */
1145          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1146          cur_mrf += mrf_per_channel;
1147       }
1148       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1149       cur_mrf += mrf_per_channel;
1150    }
1151
1152    if (intel->gen >= 5) {
1153       if (shadow)
1154          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1155       else
1156          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1157    } else {
1158       /* Note that G45 and older determines shadow compare and dispatch width
1159        * from message length for most messages.
1160        */
1161       if (c->dispatch_width == 16 && shadow)
1162          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1163       else
1164          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1165    }
1166
1167    brw_SAMPLE(p,
1168               dst_retyped,
1169               1,
1170               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1171               SURF_INDEX_TEXTURE(sampler),
1172               sampler,
1173               dst_flags & WRITEMASK_XYZW,
1174               msg_type,
1175               response_length,
1176               cur_mrf - 1,
1177               0,
1178               1,
1179               simd_mode);
1180 }
1181
1182
1183 void emit_txb(struct brw_wm_compile *c,
1184               struct brw_reg *dst,
1185               GLuint dst_flags,
1186               struct brw_reg *arg,
1187               struct brw_reg depth_payload,
1188               GLuint tex_idx,
1189               GLuint sampler)
1190 {
1191    struct brw_compile *p = &c->func;
1192    struct intel_context *intel = &p->brw->intel;
1193    GLuint msgLength;
1194    GLuint msg_type;
1195    GLuint mrf_per_channel;
1196    GLuint response_length;
1197    struct brw_reg dst_retyped;
1198
1199    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1200     * samples, so we'll use the 16-wide instruction, leave the second halves
1201     * undefined, and trust the execution mask to keep the undefined pixels
1202     * from mattering.
1203     */
1204    if (c->dispatch_width == 16 || intel->gen < 5) {
1205       if (intel->gen >= 5)
1206          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1207       else
1208          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1209       mrf_per_channel = 2;
1210       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1211       response_length = 8;
1212    } else {
1213       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1214       mrf_per_channel = 1;
1215       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1216       response_length = 4;
1217    }
1218
1219    /* Shadow ignored for txb. */
1220    switch (tex_idx) {
1221    case TEXTURE_1D_INDEX:
1222       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1223       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1224       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1225       break;
1226    case TEXTURE_2D_INDEX:
1227    case TEXTURE_RECT_INDEX:
1228       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1229       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1230       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1231       break;
1232    case TEXTURE_3D_INDEX:
1233    case TEXTURE_CUBE_INDEX:
1234       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1235       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1236       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1237       break;
1238    default:
1239       /* unexpected target */
1240       abort();
1241    }
1242
1243    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1244    msgLength = 2 + 4 * mrf_per_channel - 1;
1245
1246    brw_SAMPLE(p,
1247               dst_retyped,
1248               1,
1249               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1250               SURF_INDEX_TEXTURE(sampler),
1251               sampler,
1252               dst_flags & WRITEMASK_XYZW,
1253               msg_type,
1254               response_length,
1255               msgLength,
1256               0,
1257               1,
1258               BRW_SAMPLER_SIMD_MODE_SIMD16);
1259 }
1260
1261
1262 static void emit_lit(struct brw_wm_compile *c,
1263                      const struct brw_reg *dst,
1264                      GLuint mask,
1265                      const struct brw_reg *arg0)
1266 {
1267    struct brw_compile *p = &c->func;
1268
1269    assert((mask & WRITEMASK_XW) == 0);
1270
1271    if (mask & WRITEMASK_Y) {
1272       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1273       brw_MOV(p, dst[1], arg0[0]);
1274       brw_set_saturate(p, 0);
1275    }
1276
1277    if (mask & WRITEMASK_Z) {
1278       emit_math2(c, BRW_MATH_FUNCTION_POW,
1279                  &dst[2],
1280                  WRITEMASK_X | (mask & SATURATE),
1281                  &arg0[1],
1282                  &arg0[3]);
1283    }
1284
1285    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1286     * some of the POW calculations above, but 16-wide iff statements
1287     * seem to lock c1 hardware, so this is a nasty workaround:
1288     */
1289    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1290    {
1291       if (mask & WRITEMASK_Y)
1292          brw_MOV(p, dst[1], brw_imm_f(0));
1293
1294       if (mask & WRITEMASK_Z)
1295          brw_MOV(p, dst[2], brw_imm_f(0));
1296    }
1297    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1298 }
1299
1300
1301 /* Kill pixel - set execution mask to zero for those pixels which
1302  * fail.
1303  */
1304 static void emit_kil( struct brw_wm_compile *c,
1305                       struct brw_reg *arg0)
1306 {
1307    struct brw_compile *p = &c->func;
1308    struct intel_context *intel = &p->brw->intel;
1309    struct brw_reg pixelmask;
1310    GLuint i, j;
1311
1312    if (intel->gen >= 6)
1313       pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1314    else
1315       pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1316
1317    for (i = 0; i < 4; i++) {
1318       /* Check if we've already done the comparison for this reg
1319        * -- common when someone does KIL TEMP.wwww.
1320        */
1321       for (j = 0; j < i; j++) {
1322          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1323             break;
1324       }
1325       if (j != i)
1326          continue;
1327
1328       brw_push_insn_state(p);
1329       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1330       brw_set_predicate_control_flag_value(p, 0xff);
1331       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1332       brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1333       brw_pop_insn_state(p);
1334    }
1335 }
1336
1337 static void fire_fb_write( struct brw_wm_compile *c,
1338                            GLuint base_reg,
1339                            GLuint nr,
1340                            GLuint target,
1341                            GLuint eot )
1342 {
1343    struct brw_compile *p = &c->func;
1344    struct intel_context *intel = &p->brw->intel;
1345    struct brw_reg dst;
1346
1347    if (c->dispatch_width == 16)
1348       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1349    else
1350       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1351
1352    /* Pass through control information:
1353     */
1354 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1355    if (intel->gen < 6) /* gen6, use headerless for fb write */
1356    {
1357       brw_push_insn_state(p);
1358       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1359       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1360       brw_MOV(p,
1361                brw_message_reg(base_reg + 1),
1362                brw_vec8_grf(1, 0));
1363       brw_pop_insn_state(p);
1364    }
1365
1366    /* Send framebuffer write message: */
1367 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1368    brw_fb_WRITE(p,
1369                 c->dispatch_width,
1370                 dst,
1371                 base_reg,
1372                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1373                 target,
1374                 nr,
1375                 0,
1376                 eot);
1377 }
1378
1379
1380 static void emit_aa( struct brw_wm_compile *c,
1381                      struct brw_reg *arg1,
1382                      GLuint reg )
1383 {
1384    struct brw_compile *p = &c->func;
1385    GLuint comp = c->aa_dest_stencil_reg / 2;
1386    GLuint off = c->aa_dest_stencil_reg % 2;
1387    struct brw_reg aa = offset(arg1[comp], off);
1388
1389    brw_push_insn_state(p);
1390    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1391    brw_MOV(p, brw_message_reg(reg), aa);
1392    brw_pop_insn_state(p);
1393 }
1394
1395
1396 /* Post-fragment-program processing.  Send the results to the
1397  * framebuffer.
1398  * \param arg0  the fragment color
1399  * \param arg1  the pass-through depth value
1400  * \param arg2  the shader-computed depth value
1401  */
1402 void emit_fb_write(struct brw_wm_compile *c,
1403                    struct brw_reg *arg0,
1404                    struct brw_reg *arg1,
1405                    struct brw_reg *arg2,
1406                    GLuint target,
1407                    GLuint eot)
1408 {
1409    struct brw_compile *p = &c->func;
1410    struct brw_context *brw = p->brw;
1411    struct intel_context *intel = &brw->intel;
1412    GLuint nr = 2;
1413    GLuint channel;
1414
1415    /* Reserve a space for AA - may not be needed:
1416     */
1417    if (c->aa_dest_stencil_reg)
1418       nr += 1;
1419
1420    /* I don't really understand how this achieves the color interleave
1421     * (ie RGBARGBA) in the result:  [Do the saturation here]
1422     */
1423    brw_push_insn_state(p);
1424
1425    for (channel = 0; channel < 4; channel++) {
1426       if (intel->gen >= 6) {
1427          /* gen6 SIMD16 single source DP write looks like:
1428           * m + 0: r0
1429           * m + 1: r1
1430           * m + 2: g0
1431           * m + 3: g1
1432           * m + 4: b0
1433           * m + 5: b1
1434           * m + 6: a0
1435           * m + 7: a1
1436           */
1437          if (c->dispatch_width == 16) {
1438             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1439          } else {
1440             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1441          }
1442       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1443          /* pre-gen6 SIMD16 single source DP write looks like:
1444           * m + 0: r0
1445           * m + 1: g0
1446           * m + 2: b0
1447           * m + 3: a0
1448           * m + 4: r1
1449           * m + 5: g1
1450           * m + 6: b1
1451           * m + 7: a1
1452           *
1453           * By setting the high bit of the MRF register number, we indicate
1454           * that we want COMPR4 mode - instead of doing the usual destination
1455           * + 1 for the second half we get destination + 4.
1456           */
1457          brw_MOV(p,
1458                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1459                  arg0[channel]);
1460       } else {
1461          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1462          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1463          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1464          brw_MOV(p,
1465                  brw_message_reg(nr + channel),
1466                  arg0[channel]);
1467
1468          if (c->dispatch_width == 16) {
1469             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1470             brw_MOV(p,
1471                     brw_message_reg(nr + channel + 4),
1472                     sechalf(arg0[channel]));
1473          }
1474       }
1475    }
1476    /* skip over the regs populated above:
1477     */
1478    if (c->dispatch_width == 16)
1479       nr += 8;
1480    else
1481       nr += 4;
1482
1483    brw_pop_insn_state(p);
1484
1485    if (c->source_depth_to_render_target)
1486    {
1487       if (c->computes_depth)
1488          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1489       else
1490          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1491
1492       nr += 2;
1493    }
1494
1495    if (c->dest_depth_reg)
1496    {
1497       GLuint comp = c->dest_depth_reg / 2;
1498       GLuint off = c->dest_depth_reg % 2;
1499
1500       if (off != 0) {
1501          brw_push_insn_state(p);
1502          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1503
1504          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1505          /* 2nd half? */
1506          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1507          brw_pop_insn_state(p);
1508       }
1509       else {
1510          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1511       }
1512       nr += 2;
1513    }
1514
1515    if (intel->gen >= 6) {
1516       /* Load the message header.  There's no implied move from src0
1517        * to the base mrf on gen6.
1518        */
1519       brw_push_insn_state(p);
1520       brw_set_mask_control(p, BRW_MASK_DISABLE);
1521       brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0));
1522       brw_pop_insn_state(p);
1523
1524       if (target != 0) {
1525          brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1526                                         0,
1527                                         2), BRW_REGISTER_TYPE_UD),
1528                  brw_imm_ud(target));
1529       }
1530    }
1531
1532    if (!c->runtime_check_aads_emit) {
1533       if (c->aa_dest_stencil_reg)
1534          emit_aa(c, arg1, 2);
1535
1536       fire_fb_write(c, 0, nr, target, eot);
1537    }
1538    else {
1539       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1540       struct brw_reg ip = brw_ip_reg();
1541       struct brw_instruction *jmp;
1542
1543       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1544       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1545       brw_AND(p,
1546               v1_null_ud,
1547               get_element_ud(brw_vec8_grf(1,0), 6),
1548               brw_imm_ud(1<<26));
1549
1550       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1551       {
1552          emit_aa(c, arg1, 2);
1553          fire_fb_write(c, 0, nr, target, eot);
1554          /* note - thread killed in subroutine */
1555       }
1556       brw_land_fwd_jump(p, jmp);
1557
1558       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1559        */
1560       fire_fb_write(c, 1, nr-1, target, eot);
1561    }
1562 }
1563
1564 /**
1565  * Move a GPR to scratch memory.
1566  */
1567 static void emit_spill( struct brw_wm_compile *c,
1568                         struct brw_reg reg,
1569                         GLuint slot )
1570 {
1571    struct brw_compile *p = &c->func;
1572
1573    /*
1574      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1575    */
1576    brw_MOV(p, brw_message_reg(2), reg);
1577
1578    /*
1579      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1580      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1581    */
1582    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1583 }
1584
1585
1586 /**
1587  * Load a GPR from scratch memory.
1588  */
1589 static void emit_unspill( struct brw_wm_compile *c,
1590                           struct brw_reg reg,
1591                           GLuint slot )
1592 {
1593    struct brw_compile *p = &c->func;
1594
1595    /* Slot 0 is the undef value.
1596     */
1597    if (slot == 0) {
1598       brw_MOV(p, reg, brw_imm_f(0));
1599       return;
1600    }
1601
1602    /*
1603      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1604      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1605    */
1606
1607    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1608 }
1609
1610
1611 /**
1612  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1613  * Args with unspill_reg != 0 will be loaded from scratch memory.
1614  */
1615 static void get_argument_regs( struct brw_wm_compile *c,
1616                                struct brw_wm_ref *arg[],
1617                                struct brw_reg *regs )
1618 {
1619    GLuint i;
1620
1621    for (i = 0; i < 4; i++) {
1622       if (arg[i]) {
1623          if (arg[i]->unspill_reg)
1624             emit_unspill(c,
1625                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1626                          arg[i]->value->spill_slot);
1627
1628          regs[i] = arg[i]->hw_reg;
1629       }
1630       else {
1631          regs[i] = brw_null_reg();
1632       }
1633    }
1634 }
1635
1636
1637 /**
1638  * For values that have a spill_slot!=0, write those regs to scratch memory.
1639  */
1640 static void spill_values( struct brw_wm_compile *c,
1641                           struct brw_wm_value *values,
1642                           GLuint nr )
1643 {
1644    GLuint i;
1645
1646    for (i = 0; i < nr; i++)
1647       if (values[i].spill_slot)
1648          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1649 }
1650
1651
1652 /* Emit the fragment program instructions here.
1653  */
1654 void brw_wm_emit( struct brw_wm_compile *c )
1655 {
1656    struct brw_compile *p = &c->func;
1657    struct intel_context *intel = &p->brw->intel;
1658    GLuint insn;
1659
1660    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1661    if (intel->gen >= 6)
1662         brw_set_acc_write_control(p, 1);
1663
1664    /* Check if any of the payload regs need to be spilled:
1665     */
1666    spill_values(c, c->payload.depth, 4);
1667    spill_values(c, c->creg, c->nr_creg);
1668    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1669
1670
1671    for (insn = 0; insn < c->nr_insns; insn++) {
1672
1673       struct brw_wm_instruction *inst = &c->instruction[insn];
1674       struct brw_reg args[3][4], dst[4];
1675       GLuint i, dst_flags;
1676
1677       /* Get argument regs:
1678        */
1679       for (i = 0; i < 3; i++)
1680          get_argument_regs(c, inst->src[i], args[i]);
1681
1682       /* Get dest regs:
1683        */
1684       for (i = 0; i < 4; i++)
1685          if (inst->dst[i])
1686             dst[i] = inst->dst[i]->hw_reg;
1687          else
1688             dst[i] = brw_null_reg();
1689
1690       /* Flags
1691        */
1692       dst_flags = inst->writemask;
1693       if (inst->saturate)
1694          dst_flags |= SATURATE;
1695
1696       switch (inst->opcode) {
1697          /* Generated instructions for calculating triangle interpolants:
1698           */
1699       case WM_PIXELXY:
1700          emit_pixel_xy(c, dst, dst_flags);
1701          break;
1702
1703       case WM_DELTAXY:
1704          emit_delta_xy(p, dst, dst_flags, args[0]);
1705          break;
1706
1707       case WM_WPOSXY:
1708          emit_wpos_xy(c, dst, dst_flags, args[0]);
1709          break;
1710
1711       case WM_PIXELW:
1712          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1713          break;
1714
1715       case WM_LINTERP:
1716          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1717          break;
1718
1719       case WM_PINTERP:
1720          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1721          break;
1722
1723       case WM_CINTERP:
1724          emit_cinterp(p, dst, dst_flags, args[0]);
1725          break;
1726
1727       case WM_FB_WRITE:
1728          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1729          break;
1730
1731       case WM_FRONTFACING:
1732          emit_frontfacing(p, dst, dst_flags);
1733          break;
1734
1735          /* Straightforward arithmetic:
1736           */
1737       case OPCODE_ADD:
1738          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1739          break;
1740
1741       case OPCODE_FRC:
1742          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1743          break;
1744
1745       case OPCODE_FLR:
1746          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1747          break;
1748
1749       case OPCODE_DDX:
1750          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1751          break;
1752
1753       case OPCODE_DDY:
1754          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1755          break;
1756
1757       case OPCODE_DP2:
1758          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1759          break;
1760
1761       case OPCODE_DP3:
1762          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1763          break;
1764
1765       case OPCODE_DP4:
1766          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1767          break;
1768
1769       case OPCODE_DPH:
1770          emit_dph(p, dst, dst_flags, args[0], args[1]);
1771          break;
1772
1773       case OPCODE_TRUNC:
1774          for (i = 0; i < 4; i++) {
1775             if (dst_flags & (1<<i)) {
1776                brw_RNDZ(p, dst[i], args[0][i]);
1777             }
1778          }
1779          break;
1780
1781       case OPCODE_LRP:
1782          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1783          break;
1784
1785       case OPCODE_MAD:
1786          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1787          break;
1788
1789       case OPCODE_MOV:
1790       case OPCODE_SWZ:
1791          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1792          break;
1793
1794       case OPCODE_MUL:
1795          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1796          break;
1797
1798       case OPCODE_XPD:
1799          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1800          break;
1801
1802          /* Higher math functions:
1803           */
1804       case OPCODE_RCP:
1805          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1806          break;
1807
1808       case OPCODE_RSQ:
1809          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1810          break;
1811
1812       case OPCODE_SIN:
1813          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1814          break;
1815
1816       case OPCODE_COS:
1817          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1818          break;
1819
1820       case OPCODE_EX2:
1821          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1822          break;
1823
1824       case OPCODE_LG2:
1825          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1826          break;
1827
1828       case OPCODE_SCS:
1829          /* There is an scs math function, but it would need some
1830           * fixup for 16-element execution.
1831           */
1832          if (dst_flags & WRITEMASK_X)
1833             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1834          if (dst_flags & WRITEMASK_Y)
1835             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1836          break;
1837
1838       case OPCODE_POW:
1839          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1840          break;
1841
1842          /* Comparisons:
1843           */
1844       case OPCODE_CMP:
1845          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1846          break;
1847
1848       case OPCODE_MAX:
1849          emit_max(p, dst, dst_flags, args[0], args[1]);
1850          break;
1851
1852       case OPCODE_MIN:
1853          emit_min(p, dst, dst_flags, args[0], args[1]);
1854          break;
1855
1856       case OPCODE_SLT:
1857          emit_slt(p, dst, dst_flags, args[0], args[1]);
1858          break;
1859
1860       case OPCODE_SLE:
1861          emit_sle(p, dst, dst_flags, args[0], args[1]);
1862         break;
1863       case OPCODE_SGT:
1864          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1865         break;
1866       case OPCODE_SGE:
1867          emit_sge(p, dst, dst_flags, args[0], args[1]);
1868          break;
1869       case OPCODE_SEQ:
1870          emit_seq(p, dst, dst_flags, args[0], args[1]);
1871         break;
1872       case OPCODE_SNE:
1873          emit_sne(p, dst, dst_flags, args[0], args[1]);
1874         break;
1875
1876       case OPCODE_SSG:
1877          emit_sign(p, dst, dst_flags, args[0]);
1878          break;
1879
1880       case OPCODE_LIT:
1881          emit_lit(c, dst, dst_flags, args[0]);
1882          break;
1883
1884          /* Texturing operations:
1885           */
1886       case OPCODE_TEX:
1887          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1888                   inst->tex_idx, inst->tex_unit,
1889                   inst->tex_shadow);
1890          break;
1891
1892       case OPCODE_TXB:
1893          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1894                   inst->tex_idx, inst->tex_unit);
1895          break;
1896
1897       case OPCODE_KIL:
1898          emit_kil(c, args[0]);
1899          break;
1900
1901       default:
1902          printf("Unsupported opcode %i (%s) in fragment shader\n",
1903                 inst->opcode, inst->opcode < MAX_OPCODE ?
1904                 _mesa_opcode_string(inst->opcode) :
1905                 "unknown");
1906       }
1907
1908       for (i = 0; i < 4; i++)
1909         if (inst->dst[i] && inst->dst[i]->spill_slot)
1910            emit_spill(c,
1911                       inst->dst[i]->hw_reg,
1912                       inst->dst[i]->spill_slot);
1913    }
1914
1915    /* Only properly tested on ILK */
1916    if (p->brw->intel.gen == 5) {
1917      brw_remove_duplicate_mrf_moves(p);
1918      if (c->dispatch_width == 16)
1919         brw_remove_grf_to_mrf_moves(p);
1920    }
1921
1922    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1923       int i;
1924
1925      printf("wm-native:\n");
1926      for (i = 0; i < p->nr_insn; i++)
1927          brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1928       printf("\n");
1929    }
1930 }
1931