1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "brw_context.h"
35 #include "brw_wm.h"
36
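/* Whether the PLN instruction can be used for this interpolation.  PLN reads
 * its pixel deltas as an aligned register pair, so the two delta registers
 * must be adjacent and, before gen6, start on an even register number.
 */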
37 static GLboolean can_do_pln(struct intel_context *intel,
38 const struct brw_reg *deltas)
39 {
40 struct brw_context *brw = brw_context(&intel->ctx);
41
42 if (!brw->has_pln)
43 return GL_FALSE;
44
45 if (deltas[1].nr != deltas[0].nr + 1)
46 return GL_FALSE;
47
48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49 return GL_FALSE;
50
51 return GL_TRUE;
52 }
53
54 /* Not quite sure how correct this is - need to understand horiz
55 * vs. vertical strides a little better.
56 */
57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
58 {
59 if (reg.vstride)
60 reg.nr++;
61 return reg;
62 }
63
64 /* Return whether the given source arg of the opcode can be an immediate float
65  * operand instead of a PROGRAM_CONSTANT value supplied through push/pull constants.
66  */
67 GLboolean
68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
69 {
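   /* Each entry is the source-operand count for opcodes where only the last
    * source may be an immediate: e.g. OPCODE_ADD has 2 sources, so only arg 1
    * can be folded into an immediate; for OPCODE_CMP it is arg 2.
    */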
70 int opcode_array[] = {
71 [OPCODE_ADD] = 2,
72 [OPCODE_CMP] = 3,
73 [OPCODE_DP3] = 2,
74 [OPCODE_DP4] = 2,
75 [OPCODE_DPH] = 2,
76 [OPCODE_MAX] = 2,
77 [OPCODE_MIN] = 2,
78 [OPCODE_MOV] = 1,
79 [OPCODE_MUL] = 2,
80 [OPCODE_SEQ] = 2,
81 [OPCODE_SGE] = 2,
82 [OPCODE_SGT] = 2,
83 [OPCODE_SLE] = 2,
84 [OPCODE_SLT] = 2,
85 [OPCODE_SNE] = 2,
86 [OPCODE_SWZ] = 1,
87 [OPCODE_XPD] = 2,
88 };
89
90 /* These opcodes get broken down in a way that allows two
91  * args to be immediates.
92  */
93 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
94 if (arg == 1 || arg == 2)
95 return GL_TRUE;
96 }
97
98 if (opcode >= ARRAY_SIZE(opcode_array))
99 return GL_FALSE;
100
101 return arg == opcode_array[opcode] - 1;
102 }
103
104 /**
105 * Computes the screen-space x,y position of the pixels.
106 *
107 * This will be used by emit_delta_xy() or emit_wpos_xy() for
108 * interpolation of attributes.
109 *
110 * Payload R0:
111 *
112 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
113 * corresponding to each of the 16 execution channels.
114 * R0.1..8 -- ?
115 * R1.0 -- triangle vertex 0.X
116 * R1.1 -- triangle vertex 0.Y
117 * R1.2 -- tile 0 x,y coords (2 packed uwords)
118 * R1.3 -- tile 1 x,y coords (2 packed uwords)
119 * R1.4 -- tile 2 x,y coords (2 packed uwords)
120 * R1.5 -- tile 3 x,y coords (2 packed uwords)
121 * R1.6 -- ?
122 * R1.7 -- ?
123 * R1.8 -- ?
124 */
125 void emit_pixel_xy(struct brw_wm_compile *c,
126 const struct brw_reg *dst,
127 GLuint mask)
128 {
129 struct brw_compile *p = &c->func;
130 struct brw_reg r1 = brw_vec1_grf(1, 0);
131 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
132 struct brw_reg dst0_uw, dst1_uw;
133
134 brw_push_insn_state(p);
135 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
136
137 if (c->dispatch_width == 16) {
138 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
139 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
140 } else {
141 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
142 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
143 }
144
145 /* Calculate pixel centers by adding 1 or 0 to each of the
146 * micro-tile coordinates passed in r1.
147 */
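   /* brw_imm_v() packs eight signed 4-bit values, lowest nibble first, so
    * 0x10101010 is the per-pixel X offset pattern 0,1,0,1,... and 0x11001100
    * the Y offset pattern 0,0,1,1,... across each 2x2 subspan.
    */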
148 if (mask & WRITEMASK_X) {
149 brw_ADD(p,
150 dst0_uw,
151 stride(suboffset(r1_uw, 4), 2, 4, 0),
152 brw_imm_v(0x10101010));
153 }
154
155 if (mask & WRITEMASK_Y) {
156 brw_ADD(p,
157 dst1_uw,
158 stride(suboffset(r1_uw,5), 2, 4, 0),
159 brw_imm_v(0x11001100));
160 }
161 brw_pop_insn_state(p);
162 }
163
164 /**
165 * Computes the screen-space x,y distance of the pixels from the start
166 * vertex.
167 *
168 * This will be used in linterp or pinterp with the start vertex value
169 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
170 * to produce interpolated attribute values.
171 */
172 void emit_delta_xy(struct brw_compile *p,
173 const struct brw_reg *dst,
174 GLuint mask,
175 const struct brw_reg *arg0)
176 {
177 struct intel_context *intel = &p->brw->intel;
178 struct brw_reg r1 = brw_vec1_grf(1, 0);
179
180 if (mask == 0)
181 return;
182
183 assert(mask == WRITEMASK_XY);
184
185 if (intel->gen >= 6) {
186 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
187	 Just add 0.0 so the pixel X/Y values land in the dst regs unchanged. */
188 r1 = brw_imm_v(0x00000000);
189 brw_ADD(p,
190 dst[0],
191 retype(arg0[0], BRW_REGISTER_TYPE_UW),
192 r1);
193 brw_ADD(p,
194 dst[1],
195 retype(arg0[1], BRW_REGISTER_TYPE_UW),
196 r1);
197 return;
198 }
199
200 /* Calc delta X,Y by subtracting origin in r1 from the pixel
201 * centers produced by emit_pixel_xy().
202 */
203 brw_ADD(p,
204 dst[0],
205 retype(arg0[0], BRW_REGISTER_TYPE_UW),
206 negate(r1));
207 brw_ADD(p,
208 dst[1],
209 retype(arg0[1], BRW_REGISTER_TYPE_UW),
210 negate(suboffset(r1,1)));
211 }
212
213 /**
214 * Computes the pixel offset from the window origin for gl_FragCoord.
215 */
216 void emit_wpos_xy(struct brw_wm_compile *c,
217 const struct brw_reg *dst,
218 GLuint mask,
219 const struct brw_reg *arg0)
220 {
221 struct brw_compile *p = &c->func;
222
223 if (mask & WRITEMASK_X) {
224 if (c->fp->program.PixelCenterInteger) {
225 /* X' = X */
226 brw_MOV(p,
227 dst[0],
228 retype(arg0[0], BRW_REGISTER_TYPE_W));
229 } else {
230 /* X' = X + 0.5 */
231 brw_ADD(p,
232 dst[0],
233 retype(arg0[0], BRW_REGISTER_TYPE_W),
234 brw_imm_f(0.5));
235 }
236 }
237
238 if (mask & WRITEMASK_Y) {
239 if (c->fp->program.OriginUpperLeft) {
240 if (c->fp->program.PixelCenterInteger) {
241 /* Y' = Y */
242 brw_MOV(p,
243 dst[1],
244 retype(arg0[1], BRW_REGISTER_TYPE_W));
245 } else {
246 /* Y' = Y + 0.5 */
247 brw_ADD(p,
248 dst[1],
249 retype(arg0[1], BRW_REGISTER_TYPE_W),
250 brw_imm_f(0.5));
251 }
252 } else {
253 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
254
255 /* Y' = (height - 1) - Y + center */
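	 /* e.g. for a 480-pixel-high drawable with half-integer centers, the
	  * top row (Y = 0) becomes Y' = 479 - 0 + 0.5 = 479.5.
	  */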
256 brw_ADD(p,
257 dst[1],
258 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
259 brw_imm_f(c->key.drawable_height - 1 + center_offset));
260 }
261 }
262 }
263
264
265 void emit_pixel_w(struct brw_wm_compile *c,
266 const struct brw_reg *dst,
267 GLuint mask,
268 const struct brw_reg *arg0,
269 const struct brw_reg *deltas)
270 {
271 struct brw_compile *p = &c->func;
272 struct intel_context *intel = &p->brw->intel;
273 struct brw_reg src;
274 struct brw_reg temp_dst;
275
276 if (intel->gen >= 6)
277 temp_dst = dst[3];
278 else
279 temp_dst = brw_message_reg(2);
280
281 assert(intel->gen < 6);
282
283 /* Don't need this if all you are doing is interpolating color, for
284 * instance.
285 */
286 if (mask & WRITEMASK_W) {
287 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
288
289 /* Calc 1/w - just linterp wpos[3] optimized by putting the
290 * result straight into a message reg.
291 */
292 if (can_do_pln(intel, deltas)) {
293 brw_PLN(p, temp_dst, interp3, deltas[0]);
294 } else {
295 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
296 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
297 }
298
299 /* Calc w */
300 if (intel->gen >= 6)
301 src = temp_dst;
302 else
303 src = brw_null_reg();
304
305 if (c->dispatch_width == 16) {
306 brw_math_16(p, dst[3],
307 BRW_MATH_FUNCTION_INV,
308 BRW_MATH_SATURATE_NONE,
309 2, src,
310 BRW_MATH_PRECISION_FULL);
311 } else {
312 brw_math(p, dst[3],
313 BRW_MATH_FUNCTION_INV,
314 BRW_MATH_SATURATE_NONE,
315 2, src,
316 BRW_MATH_DATA_VECTOR,
317 BRW_MATH_PRECISION_FULL);
318 }
319 }
320 }
321
322 void emit_linterp(struct brw_compile *p,
323 const struct brw_reg *dst,
324 GLuint mask,
325 const struct brw_reg *arg0,
326 const struct brw_reg *deltas)
327 {
328 struct intel_context *intel = &p->brw->intel;
329 struct brw_reg interp[4];
330 GLuint nr = arg0[0].nr;
331 GLuint i;
332
333 interp[0] = brw_vec1_grf(nr, 0);
334 interp[1] = brw_vec1_grf(nr, 4);
335 interp[2] = brw_vec1_grf(nr+1, 0);
336 interp[3] = brw_vec1_grf(nr+1, 4);
337
338 for (i = 0; i < 4; i++) {
339 if (mask & (1<<i)) {
340 if (intel->gen >= 6) {
341 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
342 } else if (can_do_pln(intel, deltas)) {
343 brw_PLN(p, dst[i], interp[i], deltas[0]);
344 } else {
345 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
346 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
347 }
348 }
349 }
350 }
351
352
353 void emit_pinterp(struct brw_compile *p,
354 const struct brw_reg *dst,
355 GLuint mask,
356 const struct brw_reg *arg0,
357 const struct brw_reg *deltas,
358 const struct brw_reg *w)
359 {
360 struct intel_context *intel = &p->brw->intel;
361 struct brw_reg interp[4];
362 GLuint nr = arg0[0].nr;
363 GLuint i;
364
365 if (intel->gen >= 6) {
366 emit_linterp(p, dst, mask, arg0, deltas); /* deltas are unused by the gen6 PLN path */
367 return;
368 }
369
370 interp[0] = brw_vec1_grf(nr, 0);
371 interp[1] = brw_vec1_grf(nr, 4);
372 interp[2] = brw_vec1_grf(nr+1, 0);
373 interp[3] = brw_vec1_grf(nr+1, 4);
374
375 for (i = 0; i < 4; i++) {
376 if (mask & (1<<i)) {
377 if (can_do_pln(intel, deltas)) {
378 brw_PLN(p, dst[i], interp[i], deltas[0]);
379 } else {
380 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
381 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
382 }
383 }
384 }
385 for (i = 0; i < 4; i++) {
386 if (mask & (1<<i)) {
387 brw_MUL(p, dst[i], dst[i], w[3]);
388 }
389 }
390 }
391
392
393 void emit_cinterp(struct brw_compile *p,
394 const struct brw_reg *dst,
395 GLuint mask,
396 const struct brw_reg *arg0)
397 {
398 struct brw_reg interp[4];
399 GLuint nr = arg0[0].nr;
400 GLuint i;
401
402 interp[0] = brw_vec1_grf(nr, 0);
403 interp[1] = brw_vec1_grf(nr, 4);
404 interp[2] = brw_vec1_grf(nr+1, 0);
405 interp[3] = brw_vec1_grf(nr+1, 4);
406
407 for (i = 0; i < 4; i++) {
408 if (mask & (1<<i)) {
409 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
410 }
411 }
412 }
413
414 /* Sets the destination channels to 1.0 or 0.0 according to gl_FrontFacing. */
415 void emit_frontfacing(struct brw_compile *p,
416 const struct brw_reg *dst,
417 GLuint mask)
418 {
419 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
420 GLuint i;
421
422 if (!(mask & WRITEMASK_XYZW))
423 return;
424
425 for (i = 0; i < 4; i++) {
426 if (mask & (1<<i)) {
427 brw_MOV(p, dst[i], brw_imm_f(0.0));
428 }
429 }
430
431 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
432 * us front face
433 */
434 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
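   /* brw_CMP() with a null destination leaves subsequent instructions
    * predicated on this result (see brw_CMP in brw_eu_emit.c), so the MOVs of
    * 1.0 below only write the front-facing channels;
    * brw_set_predicate_control_flag_value() then clears that state.
    */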
435 for (i = 0; i < 4; i++) {
436 if (mask & (1<<i)) {
437 brw_MOV(p, dst[i], brw_imm_f(1.0));
438 }
439 }
440 brw_set_predicate_control_flag_value(p, 0xff);
441 }
442
443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
444 * looking like:
445 *
446 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
447 *
448 * and we're trying to produce:
449 *
450 * DDX DDY
451 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
452 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
453 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
454 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
455 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
456 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
457 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
458 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
459 *
460 * and add another set of two more subspans if in 16-pixel dispatch mode.
461 *
462 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
463 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
464 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
465 * between each other. We could probably do it like ddx and swizzle the right
466 * order later, but bail for now and just produce
467 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
468 */
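/* Element by element (subspan layout tl,tr,bl,br at offsets 0..3):
 * DDX: src0 = <2;2,0> at suboffset 1 -> tr,tr,br,br and src1 = <2;2,0> at
 *      suboffset 0 -> tl,tl,bl,bl, giving (tr-tl, tr-tl, br-bl, br-bl).
 * DDY: src0 = <4;4,0> at suboffset 0 -> tl x4 and src1 = <4;4,0> at
 *      suboffset 2 -> bl x4, giving (tl-bl) x4 per subspan.
 */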
469 void emit_ddxy(struct brw_compile *p,
470 const struct brw_reg *dst,
471 GLuint mask,
472 GLboolean is_ddx,
473 const struct brw_reg *arg0)
474 {
475 int i;
476 struct brw_reg src0, src1;
477
478 if (mask & SATURATE)
479 brw_set_saturate(p, 1);
480 for (i = 0; i < 4; i++ ) {
481 if (mask & (1<<i)) {
482 if (is_ddx) {
483 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
484 BRW_REGISTER_TYPE_F,
485 BRW_VERTICAL_STRIDE_2,
486 BRW_WIDTH_2,
487 BRW_HORIZONTAL_STRIDE_0,
488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
490 BRW_REGISTER_TYPE_F,
491 BRW_VERTICAL_STRIDE_2,
492 BRW_WIDTH_2,
493 BRW_HORIZONTAL_STRIDE_0,
494 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
495 } else {
496 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
497 BRW_REGISTER_TYPE_F,
498 BRW_VERTICAL_STRIDE_4,
499 BRW_WIDTH_4,
500 BRW_HORIZONTAL_STRIDE_0,
501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
503 BRW_REGISTER_TYPE_F,
504 BRW_VERTICAL_STRIDE_4,
505 BRW_WIDTH_4,
506 BRW_HORIZONTAL_STRIDE_0,
507 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
508 }
509 brw_ADD(p, dst[i], src0, negate(src1));
510 }
511 }
512 if (mask & SATURATE)
513 brw_set_saturate(p, 0);
514 }
515
516 void emit_alu1(struct brw_compile *p,
517 struct brw_instruction *(*func)(struct brw_compile *,
518 struct brw_reg,
519 struct brw_reg),
520 const struct brw_reg *dst,
521 GLuint mask,
522 const struct brw_reg *arg0)
523 {
524 GLuint i;
525
526 if (mask & SATURATE)
527 brw_set_saturate(p, 1);
528
529 for (i = 0; i < 4; i++) {
530 if (mask & (1<<i)) {
531 func(p, dst[i], arg0[i]);
532 }
533 }
534
535 if (mask & SATURATE)
536 brw_set_saturate(p, 0);
537 }
538
539
540 void emit_alu2(struct brw_compile *p,
541 struct brw_instruction *(*func)(struct brw_compile *,
542 struct brw_reg,
543 struct brw_reg,
544 struct brw_reg),
545 const struct brw_reg *dst,
546 GLuint mask,
547 const struct brw_reg *arg0,
548 const struct brw_reg *arg1)
549 {
550 GLuint i;
551
552 if (mask & SATURATE)
553 brw_set_saturate(p, 1);
554
555 for (i = 0; i < 4; i++) {
556 if (mask & (1<<i)) {
557 func(p, dst[i], arg0[i], arg1[i]);
558 }
559 }
560
561 if (mask & SATURATE)
562 brw_set_saturate(p, 0);
563 }
564
565
566 void emit_mad(struct brw_compile *p,
567 const struct brw_reg *dst,
568 GLuint mask,
569 const struct brw_reg *arg0,
570 const struct brw_reg *arg1,
571 const struct brw_reg *arg2)
572 {
573 GLuint i;
574
575 for (i = 0; i < 4; i++) {
576 if (mask & (1<<i)) {
577 brw_MUL(p, dst[i], arg0[i], arg1[i]);
578
579 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
580 brw_ADD(p, dst[i], dst[i], arg2[i]);
581 brw_set_saturate(p, 0);
582 }
583 }
584 }
585
586 void emit_lrp(struct brw_compile *p,
587 const struct brw_reg *dst,
588 GLuint mask,
589 const struct brw_reg *arg0,
590 const struct brw_reg *arg1,
591 const struct brw_reg *arg2)
592 {
593 GLuint i;
594
595 /* Uses dst as a temporary:
596 */
597 for (i = 0; i < 4; i++) {
598 if (mask & (1<<i)) {
599 /* Can I use the LINE instruction for this?
600 */
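	 /* LRP: dst = arg0*arg1 + (1 - arg0)*arg2, built up as:
	  *    dst = 1 - arg0
	  *    acc = dst * arg2          (MUL to the null reg updates the accumulator)
	  *    dst = arg0 * arg1 + acc   (MAC)
	  */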
601 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
602 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
603
604 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
605 brw_MAC(p, dst[i], arg0[i], arg1[i]);
606 brw_set_saturate(p, 0);
607 }
608 }
609 }
610
611 void emit_sop(struct brw_compile *p,
612 const struct brw_reg *dst,
613 GLuint mask,
614 GLuint cond,
615 const struct brw_reg *arg0,
616 const struct brw_reg *arg1)
617 {
618 GLuint i;
619
620 for (i = 0; i < 4; i++) {
621 if (mask & (1<<i)) {
622 brw_push_insn_state(p);
623 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
624 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
625 brw_MOV(p, dst[i], brw_imm_f(0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
627 brw_MOV(p, dst[i], brw_imm_f(1.0));
628 brw_pop_insn_state(p);
629 }
630 }
631 }
632
633 static void emit_slt( struct brw_compile *p,
634 const struct brw_reg *dst,
635 GLuint mask,
636 const struct brw_reg *arg0,
637 const struct brw_reg *arg1 )
638 {
639 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
640 }
641
642 static void emit_sle( struct brw_compile *p,
643 const struct brw_reg *dst,
644 GLuint mask,
645 const struct brw_reg *arg0,
646 const struct brw_reg *arg1 )
647 {
648 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
649 }
650
651 static void emit_sgt( struct brw_compile *p,
652 const struct brw_reg *dst,
653 GLuint mask,
654 const struct brw_reg *arg0,
655 const struct brw_reg *arg1 )
656 {
657 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
658 }
659
660 static void emit_sge( struct brw_compile *p,
661 const struct brw_reg *dst,
662 GLuint mask,
663 const struct brw_reg *arg0,
664 const struct brw_reg *arg1 )
665 {
666 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
667 }
668
669 static void emit_seq( struct brw_compile *p,
670 const struct brw_reg *dst,
671 GLuint mask,
672 const struct brw_reg *arg0,
673 const struct brw_reg *arg1 )
674 {
675 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
676 }
677
678 static void emit_sne( struct brw_compile *p,
679 const struct brw_reg *dst,
680 GLuint mask,
681 const struct brw_reg *arg0,
682 const struct brw_reg *arg1 )
683 {
684 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
685 }
686
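/* ARB CMP: dst = (arg0 < 0) ? arg1 : arg2, implemented as a flag-setting
 * compare against 0.0 followed by a predicated SEL.
 */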
687 void emit_cmp(struct brw_compile *p,
688 const struct brw_reg *dst,
689 GLuint mask,
690 const struct brw_reg *arg0,
691 const struct brw_reg *arg1,
692 const struct brw_reg *arg2)
693 {
694 GLuint i;
695
696 for (i = 0; i < 4; i++) {
697 if (mask & (1<<i)) {
698 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
699
700 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
701 brw_SEL(p, dst[i], arg1[i], arg2[i]);
702 brw_set_saturate(p, 0);
703 brw_set_predicate_control_flag_value(p, 0xff);
704 }
705 }
706 }
707
708 void emit_sign(struct brw_compile *p,
709 const struct brw_reg *dst,
710 GLuint mask,
711 const struct brw_reg *arg0)
712 {
713 GLuint i;
714
715 for (i = 0; i < 4; i++) {
716 if (mask & (1<<i)) {
717 brw_MOV(p, dst[i], brw_imm_f(0.0));
718
719 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
720 brw_MOV(p, dst[i], brw_imm_f(-1.0));
721 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
722
723 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
724 brw_MOV(p, dst[i], brw_imm_f(1.0));
725 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
726 }
727 }
728 }
729
730 void emit_max(struct brw_compile *p,
731 const struct brw_reg *dst,
732 GLuint mask,
733 const struct brw_reg *arg0,
734 const struct brw_reg *arg1)
735 {
736 GLuint i;
737
738 for (i = 0; i < 4; i++) {
739 if (mask & (1<<i)) {
740 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
741
742 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
743 brw_SEL(p, dst[i], arg0[i], arg1[i]);
744 brw_set_saturate(p, 0);
745 brw_set_predicate_control_flag_value(p, 0xff);
746 }
747 }
748 }
749
750 void emit_min(struct brw_compile *p,
751 const struct brw_reg *dst,
752 GLuint mask,
753 const struct brw_reg *arg0,
754 const struct brw_reg *arg1)
755 {
756 GLuint i;
757
758 for (i = 0; i < 4; i++) {
759 if (mask & (1<<i)) {
760 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
761
762 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
763 brw_SEL(p, dst[i], arg0[i], arg1[i]);
764 brw_set_saturate(p, 0);
765 brw_set_predicate_control_flag_value(p, 0xff);
766 }
767 }
768 }
769
770
771 void emit_dp2(struct brw_compile *p,
772 const struct brw_reg *dst,
773 GLuint mask,
774 const struct brw_reg *arg0,
775 const struct brw_reg *arg1)
776 {
777 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
778
779 if (!(mask & WRITEMASK_XYZW))
780 return; /* Do not emit dead code */
781
782 assert(is_power_of_two(mask & WRITEMASK_XYZW));
783
784 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
785
786 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
787 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
788 brw_set_saturate(p, 0);
789 }
790
791
792 void emit_dp3(struct brw_compile *p,
793 const struct brw_reg *dst,
794 GLuint mask,
795 const struct brw_reg *arg0,
796 const struct brw_reg *arg1)
797 {
798 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
799
800 if (!(mask & WRITEMASK_XYZW))
801 return; /* Do not emit dead code */
802
803 assert(is_power_of_two(mask & WRITEMASK_XYZW));
804
805 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
806 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
807
808 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
809 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
810 brw_set_saturate(p, 0);
811 }
812
813
814 void emit_dp4(struct brw_compile *p,
815 const struct brw_reg *dst,
816 GLuint mask,
817 const struct brw_reg *arg0,
818 const struct brw_reg *arg1)
819 {
820 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
821
822 if (!(mask & WRITEMASK_XYZW))
823 return; /* Do not emit dead code */
824
825 assert(is_power_of_two(mask & WRITEMASK_XYZW));
826
827 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
828 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
829 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
830
831 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
832 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
833 brw_set_saturate(p, 0);
834 }
835
836
837 void emit_dph(struct brw_compile *p,
838 const struct brw_reg *dst,
839 GLuint mask,
840 const struct brw_reg *arg0,
841 const struct brw_reg *arg1)
842 {
843 const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
844
845 if (!(mask & WRITEMASK_XYZW))
846 return; /* Do not emit dead code */
847
848 assert(is_power_of_two(mask & WRITEMASK_XYZW));
849
850 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
851 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
852 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
853
854 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
855 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
856 brw_set_saturate(p, 0);
857 }
858
859
860 void emit_xpd(struct brw_compile *p,
861 const struct brw_reg *dst,
862 GLuint mask,
863 const struct brw_reg *arg0,
864 const struct brw_reg *arg1)
865 {
866 GLuint i;
867
868 assert((mask & WRITEMASK_W) != WRITEMASK_W);
869
870 for (i = 0 ; i < 3; i++) {
871 if (mask & (1<<i)) {
872 GLuint i2 = (i+2)%3;
873 GLuint i1 = (i+1)%3;
874
875 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
876
877 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
878 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
879 brw_set_saturate(p, 0);
880 }
881 }
882 }
883
884
885 void emit_math1(struct brw_wm_compile *c,
886 GLuint function,
887 const struct brw_reg *dst,
888 GLuint mask,
889 const struct brw_reg *arg0)
890 {
891 struct brw_compile *p = &c->func;
892 struct intel_context *intel = &p->brw->intel;
893 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
894 GLuint saturate = ((mask & SATURATE) ?
895 BRW_MATH_SATURATE_SATURATE :
896 BRW_MATH_SATURATE_NONE);
897 struct brw_reg src;
898
899 if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
900 arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
901 /* Gen6 math requires that source and dst horizontal stride be 1,
902 * and that the argument be in the GRF.
903 */
904 src = dst[dst_chan];
905 brw_MOV(p, src, arg0[0]);
906 } else {
907 src = arg0[0];
908 }
909
910 if (!(mask & WRITEMASK_XYZW))
911 return; /* Do not emit dead code */
912
913 assert(is_power_of_two(mask & WRITEMASK_XYZW));
914
915 /* Send two messages to perform all 16 operations:
916 */
917 brw_push_insn_state(p);
918 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
919 brw_math(p,
920 dst[dst_chan],
921 function,
922 saturate,
923 2,
924 src,
925 BRW_MATH_DATA_VECTOR,
926 BRW_MATH_PRECISION_FULL);
927
928 if (c->dispatch_width == 16) {
929 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
930 brw_math(p,
931 offset(dst[dst_chan],1),
932 function,
933 saturate,
934 3,
935 sechalf(src),
936 BRW_MATH_DATA_VECTOR,
937 BRW_MATH_PRECISION_FULL);
938 }
939 brw_pop_insn_state(p);
940 }
941
942
943 void emit_math2(struct brw_wm_compile *c,
944 GLuint function,
945 const struct brw_reg *dst,
946 GLuint mask,
947 const struct brw_reg *arg0,
948 const struct brw_reg *arg1)
949 {
950 struct brw_compile *p = &c->func;
951 struct intel_context *intel = &p->brw->intel;
952 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
953
954 if (!(mask & WRITEMASK_XYZW))
955 return; /* Do not emit dead code */
956
957 assert(is_power_of_two(mask & WRITEMASK_XYZW));
958
959 brw_push_insn_state(p);
960
961 /* math can only operate on up to a vec8 at a time, so in
962 * dispatch_width==16 we have to do the second half manually.
963 */
964 if (intel->gen >= 6) {
965 struct brw_reg src0 = arg0[0];
966 struct brw_reg src1 = arg1[0];
967 struct brw_reg temp_dst = dst[dst_chan];
968
969 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
970 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
971 /* Both scalar arguments. Do scalar calc. */
972 src0.hstride = BRW_HORIZONTAL_STRIDE_1;
973 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
974 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
975 temp_dst.width = BRW_WIDTH_1;
976
977 if (arg0[0].subnr != 0) {
978 brw_MOV(p, temp_dst, src0);
979 src0 = temp_dst;
980
981 /* Ouch. We've used the temp as a dst, and we still
982 * need a temp to store arg1 in, because src and dst
983 * offsets have to be equal. Leaving this up to
984 * glsl2-965 to handle correctly.
985 */
986 assert(arg1[0].subnr == 0);
987 } else if (arg1[0].subnr != 0) {
988 brw_MOV(p, temp_dst, src1);
989 src1 = temp_dst;
990 }
991 } else {
992 brw_MOV(p, temp_dst, src0);
993 src0 = temp_dst;
994 }
995 } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
996 brw_MOV(p, temp_dst, src1);
997 src1 = temp_dst;
998 }
999
1000 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1001 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1002 brw_math2(p,
1003 temp_dst,
1004 function,
1005 src0,
1006 src1);
1007 if (c->dispatch_width == 16) {
1008 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1009 brw_math2(p,
1010 sechalf(temp_dst),
1011 function,
1012 sechalf(src0),
1013 sechalf(src1));
1014 }
1015
1016 /* Splat a scalar result into all the channels. */
1017 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1018 arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1019 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1020 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1021 brw_MOV(p, dst[dst_chan], temp_dst);
1022 }
1023 } else {
1024 GLuint saturate = ((mask & SATURATE) ?
1025 BRW_MATH_SATURATE_SATURATE :
1026 BRW_MATH_SATURATE_NONE);
1027
1028 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1029 brw_MOV(p, brw_message_reg(3), arg1[0]);
1030 if (c->dispatch_width == 16) {
1031 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1032 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1033 }
1034
1035 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1036 brw_math(p,
1037 dst[dst_chan],
1038 function,
1039 saturate,
1040 2,
1041 arg0[0],
1042 BRW_MATH_DATA_VECTOR,
1043 BRW_MATH_PRECISION_FULL);
1044
1045 /* Send two messages to perform all 16 operations:
1046 */
1047 if (c->dispatch_width == 16) {
1048 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1049 brw_math(p,
1050 offset(dst[dst_chan],1),
1051 function,
1052 saturate,
1053 4,
1054 sechalf(arg0[0]),
1055 BRW_MATH_DATA_VECTOR,
1056 BRW_MATH_PRECISION_FULL);
1057 }
1058 }
1059 brw_pop_insn_state(p);
1060 }
1061
1062
1063 void emit_tex(struct brw_wm_compile *c,
1064 struct brw_reg *dst,
1065 GLuint dst_flags,
1066 struct brw_reg *arg,
1067 struct brw_reg depth_payload,
1068 GLuint tex_idx,
1069 GLuint sampler,
1070 GLboolean shadow)
1071 {
1072 struct brw_compile *p = &c->func;
1073 struct intel_context *intel = &p->brw->intel;
1074 struct brw_reg dst_retyped;
1075 GLuint cur_mrf = 2, response_length;
1076 GLuint i, nr_texcoords;
1077 GLuint emit;
1078 GLuint msg_type;
1079 GLuint mrf_per_channel;
1080 GLuint simd_mode;
1081
1082 if (c->dispatch_width == 16) {
1083 mrf_per_channel = 2;
1084 response_length = 8;
1085 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1086 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1087 } else {
1088 mrf_per_channel = 1;
1089 response_length = 4;
1090 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1091 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1092 }
1093
1094 /* How many input regs are there?
1095 */
1096 switch (tex_idx) {
1097 case TEXTURE_1D_INDEX:
1098 emit = WRITEMASK_X;
1099 nr_texcoords = 1;
1100 break;
1101 case TEXTURE_2D_INDEX:
1102 case TEXTURE_RECT_INDEX:
1103 emit = WRITEMASK_XY;
1104 nr_texcoords = 2;
1105 break;
1106 case TEXTURE_3D_INDEX:
1107 case TEXTURE_CUBE_INDEX:
1108 emit = WRITEMASK_XYZ;
1109 nr_texcoords = 3;
1110 break;
1111 default:
1112 /* unexpected target */
1113 abort();
1114 }
1115
1116 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1117 if (intel->gen < 5 && c->dispatch_width == 8)
1118 nr_texcoords = 3;
1119
1120 /* For shadow comparisons, we have to supply u,v,r. */
1121 if (shadow)
1122 nr_texcoords = 3;
1123
1124 /* Emit the texcoords. */
1125 for (i = 0; i < nr_texcoords; i++) {
1126 if (emit & (1<<i))
1127 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1128 else
1129 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130 cur_mrf += mrf_per_channel;
1131 }
1132
1133 /* Fill in the shadow comparison reference value. */
1134 if (shadow) {
1135 if (intel->gen >= 5) {
1136 /* Fill in the cube map array index value. */
1137 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1138 cur_mrf += mrf_per_channel;
1139 } else if (c->dispatch_width == 8) {
1140 /* Fill in the LOD bias value. */
1141 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1142 cur_mrf += mrf_per_channel;
1143 }
1144 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1145 cur_mrf += mrf_per_channel;
1146 }
1147
1148 if (intel->gen >= 5) {
1149 if (shadow)
1150 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1151 else
1152 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1153 } else {
1154 /* Note that G45 and older chipsets determine shadow compare and dispatch width
1155 * from message length for most messages.
1156 */
1157 if (c->dispatch_width == 16 && shadow)
1158 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1159 else
1160 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1161 }
1162
1163 brw_SAMPLE(p,
1164 dst_retyped,
1165 1,
1166 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1167 SURF_INDEX_TEXTURE(sampler),
1168 sampler,
1169 dst_flags & WRITEMASK_XYZW,
1170 msg_type,
1171 response_length,
1172 cur_mrf - 1,
1173 0,
1174 1,
1175 simd_mode);
1176 }
1177
1178
1179 void emit_txb(struct brw_wm_compile *c,
1180 struct brw_reg *dst,
1181 GLuint dst_flags,
1182 struct brw_reg *arg,
1183 struct brw_reg depth_payload,
1184 GLuint tex_idx,
1185 GLuint sampler)
1186 {
1187 struct brw_compile *p = &c->func;
1188 struct intel_context *intel = &p->brw->intel;
1189 GLuint msgLength;
1190 GLuint msg_type;
1191 GLuint mrf_per_channel;
1192 GLuint response_length;
1193 struct brw_reg dst_retyped;
1194
1195 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1196 * samples, so we'll use the 16-wide instruction, leave the second halves
1197 * undefined, and trust the execution mask to keep the undefined pixels
1198 * from mattering.
1199 */
1200 if (c->dispatch_width == 16 || intel->gen < 5) {
1201 if (intel->gen >= 5)
1202 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1203 else
1204 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1205 mrf_per_channel = 2;
1206 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1207 response_length = 8;
1208 } else {
1209 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1210 mrf_per_channel = 1;
1211 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1212 response_length = 4;
1213 }
1214
1215 /* Shadow ignored for txb. */
1216 switch (tex_idx) {
1217 case TEXTURE_1D_INDEX:
1218 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1219 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1220 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1221 break;
1222 case TEXTURE_2D_INDEX:
1223 case TEXTURE_RECT_INDEX:
1224 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1225 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1226 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1227 break;
1228 case TEXTURE_3D_INDEX:
1229 case TEXTURE_CUBE_INDEX:
1230 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1231 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1232 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1233 break;
1234 default:
1235 /* unexpected target */
1236 abort();
1237 }
1238
1239 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1240 msgLength = 2 + 4 * mrf_per_channel - 1;
1241
1242 brw_SAMPLE(p,
1243 dst_retyped,
1244 1,
1245 retype(depth_payload, BRW_REGISTER_TYPE_UW),
1246 SURF_INDEX_TEXTURE(sampler),
1247 sampler,
1248 dst_flags & WRITEMASK_XYZW,
1249 msg_type,
1250 response_length,
1251 msgLength,
1252 0,
1253 1,
1254 BRW_SAMPLER_SIMD_MODE_SIMD16);
1255 }
1256
1257
1258 static void emit_lit(struct brw_wm_compile *c,
1259 const struct brw_reg *dst,
1260 GLuint mask,
1261 const struct brw_reg *arg0)
1262 {
1263 struct brw_compile *p = &c->func;
1264
1265 assert((mask & WRITEMASK_XW) == 0);
1266
1267 if (mask & WRITEMASK_Y) {
1268 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1269 brw_MOV(p, dst[1], arg0[0]);
1270 brw_set_saturate(p, 0);
1271 }
1272
1273 if (mask & WRITEMASK_Z) {
1274 emit_math2(c, BRW_MATH_FUNCTION_POW,
1275 &dst[2],
1276 WRITEMASK_X | (mask & SATURATE),
1277 &arg0[1],
1278 &arg0[3]);
1279 }
1280
1281 /* Ordinarily you'd use an iff statement to skip or shortcircuit
1282 * some of the POW calculations above, but 16-wide iff statements
1283 * seem to lock c1 hardware, so this is a nasty workaround:
1284 */
1285 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1286 {
1287 if (mask & WRITEMASK_Y)
1288 brw_MOV(p, dst[1], brw_imm_f(0));
1289
1290 if (mask & WRITEMASK_Z)
1291 brw_MOV(p, dst[2], brw_imm_f(0));
1292 }
1293 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1294 }
1295
1296
1297 /* Kill pixel - set execution mask to zero for those pixels which
1298 * fail.
1299 */
1300 static void emit_kil( struct brw_wm_compile *c,
1301 struct brw_reg *arg0)
1302 {
1303 struct brw_compile *p = &c->func;
1304 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1305 GLuint i, j;
1306
1307 for (i = 0; i < 4; i++) {
1308 /* Check if we've already done the comparison for this reg
1309 * -- common when someone does KIL TEMP.wwww.
1310 */
1311 for (j = 0; j < i; j++) {
1312 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1313 break;
1314 }
1315 if (j != i)
1316 continue;
1317
1318 brw_push_insn_state(p);
1319 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1320 brw_set_predicate_control_flag_value(p, 0xff);
1321 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
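      /* AND the comparison result in the flag register into the pixel-enable
       * bits in r0.0, so that killed pixels are dropped from later messages
       * such as the framebuffer write.
       */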
1322 brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1323 brw_pop_insn_state(p);
1324 }
1325 }
1326
1327 /* KIL_NV kills the pixels that are currently executing, not based on a test
1328 * of the arguments.
1329 */
1330 void emit_kil_nv( struct brw_wm_compile *c )
1331 {
1332 struct brw_compile *p = &c->func;
1333 struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1334
1335 brw_push_insn_state(p);
1336 brw_set_mask_control(p, BRW_MASK_DISABLE);
1337 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1338 brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1339 brw_pop_insn_state(p);
1340 }
1341
1342 static void fire_fb_write( struct brw_wm_compile *c,
1343 GLuint base_reg,
1344 GLuint nr,
1345 GLuint target,
1346 GLuint eot )
1347 {
1348 struct brw_compile *p = &c->func;
1349 struct intel_context *intel = &p->brw->intel;
1350 struct brw_reg dst;
1351
1352 if (c->dispatch_width == 16)
1353 dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1354 else
1355 dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1356
1357 /* Pass through control information:
1358 */
1359 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
1360 if (intel->gen < 6) /* gen6 uses headerless fb writes, so no header copy is needed there */
1361 {
1362 brw_push_insn_state(p);
1363 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1364 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1365 brw_MOV(p,
1366 brw_message_reg(base_reg + 1),
1367 brw_vec8_grf(1, 0));
1368 brw_pop_insn_state(p);
1369 }
1370
1371 /* Send framebuffer write message: */
1372 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
1373 brw_fb_WRITE(p,
1374 c->dispatch_width,
1375 dst,
1376 base_reg,
1377 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1378 target,
1379 nr,
1380 0,
1381 eot);
1382 }
1383
1384
1385 static void emit_aa( struct brw_wm_compile *c,
1386 struct brw_reg *arg1,
1387 GLuint reg )
1388 {
1389 struct brw_compile *p = &c->func;
1390 GLuint comp = c->key.aa_dest_stencil_reg / 2;
1391 GLuint off = c->key.aa_dest_stencil_reg % 2;
1392 struct brw_reg aa = offset(arg1[comp], off);
1393
1394 brw_push_insn_state(p);
1395 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1396 brw_MOV(p, brw_message_reg(reg), aa);
1397 brw_pop_insn_state(p);
1398 }
1399
1400
1401 /* Post-fragment-program processing. Send the results to the
1402 * framebuffer.
1403 * \param arg0 the fragment color
1404 * \param arg1 the pass-through depth value
1405 * \param arg2 the shader-computed depth value
1406 */
1407 void emit_fb_write(struct brw_wm_compile *c,
1408 struct brw_reg *arg0,
1409 struct brw_reg *arg1,
1410 struct brw_reg *arg2,
1411 GLuint target,
1412 GLuint eot)
1413 {
1414 struct brw_compile *p = &c->func;
1415 struct brw_context *brw = p->brw;
1416 struct intel_context *intel = &brw->intel;
1417 GLuint nr = 2;
1418 GLuint channel;
1419 int base_reg; /* For gen6 headerless fb writes, the message starts directly at the color payload. */
1420
1421 /* Reserve a space for AA - may not be needed:
1422 */
1423 if (c->key.aa_dest_stencil_reg)
1424 nr += 1;
1425
1426 /* I don't really understand how this achieves the color interleave
1427 * (ie RGBARGBA) in the result: [Do the saturation here]
1428 */
1429 brw_push_insn_state(p);
1430
1431 if (intel->gen >= 6)
1432 base_reg = nr;
1433 else
1434 base_reg = 0;
1435
1436 for (channel = 0; channel < 4; channel++) {
1437 if (intel->gen >= 6) {
1438 /* gen6 SIMD16 single source DP write looks like:
1439 * m + 0: r0
1440 * m + 1: r1
1441 * m + 2: g0
1442 * m + 3: g1
1443 * m + 4: b0
1444 * m + 5: b1
1445 * m + 6: a0
1446 * m + 7: a1
1447 */
1448 if (c->dispatch_width == 16) {
1449 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1450 } else {
1451 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1452 }
1453 } else if (c->dispatch_width == 16 && brw->has_compr4) {
1454 /* pre-gen6 SIMD16 single source DP write looks like:
1455 * m + 0: r0
1456 * m + 1: g0
1457 * m + 2: b0
1458 * m + 3: a0
1459 * m + 4: r1
1460 * m + 5: g1
1461 * m + 6: b1
1462 * m + 7: a1
1463 *
1464 * By setting the high bit of the MRF register number, we indicate
1465 * that we want COMPR4 mode - instead of doing the usual destination
1466 * + 1 for the second half we get destination + 4.
1467 */
1468 brw_MOV(p,
1469 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1470 arg0[channel]);
1471 } else {
1472 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
1473 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
1474 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1475 brw_MOV(p,
1476 brw_message_reg(nr + channel),
1477 arg0[channel]);
1478
1479 if (c->dispatch_width == 16) {
1480 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1481 brw_MOV(p,
1482 brw_message_reg(nr + channel + 4),
1483 sechalf(arg0[channel]));
1484 }
1485 }
1486 }
1487 /* skip over the regs populated above:
1488 */
1489 if (c->dispatch_width == 16)
1490 nr += 8;
1491 else
1492 nr += 4;
1493
1494 brw_pop_insn_state(p);
1495
1496 if (c->key.source_depth_to_render_target)
1497 {
1498 if (c->key.computes_depth)
1499 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1500 else
1501 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1502
1503 nr += 2;
1504 }
1505
1506 if (c->key.dest_depth_reg)
1507 {
1508 GLuint comp = c->key.dest_depth_reg / 2;
1509 GLuint off = c->key.dest_depth_reg % 2;
1510
1511 if (off != 0) {
1512 brw_push_insn_state(p);
1513 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1514
1515 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1516 /* 2nd half? */
1517 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1518 brw_pop_insn_state(p);
1519 }
1520 else {
1521 brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1522 }
1523 nr += 2;
1524 }
1525
1526 if (intel->gen >= 6) {
1527 /* Subtract off the message header, since we send headerless. */
1528 nr -= 2;
1529 }
1530
1531 if (!c->key.runtime_check_aads_emit) {
1532 if (c->key.aa_dest_stencil_reg)
1533 emit_aa(c, arg1, 2);
1534
1535 fire_fb_write(c, base_reg, nr, target, eot);
1536 }
1537 else {
1538 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1539 struct brw_reg ip = brw_ip_reg();
1540 struct brw_instruction *jmp;
1541
1542 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1543 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1544 brw_AND(p,
1545 v1_null_ud,
1546 get_element_ud(brw_vec8_grf(1,0), 6),
1547 brw_imm_ud(1<<26));
1548
1549 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1550 {
1551 emit_aa(c, arg1, 2);
1552 fire_fb_write(c, 0, nr, target, eot);
1553 /* note - thread killed in subroutine */
1554 }
1555 brw_land_fwd_jump(p, jmp);
1556
1557 /* ELSE: Shuffle up one register to fill in the hole left for AA:
1558 */
1559 fire_fb_write(c, 1, nr-1, target, eot);
1560 }
1561 }
1562
1563 /**
1564 * Move a GPR to scratch memory.
1565 */
1566 static void emit_spill( struct brw_wm_compile *c,
1567 struct brw_reg reg,
1568 GLuint slot )
1569 {
1570 struct brw_compile *p = &c->func;
1571
1572 /*
1573 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
1574 */
1575 brw_MOV(p, brw_message_reg(2), reg);
1576
1577 /*
1578 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
1579 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
1580 */
1581 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1582 }
1583
1584
1585 /**
1586 * Load a GPR from scratch memory.
1587 */
1588 static void emit_unspill( struct brw_wm_compile *c,
1589 struct brw_reg reg,
1590 GLuint slot )
1591 {
1592 struct brw_compile *p = &c->func;
1593
1594 /* Slot 0 is the undef value.
1595 */
1596 if (slot == 0) {
1597 brw_MOV(p, reg, brw_imm_f(0));
1598 return;
1599 }
1600
1601 /*
1602 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
1603 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
1604 */
1605
1606 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1607 }
1608
1609
1610 /**
1611 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1612 * Args with unspill_reg != 0 will be loaded from scratch memory.
1613 */
1614 static void get_argument_regs( struct brw_wm_compile *c,
1615 struct brw_wm_ref *arg[],
1616 struct brw_reg *regs )
1617 {
1618 GLuint i;
1619
1620 for (i = 0; i < 4; i++) {
1621 if (arg[i]) {
1622 if (arg[i]->unspill_reg)
1623 emit_unspill(c,
1624 brw_vec8_grf(arg[i]->unspill_reg, 0),
1625 arg[i]->value->spill_slot);
1626
1627 regs[i] = arg[i]->hw_reg;
1628 }
1629 else {
1630 regs[i] = brw_null_reg();
1631 }
1632 }
1633 }
1634
1635
1636 /**
1637 * For values that have a spill_slot!=0, write those regs to scratch memory.
1638 */
1639 static void spill_values( struct brw_wm_compile *c,
1640 struct brw_wm_value *values,
1641 GLuint nr )
1642 {
1643 GLuint i;
1644
1645 for (i = 0; i < nr; i++)
1646 if (values[i].spill_slot)
1647 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1648 }
1649
1650
1651 /* Emit the fragment program instructions here.
1652 */
1653 void brw_wm_emit( struct brw_wm_compile *c )
1654 {
1655 struct brw_compile *p = &c->func;
1656 struct intel_context *intel = &p->brw->intel;
1657 GLuint insn;
1658
1659 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1660 if (intel->gen >= 6)
1661 brw_set_acc_write_control(p, 1);
1662
1663 /* Check if any of the payload regs need to be spilled:
1664 */
1665 spill_values(c, c->payload.depth, 4);
1666 spill_values(c, c->creg, c->nr_creg);
1667 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1668
1669
1670 for (insn = 0; insn < c->nr_insns; insn++) {
1671
1672 struct brw_wm_instruction *inst = &c->instruction[insn];
1673 struct brw_reg args[3][4], dst[4];
1674 GLuint i, dst_flags;
1675
1676 /* Get argument regs:
1677 */
1678 for (i = 0; i < 3; i++)
1679 get_argument_regs(c, inst->src[i], args[i]);
1680
1681 /* Get dest regs:
1682 */
1683 for (i = 0; i < 4; i++)
1684 if (inst->dst[i])
1685 dst[i] = inst->dst[i]->hw_reg;
1686 else
1687 dst[i] = brw_null_reg();
1688
1689 /* Flags
1690 */
1691 dst_flags = inst->writemask;
1692 if (inst->saturate)
1693 dst_flags |= SATURATE;
1694
1695 switch (inst->opcode) {
1696 /* Generated instructions for calculating triangle interpolants:
1697 */
1698 case WM_PIXELXY:
1699 emit_pixel_xy(c, dst, dst_flags);
1700 break;
1701
1702 case WM_DELTAXY:
1703 emit_delta_xy(p, dst, dst_flags, args[0]);
1704 break;
1705
1706 case WM_WPOSXY:
1707 emit_wpos_xy(c, dst, dst_flags, args[0]);
1708 break;
1709
1710 case WM_PIXELW:
1711 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1712 break;
1713
1714 case WM_LINTERP:
1715 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1716 break;
1717
1718 case WM_PINTERP:
1719 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1720 break;
1721
1722 case WM_CINTERP:
1723 emit_cinterp(p, dst, dst_flags, args[0]);
1724 break;
1725
1726 case WM_FB_WRITE:
1727 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1728 break;
1729
1730 case WM_FRONTFACING:
1731 emit_frontfacing(p, dst, dst_flags);
1732 break;
1733
1734 /* Straightforward arithmetic:
1735 */
1736 case OPCODE_ADD:
1737 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1738 break;
1739
1740 case OPCODE_FRC:
1741 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1742 break;
1743
1744 case OPCODE_FLR:
1745 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1746 break;
1747
1748 case OPCODE_DDX:
1749 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1750 break;
1751
1752 case OPCODE_DDY:
1753 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1754 break;
1755
1756 case OPCODE_DP2:
1757 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1758 break;
1759
1760 case OPCODE_DP3:
1761 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1762 break;
1763
1764 case OPCODE_DP4:
1765 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1766 break;
1767
1768 case OPCODE_DPH:
1769 emit_dph(p, dst, dst_flags, args[0], args[1]);
1770 break;
1771
1772 case OPCODE_TRUNC:
1773 for (i = 0; i < 4; i++) {
1774 if (dst_flags & (1<<i)) {
1775 brw_RNDZ(p, dst[i], args[0][i]);
1776 }
1777 }
1778 break;
1779
1780 case OPCODE_LRP:
1781 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1782 break;
1783
1784 case OPCODE_MAD:
1785 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1786 break;
1787
1788 case OPCODE_MOV:
1789 case OPCODE_SWZ:
1790 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1791 break;
1792
1793 case OPCODE_MUL:
1794 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1795 break;
1796
1797 case OPCODE_XPD:
1798 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1799 break;
1800
1801 /* Higher math functions:
1802 */
1803 case OPCODE_RCP:
1804 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1805 break;
1806
1807 case OPCODE_RSQ:
1808 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1809 break;
1810
1811 case OPCODE_SIN:
1812 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1813 break;
1814
1815 case OPCODE_COS:
1816 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1817 break;
1818
1819 case OPCODE_EX2:
1820 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1821 break;
1822
1823 case OPCODE_LG2:
1824 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1825 break;
1826
1827 case OPCODE_SCS:
1828 /* There is an scs math function, but it would need some
1829 * fixup for 16-element execution.
1830 */
1831 if (dst_flags & WRITEMASK_X)
1832 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833 if (dst_flags & WRITEMASK_Y)
1834 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1835 break;
1836
1837 case OPCODE_POW:
1838 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1839 break;
1840
1841 /* Comparisons:
1842 */
1843 case OPCODE_CMP:
1844 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1845 break;
1846
1847 case OPCODE_MAX:
1848 emit_max(p, dst, dst_flags, args[0], args[1]);
1849 break;
1850
1851 case OPCODE_MIN:
1852 emit_min(p, dst, dst_flags, args[0], args[1]);
1853 break;
1854
1855 case OPCODE_SLT:
1856 emit_slt(p, dst, dst_flags, args[0], args[1]);
1857 break;
1858
1859 case OPCODE_SLE:
1860 emit_sle(p, dst, dst_flags, args[0], args[1]);
1861 break;
1862 case OPCODE_SGT:
1863 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1864 break;
1865 case OPCODE_SGE:
1866 emit_sge(p, dst, dst_flags, args[0], args[1]);
1867 break;
1868 case OPCODE_SEQ:
1869 emit_seq(p, dst, dst_flags, args[0], args[1]);
1870 break;
1871 case OPCODE_SNE:
1872 emit_sne(p, dst, dst_flags, args[0], args[1]);
1873 break;
1874
1875 case OPCODE_SSG:
1876 emit_sign(p, dst, dst_flags, args[0]);
1877 break;
1878
1879 case OPCODE_LIT:
1880 emit_lit(c, dst, dst_flags, args[0]);
1881 break;
1882
1883 /* Texturing operations:
1884 */
1885 case OPCODE_TEX:
1886 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1887 inst->tex_idx, inst->tex_unit,
1888 inst->tex_shadow);
1889 break;
1890
1891 case OPCODE_TXB:
1892 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1893 inst->tex_idx, inst->tex_unit);
1894 break;
1895
1896 case OPCODE_KIL:
1897 emit_kil(c, args[0]);
1898 break;
1899
1900 case OPCODE_KIL_NV:
1901 emit_kil_nv(c);
1902 break;
1903
1904 default:
1905 printf("Unsupported opcode %i (%s) in fragment shader\n",
1906 inst->opcode, inst->opcode < MAX_OPCODE ?
1907 _mesa_opcode_string(inst->opcode) :
1908 "unknown");
1909 }
1910
1911 for (i = 0; i < 4; i++)
1912 if (inst->dst[i] && inst->dst[i]->spill_slot)
1913 emit_spill(c,
1914 inst->dst[i]->hw_reg,
1915 inst->dst[i]->spill_slot);
1916 }
1917
1918 /* Only properly tested on ILK */
1919 if (p->brw->intel.gen == 5) {
1920 brw_remove_duplicate_mrf_moves(p);
1921 if (c->dispatch_width == 16)
1922 brw_remove_grf_to_mrf_moves(p);
1923 }
1924
1925 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1926 int i;
1927
1928 printf("wm-native:\n");
1929 for (i = 0; i < p->nr_insn; i++)
1930 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1931 printf("\n");
1932 }
1933 }
1934