src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_SWZ] = 1,
  87       [OPCODE_XPD] = 2,
  88    };
  89
  90    /* These opcodes get broken down in a way that allow two
  91     * args to be immediates.
  92     */
  93    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  94       if (arg == 1 || arg == 2)
  95          return GL_TRUE;
  96    }
  97
  98    if (opcode > ARRAY_SIZE(opcode_array))
  99       return GL_FALSE;
 100
 101    return arg == opcode_array[opcode] - 1;
 102 }
 103
 104 /**
 105  * Computes the screen-space x,y position of the pixels.
 106  *
 107  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 108  * interpolation of attributes..
 109  *
 110  * Payload R0:
 111  *
 112  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 113  *         corresponding to each of the 16 execution channels.
 114  * R0.1..8 -- ?
 115  * R1.0 -- triangle vertex 0.X
 116  * R1.1 -- triangle vertex 0.Y
 117  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 118  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 119  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 120  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 121  * R1.6 -- ?
 122  * R1.7 -- ?
 123  * R1.8 -- ?
 124  */
 125 void emit_pixel_xy(struct brw_wm_compile *c,
 126                    const struct brw_reg *dst,
 127                    GLuint mask)
 128 {
 129    struct brw_compile *p = &c->func;
 130    struct brw_reg r1 = brw_vec1_grf(1, 0);
 131    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 132    struct brw_reg dst0_uw, dst1_uw;
 133
 134    brw_push_insn_state(p);
 135    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 136
 137    if (c->dispatch_width == 16) {
 138       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 139       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 140    } else {
 141       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 142       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 143    }
 144
 145    /* Calculate pixel centers by adding 1 or 0 to each of the
 146     * micro-tile coordinates passed in r1.
 147     */
 148    if (mask & WRITEMASK_X) {
 149       brw_ADD(p,
 150               dst0_uw,
 151               stride(suboffset(r1_uw, 4), 2, 4, 0),
 152               brw_imm_v(0x10101010));
 153    }
 154
 155    if (mask & WRITEMASK_Y) {
 156       brw_ADD(p,
 157               dst1_uw,
 158               stride(suboffset(r1_uw,5), 2, 4, 0),
 159               brw_imm_v(0x11001100));
 160    }
 161    brw_pop_insn_state(p);
 162 }
 163
 164 /**
 165  * Computes the screen-space x,y distance of the pixels from the start
 166  * vertex.
 167  *
 168  * This will be used in linterp or pinterp with the start vertex value
 169  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 170  * to produce interpolated attribute values.
 171  */
 172 void emit_delta_xy(struct brw_compile *p,
 173                    const struct brw_reg *dst,
 174                    GLuint mask,
 175                    const struct brw_reg *arg0)
 176 {
 177    struct intel_context *intel = &p->brw->intel;
 178    struct brw_reg r1 = brw_vec1_grf(1, 0);
 179
 180    if (mask == 0)
 181       return;
 182
 183    assert(mask == WRITEMASK_XY);
 184
 185    if (intel->gen >= 6) {
 186        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 187           Just add them with 0.0 for dst reg.. */
 188        r1 = brw_imm_v(0x00000000);
 189        brw_ADD(p,
 190                dst[0],
 191                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 192                r1);
 193        brw_ADD(p,
 194                dst[1],
 195                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 196                r1);
 197        return;
 198    }
 199
 200    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 201     * centers produced by emit_pixel_xy().
 202     */
 203    brw_ADD(p,
 204            dst[0],
 205            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 206            negate(r1));
 207    brw_ADD(p,
 208            dst[1],
 209            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 210            negate(suboffset(r1,1)));
 211 }
 212
 213 /**
 214  * Computes the pixel offset from the window origin for gl_FragCoord().
 215  */
 216 void emit_wpos_xy(struct brw_wm_compile *c,
 217                   const struct brw_reg *dst,
 218                   GLuint mask,
 219                   const struct brw_reg *arg0)
 220 {
 221    struct brw_compile *p = &c->func;
 222
 223    if (mask & WRITEMASK_X) {
 224       if (c->fp->program.PixelCenterInteger) {
 225          /* X' = X */
 226          brw_MOV(p,
 227                  dst[0],
 228                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 229       } else {
 230          /* X' = X + 0.5 */
 231          brw_ADD(p,
 232                  dst[0],
 233                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 234                  brw_imm_f(0.5));
 235       }
 236    }
 237
 238    if (mask & WRITEMASK_Y) {
 239       if (c->fp->program.OriginUpperLeft) {
 240          if (c->fp->program.PixelCenterInteger) {
 241             /* Y' = Y */
 242             brw_MOV(p,
 243                     dst[1],
 244                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 245          } else {
 246             /* Y' = Y + 0.5 */
 247             brw_ADD(p,
 248                     dst[1],
 249                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 250                     brw_imm_f(0.5));
 251          }
 252       } else {
 253          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 254
 255          /* Y' = (height - 1) - Y + center */
 256          brw_ADD(p,
 257                  dst[1],
 258                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 259                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 260       }
 261    }
 262 }
 263
 264
 265 void emit_pixel_w(struct brw_wm_compile *c,
 266                   const struct brw_reg *dst,
 267                   GLuint mask,
 268                   const struct brw_reg *arg0,
 269                   const struct brw_reg *deltas)
 270 {
 271    struct brw_compile *p = &c->func;
 272    struct intel_context *intel = &p->brw->intel;
 273    struct brw_reg src;
 274    struct brw_reg temp_dst;
 275
 276    if (intel->gen >= 6)
 277         temp_dst = dst[3];
 278    else
 279         temp_dst = brw_message_reg(2);
 280
 281    assert(intel->gen < 6);
 282
 283    /* Don't need this if all you are doing is interpolating color, for
 284     * instance.
 285     */
 286    if (mask & WRITEMASK_W) {
 287       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 288
 289       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 290        * result straight into a message reg.
 291        */
 292       if (can_do_pln(intel, deltas)) {
 293          brw_PLN(p, temp_dst, interp3, deltas[0]);
 294       } else {
 295          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 296          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 297       }
 298
 299       /* Calc w */
 300       if (intel->gen >= 6)
 301          src = temp_dst;
 302       else
 303          src = brw_null_reg();
 304
 305       if (c->dispatch_width == 16) {
 306          brw_math_16(p, dst[3],
 307                      BRW_MATH_FUNCTION_INV,
 308                      BRW_MATH_SATURATE_NONE,
 309                      2, src,
 310                      BRW_MATH_PRECISION_FULL);
 311       } else {
 312          brw_math(p, dst[3],
 313                   BRW_MATH_FUNCTION_INV,
 314                   BRW_MATH_SATURATE_NONE,
 315                   2, src,
 316                   BRW_MATH_DATA_VECTOR,
 317                   BRW_MATH_PRECISION_FULL);
 318       }
 319    }
 320 }
 321
 322 void emit_linterp(struct brw_compile *p,
 323                   const struct brw_reg *dst,
 324                   GLuint mask,
 325                   const struct brw_reg *arg0,
 326                   const struct brw_reg *deltas)
 327 {
 328    struct intel_context *intel = &p->brw->intel;
 329    struct brw_reg interp[4];
 330    GLuint nr = arg0[0].nr;
 331    GLuint i;
 332
 333    interp[0] = brw_vec1_grf(nr, 0);
 334    interp[1] = brw_vec1_grf(nr, 4);
 335    interp[2] = brw_vec1_grf(nr+1, 0);
 336    interp[3] = brw_vec1_grf(nr+1, 4);
 337
 338    for (i = 0; i < 4; i++) {
 339       if (mask & (1<<i)) {
 340          if (intel->gen >= 6) {
 341             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 342          } else if (can_do_pln(intel, deltas)) {
 343             brw_PLN(p, dst[i], interp[i], deltas[0]);
 344          } else {
 345             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 346             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 347          }
 348       }
 349    }
 350 }
 351
 352
 353 void emit_pinterp(struct brw_compile *p,
 354                   const struct brw_reg *dst,
 355                   GLuint mask,
 356                   const struct brw_reg *arg0,
 357                   const struct brw_reg *deltas,
 358                   const struct brw_reg *w)
 359 {
 360    struct intel_context *intel = &p->brw->intel;
 361    struct brw_reg interp[4];
 362    GLuint nr = arg0[0].nr;
 363    GLuint i;
 364
 365    if (intel->gen >= 6) {
 366       emit_linterp(p, dst, mask, arg0, interp);
 367       return;
 368    }
 369
 370    interp[0] = brw_vec1_grf(nr, 0);
 371    interp[1] = brw_vec1_grf(nr, 4);
 372    interp[2] = brw_vec1_grf(nr+1, 0);
 373    interp[3] = brw_vec1_grf(nr+1, 4);
 374
 375    for (i = 0; i < 4; i++) {
 376       if (mask & (1<<i)) {
 377          if (can_do_pln(intel, deltas)) {
 378             brw_PLN(p, dst[i], interp[i], deltas[0]);
 379          } else {
 380             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 381             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 382          }
 383       }
 384    }
 385    for (i = 0; i < 4; i++) {
 386       if (mask & (1<<i)) {
 387          brw_MUL(p, dst[i], dst[i], w[3]);
 388       }
 389    }
 390 }
 391
 392
 393 void emit_cinterp(struct brw_compile *p,
 394                   const struct brw_reg *dst,
 395                   GLuint mask,
 396                   const struct brw_reg *arg0)
 397 {
 398    struct brw_reg interp[4];
 399    GLuint nr = arg0[0].nr;
 400    GLuint i;
 401
 402    interp[0] = brw_vec1_grf(nr, 0);
 403    interp[1] = brw_vec1_grf(nr, 4);
 404    interp[2] = brw_vec1_grf(nr+1, 0);
 405    interp[3] = brw_vec1_grf(nr+1, 4);
 406
 407    for (i = 0; i < 4; i++) {
 408       if (mask & (1<<i)) {
 409          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 410       }
 411    }
 412 }
 413
 414 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 415 void emit_frontfacing(struct brw_compile *p,
 416                       const struct brw_reg *dst,
 417                       GLuint mask)
 418 {
 419    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 420    GLuint i;
 421
 422    if (!(mask & WRITEMASK_XYZW))
 423       return;
 424
 425    for (i = 0; i < 4; i++) {
 426       if (mask & (1<<i)) {
 427          brw_MOV(p, dst[i], brw_imm_f(0.0));
 428       }
 429    }
 430
 431    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 432     * us front face
 433     */
 434    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 435    for (i = 0; i < 4; i++) {
 436       if (mask & (1<<i)) {
 437          brw_MOV(p, dst[i], brw_imm_f(1.0));
 438       }
 439    }
 440    brw_set_predicate_control_flag_value(p, 0xff);
 441 }
 442
 443 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 444  * looking like:
 445  *
 446  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 447  *
 448  * and we're trying to produce:
 449  *
 450  *           DDX                     DDY
 451  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 452  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 453  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 454  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 455  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 456  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 457  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 458  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 459  *
 460  * and add another set of two more subspans if in 16-pixel dispatch mode.
 461  *
 462  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 463  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 464  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 465  * between each other.  We could probably do it like ddx and swizzle the right
 466  * order later, but bail for now and just produce
 467  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 468  */
 469 void emit_ddxy(struct brw_compile *p,
 470                const struct brw_reg *dst,
 471                GLuint mask,
 472                GLboolean is_ddx,
 473                const struct brw_reg *arg0)
 474 {
 475    int i;
 476    struct brw_reg src0, src1;
 477
 478    if (mask & SATURATE)
 479       brw_set_saturate(p, 1);
 480    for (i = 0; i < 4; i++ ) {
 481       if (mask & (1<<i)) {
 482          if (is_ddx) {
 483             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 484                            BRW_REGISTER_TYPE_F,
 485                            BRW_VERTICAL_STRIDE_2,
 486                            BRW_WIDTH_2,
 487                            BRW_HORIZONTAL_STRIDE_0,
 488                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 489             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 490                            BRW_REGISTER_TYPE_F,
 491                            BRW_VERTICAL_STRIDE_2,
 492                            BRW_WIDTH_2,
 493                            BRW_HORIZONTAL_STRIDE_0,
 494                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 495          } else {
 496             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 497                            BRW_REGISTER_TYPE_F,
 498                            BRW_VERTICAL_STRIDE_4,
 499                            BRW_WIDTH_4,
 500                            BRW_HORIZONTAL_STRIDE_0,
 501                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 502             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 503                            BRW_REGISTER_TYPE_F,
 504                            BRW_VERTICAL_STRIDE_4,
 505                            BRW_WIDTH_4,
 506                            BRW_HORIZONTAL_STRIDE_0,
 507                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 508          }
 509          brw_ADD(p, dst[i], src0, negate(src1));
 510       }
 511    }
 512    if (mask & SATURATE)
 513       brw_set_saturate(p, 0);
 514 }
 515
 516 void emit_alu1(struct brw_compile *p,
 517                struct brw_instruction *(*func)(struct brw_compile *,
 518                                                struct brw_reg,
 519                                                struct brw_reg),
 520                const struct brw_reg *dst,
 521                GLuint mask,
 522                const struct brw_reg *arg0)
 523 {
 524    GLuint i;
 525
 526    if (mask & SATURATE)
 527       brw_set_saturate(p, 1);
 528
 529    for (i = 0; i < 4; i++) {
 530       if (mask & (1<<i)) {
 531          func(p, dst[i], arg0[i]);
 532       }
 533    }
 534
 535    if (mask & SATURATE)
 536       brw_set_saturate(p, 0);
 537 }
 538
 539
 540 void emit_alu2(struct brw_compile *p,
 541                struct brw_instruction *(*func)(struct brw_compile *,
 542                                                struct brw_reg,
 543                                                struct brw_reg,
 544                                                struct brw_reg),
 545                const struct brw_reg *dst,
 546                GLuint mask,
 547                const struct brw_reg *arg0,
 548                const struct brw_reg *arg1)
 549 {
 550    GLuint i;
 551
 552    if (mask & SATURATE)
 553       brw_set_saturate(p, 1);
 554
 555    for (i = 0; i < 4; i++) {
 556       if (mask & (1<<i)) {
 557          func(p, dst[i], arg0[i], arg1[i]);
 558       }
 559    }
 560
 561    if (mask & SATURATE)
 562       brw_set_saturate(p, 0);
 563 }
 564
 565
 566 void emit_mad(struct brw_compile *p,
 567               const struct brw_reg *dst,
 568               GLuint mask,
 569               const struct brw_reg *arg0,
 570               const struct brw_reg *arg1,
 571               const struct brw_reg *arg2)
 572 {
 573    GLuint i;
 574
 575    for (i = 0; i < 4; i++) {
 576       if (mask & (1<<i)) {
 577          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 578
 579          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 580          brw_ADD(p, dst[i], dst[i], arg2[i]);
 581          brw_set_saturate(p, 0);
 582       }
 583    }
 584 }
 585
 586 void emit_lrp(struct brw_compile *p,
 587               const struct brw_reg *dst,
 588               GLuint mask,
 589               const struct brw_reg *arg0,
 590               const struct brw_reg *arg1,
 591               const struct brw_reg *arg2)
 592 {
 593    GLuint i;
 594
 595    /* Uses dst as a temporary:
 596     */
 597    for (i = 0; i < 4; i++) {
 598       if (mask & (1<<i)) {
 599          /* Can I use the LINE instruction for this?
 600           */
 601          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 602          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 603
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 606          brw_set_saturate(p, 0);
 607       }
 608    }
 609 }
 610
 611 void emit_sop(struct brw_compile *p,
 612               const struct brw_reg *dst,
 613               GLuint mask,
 614               GLuint cond,
 615               const struct brw_reg *arg0,
 616               const struct brw_reg *arg1)
 617 {
 618    GLuint i;
 619
 620    for (i = 0; i < 4; i++) {
 621       if (mask & (1<<i)) {
 622          brw_push_insn_state(p);
 623          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 624          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 625          brw_MOV(p, dst[i], brw_imm_f(0));
 626          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 627          brw_MOV(p, dst[i], brw_imm_f(1.0));
 628          brw_pop_insn_state(p);
 629       }
 630    }
 631 }
 632
 633 static void emit_slt( struct brw_compile *p,
 634                       const struct brw_reg *dst,
 635                       GLuint mask,
 636                       const struct brw_reg *arg0,
 637                       const struct brw_reg *arg1 )
 638 {
 639    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 640 }
 641
 642 static void emit_sle( struct brw_compile *p,
 643                       const struct brw_reg *dst,
 644                       GLuint mask,
 645                       const struct brw_reg *arg0,
 646                       const struct brw_reg *arg1 )
 647 {
 648    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 649 }
 650
 651 static void emit_sgt( struct brw_compile *p,
 652                       const struct brw_reg *dst,
 653                       GLuint mask,
 654                       const struct brw_reg *arg0,
 655                       const struct brw_reg *arg1 )
 656 {
 657    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 658 }
 659
 660 static void emit_sge( struct brw_compile *p,
 661                       const struct brw_reg *dst,
 662                       GLuint mask,
 663                       const struct brw_reg *arg0,
 664                       const struct brw_reg *arg1 )
 665 {
 666    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 667 }
 668
 669 static void emit_seq( struct brw_compile *p,
 670                       const struct brw_reg *dst,
 671                       GLuint mask,
 672                       const struct brw_reg *arg0,
 673                       const struct brw_reg *arg1 )
 674 {
 675    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 676 }
 677
 678 static void emit_sne( struct brw_compile *p,
 679                       const struct brw_reg *dst,
 680                       GLuint mask,
 681                       const struct brw_reg *arg0,
 682                       const struct brw_reg *arg1 )
 683 {
 684    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 685 }
 686
 687 void emit_cmp(struct brw_compile *p,
 688               const struct brw_reg *dst,
 689               GLuint mask,
 690               const struct brw_reg *arg0,
 691               const struct brw_reg *arg1,
 692               const struct brw_reg *arg2)
 693 {
 694    GLuint i;
 695
 696    for (i = 0; i < 4; i++) {
 697       if (mask & (1<<i)) {
 698          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 699
 700          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 701          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 702          brw_set_saturate(p, 0);
 703          brw_set_predicate_control_flag_value(p, 0xff);
 704       }
 705    }
 706 }
 707
 708 void emit_sign(struct brw_compile *p,
 709                const struct brw_reg *dst,
 710                GLuint mask,
 711                const struct brw_reg *arg0)
 712 {
 713    GLuint i;
 714
 715    for (i = 0; i < 4; i++) {
 716       if (mask & (1<<i)) {
 717          brw_MOV(p, dst[i], brw_imm_f(0.0));
 718
 719          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 720          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 721          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 722
 723          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 724          brw_MOV(p, dst[i], brw_imm_f(1.0));
 725          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 726       }
 727    }
 728 }
 729
 730 void emit_max(struct brw_compile *p,
 731               const struct brw_reg *dst,
 732               GLuint mask,
 733               const struct brw_reg *arg0,
 734               const struct brw_reg *arg1)
 735 {
 736    GLuint i;
 737
 738    for (i = 0; i < 4; i++) {
 739       if (mask & (1<<i)) {
 740          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 741
 742          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 743          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 744          brw_set_saturate(p, 0);
 745          brw_set_predicate_control_flag_value(p, 0xff);
 746       }
 747    }
 748 }
 749
 750 void emit_min(struct brw_compile *p,
 751               const struct brw_reg *dst,
 752               GLuint mask,
 753               const struct brw_reg *arg0,
 754               const struct brw_reg *arg1)
 755 {
 756    GLuint i;
 757
 758    for (i = 0; i < 4; i++) {
 759       if (mask & (1<<i)) {
 760          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 761
 762          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 763          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 764          brw_set_saturate(p, 0);
 765          brw_set_predicate_control_flag_value(p, 0xff);
 766       }
 767    }
 768 }
 769
 770
 771 void emit_dp2(struct brw_compile *p,
 772               const struct brw_reg *dst,
 773               GLuint mask,
 774               const struct brw_reg *arg0,
 775               const struct brw_reg *arg1)
 776 {
 777    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 778
 779    if (!(mask & WRITEMASK_XYZW))
 780       return; /* Do not emit dead code */
 781
 782    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 783
 784    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 785
 786    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 787    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 788    brw_set_saturate(p, 0);
 789 }
 790
 791
 792 void emit_dp3(struct brw_compile *p,
 793               const struct brw_reg *dst,
 794               GLuint mask,
 795               const struct brw_reg *arg0,
 796               const struct brw_reg *arg1)
 797 {
 798    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 799
 800    if (!(mask & WRITEMASK_XYZW))
 801       return; /* Do not emit dead code */
 802
 803    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 804
 805    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 806    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 807
 808    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 809    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 810    brw_set_saturate(p, 0);
 811 }
 812
 813
 814 void emit_dp4(struct brw_compile *p,
 815               const struct brw_reg *dst,
 816               GLuint mask,
 817               const struct brw_reg *arg0,
 818               const struct brw_reg *arg1)
 819 {
 820    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 821
 822    if (!(mask & WRITEMASK_XYZW))
 823       return; /* Do not emit dead code */
 824
 825    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 826
 827    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 828    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 829    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 830
 831    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 832    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 833    brw_set_saturate(p, 0);
 834 }
 835
 836
 837 void emit_dph(struct brw_compile *p,
 838               const struct brw_reg *dst,
 839               GLuint mask,
 840               const struct brw_reg *arg0,
 841               const struct brw_reg *arg1)
 842 {
 843    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 844
 845    if (!(mask & WRITEMASK_XYZW))
 846       return; /* Do not emit dead code */
 847
 848    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 849
 850    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 851    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 852    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 853
 854    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 855    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 856    brw_set_saturate(p, 0);
 857 }
 858
 859
 860 void emit_xpd(struct brw_compile *p,
 861               const struct brw_reg *dst,
 862               GLuint mask,
 863               const struct brw_reg *arg0,
 864               const struct brw_reg *arg1)
 865 {
 866    GLuint i;
 867
 868    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 869
 870    for (i = 0 ; i < 3; i++) {
 871       if (mask & (1<<i)) {
 872          GLuint i2 = (i+2)%3;
 873          GLuint i1 = (i+1)%3;
 874
 875          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 876
 877          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 878          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 879          brw_set_saturate(p, 0);
 880       }
 881    }
 882 }
 883
 884
 885 void emit_math1(struct brw_wm_compile *c,
 886                 GLuint function,
 887                 const struct brw_reg *dst,
 888                 GLuint mask,
 889                 const struct brw_reg *arg0)
 890 {
 891    struct brw_compile *p = &c->func;
 892    struct intel_context *intel = &p->brw->intel;
 893    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 894    GLuint saturate = ((mask & SATURATE) ?
 895                       BRW_MATH_SATURATE_SATURATE :
 896                       BRW_MATH_SATURATE_NONE);
 897    struct brw_reg src;
 898
 899    if (intel->gen >= 6 && arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 900       /* Gen6 math requires that source and dst horizontal stride be 1.
 901        *
 902        */
 903       src = *dst;
 904       brw_MOV(p, src, arg0[0]);
 905    } else {
 906       src = arg0[0];
 907    }
 908
 909    if (!(mask & WRITEMASK_XYZW))
 910       return; /* Do not emit dead code */
 911
 912    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 913
 914    /* Send two messages to perform all 16 operations:
 915     */
 916    brw_push_insn_state(p);
 917    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 918    brw_math(p,
 919             dst[dst_chan],
 920             function,
 921             saturate,
 922             2,
 923             src,
 924             BRW_MATH_DATA_VECTOR,
 925             BRW_MATH_PRECISION_FULL);
 926
 927    if (c->dispatch_width == 16) {
 928       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 929       brw_math(p,
 930                offset(dst[dst_chan],1),
 931                function,
 932                saturate,
 933                3,
 934                sechalf(src),
 935                BRW_MATH_DATA_VECTOR,
 936                BRW_MATH_PRECISION_FULL);
 937    }
 938    brw_pop_insn_state(p);
 939 }
 940
 941
 942 void emit_math2(struct brw_wm_compile *c,
 943                 GLuint function,
 944                 const struct brw_reg *dst,
 945                 GLuint mask,
 946                 const struct brw_reg *arg0,
 947                 const struct brw_reg *arg1)
 948 {
 949    struct brw_compile *p = &c->func;
 950    struct intel_context *intel = &p->brw->intel;
 951    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 952
 953    if (!(mask & WRITEMASK_XYZW))
 954       return; /* Do not emit dead code */
 955
 956    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 957
 958    brw_push_insn_state(p);
 959
 960    /* math can only operate on up to a vec8 at a time, so in
 961     * dispatch_width==16 we have to do the second half manually.
 962     */
 963    if (intel->gen >= 6) {
 964       struct brw_reg src0 = arg0[0];
 965       struct brw_reg src1 = arg1[0];
 966       struct brw_reg temp_dst = dst[dst_chan];
 967
 968       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 969          if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 970             /* Both scalar arguments.  Do scalar calc. */
 971             src0.hstride = BRW_HORIZONTAL_STRIDE_1;
 972             src1.hstride = BRW_HORIZONTAL_STRIDE_1;
 973             temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
 974             temp_dst.width = BRW_WIDTH_1;
 975
 976             if (arg0[0].subnr != 0) {
 977                brw_MOV(p, temp_dst, src0);
 978                src0 = temp_dst;
 979
 980                /* Ouch.  We've used the temp as a dst, and we still
 981                 * need a temp to store arg1 in, because src and dst
 982                 * offsets have to be equal.  Leaving this up to
 983                 * glsl2-965 to handle correctly.
 984                 */
 985                assert(arg1[0].subnr == 0);
 986             } else if (arg1[0].subnr != 0) {
 987                brw_MOV(p, temp_dst, src1);
 988                src1 = temp_dst;
 989             }
 990          } else {
 991             brw_MOV(p, temp_dst, src0);
 992             src0 = temp_dst;
 993          }
 994       } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 995          brw_MOV(p, temp_dst, src1);
 996          src1 = temp_dst;
 997       }
 998
 999       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1000       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1001       brw_math2(p,
1002                 temp_dst,
1003                 function,
1004                 src0,
1005                 src1);
1006       if (c->dispatch_width == 16) {
1007          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1008          brw_math2(p,
1009                    sechalf(temp_dst),
1010                    function,
1011                    sechalf(src0),
1012                    sechalf(src1));
1013       }
1014
1015       /* Splat a scalar result into all the channels. */
1016       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1017           arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1018          temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1019          temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1020          brw_MOV(p, dst[dst_chan], temp_dst);
1021       }
1022    } else {
1023       GLuint saturate = ((mask & SATURATE) ?
1024                          BRW_MATH_SATURATE_SATURATE :
1025                          BRW_MATH_SATURATE_NONE);
1026
1027       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1028       brw_MOV(p, brw_message_reg(3), arg1[0]);
1029       if (c->dispatch_width == 16) {
1030          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1031          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1032       }
1033
1034       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1035       brw_math(p,
1036                dst[dst_chan],
1037                function,
1038                saturate,
1039                2,
1040                arg0[0],
1041                BRW_MATH_DATA_VECTOR,
1042                BRW_MATH_PRECISION_FULL);
1043
1044       /* Send two messages to perform all 16 operations:
1045        */
1046       if (c->dispatch_width == 16) {
1047          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1048          brw_math(p,
1049                   offset(dst[dst_chan],1),
1050                   function,
1051                   saturate,
1052                   4,
1053                   sechalf(arg0[0]),
1054                   BRW_MATH_DATA_VECTOR,
1055                   BRW_MATH_PRECISION_FULL);
1056       }
1057    }
1058    brw_pop_insn_state(p);
1059 }
1060
1061
1062 void emit_tex(struct brw_wm_compile *c,
1063               struct brw_reg *dst,
1064               GLuint dst_flags,
1065               struct brw_reg *arg,
1066               struct brw_reg depth_payload,
1067               GLuint tex_idx,
1068               GLuint sampler,
1069               GLboolean shadow)
1070 {
1071    struct brw_compile *p = &c->func;
1072    struct intel_context *intel = &p->brw->intel;
1073    struct brw_reg dst_retyped;
1074    GLuint cur_mrf = 2, response_length;
1075    GLuint i, nr_texcoords;
1076    GLuint emit;
1077    GLuint msg_type;
1078    GLuint mrf_per_channel;
1079    GLuint simd_mode;
1080
1081    if (c->dispatch_width == 16) {
1082       mrf_per_channel = 2;
1083       response_length = 8;
1084       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1085       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1086    } else {
1087       mrf_per_channel = 1;
1088       response_length = 4;
1089       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1090       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1091    }
1092
1093    /* How many input regs are there?
1094     */
1095    switch (tex_idx) {
1096    case TEXTURE_1D_INDEX:
1097       emit = WRITEMASK_X;
1098       nr_texcoords = 1;
1099       break;
1100    case TEXTURE_2D_INDEX:
1101    case TEXTURE_RECT_INDEX:
1102       emit = WRITEMASK_XY;
1103       nr_texcoords = 2;
1104       break;
1105    case TEXTURE_3D_INDEX:
1106    case TEXTURE_CUBE_INDEX:
1107       emit = WRITEMASK_XYZ;
1108       nr_texcoords = 3;
1109       break;
1110    default:
1111       /* unexpected target */
1112       abort();
1113    }
1114
1115    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1116    if (intel->gen < 5 && c->dispatch_width == 8)
1117       nr_texcoords = 3;
1118
1119    /* For shadow comparisons, we have to supply u,v,r. */
1120    if (shadow)
1121       nr_texcoords = 3;
1122
1123    /* Emit the texcoords. */
1124    for (i = 0; i < nr_texcoords; i++) {
1125       if (emit & (1<<i))
1126          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1127       else
1128          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1129       cur_mrf += mrf_per_channel;
1130    }
1131
1132    /* Fill in the shadow comparison reference value. */
1133    if (shadow) {
1134       if (intel->gen >= 5) {
1135          /* Fill in the cube map array index value. */
1136          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1137          cur_mrf += mrf_per_channel;
1138       } else if (c->dispatch_width == 8) {
1139          /* Fill in the LOD bias value. */
1140          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1141          cur_mrf += mrf_per_channel;
1142       }
1143       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1144       cur_mrf += mrf_per_channel;
1145    }
1146
1147    if (intel->gen >= 5) {
1148       if (shadow)
1149          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1150       else
1151          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1152    } else {
1153       /* Note that G45 and older determines shadow compare and dispatch width
1154        * from message length for most messages.
1155        */
1156       if (c->dispatch_width == 16 && shadow)
1157          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1158       else
1159          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1160    }
1161
1162    brw_SAMPLE(p,
1163               dst_retyped,
1164               1,
1165               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1166               SURF_INDEX_TEXTURE(sampler),
1167               sampler,
1168               dst_flags & WRITEMASK_XYZW,
1169               msg_type,
1170               response_length,
1171               cur_mrf - 1,
1172               0,
1173               1,
1174               simd_mode);
1175 }
1176
1177
1178 void emit_txb(struct brw_wm_compile *c,
1179               struct brw_reg *dst,
1180               GLuint dst_flags,
1181               struct brw_reg *arg,
1182               struct brw_reg depth_payload,
1183               GLuint tex_idx,
1184               GLuint sampler)
1185 {
1186    struct brw_compile *p = &c->func;
1187    struct intel_context *intel = &p->brw->intel;
1188    GLuint msgLength;
1189    GLuint msg_type;
1190    GLuint mrf_per_channel;
1191    GLuint response_length;
1192    struct brw_reg dst_retyped;
1193
1194    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1195     * samples, so we'll use the 16-wide instruction, leave the second halves
1196     * undefined, and trust the execution mask to keep the undefined pixels
1197     * from mattering.
1198     */
1199    if (c->dispatch_width == 16 || intel->gen < 5) {
1200       if (intel->gen >= 5)
1201          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1202       else
1203          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1204       mrf_per_channel = 2;
1205       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1206       response_length = 8;
1207    } else {
1208       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1209       mrf_per_channel = 1;
1210       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1211       response_length = 4;
1212    }
1213
1214    /* Shadow ignored for txb. */
1215    switch (tex_idx) {
1216    case TEXTURE_1D_INDEX:
1217       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1218       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1219       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1220       break;
1221    case TEXTURE_2D_INDEX:
1222    case TEXTURE_RECT_INDEX:
1223       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1224       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1225       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1226       break;
1227    case TEXTURE_3D_INDEX:
1228    case TEXTURE_CUBE_INDEX:
1229       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1230       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1231       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1232       break;
1233    default:
1234       /* unexpected target */
1235       abort();
1236    }
1237
1238    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1239    msgLength = 2 + 4 * mrf_per_channel - 1;
1240
1241    brw_SAMPLE(p,
1242               dst_retyped,
1243               1,
1244               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1245               SURF_INDEX_TEXTURE(sampler),
1246               sampler,
1247               dst_flags & WRITEMASK_XYZW,
1248               msg_type,
1249               response_length,
1250               msgLength,
1251               0,
1252               1,
1253               BRW_SAMPLER_SIMD_MODE_SIMD16);
1254 }
1255
1256
1257 static void emit_lit(struct brw_wm_compile *c,
1258                      const struct brw_reg *dst,
1259                      GLuint mask,
1260                      const struct brw_reg *arg0)
1261 {
1262    struct brw_compile *p = &c->func;
1263
1264    assert((mask & WRITEMASK_XW) == 0);
1265
1266    if (mask & WRITEMASK_Y) {
1267       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1268       brw_MOV(p, dst[1], arg0[0]);
1269       brw_set_saturate(p, 0);
1270    }
1271
1272    if (mask & WRITEMASK_Z) {
1273       emit_math2(c, BRW_MATH_FUNCTION_POW,
1274                  &dst[2],
1275                  WRITEMASK_X | (mask & SATURATE),
1276                  &arg0[1],
1277                  &arg0[3]);
1278    }
1279
1280    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1281     * some of the POW calculations above, but 16-wide iff statements
1282     * seem to lock c1 hardware, so this is a nasty workaround:
1283     */
1284    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1285    {
1286       if (mask & WRITEMASK_Y)
1287          brw_MOV(p, dst[1], brw_imm_f(0));
1288
1289       if (mask & WRITEMASK_Z)
1290          brw_MOV(p, dst[2], brw_imm_f(0));
1291    }
1292    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1293 }
1294
1295
1296 /* Kill pixel - set execution mask to zero for those pixels which
1297  * fail.
1298  */
1299 static void emit_kil( struct brw_wm_compile *c,
1300                       struct brw_reg *arg0)
1301 {
1302    struct brw_compile *p = &c->func;
1303    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1304    GLuint i, j;
1305
1306    for (i = 0; i < 4; i++) {
1307       /* Check if we've already done the comparison for this reg
1308        * -- common when someone does KIL TEMP.wwww.
1309        */
1310       for (j = 0; j < i; j++) {
1311          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1312             break;
1313       }
1314       if (j != i)
1315          continue;
1316
1317       brw_push_insn_state(p);
1318       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1319       brw_set_predicate_control_flag_value(p, 0xff);
1320       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1321       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1322       brw_pop_insn_state(p);
1323    }
1324 }
1325
1326 /* KIL_NV kills the pixels that are currently executing, not based on a test
1327  * of the arguments.
1328  */
1329 void emit_kil_nv( struct brw_wm_compile *c )
1330 {
1331    struct brw_compile *p = &c->func;
1332    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1333
1334    brw_push_insn_state(p);
1335    brw_set_mask_control(p, BRW_MASK_DISABLE);
1336    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1337    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1338    brw_pop_insn_state(p);
1339 }
1340
1341 static void fire_fb_write( struct brw_wm_compile *c,
1342                            GLuint base_reg,
1343                            GLuint nr,
1344                            GLuint target,
1345                            GLuint eot )
1346 {
1347    struct brw_compile *p = &c->func;
1348    struct intel_context *intel = &p->brw->intel;
1349    struct brw_reg dst;
1350
1351    if (c->dispatch_width == 16)
1352       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1353    else
1354       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1355
1356    /* Pass through control information:
1357     */
1358 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1359    if (intel->gen < 6) /* gen6, use headerless for fb write */
1360    {
1361       brw_push_insn_state(p);
1362       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1363       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1364       brw_MOV(p,
1365                brw_message_reg(base_reg + 1),
1366                brw_vec8_grf(1, 0));
1367       brw_pop_insn_state(p);
1368    }
1369
1370    /* Send framebuffer write message: */
1371 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1372    brw_fb_WRITE(p,
1373                 c->dispatch_width,
1374                 dst,
1375                 base_reg,
1376                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1377                 target,
1378                 nr,
1379                 0,
1380                 eot);
1381 }
1382
1383
1384 static void emit_aa( struct brw_wm_compile *c,
1385                      struct brw_reg *arg1,
1386                      GLuint reg )
1387 {
1388    struct brw_compile *p = &c->func;
1389    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1390    GLuint off = c->key.aa_dest_stencil_reg % 2;
1391    struct brw_reg aa = offset(arg1[comp], off);
1392
1393    brw_push_insn_state(p);
1394    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1395    brw_MOV(p, brw_message_reg(reg), aa);
1396    brw_pop_insn_state(p);
1397 }
1398
1399
1400 /* Post-fragment-program processing.  Send the results to the
1401  * framebuffer.
1402  * \param arg0  the fragment color
1403  * \param arg1  the pass-through depth value
1404  * \param arg2  the shader-computed depth value
1405  */
1406 void emit_fb_write(struct brw_wm_compile *c,
1407                    struct brw_reg *arg0,
1408                    struct brw_reg *arg1,
1409                    struct brw_reg *arg2,
1410                    GLuint target,
1411                    GLuint eot)
1412 {
1413    struct brw_compile *p = &c->func;
1414    struct brw_context *brw = p->brw;
1415    struct intel_context *intel = &brw->intel;
1416    GLuint nr = 2;
1417    GLuint channel;
1418    int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
1419
1420    /* Reserve a space for AA - may not be needed:
1421     */
1422    if (c->key.aa_dest_stencil_reg)
1423       nr += 1;
1424
1425    /* I don't really understand how this achieves the color interleave
1426     * (ie RGBARGBA) in the result:  [Do the saturation here]
1427     */
1428    brw_push_insn_state(p);
1429
1430    if (intel->gen >= 6)
1431         base_reg = nr;
1432    else
1433         base_reg = 0;
1434
1435    for (channel = 0; channel < 4; channel++) {
1436       if (intel->gen >= 6) {
1437          /* gen6 SIMD16 single source DP write looks like:
1438           * m + 0: r0
1439           * m + 1: r1
1440           * m + 2: g0
1441           * m + 3: g1
1442           * m + 4: b0
1443           * m + 5: b1
1444           * m + 6: a0
1445           * m + 7: a1
1446           */
1447          if (c->dispatch_width == 16) {
1448             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1449          } else {
1450             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1451          }
1452       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1453          /* pre-gen6 SIMD16 single source DP write looks like:
1454           * m + 0: r0
1455           * m + 1: g0
1456           * m + 2: b0
1457           * m + 3: a0
1458           * m + 4: r1
1459           * m + 5: g1
1460           * m + 6: b1
1461           * m + 7: a1
1462           *
1463           * By setting the high bit of the MRF register number, we indicate
1464           * that we want COMPR4 mode - instead of doing the usual destination
1465           * + 1 for the second half we get destination + 4.
1466           */
1467          brw_MOV(p,
1468                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1469                  arg0[channel]);
1470       } else {
1471          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1472          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1473          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1474          brw_MOV(p,
1475                  brw_message_reg(nr + channel),
1476                  arg0[channel]);
1477
1478          if (c->dispatch_width == 16) {
1479             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1480             brw_MOV(p,
1481                     brw_message_reg(nr + channel + 4),
1482                     sechalf(arg0[channel]));
1483          }
1484       }
1485    }
1486    /* skip over the regs populated above:
1487     */
1488    if (c->dispatch_width == 16)
1489       nr += 8;
1490    else
1491       nr += 4;
1492
1493    brw_pop_insn_state(p);
1494
1495    if (c->key.source_depth_to_render_target)
1496    {
1497       if (c->key.computes_depth)
1498          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1499       else
1500          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1501
1502       nr += 2;
1503    }
1504
1505    if (c->key.dest_depth_reg)
1506    {
1507       GLuint comp = c->key.dest_depth_reg / 2;
1508       GLuint off = c->key.dest_depth_reg % 2;
1509
1510       if (off != 0) {
1511          brw_push_insn_state(p);
1512          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1513
1514          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1515          /* 2nd half? */
1516          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1517          brw_pop_insn_state(p);
1518       }
1519       else {
1520          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1521       }
1522       nr += 2;
1523    }
1524
1525    if (intel->gen >= 6) {
1526       /* Subtract off the message header, since we send headerless. */
1527       nr -= 2;
1528    }
1529
1530    if (!c->key.runtime_check_aads_emit) {
1531       if (c->key.aa_dest_stencil_reg)
1532          emit_aa(c, arg1, 2);
1533
1534       fire_fb_write(c, base_reg, nr, target, eot);
1535    }
1536    else {
1537       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1538       struct brw_reg ip = brw_ip_reg();
1539       struct brw_instruction *jmp;
1540
1541       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1542       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1543       brw_AND(p,
1544               v1_null_ud,
1545               get_element_ud(brw_vec8_grf(1,0), 6),
1546               brw_imm_ud(1<<26));
1547
1548       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1549       {
1550          emit_aa(c, arg1, 2);
1551          fire_fb_write(c, 0, nr, target, eot);
1552          /* note - thread killed in subroutine */
1553       }
1554       brw_land_fwd_jump(p, jmp);
1555
1556       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1557        */
1558       fire_fb_write(c, 1, nr-1, target, eot);
1559    }
1560 }
1561
1562 /**
1563  * Move a GPR to scratch memory.
1564  */
1565 static void emit_spill( struct brw_wm_compile *c,
1566                         struct brw_reg reg,
1567                         GLuint slot )
1568 {
1569    struct brw_compile *p = &c->func;
1570
1571    /*
1572      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1573    */
1574    brw_MOV(p, brw_message_reg(2), reg);
1575
1576    /*
1577      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1578      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1579    */
1580    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1581 }
1582
1583
1584 /**
1585  * Load a GPR from scratch memory.
1586  */
1587 static void emit_unspill( struct brw_wm_compile *c,
1588                           struct brw_reg reg,
1589                           GLuint slot )
1590 {
1591    struct brw_compile *p = &c->func;
1592
1593    /* Slot 0 is the undef value.
1594     */
1595    if (slot == 0) {
1596       brw_MOV(p, reg, brw_imm_f(0));
1597       return;
1598    }
1599
1600    /*
1601      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1602      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1603    */
1604
1605    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1606 }
1607
1608
1609 /**
1610  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1611  * Args with unspill_reg != 0 will be loaded from scratch memory.
1612  */
1613 static void get_argument_regs( struct brw_wm_compile *c,
1614                                struct brw_wm_ref *arg[],
1615                                struct brw_reg *regs )
1616 {
1617    GLuint i;
1618
1619    for (i = 0; i < 4; i++) {
1620       if (arg[i]) {
1621          if (arg[i]->unspill_reg)
1622             emit_unspill(c,
1623                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1624                          arg[i]->value->spill_slot);
1625
1626          regs[i] = arg[i]->hw_reg;
1627       }
1628       else {
1629          regs[i] = brw_null_reg();
1630       }
1631    }
1632 }
1633
1634
1635 /**
1636  * For values that have a spill_slot!=0, write those regs to scratch memory.
1637  */
1638 static void spill_values( struct brw_wm_compile *c,
1639                           struct brw_wm_value *values,
1640                           GLuint nr )
1641 {
1642    GLuint i;
1643
1644    for (i = 0; i < nr; i++)
1645       if (values[i].spill_slot)
1646          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1647 }
1648
1649
1650 /* Emit the fragment program instructions here.
1651  */
1652 void brw_wm_emit( struct brw_wm_compile *c )
1653 {
1654    struct brw_compile *p = &c->func;
1655    struct intel_context *intel = &p->brw->intel;
1656    GLuint insn;
1657
1658    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1659    if (intel->gen >= 6)
1660         brw_set_acc_write_control(p, 1);
1661
1662    /* Check if any of the payload regs need to be spilled:
1663     */
1664    spill_values(c, c->payload.depth, 4);
1665    spill_values(c, c->creg, c->nr_creg);
1666    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1667
1668
1669    for (insn = 0; insn < c->nr_insns; insn++) {
1670
1671       struct brw_wm_instruction *inst = &c->instruction[insn];
1672       struct brw_reg args[3][4], dst[4];
1673       GLuint i, dst_flags;
1674
1675       /* Get argument regs:
1676        */
1677       for (i = 0; i < 3; i++)
1678          get_argument_regs(c, inst->src[i], args[i]);
1679
1680       /* Get dest regs:
1681        */
1682       for (i = 0; i < 4; i++)
1683          if (inst->dst[i])
1684             dst[i] = inst->dst[i]->hw_reg;
1685          else
1686             dst[i] = brw_null_reg();
1687
1688       /* Flags
1689        */
1690       dst_flags = inst->writemask;
1691       if (inst->saturate)
1692          dst_flags |= SATURATE;
1693
1694       switch (inst->opcode) {
1695          /* Generated instructions for calculating triangle interpolants:
1696           */
1697       case WM_PIXELXY:
1698          emit_pixel_xy(c, dst, dst_flags);
1699          break;
1700
1701       case WM_DELTAXY:
1702          emit_delta_xy(p, dst, dst_flags, args[0]);
1703          break;
1704
1705       case WM_WPOSXY:
1706          emit_wpos_xy(c, dst, dst_flags, args[0]);
1707          break;
1708
1709       case WM_PIXELW:
1710          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1711          break;
1712
1713       case WM_LINTERP:
1714          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1715          break;
1716
1717       case WM_PINTERP:
1718          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1719          break;
1720
1721       case WM_CINTERP:
1722          emit_cinterp(p, dst, dst_flags, args[0]);
1723          break;
1724
1725       case WM_FB_WRITE:
1726          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1727          break;
1728
1729       case WM_FRONTFACING:
1730          emit_frontfacing(p, dst, dst_flags);
1731          break;
1732
1733          /* Straightforward arithmetic:
1734           */
1735       case OPCODE_ADD:
1736          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1737          break;
1738
1739       case OPCODE_FRC:
1740          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1741          break;
1742
1743       case OPCODE_FLR:
1744          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1745          break;
1746
1747       case OPCODE_DDX:
1748          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1749          break;
1750
1751       case OPCODE_DDY:
1752          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1753          break;
1754
1755       case OPCODE_DP2:
1756          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1757          break;
1758
1759       case OPCODE_DP3:
1760          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1761          break;
1762
1763       case OPCODE_DP4:
1764          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1765          break;
1766
1767       case OPCODE_DPH:
1768          emit_dph(p, dst, dst_flags, args[0], args[1]);
1769          break;
1770
1771       case OPCODE_TRUNC:
1772          for (i = 0; i < 4; i++) {
1773             if (dst_flags & (1<<i)) {
1774                brw_RNDZ(p, dst[i], args[0][i]);
1775             }
1776          }
1777          break;
1778
1779       case OPCODE_LRP:
1780          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1781          break;
1782
1783       case OPCODE_MAD:
1784          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1785          break;
1786
1787       case OPCODE_MOV:
1788       case OPCODE_SWZ:
1789          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1790          break;
1791
1792       case OPCODE_MUL:
1793          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1794          break;
1795
1796       case OPCODE_XPD:
1797          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1798          break;
1799
1800          /* Higher math functions:
1801           */
1802       case OPCODE_RCP:
1803          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1804          break;
1805
1806       case OPCODE_RSQ:
1807          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1808          break;
1809
1810       case OPCODE_SIN:
1811          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1812          break;
1813
1814       case OPCODE_COS:
1815          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1816          break;
1817
1818       case OPCODE_EX2:
1819          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1820          break;
1821
1822       case OPCODE_LG2:
1823          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1824          break;
1825
1826       case OPCODE_SCS:
1827          /* There is an scs math function, but it would need some
1828           * fixup for 16-element execution.
1829           */
1830          if (dst_flags & WRITEMASK_X)
1831             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1832          if (dst_flags & WRITEMASK_Y)
1833             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1834          break;
1835
1836       case OPCODE_POW:
1837          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1838          break;
1839
1840          /* Comparisons:
1841           */
1842       case OPCODE_CMP:
1843          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1844          break;
1845
1846       case OPCODE_MAX:
1847          emit_max(p, dst, dst_flags, args[0], args[1]);
1848          break;
1849
1850       case OPCODE_MIN:
1851          emit_min(p, dst, dst_flags, args[0], args[1]);
1852          break;
1853
1854       case OPCODE_SLT:
1855          emit_slt(p, dst, dst_flags, args[0], args[1]);
1856          break;
1857
1858       case OPCODE_SLE:
1859          emit_sle(p, dst, dst_flags, args[0], args[1]);
1860         break;
1861       case OPCODE_SGT:
1862          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1863         break;
1864       case OPCODE_SGE:
1865          emit_sge(p, dst, dst_flags, args[0], args[1]);
1866          break;
1867       case OPCODE_SEQ:
1868          emit_seq(p, dst, dst_flags, args[0], args[1]);
1869         break;
1870       case OPCODE_SNE:
1871          emit_sne(p, dst, dst_flags, args[0], args[1]);
1872         break;
1873
1874       case OPCODE_SSG:
1875          emit_sign(p, dst, dst_flags, args[0]);
1876          break;
1877
1878       case OPCODE_LIT:
1879          emit_lit(c, dst, dst_flags, args[0]);
1880          break;
1881
1882          /* Texturing operations:
1883           */
1884       case OPCODE_TEX:
1885          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1886                   inst->tex_idx, inst->tex_unit,
1887                   inst->tex_shadow);
1888          break;
1889
1890       case OPCODE_TXB:
1891          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1892                   inst->tex_idx, inst->tex_unit);
1893          break;
1894
1895       case OPCODE_KIL:
1896          emit_kil(c, args[0]);
1897          break;
1898
1899       case OPCODE_KIL_NV:
1900          emit_kil_nv(c);
1901          break;
1902
1903       default:
1904          printf("Unsupported opcode %i (%s) in fragment shader\n",
1905                 inst->opcode, inst->opcode < MAX_OPCODE ?
1906                 _mesa_opcode_string(inst->opcode) :
1907                 "unknown");
1908       }
1909
1910       for (i = 0; i < 4; i++)
1911         if (inst->dst[i] && inst->dst[i]->spill_slot)
1912            emit_spill(c,
1913                       inst->dst[i]->hw_reg,
1914                       inst->dst[i]->spill_slot);
1915    }
1916
1917    /* Only properly tested on ILK */
1918    if (p->brw->intel.gen == 5) {
1919      brw_remove_duplicate_mrf_moves(p);
1920      if (c->dispatch_width == 16)
1921         brw_remove_grf_to_mrf_moves(p);
1922    }
1923
1924    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1925       int i;
1926
1927      printf("wm-native:\n");
1928      for (i = 0; i < p->nr_insn; i++)
1929          brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1930       printf("\n");
1931    }
1932 }
1933