src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_XPD] = 2,
  87    };
  88
  89    /* These opcodes get broken down in a way that allow two
  90     * args to be immediates.
  91     */
  92    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  93       if (arg == 1 || arg == 2)
  94          return GL_TRUE;
  95    }
  96
  97    if (opcode > ARRAY_SIZE(opcode_array))
  98       return GL_FALSE;
  99
 100    return arg == opcode_array[opcode] - 1;
 101 }
 102
 103 /**
 104  * Computes the screen-space x,y position of the pixels.
 105  *
 106  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 107  * interpolation of attributes..
 108  *
 109  * Payload R0:
 110  *
 111  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 112  *         corresponding to each of the 16 execution channels.
 113  * R0.1..8 -- ?
 114  * R1.0 -- triangle vertex 0.X
 115  * R1.1 -- triangle vertex 0.Y
 116  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 117  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 118  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 119  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 120  * R1.6 -- ?
 121  * R1.7 -- ?
 122  * R1.8 -- ?
 123  */
 124 void emit_pixel_xy(struct brw_wm_compile *c,
 125                    const struct brw_reg *dst,
 126                    GLuint mask)
 127 {
 128    struct brw_compile *p = &c->func;
 129    struct brw_reg r1 = brw_vec1_grf(1, 0);
 130    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 131    struct brw_reg dst0_uw, dst1_uw;
 132
 133    brw_push_insn_state(p);
 134    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 135
 136    if (c->dispatch_width == 16) {
 137       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 138       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 139    } else {
 140       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 141       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 142    }
 143
 144    /* Calculate pixel centers by adding 1 or 0 to each of the
 145     * micro-tile coordinates passed in r1.
 146     */
 147    if (mask & WRITEMASK_X) {
 148       brw_ADD(p,
 149               dst0_uw,
 150               stride(suboffset(r1_uw, 4), 2, 4, 0),
 151               brw_imm_v(0x10101010));
 152    }
 153
 154    if (mask & WRITEMASK_Y) {
 155       brw_ADD(p,
 156               dst1_uw,
 157               stride(suboffset(r1_uw,5), 2, 4, 0),
 158               brw_imm_v(0x11001100));
 159    }
 160    brw_pop_insn_state(p);
 161 }
 162
 163 /**
 164  * Computes the screen-space x,y distance of the pixels from the start
 165  * vertex.
 166  *
 167  * This will be used in linterp or pinterp with the start vertex value
 168  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 169  * to produce interpolated attribute values.
 170  */
 171 void emit_delta_xy(struct brw_compile *p,
 172                    const struct brw_reg *dst,
 173                    GLuint mask,
 174                    const struct brw_reg *arg0)
 175 {
 176    struct intel_context *intel = &p->brw->intel;
 177    struct brw_reg r1 = brw_vec1_grf(1, 0);
 178
 179    if (mask == 0)
 180       return;
 181
 182    assert(mask == WRITEMASK_XY);
 183
 184    if (intel->gen >= 6) {
 185        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 186           Just add them with 0.0 for dst reg.. */
 187        r1 = brw_imm_v(0x00000000);
 188        brw_ADD(p,
 189                dst[0],
 190                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 191                r1);
 192        brw_ADD(p,
 193                dst[1],
 194                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 195                r1);
 196        return;
 197    }
 198
 199    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 200     * centers produced by emit_pixel_xy().
 201     */
 202    brw_ADD(p,
 203            dst[0],
 204            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 205            negate(r1));
 206    brw_ADD(p,
 207            dst[1],
 208            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 209            negate(suboffset(r1,1)));
 210 }
 211
 212 /**
 213  * Computes the pixel offset from the window origin for gl_FragCoord().
 214  */
 215 void emit_wpos_xy(struct brw_wm_compile *c,
 216                   const struct brw_reg *dst,
 217                   GLuint mask,
 218                   const struct brw_reg *arg0)
 219 {
 220    struct brw_compile *p = &c->func;
 221
 222    if (mask & WRITEMASK_X) {
 223       if (c->fp->program.PixelCenterInteger) {
 224          /* X' = X */
 225          brw_MOV(p,
 226                  dst[0],
 227                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 228       } else {
 229          /* X' = X + 0.5 */
 230          brw_ADD(p,
 231                  dst[0],
 232                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 233                  brw_imm_f(0.5));
 234       }
 235    }
 236
 237    if (mask & WRITEMASK_Y) {
 238       if (c->fp->program.OriginUpperLeft) {
 239          if (c->fp->program.PixelCenterInteger) {
 240             /* Y' = Y */
 241             brw_MOV(p,
 242                     dst[1],
 243                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 244          } else {
 245             /* Y' = Y + 0.5 */
 246             brw_ADD(p,
 247                     dst[1],
 248                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 249                     brw_imm_f(0.5));
 250          }
 251       } else {
 252          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 253
 254          /* Y' = (height - 1) - Y + center */
 255          brw_ADD(p,
 256                  dst[1],
 257                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 258                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 259       }
 260    }
 261 }
 262
 263
 264 void emit_pixel_w(struct brw_wm_compile *c,
 265                   const struct brw_reg *dst,
 266                   GLuint mask,
 267                   const struct brw_reg *arg0,
 268                   const struct brw_reg *deltas)
 269 {
 270    struct brw_compile *p = &c->func;
 271    struct intel_context *intel = &p->brw->intel;
 272    struct brw_reg src;
 273    struct brw_reg temp_dst;
 274
 275    if (intel->gen >= 6)
 276         temp_dst = dst[3];
 277    else
 278         temp_dst = brw_message_reg(2);
 279
 280    assert(intel->gen < 6);
 281
 282    /* Don't need this if all you are doing is interpolating color, for
 283     * instance.
 284     */
 285    if (mask & WRITEMASK_W) {
 286       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 287
 288       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 289        * result straight into a message reg.
 290        */
 291       if (can_do_pln(intel, deltas)) {
 292          brw_PLN(p, temp_dst, interp3, deltas[0]);
 293       } else {
 294          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 295          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 296       }
 297
 298       /* Calc w */
 299       if (intel->gen >= 6)
 300          src = temp_dst;
 301       else
 302          src = brw_null_reg();
 303
 304       if (c->dispatch_width == 16) {
 305          brw_math_16(p, dst[3],
 306                      BRW_MATH_FUNCTION_INV,
 307                      BRW_MATH_SATURATE_NONE,
 308                      2, src,
 309                      BRW_MATH_PRECISION_FULL);
 310       } else {
 311          brw_math(p, dst[3],
 312                   BRW_MATH_FUNCTION_INV,
 313                   BRW_MATH_SATURATE_NONE,
 314                   2, src,
 315                   BRW_MATH_DATA_VECTOR,
 316                   BRW_MATH_PRECISION_FULL);
 317       }
 318    }
 319 }
 320
 321 void emit_linterp(struct brw_compile *p,
 322                   const struct brw_reg *dst,
 323                   GLuint mask,
 324                   const struct brw_reg *arg0,
 325                   const struct brw_reg *deltas)
 326 {
 327    struct intel_context *intel = &p->brw->intel;
 328    struct brw_reg interp[4];
 329    GLuint nr = arg0[0].nr;
 330    GLuint i;
 331
 332    interp[0] = brw_vec1_grf(nr, 0);
 333    interp[1] = brw_vec1_grf(nr, 4);
 334    interp[2] = brw_vec1_grf(nr+1, 0);
 335    interp[3] = brw_vec1_grf(nr+1, 4);
 336
 337    for (i = 0; i < 4; i++) {
 338       if (mask & (1<<i)) {
 339          if (intel->gen >= 6) {
 340             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 341          } else if (can_do_pln(intel, deltas)) {
 342             brw_PLN(p, dst[i], interp[i], deltas[0]);
 343          } else {
 344             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 345             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 346          }
 347       }
 348    }
 349 }
 350
 351
 352 void emit_pinterp(struct brw_compile *p,
 353                   const struct brw_reg *dst,
 354                   GLuint mask,
 355                   const struct brw_reg *arg0,
 356                   const struct brw_reg *deltas,
 357                   const struct brw_reg *w)
 358 {
 359    struct intel_context *intel = &p->brw->intel;
 360    struct brw_reg interp[4];
 361    GLuint nr = arg0[0].nr;
 362    GLuint i;
 363
 364    if (intel->gen >= 6) {
 365       emit_linterp(p, dst, mask, arg0, interp);
 366       return;
 367    }
 368
 369    interp[0] = brw_vec1_grf(nr, 0);
 370    interp[1] = brw_vec1_grf(nr, 4);
 371    interp[2] = brw_vec1_grf(nr+1, 0);
 372    interp[3] = brw_vec1_grf(nr+1, 4);
 373
 374    for (i = 0; i < 4; i++) {
 375       if (mask & (1<<i)) {
 376          if (can_do_pln(intel, deltas)) {
 377             brw_PLN(p, dst[i], interp[i], deltas[0]);
 378          } else {
 379             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 380             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 381          }
 382       }
 383    }
 384    for (i = 0; i < 4; i++) {
 385       if (mask & (1<<i)) {
 386          brw_MUL(p, dst[i], dst[i], w[3]);
 387       }
 388    }
 389 }
 390
 391
 392 void emit_cinterp(struct brw_compile *p,
 393                   const struct brw_reg *dst,
 394                   GLuint mask,
 395                   const struct brw_reg *arg0)
 396 {
 397    struct brw_reg interp[4];
 398    GLuint nr = arg0[0].nr;
 399    GLuint i;
 400
 401    interp[0] = brw_vec1_grf(nr, 0);
 402    interp[1] = brw_vec1_grf(nr, 4);
 403    interp[2] = brw_vec1_grf(nr+1, 0);
 404    interp[3] = brw_vec1_grf(nr+1, 4);
 405
 406    for (i = 0; i < 4; i++) {
 407       if (mask & (1<<i)) {
 408          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 409       }
 410    }
 411 }
 412
 413 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 414 void emit_frontfacing(struct brw_compile *p,
 415                       const struct brw_reg *dst,
 416                       GLuint mask)
 417 {
 418    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 419    GLuint i;
 420
 421    if (!(mask & WRITEMASK_XYZW))
 422       return;
 423
 424    for (i = 0; i < 4; i++) {
 425       if (mask & (1<<i)) {
 426          brw_MOV(p, dst[i], brw_imm_f(0.0));
 427       }
 428    }
 429
 430    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 431     * us front face
 432     */
 433    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 434    for (i = 0; i < 4; i++) {
 435       if (mask & (1<<i)) {
 436          brw_MOV(p, dst[i], brw_imm_f(1.0));
 437       }
 438    }
 439    brw_set_predicate_control_flag_value(p, 0xff);
 440 }
 441
 442 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 443  * looking like:
 444  *
 445  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 446  *
 447  * and we're trying to produce:
 448  *
 449  *           DDX                     DDY
 450  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 451  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 452  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 453  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 454  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 455  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 456  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 457  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 458  *
 459  * and add another set of two more subspans if in 16-pixel dispatch mode.
 460  *
 461  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 462  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 463  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 464  * between each other.  We could probably do it like ddx and swizzle the right
 465  * order later, but bail for now and just produce
 466  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 467  */
 468 void emit_ddxy(struct brw_compile *p,
 469                const struct brw_reg *dst,
 470                GLuint mask,
 471                GLboolean is_ddx,
 472                const struct brw_reg *arg0)
 473 {
 474    int i;
 475    struct brw_reg src0, src1;
 476
 477    if (mask & SATURATE)
 478       brw_set_saturate(p, 1);
 479    for (i = 0; i < 4; i++ ) {
 480       if (mask & (1<<i)) {
 481          if (is_ddx) {
 482             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 483                            BRW_REGISTER_TYPE_F,
 484                            BRW_VERTICAL_STRIDE_2,
 485                            BRW_WIDTH_2,
 486                            BRW_HORIZONTAL_STRIDE_0,
 487                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 488             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 489                            BRW_REGISTER_TYPE_F,
 490                            BRW_VERTICAL_STRIDE_2,
 491                            BRW_WIDTH_2,
 492                            BRW_HORIZONTAL_STRIDE_0,
 493                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 494          } else {
 495             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 496                            BRW_REGISTER_TYPE_F,
 497                            BRW_VERTICAL_STRIDE_4,
 498                            BRW_WIDTH_4,
 499                            BRW_HORIZONTAL_STRIDE_0,
 500                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 501             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 502                            BRW_REGISTER_TYPE_F,
 503                            BRW_VERTICAL_STRIDE_4,
 504                            BRW_WIDTH_4,
 505                            BRW_HORIZONTAL_STRIDE_0,
 506                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 507          }
 508          brw_ADD(p, dst[i], src0, negate(src1));
 509       }
 510    }
 511    if (mask & SATURATE)
 512       brw_set_saturate(p, 0);
 513 }
 514
 515 void emit_alu1(struct brw_compile *p,
 516                struct brw_instruction *(*func)(struct brw_compile *,
 517                                                struct brw_reg,
 518                                                struct brw_reg),
 519                const struct brw_reg *dst,
 520                GLuint mask,
 521                const struct brw_reg *arg0)
 522 {
 523    GLuint i;
 524
 525    if (mask & SATURATE)
 526       brw_set_saturate(p, 1);
 527
 528    for (i = 0; i < 4; i++) {
 529       if (mask & (1<<i)) {
 530          func(p, dst[i], arg0[i]);
 531       }
 532    }
 533
 534    if (mask & SATURATE)
 535       brw_set_saturate(p, 0);
 536 }
 537
 538
 539 void emit_alu2(struct brw_compile *p,
 540                struct brw_instruction *(*func)(struct brw_compile *,
 541                                                struct brw_reg,
 542                                                struct brw_reg,
 543                                                struct brw_reg),
 544                const struct brw_reg *dst,
 545                GLuint mask,
 546                const struct brw_reg *arg0,
 547                const struct brw_reg *arg1)
 548 {
 549    GLuint i;
 550
 551    if (mask & SATURATE)
 552       brw_set_saturate(p, 1);
 553
 554    for (i = 0; i < 4; i++) {
 555       if (mask & (1<<i)) {
 556          func(p, dst[i], arg0[i], arg1[i]);
 557       }
 558    }
 559
 560    if (mask & SATURATE)
 561       brw_set_saturate(p, 0);
 562 }
 563
 564
 565 void emit_mad(struct brw_compile *p,
 566               const struct brw_reg *dst,
 567               GLuint mask,
 568               const struct brw_reg *arg0,
 569               const struct brw_reg *arg1,
 570               const struct brw_reg *arg2)
 571 {
 572    GLuint i;
 573
 574    for (i = 0; i < 4; i++) {
 575       if (mask & (1<<i)) {
 576          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 577
 578          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 579          brw_ADD(p, dst[i], dst[i], arg2[i]);
 580          brw_set_saturate(p, 0);
 581       }
 582    }
 583 }
 584
 585 void emit_lrp(struct brw_compile *p,
 586               const struct brw_reg *dst,
 587               GLuint mask,
 588               const struct brw_reg *arg0,
 589               const struct brw_reg *arg1,
 590               const struct brw_reg *arg2)
 591 {
 592    GLuint i;
 593
 594    /* Uses dst as a temporary:
 595     */
 596    for (i = 0; i < 4; i++) {
 597       if (mask & (1<<i)) {
 598          /* Can I use the LINE instruction for this?
 599           */
 600          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 601          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 602
 603          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 604          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 605          brw_set_saturate(p, 0);
 606       }
 607    }
 608 }
 609
 610 void emit_sop(struct brw_compile *p,
 611               const struct brw_reg *dst,
 612               GLuint mask,
 613               GLuint cond,
 614               const struct brw_reg *arg0,
 615               const struct brw_reg *arg1)
 616 {
 617    GLuint i;
 618
 619    for (i = 0; i < 4; i++) {
 620       if (mask & (1<<i)) {
 621          brw_push_insn_state(p);
 622          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 623          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 624          brw_MOV(p, dst[i], brw_imm_f(0));
 625          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 626          brw_MOV(p, dst[i], brw_imm_f(1.0));
 627          brw_pop_insn_state(p);
 628       }
 629    }
 630 }
 631
 632 static void emit_slt( struct brw_compile *p,
 633                       const struct brw_reg *dst,
 634                       GLuint mask,
 635                       const struct brw_reg *arg0,
 636                       const struct brw_reg *arg1 )
 637 {
 638    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 639 }
 640
 641 static void emit_sle( struct brw_compile *p,
 642                       const struct brw_reg *dst,
 643                       GLuint mask,
 644                       const struct brw_reg *arg0,
 645                       const struct brw_reg *arg1 )
 646 {
 647    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 648 }
 649
 650 static void emit_sgt( struct brw_compile *p,
 651                       const struct brw_reg *dst,
 652                       GLuint mask,
 653                       const struct brw_reg *arg0,
 654                       const struct brw_reg *arg1 )
 655 {
 656    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 657 }
 658
 659 static void emit_sge( struct brw_compile *p,
 660                       const struct brw_reg *dst,
 661                       GLuint mask,
 662                       const struct brw_reg *arg0,
 663                       const struct brw_reg *arg1 )
 664 {
 665    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 666 }
 667
 668 static void emit_seq( struct brw_compile *p,
 669                       const struct brw_reg *dst,
 670                       GLuint mask,
 671                       const struct brw_reg *arg0,
 672                       const struct brw_reg *arg1 )
 673 {
 674    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 675 }
 676
 677 static void emit_sne( struct brw_compile *p,
 678                       const struct brw_reg *dst,
 679                       GLuint mask,
 680                       const struct brw_reg *arg0,
 681                       const struct brw_reg *arg1 )
 682 {
 683    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 684 }
 685
 686 void emit_cmp(struct brw_compile *p,
 687               const struct brw_reg *dst,
 688               GLuint mask,
 689               const struct brw_reg *arg0,
 690               const struct brw_reg *arg1,
 691               const struct brw_reg *arg2)
 692 {
 693    GLuint i;
 694
 695    for (i = 0; i < 4; i++) {
 696       if (mask & (1<<i)) {
 697          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 698
 699          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 700          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 701          brw_set_saturate(p, 0);
 702          brw_set_predicate_control_flag_value(p, 0xff);
 703       }
 704    }
 705 }
 706
 707 void emit_sign(struct brw_compile *p,
 708                const struct brw_reg *dst,
 709                GLuint mask,
 710                const struct brw_reg *arg0)
 711 {
 712    GLuint i;
 713
 714    for (i = 0; i < 4; i++) {
 715       if (mask & (1<<i)) {
 716          brw_MOV(p, dst[i], brw_imm_f(0.0));
 717
 718          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 719          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 720          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 721
 722          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 723          brw_MOV(p, dst[i], brw_imm_f(1.0));
 724          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 725       }
 726    }
 727 }
 728
 729 void emit_max(struct brw_compile *p,
 730               const struct brw_reg *dst,
 731               GLuint mask,
 732               const struct brw_reg *arg0,
 733               const struct brw_reg *arg1)
 734 {
 735    GLuint i;
 736
 737    for (i = 0; i < 4; i++) {
 738       if (mask & (1<<i)) {
 739          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 740
 741          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 742          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 743          brw_set_saturate(p, 0);
 744          brw_set_predicate_control_flag_value(p, 0xff);
 745       }
 746    }
 747 }
 748
 749 void emit_min(struct brw_compile *p,
 750               const struct brw_reg *dst,
 751               GLuint mask,
 752               const struct brw_reg *arg0,
 753               const struct brw_reg *arg1)
 754 {
 755    GLuint i;
 756
 757    for (i = 0; i < 4; i++) {
 758       if (mask & (1<<i)) {
 759          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 760
 761          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 762          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 763          brw_set_saturate(p, 0);
 764          brw_set_predicate_control_flag_value(p, 0xff);
 765       }
 766    }
 767 }
 768
 769
 770 void emit_dp2(struct brw_compile *p,
 771               const struct brw_reg *dst,
 772               GLuint mask,
 773               const struct brw_reg *arg0,
 774               const struct brw_reg *arg1)
 775 {
 776    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 777
 778    if (!(mask & WRITEMASK_XYZW))
 779       return; /* Do not emit dead code */
 780
 781    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 782
 783    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 784
 785    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 786    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 787    brw_set_saturate(p, 0);
 788 }
 789
 790
 791 void emit_dp3(struct brw_compile *p,
 792               const struct brw_reg *dst,
 793               GLuint mask,
 794               const struct brw_reg *arg0,
 795               const struct brw_reg *arg1)
 796 {
 797    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 798
 799    if (!(mask & WRITEMASK_XYZW))
 800       return; /* Do not emit dead code */
 801
 802    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 803
 804    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 805    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 806
 807    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 808    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 809    brw_set_saturate(p, 0);
 810 }
 811
 812
 813 void emit_dp4(struct brw_compile *p,
 814               const struct brw_reg *dst,
 815               GLuint mask,
 816               const struct brw_reg *arg0,
 817               const struct brw_reg *arg1)
 818 {
 819    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 820
 821    if (!(mask & WRITEMASK_XYZW))
 822       return; /* Do not emit dead code */
 823
 824    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 825
 826    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 827    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 828    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 829
 830    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 831    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 832    brw_set_saturate(p, 0);
 833 }
 834
 835
 836 void emit_dph(struct brw_compile *p,
 837               const struct brw_reg *dst,
 838               GLuint mask,
 839               const struct brw_reg *arg0,
 840               const struct brw_reg *arg1)
 841 {
 842    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 843
 844    if (!(mask & WRITEMASK_XYZW))
 845       return; /* Do not emit dead code */
 846
 847    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 848
 849    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 850    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 851    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 852
 853    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 854    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 855    brw_set_saturate(p, 0);
 856 }
 857
 858
 859 void emit_xpd(struct brw_compile *p,
 860               const struct brw_reg *dst,
 861               GLuint mask,
 862               const struct brw_reg *arg0,
 863               const struct brw_reg *arg1)
 864 {
 865    GLuint i;
 866
 867    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 868
 869    for (i = 0 ; i < 3; i++) {
 870       if (mask & (1<<i)) {
 871          GLuint i2 = (i+2)%3;
 872          GLuint i1 = (i+1)%3;
 873
 874          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 875
 876          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 877          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 878          brw_set_saturate(p, 0);
 879       }
 880    }
 881 }
 882
 883
 884 void emit_math1(struct brw_wm_compile *c,
 885                 GLuint function,
 886                 const struct brw_reg *dst,
 887                 GLuint mask,
 888                 const struct brw_reg *arg0)
 889 {
 890    struct brw_compile *p = &c->func;
 891    struct intel_context *intel = &p->brw->intel;
 892    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 893    GLuint saturate = ((mask & SATURATE) ?
 894                       BRW_MATH_SATURATE_SATURATE :
 895                       BRW_MATH_SATURATE_NONE);
 896    struct brw_reg src;
 897
 898    if (intel->gen >= 6 && arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 899       /* Gen6 math requires that source and dst horizontal stride be 1.
 900        *
 901        */
 902       src = *dst;
 903       brw_MOV(p, src, arg0[0]);
 904    } else {
 905       src = arg0[0];
 906    }
 907
 908    if (!(mask & WRITEMASK_XYZW))
 909       return; /* Do not emit dead code */
 910
 911    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 912
 913    /* Send two messages to perform all 16 operations:
 914     */
 915    brw_push_insn_state(p);
 916    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 917    brw_math(p,
 918             dst[dst_chan],
 919             function,
 920             saturate,
 921             2,
 922             src,
 923             BRW_MATH_DATA_VECTOR,
 924             BRW_MATH_PRECISION_FULL);
 925
 926    if (c->dispatch_width == 16) {
 927       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 928       brw_math(p,
 929                offset(dst[dst_chan],1),
 930                function,
 931                saturate,
 932                3,
 933                sechalf(src),
 934                BRW_MATH_DATA_VECTOR,
 935                BRW_MATH_PRECISION_FULL);
 936    }
 937    brw_pop_insn_state(p);
 938 }
 939
 940
 941 void emit_math2(struct brw_wm_compile *c,
 942                 GLuint function,
 943                 const struct brw_reg *dst,
 944                 GLuint mask,
 945                 const struct brw_reg *arg0,
 946                 const struct brw_reg *arg1)
 947 {
 948    struct brw_compile *p = &c->func;
 949    struct intel_context *intel = &p->brw->intel;
 950    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 951
 952    if (!(mask & WRITEMASK_XYZW))
 953       return; /* Do not emit dead code */
 954
 955    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 956
 957    brw_push_insn_state(p);
 958
 959    /* math can only operate on up to a vec8 at a time, so in
 960     * dispatch_width==16 we have to do the second half manually.
 961     */
 962    if (intel->gen >= 6) {
 963       struct brw_reg src0 = arg0[0];
 964       struct brw_reg src1 = arg1[0];
 965       struct brw_reg temp_dst = dst[dst_chan];
 966
 967       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 968          if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 969             /* Both scalar arguments.  Do scalar calc. */
 970             src0.hstride = BRW_HORIZONTAL_STRIDE_1;
 971             src1.hstride = BRW_HORIZONTAL_STRIDE_1;
 972             temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
 973             temp_dst.width = BRW_WIDTH_1;
 974
 975             if (arg0[0].subnr != 0) {
 976                brw_MOV(p, temp_dst, src0);
 977                src0 = temp_dst;
 978
 979                /* Ouch.  We've used the temp as a dst, and we still
 980                 * need a temp to store arg1 in, because src and dst
 981                 * offsets have to be equal.  Leaving this up to
 982                 * glsl2-965 to handle correctly.
 983                 */
 984                assert(arg1[0].subnr == 0);
 985             } else if (arg1[0].subnr != 0) {
 986                brw_MOV(p, temp_dst, src1);
 987                src1 = temp_dst;
 988             }
 989          } else {
 990             brw_MOV(p, temp_dst, src0);
 991             src0 = temp_dst;
 992          }
 993       } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 994          brw_MOV(p, temp_dst, src1);
 995          src1 = temp_dst;
 996       }
 997
 998       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 999       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1000       brw_math2(p,
1001                 temp_dst,
1002                 function,
1003                 src0,
1004                 src1);
1005       if (c->dispatch_width == 16) {
1006          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1007          brw_math2(p,
1008                    sechalf(temp_dst),
1009                    function,
1010                    sechalf(src0),
1011                    sechalf(src1));
1012       }
1013
1014       /* Splat a scalar result into all the channels. */
1015       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
1016           arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
1017          temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
1018          temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
1019          brw_MOV(p, dst[dst_chan], temp_dst);
1020       }
1021    } else {
1022       GLuint saturate = ((mask & SATURATE) ?
1023                          BRW_MATH_SATURATE_SATURATE :
1024                          BRW_MATH_SATURATE_NONE);
1025
1026       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1027       brw_MOV(p, brw_message_reg(3), arg1[0]);
1028       if (c->dispatch_width == 16) {
1029          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1030          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1031       }
1032
1033       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1034       brw_math(p,
1035                dst[dst_chan],
1036                function,
1037                saturate,
1038                2,
1039                arg0[0],
1040                BRW_MATH_DATA_VECTOR,
1041                BRW_MATH_PRECISION_FULL);
1042
1043       /* Send two messages to perform all 16 operations:
1044        */
1045       if (c->dispatch_width == 16) {
1046          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1047          brw_math(p,
1048                   offset(dst[dst_chan],1),
1049                   function,
1050                   saturate,
1051                   4,
1052                   sechalf(arg0[0]),
1053                   BRW_MATH_DATA_VECTOR,
1054                   BRW_MATH_PRECISION_FULL);
1055       }
1056    }
1057    brw_pop_insn_state(p);
1058 }
1059
1060
1061 void emit_tex(struct brw_wm_compile *c,
1062               struct brw_reg *dst,
1063               GLuint dst_flags,
1064               struct brw_reg *arg,
1065               struct brw_reg depth_payload,
1066               GLuint tex_idx,
1067               GLuint sampler,
1068               GLboolean shadow)
1069 {
1070    struct brw_compile *p = &c->func;
1071    struct intel_context *intel = &p->brw->intel;
1072    struct brw_reg dst_retyped;
1073    GLuint cur_mrf = 2, response_length;
1074    GLuint i, nr_texcoords;
1075    GLuint emit;
1076    GLuint msg_type;
1077    GLuint mrf_per_channel;
1078    GLuint simd_mode;
1079
1080    if (c->dispatch_width == 16) {
1081       mrf_per_channel = 2;
1082       response_length = 8;
1083       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1084       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1085    } else {
1086       mrf_per_channel = 1;
1087       response_length = 4;
1088       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1089       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1090    }
1091
1092    /* How many input regs are there?
1093     */
1094    switch (tex_idx) {
1095    case TEXTURE_1D_INDEX:
1096       emit = WRITEMASK_X;
1097       nr_texcoords = 1;
1098       break;
1099    case TEXTURE_2D_INDEX:
1100    case TEXTURE_RECT_INDEX:
1101       emit = WRITEMASK_XY;
1102       nr_texcoords = 2;
1103       break;
1104    case TEXTURE_3D_INDEX:
1105    case TEXTURE_CUBE_INDEX:
1106       emit = WRITEMASK_XYZ;
1107       nr_texcoords = 3;
1108       break;
1109    default:
1110       /* unexpected target */
1111       abort();
1112    }
1113
1114    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1115    if (intel->gen < 5 && c->dispatch_width == 8)
1116       nr_texcoords = 3;
1117
1118    /* For shadow comparisons, we have to supply u,v,r. */
1119    if (shadow)
1120       nr_texcoords = 3;
1121
1122    /* Emit the texcoords. */
1123    for (i = 0; i < nr_texcoords; i++) {
1124       if (emit & (1<<i))
1125          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1126       else
1127          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1128       cur_mrf += mrf_per_channel;
1129    }
1130
1131    /* Fill in the shadow comparison reference value. */
1132    if (shadow) {
1133       if (intel->gen >= 5) {
1134          /* Fill in the cube map array index value. */
1135          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1136          cur_mrf += mrf_per_channel;
1137       } else if (c->dispatch_width == 8) {
1138          /* Fill in the LOD bias value. */
1139          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1140          cur_mrf += mrf_per_channel;
1141       }
1142       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1143       cur_mrf += mrf_per_channel;
1144    }
1145
1146    if (intel->gen >= 5) {
1147       if (shadow)
1148          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1149       else
1150          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1151    } else {
1152       /* Note that G45 and older determines shadow compare and dispatch width
1153        * from message length for most messages.
1154        */
1155       if (c->dispatch_width == 16 && shadow)
1156          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1157       else
1158          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1159    }
1160
1161    brw_SAMPLE(p,
1162               dst_retyped,
1163               1,
1164               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1165               SURF_INDEX_TEXTURE(sampler),
1166               sampler,
1167               dst_flags & WRITEMASK_XYZW,
1168               msg_type,
1169               response_length,
1170               cur_mrf - 1,
1171               0,
1172               1,
1173               simd_mode);
1174 }
1175
1176
1177 void emit_txb(struct brw_wm_compile *c,
1178               struct brw_reg *dst,
1179               GLuint dst_flags,
1180               struct brw_reg *arg,
1181               struct brw_reg depth_payload,
1182               GLuint tex_idx,
1183               GLuint sampler)
1184 {
1185    struct brw_compile *p = &c->func;
1186    struct intel_context *intel = &p->brw->intel;
1187    GLuint msgLength;
1188    GLuint msg_type;
1189    GLuint mrf_per_channel;
1190    GLuint response_length;
1191    struct brw_reg dst_retyped;
1192
1193    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1194     * samples, so we'll use the 16-wide instruction, leave the second halves
1195     * undefined, and trust the execution mask to keep the undefined pixels
1196     * from mattering.
1197     */
1198    if (c->dispatch_width == 16 || intel->gen < 5) {
1199       if (intel->gen >= 5)
1200          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1201       else
1202          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1203       mrf_per_channel = 2;
1204       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1205       response_length = 8;
1206    } else {
1207       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1208       mrf_per_channel = 1;
1209       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1210       response_length = 4;
1211    }
1212
1213    /* Shadow ignored for txb. */
1214    switch (tex_idx) {
1215    case TEXTURE_1D_INDEX:
1216       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1217       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1218       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1219       break;
1220    case TEXTURE_2D_INDEX:
1221    case TEXTURE_RECT_INDEX:
1222       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1223       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1224       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1225       break;
1226    case TEXTURE_3D_INDEX:
1227    case TEXTURE_CUBE_INDEX:
1228       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1229       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1230       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1231       break;
1232    default:
1233       /* unexpected target */
1234       abort();
1235    }
1236
1237    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1238    msgLength = 2 + 4 * mrf_per_channel - 1;
1239
1240    brw_SAMPLE(p,
1241               dst_retyped,
1242               1,
1243               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1244               SURF_INDEX_TEXTURE(sampler),
1245               sampler,
1246               dst_flags & WRITEMASK_XYZW,
1247               msg_type,
1248               response_length,
1249               msgLength,
1250               0,
1251               1,
1252               BRW_SAMPLER_SIMD_MODE_SIMD16);
1253 }
1254
1255
1256 static void emit_lit(struct brw_wm_compile *c,
1257                      const struct brw_reg *dst,
1258                      GLuint mask,
1259                      const struct brw_reg *arg0)
1260 {
1261    struct brw_compile *p = &c->func;
1262
1263    assert((mask & WRITEMASK_XW) == 0);
1264
1265    if (mask & WRITEMASK_Y) {
1266       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1267       brw_MOV(p, dst[1], arg0[0]);
1268       brw_set_saturate(p, 0);
1269    }
1270
1271    if (mask & WRITEMASK_Z) {
1272       emit_math2(c, BRW_MATH_FUNCTION_POW,
1273                  &dst[2],
1274                  WRITEMASK_X | (mask & SATURATE),
1275                  &arg0[1],
1276                  &arg0[3]);
1277    }
1278
1279    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1280     * some of the POW calculations above, but 16-wide iff statements
1281     * seem to lock c1 hardware, so this is a nasty workaround:
1282     */
1283    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1284    {
1285       if (mask & WRITEMASK_Y)
1286          brw_MOV(p, dst[1], brw_imm_f(0));
1287
1288       if (mask & WRITEMASK_Z)
1289          brw_MOV(p, dst[2], brw_imm_f(0));
1290    }
1291    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1292 }
1293
1294
1295 /* Kill pixel - set execution mask to zero for those pixels which
1296  * fail.
1297  */
1298 static void emit_kil( struct brw_wm_compile *c,
1299                       struct brw_reg *arg0)
1300 {
1301    struct brw_compile *p = &c->func;
1302    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1303    GLuint i, j;
1304
1305    for (i = 0; i < 4; i++) {
1306       /* Check if we've already done the comparison for this reg
1307        * -- common when someone does KIL TEMP.wwww.
1308        */
1309       for (j = 0; j < i; j++) {
1310          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1311             break;
1312       }
1313       if (j != i)
1314          continue;
1315
1316       brw_push_insn_state(p);
1317       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1318       brw_set_predicate_control_flag_value(p, 0xff);
1319       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1320       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1321       brw_pop_insn_state(p);
1322    }
1323 }
1324
1325 /* KIL_NV kills the pixels that are currently executing, not based on a test
1326  * of the arguments.
1327  */
1328 void emit_kil_nv( struct brw_wm_compile *c )
1329 {
1330    struct brw_compile *p = &c->func;
1331    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1332
1333    brw_push_insn_state(p);
1334    brw_set_mask_control(p, BRW_MASK_DISABLE);
1335    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1336    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1337    brw_pop_insn_state(p);
1338 }
1339
1340 static void fire_fb_write( struct brw_wm_compile *c,
1341                            GLuint base_reg,
1342                            GLuint nr,
1343                            GLuint target,
1344                            GLuint eot )
1345 {
1346    struct brw_compile *p = &c->func;
1347    struct intel_context *intel = &p->brw->intel;
1348    struct brw_reg dst;
1349
1350    if (c->dispatch_width == 16)
1351       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1352    else
1353       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1354
1355    /* Pass through control information:
1356     */
1357 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1358    if (intel->gen < 6) /* gen6, use headerless for fb write */
1359    {
1360       brw_push_insn_state(p);
1361       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1362       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1363       brw_MOV(p,
1364                brw_message_reg(base_reg + 1),
1365                brw_vec8_grf(1, 0));
1366       brw_pop_insn_state(p);
1367    }
1368
1369    /* Send framebuffer write message: */
1370 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1371    brw_fb_WRITE(p,
1372                 c->dispatch_width,
1373                 dst,
1374                 base_reg,
1375                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1376                 target,
1377                 nr,
1378                 0,
1379                 eot);
1380 }
1381
1382
1383 static void emit_aa( struct brw_wm_compile *c,
1384                      struct brw_reg *arg1,
1385                      GLuint reg )
1386 {
1387    struct brw_compile *p = &c->func;
1388    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1389    GLuint off = c->key.aa_dest_stencil_reg % 2;
1390    struct brw_reg aa = offset(arg1[comp], off);
1391
1392    brw_push_insn_state(p);
1393    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1394    brw_MOV(p, brw_message_reg(reg), aa);
1395    brw_pop_insn_state(p);
1396 }
1397
1398
1399 /* Post-fragment-program processing.  Send the results to the
1400  * framebuffer.
1401  * \param arg0  the fragment color
1402  * \param arg1  the pass-through depth value
1403  * \param arg2  the shader-computed depth value
1404  */
1405 void emit_fb_write(struct brw_wm_compile *c,
1406                    struct brw_reg *arg0,
1407                    struct brw_reg *arg1,
1408                    struct brw_reg *arg2,
1409                    GLuint target,
1410                    GLuint eot)
1411 {
1412    struct brw_compile *p = &c->func;
1413    struct brw_context *brw = p->brw;
1414    struct intel_context *intel = &brw->intel;
1415    GLuint nr = 2;
1416    GLuint channel;
1417    int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
1418
1419    /* Reserve a space for AA - may not be needed:
1420     */
1421    if (c->key.aa_dest_stencil_reg)
1422       nr += 1;
1423
1424    /* I don't really understand how this achieves the color interleave
1425     * (ie RGBARGBA) in the result:  [Do the saturation here]
1426     */
1427    brw_push_insn_state(p);
1428
1429    if (intel->gen >= 6)
1430         base_reg = nr;
1431    else
1432         base_reg = 0;
1433
1434    for (channel = 0; channel < 4; channel++) {
1435       if (intel->gen >= 6) {
1436          /* gen6 SIMD16 single source DP write looks like:
1437           * m + 0: r0
1438           * m + 1: r1
1439           * m + 2: g0
1440           * m + 3: g1
1441           * m + 4: b0
1442           * m + 5: b1
1443           * m + 6: a0
1444           * m + 7: a1
1445           */
1446          if (c->dispatch_width == 16) {
1447             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1448          } else {
1449             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1450          }
1451       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1452          /* pre-gen6 SIMD16 single source DP write looks like:
1453           * m + 0: r0
1454           * m + 1: g0
1455           * m + 2: b0
1456           * m + 3: a0
1457           * m + 4: r1
1458           * m + 5: g1
1459           * m + 6: b1
1460           * m + 7: a1
1461           *
1462           * By setting the high bit of the MRF register number, we indicate
1463           * that we want COMPR4 mode - instead of doing the usual destination
1464           * + 1 for the second half we get destination + 4.
1465           */
1466          brw_MOV(p,
1467                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1468                  arg0[channel]);
1469       } else {
1470          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1471          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1472          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1473          brw_MOV(p,
1474                  brw_message_reg(nr + channel),
1475                  arg0[channel]);
1476
1477          if (c->dispatch_width == 16) {
1478             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1479             brw_MOV(p,
1480                     brw_message_reg(nr + channel + 4),
1481                     sechalf(arg0[channel]));
1482          }
1483       }
1484    }
1485    /* skip over the regs populated above:
1486     */
1487    if (c->dispatch_width == 16)
1488       nr += 8;
1489    else
1490       nr += 4;
1491
1492    brw_pop_insn_state(p);
1493
1494    if (c->key.source_depth_to_render_target)
1495    {
1496       if (c->key.computes_depth)
1497          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1498       else
1499          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1500
1501       nr += 2;
1502    }
1503
1504    if (c->key.dest_depth_reg)
1505    {
1506       GLuint comp = c->key.dest_depth_reg / 2;
1507       GLuint off = c->key.dest_depth_reg % 2;
1508
1509       if (off != 0) {
1510          brw_push_insn_state(p);
1511          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1512
1513          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1514          /* 2nd half? */
1515          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1516          brw_pop_insn_state(p);
1517       }
1518       else {
1519          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1520       }
1521       nr += 2;
1522    }
1523
1524    if (intel->gen >= 6) {
1525       /* Subtract off the message header, since we send headerless. */
1526       nr -= 2;
1527    }
1528
1529    if (!c->key.runtime_check_aads_emit) {
1530       if (c->key.aa_dest_stencil_reg)
1531          emit_aa(c, arg1, 2);
1532
1533       fire_fb_write(c, base_reg, nr, target, eot);
1534    }
1535    else {
1536       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1537       struct brw_reg ip = brw_ip_reg();
1538       struct brw_instruction *jmp;
1539
1540       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1541       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1542       brw_AND(p,
1543               v1_null_ud,
1544               get_element_ud(brw_vec8_grf(1,0), 6),
1545               brw_imm_ud(1<<26));
1546
1547       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1548       {
1549          emit_aa(c, arg1, 2);
1550          fire_fb_write(c, 0, nr, target, eot);
1551          /* note - thread killed in subroutine */
1552       }
1553       brw_land_fwd_jump(p, jmp);
1554
1555       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1556        */
1557       fire_fb_write(c, 1, nr-1, target, eot);
1558    }
1559 }
1560
1561 /**
1562  * Move a GPR to scratch memory.
1563  */
1564 static void emit_spill( struct brw_wm_compile *c,
1565                         struct brw_reg reg,
1566                         GLuint slot )
1567 {
1568    struct brw_compile *p = &c->func;
1569
1570    /*
1571      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1572    */
1573    brw_MOV(p, brw_message_reg(2), reg);
1574
1575    /*
1576      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1577      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1578    */
1579    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1580 }
1581
1582
1583 /**
1584  * Load a GPR from scratch memory.
1585  */
1586 static void emit_unspill( struct brw_wm_compile *c,
1587                           struct brw_reg reg,
1588                           GLuint slot )
1589 {
1590    struct brw_compile *p = &c->func;
1591
1592    /* Slot 0 is the undef value.
1593     */
1594    if (slot == 0) {
1595       brw_MOV(p, reg, brw_imm_f(0));
1596       return;
1597    }
1598
1599    /*
1600      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1601      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1602    */
1603
1604    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1605 }
1606
1607
1608 /**
1609  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1610  * Args with unspill_reg != 0 will be loaded from scratch memory.
1611  */
1612 static void get_argument_regs( struct brw_wm_compile *c,
1613                                struct brw_wm_ref *arg[],
1614                                struct brw_reg *regs )
1615 {
1616    GLuint i;
1617
1618    for (i = 0; i < 4; i++) {
1619       if (arg[i]) {
1620          if (arg[i]->unspill_reg)
1621             emit_unspill(c,
1622                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1623                          arg[i]->value->spill_slot);
1624
1625          regs[i] = arg[i]->hw_reg;
1626       }
1627       else {
1628          regs[i] = brw_null_reg();
1629       }
1630    }
1631 }
1632
1633
1634 /**
1635  * For values that have a spill_slot!=0, write those regs to scratch memory.
1636  */
1637 static void spill_values( struct brw_wm_compile *c,
1638                           struct brw_wm_value *values,
1639                           GLuint nr )
1640 {
1641    GLuint i;
1642
1643    for (i = 0; i < nr; i++)
1644       if (values[i].spill_slot)
1645          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1646 }
1647
1648
1649 /* Emit the fragment program instructions here.
1650  */
1651 void brw_wm_emit( struct brw_wm_compile *c )
1652 {
1653    struct brw_compile *p = &c->func;
1654    struct intel_context *intel = &p->brw->intel;
1655    GLuint insn;
1656
1657    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1658    if (intel->gen >= 6)
1659         brw_set_acc_write_control(p, 1);
1660
1661    /* Check if any of the payload regs need to be spilled:
1662     */
1663    spill_values(c, c->payload.depth, 4);
1664    spill_values(c, c->creg, c->nr_creg);
1665    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1666
1667
1668    for (insn = 0; insn < c->nr_insns; insn++) {
1669
1670       struct brw_wm_instruction *inst = &c->instruction[insn];
1671       struct brw_reg args[3][4], dst[4];
1672       GLuint i, dst_flags;
1673
1674       /* Get argument regs:
1675        */
1676       for (i = 0; i < 3; i++)
1677          get_argument_regs(c, inst->src[i], args[i]);
1678
1679       /* Get dest regs:
1680        */
1681       for (i = 0; i < 4; i++)
1682          if (inst->dst[i])
1683             dst[i] = inst->dst[i]->hw_reg;
1684          else
1685             dst[i] = brw_null_reg();
1686
1687       /* Flags
1688        */
1689       dst_flags = inst->writemask;
1690       if (inst->saturate)
1691          dst_flags |= SATURATE;
1692
1693       switch (inst->opcode) {
1694          /* Generated instructions for calculating triangle interpolants:
1695           */
1696       case WM_PIXELXY:
1697          emit_pixel_xy(c, dst, dst_flags);
1698          break;
1699
1700       case WM_DELTAXY:
1701          emit_delta_xy(p, dst, dst_flags, args[0]);
1702          break;
1703
1704       case WM_WPOSXY:
1705          emit_wpos_xy(c, dst, dst_flags, args[0]);
1706          break;
1707
1708       case WM_PIXELW:
1709          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1710          break;
1711
1712       case WM_LINTERP:
1713          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1714          break;
1715
1716       case WM_PINTERP:
1717          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1718          break;
1719
1720       case WM_CINTERP:
1721          emit_cinterp(p, dst, dst_flags, args[0]);
1722          break;
1723
1724       case WM_FB_WRITE:
1725          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1726          break;
1727
1728       case WM_FRONTFACING:
1729          emit_frontfacing(p, dst, dst_flags);
1730          break;
1731
1732          /* Straightforward arithmetic:
1733           */
1734       case OPCODE_ADD:
1735          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1736          break;
1737
1738       case OPCODE_FRC:
1739          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1740          break;
1741
1742       case OPCODE_FLR:
1743          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1744          break;
1745
1746       case OPCODE_DDX:
1747          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1748          break;
1749
1750       case OPCODE_DDY:
1751          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1752          break;
1753
1754       case OPCODE_DP2:
1755          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1756          break;
1757
1758       case OPCODE_DP3:
1759          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1760          break;
1761
1762       case OPCODE_DP4:
1763          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1764          break;
1765
1766       case OPCODE_DPH:
1767          emit_dph(p, dst, dst_flags, args[0], args[1]);
1768          break;
1769
1770       case OPCODE_TRUNC:
1771          for (i = 0; i < 4; i++) {
1772             if (dst_flags & (1<<i)) {
1773                brw_RNDZ(p, dst[i], args[0][i]);
1774             }
1775          }
1776          break;
1777
1778       case OPCODE_LRP:
1779          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1780          break;
1781
1782       case OPCODE_MAD:
1783          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1784          break;
1785
1786       case OPCODE_MOV:
1787       case OPCODE_SWZ:
1788          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1789          break;
1790
1791       case OPCODE_MUL:
1792          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1793          break;
1794
1795       case OPCODE_XPD:
1796          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1797          break;
1798
1799          /* Higher math functions:
1800           */
1801       case OPCODE_RCP:
1802          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1803          break;
1804
1805       case OPCODE_RSQ:
1806          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1807          break;
1808
1809       case OPCODE_SIN:
1810          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1811          break;
1812
1813       case OPCODE_COS:
1814          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1815          break;
1816
1817       case OPCODE_EX2:
1818          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1819          break;
1820
1821       case OPCODE_LG2:
1822          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1823          break;
1824
1825       case OPCODE_SCS:
1826          /* There is an scs math function, but it would need some
1827           * fixup for 16-element execution.
1828           */
1829          if (dst_flags & WRITEMASK_X)
1830             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1831          if (dst_flags & WRITEMASK_Y)
1832             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833          break;
1834
1835       case OPCODE_POW:
1836          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1837          break;
1838
1839          /* Comparisons:
1840           */
1841       case OPCODE_CMP:
1842          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1843          break;
1844
1845       case OPCODE_MAX:
1846          emit_max(p, dst, dst_flags, args[0], args[1]);
1847          break;
1848
1849       case OPCODE_MIN:
1850          emit_min(p, dst, dst_flags, args[0], args[1]);
1851          break;
1852
1853       case OPCODE_SLT:
1854          emit_slt(p, dst, dst_flags, args[0], args[1]);
1855          break;
1856
1857       case OPCODE_SLE:
1858          emit_sle(p, dst, dst_flags, args[0], args[1]);
1859         break;
1860       case OPCODE_SGT:
1861          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1862         break;
1863       case OPCODE_SGE:
1864          emit_sge(p, dst, dst_flags, args[0], args[1]);
1865          break;
1866       case OPCODE_SEQ:
1867          emit_seq(p, dst, dst_flags, args[0], args[1]);
1868         break;
1869       case OPCODE_SNE:
1870          emit_sne(p, dst, dst_flags, args[0], args[1]);
1871         break;
1872
1873       case OPCODE_SSG:
1874          emit_sign(p, dst, dst_flags, args[0]);
1875          break;
1876
1877       case OPCODE_LIT:
1878          emit_lit(c, dst, dst_flags, args[0]);
1879          break;
1880
1881          /* Texturing operations:
1882           */
1883       case OPCODE_TEX:
1884          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1885                   inst->tex_idx, inst->tex_unit,
1886                   inst->tex_shadow);
1887          break;
1888
1889       case OPCODE_TXB:
1890          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1891                   inst->tex_idx, inst->tex_unit);
1892          break;
1893
1894       case OPCODE_KIL:
1895          emit_kil(c, args[0]);
1896          break;
1897
1898       case OPCODE_KIL_NV:
1899          emit_kil_nv(c);
1900          break;
1901
1902       default:
1903          printf("Unsupported opcode %i (%s) in fragment shader\n",
1904                 inst->opcode, inst->opcode < MAX_OPCODE ?
1905                 _mesa_opcode_string(inst->opcode) :
1906                 "unknown");
1907       }
1908
1909       for (i = 0; i < 4; i++)
1910         if (inst->dst[i] && inst->dst[i]->spill_slot)
1911            emit_spill(c,
1912                       inst->dst[i]->hw_reg,
1913                       inst->dst[i]->spill_slot);
1914    }
1915
1916    /* Only properly tested on ILK */
1917    if (p->brw->intel.gen == 5) {
1918      brw_remove_duplicate_mrf_moves(p);
1919      if (c->dispatch_width == 16)
1920         brw_remove_grf_to_mrf_moves(p);
1921    }
1922
1923    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1924       int i;
1925
1926      printf("wm-native:\n");
1927      for (i = 0; i < p->nr_insn; i++)
1928          brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1929       printf("\n");
1930    }
1931 }
1932