src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_SWZ] = 1,
  87       [OPCODE_XPD] = 2,
  88    };
  89
  90    /* These opcodes get broken down in a way that allow two
  91     * args to be immediates.
  92     */
  93    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  94       if (arg == 1 || arg == 2)
  95          return GL_TRUE;
  96    }
  97
  98    if (opcode > ARRAY_SIZE(opcode_array))
  99       return GL_FALSE;
 100
 101    return arg == opcode_array[opcode] - 1;
 102 }
 103
 104 /**
 105  * Computes the screen-space x,y position of the pixels.
 106  *
 107  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 108  * interpolation of attributes..
 109  *
 110  * Payload R0:
 111  *
 112  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 113  *         corresponding to each of the 16 execution channels.
 114  * R0.1..8 -- ?
 115  * R1.0 -- triangle vertex 0.X
 116  * R1.1 -- triangle vertex 0.Y
 117  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 118  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 119  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 120  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 121  * R1.6 -- ?
 122  * R1.7 -- ?
 123  * R1.8 -- ?
 124  */
 125 void emit_pixel_xy(struct brw_wm_compile *c,
 126                    const struct brw_reg *dst,
 127                    GLuint mask)
 128 {
 129    struct brw_compile *p = &c->func;
 130    struct brw_reg r1 = brw_vec1_grf(1, 0);
 131    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 132    struct brw_reg dst0_uw, dst1_uw;
 133
 134    brw_push_insn_state(p);
 135    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 136
 137    if (c->dispatch_width == 16) {
 138       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 139       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 140    } else {
 141       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 142       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 143    }
 144
 145    /* Calculate pixel centers by adding 1 or 0 to each of the
 146     * micro-tile coordinates passed in r1.
 147     */
 148    if (mask & WRITEMASK_X) {
 149       brw_ADD(p,
 150               dst0_uw,
 151               stride(suboffset(r1_uw, 4), 2, 4, 0),
 152               brw_imm_v(0x10101010));
 153    }
 154
 155    if (mask & WRITEMASK_Y) {
 156       brw_ADD(p,
 157               dst1_uw,
 158               stride(suboffset(r1_uw,5), 2, 4, 0),
 159               brw_imm_v(0x11001100));
 160    }
 161    brw_pop_insn_state(p);
 162 }
 163
 164 /**
 165  * Computes the screen-space x,y distance of the pixels from the start
 166  * vertex.
 167  *
 168  * This will be used in linterp or pinterp with the start vertex value
 169  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 170  * to produce interpolated attribute values.
 171  */
 172 void emit_delta_xy(struct brw_compile *p,
 173                    const struct brw_reg *dst,
 174                    GLuint mask,
 175                    const struct brw_reg *arg0)
 176 {
 177    struct intel_context *intel = &p->brw->intel;
 178    struct brw_reg r1 = brw_vec1_grf(1, 0);
 179
 180    if (mask == 0)
 181       return;
 182
 183    assert(mask == WRITEMASK_XY);
 184
 185    if (intel->gen >= 6) {
 186        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 187           Just add them with 0.0 for dst reg.. */
 188        r1 = brw_imm_v(0x00000000);
 189        brw_ADD(p,
 190                dst[0],
 191                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 192                r1);
 193        brw_ADD(p,
 194                dst[1],
 195                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 196                r1);
 197        return;
 198    }
 199
 200    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 201     * centers produced by emit_pixel_xy().
 202     */
 203    brw_ADD(p,
 204            dst[0],
 205            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 206            negate(r1));
 207    brw_ADD(p,
 208            dst[1],
 209            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 210            negate(suboffset(r1,1)));
 211 }
 212
 213 /**
 214  * Computes the pixel offset from the window origin for gl_FragCoord().
 215  */
 216 void emit_wpos_xy(struct brw_wm_compile *c,
 217                   const struct brw_reg *dst,
 218                   GLuint mask,
 219                   const struct brw_reg *arg0)
 220 {
 221    struct brw_compile *p = &c->func;
 222    struct intel_context *intel = &p->brw->intel;
 223    struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
 224    struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
 225
 226    if (mask & WRITEMASK_X) {
 227       if (intel->gen >= 6) {
 228          struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
 229          brw_MOV(p, delta_x_f, delta_x);
 230          delta_x = delta_x_f;
 231       }
 232
 233       if (c->fp->program.PixelCenterInteger) {
 234          /* X' = X */
 235          brw_MOV(p, dst[0], delta_x);
 236       } else {
 237          /* X' = X + 0.5 */
 238          brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
 239       }
 240    }
 241
 242    if (mask & WRITEMASK_Y) {
 243       if (intel->gen >= 6) {
 244          struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
 245          brw_MOV(p, delta_y_f, delta_y);
 246          delta_y = delta_y_f;
 247       }
 248
 249       if (c->fp->program.OriginUpperLeft) {
 250          if (c->fp->program.PixelCenterInteger) {
 251             /* Y' = Y */
 252             brw_MOV(p, dst[1], delta_y);
 253          } else {
 254             brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
 255          }
 256       } else {
 257          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 258
 259          /* Y' = (height - 1) - Y + center */
 260          brw_ADD(p, dst[1], negate(delta_y),
 261                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 262       }
 263    }
 264 }
 265
 266
 267 void emit_pixel_w(struct brw_wm_compile *c,
 268                   const struct brw_reg *dst,
 269                   GLuint mask,
 270                   const struct brw_reg *arg0,
 271                   const struct brw_reg *deltas)
 272 {
 273    struct brw_compile *p = &c->func;
 274    struct intel_context *intel = &p->brw->intel;
 275    struct brw_reg src;
 276    struct brw_reg temp_dst;
 277
 278    if (intel->gen >= 6)
 279         temp_dst = dst[3];
 280    else
 281         temp_dst = brw_message_reg(2);
 282
 283    assert(intel->gen < 6);
 284
 285    /* Don't need this if all you are doing is interpolating color, for
 286     * instance.
 287     */
 288    if (mask & WRITEMASK_W) {
 289       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 290
 291       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 292        * result straight into a message reg.
 293        */
 294       if (can_do_pln(intel, deltas)) {
 295          brw_PLN(p, temp_dst, interp3, deltas[0]);
 296       } else {
 297          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 298          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 299       }
 300
 301       /* Calc w */
 302       if (intel->gen >= 6)
 303          src = temp_dst;
 304       else
 305          src = brw_null_reg();
 306
 307       if (c->dispatch_width == 16) {
 308          brw_math_16(p, dst[3],
 309                      BRW_MATH_FUNCTION_INV,
 310                      BRW_MATH_SATURATE_NONE,
 311                      2, src,
 312                      BRW_MATH_PRECISION_FULL);
 313       } else {
 314          brw_math(p, dst[3],
 315                   BRW_MATH_FUNCTION_INV,
 316                   BRW_MATH_SATURATE_NONE,
 317                   2, src,
 318                   BRW_MATH_DATA_VECTOR,
 319                   BRW_MATH_PRECISION_FULL);
 320       }
 321    }
 322 }
 323
 324 void emit_linterp(struct brw_compile *p,
 325                   const struct brw_reg *dst,
 326                   GLuint mask,
 327                   const struct brw_reg *arg0,
 328                   const struct brw_reg *deltas)
 329 {
 330    struct intel_context *intel = &p->brw->intel;
 331    struct brw_reg interp[4];
 332    GLuint nr = arg0[0].nr;
 333    GLuint i;
 334
 335    interp[0] = brw_vec1_grf(nr, 0);
 336    interp[1] = brw_vec1_grf(nr, 4);
 337    interp[2] = brw_vec1_grf(nr+1, 0);
 338    interp[3] = brw_vec1_grf(nr+1, 4);
 339
 340    for (i = 0; i < 4; i++) {
 341       if (mask & (1<<i)) {
 342          if (intel->gen >= 6) {
 343             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 344          } else if (can_do_pln(intel, deltas)) {
 345             brw_PLN(p, dst[i], interp[i], deltas[0]);
 346          } else {
 347             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 348             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 349          }
 350       }
 351    }
 352 }
 353
 354
 355 void emit_pinterp(struct brw_compile *p,
 356                   const struct brw_reg *dst,
 357                   GLuint mask,
 358                   const struct brw_reg *arg0,
 359                   const struct brw_reg *deltas,
 360                   const struct brw_reg *w)
 361 {
 362    struct intel_context *intel = &p->brw->intel;
 363    struct brw_reg interp[4];
 364    GLuint nr = arg0[0].nr;
 365    GLuint i;
 366
 367    if (intel->gen >= 6) {
 368       emit_linterp(p, dst, mask, arg0, interp);
 369       return;
 370    }
 371
 372    interp[0] = brw_vec1_grf(nr, 0);
 373    interp[1] = brw_vec1_grf(nr, 4);
 374    interp[2] = brw_vec1_grf(nr+1, 0);
 375    interp[3] = brw_vec1_grf(nr+1, 4);
 376
 377    for (i = 0; i < 4; i++) {
 378       if (mask & (1<<i)) {
 379          if (can_do_pln(intel, deltas)) {
 380             brw_PLN(p, dst[i], interp[i], deltas[0]);
 381          } else {
 382             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 383             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 384          }
 385       }
 386    }
 387    for (i = 0; i < 4; i++) {
 388       if (mask & (1<<i)) {
 389          brw_MUL(p, dst[i], dst[i], w[3]);
 390       }
 391    }
 392 }
 393
 394
 395 void emit_cinterp(struct brw_compile *p,
 396                   const struct brw_reg *dst,
 397                   GLuint mask,
 398                   const struct brw_reg *arg0)
 399 {
 400    struct brw_reg interp[4];
 401    GLuint nr = arg0[0].nr;
 402    GLuint i;
 403
 404    interp[0] = brw_vec1_grf(nr, 0);
 405    interp[1] = brw_vec1_grf(nr, 4);
 406    interp[2] = brw_vec1_grf(nr+1, 0);
 407    interp[3] = brw_vec1_grf(nr+1, 4);
 408
 409    for (i = 0; i < 4; i++) {
 410       if (mask & (1<<i)) {
 411          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 412       }
 413    }
 414 }
 415
 416 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 417 void emit_frontfacing(struct brw_compile *p,
 418                       const struct brw_reg *dst,
 419                       GLuint mask)
 420 {
 421    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 422    GLuint i;
 423
 424    if (!(mask & WRITEMASK_XYZW))
 425       return;
 426
 427    for (i = 0; i < 4; i++) {
 428       if (mask & (1<<i)) {
 429          brw_MOV(p, dst[i], brw_imm_f(0.0));
 430       }
 431    }
 432
 433    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 434     * us front face
 435     */
 436    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 437    for (i = 0; i < 4; i++) {
 438       if (mask & (1<<i)) {
 439          brw_MOV(p, dst[i], brw_imm_f(1.0));
 440       }
 441    }
 442    brw_set_predicate_control_flag_value(p, 0xff);
 443 }
 444
 445 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 446  * looking like:
 447  *
 448  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 449  *
 450  * and we're trying to produce:
 451  *
 452  *           DDX                     DDY
 453  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 454  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 455  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 456  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 457  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 458  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 459  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 460  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 461  *
 462  * and add another set of two more subspans if in 16-pixel dispatch mode.
 463  *
 464  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 465  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 466  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 467  * between each other.  We could probably do it like ddx and swizzle the right
 468  * order later, but bail for now and just produce
 469  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 470  */
 471 void emit_ddxy(struct brw_compile *p,
 472                const struct brw_reg *dst,
 473                GLuint mask,
 474                GLboolean is_ddx,
 475                const struct brw_reg *arg0)
 476 {
 477    int i;
 478    struct brw_reg src0, src1;
 479
 480    if (mask & SATURATE)
 481       brw_set_saturate(p, 1);
 482    for (i = 0; i < 4; i++ ) {
 483       if (mask & (1<<i)) {
 484          if (is_ddx) {
 485             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 486                            BRW_REGISTER_TYPE_F,
 487                            BRW_VERTICAL_STRIDE_2,
 488                            BRW_WIDTH_2,
 489                            BRW_HORIZONTAL_STRIDE_0,
 490                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 491             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 492                            BRW_REGISTER_TYPE_F,
 493                            BRW_VERTICAL_STRIDE_2,
 494                            BRW_WIDTH_2,
 495                            BRW_HORIZONTAL_STRIDE_0,
 496                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 497          } else {
 498             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 499                            BRW_REGISTER_TYPE_F,
 500                            BRW_VERTICAL_STRIDE_4,
 501                            BRW_WIDTH_4,
 502                            BRW_HORIZONTAL_STRIDE_0,
 503                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 504             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 505                            BRW_REGISTER_TYPE_F,
 506                            BRW_VERTICAL_STRIDE_4,
 507                            BRW_WIDTH_4,
 508                            BRW_HORIZONTAL_STRIDE_0,
 509                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 510          }
 511          brw_ADD(p, dst[i], src0, negate(src1));
 512       }
 513    }
 514    if (mask & SATURATE)
 515       brw_set_saturate(p, 0);
 516 }
 517
 518 void emit_alu1(struct brw_compile *p,
 519                struct brw_instruction *(*func)(struct brw_compile *,
 520                                                struct brw_reg,
 521                                                struct brw_reg),
 522                const struct brw_reg *dst,
 523                GLuint mask,
 524                const struct brw_reg *arg0)
 525 {
 526    GLuint i;
 527
 528    if (mask & SATURATE)
 529       brw_set_saturate(p, 1);
 530
 531    for (i = 0; i < 4; i++) {
 532       if (mask & (1<<i)) {
 533          func(p, dst[i], arg0[i]);
 534       }
 535    }
 536
 537    if (mask & SATURATE)
 538       brw_set_saturate(p, 0);
 539 }
 540
 541
 542 void emit_alu2(struct brw_compile *p,
 543                struct brw_instruction *(*func)(struct brw_compile *,
 544                                                struct brw_reg,
 545                                                struct brw_reg,
 546                                                struct brw_reg),
 547                const struct brw_reg *dst,
 548                GLuint mask,
 549                const struct brw_reg *arg0,
 550                const struct brw_reg *arg1)
 551 {
 552    GLuint i;
 553
 554    if (mask & SATURATE)
 555       brw_set_saturate(p, 1);
 556
 557    for (i = 0; i < 4; i++) {
 558       if (mask & (1<<i)) {
 559          func(p, dst[i], arg0[i], arg1[i]);
 560       }
 561    }
 562
 563    if (mask & SATURATE)
 564       brw_set_saturate(p, 0);
 565 }
 566
 567
 568 void emit_mad(struct brw_compile *p,
 569               const struct brw_reg *dst,
 570               GLuint mask,
 571               const struct brw_reg *arg0,
 572               const struct brw_reg *arg1,
 573               const struct brw_reg *arg2)
 574 {
 575    GLuint i;
 576
 577    for (i = 0; i < 4; i++) {
 578       if (mask & (1<<i)) {
 579          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 580
 581          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 582          brw_ADD(p, dst[i], dst[i], arg2[i]);
 583          brw_set_saturate(p, 0);
 584       }
 585    }
 586 }
 587
 588 void emit_lrp(struct brw_compile *p,
 589               const struct brw_reg *dst,
 590               GLuint mask,
 591               const struct brw_reg *arg0,
 592               const struct brw_reg *arg1,
 593               const struct brw_reg *arg2)
 594 {
 595    GLuint i;
 596
 597    /* Uses dst as a temporary:
 598     */
 599    for (i = 0; i < 4; i++) {
 600       if (mask & (1<<i)) {
 601          /* Can I use the LINE instruction for this?
 602           */
 603          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 604          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 605
 606          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 607          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 608          brw_set_saturate(p, 0);
 609       }
 610    }
 611 }
 612
 613 void emit_sop(struct brw_compile *p,
 614               const struct brw_reg *dst,
 615               GLuint mask,
 616               GLuint cond,
 617               const struct brw_reg *arg0,
 618               const struct brw_reg *arg1)
 619 {
 620    GLuint i;
 621
 622    for (i = 0; i < 4; i++) {
 623       if (mask & (1<<i)) {
 624          brw_push_insn_state(p);
 625          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 626          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 627          brw_MOV(p, dst[i], brw_imm_f(0));
 628          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 629          brw_MOV(p, dst[i], brw_imm_f(1.0));
 630          brw_pop_insn_state(p);
 631       }
 632    }
 633 }
 634
 635 static void emit_slt( struct brw_compile *p,
 636                       const struct brw_reg *dst,
 637                       GLuint mask,
 638                       const struct brw_reg *arg0,
 639                       const struct brw_reg *arg1 )
 640 {
 641    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 642 }
 643
 644 static void emit_sle( struct brw_compile *p,
 645                       const struct brw_reg *dst,
 646                       GLuint mask,
 647                       const struct brw_reg *arg0,
 648                       const struct brw_reg *arg1 )
 649 {
 650    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 651 }
 652
 653 static void emit_sgt( struct brw_compile *p,
 654                       const struct brw_reg *dst,
 655                       GLuint mask,
 656                       const struct brw_reg *arg0,
 657                       const struct brw_reg *arg1 )
 658 {
 659    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 660 }
 661
 662 static void emit_sge( struct brw_compile *p,
 663                       const struct brw_reg *dst,
 664                       GLuint mask,
 665                       const struct brw_reg *arg0,
 666                       const struct brw_reg *arg1 )
 667 {
 668    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 669 }
 670
 671 static void emit_seq( struct brw_compile *p,
 672                       const struct brw_reg *dst,
 673                       GLuint mask,
 674                       const struct brw_reg *arg0,
 675                       const struct brw_reg *arg1 )
 676 {
 677    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 678 }
 679
 680 static void emit_sne( struct brw_compile *p,
 681                       const struct brw_reg *dst,
 682                       GLuint mask,
 683                       const struct brw_reg *arg0,
 684                       const struct brw_reg *arg1 )
 685 {
 686    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 687 }
 688
 689 void emit_cmp(struct brw_compile *p,
 690               const struct brw_reg *dst,
 691               GLuint mask,
 692               const struct brw_reg *arg0,
 693               const struct brw_reg *arg1,
 694               const struct brw_reg *arg2)
 695 {
 696    GLuint i;
 697
 698    for (i = 0; i < 4; i++) {
 699       if (mask & (1<<i)) {
 700          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 701
 702          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 703          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 704          brw_set_saturate(p, 0);
 705          brw_set_predicate_control_flag_value(p, 0xff);
 706       }
 707    }
 708 }
 709
 710 void emit_sign(struct brw_compile *p,
 711                const struct brw_reg *dst,
 712                GLuint mask,
 713                const struct brw_reg *arg0)
 714 {
 715    GLuint i;
 716
 717    for (i = 0; i < 4; i++) {
 718       if (mask & (1<<i)) {
 719          brw_MOV(p, dst[i], brw_imm_f(0.0));
 720
 721          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 722          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 723          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 724
 725          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 726          brw_MOV(p, dst[i], brw_imm_f(1.0));
 727          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 728       }
 729    }
 730 }
 731
 732 void emit_max(struct brw_compile *p,
 733               const struct brw_reg *dst,
 734               GLuint mask,
 735               const struct brw_reg *arg0,
 736               const struct brw_reg *arg1)
 737 {
 738    GLuint i;
 739
 740    for (i = 0; i < 4; i++) {
 741       if (mask & (1<<i)) {
 742          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 743
 744          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 745          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 746          brw_set_saturate(p, 0);
 747          brw_set_predicate_control_flag_value(p, 0xff);
 748       }
 749    }
 750 }
 751
 752 void emit_min(struct brw_compile *p,
 753               const struct brw_reg *dst,
 754               GLuint mask,
 755               const struct brw_reg *arg0,
 756               const struct brw_reg *arg1)
 757 {
 758    GLuint i;
 759
 760    for (i = 0; i < 4; i++) {
 761       if (mask & (1<<i)) {
 762          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 763
 764          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 765          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 766          brw_set_saturate(p, 0);
 767          brw_set_predicate_control_flag_value(p, 0xff);
 768       }
 769    }
 770 }
 771
 772
 773 void emit_dp2(struct brw_compile *p,
 774               const struct brw_reg *dst,
 775               GLuint mask,
 776               const struct brw_reg *arg0,
 777               const struct brw_reg *arg1)
 778 {
 779    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 780
 781    if (!(mask & WRITEMASK_XYZW))
 782       return; /* Do not emit dead code */
 783
 784    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 785
 786    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 787
 788    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 789    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 790    brw_set_saturate(p, 0);
 791 }
 792
 793
 794 void emit_dp3(struct brw_compile *p,
 795               const struct brw_reg *dst,
 796               GLuint mask,
 797               const struct brw_reg *arg0,
 798               const struct brw_reg *arg1)
 799 {
 800    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 801
 802    if (!(mask & WRITEMASK_XYZW))
 803       return; /* Do not emit dead code */
 804
 805    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 806
 807    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 808    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 809
 810    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 811    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 812    brw_set_saturate(p, 0);
 813 }
 814
 815
 816 void emit_dp4(struct brw_compile *p,
 817               const struct brw_reg *dst,
 818               GLuint mask,
 819               const struct brw_reg *arg0,
 820               const struct brw_reg *arg1)
 821 {
 822    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 823
 824    if (!(mask & WRITEMASK_XYZW))
 825       return; /* Do not emit dead code */
 826
 827    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 828
 829    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 830    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 831    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 832
 833    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 834    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 835    brw_set_saturate(p, 0);
 836 }
 837
 838
 839 void emit_dph(struct brw_compile *p,
 840               const struct brw_reg *dst,
 841               GLuint mask,
 842               const struct brw_reg *arg0,
 843               const struct brw_reg *arg1)
 844 {
 845    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 846
 847    if (!(mask & WRITEMASK_XYZW))
 848       return; /* Do not emit dead code */
 849
 850    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 851
 852    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 853    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 854    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 855
 856    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 857    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 858    brw_set_saturate(p, 0);
 859 }
 860
 861
 862 void emit_xpd(struct brw_compile *p,
 863               const struct brw_reg *dst,
 864               GLuint mask,
 865               const struct brw_reg *arg0,
 866               const struct brw_reg *arg1)
 867 {
 868    GLuint i;
 869
 870    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 871
 872    for (i = 0 ; i < 3; i++) {
 873       if (mask & (1<<i)) {
 874          GLuint i2 = (i+2)%3;
 875          GLuint i1 = (i+1)%3;
 876
 877          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 878
 879          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 880          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 881          brw_set_saturate(p, 0);
 882       }
 883    }
 884 }
 885
 886
 887 void emit_math1(struct brw_wm_compile *c,
 888                 GLuint function,
 889                 const struct brw_reg *dst,
 890                 GLuint mask,
 891                 const struct brw_reg *arg0)
 892 {
 893    struct brw_compile *p = &c->func;
 894    struct intel_context *intel = &p->brw->intel;
 895    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 896    GLuint saturate = ((mask & SATURATE) ?
 897                       BRW_MATH_SATURATE_SATURATE :
 898                       BRW_MATH_SATURATE_NONE);
 899    struct brw_reg src;
 900
 901    if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
 902                             arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
 903                            arg0[0].negate || arg0[0].abs)) {
 904       /* Gen6 math requires that source and dst horizontal stride be 1,
 905        * and that the argument be in the GRF.
 906        *
 907        * The hardware ignores source modifiers (negate and abs) on math
 908        * instructions, so we also move to a temp to set those up.
 909        */
 910       src = dst[dst_chan];
 911       brw_MOV(p, src, arg0[0]);
 912    } else {
 913       src = arg0[0];
 914    }
 915
 916    if (!(mask & WRITEMASK_XYZW))
 917       return; /* Do not emit dead code */
 918
 919    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 920
 921    /* Send two messages to perform all 16 operations:
 922     */
 923    brw_push_insn_state(p);
 924    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 925    brw_math(p,
 926             dst[dst_chan],
 927             function,
 928             saturate,
 929             2,
 930             src,
 931             BRW_MATH_DATA_VECTOR,
 932             BRW_MATH_PRECISION_FULL);
 933
 934    if (c->dispatch_width == 16) {
 935       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 936       brw_math(p,
 937                offset(dst[dst_chan],1),
 938                function,
 939                saturate,
 940                3,
 941                sechalf(src),
 942                BRW_MATH_DATA_VECTOR,
 943                BRW_MATH_PRECISION_FULL);
 944    }
 945    brw_pop_insn_state(p);
 946 }
 947
 948
 949 void emit_math2(struct brw_wm_compile *c,
 950                 GLuint function,
 951                 const struct brw_reg *dst,
 952                 GLuint mask,
 953                 const struct brw_reg *arg0,
 954                 const struct brw_reg *arg1)
 955 {
 956    struct brw_compile *p = &c->func;
 957    struct intel_context *intel = &p->brw->intel;
 958    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 959
 960    if (!(mask & WRITEMASK_XYZW))
 961       return; /* Do not emit dead code */
 962
 963    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 964
 965    brw_push_insn_state(p);
 966
 967    /* math can only operate on up to a vec8 at a time, so in
 968     * dispatch_width==16 we have to do the second half manually.
 969     */
 970    if (intel->gen >= 6) {
 971       struct brw_reg src0 = arg0[0];
 972       struct brw_reg src1 = arg1[0];
 973       struct brw_reg temp_dst = dst[dst_chan];
 974
 975       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 976          brw_MOV(p, temp_dst, src0);
 977          src0 = temp_dst;
 978       }
 979
 980       if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 981          /* This is a heinous hack to get a temporary register for use
 982           * in case both arg0 and arg1 are constants.  Why you're
 983           * doing exponentiation on constant values in the shader, we
 984           * don't know.
 985           *
 986           * max_wm_grf is almost surely less than the maximum GRF, and
 987           * gen6 doesn't care about the number of GRFs used in a
 988           * shader like pre-gen6 did.
 989           */
 990          struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
 991          brw_MOV(p, temp, src1);
 992          src1 = temp;
 993       }
 994
 995       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 996       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 997       brw_math2(p,
 998                 temp_dst,
 999                 function,
1000                 src0,
1001                 src1);
1002       if (c->dispatch_width == 16) {
1003          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1004          brw_math2(p,
1005                    sechalf(temp_dst),
1006                    function,
1007                    sechalf(src0),
1008                    sechalf(src1));
1009       }
1010    } else {
1011       GLuint saturate = ((mask & SATURATE) ?
1012                          BRW_MATH_SATURATE_SATURATE :
1013                          BRW_MATH_SATURATE_NONE);
1014
1015       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1016       brw_MOV(p, brw_message_reg(3), arg1[0]);
1017       if (c->dispatch_width == 16) {
1018          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1019          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1020       }
1021
1022       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1023       brw_math(p,
1024                dst[dst_chan],
1025                function,
1026                saturate,
1027                2,
1028                arg0[0],
1029                BRW_MATH_DATA_VECTOR,
1030                BRW_MATH_PRECISION_FULL);
1031
1032       /* Send two messages to perform all 16 operations:
1033        */
1034       if (c->dispatch_width == 16) {
1035          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1036          brw_math(p,
1037                   offset(dst[dst_chan],1),
1038                   function,
1039                   saturate,
1040                   4,
1041                   sechalf(arg0[0]),
1042                   BRW_MATH_DATA_VECTOR,
1043                   BRW_MATH_PRECISION_FULL);
1044       }
1045    }
1046    brw_pop_insn_state(p);
1047 }
1048
1049
1050 void emit_tex(struct brw_wm_compile *c,
1051               struct brw_reg *dst,
1052               GLuint dst_flags,
1053               struct brw_reg *arg,
1054               struct brw_reg depth_payload,
1055               GLuint tex_idx,
1056               GLuint sampler,
1057               GLboolean shadow)
1058 {
1059    struct brw_compile *p = &c->func;
1060    struct intel_context *intel = &p->brw->intel;
1061    struct brw_reg dst_retyped;
1062    GLuint cur_mrf = 2, response_length;
1063    GLuint i, nr_texcoords;
1064    GLuint emit;
1065    GLuint msg_type;
1066    GLuint mrf_per_channel;
1067    GLuint simd_mode;
1068
1069    if (c->dispatch_width == 16) {
1070       mrf_per_channel = 2;
1071       response_length = 8;
1072       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1073       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1074    } else {
1075       mrf_per_channel = 1;
1076       response_length = 4;
1077       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1078       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1079    }
1080
1081    /* How many input regs are there?
1082     */
1083    switch (tex_idx) {
1084    case TEXTURE_1D_INDEX:
1085       emit = WRITEMASK_X;
1086       nr_texcoords = 1;
1087       break;
1088    case TEXTURE_2D_INDEX:
1089    case TEXTURE_RECT_INDEX:
1090       emit = WRITEMASK_XY;
1091       nr_texcoords = 2;
1092       break;
1093    case TEXTURE_3D_INDEX:
1094    case TEXTURE_CUBE_INDEX:
1095       emit = WRITEMASK_XYZ;
1096       nr_texcoords = 3;
1097       break;
1098    default:
1099       /* unexpected target */
1100       abort();
1101    }
1102
1103    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1104    if (intel->gen < 5 && c->dispatch_width == 8)
1105       nr_texcoords = 3;
1106
1107    /* For shadow comparisons, we have to supply u,v,r. */
1108    if (shadow)
1109       nr_texcoords = 3;
1110
1111    /* Emit the texcoords. */
1112    for (i = 0; i < nr_texcoords; i++) {
1113       if (emit & (1<<i))
1114          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1115       else
1116          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1117       cur_mrf += mrf_per_channel;
1118    }
1119
1120    /* Fill in the shadow comparison reference value. */
1121    if (shadow) {
1122       if (intel->gen >= 5) {
1123          /* Fill in the cube map array index value. */
1124          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1125          cur_mrf += mrf_per_channel;
1126       } else if (c->dispatch_width == 8) {
1127          /* Fill in the LOD bias value. */
1128          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1129          cur_mrf += mrf_per_channel;
1130       }
1131       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1132       cur_mrf += mrf_per_channel;
1133    }
1134
1135    if (intel->gen >= 5) {
1136       if (shadow)
1137          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1138       else
1139          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1140    } else {
1141       /* Note that G45 and older determines shadow compare and dispatch width
1142        * from message length for most messages.
1143        */
1144       if (c->dispatch_width == 16 && shadow)
1145          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1146       else
1147          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1148    }
1149
1150    brw_SAMPLE(p,
1151               dst_retyped,
1152               1,
1153               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1154               SURF_INDEX_TEXTURE(sampler),
1155               sampler,
1156               dst_flags & WRITEMASK_XYZW,
1157               msg_type,
1158               response_length,
1159               cur_mrf - 1,
1160               0,
1161               1,
1162               simd_mode);
1163 }
1164
1165
1166 void emit_txb(struct brw_wm_compile *c,
1167               struct brw_reg *dst,
1168               GLuint dst_flags,
1169               struct brw_reg *arg,
1170               struct brw_reg depth_payload,
1171               GLuint tex_idx,
1172               GLuint sampler)
1173 {
1174    struct brw_compile *p = &c->func;
1175    struct intel_context *intel = &p->brw->intel;
1176    GLuint msgLength;
1177    GLuint msg_type;
1178    GLuint mrf_per_channel;
1179    GLuint response_length;
1180    struct brw_reg dst_retyped;
1181
1182    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1183     * samples, so we'll use the 16-wide instruction, leave the second halves
1184     * undefined, and trust the execution mask to keep the undefined pixels
1185     * from mattering.
1186     */
1187    if (c->dispatch_width == 16 || intel->gen < 5) {
1188       if (intel->gen >= 5)
1189          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1190       else
1191          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1192       mrf_per_channel = 2;
1193       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1194       response_length = 8;
1195    } else {
1196       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1197       mrf_per_channel = 1;
1198       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1199       response_length = 4;
1200    }
1201
1202    /* Shadow ignored for txb. */
1203    switch (tex_idx) {
1204    case TEXTURE_1D_INDEX:
1205       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1206       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1207       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1208       break;
1209    case TEXTURE_2D_INDEX:
1210    case TEXTURE_RECT_INDEX:
1211       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1212       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1213       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1214       break;
1215    case TEXTURE_3D_INDEX:
1216    case TEXTURE_CUBE_INDEX:
1217       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1218       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1219       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1220       break;
1221    default:
1222       /* unexpected target */
1223       abort();
1224    }
1225
1226    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1227    msgLength = 2 + 4 * mrf_per_channel - 1;
1228
1229    brw_SAMPLE(p,
1230               dst_retyped,
1231               1,
1232               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1233               SURF_INDEX_TEXTURE(sampler),
1234               sampler,
1235               dst_flags & WRITEMASK_XYZW,
1236               msg_type,
1237               response_length,
1238               msgLength,
1239               0,
1240               1,
1241               BRW_SAMPLER_SIMD_MODE_SIMD16);
1242 }
1243
1244
1245 static void emit_lit(struct brw_wm_compile *c,
1246                      const struct brw_reg *dst,
1247                      GLuint mask,
1248                      const struct brw_reg *arg0)
1249 {
1250    struct brw_compile *p = &c->func;
1251
1252    assert((mask & WRITEMASK_XW) == 0);
1253
1254    if (mask & WRITEMASK_Y) {
1255       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1256       brw_MOV(p, dst[1], arg0[0]);
1257       brw_set_saturate(p, 0);
1258    }
1259
1260    if (mask & WRITEMASK_Z) {
1261       emit_math2(c, BRW_MATH_FUNCTION_POW,
1262                  &dst[2],
1263                  WRITEMASK_X | (mask & SATURATE),
1264                  &arg0[1],
1265                  &arg0[3]);
1266    }
1267
1268    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1269     * some of the POW calculations above, but 16-wide iff statements
1270     * seem to lock c1 hardware, so this is a nasty workaround:
1271     */
1272    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1273    {
1274       if (mask & WRITEMASK_Y)
1275          brw_MOV(p, dst[1], brw_imm_f(0));
1276
1277       if (mask & WRITEMASK_Z)
1278          brw_MOV(p, dst[2], brw_imm_f(0));
1279    }
1280    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1281 }
1282
1283
1284 /* Kill pixel - set execution mask to zero for those pixels which
1285  * fail.
1286  */
1287 static void emit_kil( struct brw_wm_compile *c,
1288                       struct brw_reg *arg0)
1289 {
1290    struct brw_compile *p = &c->func;
1291    struct intel_context *intel = &p->brw->intel;
1292    struct brw_reg pixelmask;
1293    GLuint i, j;
1294
1295    if (intel->gen >= 6)
1296       pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1297    else
1298       pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1299
1300    for (i = 0; i < 4; i++) {
1301       /* Check if we've already done the comparison for this reg
1302        * -- common when someone does KIL TEMP.wwww.
1303        */
1304       for (j = 0; j < i; j++) {
1305          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1306             break;
1307       }
1308       if (j != i)
1309          continue;
1310
1311       brw_push_insn_state(p);
1312       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1313       brw_set_predicate_control_flag_value(p, 0xff);
1314       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1315       brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1316       brw_pop_insn_state(p);
1317    }
1318 }
1319
1320 static void fire_fb_write( struct brw_wm_compile *c,
1321                            GLuint base_reg,
1322                            GLuint nr,
1323                            GLuint target,
1324                            GLuint eot )
1325 {
1326    struct brw_compile *p = &c->func;
1327    struct intel_context *intel = &p->brw->intel;
1328    struct brw_reg dst;
1329
1330    if (c->dispatch_width == 16)
1331       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1332    else
1333       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1334
1335    /* Pass through control information:
1336     *
1337     * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1338     */
1339 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1340    if (intel->gen < 6)
1341    {
1342       brw_push_insn_state(p);
1343       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1344       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1345       brw_MOV(p,
1346                brw_message_reg(base_reg + 1),
1347                brw_vec8_grf(1, 0));
1348       brw_pop_insn_state(p);
1349    }
1350
1351    /* Send framebuffer write message: */
1352 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1353    brw_fb_WRITE(p,
1354                 c->dispatch_width,
1355                 dst,
1356                 base_reg,
1357                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1358                 target,
1359                 nr,
1360                 0,
1361                 eot,
1362                 GL_TRUE);
1363 }
1364
1365
1366 static void emit_aa( struct brw_wm_compile *c,
1367                      struct brw_reg *arg1,
1368                      GLuint reg )
1369 {
1370    struct brw_compile *p = &c->func;
1371    GLuint comp = c->aa_dest_stencil_reg / 2;
1372    GLuint off = c->aa_dest_stencil_reg % 2;
1373    struct brw_reg aa = offset(arg1[comp], off);
1374
1375    brw_push_insn_state(p);
1376    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1377    brw_MOV(p, brw_message_reg(reg), aa);
1378    brw_pop_insn_state(p);
1379 }
1380
1381
1382 /* Post-fragment-program processing.  Send the results to the
1383  * framebuffer.
1384  * \param arg0  the fragment color
1385  * \param arg1  the pass-through depth value
1386  * \param arg2  the shader-computed depth value
1387  */
1388 void emit_fb_write(struct brw_wm_compile *c,
1389                    struct brw_reg *arg0,
1390                    struct brw_reg *arg1,
1391                    struct brw_reg *arg2,
1392                    GLuint target,
1393                    GLuint eot)
1394 {
1395    struct brw_compile *p = &c->func;
1396    struct brw_context *brw = p->brw;
1397    struct intel_context *intel = &brw->intel;
1398    GLuint nr = 2;
1399    GLuint channel;
1400
1401    /* Reserve a space for AA - may not be needed:
1402     */
1403    if (c->aa_dest_stencil_reg)
1404       nr += 1;
1405
1406    /* I don't really understand how this achieves the color interleave
1407     * (ie RGBARGBA) in the result:  [Do the saturation here]
1408     */
1409    brw_push_insn_state(p);
1410
1411    for (channel = 0; channel < 4; channel++) {
1412       if (intel->gen >= 6) {
1413          /* gen6 SIMD16 single source DP write looks like:
1414           * m + 0: r0
1415           * m + 1: r1
1416           * m + 2: g0
1417           * m + 3: g1
1418           * m + 4: b0
1419           * m + 5: b1
1420           * m + 6: a0
1421           * m + 7: a1
1422           */
1423          if (c->dispatch_width == 16) {
1424             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1425          } else {
1426             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1427          }
1428       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1429          /* pre-gen6 SIMD16 single source DP write looks like:
1430           * m + 0: r0
1431           * m + 1: g0
1432           * m + 2: b0
1433           * m + 3: a0
1434           * m + 4: r1
1435           * m + 5: g1
1436           * m + 6: b1
1437           * m + 7: a1
1438           *
1439           * By setting the high bit of the MRF register number, we indicate
1440           * that we want COMPR4 mode - instead of doing the usual destination
1441           * + 1 for the second half we get destination + 4.
1442           */
1443          brw_MOV(p,
1444                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1445                  arg0[channel]);
1446       } else {
1447          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1448          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1449          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1450          brw_MOV(p,
1451                  brw_message_reg(nr + channel),
1452                  arg0[channel]);
1453
1454          if (c->dispatch_width == 16) {
1455             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1456             brw_MOV(p,
1457                     brw_message_reg(nr + channel + 4),
1458                     sechalf(arg0[channel]));
1459          }
1460       }
1461    }
1462    /* skip over the regs populated above:
1463     */
1464    if (c->dispatch_width == 16)
1465       nr += 8;
1466    else
1467       nr += 4;
1468
1469    brw_pop_insn_state(p);
1470
1471    if (c->source_depth_to_render_target)
1472    {
1473       if (c->computes_depth)
1474          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1475       else
1476          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1477
1478       nr += 2;
1479    }
1480
1481    if (c->dest_depth_reg)
1482    {
1483       GLuint comp = c->dest_depth_reg / 2;
1484       GLuint off = c->dest_depth_reg % 2;
1485
1486       if (off != 0) {
1487          brw_push_insn_state(p);
1488          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1489
1490          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1491          /* 2nd half? */
1492          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1493          brw_pop_insn_state(p);
1494       }
1495       else {
1496          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1497       }
1498       nr += 2;
1499    }
1500
1501    if (intel->gen >= 6) {
1502       /* Load the message header.  There's no implied move from src0
1503        * to the base mrf on gen6.
1504        */
1505       brw_push_insn_state(p);
1506       brw_set_mask_control(p, BRW_MASK_DISABLE);
1507       brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1508               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1509       brw_pop_insn_state(p);
1510
1511       if (target != 0) {
1512          brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1513                                         0,
1514                                         2), BRW_REGISTER_TYPE_UD),
1515                  brw_imm_ud(target));
1516       }
1517    }
1518
1519    if (!c->runtime_check_aads_emit) {
1520       if (c->aa_dest_stencil_reg)
1521          emit_aa(c, arg1, 2);
1522
1523       fire_fb_write(c, 0, nr, target, eot);
1524    }
1525    else {
1526       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1527       struct brw_reg ip = brw_ip_reg();
1528       struct brw_instruction *jmp;
1529
1530       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1531       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1532       brw_AND(p,
1533               v1_null_ud,
1534               get_element_ud(brw_vec8_grf(1,0), 6),
1535               brw_imm_ud(1<<26));
1536
1537       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1538       {
1539          emit_aa(c, arg1, 2);
1540          fire_fb_write(c, 0, nr, target, eot);
1541          /* note - thread killed in subroutine */
1542       }
1543       brw_land_fwd_jump(p, jmp);
1544
1545       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1546        */
1547       fire_fb_write(c, 1, nr-1, target, eot);
1548    }
1549 }
1550
1551 /**
1552  * Move a GPR to scratch memory.
1553  */
1554 static void emit_spill( struct brw_wm_compile *c,
1555                         struct brw_reg reg,
1556                         GLuint slot )
1557 {
1558    struct brw_compile *p = &c->func;
1559
1560    /*
1561      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1562    */
1563    brw_MOV(p, brw_message_reg(2), reg);
1564
1565    /*
1566      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1567      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1568    */
1569    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1570 }
1571
1572
1573 /**
1574  * Load a GPR from scratch memory.
1575  */
1576 static void emit_unspill( struct brw_wm_compile *c,
1577                           struct brw_reg reg,
1578                           GLuint slot )
1579 {
1580    struct brw_compile *p = &c->func;
1581
1582    /* Slot 0 is the undef value.
1583     */
1584    if (slot == 0) {
1585       brw_MOV(p, reg, brw_imm_f(0));
1586       return;
1587    }
1588
1589    /*
1590      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1591      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1592    */
1593
1594    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1595 }
1596
1597
1598 /**
1599  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1600  * Args with unspill_reg != 0 will be loaded from scratch memory.
1601  */
1602 static void get_argument_regs( struct brw_wm_compile *c,
1603                                struct brw_wm_ref *arg[],
1604                                struct brw_reg *regs )
1605 {
1606    GLuint i;
1607
1608    for (i = 0; i < 4; i++) {
1609       if (arg[i]) {
1610          if (arg[i]->unspill_reg)
1611             emit_unspill(c,
1612                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1613                          arg[i]->value->spill_slot);
1614
1615          regs[i] = arg[i]->hw_reg;
1616       }
1617       else {
1618          regs[i] = brw_null_reg();
1619       }
1620    }
1621 }
1622
1623
1624 /**
1625  * For values that have a spill_slot!=0, write those regs to scratch memory.
1626  */
1627 static void spill_values( struct brw_wm_compile *c,
1628                           struct brw_wm_value *values,
1629                           GLuint nr )
1630 {
1631    GLuint i;
1632
1633    for (i = 0; i < nr; i++)
1634       if (values[i].spill_slot)
1635          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1636 }
1637
1638
1639 /* Emit the fragment program instructions here.
1640  */
1641 void brw_wm_emit( struct brw_wm_compile *c )
1642 {
1643    struct brw_compile *p = &c->func;
1644    struct intel_context *intel = &p->brw->intel;
1645    GLuint insn;
1646
1647    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1648    if (intel->gen >= 6)
1649         brw_set_acc_write_control(p, 1);
1650
1651    /* Check if any of the payload regs need to be spilled:
1652     */
1653    spill_values(c, c->payload.depth, 4);
1654    spill_values(c, c->creg, c->nr_creg);
1655    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1656
1657
1658    for (insn = 0; insn < c->nr_insns; insn++) {
1659
1660       struct brw_wm_instruction *inst = &c->instruction[insn];
1661       struct brw_reg args[3][4], dst[4];
1662       GLuint i, dst_flags;
1663
1664       /* Get argument regs:
1665        */
1666       for (i = 0; i < 3; i++)
1667          get_argument_regs(c, inst->src[i], args[i]);
1668
1669       /* Get dest regs:
1670        */
1671       for (i = 0; i < 4; i++)
1672          if (inst->dst[i])
1673             dst[i] = inst->dst[i]->hw_reg;
1674          else
1675             dst[i] = brw_null_reg();
1676
1677       /* Flags
1678        */
1679       dst_flags = inst->writemask;
1680       if (inst->saturate)
1681          dst_flags |= SATURATE;
1682
1683       switch (inst->opcode) {
1684          /* Generated instructions for calculating triangle interpolants:
1685           */
1686       case WM_PIXELXY:
1687          emit_pixel_xy(c, dst, dst_flags);
1688          break;
1689
1690       case WM_DELTAXY:
1691          emit_delta_xy(p, dst, dst_flags, args[0]);
1692          break;
1693
1694       case WM_WPOSXY:
1695          emit_wpos_xy(c, dst, dst_flags, args[0]);
1696          break;
1697
1698       case WM_PIXELW:
1699          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1700          break;
1701
1702       case WM_LINTERP:
1703          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1704          break;
1705
1706       case WM_PINTERP:
1707          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1708          break;
1709
1710       case WM_CINTERP:
1711          emit_cinterp(p, dst, dst_flags, args[0]);
1712          break;
1713
1714       case WM_FB_WRITE:
1715          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1716          break;
1717
1718       case WM_FRONTFACING:
1719          emit_frontfacing(p, dst, dst_flags);
1720          break;
1721
1722          /* Straightforward arithmetic:
1723           */
1724       case OPCODE_ADD:
1725          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1726          break;
1727
1728       case OPCODE_FRC:
1729          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1730          break;
1731
1732       case OPCODE_FLR:
1733          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1734          break;
1735
1736       case OPCODE_DDX:
1737          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1738          break;
1739
1740       case OPCODE_DDY:
1741          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1742          break;
1743
1744       case OPCODE_DP2:
1745          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1746          break;
1747
1748       case OPCODE_DP3:
1749          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1750          break;
1751
1752       case OPCODE_DP4:
1753          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1754          break;
1755
1756       case OPCODE_DPH:
1757          emit_dph(p, dst, dst_flags, args[0], args[1]);
1758          break;
1759
1760       case OPCODE_TRUNC:
1761          for (i = 0; i < 4; i++) {
1762             if (dst_flags & (1<<i)) {
1763                brw_RNDZ(p, dst[i], args[0][i]);
1764             }
1765          }
1766          break;
1767
1768       case OPCODE_LRP:
1769          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1770          break;
1771
1772       case OPCODE_MAD:
1773          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1774          break;
1775
1776       case OPCODE_MOV:
1777       case OPCODE_SWZ:
1778          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1779          break;
1780
1781       case OPCODE_MUL:
1782          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1783          break;
1784
1785       case OPCODE_XPD:
1786          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1787          break;
1788
1789          /* Higher math functions:
1790           */
1791       case OPCODE_RCP:
1792          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1793          break;
1794
1795       case OPCODE_RSQ:
1796          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1797          break;
1798
1799       case OPCODE_SIN:
1800          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1801          break;
1802
1803       case OPCODE_COS:
1804          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1805          break;
1806
1807       case OPCODE_EX2:
1808          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1809          break;
1810
1811       case OPCODE_LG2:
1812          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1813          break;
1814
1815       case OPCODE_SCS:
1816          /* There is an scs math function, but it would need some
1817           * fixup for 16-element execution.
1818           */
1819          if (dst_flags & WRITEMASK_X)
1820             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1821          if (dst_flags & WRITEMASK_Y)
1822             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1823          break;
1824
1825       case OPCODE_POW:
1826          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1827          break;
1828
1829          /* Comparisons:
1830           */
1831       case OPCODE_CMP:
1832          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1833          break;
1834
1835       case OPCODE_MAX:
1836          emit_max(p, dst, dst_flags, args[0], args[1]);
1837          break;
1838
1839       case OPCODE_MIN:
1840          emit_min(p, dst, dst_flags, args[0], args[1]);
1841          break;
1842
1843       case OPCODE_SLT:
1844          emit_slt(p, dst, dst_flags, args[0], args[1]);
1845          break;
1846
1847       case OPCODE_SLE:
1848          emit_sle(p, dst, dst_flags, args[0], args[1]);
1849         break;
1850       case OPCODE_SGT:
1851          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1852         break;
1853       case OPCODE_SGE:
1854          emit_sge(p, dst, dst_flags, args[0], args[1]);
1855          break;
1856       case OPCODE_SEQ:
1857          emit_seq(p, dst, dst_flags, args[0], args[1]);
1858         break;
1859       case OPCODE_SNE:
1860          emit_sne(p, dst, dst_flags, args[0], args[1]);
1861         break;
1862
1863       case OPCODE_SSG:
1864          emit_sign(p, dst, dst_flags, args[0]);
1865          break;
1866
1867       case OPCODE_LIT:
1868          emit_lit(c, dst, dst_flags, args[0]);
1869          break;
1870
1871          /* Texturing operations:
1872           */
1873       case OPCODE_TEX:
1874          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1875                   inst->tex_idx, inst->tex_unit,
1876                   inst->tex_shadow);
1877          break;
1878
1879       case OPCODE_TXB:
1880          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1881                   inst->tex_idx, inst->tex_unit);
1882          break;
1883
1884       case OPCODE_KIL:
1885          emit_kil(c, args[0]);
1886          break;
1887
1888       default:
1889          printf("Unsupported opcode %i (%s) in fragment shader\n",
1890                 inst->opcode, inst->opcode < MAX_OPCODE ?
1891                 _mesa_opcode_string(inst->opcode) :
1892                 "unknown");
1893       }
1894
1895       for (i = 0; i < 4; i++)
1896         if (inst->dst[i] && inst->dst[i]->spill_slot)
1897            emit_spill(c,
1898                       inst->dst[i]->hw_reg,
1899                       inst->dst[i]->spill_slot);
1900    }
1901
1902    /* Only properly tested on ILK */
1903    if (p->brw->intel.gen == 5) {
1904      brw_remove_duplicate_mrf_moves(p);
1905      if (c->dispatch_width == 16)
1906         brw_remove_grf_to_mrf_moves(p);
1907    }
1908
1909    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1910       int i;
1911
1912      printf("wm-native:\n");
1913      for (i = 0; i < p->nr_insn; i++)
1914          brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1915       printf("\n");
1916    }
1917 }
1918