src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static bool
  38 can_do_pln(struct intel_context *intel, const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return false;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return false;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return false;
  50
  51    return true;
  52 }
  53
  54 /* Return the SrcReg index of the channels that can be immediate float operands
  55  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  56  */
  57 bool
  58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  59 {
  60    int opcode_array[] = {
  61       [OPCODE_ADD] = 2,
  62       [OPCODE_CMP] = 3,
  63       [OPCODE_DP3] = 2,
  64       [OPCODE_DP4] = 2,
  65       [OPCODE_DPH] = 2,
  66       [OPCODE_MAX] = 2,
  67       [OPCODE_MIN] = 2,
  68       [OPCODE_MOV] = 1,
  69       [OPCODE_MUL] = 2,
  70       [OPCODE_SEQ] = 2,
  71       [OPCODE_SGE] = 2,
  72       [OPCODE_SGT] = 2,
  73       [OPCODE_SLE] = 2,
  74       [OPCODE_SLT] = 2,
  75       [OPCODE_SNE] = 2,
  76       [OPCODE_SWZ] = 1,
  77       [OPCODE_XPD] = 2,
  78    };
  79
  80    /* These opcodes get broken down in a way that allow two
  81     * args to be immediates.
  82     */
  83    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  84       if (arg == 1 || arg == 2)
  85          return true;
  86    }
  87
  88    if (opcode > ARRAY_SIZE(opcode_array))
  89       return false;
  90
  91    return arg == opcode_array[opcode] - 1;
  92 }
  93
  94 /**
  95  * Computes the screen-space x,y position of the pixels.
  96  *
  97  * This will be used by emit_delta_xy() or emit_wpos_xy() for
  98  * interpolation of attributes..
  99  *
 100  * Payload R0:
 101  *
 102  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 103  *         corresponding to each of the 16 execution channels.
 104  * R0.1..8 -- ?
 105  * R1.0 -- triangle vertex 0.X
 106  * R1.1 -- triangle vertex 0.Y
 107  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 108  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 109  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 110  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 111  * R1.6 -- ?
 112  * R1.7 -- ?
 113  * R1.8 -- ?
 114  */
 115 void emit_pixel_xy(struct brw_wm_compile *c,
 116                    const struct brw_reg *dst,
 117                    GLuint mask)
 118 {
 119    struct brw_compile *p = &c->func;
 120    struct brw_reg r1 = brw_vec1_grf(1, 0);
 121    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 122    struct brw_reg dst0_uw, dst1_uw;
 123
 124    brw_push_insn_state(p);
 125    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 126
 127    if (c->dispatch_width == 16) {
 128       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 129       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 130    } else {
 131       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 132       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 133    }
 134
 135    /* Calculate pixel centers by adding 1 or 0 to each of the
 136     * micro-tile coordinates passed in r1.
 137     */
 138    if (mask & WRITEMASK_X) {
 139       brw_ADD(p,
 140               dst0_uw,
 141               stride(suboffset(r1_uw, 4), 2, 4, 0),
 142               brw_imm_v(0x10101010));
 143    }
 144
 145    if (mask & WRITEMASK_Y) {
 146       brw_ADD(p,
 147               dst1_uw,
 148               stride(suboffset(r1_uw,5), 2, 4, 0),
 149               brw_imm_v(0x11001100));
 150    }
 151    brw_pop_insn_state(p);
 152 }
 153
 154 /**
 155  * Computes the screen-space x,y distance of the pixels from the start
 156  * vertex.
 157  *
 158  * This will be used in linterp or pinterp with the start vertex value
 159  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 160  * to produce interpolated attribute values.
 161  */
 162 void emit_delta_xy(struct brw_compile *p,
 163                    const struct brw_reg *dst,
 164                    GLuint mask,
 165                    const struct brw_reg *arg0)
 166 {
 167    struct intel_context *intel = &p->brw->intel;
 168    struct brw_reg r1 = brw_vec1_grf(1, 0);
 169
 170    if (mask == 0)
 171       return;
 172
 173    assert(mask == WRITEMASK_XY);
 174
 175    if (intel->gen >= 6) {
 176        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
 177           Just add them with 0.0 for dst reg.. */
 178        r1 = brw_imm_v(0x00000000);
 179        brw_ADD(p,
 180                dst[0],
 181                retype(arg0[0], BRW_REGISTER_TYPE_UW),
 182                r1);
 183        brw_ADD(p,
 184                dst[1],
 185                retype(arg0[1], BRW_REGISTER_TYPE_UW),
 186                r1);
 187        return;
 188    }
 189
 190    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 191     * centers produced by emit_pixel_xy().
 192     */
 193    brw_ADD(p,
 194            dst[0],
 195            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 196            negate(r1));
 197    brw_ADD(p,
 198            dst[1],
 199            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 200            negate(suboffset(r1,1)));
 201 }
 202
 203 /**
 204  * Computes the pixel offset from the window origin for gl_FragCoord().
 205  */
 206 void emit_wpos_xy(struct brw_wm_compile *c,
 207                   const struct brw_reg *dst,
 208                   GLuint mask,
 209                   const struct brw_reg *arg0)
 210 {
 211    struct brw_compile *p = &c->func;
 212    struct intel_context *intel = &p->brw->intel;
 213    struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
 214    struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
 215
 216    if (mask & WRITEMASK_X) {
 217       if (intel->gen >= 6) {
 218          struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
 219          brw_MOV(p, delta_x_f, delta_x);
 220          delta_x = delta_x_f;
 221       }
 222
 223       if (c->fp->program.PixelCenterInteger) {
 224          /* X' = X */
 225          brw_MOV(p, dst[0], delta_x);
 226       } else {
 227          /* X' = X + 0.5 */
 228          brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
 229       }
 230    }
 231
 232    if (mask & WRITEMASK_Y) {
 233       if (intel->gen >= 6) {
 234          struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
 235          brw_MOV(p, delta_y_f, delta_y);
 236          delta_y = delta_y_f;
 237       }
 238
 239       if (c->fp->program.OriginUpperLeft) {
 240          if (c->fp->program.PixelCenterInteger) {
 241             /* Y' = Y */
 242             brw_MOV(p, dst[1], delta_y);
 243          } else {
 244             brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
 245          }
 246       } else {
 247          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 248
 249          /* Y' = (height - 1) - Y + center */
 250          brw_ADD(p, dst[1], negate(delta_y),
 251                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 252       }
 253    }
 254 }
 255
 256
 257 void emit_pixel_w(struct brw_wm_compile *c,
 258                   const struct brw_reg *dst,
 259                   GLuint mask,
 260                   const struct brw_reg *arg0,
 261                   const struct brw_reg *deltas)
 262 {
 263    struct brw_compile *p = &c->func;
 264    struct intel_context *intel = &p->brw->intel;
 265    struct brw_reg src;
 266    struct brw_reg temp_dst;
 267
 268    if (intel->gen >= 6)
 269         temp_dst = dst[3];
 270    else
 271         temp_dst = brw_message_reg(2);
 272
 273    assert(intel->gen < 6);
 274
 275    /* Don't need this if all you are doing is interpolating color, for
 276     * instance.
 277     */
 278    if (mask & WRITEMASK_W) {
 279       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 280
 281       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 282        * result straight into a message reg.
 283        */
 284       if (can_do_pln(intel, deltas)) {
 285          brw_PLN(p, temp_dst, interp3, deltas[0]);
 286       } else {
 287          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 288          brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
 289       }
 290
 291       /* Calc w */
 292       if (intel->gen >= 6)
 293          src = temp_dst;
 294       else
 295          src = brw_null_reg();
 296
 297       if (c->dispatch_width == 16) {
 298          brw_math_16(p, dst[3],
 299                      BRW_MATH_FUNCTION_INV,
 300                      2, src,
 301                      BRW_MATH_PRECISION_FULL);
 302       } else {
 303          brw_math(p, dst[3],
 304                   BRW_MATH_FUNCTION_INV,
 305                   2, src,
 306                   BRW_MATH_DATA_VECTOR,
 307                   BRW_MATH_PRECISION_FULL);
 308       }
 309    }
 310 }
 311
 312 void emit_linterp(struct brw_compile *p,
 313                   const struct brw_reg *dst,
 314                   GLuint mask,
 315                   const struct brw_reg *arg0,
 316                   const struct brw_reg *deltas)
 317 {
 318    struct intel_context *intel = &p->brw->intel;
 319    struct brw_reg interp[4];
 320    GLuint nr = arg0[0].nr;
 321    GLuint i;
 322
 323    interp[0] = brw_vec1_grf(nr, 0);
 324    interp[1] = brw_vec1_grf(nr, 4);
 325    interp[2] = brw_vec1_grf(nr+1, 0);
 326    interp[3] = brw_vec1_grf(nr+1, 4);
 327
 328    for (i = 0; i < 4; i++) {
 329       if (mask & (1<<i)) {
 330          if (intel->gen >= 6) {
 331             brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
 332          } else if (can_do_pln(intel, deltas)) {
 333             brw_PLN(p, dst[i], interp[i], deltas[0]);
 334          } else {
 335             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 336             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 337          }
 338       }
 339    }
 340 }
 341
 342
 343 void emit_pinterp(struct brw_compile *p,
 344                   const struct brw_reg *dst,
 345                   GLuint mask,
 346                   const struct brw_reg *arg0,
 347                   const struct brw_reg *deltas,
 348                   const struct brw_reg *w)
 349 {
 350    struct intel_context *intel = &p->brw->intel;
 351    struct brw_reg interp[4];
 352    GLuint nr = arg0[0].nr;
 353    GLuint i;
 354
 355    if (intel->gen >= 6) {
 356       emit_linterp(p, dst, mask, arg0, interp);
 357       return;
 358    }
 359
 360    interp[0] = brw_vec1_grf(nr, 0);
 361    interp[1] = brw_vec1_grf(nr, 4);
 362    interp[2] = brw_vec1_grf(nr+1, 0);
 363    interp[3] = brw_vec1_grf(nr+1, 4);
 364
 365    for (i = 0; i < 4; i++) {
 366       if (mask & (1<<i)) {
 367          if (can_do_pln(intel, deltas)) {
 368             brw_PLN(p, dst[i], interp[i], deltas[0]);
 369          } else {
 370             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 371             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 372          }
 373       }
 374    }
 375    for (i = 0; i < 4; i++) {
 376       if (mask & (1<<i)) {
 377          brw_MUL(p, dst[i], dst[i], w[3]);
 378       }
 379    }
 380 }
 381
 382
 383 void emit_cinterp(struct brw_compile *p,
 384                   const struct brw_reg *dst,
 385                   GLuint mask,
 386                   const struct brw_reg *arg0)
 387 {
 388    struct brw_reg interp[4];
 389    GLuint nr = arg0[0].nr;
 390    GLuint i;
 391
 392    interp[0] = brw_vec1_grf(nr, 0);
 393    interp[1] = brw_vec1_grf(nr, 4);
 394    interp[2] = brw_vec1_grf(nr+1, 0);
 395    interp[3] = brw_vec1_grf(nr+1, 4);
 396
 397    for (i = 0; i < 4; i++) {
 398       if (mask & (1<<i)) {
 399          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 400       }
 401    }
 402 }
 403
 404 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 405 void emit_frontfacing(struct brw_compile *p,
 406                       const struct brw_reg *dst,
 407                       GLuint mask)
 408 {
 409    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 410    GLuint i;
 411
 412    if (!(mask & WRITEMASK_XYZW))
 413       return;
 414
 415    for (i = 0; i < 4; i++) {
 416       if (mask & (1<<i)) {
 417          brw_MOV(p, dst[i], brw_imm_f(0.0));
 418       }
 419    }
 420
 421    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 422     * us front face
 423     */
 424    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 425    for (i = 0; i < 4; i++) {
 426       if (mask & (1<<i)) {
 427          brw_MOV(p, dst[i], brw_imm_f(1.0));
 428       }
 429    }
 430    brw_set_predicate_control_flag_value(p, 0xff);
 431 }
 432
 433 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 434  * looking like:
 435  *
 436  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 437  *
 438  * and we're trying to produce:
 439  *
 440  *           DDX                     DDY
 441  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 442  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 443  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 444  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 445  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 446  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 447  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 448  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 449  *
 450  * and add another set of two more subspans if in 16-pixel dispatch mode.
 451  *
 452  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 453  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 454  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 455  * between each other.  We could probably do it like ddx and swizzle the right
 456  * order later, but bail for now and just produce
 457  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 458  *
 459  * The negate_value boolean is used to negate the d/dy computation for FBOs,
 460  * since they place the origin at the upper left instead of the lower left.
 461  */
 462 void emit_ddxy(struct brw_compile *p,
 463                const struct brw_reg *dst,
 464                GLuint mask,
 465                bool is_ddx,
 466                const struct brw_reg *arg0,
 467                bool negate_value)
 468 {
 469    int i;
 470    struct brw_reg src0, src1;
 471
 472    if (mask & SATURATE)
 473       brw_set_saturate(p, 1);
 474    for (i = 0; i < 4; i++ ) {
 475       if (mask & (1<<i)) {
 476          if (is_ddx) {
 477             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 478                            BRW_REGISTER_TYPE_F,
 479                            BRW_VERTICAL_STRIDE_2,
 480                            BRW_WIDTH_2,
 481                            BRW_HORIZONTAL_STRIDE_0,
 482                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 483             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 484                            BRW_REGISTER_TYPE_F,
 485                            BRW_VERTICAL_STRIDE_2,
 486                            BRW_WIDTH_2,
 487                            BRW_HORIZONTAL_STRIDE_0,
 488                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 489          } else {
 490             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 491                            BRW_REGISTER_TYPE_F,
 492                            BRW_VERTICAL_STRIDE_4,
 493                            BRW_WIDTH_4,
 494                            BRW_HORIZONTAL_STRIDE_0,
 495                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 496             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 497                            BRW_REGISTER_TYPE_F,
 498                            BRW_VERTICAL_STRIDE_4,
 499                            BRW_WIDTH_4,
 500                            BRW_HORIZONTAL_STRIDE_0,
 501                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 502          }
 503          if (negate_value)
 504             brw_ADD(p, dst[i], src1, negate(src0));
 505          else
 506             brw_ADD(p, dst[i], src0, negate(src1));
 507       }
 508    }
 509    if (mask & SATURATE)
 510       brw_set_saturate(p, 0);
 511 }
 512
 513 void emit_alu1(struct brw_compile *p,
 514                struct brw_instruction *(*func)(struct brw_compile *,
 515                                                struct brw_reg,
 516                                                struct brw_reg),
 517                const struct brw_reg *dst,
 518                GLuint mask,
 519                const struct brw_reg *arg0)
 520 {
 521    GLuint i;
 522
 523    if (mask & SATURATE)
 524       brw_set_saturate(p, 1);
 525
 526    for (i = 0; i < 4; i++) {
 527       if (mask & (1<<i)) {
 528          func(p, dst[i], arg0[i]);
 529       }
 530    }
 531
 532    if (mask & SATURATE)
 533       brw_set_saturate(p, 0);
 534 }
 535
 536
 537 void emit_alu2(struct brw_compile *p,
 538                struct brw_instruction *(*func)(struct brw_compile *,
 539                                                struct brw_reg,
 540                                                struct brw_reg,
 541                                                struct brw_reg),
 542                const struct brw_reg *dst,
 543                GLuint mask,
 544                const struct brw_reg *arg0,
 545                const struct brw_reg *arg1)
 546 {
 547    GLuint i;
 548
 549    if (mask & SATURATE)
 550       brw_set_saturate(p, 1);
 551
 552    for (i = 0; i < 4; i++) {
 553       if (mask & (1<<i)) {
 554          func(p, dst[i], arg0[i], arg1[i]);
 555       }
 556    }
 557
 558    if (mask & SATURATE)
 559       brw_set_saturate(p, 0);
 560 }
 561
 562
 563 void emit_mad(struct brw_compile *p,
 564               const struct brw_reg *dst,
 565               GLuint mask,
 566               const struct brw_reg *arg0,
 567               const struct brw_reg *arg1,
 568               const struct brw_reg *arg2)
 569 {
 570    GLuint i;
 571
 572    for (i = 0; i < 4; i++) {
 573       if (mask & (1<<i)) {
 574          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 575
 576          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 577          brw_ADD(p, dst[i], dst[i], arg2[i]);
 578          brw_set_saturate(p, 0);
 579       }
 580    }
 581 }
 582
 583 void emit_lrp(struct brw_compile *p,
 584               const struct brw_reg *dst,
 585               GLuint mask,
 586               const struct brw_reg *arg0,
 587               const struct brw_reg *arg1,
 588               const struct brw_reg *arg2)
 589 {
 590    GLuint i;
 591
 592    /* Uses dst as a temporary:
 593     */
 594    for (i = 0; i < 4; i++) {
 595       if (mask & (1<<i)) {
 596          /* Can I use the LINE instruction for this?
 597           */
 598          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 599          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 600
 601          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 602          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 603          brw_set_saturate(p, 0);
 604       }
 605    }
 606 }
 607
 608 void emit_sop(struct brw_compile *p,
 609               const struct brw_reg *dst,
 610               GLuint mask,
 611               GLuint cond,
 612               const struct brw_reg *arg0,
 613               const struct brw_reg *arg1)
 614 {
 615    GLuint i;
 616
 617    for (i = 0; i < 4; i++) {
 618       if (mask & (1<<i)) {
 619          brw_push_insn_state(p);
 620          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 621          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 622          brw_MOV(p, dst[i], brw_imm_f(0));
 623          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 624          brw_MOV(p, dst[i], brw_imm_f(1.0));
 625          brw_pop_insn_state(p);
 626       }
 627    }
 628 }
 629
 630 static void emit_slt( struct brw_compile *p,
 631                       const struct brw_reg *dst,
 632                       GLuint mask,
 633                       const struct brw_reg *arg0,
 634                       const struct brw_reg *arg1 )
 635 {
 636    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 637 }
 638
 639 static void emit_sle( struct brw_compile *p,
 640                       const struct brw_reg *dst,
 641                       GLuint mask,
 642                       const struct brw_reg *arg0,
 643                       const struct brw_reg *arg1 )
 644 {
 645    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 646 }
 647
 648 static void emit_sgt( struct brw_compile *p,
 649                       const struct brw_reg *dst,
 650                       GLuint mask,
 651                       const struct brw_reg *arg0,
 652                       const struct brw_reg *arg1 )
 653 {
 654    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 655 }
 656
 657 static void emit_sge( struct brw_compile *p,
 658                       const struct brw_reg *dst,
 659                       GLuint mask,
 660                       const struct brw_reg *arg0,
 661                       const struct brw_reg *arg1 )
 662 {
 663    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 664 }
 665
 666 static void emit_seq( struct brw_compile *p,
 667                       const struct brw_reg *dst,
 668                       GLuint mask,
 669                       const struct brw_reg *arg0,
 670                       const struct brw_reg *arg1 )
 671 {
 672    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 673 }
 674
 675 static void emit_sne( struct brw_compile *p,
 676                       const struct brw_reg *dst,
 677                       GLuint mask,
 678                       const struct brw_reg *arg0,
 679                       const struct brw_reg *arg1 )
 680 {
 681    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 682 }
 683
 684 void emit_cmp(struct brw_compile *p,
 685               const struct brw_reg *dst,
 686               GLuint mask,
 687               const struct brw_reg *arg0,
 688               const struct brw_reg *arg1,
 689               const struct brw_reg *arg2)
 690 {
 691    GLuint i;
 692
 693    for (i = 0; i < 4; i++) {
 694       if (mask & (1<<i)) {
 695          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 696
 697          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 698          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 699          brw_set_saturate(p, 0);
 700          brw_set_predicate_control_flag_value(p, 0xff);
 701       }
 702    }
 703 }
 704
 705 void emit_sign(struct brw_compile *p,
 706                const struct brw_reg *dst,
 707                GLuint mask,
 708                const struct brw_reg *arg0)
 709 {
 710    GLuint i;
 711
 712    for (i = 0; i < 4; i++) {
 713       if (mask & (1<<i)) {
 714          brw_MOV(p, dst[i], brw_imm_f(0.0));
 715
 716          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 717          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 718          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 719
 720          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 721          brw_MOV(p, dst[i], brw_imm_f(1.0));
 722          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 723       }
 724    }
 725 }
 726
 727 void emit_max(struct brw_compile *p,
 728               const struct brw_reg *dst,
 729               GLuint mask,
 730               const struct brw_reg *arg0,
 731               const struct brw_reg *arg1)
 732 {
 733    GLuint i;
 734
 735    for (i = 0; i < 4; i++) {
 736       if (mask & (1<<i)) {
 737          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 738
 739          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 740          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 741          brw_set_saturate(p, 0);
 742          brw_set_predicate_control_flag_value(p, 0xff);
 743       }
 744    }
 745 }
 746
 747 void emit_min(struct brw_compile *p,
 748               const struct brw_reg *dst,
 749               GLuint mask,
 750               const struct brw_reg *arg0,
 751               const struct brw_reg *arg1)
 752 {
 753    GLuint i;
 754
 755    for (i = 0; i < 4; i++) {
 756       if (mask & (1<<i)) {
 757          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 758
 759          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 760          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 761          brw_set_saturate(p, 0);
 762          brw_set_predicate_control_flag_value(p, 0xff);
 763       }
 764    }
 765 }
 766
 767
 768 void emit_dp2(struct brw_compile *p,
 769               const struct brw_reg *dst,
 770               GLuint mask,
 771               const struct brw_reg *arg0,
 772               const struct brw_reg *arg1)
 773 {
 774    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 775
 776    if (!(mask & WRITEMASK_XYZW))
 777       return; /* Do not emit dead code */
 778
 779    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 780
 781    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 782
 783    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 784    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
 785    brw_set_saturate(p, 0);
 786 }
 787
 788
 789 void emit_dp3(struct brw_compile *p,
 790               const struct brw_reg *dst,
 791               GLuint mask,
 792               const struct brw_reg *arg0,
 793               const struct brw_reg *arg1)
 794 {
 795    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 796
 797    if (!(mask & WRITEMASK_XYZW))
 798       return; /* Do not emit dead code */
 799
 800    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 801
 802    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 803    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 804
 805    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 806    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 807    brw_set_saturate(p, 0);
 808 }
 809
 810
 811 void emit_dp4(struct brw_compile *p,
 812               const struct brw_reg *dst,
 813               GLuint mask,
 814               const struct brw_reg *arg0,
 815               const struct brw_reg *arg1)
 816 {
 817    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 818
 819    if (!(mask & WRITEMASK_XYZW))
 820       return; /* Do not emit dead code */
 821
 822    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 823
 824    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 825    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 826    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 827
 828    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 829    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 830    brw_set_saturate(p, 0);
 831 }
 832
 833
 834 void emit_dph(struct brw_compile *p,
 835               const struct brw_reg *dst,
 836               GLuint mask,
 837               const struct brw_reg *arg0,
 838               const struct brw_reg *arg1)
 839 {
 840    const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 841
 842    if (!(mask & WRITEMASK_XYZW))
 843       return; /* Do not emit dead code */
 844
 845    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 846
 847    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 848    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 849    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 850
 851    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 852    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 853    brw_set_saturate(p, 0);
 854 }
 855
 856
 857 void emit_xpd(struct brw_compile *p,
 858               const struct brw_reg *dst,
 859               GLuint mask,
 860               const struct brw_reg *arg0,
 861               const struct brw_reg *arg1)
 862 {
 863    GLuint i;
 864
 865    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 866
 867    for (i = 0 ; i < 3; i++) {
 868       if (mask & (1<<i)) {
 869          GLuint i2 = (i+2)%3;
 870          GLuint i1 = (i+1)%3;
 871
 872          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 873
 874          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 875          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 876          brw_set_saturate(p, 0);
 877       }
 878    }
 879 }
 880
 881
 882 void emit_math1(struct brw_wm_compile *c,
 883                 GLuint function,
 884                 const struct brw_reg *dst,
 885                 GLuint mask,
 886                 const struct brw_reg *arg0)
 887 {
 888    struct brw_compile *p = &c->func;
 889    struct intel_context *intel = &p->brw->intel;
 890    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 891    struct brw_reg src;
 892
 893    if (!(mask & WRITEMASK_XYZW))
 894       return; /* Do not emit dead code */
 895
 896    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 897
 898    if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
 899                             arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
 900                            arg0[0].negate || arg0[0].abs)) {
 901       /* Gen6 math requires that source and dst horizontal stride be 1,
 902        * and that the argument be in the GRF.
 903        *
 904        * The hardware ignores source modifiers (negate and abs) on math
 905        * instructions, so we also move to a temp to set those up.
 906        */
 907       src = dst[dst_chan];
 908       brw_MOV(p, src, arg0[0]);
 909    } else {
 910       src = arg0[0];
 911    }
 912
 913    /* Send two messages to perform all 16 operations:
 914     */
 915    brw_push_insn_state(p);
 916    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 917    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 918    brw_math(p,
 919             dst[dst_chan],
 920             function,
 921             2,
 922             src,
 923             BRW_MATH_DATA_VECTOR,
 924             BRW_MATH_PRECISION_FULL);
 925
 926    if (c->dispatch_width == 16) {
 927       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 928       brw_math(p,
 929                offset(dst[dst_chan],1),
 930                function,
 931                3,
 932                sechalf(src),
 933                BRW_MATH_DATA_VECTOR,
 934                BRW_MATH_PRECISION_FULL);
 935    }
 936    brw_pop_insn_state(p);
 937 }
 938
 939
 940 void emit_math2(struct brw_wm_compile *c,
 941                 GLuint function,
 942                 const struct brw_reg *dst,
 943                 GLuint mask,
 944                 const struct brw_reg *arg0,
 945                 const struct brw_reg *arg1)
 946 {
 947    struct brw_compile *p = &c->func;
 948    struct intel_context *intel = &p->brw->intel;
 949    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
 950
 951    if (!(mask & WRITEMASK_XYZW))
 952       return; /* Do not emit dead code */
 953
 954    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 955
 956    brw_push_insn_state(p);
 957
 958    /* math can only operate on up to a vec8 at a time, so in
 959     * dispatch_width==16 we have to do the second half manually.
 960     */
 961    if (intel->gen >= 6) {
 962       struct brw_reg src0 = arg0[0];
 963       struct brw_reg src1 = arg1[0];
 964       struct brw_reg temp_dst = dst[dst_chan];
 965
 966       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 967          brw_MOV(p, temp_dst, src0);
 968          src0 = temp_dst;
 969       }
 970
 971       if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
 972          /* This is a heinous hack to get a temporary register for use
 973           * in case both arg0 and arg1 are constants.  Why you're
 974           * doing exponentiation on constant values in the shader, we
 975           * don't know.
 976           *
 977           * max_wm_grf is almost surely less than the maximum GRF, and
 978           * gen6 doesn't care about the number of GRFs used in a
 979           * shader like pre-gen6 did.
 980           */
 981          struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
 982          brw_MOV(p, temp, src1);
 983          src1 = temp;
 984       }
 985
 986       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 987       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 988       brw_math2(p,
 989                 temp_dst,
 990                 function,
 991                 src0,
 992                 src1);
 993       if (c->dispatch_width == 16) {
 994          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 995          brw_math2(p,
 996                    sechalf(temp_dst),
 997                    function,
 998                    sechalf(src0),
 999                    sechalf(src1));
1000       }
1001    } else {
1002       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1003       brw_MOV(p, brw_message_reg(3), arg1[0]);
1004       if (c->dispatch_width == 16) {
1005          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1006          brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1007       }
1008
1009       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1010       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1011       brw_math(p,
1012                dst[dst_chan],
1013                function,
1014                2,
1015                arg0[0],
1016                BRW_MATH_DATA_VECTOR,
1017                BRW_MATH_PRECISION_FULL);
1018
1019       /* Send two messages to perform all 16 operations:
1020        */
1021       if (c->dispatch_width == 16) {
1022          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1023          brw_math(p,
1024                   offset(dst[dst_chan],1),
1025                   function,
1026                   4,
1027                   sechalf(arg0[0]),
1028                   BRW_MATH_DATA_VECTOR,
1029                   BRW_MATH_PRECISION_FULL);
1030       }
1031    }
1032    brw_pop_insn_state(p);
1033 }
1034
1035
1036 void emit_tex(struct brw_wm_compile *c,
1037               struct brw_reg *dst,
1038               GLuint dst_flags,
1039               struct brw_reg *arg,
1040               struct brw_reg depth_payload,
1041               GLuint tex_idx,
1042               GLuint sampler,
1043               bool shadow)
1044 {
1045    struct brw_compile *p = &c->func;
1046    struct intel_context *intel = &p->brw->intel;
1047    struct brw_reg dst_retyped;
1048    GLuint cur_mrf = 2, response_length;
1049    GLuint i, nr_texcoords;
1050    GLuint emit;
1051    GLuint msg_type;
1052    GLuint mrf_per_channel;
1053    GLuint simd_mode;
1054
1055    if (c->dispatch_width == 16) {
1056       mrf_per_channel = 2;
1057       response_length = 8;
1058       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1059       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1060    } else {
1061       mrf_per_channel = 1;
1062       response_length = 4;
1063       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1064       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1065    }
1066
1067    /* How many input regs are there?
1068     */
1069    switch (tex_idx) {
1070    case TEXTURE_1D_INDEX:
1071       emit = WRITEMASK_X;
1072       nr_texcoords = 1;
1073       break;
1074    case TEXTURE_2D_INDEX:
1075    case TEXTURE_1D_ARRAY_INDEX:
1076    case TEXTURE_RECT_INDEX:
1077    case TEXTURE_EXTERNAL_INDEX:
1078       emit = WRITEMASK_XY;
1079       nr_texcoords = 2;
1080       break;
1081    case TEXTURE_3D_INDEX:
1082    case TEXTURE_2D_ARRAY_INDEX:
1083    case TEXTURE_CUBE_INDEX:
1084       emit = WRITEMASK_XYZ;
1085       nr_texcoords = 3;
1086       break;
1087    default:
1088       /* unexpected target */
1089       abort();
1090    }
1091
1092    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1093    if (intel->gen < 5 && c->dispatch_width == 8)
1094       nr_texcoords = 3;
1095
1096    if (shadow) {
1097       if (intel->gen < 7) {
1098          /* For shadow comparisons, we have to supply u,v,r. */
1099          nr_texcoords = 3;
1100       } else {
1101          /* On Ivybridge, the shadow comparitor comes first. Just load it. */
1102          brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1103          cur_mrf += mrf_per_channel;
1104       }
1105    }
1106
1107    /* Emit the texcoords. */
1108    for (i = 0; i < nr_texcoords; i++) {
1109       if (c->key.tex.gl_clamp_mask[i] & (1 << sampler))
1110          brw_set_saturate(p, true);
1111
1112       if (emit & (1<<i))
1113          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1114       else
1115          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1116       cur_mrf += mrf_per_channel;
1117
1118       brw_set_saturate(p, false);
1119    }
1120
1121    /* Fill in the shadow comparison reference value. */
1122    if (shadow && intel->gen < 7) {
1123       if (intel->gen >= 5) {
1124          /* Fill in the cube map array index value. */
1125          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1126          cur_mrf += mrf_per_channel;
1127       } else if (c->dispatch_width == 8) {
1128          /* Fill in the LOD bias value. */
1129          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130          cur_mrf += mrf_per_channel;
1131       }
1132       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1133       cur_mrf += mrf_per_channel;
1134    }
1135
1136    if (intel->gen >= 5) {
1137       if (shadow)
1138          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1139       else
1140          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1141    } else {
1142       /* Note that G45 and older determines shadow compare and dispatch width
1143        * from message length for most messages.
1144        */
1145       if (c->dispatch_width == 16 && shadow)
1146          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1147       else
1148          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1149    }
1150
1151    brw_SAMPLE(p,
1152               dst_retyped,
1153               1,
1154               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1155               SURF_INDEX_TEXTURE(sampler),
1156               sampler,
1157               dst_flags & WRITEMASK_XYZW,
1158               msg_type,
1159               response_length,
1160               cur_mrf - 1,
1161               1,
1162               simd_mode,
1163               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1164 }
1165
1166
1167 void emit_txb(struct brw_wm_compile *c,
1168               struct brw_reg *dst,
1169               GLuint dst_flags,
1170               struct brw_reg *arg,
1171               struct brw_reg depth_payload,
1172               GLuint tex_idx,
1173               GLuint sampler)
1174 {
1175    struct brw_compile *p = &c->func;
1176    struct intel_context *intel = &p->brw->intel;
1177    GLuint msgLength;
1178    GLuint msg_type;
1179    GLuint mrf_per_channel;
1180    GLuint response_length;
1181    struct brw_reg dst_retyped;
1182
1183    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1184     * samples, so we'll use the 16-wide instruction, leave the second halves
1185     * undefined, and trust the execution mask to keep the undefined pixels
1186     * from mattering.
1187     */
1188    if (c->dispatch_width == 16 || intel->gen < 5) {
1189       if (intel->gen >= 5)
1190          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1191       else
1192          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1193       mrf_per_channel = 2;
1194       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1195       response_length = 8;
1196    } else {
1197       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1198       mrf_per_channel = 1;
1199       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1200       response_length = 4;
1201    }
1202
1203    /* Shadow ignored for txb. */
1204    switch (tex_idx) {
1205    case TEXTURE_1D_INDEX:
1206       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1207       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1208       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1209       break;
1210    case TEXTURE_2D_INDEX:
1211    case TEXTURE_RECT_INDEX:
1212    case TEXTURE_EXTERNAL_INDEX:
1213       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1214       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1215       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1216       break;
1217    case TEXTURE_3D_INDEX:
1218    case TEXTURE_CUBE_INDEX:
1219       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1220       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1221       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1222       break;
1223    default:
1224       /* unexpected target */
1225       abort();
1226    }
1227
1228    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1229    msgLength = 2 + 4 * mrf_per_channel - 1;
1230
1231    brw_SAMPLE(p,
1232               dst_retyped,
1233               1,
1234               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1235               SURF_INDEX_TEXTURE(sampler),
1236               sampler,
1237               dst_flags & WRITEMASK_XYZW,
1238               msg_type,
1239               response_length,
1240               msgLength,
1241               1,
1242               BRW_SAMPLER_SIMD_MODE_SIMD16,
1243               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1244 }
1245
1246
1247 static void emit_lit(struct brw_wm_compile *c,
1248                      const struct brw_reg *dst,
1249                      GLuint mask,
1250                      const struct brw_reg *arg0)
1251 {
1252    struct brw_compile *p = &c->func;
1253
1254    assert((mask & WRITEMASK_XW) == 0);
1255
1256    if (mask & WRITEMASK_Y) {
1257       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1258       brw_MOV(p, dst[1], arg0[0]);
1259       brw_set_saturate(p, 0);
1260    }
1261
1262    if (mask & WRITEMASK_Z) {
1263       emit_math2(c, BRW_MATH_FUNCTION_POW,
1264                  &dst[2],
1265                  WRITEMASK_X | (mask & SATURATE),
1266                  &arg0[1],
1267                  &arg0[3]);
1268    }
1269
1270    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1271     * some of the POW calculations above, but 16-wide iff statements
1272     * seem to lock c1 hardware, so this is a nasty workaround:
1273     */
1274    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1275    {
1276       if (mask & WRITEMASK_Y)
1277          brw_MOV(p, dst[1], brw_imm_f(0));
1278
1279       if (mask & WRITEMASK_Z)
1280          brw_MOV(p, dst[2], brw_imm_f(0));
1281    }
1282    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1283 }
1284
1285
1286 /* Kill pixel - set execution mask to zero for those pixels which
1287  * fail.
1288  */
1289 static void emit_kil( struct brw_wm_compile *c,
1290                       struct brw_reg *arg0)
1291 {
1292    struct brw_compile *p = &c->func;
1293    struct intel_context *intel = &p->brw->intel;
1294    struct brw_reg pixelmask;
1295    GLuint i, j;
1296
1297    if (intel->gen >= 6)
1298       pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1299    else
1300       pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1301
1302    for (i = 0; i < 4; i++) {
1303       /* Check if we've already done the comparison for this reg
1304        * -- common when someone does KIL TEMP.wwww.
1305        */
1306       for (j = 0; j < i; j++) {
1307          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1308             break;
1309       }
1310       if (j != i)
1311          continue;
1312
1313       brw_push_insn_state(p);
1314       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1315       brw_set_predicate_control_flag_value(p, 0xff);
1316       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1317       brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1318       brw_pop_insn_state(p);
1319    }
1320 }
1321
1322 static void fire_fb_write( struct brw_wm_compile *c,
1323                            GLuint base_reg,
1324                            GLuint nr,
1325                            GLuint target,
1326                            GLuint eot )
1327 {
1328    struct brw_compile *p = &c->func;
1329    struct intel_context *intel = &p->brw->intel;
1330    uint32_t msg_control;
1331
1332    /* Pass through control information:
1333     *
1334     * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1335     */
1336 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1337    if (intel->gen < 6)
1338    {
1339       brw_push_insn_state(p);
1340       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1341       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1342       brw_MOV(p,
1343                brw_message_reg(base_reg + 1),
1344                brw_vec8_grf(1, 0));
1345       brw_pop_insn_state(p);
1346    }
1347
1348    if (c->dispatch_width == 16)
1349       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1350    else
1351       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1352
1353    /* Send framebuffer write message: */
1354 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1355    brw_fb_WRITE(p,
1356                 c->dispatch_width,
1357                 base_reg,
1358                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1359                 msg_control,
1360                 target,
1361                 nr,
1362                 0,
1363                 eot,
1364                 true);
1365 }
1366
1367
1368 static void emit_aa( struct brw_wm_compile *c,
1369                      struct brw_reg *arg1,
1370                      GLuint reg )
1371 {
1372    struct brw_compile *p = &c->func;
1373    GLuint comp = c->aa_dest_stencil_reg / 2;
1374    GLuint off = c->aa_dest_stencil_reg % 2;
1375    struct brw_reg aa = offset(arg1[comp], off);
1376
1377    brw_push_insn_state(p);
1378    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1379    brw_MOV(p, brw_message_reg(reg), aa);
1380    brw_pop_insn_state(p);
1381 }
1382
1383
1384 /* Post-fragment-program processing.  Send the results to the
1385  * framebuffer.
1386  * \param arg0  the fragment color
1387  * \param arg1  the pass-through depth value
1388  * \param arg2  the shader-computed depth value
1389  */
1390 void emit_fb_write(struct brw_wm_compile *c,
1391                    struct brw_reg *arg0,
1392                    struct brw_reg *arg1,
1393                    struct brw_reg *arg2,
1394                    GLuint target,
1395                    GLuint eot)
1396 {
1397    struct brw_compile *p = &c->func;
1398    struct brw_context *brw = p->brw;
1399    struct intel_context *intel = &brw->intel;
1400    GLuint nr = 2;
1401    GLuint channel;
1402
1403    /* Reserve a space for AA - may not be needed:
1404     */
1405    if (c->aa_dest_stencil_reg)
1406       nr += 1;
1407
1408    /* I don't really understand how this achieves the color interleave
1409     * (ie RGBARGBA) in the result:  [Do the saturation here]
1410     */
1411    brw_push_insn_state(p);
1412
1413    if (c->key.clamp_fragment_color)
1414       brw_set_saturate(p, 1);
1415
1416    for (channel = 0; channel < 4; channel++) {
1417       if (intel->gen >= 6) {
1418          /* gen6 SIMD16 single source DP write looks like:
1419           * m + 0: r0
1420           * m + 1: r1
1421           * m + 2: g0
1422           * m + 3: g1
1423           * m + 4: b0
1424           * m + 5: b1
1425           * m + 6: a0
1426           * m + 7: a1
1427           */
1428          if (c->dispatch_width == 16) {
1429             brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1430          } else {
1431             brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1432          }
1433       } else if (c->dispatch_width == 16 && brw->has_compr4) {
1434          /* pre-gen6 SIMD16 single source DP write looks like:
1435           * m + 0: r0
1436           * m + 1: g0
1437           * m + 2: b0
1438           * m + 3: a0
1439           * m + 4: r1
1440           * m + 5: g1
1441           * m + 6: b1
1442           * m + 7: a1
1443           *
1444           * By setting the high bit of the MRF register number, we indicate
1445           * that we want COMPR4 mode - instead of doing the usual destination
1446           * + 1 for the second half we get destination + 4.
1447           */
1448          brw_MOV(p,
1449                  brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1450                  arg0[channel]);
1451       } else {
1452          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1453          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1454          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1455          brw_MOV(p,
1456                  brw_message_reg(nr + channel),
1457                  arg0[channel]);
1458
1459          if (c->dispatch_width == 16) {
1460             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1461             brw_MOV(p,
1462                     brw_message_reg(nr + channel + 4),
1463                     sechalf(arg0[channel]));
1464          }
1465       }
1466    }
1467
1468    brw_set_saturate(p, 0);
1469
1470    /* skip over the regs populated above:
1471     */
1472    if (c->dispatch_width == 16)
1473       nr += 8;
1474    else
1475       nr += 4;
1476
1477    brw_pop_insn_state(p);
1478
1479    if (c->source_depth_to_render_target)
1480    {
1481       if (c->computes_depth)
1482          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1483       else
1484          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1485
1486       nr += 2;
1487    }
1488
1489    if (c->dest_depth_reg)
1490    {
1491       GLuint comp = c->dest_depth_reg / 2;
1492       GLuint off = c->dest_depth_reg % 2;
1493
1494       if (off != 0) {
1495          brw_push_insn_state(p);
1496          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1497
1498          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1499          /* 2nd half? */
1500          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1501          brw_pop_insn_state(p);
1502       }
1503       else {
1504          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1505       }
1506       nr += 2;
1507    }
1508
1509    if (intel->gen >= 6) {
1510       /* Load the message header.  There's no implied move from src0
1511        * to the base mrf on gen6.
1512        */
1513       brw_push_insn_state(p);
1514       brw_set_mask_control(p, BRW_MASK_DISABLE);
1515       brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1516               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1517       brw_pop_insn_state(p);
1518
1519       if (target != 0) {
1520          brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1521                                         0,
1522                                         2), BRW_REGISTER_TYPE_UD),
1523                  brw_imm_ud(target));
1524       }
1525    }
1526
1527    if (!c->runtime_check_aads_emit) {
1528       if (c->aa_dest_stencil_reg)
1529          emit_aa(c, arg1, 2);
1530
1531       fire_fb_write(c, 0, nr, target, eot);
1532    }
1533    else {
1534       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1535       struct brw_reg ip = brw_ip_reg();
1536       int jmp;
1537
1538       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1539       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1540       brw_AND(p,
1541               v1_null_ud,
1542               get_element_ud(brw_vec8_grf(1,0), 6),
1543               brw_imm_ud(1<<26));
1544
1545       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store;
1546       {
1547          emit_aa(c, arg1, 2);
1548          fire_fb_write(c, 0, nr, target, eot);
1549          /* note - thread killed in subroutine */
1550       }
1551       brw_land_fwd_jump(p, jmp);
1552
1553       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1554        */
1555       fire_fb_write(c, 1, nr-1, target, eot);
1556    }
1557 }
1558
1559 /**
1560  * Move a GPR to scratch memory.
1561  */
1562 static void emit_spill( struct brw_wm_compile *c,
1563                         struct brw_reg reg,
1564                         GLuint slot )
1565 {
1566    struct brw_compile *p = &c->func;
1567
1568    /*
1569      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1570    */
1571    brw_MOV(p, brw_message_reg(2), reg);
1572
1573    /*
1574      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1575      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1576    */
1577    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1578 }
1579
1580
1581 /**
1582  * Load a GPR from scratch memory.
1583  */
1584 static void emit_unspill( struct brw_wm_compile *c,
1585                           struct brw_reg reg,
1586                           GLuint slot )
1587 {
1588    struct brw_compile *p = &c->func;
1589
1590    /* Slot 0 is the undef value.
1591     */
1592    if (slot == 0) {
1593       brw_MOV(p, reg, brw_imm_f(0));
1594       return;
1595    }
1596
1597    /*
1598      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1599      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1600    */
1601
1602    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1603 }
1604
1605
1606 /**
1607  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1608  * Args with unspill_reg != 0 will be loaded from scratch memory.
1609  */
1610 static void get_argument_regs( struct brw_wm_compile *c,
1611                                struct brw_wm_ref *arg[],
1612                                struct brw_reg *regs )
1613 {
1614    GLuint i;
1615
1616    for (i = 0; i < 4; i++) {
1617       if (arg[i]) {
1618          if (arg[i]->unspill_reg)
1619             emit_unspill(c,
1620                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1621                          arg[i]->value->spill_slot);
1622
1623          regs[i] = arg[i]->hw_reg;
1624       }
1625       else {
1626          regs[i] = brw_null_reg();
1627       }
1628    }
1629 }
1630
1631
1632 /**
1633  * For values that have a spill_slot!=0, write those regs to scratch memory.
1634  */
1635 static void spill_values( struct brw_wm_compile *c,
1636                           struct brw_wm_value *values,
1637                           GLuint nr )
1638 {
1639    GLuint i;
1640
1641    for (i = 0; i < nr; i++)
1642       if (values[i].spill_slot)
1643          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1644 }
1645
1646
1647 /* Emit the fragment program instructions here.
1648  */
1649 void brw_wm_emit( struct brw_wm_compile *c )
1650 {
1651    struct brw_compile *p = &c->func;
1652    struct intel_context *intel = &p->brw->intel;
1653    GLuint insn;
1654
1655    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1656    if (intel->gen >= 6)
1657         brw_set_acc_write_control(p, 1);
1658
1659    /* Check if any of the payload regs need to be spilled:
1660     */
1661    spill_values(c, c->payload.depth, 4);
1662    spill_values(c, c->creg, c->nr_creg);
1663    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1664
1665
1666    for (insn = 0; insn < c->nr_insns; insn++) {
1667
1668       struct brw_wm_instruction *inst = &c->instruction[insn];
1669       struct brw_reg args[3][4], dst[4];
1670       GLuint i, dst_flags;
1671
1672       /* Get argument regs:
1673        */
1674       for (i = 0; i < 3; i++)
1675          get_argument_regs(c, inst->src[i], args[i]);
1676
1677       /* Get dest regs:
1678        */
1679       for (i = 0; i < 4; i++)
1680          if (inst->dst[i])
1681             dst[i] = inst->dst[i]->hw_reg;
1682          else
1683             dst[i] = brw_null_reg();
1684
1685       /* Flags
1686        */
1687       dst_flags = inst->writemask;
1688       if (inst->saturate)
1689          dst_flags |= SATURATE;
1690
1691       switch (inst->opcode) {
1692          /* Generated instructions for calculating triangle interpolants:
1693           */
1694       case WM_PIXELXY:
1695          emit_pixel_xy(c, dst, dst_flags);
1696          break;
1697
1698       case WM_DELTAXY:
1699          emit_delta_xy(p, dst, dst_flags, args[0]);
1700          break;
1701
1702       case WM_WPOSXY:
1703          emit_wpos_xy(c, dst, dst_flags, args[0]);
1704          break;
1705
1706       case WM_PIXELW:
1707          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1708          break;
1709
1710       case WM_LINTERP:
1711          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1712          break;
1713
1714       case WM_PINTERP:
1715          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1716          break;
1717
1718       case WM_CINTERP:
1719          emit_cinterp(p, dst, dst_flags, args[0]);
1720          break;
1721
1722       case WM_FB_WRITE:
1723          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1724          break;
1725
1726       case WM_FRONTFACING:
1727          emit_frontfacing(p, dst, dst_flags);
1728          break;
1729
1730          /* Straightforward arithmetic:
1731           */
1732       case OPCODE_ADD:
1733          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1734          break;
1735
1736       case OPCODE_FRC:
1737          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1738          break;
1739
1740       case OPCODE_FLR:
1741          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1742          break;
1743
1744       case OPCODE_DDX:
1745          emit_ddxy(p, dst, dst_flags, true, args[0], false);
1746          break;
1747
1748       case OPCODE_DDY:
1749          /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no
1750           * guarantee that c->key.render_to_fbo is set).
1751           */
1752          assert(c->fp->program.UsesDFdy);
1753          emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo);
1754          break;
1755
1756       case OPCODE_DP2:
1757          emit_dp2(p, dst, dst_flags, args[0], args[1]);
1758          break;
1759
1760       case OPCODE_DP3:
1761          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1762          break;
1763
1764       case OPCODE_DP4:
1765          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1766          break;
1767
1768       case OPCODE_DPH:
1769          emit_dph(p, dst, dst_flags, args[0], args[1]);
1770          break;
1771
1772       case OPCODE_TRUNC:
1773          for (i = 0; i < 4; i++) {
1774             if (dst_flags & (1<<i)) {
1775                brw_RNDZ(p, dst[i], args[0][i]);
1776             }
1777          }
1778          break;
1779
1780       case OPCODE_LRP:
1781          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1782          break;
1783
1784       case OPCODE_MAD:
1785          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1786          break;
1787
1788       case OPCODE_MOV:
1789       case OPCODE_SWZ:
1790          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1791          break;
1792
1793       case OPCODE_MUL:
1794          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1795          break;
1796
1797       case OPCODE_XPD:
1798          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1799          break;
1800
1801          /* Higher math functions:
1802           */
1803       case OPCODE_RCP:
1804          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1805          break;
1806
1807       case OPCODE_RSQ:
1808          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1809          break;
1810
1811       case OPCODE_SIN:
1812          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1813          break;
1814
1815       case OPCODE_COS:
1816          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1817          break;
1818
1819       case OPCODE_EX2:
1820          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1821          break;
1822
1823       case OPCODE_LG2:
1824          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1825          break;
1826
1827       case OPCODE_SCS:
1828          /* There is an scs math function, but it would need some
1829           * fixup for 16-element execution.
1830           */
1831          if (dst_flags & WRITEMASK_X)
1832             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833          if (dst_flags & WRITEMASK_Y)
1834             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1835          break;
1836
1837       case OPCODE_POW:
1838          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1839          break;
1840
1841          /* Comparisons:
1842           */
1843       case OPCODE_CMP:
1844          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1845          break;
1846
1847       case OPCODE_MAX:
1848          emit_max(p, dst, dst_flags, args[0], args[1]);
1849          break;
1850
1851       case OPCODE_MIN:
1852          emit_min(p, dst, dst_flags, args[0], args[1]);
1853          break;
1854
1855       case OPCODE_SLT:
1856          emit_slt(p, dst, dst_flags, args[0], args[1]);
1857          break;
1858
1859       case OPCODE_SLE:
1860          emit_sle(p, dst, dst_flags, args[0], args[1]);
1861         break;
1862       case OPCODE_SGT:
1863          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1864         break;
1865       case OPCODE_SGE:
1866          emit_sge(p, dst, dst_flags, args[0], args[1]);
1867          break;
1868       case OPCODE_SEQ:
1869          emit_seq(p, dst, dst_flags, args[0], args[1]);
1870         break;
1871       case OPCODE_SNE:
1872          emit_sne(p, dst, dst_flags, args[0], args[1]);
1873         break;
1874
1875       case OPCODE_SSG:
1876          emit_sign(p, dst, dst_flags, args[0]);
1877          break;
1878
1879       case OPCODE_LIT:
1880          emit_lit(c, dst, dst_flags, args[0]);
1881          break;
1882
1883          /* Texturing operations:
1884           */
1885       case OPCODE_TEX:
1886          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1887                   inst->tex_idx, inst->tex_unit,
1888                   inst->tex_shadow);
1889          break;
1890
1891       case OPCODE_TXB:
1892          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1893                   inst->tex_idx, inst->tex_unit);
1894          break;
1895
1896       case OPCODE_KIL:
1897          emit_kil(c, args[0]);
1898          break;
1899
1900       default:
1901          printf("Unsupported opcode %i (%s) in fragment shader\n",
1902                 inst->opcode, inst->opcode < MAX_OPCODE ?
1903                 _mesa_opcode_string(inst->opcode) :
1904                 "unknown");
1905       }
1906
1907       for (i = 0; i < 4; i++)
1908         if (inst->dst[i] && inst->dst[i]->spill_slot)
1909            emit_spill(c,
1910                       inst->dst[i]->hw_reg,
1911                       inst->dst[i]->spill_slot);
1912    }
1913
1914    /* Only properly tested on ILK */
1915    if (p->brw->intel.gen == 5) {
1916      brw_remove_duplicate_mrf_moves(p);
1917      if (c->dispatch_width == 16)
1918         brw_remove_grf_to_mrf_moves(p);
1919    }
1920
1921    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1922       printf("wm-native:\n");
1923       brw_dump_compile(p, stdout, 0, p->next_insn_offset);
1924       printf("\n");
1925    }
1926 }
1927