src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64
  65 /* Payload R0:
  66  *
  67  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  68  *         corresponding to each of the 16 execution channels.
  69  * R0.1..8 -- ?
  70  * R1.0 -- triangle vertex 0.X
  71  * R1.1 -- triangle vertex 0.Y
  72  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  73  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  74  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  75  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  76  * R1.6 -- ?
  77  * R1.7 -- ?
  78  * R1.8 -- ?
  79  */
  80
  81 void emit_pixel_xy(struct brw_wm_compile *c,
  82                    const struct brw_reg *dst,
  83                    GLuint mask)
  84 {
  85    struct brw_compile *p = &c->func;
  86    struct brw_reg r1 = brw_vec1_grf(1, 0);
  87    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  88    struct brw_reg dst0_uw, dst1_uw;
  89
  90    brw_push_insn_state(p);
  91    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  92
  93    if (c->dispatch_width == 16) {
  94       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
  95       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
  96    } else {
  97       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
  98       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
  99    }
 100
 101    /* Calculate pixel centers by adding 1 or 0 to each of the
 102     * micro-tile coordinates passed in r1.
 103     */
 104    if (mask & WRITEMASK_X) {
 105       brw_ADD(p,
 106               dst0_uw,
 107               stride(suboffset(r1_uw, 4), 2, 4, 0),
 108               brw_imm_v(0x10101010));
 109    }
 110
 111    if (mask & WRITEMASK_Y) {
 112       brw_ADD(p,
 113               dst1_uw,
 114               stride(suboffset(r1_uw,5), 2, 4, 0),
 115               brw_imm_v(0x11001100));
 116    }
 117    brw_pop_insn_state(p);
 118 }
 119
 120
 121 void emit_delta_xy(struct brw_compile *p,
 122                    const struct brw_reg *dst,
 123                    GLuint mask,
 124                    const struct brw_reg *arg0)
 125 {
 126    struct brw_reg r1 = brw_vec1_grf(1, 0);
 127
 128    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 129     * centers.
 130     */
 131    if (mask & WRITEMASK_X) {
 132       brw_ADD(p,
 133               dst[0],
 134               retype(arg0[0], BRW_REGISTER_TYPE_UW),
 135               negate(r1));
 136    }
 137
 138    if (mask & WRITEMASK_Y) {
 139       brw_ADD(p,
 140               dst[1],
 141               retype(arg0[1], BRW_REGISTER_TYPE_UW),
 142               negate(suboffset(r1,1)));
 143
 144    }
 145 }
 146
 147 void emit_wpos_xy(struct brw_wm_compile *c,
 148                   const struct brw_reg *dst,
 149                   GLuint mask,
 150                   const struct brw_reg *arg0)
 151 {
 152    struct brw_compile *p = &c->func;
 153
 154    /* Calculate the pixel offset from window bottom left into destination
 155     * X and Y channels.
 156     */
 157    if (mask & WRITEMASK_X) {
 158       if (c->fp->program.PixelCenterInteger) {
 159          /* X' = X */
 160          brw_MOV(p,
 161                  dst[0],
 162                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 163       } else {
 164          /* X' = X + 0.5 */
 165          brw_ADD(p,
 166                  dst[0],
 167                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 168                  brw_imm_f(0.5));
 169       }
 170    }
 171
 172    if (mask & WRITEMASK_Y) {
 173       if (c->fp->program.OriginUpperLeft) {
 174          if (c->fp->program.PixelCenterInteger) {
 175             /* Y' = Y */
 176             brw_MOV(p,
 177                     dst[1],
 178                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 179          } else {
 180             /* Y' = Y + 0.5 */
 181             brw_ADD(p,
 182                     dst[1],
 183                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 184                     brw_imm_f(0.5));
 185          }
 186       } else {
 187          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 188
 189          /* Y' = (height - 1) - Y + center */
 190          brw_ADD(p,
 191                  dst[1],
 192                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 193                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 194       }
 195    }
 196 }
 197
 198
 199 void emit_pixel_w(struct brw_wm_compile *c,
 200                   const struct brw_reg *dst,
 201                   GLuint mask,
 202                   const struct brw_reg *arg0,
 203                   const struct brw_reg *deltas)
 204 {
 205    struct brw_compile *p = &c->func;
 206    struct intel_context *intel = &p->brw->intel;
 207
 208    /* Don't need this if all you are doing is interpolating color, for
 209     * instance.
 210     */
 211    if (mask & WRITEMASK_W) {
 212       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 213
 214       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 215        * result straight into a message reg.
 216        */
 217       if (can_do_pln(intel, deltas)) {
 218          brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
 219       } else {
 220          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 221          brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 222       }
 223
 224       /* Calc w */
 225       if (c->dispatch_width == 16) {
 226          brw_math_16(p, dst[3],
 227                      BRW_MATH_FUNCTION_INV,
 228                      BRW_MATH_SATURATE_NONE,
 229                      2, brw_null_reg(),
 230                      BRW_MATH_PRECISION_FULL);
 231       } else {
 232          brw_math(p, dst[3],
 233                   BRW_MATH_FUNCTION_INV,
 234                   BRW_MATH_SATURATE_NONE,
 235                   2, brw_null_reg(),
 236                   BRW_MATH_DATA_VECTOR,
 237                   BRW_MATH_PRECISION_FULL);
 238       }
 239    }
 240 }
 241
 242
 243 void emit_linterp(struct brw_compile *p,
 244                   const struct brw_reg *dst,
 245                   GLuint mask,
 246                   const struct brw_reg *arg0,
 247                   const struct brw_reg *deltas)
 248 {
 249    struct intel_context *intel = &p->brw->intel;
 250    struct brw_reg interp[4];
 251    GLuint nr = arg0[0].nr;
 252    GLuint i;
 253
 254    interp[0] = brw_vec1_grf(nr, 0);
 255    interp[1] = brw_vec1_grf(nr, 4);
 256    interp[2] = brw_vec1_grf(nr+1, 0);
 257    interp[3] = brw_vec1_grf(nr+1, 4);
 258
 259    for (i = 0; i < 4; i++) {
 260       if (mask & (1<<i)) {
 261          if (can_do_pln(intel, deltas)) {
 262             brw_PLN(p, dst[i], interp[i], deltas[0]);
 263          } else {
 264             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 265             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 266          }
 267       }
 268    }
 269 }
 270
 271
 272 void emit_pinterp(struct brw_compile *p,
 273                   const struct brw_reg *dst,
 274                   GLuint mask,
 275                   const struct brw_reg *arg0,
 276                   const struct brw_reg *deltas,
 277                   const struct brw_reg *w)
 278 {
 279    struct intel_context *intel = &p->brw->intel;
 280    struct brw_reg interp[4];
 281    GLuint nr = arg0[0].nr;
 282    GLuint i;
 283
 284    interp[0] = brw_vec1_grf(nr, 0);
 285    interp[1] = brw_vec1_grf(nr, 4);
 286    interp[2] = brw_vec1_grf(nr+1, 0);
 287    interp[3] = brw_vec1_grf(nr+1, 4);
 288
 289    for (i = 0; i < 4; i++) {
 290       if (mask & (1<<i)) {
 291          if (can_do_pln(intel, deltas)) {
 292             brw_PLN(p, dst[i], interp[i], deltas[0]);
 293          } else {
 294             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 295             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 296          }
 297       }
 298    }
 299    for (i = 0; i < 4; i++) {
 300       if (mask & (1<<i)) {
 301          brw_MUL(p, dst[i], dst[i], w[3]);
 302       }
 303    }
 304 }
 305
 306
 307 void emit_cinterp(struct brw_compile *p,
 308                   const struct brw_reg *dst,
 309                   GLuint mask,
 310                   const struct brw_reg *arg0)
 311 {
 312    struct brw_reg interp[4];
 313    GLuint nr = arg0[0].nr;
 314    GLuint i;
 315
 316    interp[0] = brw_vec1_grf(nr, 0);
 317    interp[1] = brw_vec1_grf(nr, 4);
 318    interp[2] = brw_vec1_grf(nr+1, 0);
 319    interp[3] = brw_vec1_grf(nr+1, 4);
 320
 321    for (i = 0; i < 4; i++) {
 322       if (mask & (1<<i)) {
 323          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 324       }
 325    }
 326 }
 327
 328 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 329 void emit_frontfacing(struct brw_compile *p,
 330                       const struct brw_reg *dst,
 331                       GLuint mask)
 332 {
 333    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 334    GLuint i;
 335
 336    if (!(mask & WRITEMASK_XYZW))
 337       return;
 338
 339    for (i = 0; i < 4; i++) {
 340       if (mask & (1<<i)) {
 341          brw_MOV(p, dst[i], brw_imm_f(0.0));
 342       }
 343    }
 344
 345    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 346     * us front face
 347     */
 348    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 349    for (i = 0; i < 4; i++) {
 350       if (mask & (1<<i)) {
 351          brw_MOV(p, dst[i], brw_imm_f(1.0));
 352       }
 353    }
 354    brw_set_predicate_control_flag_value(p, 0xff);
 355 }
 356
 357 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 358  * looking like:
 359  *
 360  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 361  *
 362  * and we're trying to produce:
 363  *
 364  *           DDX                     DDY
 365  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 366  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 367  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 368  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 369  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 370  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 371  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 372  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 373  *
 374  * and add another set of two more subspans if in 16-pixel dispatch mode.
 375  *
 376  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 377  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 378  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 379  * between each other.  We could probably do it like ddx and swizzle the right
 380  * order later, but bail for now and just produce
 381  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 382  */
 383 void emit_ddxy(struct brw_compile *p,
 384                const struct brw_reg *dst,
 385                GLuint mask,
 386                GLboolean is_ddx,
 387                const struct brw_reg *arg0)
 388 {
 389    int i;
 390    struct brw_reg src0, src1;
 391
 392    if (mask & SATURATE)
 393       brw_set_saturate(p, 1);
 394    for (i = 0; i < 4; i++ ) {
 395       if (mask & (1<<i)) {
 396          if (is_ddx) {
 397             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 398                            BRW_REGISTER_TYPE_F,
 399                            BRW_VERTICAL_STRIDE_2,
 400                            BRW_WIDTH_2,
 401                            BRW_HORIZONTAL_STRIDE_0,
 402                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 403             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 404                            BRW_REGISTER_TYPE_F,
 405                            BRW_VERTICAL_STRIDE_2,
 406                            BRW_WIDTH_2,
 407                            BRW_HORIZONTAL_STRIDE_0,
 408                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 409          } else {
 410             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 411                            BRW_REGISTER_TYPE_F,
 412                            BRW_VERTICAL_STRIDE_4,
 413                            BRW_WIDTH_4,
 414                            BRW_HORIZONTAL_STRIDE_0,
 415                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 416             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 417                            BRW_REGISTER_TYPE_F,
 418                            BRW_VERTICAL_STRIDE_4,
 419                            BRW_WIDTH_4,
 420                            BRW_HORIZONTAL_STRIDE_0,
 421                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 422          }
 423          brw_ADD(p, dst[i], src0, negate(src1));
 424       }
 425    }
 426    if (mask & SATURATE)
 427       brw_set_saturate(p, 0);
 428 }
 429
 430 void emit_alu1(struct brw_compile *p,
 431                struct brw_instruction *(*func)(struct brw_compile *,
 432                                                struct brw_reg,
 433                                                struct brw_reg),
 434                const struct brw_reg *dst,
 435                GLuint mask,
 436                const struct brw_reg *arg0)
 437 {
 438    GLuint i;
 439
 440    if (mask & SATURATE)
 441       brw_set_saturate(p, 1);
 442
 443    for (i = 0; i < 4; i++) {
 444       if (mask & (1<<i)) {
 445          func(p, dst[i], arg0[i]);
 446       }
 447    }
 448
 449    if (mask & SATURATE)
 450       brw_set_saturate(p, 0);
 451 }
 452
 453
 454 void emit_alu2(struct brw_compile *p,
 455                struct brw_instruction *(*func)(struct brw_compile *,
 456                                                struct brw_reg,
 457                                                struct brw_reg,
 458                                                struct brw_reg),
 459                const struct brw_reg *dst,
 460                GLuint mask,
 461                const struct brw_reg *arg0,
 462                const struct brw_reg *arg1)
 463 {
 464    GLuint i;
 465
 466    if (mask & SATURATE)
 467       brw_set_saturate(p, 1);
 468
 469    for (i = 0; i < 4; i++) {
 470       if (mask & (1<<i)) {
 471          func(p, dst[i], arg0[i], arg1[i]);
 472       }
 473    }
 474
 475    if (mask & SATURATE)
 476       brw_set_saturate(p, 0);
 477 }
 478
 479
 480 void emit_mad(struct brw_compile *p,
 481               const struct brw_reg *dst,
 482               GLuint mask,
 483               const struct brw_reg *arg0,
 484               const struct brw_reg *arg1,
 485               const struct brw_reg *arg2)
 486 {
 487    GLuint i;
 488
 489    for (i = 0; i < 4; i++) {
 490       if (mask & (1<<i)) {
 491          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 492
 493          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 494          brw_ADD(p, dst[i], dst[i], arg2[i]);
 495          brw_set_saturate(p, 0);
 496       }
 497    }
 498 }
 499
 500 void emit_lrp(struct brw_compile *p,
 501               const struct brw_reg *dst,
 502               GLuint mask,
 503               const struct brw_reg *arg0,
 504               const struct brw_reg *arg1,
 505               const struct brw_reg *arg2)
 506 {
 507    GLuint i;
 508
 509    /* Uses dst as a temporary:
 510     */
 511    for (i = 0; i < 4; i++) {
 512       if (mask & (1<<i)) {
 513          /* Can I use the LINE instruction for this?
 514           */
 515          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 516          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 517
 518          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 519          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 520          brw_set_saturate(p, 0);
 521       }
 522    }
 523 }
 524
 525 void emit_sop(struct brw_compile *p,
 526               const struct brw_reg *dst,
 527               GLuint mask,
 528               GLuint cond,
 529               const struct brw_reg *arg0,
 530               const struct brw_reg *arg1)
 531 {
 532    GLuint i;
 533
 534    for (i = 0; i < 4; i++) {
 535       if (mask & (1<<i)) {
 536          brw_push_insn_state(p);
 537          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 538          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 539          brw_MOV(p, dst[i], brw_imm_f(0));
 540          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 541          brw_MOV(p, dst[i], brw_imm_f(1.0));
 542          brw_pop_insn_state(p);
 543       }
 544    }
 545 }
 546
 547 static void emit_slt( struct brw_compile *p,
 548                       const struct brw_reg *dst,
 549                       GLuint mask,
 550                       const struct brw_reg *arg0,
 551                       const struct brw_reg *arg1 )
 552 {
 553    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 554 }
 555
 556 static void emit_sle( struct brw_compile *p,
 557                       const struct brw_reg *dst,
 558                       GLuint mask,
 559                       const struct brw_reg *arg0,
 560                       const struct brw_reg *arg1 )
 561 {
 562    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 563 }
 564
 565 static void emit_sgt( struct brw_compile *p,
 566                       const struct brw_reg *dst,
 567                       GLuint mask,
 568                       const struct brw_reg *arg0,
 569                       const struct brw_reg *arg1 )
 570 {
 571    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 572 }
 573
 574 static void emit_sge( struct brw_compile *p,
 575                       const struct brw_reg *dst,
 576                       GLuint mask,
 577                       const struct brw_reg *arg0,
 578                       const struct brw_reg *arg1 )
 579 {
 580    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 581 }
 582
 583 static void emit_seq( struct brw_compile *p,
 584                       const struct brw_reg *dst,
 585                       GLuint mask,
 586                       const struct brw_reg *arg0,
 587                       const struct brw_reg *arg1 )
 588 {
 589    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 590 }
 591
 592 static void emit_sne( struct brw_compile *p,
 593                       const struct brw_reg *dst,
 594                       GLuint mask,
 595                       const struct brw_reg *arg0,
 596                       const struct brw_reg *arg1 )
 597 {
 598    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 599 }
 600
 601 void emit_cmp(struct brw_compile *p,
 602               const struct brw_reg *dst,
 603               GLuint mask,
 604               const struct brw_reg *arg0,
 605               const struct brw_reg *arg1,
 606               const struct brw_reg *arg2)
 607 {
 608    GLuint i;
 609
 610    for (i = 0; i < 4; i++) {
 611       if (mask & (1<<i)) {
 612          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 613          brw_MOV(p, dst[i], arg2[i]);
 614          brw_set_saturate(p, 0);
 615
 616          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 617
 618          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 619          brw_MOV(p, dst[i], arg1[i]);
 620          brw_set_saturate(p, 0);
 621          brw_set_predicate_control_flag_value(p, 0xff);
 622       }
 623    }
 624 }
 625
 626 void emit_max(struct brw_compile *p,
 627               const struct brw_reg *dst,
 628               GLuint mask,
 629               const struct brw_reg *arg0,
 630               const struct brw_reg *arg1)
 631 {
 632    GLuint i;
 633
 634    for (i = 0; i < 4; i++) {
 635       if (mask & (1<<i)) {
 636          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 637          brw_MOV(p, dst[i], arg0[i]);
 638          brw_set_saturate(p, 0);
 639
 640          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 641
 642          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 643          brw_MOV(p, dst[i], arg1[i]);
 644          brw_set_saturate(p, 0);
 645          brw_set_predicate_control_flag_value(p, 0xff);
 646       }
 647    }
 648 }
 649
 650 void emit_min(struct brw_compile *p,
 651               const struct brw_reg *dst,
 652               GLuint mask,
 653               const struct brw_reg *arg0,
 654               const struct brw_reg *arg1)
 655 {
 656    GLuint i;
 657
 658    for (i = 0; i < 4; i++) {
 659       if (mask & (1<<i)) {
 660          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 661          brw_MOV(p, dst[i], arg1[i]);
 662          brw_set_saturate(p, 0);
 663
 664          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 665
 666          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 667          brw_MOV(p, dst[i], arg0[i]);
 668          brw_set_saturate(p, 0);
 669          brw_set_predicate_control_flag_value(p, 0xff);
 670       }
 671    }
 672 }
 673
 674
 675 void emit_dp3(struct brw_compile *p,
 676               const struct brw_reg *dst,
 677               GLuint mask,
 678               const struct brw_reg *arg0,
 679               const struct brw_reg *arg1)
 680 {
 681    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 682
 683    if (!(mask & WRITEMASK_XYZW))
 684       return; /* Do not emit dead code */
 685
 686    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 687
 688    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 689    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 690
 691    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 692    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 693    brw_set_saturate(p, 0);
 694 }
 695
 696
 697 void emit_dp4(struct brw_compile *p,
 698               const struct brw_reg *dst,
 699               GLuint mask,
 700               const struct brw_reg *arg0,
 701               const struct brw_reg *arg1)
 702 {
 703    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 704
 705    if (!(mask & WRITEMASK_XYZW))
 706       return; /* Do not emit dead code */
 707
 708    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 709
 710    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 711    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 712    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 713
 714    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 715    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 716    brw_set_saturate(p, 0);
 717 }
 718
 719
 720 void emit_dph(struct brw_compile *p,
 721               const struct brw_reg *dst,
 722               GLuint mask,
 723               const struct brw_reg *arg0,
 724               const struct brw_reg *arg1)
 725 {
 726    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 727
 728    if (!(mask & WRITEMASK_XYZW))
 729       return; /* Do not emit dead code */
 730
 731    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 732
 733    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 734    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 735    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 736
 737    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 738    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 739    brw_set_saturate(p, 0);
 740 }
 741
 742
 743 void emit_xpd(struct brw_compile *p,
 744               const struct brw_reg *dst,
 745               GLuint mask,
 746               const struct brw_reg *arg0,
 747               const struct brw_reg *arg1)
 748 {
 749    GLuint i;
 750
 751    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 752
 753    for (i = 0 ; i < 3; i++) {
 754       if (mask & (1<<i)) {
 755          GLuint i2 = (i+2)%3;
 756          GLuint i1 = (i+1)%3;
 757
 758          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 759
 760          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 761          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 762          brw_set_saturate(p, 0);
 763       }
 764    }
 765 }
 766
 767
 768 void emit_math1(struct brw_wm_compile *c,
 769                 GLuint function,
 770                 const struct brw_reg *dst,
 771                 GLuint mask,
 772                 const struct brw_reg *arg0)
 773 {
 774    struct brw_compile *p = &c->func;
 775    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 776    GLuint saturate = ((mask & SATURATE) ?
 777                       BRW_MATH_SATURATE_SATURATE :
 778                       BRW_MATH_SATURATE_NONE);
 779
 780    if (!(mask & WRITEMASK_XYZW))
 781       return; /* Do not emit dead code */
 782
 783    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 784
 785    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 786     * channels.
 787     */
 788    brw_MOV(p, brw_message_reg(2), arg0[0]);
 789
 790    /* Send two messages to perform all 16 operations:
 791     */
 792    brw_push_insn_state(p);
 793    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 794    brw_math(p,
 795             dst[dst_chan],
 796             function,
 797             saturate,
 798             2,
 799             brw_null_reg(),
 800             BRW_MATH_DATA_VECTOR,
 801             BRW_MATH_PRECISION_FULL);
 802
 803    if (c->dispatch_width == 16) {
 804       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 805       brw_math(p,
 806                offset(dst[dst_chan],1),
 807                function,
 808                saturate,
 809                3,
 810                brw_null_reg(),
 811                BRW_MATH_DATA_VECTOR,
 812                BRW_MATH_PRECISION_FULL);
 813    }
 814    brw_pop_insn_state(p);
 815 }
 816
 817
 818 void emit_math2(struct brw_wm_compile *c,
 819                 GLuint function,
 820                 const struct brw_reg *dst,
 821                 GLuint mask,
 822                 const struct brw_reg *arg0,
 823                 const struct brw_reg *arg1)
 824 {
 825    struct brw_compile *p = &c->func;
 826    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 827    GLuint saturate = ((mask & SATURATE) ?
 828                       BRW_MATH_SATURATE_SATURATE :
 829                       BRW_MATH_SATURATE_NONE);
 830
 831    if (!(mask & WRITEMASK_XYZW))
 832       return; /* Do not emit dead code */
 833
 834    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 835
 836    brw_push_insn_state(p);
 837
 838    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 839    brw_MOV(p, brw_message_reg(2), arg0[0]);
 840    if (c->dispatch_width == 16) {
 841       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 842       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 843    }
 844
 845    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 846    brw_MOV(p, brw_message_reg(3), arg1[0]);
 847    if (c->dispatch_width == 16) {
 848       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 849       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 850    }
 851
 852    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 853    brw_math(p,
 854             dst[dst_chan],
 855             function,
 856             saturate,
 857             2,
 858             brw_null_reg(),
 859             BRW_MATH_DATA_VECTOR,
 860             BRW_MATH_PRECISION_FULL);
 861
 862    /* Send two messages to perform all 16 operations:
 863     */
 864    if (c->dispatch_width == 16) {
 865       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 866       brw_math(p,
 867                offset(dst[dst_chan],1),
 868                function,
 869                saturate,
 870                4,
 871                brw_null_reg(),
 872                BRW_MATH_DATA_VECTOR,
 873                BRW_MATH_PRECISION_FULL);
 874    }
 875    brw_pop_insn_state(p);
 876 }
 877
 878
 879 void emit_tex(struct brw_wm_compile *c,
 880               struct brw_reg *dst,
 881               GLuint dst_flags,
 882               struct brw_reg *arg,
 883               struct brw_reg depth_payload,
 884               GLuint tex_idx,
 885               GLuint sampler,
 886               GLboolean shadow)
 887 {
 888    struct brw_compile *p = &c->func;
 889    struct intel_context *intel = &p->brw->intel;
 890    struct brw_reg dst_retyped;
 891    GLuint cur_mrf = 2, response_length;
 892    GLuint i, nr_texcoords;
 893    GLuint emit;
 894    GLuint msg_type;
 895    GLuint mrf_per_channel;
 896    GLuint simd_mode;
 897
 898    if (c->dispatch_width == 16) {
 899       mrf_per_channel = 2;
 900       response_length = 8;
 901       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 902       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 903    } else {
 904       mrf_per_channel = 1;
 905       response_length = 4;
 906       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 907       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 908    }
 909
 910    /* How many input regs are there?
 911     */
 912    switch (tex_idx) {
 913    case TEXTURE_1D_INDEX:
 914       emit = WRITEMASK_X;
 915       nr_texcoords = 1;
 916       break;
 917    case TEXTURE_2D_INDEX:
 918    case TEXTURE_RECT_INDEX:
 919       emit = WRITEMASK_XY;
 920       nr_texcoords = 2;
 921       break;
 922    case TEXTURE_3D_INDEX:
 923    case TEXTURE_CUBE_INDEX:
 924       emit = WRITEMASK_XYZ;
 925       nr_texcoords = 3;
 926       break;
 927    default:
 928       /* unexpected target */
 929       abort();
 930    }
 931
 932    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 933    if (!intel->is_ironlake && c->dispatch_width == 8)
 934       nr_texcoords = 3;
 935
 936    /* For shadow comparisons, we have to supply u,v,r. */
 937    if (shadow)
 938       nr_texcoords = 3;
 939
 940    /* Emit the texcoords. */
 941    for (i = 0; i < nr_texcoords; i++) {
 942       if (emit & (1<<i))
 943          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 944       else
 945          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 946       cur_mrf += mrf_per_channel;
 947    }
 948
 949    /* Fill in the shadow comparison reference value. */
 950    if (shadow) {
 951       if (intel->is_ironlake) {
 952          /* Fill in the cube map array index value. */
 953          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 954          cur_mrf += mrf_per_channel;
 955       } else if (c->dispatch_width == 8) {
 956          /* Fill in the LOD bias value. */
 957          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 958          cur_mrf += mrf_per_channel;
 959       }
 960       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 961       cur_mrf += mrf_per_channel;
 962    }
 963
 964    if (intel->is_ironlake) {
 965       if (shadow)
 966          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 967       else
 968          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 969    } else {
 970       /* Note that G45 and older determines shadow compare and dispatch width
 971        * from message length for most messages.
 972        */
 973       if (c->dispatch_width == 16 && shadow)
 974          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 975       else
 976          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 977    }
 978
 979    brw_SAMPLE(p,
 980               dst_retyped,
 981               1,
 982               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 983               SURF_INDEX_TEXTURE(sampler),
 984               sampler,
 985               dst_flags & WRITEMASK_XYZW,
 986               msg_type,
 987               response_length,
 988               cur_mrf - 1,
 989               0,
 990               1,
 991               simd_mode);
 992 }
 993
 994
 995 void emit_txb(struct brw_wm_compile *c,
 996               struct brw_reg *dst,
 997               GLuint dst_flags,
 998               struct brw_reg *arg,
 999               struct brw_reg depth_payload,
1000               GLuint tex_idx,
1001               GLuint sampler)
1002 {
1003    struct brw_compile *p = &c->func;
1004    struct intel_context *intel = &p->brw->intel;
1005    GLuint msgLength;
1006    GLuint msg_type;
1007    GLuint mrf_per_channel;
1008    GLuint response_length;
1009    struct brw_reg dst_retyped;
1010
1011    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1012     * samples, so we'll use the 16-wide instruction, leave the second halves
1013     * undefined, and trust the execution mask to keep the undefined pixels
1014     * from mattering.
1015     */
1016    if (c->dispatch_width == 16 || !intel->is_ironlake) {
1017       if (intel->is_ironlake)
1018          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1019       else
1020          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1021       mrf_per_channel = 2;
1022       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1023       response_length = 8;
1024    } else {
1025       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1026       mrf_per_channel = 1;
1027       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1028       response_length = 4;
1029    }
1030
1031    /* Shadow ignored for txb. */
1032    switch (tex_idx) {
1033    case TEXTURE_1D_INDEX:
1034       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1035       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1036       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1037       break;
1038    case TEXTURE_2D_INDEX:
1039    case TEXTURE_RECT_INDEX:
1040       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1041       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1042       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1043       break;
1044    case TEXTURE_3D_INDEX:
1045    case TEXTURE_CUBE_INDEX:
1046       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1047       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1048       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1049       break;
1050    default:
1051       /* unexpected target */
1052       abort();
1053    }
1054
1055    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1056    msgLength = 2 + 4 * mrf_per_channel - 1;
1057
1058    brw_SAMPLE(p,
1059               dst_retyped,
1060               1,
1061               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1062               SURF_INDEX_TEXTURE(sampler),
1063               sampler,
1064               dst_flags & WRITEMASK_XYZW,
1065               msg_type,
1066               response_length,
1067               msgLength,
1068               0,
1069               1,
1070               BRW_SAMPLER_SIMD_MODE_SIMD16);
1071 }
1072
1073
1074 static void emit_lit(struct brw_wm_compile *c,
1075                      const struct brw_reg *dst,
1076                      GLuint mask,
1077                      const struct brw_reg *arg0)
1078 {
1079    struct brw_compile *p = &c->func;
1080
1081    assert((mask & WRITEMASK_XW) == 0);
1082
1083    if (mask & WRITEMASK_Y) {
1084       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1085       brw_MOV(p, dst[1], arg0[0]);
1086       brw_set_saturate(p, 0);
1087    }
1088
1089    if (mask & WRITEMASK_Z) {
1090       emit_math2(c, BRW_MATH_FUNCTION_POW,
1091                  &dst[2],
1092                  WRITEMASK_X | (mask & SATURATE),
1093                  &arg0[1],
1094                  &arg0[3]);
1095    }
1096
1097    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1098     * some of the POW calculations above, but 16-wide iff statements
1099     * seem to lock c1 hardware, so this is a nasty workaround:
1100     */
1101    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1102    {
1103       if (mask & WRITEMASK_Y)
1104          brw_MOV(p, dst[1], brw_imm_f(0));
1105
1106       if (mask & WRITEMASK_Z)
1107          brw_MOV(p, dst[2], brw_imm_f(0));
1108    }
1109    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1110 }
1111
1112
1113 /* Kill pixel - set execution mask to zero for those pixels which
1114  * fail.
1115  */
1116 static void emit_kil( struct brw_wm_compile *c,
1117                       struct brw_reg *arg0)
1118 {
1119    struct brw_compile *p = &c->func;
1120    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1121    GLuint i;
1122
1123    /* XXX - usually won't need 4 compares!
1124     */
1125    for (i = 0; i < 4; i++) {
1126       brw_push_insn_state(p);
1127       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1128       brw_set_predicate_control_flag_value(p, 0xff);
1129       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1130       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1131       brw_pop_insn_state(p);
1132    }
1133 }
1134
1135 /* KIL_NV kills the pixels that are currently executing, not based on a test
1136  * of the arguments.
1137  */
1138 static void emit_kil_nv( struct brw_wm_compile *c )
1139 {
1140    struct brw_compile *p = &c->func;
1141    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1142
1143    brw_push_insn_state(p);
1144    brw_set_mask_control(p, BRW_MASK_DISABLE);
1145    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1146    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1147    brw_pop_insn_state(p);
1148 }
1149
1150 static void fire_fb_write( struct brw_wm_compile *c,
1151                            GLuint base_reg,
1152                            GLuint nr,
1153                            GLuint target,
1154                            GLuint eot )
1155 {
1156    struct brw_compile *p = &c->func;
1157    struct brw_reg dst;
1158
1159    if (c->dispatch_width == 16)
1160       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1161    else
1162       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1163
1164    /* Pass through control information:
1165     */
1166 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1167    {
1168       brw_push_insn_state(p);
1169       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1170       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1171       brw_MOV(p,
1172                brw_message_reg(base_reg + 1),
1173                brw_vec8_grf(1, 0));
1174       brw_pop_insn_state(p);
1175    }
1176
1177    /* Send framebuffer write message: */
1178 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1179    brw_fb_WRITE(p,
1180                 dst,
1181                 base_reg,
1182                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1183                 target,
1184                 nr,
1185                 0,
1186                 eot);
1187 }
1188
1189
1190 static void emit_aa( struct brw_wm_compile *c,
1191                      struct brw_reg *arg1,
1192                      GLuint reg )
1193 {
1194    struct brw_compile *p = &c->func;
1195    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1196    GLuint off = c->key.aa_dest_stencil_reg % 2;
1197    struct brw_reg aa = offset(arg1[comp], off);
1198
1199    brw_push_insn_state(p);
1200    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1201    brw_MOV(p, brw_message_reg(reg), aa);
1202    brw_pop_insn_state(p);
1203 }
1204
1205
1206 /* Post-fragment-program processing.  Send the results to the
1207  * framebuffer.
1208  * \param arg0  the fragment color
1209  * \param arg1  the pass-through depth value
1210  * \param arg2  the shader-computed depth value
1211  */
1212 void emit_fb_write(struct brw_wm_compile *c,
1213                    struct brw_reg *arg0,
1214                    struct brw_reg *arg1,
1215                    struct brw_reg *arg2,
1216                    GLuint target,
1217                    GLuint eot)
1218 {
1219    struct brw_compile *p = &c->func;
1220    struct brw_context *brw = p->brw;
1221    GLuint nr = 2;
1222    GLuint channel;
1223
1224    /* Reserve a space for AA - may not be needed:
1225     */
1226    if (c->key.aa_dest_stencil_reg)
1227       nr += 1;
1228
1229    /* I don't really understand how this achieves the color interleave
1230     * (ie RGBARGBA) in the result:  [Do the saturation here]
1231     */
1232    brw_push_insn_state(p);
1233
1234    for (channel = 0; channel < 4; channel++) {
1235       if (c->dispatch_width == 16 && brw->has_compr4) {
1236          /* By setting the high bit of the MRF register number, we indicate
1237           * that we want COMPR4 mode - instead of doing the usual destination
1238           * + 1 for the second half we get destination + 4.
1239           */
1240          brw_MOV(p,
1241                  brw_message_reg(nr + channel + (1 << 7)),
1242                  arg0[channel]);
1243       } else {
1244          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1245          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1246          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1247          brw_MOV(p,
1248                  brw_message_reg(nr + channel),
1249                  arg0[channel]);
1250
1251          if (c->dispatch_width == 16) {
1252             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1253             brw_MOV(p,
1254                     brw_message_reg(nr + channel + 4),
1255                     sechalf(arg0[channel]));
1256          }
1257       }
1258    }
1259    /* skip over the regs populated above:
1260     */
1261    nr += 8;
1262    brw_pop_insn_state(p);
1263
1264    if (c->key.source_depth_to_render_target)
1265    {
1266       if (c->key.computes_depth)
1267          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1268       else
1269          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1270
1271       nr += 2;
1272    }
1273
1274    if (c->key.dest_depth_reg)
1275    {
1276       GLuint comp = c->key.dest_depth_reg / 2;
1277       GLuint off = c->key.dest_depth_reg % 2;
1278
1279       if (off != 0) {
1280          brw_push_insn_state(p);
1281          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1282
1283          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1284          /* 2nd half? */
1285          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1286          brw_pop_insn_state(p);
1287       }
1288       else {
1289          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1290       }
1291       nr += 2;
1292    }
1293
1294    if (!c->key.runtime_check_aads_emit) {
1295       if (c->key.aa_dest_stencil_reg)
1296          emit_aa(c, arg1, 2);
1297
1298       fire_fb_write(c, 0, nr, target, eot);
1299    }
1300    else {
1301       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1302       struct brw_reg ip = brw_ip_reg();
1303       struct brw_instruction *jmp;
1304
1305       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1306       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1307       brw_AND(p,
1308               v1_null_ud,
1309               get_element_ud(brw_vec8_grf(1,0), 6),
1310               brw_imm_ud(1<<26));
1311
1312       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1313       {
1314          emit_aa(c, arg1, 2);
1315          fire_fb_write(c, 0, nr, target, eot);
1316          /* note - thread killed in subroutine */
1317       }
1318       brw_land_fwd_jump(p, jmp);
1319
1320       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1321        */
1322       fire_fb_write(c, 1, nr-1, target, eot);
1323    }
1324 }
1325
1326 /**
1327  * Move a GPR to scratch memory.
1328  */
1329 static void emit_spill( struct brw_wm_compile *c,
1330                         struct brw_reg reg,
1331                         GLuint slot )
1332 {
1333    struct brw_compile *p = &c->func;
1334
1335    /*
1336      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1337    */
1338    brw_MOV(p, brw_message_reg(2), reg);
1339
1340    /*
1341      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1342      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1343    */
1344    brw_dp_WRITE_16(p,
1345                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1346                    slot);
1347 }
1348
1349
1350 /**
1351  * Load a GPR from scratch memory.
1352  */
1353 static void emit_unspill( struct brw_wm_compile *c,
1354                           struct brw_reg reg,
1355                           GLuint slot )
1356 {
1357    struct brw_compile *p = &c->func;
1358
1359    /* Slot 0 is the undef value.
1360     */
1361    if (slot == 0) {
1362       brw_MOV(p, reg, brw_imm_f(0));
1363       return;
1364    }
1365
1366    /*
1367      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1368      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1369    */
1370
1371    brw_dp_READ_16(p,
1372                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1373                   slot);
1374 }
1375
1376
1377 /**
1378  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1379  * Args with unspill_reg != 0 will be loaded from scratch memory.
1380  */
1381 static void get_argument_regs( struct brw_wm_compile *c,
1382                                struct brw_wm_ref *arg[],
1383                                struct brw_reg *regs )
1384 {
1385    GLuint i;
1386
1387    for (i = 0; i < 4; i++) {
1388       if (arg[i]) {
1389          if (arg[i]->unspill_reg)
1390             emit_unspill(c,
1391                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1392                          arg[i]->value->spill_slot);
1393
1394          regs[i] = arg[i]->hw_reg;
1395       }
1396       else {
1397          regs[i] = brw_null_reg();
1398       }
1399    }
1400 }
1401
1402
1403 /**
1404  * For values that have a spill_slot!=0, write those regs to scratch memory.
1405  */
1406 static void spill_values( struct brw_wm_compile *c,
1407                           struct brw_wm_value *values,
1408                           GLuint nr )
1409 {
1410    GLuint i;
1411
1412    for (i = 0; i < nr; i++)
1413       if (values[i].spill_slot)
1414          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1415 }
1416
1417
1418 /* Emit the fragment program instructions here.
1419  */
1420 void brw_wm_emit( struct brw_wm_compile *c )
1421 {
1422    struct brw_compile *p = &c->func;
1423    GLuint insn;
1424
1425    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1426
1427    /* Check if any of the payload regs need to be spilled:
1428     */
1429    spill_values(c, c->payload.depth, 4);
1430    spill_values(c, c->creg, c->nr_creg);
1431    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1432
1433
1434    for (insn = 0; insn < c->nr_insns; insn++) {
1435
1436       struct brw_wm_instruction *inst = &c->instruction[insn];
1437       struct brw_reg args[3][4], dst[4];
1438       GLuint i, dst_flags;
1439
1440       /* Get argument regs:
1441        */
1442       for (i = 0; i < 3; i++)
1443          get_argument_regs(c, inst->src[i], args[i]);
1444
1445       /* Get dest regs:
1446        */
1447       for (i = 0; i < 4; i++)
1448          if (inst->dst[i])
1449             dst[i] = inst->dst[i]->hw_reg;
1450          else
1451             dst[i] = brw_null_reg();
1452
1453       /* Flags
1454        */
1455       dst_flags = inst->writemask;
1456       if (inst->saturate)
1457          dst_flags |= SATURATE;
1458
1459       switch (inst->opcode) {
1460          /* Generated instructions for calculating triangle interpolants:
1461           */
1462       case WM_PIXELXY:
1463          emit_pixel_xy(c, dst, dst_flags);
1464          break;
1465
1466       case WM_DELTAXY:
1467          emit_delta_xy(p, dst, dst_flags, args[0]);
1468          break;
1469
1470       case WM_WPOSXY:
1471          emit_wpos_xy(c, dst, dst_flags, args[0]);
1472          break;
1473
1474       case WM_PIXELW:
1475          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1476          break;
1477
1478       case WM_LINTERP:
1479          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1480          break;
1481
1482       case WM_PINTERP:
1483          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1484          break;
1485
1486       case WM_CINTERP:
1487          emit_cinterp(p, dst, dst_flags, args[0]);
1488          break;
1489
1490       case WM_FB_WRITE:
1491          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1492          break;
1493
1494       case WM_FRONTFACING:
1495          emit_frontfacing(p, dst, dst_flags);
1496          break;
1497
1498          /* Straightforward arithmetic:
1499           */
1500       case OPCODE_ADD:
1501          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1502          break;
1503
1504       case OPCODE_FRC:
1505          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1506          break;
1507
1508       case OPCODE_FLR:
1509          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1510          break;
1511
1512       case OPCODE_DDX:
1513          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1514          break;
1515
1516       case OPCODE_DDY:
1517          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1518          break;
1519
1520       case OPCODE_DP3:
1521          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1522          break;
1523
1524       case OPCODE_DP4:
1525          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1526          break;
1527
1528       case OPCODE_DPH:
1529          emit_dph(p, dst, dst_flags, args[0], args[1]);
1530          break;
1531
1532       case OPCODE_TRUNC:
1533          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1534          break;
1535
1536       case OPCODE_LRP:
1537          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1538          break;
1539
1540       case OPCODE_MAD:
1541          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1542          break;
1543
1544       case OPCODE_MOV:
1545       case OPCODE_SWZ:
1546          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1547          break;
1548
1549       case OPCODE_MUL:
1550          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1551          break;
1552
1553       case OPCODE_XPD:
1554          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1555          break;
1556
1557          /* Higher math functions:
1558           */
1559       case OPCODE_RCP:
1560          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1561          break;
1562
1563       case OPCODE_RSQ:
1564          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1565          break;
1566
1567       case OPCODE_SIN:
1568          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1569          break;
1570
1571       case OPCODE_COS:
1572          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1573          break;
1574
1575       case OPCODE_EX2:
1576          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1577          break;
1578
1579       case OPCODE_LG2:
1580          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1581          break;
1582
1583       case OPCODE_SCS:
1584          /* There is an scs math function, but it would need some
1585           * fixup for 16-element execution.
1586           */
1587          if (dst_flags & WRITEMASK_X)
1588             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1589          if (dst_flags & WRITEMASK_Y)
1590             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1591          break;
1592
1593       case OPCODE_POW:
1594          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1595          break;
1596
1597          /* Comparisons:
1598           */
1599       case OPCODE_CMP:
1600          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1601          break;
1602
1603       case OPCODE_MAX:
1604          emit_max(p, dst, dst_flags, args[0], args[1]);
1605          break;
1606
1607       case OPCODE_MIN:
1608          emit_min(p, dst, dst_flags, args[0], args[1]);
1609          break;
1610
1611       case OPCODE_SLT:
1612          emit_slt(p, dst, dst_flags, args[0], args[1]);
1613          break;
1614
1615       case OPCODE_SLE:
1616          emit_sle(p, dst, dst_flags, args[0], args[1]);
1617         break;
1618       case OPCODE_SGT:
1619          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1620         break;
1621       case OPCODE_SGE:
1622          emit_sge(p, dst, dst_flags, args[0], args[1]);
1623          break;
1624       case OPCODE_SEQ:
1625          emit_seq(p, dst, dst_flags, args[0], args[1]);
1626         break;
1627       case OPCODE_SNE:
1628          emit_sne(p, dst, dst_flags, args[0], args[1]);
1629         break;
1630
1631       case OPCODE_LIT:
1632          emit_lit(c, dst, dst_flags, args[0]);
1633          break;
1634
1635          /* Texturing operations:
1636           */
1637       case OPCODE_TEX:
1638          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1639                   inst->tex_idx, inst->tex_unit,
1640                   inst->tex_shadow);
1641          break;
1642
1643       case OPCODE_TXB:
1644          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1645                   inst->tex_idx, inst->tex_unit);
1646          break;
1647
1648       case OPCODE_KIL:
1649          emit_kil(c, args[0]);
1650          break;
1651
1652       case OPCODE_KIL_NV:
1653          emit_kil_nv(c);
1654          break;
1655
1656       default:
1657          printf("Unsupported opcode %i (%s) in fragment shader\n",
1658                 inst->opcode, inst->opcode < MAX_OPCODE ?
1659                 _mesa_opcode_string(inst->opcode) :
1660                 "unknown");
1661       }
1662
1663       for (i = 0; i < 4; i++)
1664         if (inst->dst[i] && inst->dst[i]->spill_slot)
1665            emit_spill(c,
1666                       inst->dst[i]->hw_reg,
1667                       inst->dst[i]->spill_slot);
1668    }
1669
1670    if (INTEL_DEBUG & DEBUG_WM) {
1671       int i;
1672
1673       printf("wm-native:\n");
1674       for (i = 0; i < p->nr_insn; i++)
1675          brw_disasm(stderr, &p->store[i]);
1676       printf("\n");
1677    }
1678 }