src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64
  65 /**
  66  * Computes the screen-space x,y position of the pixels.
  67  *
  68  * This will be used by emit_delta_xy() or emit_wpos_xy() for
  69  * interpolation of attributes..
  70  *
  71  * Payload R0:
  72  *
  73  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  74  *         corresponding to each of the 16 execution channels.
  75  * R0.1..8 -- ?
  76  * R1.0 -- triangle vertex 0.X
  77  * R1.1 -- triangle vertex 0.Y
  78  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  79  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  80  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  81  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  82  * R1.6 -- ?
  83  * R1.7 -- ?
  84  * R1.8 -- ?
  85  */
  86 void emit_pixel_xy(struct brw_wm_compile *c,
  87                    const struct brw_reg *dst,
  88                    GLuint mask)
  89 {
  90    struct brw_compile *p = &c->func;
  91    struct brw_reg r1 = brw_vec1_grf(1, 0);
  92    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  93    struct brw_reg dst0_uw, dst1_uw;
  94
  95    brw_push_insn_state(p);
  96    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  97
  98    if (c->dispatch_width == 16) {
  99       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 100       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 101    } else {
 102       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 103       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 104    }
 105
 106    /* Calculate pixel centers by adding 1 or 0 to each of the
 107     * micro-tile coordinates passed in r1.
 108     */
 109    if (mask & WRITEMASK_X) {
 110       brw_ADD(p,
 111               dst0_uw,
 112               stride(suboffset(r1_uw, 4), 2, 4, 0),
 113               brw_imm_v(0x10101010));
 114    }
 115
 116    if (mask & WRITEMASK_Y) {
 117       brw_ADD(p,
 118               dst1_uw,
 119               stride(suboffset(r1_uw,5), 2, 4, 0),
 120               brw_imm_v(0x11001100));
 121    }
 122    brw_pop_insn_state(p);
 123 }
 124
 125 /**
 126  * Computes the screen-space x,y distance of the pixels from the start
 127  * vertex.
 128  *
 129  * This will be used in linterp or pinterp with the start vertex value
 130  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 131  * to produce interpolated attribute values.
 132  */
 133 void emit_delta_xy(struct brw_compile *p,
 134                    const struct brw_reg *dst,
 135                    GLuint mask,
 136                    const struct brw_reg *arg0)
 137 {
 138    struct brw_reg r1 = brw_vec1_grf(1, 0);
 139
 140    if (mask == 0)
 141       return;
 142
 143    assert(mask == WRITEMASK_XY);
 144
 145    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 146     * centers produced by emit_pixel_xy().
 147     */
 148    brw_ADD(p,
 149            dst[0],
 150            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 151            negate(r1));
 152    brw_ADD(p,
 153            dst[1],
 154            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 155            negate(suboffset(r1,1)));
 156 }
 157
 158 /**
 159  * Computes the pixel offset from the window origin for gl_FragCoord().
 160  */
 161 void emit_wpos_xy(struct brw_wm_compile *c,
 162                   const struct brw_reg *dst,
 163                   GLuint mask,
 164                   const struct brw_reg *arg0)
 165 {
 166    struct brw_compile *p = &c->func;
 167
 168    if (mask & WRITEMASK_X) {
 169       if (c->fp->program.PixelCenterInteger) {
 170          /* X' = X */
 171          brw_MOV(p,
 172                  dst[0],
 173                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 174       } else {
 175          /* X' = X + 0.5 */
 176          brw_ADD(p,
 177                  dst[0],
 178                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 179                  brw_imm_f(0.5));
 180       }
 181    }
 182
 183    if (mask & WRITEMASK_Y) {
 184       if (c->fp->program.OriginUpperLeft) {
 185          if (c->fp->program.PixelCenterInteger) {
 186             /* Y' = Y */
 187             brw_MOV(p,
 188                     dst[1],
 189                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 190          } else {
 191             /* Y' = Y + 0.5 */
 192             brw_ADD(p,
 193                     dst[1],
 194                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 195                     brw_imm_f(0.5));
 196          }
 197       } else {
 198          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 199
 200          /* Y' = (height - 1) - Y + center */
 201          brw_ADD(p,
 202                  dst[1],
 203                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 204                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 205       }
 206    }
 207 }
 208
 209
 210 void emit_pixel_w(struct brw_wm_compile *c,
 211                   const struct brw_reg *dst,
 212                   GLuint mask,
 213                   const struct brw_reg *arg0,
 214                   const struct brw_reg *deltas)
 215 {
 216    struct brw_compile *p = &c->func;
 217    struct intel_context *intel = &p->brw->intel;
 218
 219    /* Don't need this if all you are doing is interpolating color, for
 220     * instance.
 221     */
 222    if (mask & WRITEMASK_W) {
 223       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 224
 225       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 226        * result straight into a message reg.
 227        */
 228       if (can_do_pln(intel, deltas)) {
 229          brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
 230       } else {
 231          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 232          brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 233       }
 234
 235       /* Calc w */
 236       if (c->dispatch_width == 16) {
 237          brw_math_16(p, dst[3],
 238                      BRW_MATH_FUNCTION_INV,
 239                      BRW_MATH_SATURATE_NONE,
 240                      2, brw_null_reg(),
 241                      BRW_MATH_PRECISION_FULL);
 242       } else {
 243          brw_math(p, dst[3],
 244                   BRW_MATH_FUNCTION_INV,
 245                   BRW_MATH_SATURATE_NONE,
 246                   2, brw_null_reg(),
 247                   BRW_MATH_DATA_VECTOR,
 248                   BRW_MATH_PRECISION_FULL);
 249       }
 250    }
 251 }
 252
 253
 254 void emit_linterp(struct brw_compile *p,
 255                   const struct brw_reg *dst,
 256                   GLuint mask,
 257                   const struct brw_reg *arg0,
 258                   const struct brw_reg *deltas)
 259 {
 260    struct intel_context *intel = &p->brw->intel;
 261    struct brw_reg interp[4];
 262    GLuint nr = arg0[0].nr;
 263    GLuint i;
 264
 265    interp[0] = brw_vec1_grf(nr, 0);
 266    interp[1] = brw_vec1_grf(nr, 4);
 267    interp[2] = brw_vec1_grf(nr+1, 0);
 268    interp[3] = brw_vec1_grf(nr+1, 4);
 269
 270    for (i = 0; i < 4; i++) {
 271       if (mask & (1<<i)) {
 272          if (can_do_pln(intel, deltas)) {
 273             brw_PLN(p, dst[i], interp[i], deltas[0]);
 274          } else {
 275             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 276             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 277          }
 278       }
 279    }
 280 }
 281
 282
 283 void emit_pinterp(struct brw_compile *p,
 284                   const struct brw_reg *dst,
 285                   GLuint mask,
 286                   const struct brw_reg *arg0,
 287                   const struct brw_reg *deltas,
 288                   const struct brw_reg *w)
 289 {
 290    struct intel_context *intel = &p->brw->intel;
 291    struct brw_reg interp[4];
 292    GLuint nr = arg0[0].nr;
 293    GLuint i;
 294
 295    interp[0] = brw_vec1_grf(nr, 0);
 296    interp[1] = brw_vec1_grf(nr, 4);
 297    interp[2] = brw_vec1_grf(nr+1, 0);
 298    interp[3] = brw_vec1_grf(nr+1, 4);
 299
 300    for (i = 0; i < 4; i++) {
 301       if (mask & (1<<i)) {
 302          if (can_do_pln(intel, deltas)) {
 303             brw_PLN(p, dst[i], interp[i], deltas[0]);
 304          } else {
 305             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 306             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 307          }
 308       }
 309    }
 310    for (i = 0; i < 4; i++) {
 311       if (mask & (1<<i)) {
 312          brw_MUL(p, dst[i], dst[i], w[3]);
 313       }
 314    }
 315 }
 316
 317
 318 void emit_cinterp(struct brw_compile *p,
 319                   const struct brw_reg *dst,
 320                   GLuint mask,
 321                   const struct brw_reg *arg0)
 322 {
 323    struct brw_reg interp[4];
 324    GLuint nr = arg0[0].nr;
 325    GLuint i;
 326
 327    interp[0] = brw_vec1_grf(nr, 0);
 328    interp[1] = brw_vec1_grf(nr, 4);
 329    interp[2] = brw_vec1_grf(nr+1, 0);
 330    interp[3] = brw_vec1_grf(nr+1, 4);
 331
 332    for (i = 0; i < 4; i++) {
 333       if (mask & (1<<i)) {
 334          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 335       }
 336    }
 337 }
 338
 339 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 340 void emit_frontfacing(struct brw_compile *p,
 341                       const struct brw_reg *dst,
 342                       GLuint mask)
 343 {
 344    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 345    GLuint i;
 346
 347    if (!(mask & WRITEMASK_XYZW))
 348       return;
 349
 350    for (i = 0; i < 4; i++) {
 351       if (mask & (1<<i)) {
 352          brw_MOV(p, dst[i], brw_imm_f(0.0));
 353       }
 354    }
 355
 356    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 357     * us front face
 358     */
 359    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 360    for (i = 0; i < 4; i++) {
 361       if (mask & (1<<i)) {
 362          brw_MOV(p, dst[i], brw_imm_f(1.0));
 363       }
 364    }
 365    brw_set_predicate_control_flag_value(p, 0xff);
 366 }
 367
 368 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 369  * looking like:
 370  *
 371  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 372  *
 373  * and we're trying to produce:
 374  *
 375  *           DDX                     DDY
 376  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 377  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 378  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 379  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 380  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 381  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 382  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 383  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 384  *
 385  * and add another set of two more subspans if in 16-pixel dispatch mode.
 386  *
 387  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 388  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 389  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 390  * between each other.  We could probably do it like ddx and swizzle the right
 391  * order later, but bail for now and just produce
 392  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 393  */
 394 void emit_ddxy(struct brw_compile *p,
 395                const struct brw_reg *dst,
 396                GLuint mask,
 397                GLboolean is_ddx,
 398                const struct brw_reg *arg0)
 399 {
 400    int i;
 401    struct brw_reg src0, src1;
 402
 403    if (mask & SATURATE)
 404       brw_set_saturate(p, 1);
 405    for (i = 0; i < 4; i++ ) {
 406       if (mask & (1<<i)) {
 407          if (is_ddx) {
 408             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 409                            BRW_REGISTER_TYPE_F,
 410                            BRW_VERTICAL_STRIDE_2,
 411                            BRW_WIDTH_2,
 412                            BRW_HORIZONTAL_STRIDE_0,
 413                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 414             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 415                            BRW_REGISTER_TYPE_F,
 416                            BRW_VERTICAL_STRIDE_2,
 417                            BRW_WIDTH_2,
 418                            BRW_HORIZONTAL_STRIDE_0,
 419                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 420          } else {
 421             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 422                            BRW_REGISTER_TYPE_F,
 423                            BRW_VERTICAL_STRIDE_4,
 424                            BRW_WIDTH_4,
 425                            BRW_HORIZONTAL_STRIDE_0,
 426                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 427             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 428                            BRW_REGISTER_TYPE_F,
 429                            BRW_VERTICAL_STRIDE_4,
 430                            BRW_WIDTH_4,
 431                            BRW_HORIZONTAL_STRIDE_0,
 432                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 433          }
 434          brw_ADD(p, dst[i], src0, negate(src1));
 435       }
 436    }
 437    if (mask & SATURATE)
 438       brw_set_saturate(p, 0);
 439 }
 440
 441 void emit_alu1(struct brw_compile *p,
 442                struct brw_instruction *(*func)(struct brw_compile *,
 443                                                struct brw_reg,
 444                                                struct brw_reg),
 445                const struct brw_reg *dst,
 446                GLuint mask,
 447                const struct brw_reg *arg0)
 448 {
 449    GLuint i;
 450
 451    if (mask & SATURATE)
 452       brw_set_saturate(p, 1);
 453
 454    for (i = 0; i < 4; i++) {
 455       if (mask & (1<<i)) {
 456          func(p, dst[i], arg0[i]);
 457       }
 458    }
 459
 460    if (mask & SATURATE)
 461       brw_set_saturate(p, 0);
 462 }
 463
 464
 465 void emit_alu2(struct brw_compile *p,
 466                struct brw_instruction *(*func)(struct brw_compile *,
 467                                                struct brw_reg,
 468                                                struct brw_reg,
 469                                                struct brw_reg),
 470                const struct brw_reg *dst,
 471                GLuint mask,
 472                const struct brw_reg *arg0,
 473                const struct brw_reg *arg1)
 474 {
 475    GLuint i;
 476
 477    if (mask & SATURATE)
 478       brw_set_saturate(p, 1);
 479
 480    for (i = 0; i < 4; i++) {
 481       if (mask & (1<<i)) {
 482          func(p, dst[i], arg0[i], arg1[i]);
 483       }
 484    }
 485
 486    if (mask & SATURATE)
 487       brw_set_saturate(p, 0);
 488 }
 489
 490
 491 void emit_mad(struct brw_compile *p,
 492               const struct brw_reg *dst,
 493               GLuint mask,
 494               const struct brw_reg *arg0,
 495               const struct brw_reg *arg1,
 496               const struct brw_reg *arg2)
 497 {
 498    GLuint i;
 499
 500    for (i = 0; i < 4; i++) {
 501       if (mask & (1<<i)) {
 502          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 503
 504          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 505          brw_ADD(p, dst[i], dst[i], arg2[i]);
 506          brw_set_saturate(p, 0);
 507       }
 508    }
 509 }
 510
 511 void emit_lrp(struct brw_compile *p,
 512               const struct brw_reg *dst,
 513               GLuint mask,
 514               const struct brw_reg *arg0,
 515               const struct brw_reg *arg1,
 516               const struct brw_reg *arg2)
 517 {
 518    GLuint i;
 519
 520    /* Uses dst as a temporary:
 521     */
 522    for (i = 0; i < 4; i++) {
 523       if (mask & (1<<i)) {
 524          /* Can I use the LINE instruction for this?
 525           */
 526          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 527          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 528
 529          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 530          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 531          brw_set_saturate(p, 0);
 532       }
 533    }
 534 }
 535
 536 void emit_sop(struct brw_compile *p,
 537               const struct brw_reg *dst,
 538               GLuint mask,
 539               GLuint cond,
 540               const struct brw_reg *arg0,
 541               const struct brw_reg *arg1)
 542 {
 543    GLuint i;
 544
 545    for (i = 0; i < 4; i++) {
 546       if (mask & (1<<i)) {
 547          brw_push_insn_state(p);
 548          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 549          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 550          brw_MOV(p, dst[i], brw_imm_f(0));
 551          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 552          brw_MOV(p, dst[i], brw_imm_f(1.0));
 553          brw_pop_insn_state(p);
 554       }
 555    }
 556 }
 557
 558 static void emit_slt( struct brw_compile *p,
 559                       const struct brw_reg *dst,
 560                       GLuint mask,
 561                       const struct brw_reg *arg0,
 562                       const struct brw_reg *arg1 )
 563 {
 564    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 565 }
 566
 567 static void emit_sle( struct brw_compile *p,
 568                       const struct brw_reg *dst,
 569                       GLuint mask,
 570                       const struct brw_reg *arg0,
 571                       const struct brw_reg *arg1 )
 572 {
 573    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 574 }
 575
 576 static void emit_sgt( struct brw_compile *p,
 577                       const struct brw_reg *dst,
 578                       GLuint mask,
 579                       const struct brw_reg *arg0,
 580                       const struct brw_reg *arg1 )
 581 {
 582    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 583 }
 584
 585 static void emit_sge( struct brw_compile *p,
 586                       const struct brw_reg *dst,
 587                       GLuint mask,
 588                       const struct brw_reg *arg0,
 589                       const struct brw_reg *arg1 )
 590 {
 591    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 592 }
 593
 594 static void emit_seq( struct brw_compile *p,
 595                       const struct brw_reg *dst,
 596                       GLuint mask,
 597                       const struct brw_reg *arg0,
 598                       const struct brw_reg *arg1 )
 599 {
 600    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 601 }
 602
 603 static void emit_sne( struct brw_compile *p,
 604                       const struct brw_reg *dst,
 605                       GLuint mask,
 606                       const struct brw_reg *arg0,
 607                       const struct brw_reg *arg1 )
 608 {
 609    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 610 }
 611
 612 void emit_cmp(struct brw_compile *p,
 613               const struct brw_reg *dst,
 614               GLuint mask,
 615               const struct brw_reg *arg0,
 616               const struct brw_reg *arg1,
 617               const struct brw_reg *arg2)
 618 {
 619    GLuint i;
 620
 621    for (i = 0; i < 4; i++) {
 622       if (mask & (1<<i)) {
 623          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 624          brw_MOV(p, dst[i], arg2[i]);
 625          brw_set_saturate(p, 0);
 626
 627          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 628
 629          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 630          brw_MOV(p, dst[i], arg1[i]);
 631          brw_set_saturate(p, 0);
 632          brw_set_predicate_control_flag_value(p, 0xff);
 633       }
 634    }
 635 }
 636
 637 void emit_max(struct brw_compile *p,
 638               const struct brw_reg *dst,
 639               GLuint mask,
 640               const struct brw_reg *arg0,
 641               const struct brw_reg *arg1)
 642 {
 643    GLuint i;
 644
 645    for (i = 0; i < 4; i++) {
 646       if (mask & (1<<i)) {
 647          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 648
 649          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 650          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 651          brw_set_saturate(p, 0);
 652          brw_set_predicate_control_flag_value(p, 0xff);
 653       }
 654    }
 655 }
 656
 657 void emit_min(struct brw_compile *p,
 658               const struct brw_reg *dst,
 659               GLuint mask,
 660               const struct brw_reg *arg0,
 661               const struct brw_reg *arg1)
 662 {
 663    GLuint i;
 664
 665    for (i = 0; i < 4; i++) {
 666       if (mask & (1<<i)) {
 667          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 668
 669          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 670          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 671          brw_set_saturate(p, 0);
 672          brw_set_predicate_control_flag_value(p, 0xff);
 673       }
 674    }
 675 }
 676
 677
 678 void emit_dp3(struct brw_compile *p,
 679               const struct brw_reg *dst,
 680               GLuint mask,
 681               const struct brw_reg *arg0,
 682               const struct brw_reg *arg1)
 683 {
 684    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 685
 686    if (!(mask & WRITEMASK_XYZW))
 687       return; /* Do not emit dead code */
 688
 689    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 690
 691    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 692    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 693
 694    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 695    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 696    brw_set_saturate(p, 0);
 697 }
 698
 699
 700 void emit_dp4(struct brw_compile *p,
 701               const struct brw_reg *dst,
 702               GLuint mask,
 703               const struct brw_reg *arg0,
 704               const struct brw_reg *arg1)
 705 {
 706    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 707
 708    if (!(mask & WRITEMASK_XYZW))
 709       return; /* Do not emit dead code */
 710
 711    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 712
 713    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 714    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 715    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 716
 717    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 718    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 719    brw_set_saturate(p, 0);
 720 }
 721
 722
 723 void emit_dph(struct brw_compile *p,
 724               const struct brw_reg *dst,
 725               GLuint mask,
 726               const struct brw_reg *arg0,
 727               const struct brw_reg *arg1)
 728 {
 729    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 730
 731    if (!(mask & WRITEMASK_XYZW))
 732       return; /* Do not emit dead code */
 733
 734    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 735
 736    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 737    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 738    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 739
 740    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 741    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 742    brw_set_saturate(p, 0);
 743 }
 744
 745
 746 void emit_xpd(struct brw_compile *p,
 747               const struct brw_reg *dst,
 748               GLuint mask,
 749               const struct brw_reg *arg0,
 750               const struct brw_reg *arg1)
 751 {
 752    GLuint i;
 753
 754    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 755
 756    for (i = 0 ; i < 3; i++) {
 757       if (mask & (1<<i)) {
 758          GLuint i2 = (i+2)%3;
 759          GLuint i1 = (i+1)%3;
 760
 761          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 762
 763          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 764          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 765          brw_set_saturate(p, 0);
 766       }
 767    }
 768 }
 769
 770
 771 void emit_math1(struct brw_wm_compile *c,
 772                 GLuint function,
 773                 const struct brw_reg *dst,
 774                 GLuint mask,
 775                 const struct brw_reg *arg0)
 776 {
 777    struct brw_compile *p = &c->func;
 778    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 779    GLuint saturate = ((mask & SATURATE) ?
 780                       BRW_MATH_SATURATE_SATURATE :
 781                       BRW_MATH_SATURATE_NONE);
 782
 783    if (!(mask & WRITEMASK_XYZW))
 784       return; /* Do not emit dead code */
 785
 786    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 787
 788    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 789     * channels.
 790     */
 791    brw_MOV(p, brw_message_reg(2), arg0[0]);
 792
 793    /* Send two messages to perform all 16 operations:
 794     */
 795    brw_push_insn_state(p);
 796    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 797    brw_math(p,
 798             dst[dst_chan],
 799             function,
 800             saturate,
 801             2,
 802             brw_null_reg(),
 803             BRW_MATH_DATA_VECTOR,
 804             BRW_MATH_PRECISION_FULL);
 805
 806    if (c->dispatch_width == 16) {
 807       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 808       brw_math(p,
 809                offset(dst[dst_chan],1),
 810                function,
 811                saturate,
 812                3,
 813                brw_null_reg(),
 814                BRW_MATH_DATA_VECTOR,
 815                BRW_MATH_PRECISION_FULL);
 816    }
 817    brw_pop_insn_state(p);
 818 }
 819
 820
 821 void emit_math2(struct brw_wm_compile *c,
 822                 GLuint function,
 823                 const struct brw_reg *dst,
 824                 GLuint mask,
 825                 const struct brw_reg *arg0,
 826                 const struct brw_reg *arg1)
 827 {
 828    struct brw_compile *p = &c->func;
 829    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 830    GLuint saturate = ((mask & SATURATE) ?
 831                       BRW_MATH_SATURATE_SATURATE :
 832                       BRW_MATH_SATURATE_NONE);
 833
 834    if (!(mask & WRITEMASK_XYZW))
 835       return; /* Do not emit dead code */
 836
 837    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 838
 839    brw_push_insn_state(p);
 840
 841    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 842    brw_MOV(p, brw_message_reg(2), arg0[0]);
 843    if (c->dispatch_width == 16) {
 844       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 845       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 846    }
 847
 848    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 849    brw_MOV(p, brw_message_reg(3), arg1[0]);
 850    if (c->dispatch_width == 16) {
 851       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 852       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 853    }
 854
 855    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 856    brw_math(p,
 857             dst[dst_chan],
 858             function,
 859             saturate,
 860             2,
 861             brw_null_reg(),
 862             BRW_MATH_DATA_VECTOR,
 863             BRW_MATH_PRECISION_FULL);
 864
 865    /* Send two messages to perform all 16 operations:
 866     */
 867    if (c->dispatch_width == 16) {
 868       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 869       brw_math(p,
 870                offset(dst[dst_chan],1),
 871                function,
 872                saturate,
 873                4,
 874                brw_null_reg(),
 875                BRW_MATH_DATA_VECTOR,
 876                BRW_MATH_PRECISION_FULL);
 877    }
 878    brw_pop_insn_state(p);
 879 }
 880
 881
 882 void emit_tex(struct brw_wm_compile *c,
 883               struct brw_reg *dst,
 884               GLuint dst_flags,
 885               struct brw_reg *arg,
 886               struct brw_reg depth_payload,
 887               GLuint tex_idx,
 888               GLuint sampler,
 889               GLboolean shadow)
 890 {
 891    struct brw_compile *p = &c->func;
 892    struct intel_context *intel = &p->brw->intel;
 893    struct brw_reg dst_retyped;
 894    GLuint cur_mrf = 2, response_length;
 895    GLuint i, nr_texcoords;
 896    GLuint emit;
 897    GLuint msg_type;
 898    GLuint mrf_per_channel;
 899    GLuint simd_mode;
 900
 901    if (c->dispatch_width == 16) {
 902       mrf_per_channel = 2;
 903       response_length = 8;
 904       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 905       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 906    } else {
 907       mrf_per_channel = 1;
 908       response_length = 4;
 909       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 910       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 911    }
 912
 913    /* How many input regs are there?
 914     */
 915    switch (tex_idx) {
 916    case TEXTURE_1D_INDEX:
 917       emit = WRITEMASK_X;
 918       nr_texcoords = 1;
 919       break;
 920    case TEXTURE_2D_INDEX:
 921    case TEXTURE_RECT_INDEX:
 922       emit = WRITEMASK_XY;
 923       nr_texcoords = 2;
 924       break;
 925    case TEXTURE_3D_INDEX:
 926    case TEXTURE_CUBE_INDEX:
 927       emit = WRITEMASK_XYZ;
 928       nr_texcoords = 3;
 929       break;
 930    default:
 931       /* unexpected target */
 932       abort();
 933    }
 934
 935    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 936    if (!intel->is_ironlake && c->dispatch_width == 8)
 937       nr_texcoords = 3;
 938
 939    /* For shadow comparisons, we have to supply u,v,r. */
 940    if (shadow)
 941       nr_texcoords = 3;
 942
 943    /* Emit the texcoords. */
 944    for (i = 0; i < nr_texcoords; i++) {
 945       if (emit & (1<<i))
 946          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 947       else
 948          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 949       cur_mrf += mrf_per_channel;
 950    }
 951
 952    /* Fill in the shadow comparison reference value. */
 953    if (shadow) {
 954       if (intel->is_ironlake) {
 955          /* Fill in the cube map array index value. */
 956          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 957          cur_mrf += mrf_per_channel;
 958       } else if (c->dispatch_width == 8) {
 959          /* Fill in the LOD bias value. */
 960          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 961          cur_mrf += mrf_per_channel;
 962       }
 963       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 964       cur_mrf += mrf_per_channel;
 965    }
 966
 967    if (intel->is_ironlake) {
 968       if (shadow)
 969          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 970       else
 971          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 972    } else {
 973       /* Note that G45 and older determines shadow compare and dispatch width
 974        * from message length for most messages.
 975        */
 976       if (c->dispatch_width == 16 && shadow)
 977          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 978       else
 979          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 980    }
 981
 982    brw_SAMPLE(p,
 983               dst_retyped,
 984               1,
 985               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 986               SURF_INDEX_TEXTURE(sampler),
 987               sampler,
 988               dst_flags & WRITEMASK_XYZW,
 989               msg_type,
 990               response_length,
 991               cur_mrf - 1,
 992               0,
 993               1,
 994               simd_mode);
 995 }
 996
 997
 998 void emit_txb(struct brw_wm_compile *c,
 999               struct brw_reg *dst,
1000               GLuint dst_flags,
1001               struct brw_reg *arg,
1002               struct brw_reg depth_payload,
1003               GLuint tex_idx,
1004               GLuint sampler)
1005 {
1006    struct brw_compile *p = &c->func;
1007    struct intel_context *intel = &p->brw->intel;
1008    GLuint msgLength;
1009    GLuint msg_type;
1010    GLuint mrf_per_channel;
1011    GLuint response_length;
1012    struct brw_reg dst_retyped;
1013
1014    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1015     * samples, so we'll use the 16-wide instruction, leave the second halves
1016     * undefined, and trust the execution mask to keep the undefined pixels
1017     * from mattering.
1018     */
1019    if (c->dispatch_width == 16 || !intel->is_ironlake) {
1020       if (intel->is_ironlake)
1021          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1022       else
1023          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1024       mrf_per_channel = 2;
1025       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1026       response_length = 8;
1027    } else {
1028       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1029       mrf_per_channel = 1;
1030       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1031       response_length = 4;
1032    }
1033
1034    /* Shadow ignored for txb. */
1035    switch (tex_idx) {
1036    case TEXTURE_1D_INDEX:
1037       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1038       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1039       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1040       break;
1041    case TEXTURE_2D_INDEX:
1042    case TEXTURE_RECT_INDEX:
1043       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1044       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1045       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1046       break;
1047    case TEXTURE_3D_INDEX:
1048    case TEXTURE_CUBE_INDEX:
1049       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1050       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1051       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1052       break;
1053    default:
1054       /* unexpected target */
1055       abort();
1056    }
1057
1058    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1059    msgLength = 2 + 4 * mrf_per_channel - 1;
1060
1061    brw_SAMPLE(p,
1062               dst_retyped,
1063               1,
1064               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1065               SURF_INDEX_TEXTURE(sampler),
1066               sampler,
1067               dst_flags & WRITEMASK_XYZW,
1068               msg_type,
1069               response_length,
1070               msgLength,
1071               0,
1072               1,
1073               BRW_SAMPLER_SIMD_MODE_SIMD16);
1074 }
1075
1076
1077 static void emit_lit(struct brw_wm_compile *c,
1078                      const struct brw_reg *dst,
1079                      GLuint mask,
1080                      const struct brw_reg *arg0)
1081 {
1082    struct brw_compile *p = &c->func;
1083
1084    assert((mask & WRITEMASK_XW) == 0);
1085
1086    if (mask & WRITEMASK_Y) {
1087       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1088       brw_MOV(p, dst[1], arg0[0]);
1089       brw_set_saturate(p, 0);
1090    }
1091
1092    if (mask & WRITEMASK_Z) {
1093       emit_math2(c, BRW_MATH_FUNCTION_POW,
1094                  &dst[2],
1095                  WRITEMASK_X | (mask & SATURATE),
1096                  &arg0[1],
1097                  &arg0[3]);
1098    }
1099
1100    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1101     * some of the POW calculations above, but 16-wide iff statements
1102     * seem to lock c1 hardware, so this is a nasty workaround:
1103     */
1104    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1105    {
1106       if (mask & WRITEMASK_Y)
1107          brw_MOV(p, dst[1], brw_imm_f(0));
1108
1109       if (mask & WRITEMASK_Z)
1110          brw_MOV(p, dst[2], brw_imm_f(0));
1111    }
1112    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1113 }
1114
1115
1116 /* Kill pixel - set execution mask to zero for those pixels which
1117  * fail.
1118  */
1119 static void emit_kil( struct brw_wm_compile *c,
1120                       struct brw_reg *arg0)
1121 {
1122    struct brw_compile *p = &c->func;
1123    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1124    GLuint i, j;
1125
1126    for (i = 0; i < 4; i++) {
1127       /* Check if we've already done the comparison for this reg
1128        * -- common when someone does KIL TEMP.wwww.
1129        */
1130       for (j = 0; j < i; j++) {
1131          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1132             break;
1133       }
1134       if (j != i)
1135          continue;
1136
1137       brw_push_insn_state(p);
1138       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1139       brw_set_predicate_control_flag_value(p, 0xff);
1140       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1141       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1142       brw_pop_insn_state(p);
1143    }
1144 }
1145
1146 /* KIL_NV kills the pixels that are currently executing, not based on a test
1147  * of the arguments.
1148  */
1149 static void emit_kil_nv( struct brw_wm_compile *c )
1150 {
1151    struct brw_compile *p = &c->func;
1152    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1153
1154    brw_push_insn_state(p);
1155    brw_set_mask_control(p, BRW_MASK_DISABLE);
1156    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1157    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1158    brw_pop_insn_state(p);
1159 }
1160
1161 static void fire_fb_write( struct brw_wm_compile *c,
1162                            GLuint base_reg,
1163                            GLuint nr,
1164                            GLuint target,
1165                            GLuint eot )
1166 {
1167    struct brw_compile *p = &c->func;
1168    struct brw_reg dst;
1169
1170    if (c->dispatch_width == 16)
1171       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1172    else
1173       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1174
1175    /* Pass through control information:
1176     */
1177 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1178    {
1179       brw_push_insn_state(p);
1180       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1181       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1182       brw_MOV(p,
1183                brw_message_reg(base_reg + 1),
1184                brw_vec8_grf(1, 0));
1185       brw_pop_insn_state(p);
1186    }
1187
1188    /* Send framebuffer write message: */
1189 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1190    brw_fb_WRITE(p,
1191                 dst,
1192                 base_reg,
1193                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1194                 target,
1195                 nr,
1196                 0,
1197                 eot);
1198 }
1199
1200
1201 static void emit_aa( struct brw_wm_compile *c,
1202                      struct brw_reg *arg1,
1203                      GLuint reg )
1204 {
1205    struct brw_compile *p = &c->func;
1206    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1207    GLuint off = c->key.aa_dest_stencil_reg % 2;
1208    struct brw_reg aa = offset(arg1[comp], off);
1209
1210    brw_push_insn_state(p);
1211    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1212    brw_MOV(p, brw_message_reg(reg), aa);
1213    brw_pop_insn_state(p);
1214 }
1215
1216
1217 /* Post-fragment-program processing.  Send the results to the
1218  * framebuffer.
1219  * \param arg0  the fragment color
1220  * \param arg1  the pass-through depth value
1221  * \param arg2  the shader-computed depth value
1222  */
1223 void emit_fb_write(struct brw_wm_compile *c,
1224                    struct brw_reg *arg0,
1225                    struct brw_reg *arg1,
1226                    struct brw_reg *arg2,
1227                    GLuint target,
1228                    GLuint eot)
1229 {
1230    struct brw_compile *p = &c->func;
1231    struct brw_context *brw = p->brw;
1232    GLuint nr = 2;
1233    GLuint channel;
1234
1235    /* Reserve a space for AA - may not be needed:
1236     */
1237    if (c->key.aa_dest_stencil_reg)
1238       nr += 1;
1239
1240    /* I don't really understand how this achieves the color interleave
1241     * (ie RGBARGBA) in the result:  [Do the saturation here]
1242     */
1243    brw_push_insn_state(p);
1244
1245    for (channel = 0; channel < 4; channel++) {
1246       if (c->dispatch_width == 16 && brw->has_compr4) {
1247          /* By setting the high bit of the MRF register number, we indicate
1248           * that we want COMPR4 mode - instead of doing the usual destination
1249           * + 1 for the second half we get destination + 4.
1250           */
1251          brw_MOV(p,
1252                  brw_message_reg(nr + channel + (1 << 7)),
1253                  arg0[channel]);
1254       } else {
1255          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1256          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1257          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1258          brw_MOV(p,
1259                  brw_message_reg(nr + channel),
1260                  arg0[channel]);
1261
1262          if (c->dispatch_width == 16) {
1263             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1264             brw_MOV(p,
1265                     brw_message_reg(nr + channel + 4),
1266                     sechalf(arg0[channel]));
1267          }
1268       }
1269    }
1270    /* skip over the regs populated above:
1271     */
1272    nr += 8;
1273    brw_pop_insn_state(p);
1274
1275    if (c->key.source_depth_to_render_target)
1276    {
1277       if (c->key.computes_depth)
1278          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1279       else
1280          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1281
1282       nr += 2;
1283    }
1284
1285    if (c->key.dest_depth_reg)
1286    {
1287       GLuint comp = c->key.dest_depth_reg / 2;
1288       GLuint off = c->key.dest_depth_reg % 2;
1289
1290       if (off != 0) {
1291          brw_push_insn_state(p);
1292          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1293
1294          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1295          /* 2nd half? */
1296          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1297          brw_pop_insn_state(p);
1298       }
1299       else {
1300          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1301       }
1302       nr += 2;
1303    }
1304
1305    if (!c->key.runtime_check_aads_emit) {
1306       if (c->key.aa_dest_stencil_reg)
1307          emit_aa(c, arg1, 2);
1308
1309       fire_fb_write(c, 0, nr, target, eot);
1310    }
1311    else {
1312       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1313       struct brw_reg ip = brw_ip_reg();
1314       struct brw_instruction *jmp;
1315
1316       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1317       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1318       brw_AND(p,
1319               v1_null_ud,
1320               get_element_ud(brw_vec8_grf(1,0), 6),
1321               brw_imm_ud(1<<26));
1322
1323       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1324       {
1325          emit_aa(c, arg1, 2);
1326          fire_fb_write(c, 0, nr, target, eot);
1327          /* note - thread killed in subroutine */
1328       }
1329       brw_land_fwd_jump(p, jmp);
1330
1331       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1332        */
1333       fire_fb_write(c, 1, nr-1, target, eot);
1334    }
1335 }
1336
1337 /**
1338  * Move a GPR to scratch memory.
1339  */
1340 static void emit_spill( struct brw_wm_compile *c,
1341                         struct brw_reg reg,
1342                         GLuint slot )
1343 {
1344    struct brw_compile *p = &c->func;
1345
1346    /*
1347      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1348    */
1349    brw_MOV(p, brw_message_reg(2), reg);
1350
1351    /*
1352      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1353      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1354    */
1355    brw_dp_WRITE_16(p,
1356                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1357                    slot);
1358 }
1359
1360
1361 /**
1362  * Load a GPR from scratch memory.
1363  */
1364 static void emit_unspill( struct brw_wm_compile *c,
1365                           struct brw_reg reg,
1366                           GLuint slot )
1367 {
1368    struct brw_compile *p = &c->func;
1369
1370    /* Slot 0 is the undef value.
1371     */
1372    if (slot == 0) {
1373       brw_MOV(p, reg, brw_imm_f(0));
1374       return;
1375    }
1376
1377    /*
1378      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1379      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1380    */
1381
1382    brw_dp_READ_16(p,
1383                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1384                   slot);
1385 }
1386
1387
1388 /**
1389  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1390  * Args with unspill_reg != 0 will be loaded from scratch memory.
1391  */
1392 static void get_argument_regs( struct brw_wm_compile *c,
1393                                struct brw_wm_ref *arg[],
1394                                struct brw_reg *regs )
1395 {
1396    GLuint i;
1397
1398    for (i = 0; i < 4; i++) {
1399       if (arg[i]) {
1400          if (arg[i]->unspill_reg)
1401             emit_unspill(c,
1402                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1403                          arg[i]->value->spill_slot);
1404
1405          regs[i] = arg[i]->hw_reg;
1406       }
1407       else {
1408          regs[i] = brw_null_reg();
1409       }
1410    }
1411 }
1412
1413
1414 /**
1415  * For values that have a spill_slot!=0, write those regs to scratch memory.
1416  */
1417 static void spill_values( struct brw_wm_compile *c,
1418                           struct brw_wm_value *values,
1419                           GLuint nr )
1420 {
1421    GLuint i;
1422
1423    for (i = 0; i < nr; i++)
1424       if (values[i].spill_slot)
1425          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1426 }
1427
1428
1429 /* Emit the fragment program instructions here.
1430  */
1431 void brw_wm_emit( struct brw_wm_compile *c )
1432 {
1433    struct brw_compile *p = &c->func;
1434    GLuint insn;
1435
1436    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1437
1438    /* Check if any of the payload regs need to be spilled:
1439     */
1440    spill_values(c, c->payload.depth, 4);
1441    spill_values(c, c->creg, c->nr_creg);
1442    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1443
1444
1445    for (insn = 0; insn < c->nr_insns; insn++) {
1446
1447       struct brw_wm_instruction *inst = &c->instruction[insn];
1448       struct brw_reg args[3][4], dst[4];
1449       GLuint i, dst_flags;
1450
1451       /* Get argument regs:
1452        */
1453       for (i = 0; i < 3; i++)
1454          get_argument_regs(c, inst->src[i], args[i]);
1455
1456       /* Get dest regs:
1457        */
1458       for (i = 0; i < 4; i++)
1459          if (inst->dst[i])
1460             dst[i] = inst->dst[i]->hw_reg;
1461          else
1462             dst[i] = brw_null_reg();
1463
1464       /* Flags
1465        */
1466       dst_flags = inst->writemask;
1467       if (inst->saturate)
1468          dst_flags |= SATURATE;
1469
1470       switch (inst->opcode) {
1471          /* Generated instructions for calculating triangle interpolants:
1472           */
1473       case WM_PIXELXY:
1474          emit_pixel_xy(c, dst, dst_flags);
1475          break;
1476
1477       case WM_DELTAXY:
1478          emit_delta_xy(p, dst, dst_flags, args[0]);
1479          break;
1480
1481       case WM_WPOSXY:
1482          emit_wpos_xy(c, dst, dst_flags, args[0]);
1483          break;
1484
1485       case WM_PIXELW:
1486          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1487          break;
1488
1489       case WM_LINTERP:
1490          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1491          break;
1492
1493       case WM_PINTERP:
1494          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1495          break;
1496
1497       case WM_CINTERP:
1498          emit_cinterp(p, dst, dst_flags, args[0]);
1499          break;
1500
1501       case WM_FB_WRITE:
1502          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1503          break;
1504
1505       case WM_FRONTFACING:
1506          emit_frontfacing(p, dst, dst_flags);
1507          break;
1508
1509          /* Straightforward arithmetic:
1510           */
1511       case OPCODE_ADD:
1512          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1513          break;
1514
1515       case OPCODE_FRC:
1516          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1517          break;
1518
1519       case OPCODE_FLR:
1520          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1521          break;
1522
1523       case OPCODE_DDX:
1524          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1525          break;
1526
1527       case OPCODE_DDY:
1528          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1529          break;
1530
1531       case OPCODE_DP3:
1532          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1533          break;
1534
1535       case OPCODE_DP4:
1536          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1537          break;
1538
1539       case OPCODE_DPH:
1540          emit_dph(p, dst, dst_flags, args[0], args[1]);
1541          break;
1542
1543       case OPCODE_TRUNC:
1544          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1545          break;
1546
1547       case OPCODE_LRP:
1548          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1549          break;
1550
1551       case OPCODE_MAD:
1552          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1553          break;
1554
1555       case OPCODE_MOV:
1556       case OPCODE_SWZ:
1557          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1558          break;
1559
1560       case OPCODE_MUL:
1561          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1562          break;
1563
1564       case OPCODE_XPD:
1565          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1566          break;
1567
1568          /* Higher math functions:
1569           */
1570       case OPCODE_RCP:
1571          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1572          break;
1573
1574       case OPCODE_RSQ:
1575          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1576          break;
1577
1578       case OPCODE_SIN:
1579          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1580          break;
1581
1582       case OPCODE_COS:
1583          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1584          break;
1585
1586       case OPCODE_EX2:
1587          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1588          break;
1589
1590       case OPCODE_LG2:
1591          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1592          break;
1593
1594       case OPCODE_SCS:
1595          /* There is an scs math function, but it would need some
1596           * fixup for 16-element execution.
1597           */
1598          if (dst_flags & WRITEMASK_X)
1599             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1600          if (dst_flags & WRITEMASK_Y)
1601             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1602          break;
1603
1604       case OPCODE_POW:
1605          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1606          break;
1607
1608          /* Comparisons:
1609           */
1610       case OPCODE_CMP:
1611          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1612          break;
1613
1614       case OPCODE_MAX:
1615          emit_max(p, dst, dst_flags, args[0], args[1]);
1616          break;
1617
1618       case OPCODE_MIN:
1619          emit_min(p, dst, dst_flags, args[0], args[1]);
1620          break;
1621
1622       case OPCODE_SLT:
1623          emit_slt(p, dst, dst_flags, args[0], args[1]);
1624          break;
1625
1626       case OPCODE_SLE:
1627          emit_sle(p, dst, dst_flags, args[0], args[1]);
1628         break;
1629       case OPCODE_SGT:
1630          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1631         break;
1632       case OPCODE_SGE:
1633          emit_sge(p, dst, dst_flags, args[0], args[1]);
1634          break;
1635       case OPCODE_SEQ:
1636          emit_seq(p, dst, dst_flags, args[0], args[1]);
1637         break;
1638       case OPCODE_SNE:
1639          emit_sne(p, dst, dst_flags, args[0], args[1]);
1640         break;
1641
1642       case OPCODE_LIT:
1643          emit_lit(c, dst, dst_flags, args[0]);
1644          break;
1645
1646          /* Texturing operations:
1647           */
1648       case OPCODE_TEX:
1649          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1650                   inst->tex_idx, inst->tex_unit,
1651                   inst->tex_shadow);
1652          break;
1653
1654       case OPCODE_TXB:
1655          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1656                   inst->tex_idx, inst->tex_unit);
1657          break;
1658
1659       case OPCODE_KIL:
1660          emit_kil(c, args[0]);
1661          break;
1662
1663       case OPCODE_KIL_NV:
1664          emit_kil_nv(c);
1665          break;
1666
1667       default:
1668          printf("Unsupported opcode %i (%s) in fragment shader\n",
1669                 inst->opcode, inst->opcode < MAX_OPCODE ?
1670                 _mesa_opcode_string(inst->opcode) :
1671                 "unknown");
1672       }
1673
1674       for (i = 0; i < 4; i++)
1675         if (inst->dst[i] && inst->dst[i]->spill_slot)
1676            emit_spill(c,
1677                       inst->dst[i]->hw_reg,
1678                       inst->dst[i]->spill_slot);
1679    }
1680
1681    if (INTEL_DEBUG & DEBUG_WM) {
1682       int i;
1683
1684       printf("wm-native:\n");
1685       for (i = 0; i < p->nr_insn; i++)
1686          brw_disasm(stderr, &p->store[i]);
1687       printf("\n");
1688    }
1689 }