src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 /* Not quite sure how correct this is - need to understand horiz
  38  * vs. vertical strides a little better.
  39  */
  40 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  41 {
  42    if (reg.vstride)
  43       reg.nr++;
  44    return reg;
  45 }
  46
  47
  48 /* Payload R0:
  49  *
  50  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  51  *         corresponding to each of the 16 execution channels.
  52  * R0.1..8 -- ?
  53  * R1.0 -- triangle vertex 0.X
  54  * R1.1 -- triangle vertex 0.Y
  55  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  56  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  57  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  58  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  59  * R1.6 -- ?
  60  * R1.7 -- ?
  61  * R1.8 -- ?
  62  */
  63
  64 void emit_pixel_xy(struct brw_wm_compile *c,
  65                    const struct brw_reg *dst,
  66                    GLuint mask)
  67 {
  68    struct brw_compile *p = &c->func;
  69    struct brw_reg r1 = brw_vec1_grf(1, 0);
  70    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  71    struct brw_reg dst0_uw, dst1_uw;
  72
  73    brw_push_insn_state(p);
  74    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  75
  76    if (c->dispatch_width == 16) {
  77       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
  78       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
  79    } else {
  80       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
  81       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
  82    }
  83
  84    /* Calculate pixel centers by adding 1 or 0 to each of the
  85     * micro-tile coordinates passed in r1.
  86     */
  87    if (mask & WRITEMASK_X) {
  88       brw_ADD(p,
  89               dst0_uw,
  90               stride(suboffset(r1_uw, 4), 2, 4, 0),
  91               brw_imm_v(0x10101010));
  92    }
  93
  94    if (mask & WRITEMASK_Y) {
  95       brw_ADD(p,
  96               dst1_uw,
  97               stride(suboffset(r1_uw,5), 2, 4, 0),
  98               brw_imm_v(0x11001100));
  99    }
 100    brw_pop_insn_state(p);
 101 }
 102
 103
 104 void emit_delta_xy(struct brw_compile *p,
 105                    const struct brw_reg *dst,
 106                    GLuint mask,
 107                    const struct brw_reg *arg0)
 108 {
 109    struct brw_reg r1 = brw_vec1_grf(1, 0);
 110
 111    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 112     * centers.
 113     */
 114    if (mask & WRITEMASK_X) {
 115       brw_ADD(p,
 116               dst[0],
 117               retype(arg0[0], BRW_REGISTER_TYPE_UW),
 118               negate(r1));
 119    }
 120
 121    if (mask & WRITEMASK_Y) {
 122       brw_ADD(p,
 123               dst[1],
 124               retype(arg0[1], BRW_REGISTER_TYPE_UW),
 125               negate(suboffset(r1,1)));
 126
 127    }
 128 }
 129
 130 void emit_wpos_xy(struct brw_wm_compile *c,
 131                   const struct brw_reg *dst,
 132                   GLuint mask,
 133                   const struct brw_reg *arg0)
 134 {
 135    struct brw_compile *p = &c->func;
 136
 137    /* Calculate the pixel offset from window bottom left into destination
 138     * X and Y channels.
 139     */
 140    if (mask & WRITEMASK_X) {
 141       /* X' = X - origin */
 142       brw_ADD(p,
 143               dst[0],
 144               retype(arg0[0], BRW_REGISTER_TYPE_W),
 145               brw_imm_d(0 - c->key.origin_x));
 146    }
 147
 148    if (mask & WRITEMASK_Y) {
 149       /* Y' = height - (Y - origin_y) = height + origin_y - Y */
 150       brw_ADD(p,
 151               dst[1],
 152               negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 153               brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
 154    }
 155 }
 156
 157
 158 void emit_pixel_w(struct brw_wm_compile *c,
 159                   const struct brw_reg *dst,
 160                   GLuint mask,
 161                   const struct brw_reg *arg0,
 162                   const struct brw_reg *deltas)
 163 {
 164    struct brw_compile *p = &c->func;
 165
 166    /* Don't need this if all you are doing is interpolating color, for
 167     * instance.
 168     */
 169    if (mask & WRITEMASK_W) {
 170       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 171
 172       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 173        * result straight into a message reg.
 174        */
 175       brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 176       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 177
 178       /* Calc w */
 179       if (c->dispatch_width == 16) {
 180          brw_math_16(p, dst[3],
 181                      BRW_MATH_FUNCTION_INV,
 182                      BRW_MATH_SATURATE_NONE,
 183                      2, brw_null_reg(),
 184                      BRW_MATH_PRECISION_FULL);
 185       } else {
 186          brw_math(p, dst[3],
 187                   BRW_MATH_FUNCTION_INV,
 188                   BRW_MATH_SATURATE_NONE,
 189                   2, brw_null_reg(),
 190                   BRW_MATH_DATA_VECTOR,
 191                   BRW_MATH_PRECISION_FULL);
 192       }
 193    }
 194 }
 195
 196
 197 void emit_linterp(struct brw_compile *p,
 198                   const struct brw_reg *dst,
 199                   GLuint mask,
 200                   const struct brw_reg *arg0,
 201                   const struct brw_reg *deltas)
 202 {
 203    struct brw_reg interp[4];
 204    GLuint nr = arg0[0].nr;
 205    GLuint i;
 206
 207    interp[0] = brw_vec1_grf(nr, 0);
 208    interp[1] = brw_vec1_grf(nr, 4);
 209    interp[2] = brw_vec1_grf(nr+1, 0);
 210    interp[3] = brw_vec1_grf(nr+1, 4);
 211
 212    for (i = 0; i < 4; i++) {
 213       if (mask & (1<<i)) {
 214          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 215          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 216       }
 217    }
 218 }
 219
 220
 221 void emit_pinterp(struct brw_compile *p,
 222                   const struct brw_reg *dst,
 223                   GLuint mask,
 224                   const struct brw_reg *arg0,
 225                   const struct brw_reg *deltas,
 226                   const struct brw_reg *w)
 227 {
 228    struct brw_reg interp[4];
 229    GLuint nr = arg0[0].nr;
 230    GLuint i;
 231
 232    interp[0] = brw_vec1_grf(nr, 0);
 233    interp[1] = brw_vec1_grf(nr, 4);
 234    interp[2] = brw_vec1_grf(nr+1, 0);
 235    interp[3] = brw_vec1_grf(nr+1, 4);
 236
 237    for (i = 0; i < 4; i++) {
 238       if (mask & (1<<i)) {
 239          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 240          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 241       }
 242    }
 243    for (i = 0; i < 4; i++) {
 244       if (mask & (1<<i)) {
 245          brw_MUL(p, dst[i], dst[i], w[3]);
 246       }
 247    }
 248 }
 249
 250
 251 void emit_cinterp(struct brw_compile *p,
 252                   const struct brw_reg *dst,
 253                   GLuint mask,
 254                   const struct brw_reg *arg0)
 255 {
 256    struct brw_reg interp[4];
 257    GLuint nr = arg0[0].nr;
 258    GLuint i;
 259
 260    interp[0] = brw_vec1_grf(nr, 0);
 261    interp[1] = brw_vec1_grf(nr, 4);
 262    interp[2] = brw_vec1_grf(nr+1, 0);
 263    interp[3] = brw_vec1_grf(nr+1, 4);
 264
 265    for (i = 0; i < 4; i++) {
 266       if (mask & (1<<i)) {
 267          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 268       }
 269    }
 270 }
 271
 272 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 273 void emit_frontfacing(struct brw_compile *p,
 274                       const struct brw_reg *dst,
 275                       GLuint mask)
 276 {
 277    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 278    GLuint i;
 279
 280    if (!(mask & WRITEMASK_XYZW))
 281       return;
 282
 283    for (i = 0; i < 4; i++) {
 284       if (mask & (1<<i)) {
 285          brw_MOV(p, dst[i], brw_imm_f(0.0));
 286       }
 287    }
 288
 289    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 290     * us front face
 291     */
 292    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 293    for (i = 0; i < 4; i++) {
 294       if (mask & (1<<i)) {
 295          brw_MOV(p, dst[i], brw_imm_f(1.0));
 296       }
 297    }
 298    brw_set_predicate_control_flag_value(p, 0xff);
 299 }
 300
 301 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 302  * looking like:
 303  *
 304  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 305  *
 306  * and we're trying to produce:
 307  *
 308  *           DDX                     DDY
 309  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 310  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 311  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 312  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 313  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 314  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 315  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 316  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 317  *
 318  * and add another set of two more subspans if in 16-pixel dispatch mode.
 319  *
 320  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 321  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 322  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 323  * between each other.  We could probably do it like ddx and swizzle the right
 324  * order later, but bail for now and just produce
 325  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 326  */
 327 void emit_ddxy(struct brw_compile *p,
 328                const struct brw_reg *dst,
 329                GLuint mask,
 330                GLboolean is_ddx,
 331                const struct brw_reg *arg0)
 332 {
 333    int i;
 334    struct brw_reg src0, src1;
 335
 336    if (mask & SATURATE)
 337       brw_set_saturate(p, 1);
 338    for (i = 0; i < 4; i++ ) {
 339       if (mask & (1<<i)) {
 340          if (is_ddx) {
 341             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 342                            BRW_REGISTER_TYPE_F,
 343                            BRW_VERTICAL_STRIDE_2,
 344                            BRW_WIDTH_2,
 345                            BRW_HORIZONTAL_STRIDE_0,
 346                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 347             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 348                            BRW_REGISTER_TYPE_F,
 349                            BRW_VERTICAL_STRIDE_2,
 350                            BRW_WIDTH_2,
 351                            BRW_HORIZONTAL_STRIDE_0,
 352                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 353          } else {
 354             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 355                            BRW_REGISTER_TYPE_F,
 356                            BRW_VERTICAL_STRIDE_4,
 357                            BRW_WIDTH_4,
 358                            BRW_HORIZONTAL_STRIDE_0,
 359                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 360             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 361                            BRW_REGISTER_TYPE_F,
 362                            BRW_VERTICAL_STRIDE_4,
 363                            BRW_WIDTH_4,
 364                            BRW_HORIZONTAL_STRIDE_0,
 365                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 366          }
 367          brw_ADD(p, dst[i], src0, negate(src1));
 368       }
 369    }
 370    if (mask & SATURATE)
 371       brw_set_saturate(p, 0);
 372 }
 373
 374 void emit_alu1(struct brw_compile *p,
 375                struct brw_instruction *(*func)(struct brw_compile *,
 376                                                struct brw_reg,
 377                                                struct brw_reg),
 378                const struct brw_reg *dst,
 379                GLuint mask,
 380                const struct brw_reg *arg0)
 381 {
 382    GLuint i;
 383
 384    if (mask & SATURATE)
 385       brw_set_saturate(p, 1);
 386
 387    for (i = 0; i < 4; i++) {
 388       if (mask & (1<<i)) {
 389          func(p, dst[i], arg0[i]);
 390       }
 391    }
 392
 393    if (mask & SATURATE)
 394       brw_set_saturate(p, 0);
 395 }
 396
 397
 398 void emit_alu2(struct brw_compile *p,
 399                struct brw_instruction *(*func)(struct brw_compile *,
 400                                                struct brw_reg,
 401                                                struct brw_reg,
 402                                                struct brw_reg),
 403                const struct brw_reg *dst,
 404                GLuint mask,
 405                const struct brw_reg *arg0,
 406                const struct brw_reg *arg1)
 407 {
 408    GLuint i;
 409
 410    if (mask & SATURATE)
 411       brw_set_saturate(p, 1);
 412
 413    for (i = 0; i < 4; i++) {
 414       if (mask & (1<<i)) {
 415          func(p, dst[i], arg0[i], arg1[i]);
 416       }
 417    }
 418
 419    if (mask & SATURATE)
 420       brw_set_saturate(p, 0);
 421 }
 422
 423
 424 void emit_mad(struct brw_compile *p,
 425               const struct brw_reg *dst,
 426               GLuint mask,
 427               const struct brw_reg *arg0,
 428               const struct brw_reg *arg1,
 429               const struct brw_reg *arg2)
 430 {
 431    GLuint i;
 432
 433    for (i = 0; i < 4; i++) {
 434       if (mask & (1<<i)) {
 435          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 436
 437          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 438          brw_ADD(p, dst[i], dst[i], arg2[i]);
 439          brw_set_saturate(p, 0);
 440       }
 441    }
 442 }
 443
 444 void emit_lrp(struct brw_compile *p,
 445               const struct brw_reg *dst,
 446               GLuint mask,
 447               const struct brw_reg *arg0,
 448               const struct brw_reg *arg1,
 449               const struct brw_reg *arg2)
 450 {
 451    GLuint i;
 452
 453    /* Uses dst as a temporary:
 454     */
 455    for (i = 0; i < 4; i++) {
 456       if (mask & (1<<i)) {
 457          /* Can I use the LINE instruction for this?
 458           */
 459          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 460          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 461
 462          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 463          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 464          brw_set_saturate(p, 0);
 465       }
 466    }
 467 }
 468
 469 void emit_sop(struct brw_compile *p,
 470               const struct brw_reg *dst,
 471               GLuint mask,
 472               GLuint cond,
 473               const struct brw_reg *arg0,
 474               const struct brw_reg *arg1)
 475 {
 476    GLuint i;
 477
 478    for (i = 0; i < 4; i++) {
 479       if (mask & (1<<i)) {
 480          brw_push_insn_state(p);
 481          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 482          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 483          brw_MOV(p, dst[i], brw_imm_f(0));
 484          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 485          brw_MOV(p, dst[i], brw_imm_f(1.0));
 486          brw_pop_insn_state(p);
 487       }
 488    }
 489 }
 490
 491 static void emit_slt( struct brw_compile *p,
 492                       const struct brw_reg *dst,
 493                       GLuint mask,
 494                       const struct brw_reg *arg0,
 495                       const struct brw_reg *arg1 )
 496 {
 497    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 498 }
 499
 500 static void emit_sle( struct brw_compile *p,
 501                       const struct brw_reg *dst,
 502                       GLuint mask,
 503                       const struct brw_reg *arg0,
 504                       const struct brw_reg *arg1 )
 505 {
 506    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 507 }
 508
 509 static void emit_sgt( struct brw_compile *p,
 510                       const struct brw_reg *dst,
 511                       GLuint mask,
 512                       const struct brw_reg *arg0,
 513                       const struct brw_reg *arg1 )
 514 {
 515    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 516 }
 517
 518 static void emit_sge( struct brw_compile *p,
 519                       const struct brw_reg *dst,
 520                       GLuint mask,
 521                       const struct brw_reg *arg0,
 522                       const struct brw_reg *arg1 )
 523 {
 524    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 525 }
 526
 527 static void emit_seq( struct brw_compile *p,
 528                       const struct brw_reg *dst,
 529                       GLuint mask,
 530                       const struct brw_reg *arg0,
 531                       const struct brw_reg *arg1 )
 532 {
 533    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 534 }
 535
 536 static void emit_sne( struct brw_compile *p,
 537                       const struct brw_reg *dst,
 538                       GLuint mask,
 539                       const struct brw_reg *arg0,
 540                       const struct brw_reg *arg1 )
 541 {
 542    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 543 }
 544
 545 static void emit_cmp( struct brw_compile *p,
 546                       const struct brw_reg *dst,
 547                       GLuint mask,
 548                       const struct brw_reg *arg0,
 549                       const struct brw_reg *arg1,
 550                       const struct brw_reg *arg2 )
 551 {
 552    GLuint i;
 553
 554    for (i = 0; i < 4; i++) {
 555       if (mask & (1<<i)) {
 556          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 557          brw_MOV(p, dst[i], arg2[i]);
 558          brw_set_saturate(p, 0);
 559
 560          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 561
 562          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 563          brw_MOV(p, dst[i], arg1[i]);
 564          brw_set_saturate(p, 0);
 565          brw_set_predicate_control_flag_value(p, 0xff);
 566       }
 567    }
 568 }
 569
 570 void emit_max(struct brw_compile *p,
 571               const struct brw_reg *dst,
 572               GLuint mask,
 573               const struct brw_reg *arg0,
 574               const struct brw_reg *arg1)
 575 {
 576    GLuint i;
 577
 578    for (i = 0; i < 4; i++) {
 579       if (mask & (1<<i)) {
 580          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 581          brw_MOV(p, dst[i], arg0[i]);
 582          brw_set_saturate(p, 0);
 583
 584          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 585
 586          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 587          brw_MOV(p, dst[i], arg1[i]);
 588          brw_set_saturate(p, 0);
 589          brw_set_predicate_control_flag_value(p, 0xff);
 590       }
 591    }
 592 }
 593
 594 void emit_min(struct brw_compile *p,
 595               const struct brw_reg *dst,
 596               GLuint mask,
 597               const struct brw_reg *arg0,
 598               const struct brw_reg *arg1)
 599 {
 600    GLuint i;
 601
 602    for (i = 0; i < 4; i++) {
 603       if (mask & (1<<i)) {
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MOV(p, dst[i], arg1[i]);
 606          brw_set_saturate(p, 0);
 607
 608          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 609
 610          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 611          brw_MOV(p, dst[i], arg0[i]);
 612          brw_set_saturate(p, 0);
 613          brw_set_predicate_control_flag_value(p, 0xff);
 614       }
 615    }
 616 }
 617
 618
 619 void emit_dp3(struct brw_compile *p,
 620               const struct brw_reg *dst,
 621               GLuint mask,
 622               const struct brw_reg *arg0,
 623               const struct brw_reg *arg1)
 624 {
 625    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 626
 627    if (!(mask & WRITEMASK_XYZW))
 628       return; /* Do not emit dead code */
 629
 630    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 631
 632    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 633    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 634
 635    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 636    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 637    brw_set_saturate(p, 0);
 638 }
 639
 640
 641 void emit_dp4(struct brw_compile *p,
 642               const struct brw_reg *dst,
 643               GLuint mask,
 644               const struct brw_reg *arg0,
 645               const struct brw_reg *arg1)
 646 {
 647    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 648
 649    if (!(mask & WRITEMASK_XYZW))
 650       return; /* Do not emit dead code */
 651
 652    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 653
 654    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 655    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 656    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 657
 658    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 659    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 660    brw_set_saturate(p, 0);
 661 }
 662
 663
 664 void emit_dph(struct brw_compile *p,
 665               const struct brw_reg *dst,
 666               GLuint mask,
 667               const struct brw_reg *arg0,
 668               const struct brw_reg *arg1)
 669 {
 670    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 671
 672    if (!(mask & WRITEMASK_XYZW))
 673       return; /* Do not emit dead code */
 674
 675    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 676
 677    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 678    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 679    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 680
 681    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 682    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 683    brw_set_saturate(p, 0);
 684 }
 685
 686
 687 void emit_xpd(struct brw_compile *p,
 688               const struct brw_reg *dst,
 689               GLuint mask,
 690               const struct brw_reg *arg0,
 691               const struct brw_reg *arg1)
 692 {
 693    GLuint i;
 694
 695    assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
 696
 697    for (i = 0 ; i < 3; i++) {
 698       if (mask & (1<<i)) {
 699          GLuint i2 = (i+2)%3;
 700          GLuint i1 = (i+1)%3;
 701
 702          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 703
 704          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 705          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 706          brw_set_saturate(p, 0);
 707       }
 708    }
 709 }
 710
 711
 712 void emit_math1(struct brw_wm_compile *c,
 713                 GLuint function,
 714                 const struct brw_reg *dst,
 715                 GLuint mask,
 716                 const struct brw_reg *arg0)
 717 {
 718    struct brw_compile *p = &c->func;
 719    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 720    GLuint saturate = ((mask & SATURATE) ?
 721                       BRW_MATH_SATURATE_SATURATE :
 722                       BRW_MATH_SATURATE_NONE);
 723
 724    if (!(mask & WRITEMASK_XYZW))
 725       return; /* Do not emit dead code */
 726
 727    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 728
 729    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 730     * channels.
 731     */
 732    brw_MOV(p, brw_message_reg(2), arg0[0]);
 733
 734    /* Send two messages to perform all 16 operations:
 735     */
 736    brw_push_insn_state(p);
 737    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 738    brw_math(p,
 739             dst[dst_chan],
 740             function,
 741             saturate,
 742             2,
 743             brw_null_reg(),
 744             BRW_MATH_DATA_VECTOR,
 745             BRW_MATH_PRECISION_FULL);
 746
 747    if (c->dispatch_width == 16) {
 748       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 749       brw_math(p,
 750                offset(dst[dst_chan],1),
 751                function,
 752                saturate,
 753                3,
 754                brw_null_reg(),
 755                BRW_MATH_DATA_VECTOR,
 756                BRW_MATH_PRECISION_FULL);
 757    }
 758    brw_pop_insn_state(p);
 759 }
 760
 761
 762 void emit_math2(struct brw_wm_compile *c,
 763                 GLuint function,
 764                 const struct brw_reg *dst,
 765                 GLuint mask,
 766                 const struct brw_reg *arg0,
 767                 const struct brw_reg *arg1)
 768 {
 769    struct brw_compile *p = &c->func;
 770    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 771    GLuint saturate = ((mask & SATURATE) ?
 772                       BRW_MATH_SATURATE_SATURATE :
 773                       BRW_MATH_SATURATE_NONE);
 774
 775    if (!(mask & WRITEMASK_XYZW))
 776       return; /* Do not emit dead code */
 777
 778    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 779
 780    brw_push_insn_state(p);
 781
 782    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 783    brw_MOV(p, brw_message_reg(2), arg0[0]);
 784    if (c->dispatch_width == 16) {
 785       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 786       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 787    }
 788
 789    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 790    brw_MOV(p, brw_message_reg(3), arg1[0]);
 791    if (c->dispatch_width == 16) {
 792       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 793       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 794    }
 795
 796    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 797    brw_math(p,
 798             dst[dst_chan],
 799             function,
 800             saturate,
 801             2,
 802             brw_null_reg(),
 803             BRW_MATH_DATA_VECTOR,
 804             BRW_MATH_PRECISION_FULL);
 805
 806    /* Send two messages to perform all 16 operations:
 807     */
 808    if (c->dispatch_width == 16) {
 809       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 810       brw_math(p,
 811                offset(dst[dst_chan],1),
 812                function,
 813                saturate,
 814                4,
 815                brw_null_reg(),
 816                BRW_MATH_DATA_VECTOR,
 817                BRW_MATH_PRECISION_FULL);
 818    }
 819    brw_pop_insn_state(p);
 820 }
 821
 822
 823 void emit_tex(struct brw_wm_compile *c,
 824               struct brw_reg *dst,
 825               GLuint dst_flags,
 826               struct brw_reg *arg,
 827               struct brw_reg depth_payload,
 828               GLuint tex_idx,
 829               GLuint sampler,
 830               GLboolean shadow)
 831 {
 832    struct brw_compile *p = &c->func;
 833    struct intel_context *intel = &p->brw->intel;
 834    struct brw_reg dst_retyped;
 835    GLuint cur_mrf = 2, response_length;
 836    GLuint i, nr_texcoords;
 837    GLuint emit;
 838    GLuint msg_type;
 839    GLuint mrf_per_channel;
 840    GLuint simd_mode;
 841
 842    if (c->dispatch_width == 16) {
 843       mrf_per_channel = 2;
 844       response_length = 8;
 845       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 846       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 847    } else {
 848       mrf_per_channel = 1;
 849       response_length = 4;
 850       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 851       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 852    }
 853
 854    /* How many input regs are there?
 855     */
 856    switch (tex_idx) {
 857    case TEXTURE_1D_INDEX:
 858       emit = WRITEMASK_X;
 859       nr_texcoords = 1;
 860       break;
 861    case TEXTURE_2D_INDEX:
 862    case TEXTURE_RECT_INDEX:
 863       emit = WRITEMASK_XY;
 864       nr_texcoords = 2;
 865       break;
 866    case TEXTURE_3D_INDEX:
 867    case TEXTURE_CUBE_INDEX:
 868       emit = WRITEMASK_XYZ;
 869       nr_texcoords = 3;
 870       break;
 871    default:
 872       /* unexpected target */
 873       abort();
 874    }
 875
 876    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 877    if (!intel->is_ironlake && c->dispatch_width == 8)
 878       nr_texcoords = 3;
 879
 880    /* For shadow comparisons, we have to supply u,v,r. */
 881    if (shadow)
 882       nr_texcoords = 3;
 883
 884    /* Emit the texcoords. */
 885    for (i = 0; i < nr_texcoords; i++) {
 886       if (emit & (1<<i))
 887          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 888       else
 889          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 890       cur_mrf += mrf_per_channel;
 891    }
 892
 893    /* Fill in the shadow comparison reference value. */
 894    if (shadow) {
 895       if (intel->is_ironlake) {
 896          /* Fill in the cube map array index value. */
 897          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 898          cur_mrf += mrf_per_channel;
 899       } else if (c->dispatch_width == 8) {
 900          /* Fill in the LOD bias value. */
 901          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 902          cur_mrf += mrf_per_channel;
 903       }
 904       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 905       cur_mrf += mrf_per_channel;
 906    }
 907
 908    if (intel->is_ironlake) {
 909       if (shadow)
 910          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 911       else
 912          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 913    } else {
 914       /* Note that G45 and older determines shadow compare and dispatch width
 915        * from message length for most messages.
 916        */
 917       if (c->dispatch_width == 16 && shadow)
 918          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 919       else
 920          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 921    }
 922
 923    brw_SAMPLE(p,
 924               dst_retyped,
 925               1,
 926               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 927               SURF_INDEX_TEXTURE(sampler),
 928               sampler,
 929               dst_flags & WRITEMASK_XYZW,
 930               msg_type,
 931               response_length,
 932               cur_mrf - 1,
 933               0,
 934               1,
 935               simd_mode);
 936 }
 937
 938
 939 void emit_txb(struct brw_wm_compile *c,
 940               struct brw_reg *dst,
 941               GLuint dst_flags,
 942               struct brw_reg *arg,
 943               struct brw_reg depth_payload,
 944               GLuint tex_idx,
 945               GLuint sampler)
 946 {
 947    struct brw_compile *p = &c->func;
 948    struct intel_context *intel = &p->brw->intel;
 949    GLuint msgLength;
 950    GLuint msg_type;
 951    GLuint mrf_per_channel;
 952    GLuint response_length;
 953    struct brw_reg dst_retyped;
 954
 955    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
 956     * samples, so we'll use the 16-wide instruction, leave the second halves
 957     * undefined, and trust the execution mask to keep the undefined pixels
 958     * from mattering.
 959     */
 960    if (c->dispatch_width == 16 || !intel->is_ironlake) {
 961       if (intel->is_ironlake)
 962          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 963       else
 964          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 965       mrf_per_channel = 2;
 966       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 967       response_length = 8;
 968    } else {
 969       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 970       mrf_per_channel = 1;
 971       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 972       response_length = 4;
 973    }
 974
 975    /* Shadow ignored for txb. */
 976    switch (tex_idx) {
 977    case TEXTURE_1D_INDEX:
 978       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 979       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
 980       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 981       break;
 982    case TEXTURE_2D_INDEX:
 983    case TEXTURE_RECT_INDEX:
 984       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 985       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 986       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 987       break;
 988    case TEXTURE_3D_INDEX:
 989    case TEXTURE_CUBE_INDEX:
 990       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 991       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 992       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
 993       break;
 994    default:
 995       /* unexpected target */
 996       abort();
 997    }
 998
 999    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1000    msgLength = 2 + 4 * mrf_per_channel - 1;
1001
1002    brw_SAMPLE(p,
1003               dst_retyped,
1004               1,
1005               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1006               SURF_INDEX_TEXTURE(sampler),
1007               sampler,
1008               dst_flags & WRITEMASK_XYZW,
1009               msg_type,
1010               response_length,
1011               msgLength,
1012               0,
1013               1,
1014               BRW_SAMPLER_SIMD_MODE_SIMD16);
1015 }
1016
1017
1018 static void emit_lit(struct brw_wm_compile *c,
1019                      const struct brw_reg *dst,
1020                      GLuint mask,
1021                      const struct brw_reg *arg0)
1022 {
1023    struct brw_compile *p = &c->func;
1024
1025    assert((mask & WRITEMASK_XW) == 0);
1026
1027    if (mask & WRITEMASK_Y) {
1028       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1029       brw_MOV(p, dst[1], arg0[0]);
1030       brw_set_saturate(p, 0);
1031    }
1032
1033    if (mask & WRITEMASK_Z) {
1034       emit_math2(c, BRW_MATH_FUNCTION_POW,
1035                  &dst[2],
1036                  WRITEMASK_X | (mask & SATURATE),
1037                  &arg0[1],
1038                  &arg0[3]);
1039    }
1040
1041    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1042     * some of the POW calculations above, but 16-wide iff statements
1043     * seem to lock c1 hardware, so this is a nasty workaround:
1044     */
1045    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1046    {
1047       if (mask & WRITEMASK_Y)
1048          brw_MOV(p, dst[1], brw_imm_f(0));
1049
1050       if (mask & WRITEMASK_Z)
1051          brw_MOV(p, dst[2], brw_imm_f(0));
1052    }
1053    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1054 }
1055
1056
1057 /* Kill pixel - set execution mask to zero for those pixels which
1058  * fail.
1059  */
1060 static void emit_kil( struct brw_wm_compile *c,
1061                       struct brw_reg *arg0)
1062 {
1063    struct brw_compile *p = &c->func;
1064    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1065    GLuint i;
1066
1067    /* XXX - usually won't need 4 compares!
1068     */
1069    for (i = 0; i < 4; i++) {
1070       brw_push_insn_state(p);
1071       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1072       brw_set_predicate_control_flag_value(p, 0xff);
1073       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1074       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1075       brw_pop_insn_state(p);
1076    }
1077 }
1078
1079 /* KIL_NV kills the pixels that are currently executing, not based on a test
1080  * of the arguments.
1081  */
1082 static void emit_kil_nv( struct brw_wm_compile *c )
1083 {
1084    struct brw_compile *p = &c->func;
1085    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1086
1087    brw_push_insn_state(p);
1088    brw_set_mask_control(p, BRW_MASK_DISABLE);
1089    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1090    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1091    brw_pop_insn_state(p);
1092 }
1093
1094 static void fire_fb_write( struct brw_wm_compile *c,
1095                            GLuint base_reg,
1096                            GLuint nr,
1097                            GLuint target,
1098                            GLuint eot )
1099 {
1100    struct brw_compile *p = &c->func;
1101    struct brw_reg dst;
1102
1103    if (c->dispatch_width == 16)
1104       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1105    else
1106       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1107
1108    /* Pass through control information:
1109     */
1110 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1111    {
1112       brw_push_insn_state(p);
1113       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1114       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1115       brw_MOV(p,
1116                brw_message_reg(base_reg + 1),
1117                brw_vec8_grf(1, 0));
1118       brw_pop_insn_state(p);
1119    }
1120
1121    /* Send framebuffer write message: */
1122 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1123    brw_fb_WRITE(p,
1124                 dst,
1125                 base_reg,
1126                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1127                 target,
1128                 nr,
1129                 0,
1130                 eot);
1131 }
1132
1133
1134 static void emit_aa( struct brw_wm_compile *c,
1135                      struct brw_reg *arg1,
1136                      GLuint reg )
1137 {
1138    struct brw_compile *p = &c->func;
1139    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1140    GLuint off = c->key.aa_dest_stencil_reg % 2;
1141    struct brw_reg aa = offset(arg1[comp], off);
1142
1143    brw_push_insn_state(p);
1144    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1145    brw_MOV(p, brw_message_reg(reg), aa);
1146    brw_pop_insn_state(p);
1147 }
1148
1149
1150 /* Post-fragment-program processing.  Send the results to the
1151  * framebuffer.
1152  * \param arg0  the fragment color
1153  * \param arg1  the pass-through depth value
1154  * \param arg2  the shader-computed depth value
1155  */
1156 void emit_fb_write(struct brw_wm_compile *c,
1157                    struct brw_reg *arg0,
1158                    struct brw_reg *arg1,
1159                    struct brw_reg *arg2,
1160                    GLuint target,
1161                    GLuint eot)
1162 {
1163    struct brw_compile *p = &c->func;
1164    struct brw_context *brw = p->brw;
1165    struct intel_context *intel = &brw->intel;
1166    GLuint nr = 2;
1167    GLuint channel;
1168
1169    /* Reserve a space for AA - may not be needed:
1170     */
1171    if (c->key.aa_dest_stencil_reg)
1172       nr += 1;
1173
1174    /* I don't really understand how this achieves the color interleave
1175     * (ie RGBARGBA) in the result:  [Do the saturation here]
1176     */
1177    brw_push_insn_state(p);
1178
1179    for (channel = 0; channel < 4; channel++) {
1180       if (c->dispatch_width == 16 && (BRW_IS_G4X(brw) || intel->is_ironlake)) {
1181          /* By setting the high bit of the MRF register number, we indicate
1182           * that we want COMPR4 mode - instead of doing the usual destination
1183           * + 1 for the second half we get destination + 4.
1184           */
1185          brw_MOV(p,
1186                  brw_message_reg(nr + channel + (1 << 7)),
1187                  arg0[channel]);
1188       } else {
1189          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1190          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1191          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1192          brw_MOV(p,
1193                  brw_message_reg(nr + channel),
1194                  arg0[channel]);
1195
1196          if (c->dispatch_width == 16) {
1197             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1198             brw_MOV(p,
1199                     brw_message_reg(nr + channel + 4),
1200                     sechalf(arg0[channel]));
1201          }
1202       }
1203    }
1204    /* skip over the regs populated above:
1205     */
1206    nr += 8;
1207    brw_pop_insn_state(p);
1208
1209    if (c->key.source_depth_to_render_target)
1210    {
1211       if (c->key.computes_depth)
1212          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1213       else
1214          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1215
1216       nr += 2;
1217    }
1218
1219    if (c->key.dest_depth_reg)
1220    {
1221       GLuint comp = c->key.dest_depth_reg / 2;
1222       GLuint off = c->key.dest_depth_reg % 2;
1223
1224       if (off != 0) {
1225          brw_push_insn_state(p);
1226          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1227
1228          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1229          /* 2nd half? */
1230          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1231          brw_pop_insn_state(p);
1232       }
1233       else {
1234          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1235       }
1236       nr += 2;
1237    }
1238
1239    if (!c->key.runtime_check_aads_emit) {
1240       if (c->key.aa_dest_stencil_reg)
1241          emit_aa(c, arg1, 2);
1242
1243       fire_fb_write(c, 0, nr, target, eot);
1244    }
1245    else {
1246       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1247       struct brw_reg ip = brw_ip_reg();
1248       struct brw_instruction *jmp;
1249
1250       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1251       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1252       brw_AND(p,
1253               v1_null_ud,
1254               get_element_ud(brw_vec8_grf(1,0), 6),
1255               brw_imm_ud(1<<26));
1256
1257       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1258       {
1259          emit_aa(c, arg1, 2);
1260          fire_fb_write(c, 0, nr, target, eot);
1261          /* note - thread killed in subroutine */
1262       }
1263       brw_land_fwd_jump(p, jmp);
1264
1265       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1266        */
1267       fire_fb_write(c, 1, nr-1, target, eot);
1268    }
1269 }
1270
1271 /**
1272  * Move a GPR to scratch memory.
1273  */
1274 static void emit_spill( struct brw_wm_compile *c,
1275                         struct brw_reg reg,
1276                         GLuint slot )
1277 {
1278    struct brw_compile *p = &c->func;
1279
1280    /*
1281      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1282    */
1283    brw_MOV(p, brw_message_reg(2), reg);
1284
1285    /*
1286      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1287      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1288    */
1289    brw_dp_WRITE_16(p,
1290                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1291                    slot);
1292 }
1293
1294
1295 /**
1296  * Load a GPR from scratch memory.
1297  */
1298 static void emit_unspill( struct brw_wm_compile *c,
1299                           struct brw_reg reg,
1300                           GLuint slot )
1301 {
1302    struct brw_compile *p = &c->func;
1303
1304    /* Slot 0 is the undef value.
1305     */
1306    if (slot == 0) {
1307       brw_MOV(p, reg, brw_imm_f(0));
1308       return;
1309    }
1310
1311    /*
1312      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1313      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1314    */
1315
1316    brw_dp_READ_16(p,
1317                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1318                   slot);
1319 }
1320
1321
1322 /**
1323  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1324  * Args with unspill_reg != 0 will be loaded from scratch memory.
1325  */
1326 static void get_argument_regs( struct brw_wm_compile *c,
1327                                struct brw_wm_ref *arg[],
1328                                struct brw_reg *regs )
1329 {
1330    GLuint i;
1331
1332    for (i = 0; i < 4; i++) {
1333       if (arg[i]) {
1334          if (arg[i]->unspill_reg)
1335             emit_unspill(c,
1336                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1337                          arg[i]->value->spill_slot);
1338
1339          regs[i] = arg[i]->hw_reg;
1340       }
1341       else {
1342          regs[i] = brw_null_reg();
1343       }
1344    }
1345 }
1346
1347
1348 /**
1349  * For values that have a spill_slot!=0, write those regs to scratch memory.
1350  */
1351 static void spill_values( struct brw_wm_compile *c,
1352                           struct brw_wm_value *values,
1353                           GLuint nr )
1354 {
1355    GLuint i;
1356
1357    for (i = 0; i < nr; i++)
1358       if (values[i].spill_slot)
1359          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1360 }
1361
1362
1363 /* Emit the fragment program instructions here.
1364  */
1365 void brw_wm_emit( struct brw_wm_compile *c )
1366 {
1367    struct brw_compile *p = &c->func;
1368    GLuint insn;
1369
1370    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1371
1372    /* Check if any of the payload regs need to be spilled:
1373     */
1374    spill_values(c, c->payload.depth, 4);
1375    spill_values(c, c->creg, c->nr_creg);
1376    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1377
1378
1379    for (insn = 0; insn < c->nr_insns; insn++) {
1380
1381       struct brw_wm_instruction *inst = &c->instruction[insn];
1382       struct brw_reg args[3][4], dst[4];
1383       GLuint i, dst_flags;
1384
1385       /* Get argument regs:
1386        */
1387       for (i = 0; i < 3; i++)
1388          get_argument_regs(c, inst->src[i], args[i]);
1389
1390       /* Get dest regs:
1391        */
1392       for (i = 0; i < 4; i++)
1393          if (inst->dst[i])
1394             dst[i] = inst->dst[i]->hw_reg;
1395          else
1396             dst[i] = brw_null_reg();
1397
1398       /* Flags
1399        */
1400       dst_flags = inst->writemask;
1401       if (inst->saturate)
1402          dst_flags |= SATURATE;
1403
1404       switch (inst->opcode) {
1405          /* Generated instructions for calculating triangle interpolants:
1406           */
1407       case WM_PIXELXY:
1408          emit_pixel_xy(c, dst, dst_flags);
1409          break;
1410
1411       case WM_DELTAXY:
1412          emit_delta_xy(p, dst, dst_flags, args[0]);
1413          break;
1414
1415       case WM_WPOSXY:
1416          emit_wpos_xy(c, dst, dst_flags, args[0]);
1417          break;
1418
1419       case WM_PIXELW:
1420          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1421          break;
1422
1423       case WM_LINTERP:
1424          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1425          break;
1426
1427       case WM_PINTERP:
1428          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1429          break;
1430
1431       case WM_CINTERP:
1432          emit_cinterp(p, dst, dst_flags, args[0]);
1433          break;
1434
1435       case WM_FB_WRITE:
1436          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1437          break;
1438
1439       case WM_FRONTFACING:
1440          emit_frontfacing(p, dst, dst_flags);
1441          break;
1442
1443          /* Straightforward arithmetic:
1444           */
1445       case OPCODE_ADD:
1446          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1447          break;
1448
1449       case OPCODE_FRC:
1450          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1451          break;
1452
1453       case OPCODE_FLR:
1454          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1455          break;
1456
1457       case OPCODE_DDX:
1458          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1459          break;
1460
1461       case OPCODE_DDY:
1462          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1463          break;
1464
1465       case OPCODE_DP3:
1466          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1467          break;
1468
1469       case OPCODE_DP4:
1470          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1471          break;
1472
1473       case OPCODE_DPH:
1474          emit_dph(p, dst, dst_flags, args[0], args[1]);
1475          break;
1476
1477       case OPCODE_TRUNC:
1478          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1479          break;
1480
1481       case OPCODE_LRP:
1482          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1483          break;
1484
1485       case OPCODE_MAD:
1486          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1487          break;
1488
1489       case OPCODE_MOV:
1490       case OPCODE_SWZ:
1491          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1492          break;
1493
1494       case OPCODE_MUL:
1495          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1496          break;
1497
1498       case OPCODE_XPD:
1499          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1500          break;
1501
1502          /* Higher math functions:
1503           */
1504       case OPCODE_RCP:
1505          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1506          break;
1507
1508       case OPCODE_RSQ:
1509          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1510          break;
1511
1512       case OPCODE_SIN:
1513          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1514          break;
1515
1516       case OPCODE_COS:
1517          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1518          break;
1519
1520       case OPCODE_EX2:
1521          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1522          break;
1523
1524       case OPCODE_LG2:
1525          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1526          break;
1527
1528       case OPCODE_SCS:
1529          /* There is an scs math function, but it would need some
1530           * fixup for 16-element execution.
1531           */
1532          if (dst_flags & WRITEMASK_X)
1533             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1534          if (dst_flags & WRITEMASK_Y)
1535             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1536          break;
1537
1538       case OPCODE_POW:
1539          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1540          break;
1541
1542          /* Comparisons:
1543           */
1544       case OPCODE_CMP:
1545          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1546          break;
1547
1548       case OPCODE_MAX:
1549          emit_max(p, dst, dst_flags, args[0], args[1]);
1550          break;
1551
1552       case OPCODE_MIN:
1553          emit_min(p, dst, dst_flags, args[0], args[1]);
1554          break;
1555
1556       case OPCODE_SLT:
1557          emit_slt(p, dst, dst_flags, args[0], args[1]);
1558          break;
1559
1560       case OPCODE_SLE:
1561          emit_sle(p, dst, dst_flags, args[0], args[1]);
1562         break;
1563       case OPCODE_SGT:
1564          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1565         break;
1566       case OPCODE_SGE:
1567          emit_sge(p, dst, dst_flags, args[0], args[1]);
1568          break;
1569       case OPCODE_SEQ:
1570          emit_seq(p, dst, dst_flags, args[0], args[1]);
1571         break;
1572       case OPCODE_SNE:
1573          emit_sne(p, dst, dst_flags, args[0], args[1]);
1574         break;
1575
1576       case OPCODE_LIT:
1577          emit_lit(c, dst, dst_flags, args[0]);
1578          break;
1579
1580          /* Texturing operations:
1581           */
1582       case OPCODE_TEX:
1583          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1584                   inst->tex_idx, inst->tex_unit,
1585                   inst->tex_shadow);
1586          break;
1587
1588       case OPCODE_TXB:
1589          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1590                   inst->tex_idx, inst->tex_unit);
1591          break;
1592
1593       case OPCODE_KIL:
1594          emit_kil(c, args[0]);
1595          break;
1596
1597       case OPCODE_KIL_NV:
1598          emit_kil_nv(c);
1599          break;
1600
1601       default:
1602          _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
1603                       inst->opcode, inst->opcode < MAX_OPCODE ?
1604                                     _mesa_opcode_string(inst->opcode) :
1605                                     "unknown");
1606       }
1607
1608       for (i = 0; i < 4; i++)
1609         if (inst->dst[i] && inst->dst[i]->spill_slot)
1610            emit_spill(c,
1611                       inst->dst[i]->hw_reg,
1612                       inst->dst[i]->spill_slot);
1613    }
1614
1615    if (INTEL_DEBUG & DEBUG_WM) {
1616       int i;
1617
1618       _mesa_printf("wm-native:\n");
1619       for (i = 0; i < p->nr_insn; i++)
1620          brw_disasm(stderr, &p->store[i]);
1621       _mesa_printf("\n");
1622    }
1623 }