src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 /* Not quite sure how correct this is - need to understand horiz
  38  * vs. vertical strides a little better.
  39  */
  40 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  41 {
  42    if (reg.vstride)
  43       reg.nr++;
  44    return reg;
  45 }
  46
  47
  48 /* Payload R0:
  49  *
  50  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  51  *         corresponding to each of the 16 execution channels.
  52  * R0.1..8 -- ?
  53  * R1.0 -- triangle vertex 0.X
  54  * R1.1 -- triangle vertex 0.Y
  55  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  56  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  57  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  58  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  59  * R1.6 -- ?
  60  * R1.7 -- ?
  61  * R1.8 -- ?
  62  */
  63
  64 void emit_pixel_xy(struct brw_wm_compile *c,
  65                    const struct brw_reg *dst,
  66                    GLuint mask)
  67 {
  68    struct brw_compile *p = &c->func;
  69    struct brw_reg r1 = brw_vec1_grf(1, 0);
  70    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  71    struct brw_reg dst0_uw, dst1_uw;
  72
  73    brw_push_insn_state(p);
  74    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  75
  76    if (c->dispatch_width == 16) {
  77       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
  78       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
  79    } else {
  80       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
  81       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
  82    }
  83
  84    /* Calculate pixel centers by adding 1 or 0 to each of the
  85     * micro-tile coordinates passed in r1.
  86     */
  87    if (mask & WRITEMASK_X) {
  88       brw_ADD(p,
  89               dst0_uw,
  90               stride(suboffset(r1_uw, 4), 2, 4, 0),
  91               brw_imm_v(0x10101010));
  92    }
  93
  94    if (mask & WRITEMASK_Y) {
  95       brw_ADD(p,
  96               dst1_uw,
  97               stride(suboffset(r1_uw,5), 2, 4, 0),
  98               brw_imm_v(0x11001100));
  99    }
 100    brw_pop_insn_state(p);
 101 }
 102
 103
 104 void emit_delta_xy(struct brw_compile *p,
 105                    const struct brw_reg *dst,
 106                    GLuint mask,
 107                    const struct brw_reg *arg0)
 108 {
 109    struct brw_reg r1 = brw_vec1_grf(1, 0);
 110
 111    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 112     * centers.
 113     */
 114    if (mask & WRITEMASK_X) {
 115       brw_ADD(p,
 116               dst[0],
 117               retype(arg0[0], BRW_REGISTER_TYPE_UW),
 118               negate(r1));
 119    }
 120
 121    if (mask & WRITEMASK_Y) {
 122       brw_ADD(p,
 123               dst[1],
 124               retype(arg0[1], BRW_REGISTER_TYPE_UW),
 125               negate(suboffset(r1,1)));
 126
 127    }
 128 }
 129
 130 void emit_wpos_xy(struct brw_wm_compile *c,
 131                   const struct brw_reg *dst,
 132                   GLuint mask,
 133                   const struct brw_reg *arg0)
 134 {
 135    struct brw_compile *p = &c->func;
 136
 137    /* Calculate the pixel offset from window bottom left into destination
 138     * X and Y channels.
 139     */
 140    if (mask & WRITEMASK_X) {
 141       /* X' = X - origin */
 142       brw_ADD(p,
 143               dst[0],
 144               retype(arg0[0], BRW_REGISTER_TYPE_W),
 145               brw_imm_d(0 - c->key.origin_x));
 146    }
 147
 148    if (mask & WRITEMASK_Y) {
 149       /* Y' = height - (Y - origin_y) = height + origin_y - Y */
 150       brw_ADD(p,
 151               dst[1],
 152               negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 153               brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
 154    }
 155 }
 156
 157
 158 void emit_pixel_w(struct brw_wm_compile *c,
 159                   const struct brw_reg *dst,
 160                   GLuint mask,
 161                   const struct brw_reg *arg0,
 162                   const struct brw_reg *deltas)
 163 {
 164    struct brw_compile *p = &c->func;
 165
 166    /* Don't need this if all you are doing is interpolating color, for
 167     * instance.
 168     */
 169    if (mask & WRITEMASK_W) {
 170       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 171
 172       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 173        * result straight into a message reg.
 174        */
 175       brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 176       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 177
 178       /* Calc w */
 179       if (c->dispatch_width == 16) {
 180          brw_math_16(p, dst[3],
 181                      BRW_MATH_FUNCTION_INV,
 182                      BRW_MATH_SATURATE_NONE,
 183                      2, brw_null_reg(),
 184                      BRW_MATH_PRECISION_FULL);
 185       } else {
 186          brw_math(p, dst[3],
 187                   BRW_MATH_FUNCTION_INV,
 188                   BRW_MATH_SATURATE_NONE,
 189                   2, brw_null_reg(),
 190                   BRW_MATH_DATA_VECTOR,
 191                   BRW_MATH_PRECISION_FULL);
 192       }
 193    }
 194 }
 195
 196
 197 void emit_linterp(struct brw_compile *p,
 198                   const struct brw_reg *dst,
 199                   GLuint mask,
 200                   const struct brw_reg *arg0,
 201                   const struct brw_reg *deltas)
 202 {
 203    struct brw_reg interp[4];
 204    GLuint nr = arg0[0].nr;
 205    GLuint i;
 206
 207    interp[0] = brw_vec1_grf(nr, 0);
 208    interp[1] = brw_vec1_grf(nr, 4);
 209    interp[2] = brw_vec1_grf(nr+1, 0);
 210    interp[3] = brw_vec1_grf(nr+1, 4);
 211
 212    for (i = 0; i < 4; i++) {
 213       if (mask & (1<<i)) {
 214          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 215          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 216       }
 217    }
 218 }
 219
 220
 221 void emit_pinterp(struct brw_compile *p,
 222                   const struct brw_reg *dst,
 223                   GLuint mask,
 224                   const struct brw_reg *arg0,
 225                   const struct brw_reg *deltas,
 226                   const struct brw_reg *w)
 227 {
 228    struct brw_reg interp[4];
 229    GLuint nr = arg0[0].nr;
 230    GLuint i;
 231
 232    interp[0] = brw_vec1_grf(nr, 0);
 233    interp[1] = brw_vec1_grf(nr, 4);
 234    interp[2] = brw_vec1_grf(nr+1, 0);
 235    interp[3] = brw_vec1_grf(nr+1, 4);
 236
 237    for (i = 0; i < 4; i++) {
 238       if (mask & (1<<i)) {
 239          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 240          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 241       }
 242    }
 243    for (i = 0; i < 4; i++) {
 244       if (mask & (1<<i)) {
 245          brw_MUL(p, dst[i], dst[i], w[3]);
 246       }
 247    }
 248 }
 249
 250
 251 void emit_cinterp(struct brw_compile *p,
 252                   const struct brw_reg *dst,
 253                   GLuint mask,
 254                   const struct brw_reg *arg0)
 255 {
 256    struct brw_reg interp[4];
 257    GLuint nr = arg0[0].nr;
 258    GLuint i;
 259
 260    interp[0] = brw_vec1_grf(nr, 0);
 261    interp[1] = brw_vec1_grf(nr, 4);
 262    interp[2] = brw_vec1_grf(nr+1, 0);
 263    interp[3] = brw_vec1_grf(nr+1, 4);
 264
 265    for (i = 0; i < 4; i++) {
 266       if (mask & (1<<i)) {
 267          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 268       }
 269    }
 270 }
 271
 272 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 273 void emit_frontfacing(struct brw_compile *p,
 274                       const struct brw_reg *dst,
 275                       GLuint mask)
 276 {
 277    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 278    GLuint i;
 279
 280    if (!(mask & WRITEMASK_XYZW))
 281       return;
 282
 283    for (i = 0; i < 4; i++) {
 284       if (mask & (1<<i)) {
 285          brw_MOV(p, dst[i], brw_imm_f(0.0));
 286       }
 287    }
 288
 289    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 290     * us front face
 291     */
 292    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 293    for (i = 0; i < 4; i++) {
 294       if (mask & (1<<i)) {
 295          brw_MOV(p, dst[i], brw_imm_f(1.0));
 296       }
 297    }
 298    brw_set_predicate_control_flag_value(p, 0xff);
 299 }
 300
 301 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 302  * looking like:
 303  *
 304  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 305  *
 306  * and we're trying to produce:
 307  *
 308  *           DDX                     DDY
 309  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 310  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 311  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 312  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 313  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 314  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 315  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 316  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 317  *
 318  * and add another set of two more subspans if in 16-pixel dispatch mode.
 319  *
 320  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 321  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 322  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 323  * between each other.  We could probably do it like ddx and swizzle the right
 324  * order later, but bail for now and just produce
 325  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 326  */
 327 void emit_ddxy(struct brw_compile *p,
 328                const struct brw_reg *dst,
 329                GLuint mask,
 330                GLboolean is_ddx,
 331                const struct brw_reg *arg0)
 332 {
 333    int i;
 334    struct brw_reg src0, src1;
 335
 336    if (mask & SATURATE)
 337       brw_set_saturate(p, 1);
 338    for (i = 0; i < 4; i++ ) {
 339       if (mask & (1<<i)) {
 340          if (is_ddx) {
 341             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 342                            BRW_REGISTER_TYPE_F,
 343                            BRW_VERTICAL_STRIDE_2,
 344                            BRW_WIDTH_2,
 345                            BRW_HORIZONTAL_STRIDE_0,
 346                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 347             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 348                            BRW_REGISTER_TYPE_F,
 349                            BRW_VERTICAL_STRIDE_2,
 350                            BRW_WIDTH_2,
 351                            BRW_HORIZONTAL_STRIDE_0,
 352                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 353          } else {
 354             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 355                            BRW_REGISTER_TYPE_F,
 356                            BRW_VERTICAL_STRIDE_4,
 357                            BRW_WIDTH_4,
 358                            BRW_HORIZONTAL_STRIDE_0,
 359                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 360             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 361                            BRW_REGISTER_TYPE_F,
 362                            BRW_VERTICAL_STRIDE_4,
 363                            BRW_WIDTH_4,
 364                            BRW_HORIZONTAL_STRIDE_0,
 365                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 366          }
 367          brw_ADD(p, dst[i], src0, negate(src1));
 368       }
 369    }
 370    if (mask & SATURATE)
 371       brw_set_saturate(p, 0);
 372 }
 373
 374 void emit_alu1(struct brw_compile *p,
 375                struct brw_instruction *(*func)(struct brw_compile *,
 376                                                struct brw_reg,
 377                                                struct brw_reg),
 378                const struct brw_reg *dst,
 379                GLuint mask,
 380                const struct brw_reg *arg0)
 381 {
 382    GLuint i;
 383
 384    if (mask & SATURATE)
 385       brw_set_saturate(p, 1);
 386
 387    for (i = 0; i < 4; i++) {
 388       if (mask & (1<<i)) {
 389          func(p, dst[i], arg0[i]);
 390       }
 391    }
 392
 393    if (mask & SATURATE)
 394       brw_set_saturate(p, 0);
 395 }
 396
 397
 398 void emit_alu2(struct brw_compile *p,
 399                struct brw_instruction *(*func)(struct brw_compile *,
 400                                                struct brw_reg,
 401                                                struct brw_reg,
 402                                                struct brw_reg),
 403                const struct brw_reg *dst,
 404                GLuint mask,
 405                const struct brw_reg *arg0,
 406                const struct brw_reg *arg1)
 407 {
 408    GLuint i;
 409
 410    if (mask & SATURATE)
 411       brw_set_saturate(p, 1);
 412
 413    for (i = 0; i < 4; i++) {
 414       if (mask & (1<<i)) {
 415          func(p, dst[i], arg0[i], arg1[i]);
 416       }
 417    }
 418
 419    if (mask & SATURATE)
 420       brw_set_saturate(p, 0);
 421 }
 422
 423
 424 void emit_mad(struct brw_compile *p,
 425               const struct brw_reg *dst,
 426               GLuint mask,
 427               const struct brw_reg *arg0,
 428               const struct brw_reg *arg1,
 429               const struct brw_reg *arg2)
 430 {
 431    GLuint i;
 432
 433    for (i = 0; i < 4; i++) {
 434       if (mask & (1<<i)) {
 435          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 436
 437          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 438          brw_ADD(p, dst[i], dst[i], arg2[i]);
 439          brw_set_saturate(p, 0);
 440       }
 441    }
 442 }
 443
 444 void emit_lrp(struct brw_compile *p,
 445               const struct brw_reg *dst,
 446               GLuint mask,
 447               const struct brw_reg *arg0,
 448               const struct brw_reg *arg1,
 449               const struct brw_reg *arg2)
 450 {
 451    GLuint i;
 452
 453    /* Uses dst as a temporary:
 454     */
 455    for (i = 0; i < 4; i++) {
 456       if (mask & (1<<i)) {
 457          /* Can I use the LINE instruction for this?
 458           */
 459          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 460          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 461
 462          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 463          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 464          brw_set_saturate(p, 0);
 465       }
 466    }
 467 }
 468
 469 void emit_sop(struct brw_compile *p,
 470               const struct brw_reg *dst,
 471               GLuint mask,
 472               GLuint cond,
 473               const struct brw_reg *arg0,
 474               const struct brw_reg *arg1)
 475 {
 476    GLuint i;
 477
 478    for (i = 0; i < 4; i++) {
 479       if (mask & (1<<i)) {
 480          brw_push_insn_state(p);
 481          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 482          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 483          brw_MOV(p, dst[i], brw_imm_f(0));
 484          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 485          brw_MOV(p, dst[i], brw_imm_f(1.0));
 486          brw_pop_insn_state(p);
 487       }
 488    }
 489 }
 490
 491 static void emit_slt( struct brw_compile *p,
 492                       const struct brw_reg *dst,
 493                       GLuint mask,
 494                       const struct brw_reg *arg0,
 495                       const struct brw_reg *arg1 )
 496 {
 497    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 498 }
 499
 500 static void emit_sle( struct brw_compile *p,
 501                       const struct brw_reg *dst,
 502                       GLuint mask,
 503                       const struct brw_reg *arg0,
 504                       const struct brw_reg *arg1 )
 505 {
 506    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 507 }
 508
 509 static void emit_sgt( struct brw_compile *p,
 510                       const struct brw_reg *dst,
 511                       GLuint mask,
 512                       const struct brw_reg *arg0,
 513                       const struct brw_reg *arg1 )
 514 {
 515    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 516 }
 517
 518 static void emit_sge( struct brw_compile *p,
 519                       const struct brw_reg *dst,
 520                       GLuint mask,
 521                       const struct brw_reg *arg0,
 522                       const struct brw_reg *arg1 )
 523 {
 524    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 525 }
 526
 527 static void emit_seq( struct brw_compile *p,
 528                       const struct brw_reg *dst,
 529                       GLuint mask,
 530                       const struct brw_reg *arg0,
 531                       const struct brw_reg *arg1 )
 532 {
 533    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 534 }
 535
 536 static void emit_sne( struct brw_compile *p,
 537                       const struct brw_reg *dst,
 538                       GLuint mask,
 539                       const struct brw_reg *arg0,
 540                       const struct brw_reg *arg1 )
 541 {
 542    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 543 }
 544
 545 static void emit_cmp( struct brw_compile *p,
 546                       const struct brw_reg *dst,
 547                       GLuint mask,
 548                       const struct brw_reg *arg0,
 549                       const struct brw_reg *arg1,
 550                       const struct brw_reg *arg2 )
 551 {
 552    GLuint i;
 553
 554    for (i = 0; i < 4; i++) {
 555       if (mask & (1<<i)) {
 556          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 557          brw_MOV(p, dst[i], arg2[i]);
 558          brw_set_saturate(p, 0);
 559
 560          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 561
 562          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 563          brw_MOV(p, dst[i], arg1[i]);
 564          brw_set_saturate(p, 0);
 565          brw_set_predicate_control_flag_value(p, 0xff);
 566       }
 567    }
 568 }
 569
 570 void emit_max(struct brw_compile *p,
 571               const struct brw_reg *dst,
 572               GLuint mask,
 573               const struct brw_reg *arg0,
 574               const struct brw_reg *arg1)
 575 {
 576    GLuint i;
 577
 578    for (i = 0; i < 4; i++) {
 579       if (mask & (1<<i)) {
 580          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 581          brw_MOV(p, dst[i], arg0[i]);
 582          brw_set_saturate(p, 0);
 583
 584          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 585
 586          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 587          brw_MOV(p, dst[i], arg1[i]);
 588          brw_set_saturate(p, 0);
 589          brw_set_predicate_control_flag_value(p, 0xff);
 590       }
 591    }
 592 }
 593
 594 void emit_min(struct brw_compile *p,
 595               const struct brw_reg *dst,
 596               GLuint mask,
 597               const struct brw_reg *arg0,
 598               const struct brw_reg *arg1)
 599 {
 600    GLuint i;
 601
 602    for (i = 0; i < 4; i++) {
 603       if (mask & (1<<i)) {
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MOV(p, dst[i], arg1[i]);
 606          brw_set_saturate(p, 0);
 607
 608          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 609
 610          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 611          brw_MOV(p, dst[i], arg0[i]);
 612          brw_set_saturate(p, 0);
 613          brw_set_predicate_control_flag_value(p, 0xff);
 614       }
 615    }
 616 }
 617
 618
 619 void emit_dp3(struct brw_compile *p,
 620               const struct brw_reg *dst,
 621               GLuint mask,
 622               const struct brw_reg *arg0,
 623               const struct brw_reg *arg1)
 624 {
 625    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 626
 627    if (!(mask & WRITEMASK_XYZW))
 628       return; /* Do not emit dead code */
 629
 630    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 631
 632    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 633    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 634
 635    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 636    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 637    brw_set_saturate(p, 0);
 638 }
 639
 640
 641 void emit_dp4(struct brw_compile *p,
 642               const struct brw_reg *dst,
 643               GLuint mask,
 644               const struct brw_reg *arg0,
 645               const struct brw_reg *arg1)
 646 {
 647    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 648
 649    if (!(mask & WRITEMASK_XYZW))
 650       return; /* Do not emit dead code */
 651
 652    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 653
 654    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 655    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 656    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 657
 658    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 659    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 660    brw_set_saturate(p, 0);
 661 }
 662
 663
 664 void emit_dph(struct brw_compile *p,
 665               const struct brw_reg *dst,
 666               GLuint mask,
 667               const struct brw_reg *arg0,
 668               const struct brw_reg *arg1)
 669 {
 670    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 671
 672    if (!(mask & WRITEMASK_XYZW))
 673       return; /* Do not emit dead code */
 674
 675    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 676
 677    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 678    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 679    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 680
 681    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 682    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 683    brw_set_saturate(p, 0);
 684 }
 685
 686
 687 void emit_xpd(struct brw_compile *p,
 688               const struct brw_reg *dst,
 689               GLuint mask,
 690               const struct brw_reg *arg0,
 691               const struct brw_reg *arg1)
 692 {
 693    GLuint i;
 694
 695    assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
 696
 697    for (i = 0 ; i < 3; i++) {
 698       if (mask & (1<<i)) {
 699          GLuint i2 = (i+2)%3;
 700          GLuint i1 = (i+1)%3;
 701
 702          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 703
 704          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 705          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 706          brw_set_saturate(p, 0);
 707       }
 708    }
 709 }
 710
 711
 712 void emit_math1(struct brw_wm_compile *c,
 713                 GLuint function,
 714                 const struct brw_reg *dst,
 715                 GLuint mask,
 716                 const struct brw_reg *arg0)
 717 {
 718    struct brw_compile *p = &c->func;
 719    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 720    GLuint saturate = ((mask & SATURATE) ?
 721                       BRW_MATH_SATURATE_SATURATE :
 722                       BRW_MATH_SATURATE_NONE);
 723
 724    if (!(mask & WRITEMASK_XYZW))
 725       return; /* Do not emit dead code */
 726
 727    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 728
 729    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 730     * channels.
 731     */
 732    brw_MOV(p, brw_message_reg(2), arg0[0]);
 733
 734    /* Send two messages to perform all 16 operations:
 735     */
 736    brw_push_insn_state(p);
 737    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 738    brw_math(p,
 739             dst[dst_chan],
 740             function,
 741             saturate,
 742             2,
 743             brw_null_reg(),
 744             BRW_MATH_DATA_VECTOR,
 745             BRW_MATH_PRECISION_FULL);
 746
 747    if (c->dispatch_width == 16) {
 748       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 749       brw_math(p,
 750                offset(dst[dst_chan],1),
 751                function,
 752                saturate,
 753                3,
 754                brw_null_reg(),
 755                BRW_MATH_DATA_VECTOR,
 756                BRW_MATH_PRECISION_FULL);
 757    }
 758    brw_pop_insn_state(p);
 759 }
 760
 761
 762 void emit_math2(struct brw_wm_compile *c,
 763                 GLuint function,
 764                 const struct brw_reg *dst,
 765                 GLuint mask,
 766                 const struct brw_reg *arg0,
 767                 const struct brw_reg *arg1)
 768 {
 769    struct brw_compile *p = &c->func;
 770    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 771    GLuint saturate = ((mask & SATURATE) ?
 772                       BRW_MATH_SATURATE_SATURATE :
 773                       BRW_MATH_SATURATE_NONE);
 774
 775    if (!(mask & WRITEMASK_XYZW))
 776       return; /* Do not emit dead code */
 777
 778    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 779
 780    brw_push_insn_state(p);
 781
 782    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 783    brw_MOV(p, brw_message_reg(2), arg0[0]);
 784    if (c->dispatch_width == 16) {
 785       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 786       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 787    }
 788
 789    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 790    brw_MOV(p, brw_message_reg(3), arg1[0]);
 791    if (c->dispatch_width == 16) {
 792       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 793       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 794    }
 795
 796    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 797    brw_math(p,
 798             dst[dst_chan],
 799             function,
 800             saturate,
 801             2,
 802             brw_null_reg(),
 803             BRW_MATH_DATA_VECTOR,
 804             BRW_MATH_PRECISION_FULL);
 805
 806    /* Send two messages to perform all 16 operations:
 807     */
 808    if (c->dispatch_width == 16) {
 809       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 810       brw_math(p,
 811                offset(dst[dst_chan],1),
 812                function,
 813                saturate,
 814                4,
 815                brw_null_reg(),
 816                BRW_MATH_DATA_VECTOR,
 817                BRW_MATH_PRECISION_FULL);
 818    }
 819    brw_pop_insn_state(p);
 820 }
 821
 822
 823 void emit_tex(struct brw_wm_compile *c,
 824               struct brw_reg *dst,
 825               GLuint dst_flags,
 826               struct brw_reg *arg,
 827               struct brw_reg depth_payload,
 828               GLuint tex_idx,
 829               GLuint sampler,
 830               GLboolean shadow)
 831 {
 832    struct brw_compile *p = &c->func;
 833    struct brw_reg dst_retyped;
 834    GLuint cur_mrf = 2, response_length;
 835    GLuint i, nr_texcoords;
 836    GLuint emit;
 837    GLuint msg_type;
 838    GLuint mrf_per_channel;
 839    GLuint simd_mode;
 840
 841    if (c->dispatch_width == 16) {
 842       mrf_per_channel = 2;
 843       response_length = 8;
 844       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 845       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 846    } else {
 847       mrf_per_channel = 1;
 848       response_length = 4;
 849       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 850       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 851    }
 852
 853    /* How many input regs are there?
 854     */
 855    switch (tex_idx) {
 856    case TEXTURE_1D_INDEX:
 857       emit = WRITEMASK_X;
 858       nr_texcoords = 1;
 859       break;
 860    case TEXTURE_2D_INDEX:
 861    case TEXTURE_RECT_INDEX:
 862       emit = WRITEMASK_XY;
 863       nr_texcoords = 2;
 864       break;
 865    case TEXTURE_3D_INDEX:
 866    case TEXTURE_CUBE_INDEX:
 867       emit = WRITEMASK_XYZ;
 868       nr_texcoords = 3;
 869       break;
 870    default:
 871       /* unexpected target */
 872       abort();
 873    }
 874
 875    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 876    if (!BRW_IS_IGDNG(p->brw) && c->dispatch_width == 8)
 877       nr_texcoords = 3;
 878
 879    /* For shadow comparisons, we have to supply u,v,r. */
 880    if (shadow)
 881       nr_texcoords = 3;
 882
 883    /* Emit the texcoords. */
 884    for (i = 0; i < nr_texcoords; i++) {
 885       if (emit & (1<<i))
 886          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 887       else
 888          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 889       cur_mrf += mrf_per_channel;
 890    }
 891
 892    /* Fill in the shadow comparison reference value. */
 893    if (shadow) {
 894       if (BRW_IS_IGDNG(p->brw)) {
 895          /* Fill in the cube map array index value. */
 896          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 897          cur_mrf += mrf_per_channel;
 898       } else if (c->dispatch_width == 8) {
 899          /* Fill in the LOD bias value. */
 900          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 901          cur_mrf += mrf_per_channel;
 902       }
 903       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 904       cur_mrf += mrf_per_channel;
 905    }
 906
 907    if (BRW_IS_IGDNG(p->brw)) {
 908       if (shadow)
 909          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 910       else
 911          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 912    } else {
 913       /* Note that G45 and older determines shadow compare and dispatch width
 914        * from message length for most messages.
 915        */
 916       if (c->dispatch_width == 16 && shadow)
 917          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 918       else
 919          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 920    }
 921
 922    brw_SAMPLE(p,
 923               dst_retyped,
 924               1,
 925               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 926               SURF_INDEX_TEXTURE(sampler),
 927               sampler,
 928               dst_flags & WRITEMASK_XYZW,
 929               msg_type,
 930               response_length,
 931               cur_mrf - 1,
 932               0,
 933               1,
 934               simd_mode);
 935 }
 936
 937
 938 void emit_txb(struct brw_wm_compile *c,
 939               struct brw_reg *dst,
 940               GLuint dst_flags,
 941               struct brw_reg *arg,
 942               struct brw_reg depth_payload,
 943               GLuint tex_idx,
 944               GLuint sampler)
 945 {
 946    struct brw_compile *p = &c->func;
 947    GLuint msgLength;
 948    GLuint msg_type;
 949    GLuint mrf_per_channel;
 950    GLuint response_length;
 951    struct brw_reg dst_retyped;
 952
 953    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
 954     * samples, so we'll use the 16-wide instruction, leave the second halves
 955     * undefined, and trust the execution mask to keep the undefined pixels
 956     * from mattering.
 957     */
 958    if (c->dispatch_width == 16 || !BRW_IS_IGDNG(p->brw)) {
 959       if (BRW_IS_IGDNG(p->brw))
 960          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 961       else
 962          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 963       mrf_per_channel = 2;
 964       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 965       response_length = 8;
 966    } else {
 967       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 968       mrf_per_channel = 1;
 969       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 970       response_length = 4;
 971    }
 972
 973    /* Shadow ignored for txb. */
 974    switch (tex_idx) {
 975    case TEXTURE_1D_INDEX:
 976       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 977       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
 978       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 979       break;
 980    case TEXTURE_2D_INDEX:
 981    case TEXTURE_RECT_INDEX:
 982       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 983       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 984       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 985       break;
 986    case TEXTURE_3D_INDEX:
 987    case TEXTURE_CUBE_INDEX:
 988       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 989       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 990       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
 991       break;
 992    default:
 993       /* unexpected target */
 994       abort();
 995    }
 996
 997    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
 998    msgLength = 2 + 4 * mrf_per_channel - 1;
 999
1000    brw_SAMPLE(p,
1001               dst_retyped,
1002               1,
1003               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1004               SURF_INDEX_TEXTURE(sampler),
1005               sampler,
1006               dst_flags & WRITEMASK_XYZW,
1007               msg_type,
1008               response_length,
1009               msgLength,
1010               0,
1011               1,
1012               BRW_SAMPLER_SIMD_MODE_SIMD16);
1013 }
1014
1015
1016 static void emit_lit(struct brw_wm_compile *c,
1017                      const struct brw_reg *dst,
1018                      GLuint mask,
1019                      const struct brw_reg *arg0)
1020 {
1021    struct brw_compile *p = &c->func;
1022
1023    assert((mask & WRITEMASK_XW) == 0);
1024
1025    if (mask & WRITEMASK_Y) {
1026       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1027       brw_MOV(p, dst[1], arg0[0]);
1028       brw_set_saturate(p, 0);
1029    }
1030
1031    if (mask & WRITEMASK_Z) {
1032       emit_math2(c, BRW_MATH_FUNCTION_POW,
1033                  &dst[2],
1034                  WRITEMASK_X | (mask & SATURATE),
1035                  &arg0[1],
1036                  &arg0[3]);
1037    }
1038
1039    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1040     * some of the POW calculations above, but 16-wide iff statements
1041     * seem to lock c1 hardware, so this is a nasty workaround:
1042     */
1043    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1044    {
1045       if (mask & WRITEMASK_Y)
1046          brw_MOV(p, dst[1], brw_imm_f(0));
1047
1048       if (mask & WRITEMASK_Z)
1049          brw_MOV(p, dst[2], brw_imm_f(0));
1050    }
1051    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1052 }
1053
1054
1055 /* Kill pixel - set execution mask to zero for those pixels which
1056  * fail.
1057  */
1058 static void emit_kil( struct brw_wm_compile *c,
1059                       struct brw_reg *arg0)
1060 {
1061    struct brw_compile *p = &c->func;
1062    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1063    GLuint i;
1064
1065    /* XXX - usually won't need 4 compares!
1066     */
1067    for (i = 0; i < 4; i++) {
1068       brw_push_insn_state(p);
1069       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1070       brw_set_predicate_control_flag_value(p, 0xff);
1071       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1072       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1073       brw_pop_insn_state(p);
1074    }
1075 }
1076
1077 /* KIL_NV kills the pixels that are currently executing, not based on a test
1078  * of the arguments.
1079  */
1080 static void emit_kil_nv( struct brw_wm_compile *c )
1081 {
1082    struct brw_compile *p = &c->func;
1083    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1084
1085    brw_push_insn_state(p);
1086    brw_set_mask_control(p, BRW_MASK_DISABLE);
1087    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1088    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1089    brw_pop_insn_state(p);
1090 }
1091
1092 static void fire_fb_write( struct brw_wm_compile *c,
1093                            GLuint base_reg,
1094                            GLuint nr,
1095                            GLuint target,
1096                            GLuint eot )
1097 {
1098    struct brw_compile *p = &c->func;
1099    struct brw_reg dst;
1100
1101    if (c->dispatch_width == 16)
1102       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1103    else
1104       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1105
1106    /* Pass through control information:
1107     */
1108 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1109    {
1110       brw_push_insn_state(p);
1111       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1112       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1113       brw_MOV(p,
1114                brw_message_reg(base_reg + 1),
1115                brw_vec8_grf(1, 0));
1116       brw_pop_insn_state(p);
1117    }
1118
1119    /* Send framebuffer write message: */
1120 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1121    brw_fb_WRITE(p,
1122                 dst,
1123                 base_reg,
1124                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1125                 target,
1126                 nr,
1127                 0,
1128                 eot);
1129 }
1130
1131
1132 static void emit_aa( struct brw_wm_compile *c,
1133                      struct brw_reg *arg1,
1134                      GLuint reg )
1135 {
1136    struct brw_compile *p = &c->func;
1137    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1138    GLuint off = c->key.aa_dest_stencil_reg % 2;
1139    struct brw_reg aa = offset(arg1[comp], off);
1140
1141    brw_push_insn_state(p);
1142    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1143    brw_MOV(p, brw_message_reg(reg), aa);
1144    brw_pop_insn_state(p);
1145 }
1146
1147
1148 /* Post-fragment-program processing.  Send the results to the
1149  * framebuffer.
1150  * \param arg0  the fragment color
1151  * \param arg1  the pass-through depth value
1152  * \param arg2  the shader-computed depth value
1153  */
1154 void emit_fb_write(struct brw_wm_compile *c,
1155                    struct brw_reg *arg0,
1156                    struct brw_reg *arg1,
1157                    struct brw_reg *arg2,
1158                    GLuint target,
1159                    GLuint eot)
1160 {
1161    struct brw_compile *p = &c->func;
1162    struct brw_context *brw = p->brw;
1163    GLuint nr = 2;
1164    GLuint channel;
1165
1166    /* Reserve a space for AA - may not be needed:
1167     */
1168    if (c->key.aa_dest_stencil_reg)
1169       nr += 1;
1170
1171    /* I don't really understand how this achieves the color interleave
1172     * (ie RGBARGBA) in the result:  [Do the saturation here]
1173     */
1174    brw_push_insn_state(p);
1175
1176    for (channel = 0; channel < 4; channel++) {
1177       if (c->dispatch_width == 16 && (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))) {
1178          /* By setting the high bit of the MRF register number, we indicate
1179           * that we want COMPR4 mode - instead of doing the usual destination
1180           * + 1 for the second half we get destination + 4.
1181           */
1182          brw_MOV(p,
1183                  brw_message_reg(nr + channel + (1 << 7)),
1184                  arg0[channel]);
1185       } else {
1186          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1187          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1188          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1189          brw_MOV(p,
1190                  brw_message_reg(nr + channel),
1191                  arg0[channel]);
1192
1193          if (c->dispatch_width == 16) {
1194             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1195             brw_MOV(p,
1196                     brw_message_reg(nr + channel + 4),
1197                     sechalf(arg0[channel]));
1198          }
1199       }
1200    }
1201    /* skip over the regs populated above:
1202     */
1203    nr += 8;
1204    brw_pop_insn_state(p);
1205
1206    if (c->key.source_depth_to_render_target)
1207    {
1208       if (c->key.computes_depth)
1209          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1210       else
1211          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1212
1213       nr += 2;
1214    }
1215
1216    if (c->key.dest_depth_reg)
1217    {
1218       GLuint comp = c->key.dest_depth_reg / 2;
1219       GLuint off = c->key.dest_depth_reg % 2;
1220
1221       if (off != 0) {
1222          brw_push_insn_state(p);
1223          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1224
1225          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1226          /* 2nd half? */
1227          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1228          brw_pop_insn_state(p);
1229       }
1230       else {
1231          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1232       }
1233       nr += 2;
1234    }
1235
1236    if (!c->key.runtime_check_aads_emit) {
1237       if (c->key.aa_dest_stencil_reg)
1238          emit_aa(c, arg1, 2);
1239
1240       fire_fb_write(c, 0, nr, target, eot);
1241    }
1242    else {
1243       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1244       struct brw_reg ip = brw_ip_reg();
1245       struct brw_instruction *jmp;
1246
1247       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1248       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1249       brw_AND(p,
1250               v1_null_ud,
1251               get_element_ud(brw_vec8_grf(1,0), 6),
1252               brw_imm_ud(1<<26));
1253
1254       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1255       {
1256          emit_aa(c, arg1, 2);
1257          fire_fb_write(c, 0, nr, target, eot);
1258          /* note - thread killed in subroutine */
1259       }
1260       brw_land_fwd_jump(p, jmp);
1261
1262       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1263        */
1264       fire_fb_write(c, 1, nr-1, target, eot);
1265    }
1266 }
1267
1268 /**
1269  * Move a GPR to scratch memory.
1270  */
1271 static void emit_spill( struct brw_wm_compile *c,
1272                         struct brw_reg reg,
1273                         GLuint slot )
1274 {
1275    struct brw_compile *p = &c->func;
1276
1277    /*
1278      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1279    */
1280    brw_MOV(p, brw_message_reg(2), reg);
1281
1282    /*
1283      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1284      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1285    */
1286    brw_dp_WRITE_16(p,
1287                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1288                    slot);
1289 }
1290
1291
1292 /**
1293  * Load a GPR from scratch memory.
1294  */
1295 static void emit_unspill( struct brw_wm_compile *c,
1296                           struct brw_reg reg,
1297                           GLuint slot )
1298 {
1299    struct brw_compile *p = &c->func;
1300
1301    /* Slot 0 is the undef value.
1302     */
1303    if (slot == 0) {
1304       brw_MOV(p, reg, brw_imm_f(0));
1305       return;
1306    }
1307
1308    /*
1309      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1310      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1311    */
1312
1313    brw_dp_READ_16(p,
1314                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1315                   slot);
1316 }
1317
1318
1319 /**
1320  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1321  * Args with unspill_reg != 0 will be loaded from scratch memory.
1322  */
1323 static void get_argument_regs( struct brw_wm_compile *c,
1324                                struct brw_wm_ref *arg[],
1325                                struct brw_reg *regs )
1326 {
1327    GLuint i;
1328
1329    for (i = 0; i < 4; i++) {
1330       if (arg[i]) {
1331          if (arg[i]->unspill_reg)
1332             emit_unspill(c,
1333                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1334                          arg[i]->value->spill_slot);
1335
1336          regs[i] = arg[i]->hw_reg;
1337       }
1338       else {
1339          regs[i] = brw_null_reg();
1340       }
1341    }
1342 }
1343
1344
1345 /**
1346  * For values that have a spill_slot!=0, write those regs to scratch memory.
1347  */
1348 static void spill_values( struct brw_wm_compile *c,
1349                           struct brw_wm_value *values,
1350                           GLuint nr )
1351 {
1352    GLuint i;
1353
1354    for (i = 0; i < nr; i++)
1355       if (values[i].spill_slot)
1356          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1357 }
1358
1359
1360 /* Emit the fragment program instructions here.
1361  */
1362 void brw_wm_emit( struct brw_wm_compile *c )
1363 {
1364    struct brw_compile *p = &c->func;
1365    GLuint insn;
1366
1367    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1368
1369    /* Check if any of the payload regs need to be spilled:
1370     */
1371    spill_values(c, c->payload.depth, 4);
1372    spill_values(c, c->creg, c->nr_creg);
1373    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1374
1375
1376    for (insn = 0; insn < c->nr_insns; insn++) {
1377
1378       struct brw_wm_instruction *inst = &c->instruction[insn];
1379       struct brw_reg args[3][4], dst[4];
1380       GLuint i, dst_flags;
1381
1382       /* Get argument regs:
1383        */
1384       for (i = 0; i < 3; i++)
1385          get_argument_regs(c, inst->src[i], args[i]);
1386
1387       /* Get dest regs:
1388        */
1389       for (i = 0; i < 4; i++)
1390          if (inst->dst[i])
1391             dst[i] = inst->dst[i]->hw_reg;
1392          else
1393             dst[i] = brw_null_reg();
1394
1395       /* Flags
1396        */
1397       dst_flags = inst->writemask;
1398       if (inst->saturate)
1399          dst_flags |= SATURATE;
1400
1401       switch (inst->opcode) {
1402          /* Generated instructions for calculating triangle interpolants:
1403           */
1404       case WM_PIXELXY:
1405          emit_pixel_xy(c, dst, dst_flags);
1406          break;
1407
1408       case WM_DELTAXY:
1409          emit_delta_xy(p, dst, dst_flags, args[0]);
1410          break;
1411
1412       case WM_WPOSXY:
1413          emit_wpos_xy(c, dst, dst_flags, args[0]);
1414          break;
1415
1416       case WM_PIXELW:
1417          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1418          break;
1419
1420       case WM_LINTERP:
1421          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1422          break;
1423
1424       case WM_PINTERP:
1425          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1426          break;
1427
1428       case WM_CINTERP:
1429          emit_cinterp(p, dst, dst_flags, args[0]);
1430          break;
1431
1432       case WM_FB_WRITE:
1433          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1434          break;
1435
1436       case WM_FRONTFACING:
1437          emit_frontfacing(p, dst, dst_flags);
1438          break;
1439
1440          /* Straightforward arithmetic:
1441           */
1442       case OPCODE_ADD:
1443          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1444          break;
1445
1446       case OPCODE_FRC:
1447          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1448          break;
1449
1450       case OPCODE_FLR:
1451          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1452          break;
1453
1454       case OPCODE_DDX:
1455          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1456          break;
1457
1458       case OPCODE_DDY:
1459          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1460          break;
1461
1462       case OPCODE_DP3:
1463          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1464          break;
1465
1466       case OPCODE_DP4:
1467          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1468          break;
1469
1470       case OPCODE_DPH:
1471          emit_dph(p, dst, dst_flags, args[0], args[1]);
1472          break;
1473
1474       case OPCODE_TRUNC:
1475          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1476          break;
1477
1478       case OPCODE_LRP:
1479          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1480          break;
1481
1482       case OPCODE_MAD:
1483          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1484          break;
1485
1486       case OPCODE_MOV:
1487       case OPCODE_SWZ:
1488          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1489          break;
1490
1491       case OPCODE_MUL:
1492          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1493          break;
1494
1495       case OPCODE_XPD:
1496          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1497          break;
1498
1499          /* Higher math functions:
1500           */
1501       case OPCODE_RCP:
1502          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1503          break;
1504
1505       case OPCODE_RSQ:
1506          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1507          break;
1508
1509       case OPCODE_SIN:
1510          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1511          break;
1512
1513       case OPCODE_COS:
1514          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1515          break;
1516
1517       case OPCODE_EX2:
1518          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1519          break;
1520
1521       case OPCODE_LG2:
1522          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1523          break;
1524
1525       case OPCODE_SCS:
1526          /* There is an scs math function, but it would need some
1527           * fixup for 16-element execution.
1528           */
1529          if (dst_flags & WRITEMASK_X)
1530             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1531          if (dst_flags & WRITEMASK_Y)
1532             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1533          break;
1534
1535       case OPCODE_POW:
1536          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1537          break;
1538
1539          /* Comparisons:
1540           */
1541       case OPCODE_CMP:
1542          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1543          break;
1544
1545       case OPCODE_MAX:
1546          emit_max(p, dst, dst_flags, args[0], args[1]);
1547          break;
1548
1549       case OPCODE_MIN:
1550          emit_min(p, dst, dst_flags, args[0], args[1]);
1551          break;
1552
1553       case OPCODE_SLT:
1554          emit_slt(p, dst, dst_flags, args[0], args[1]);
1555          break;
1556
1557       case OPCODE_SLE:
1558          emit_sle(p, dst, dst_flags, args[0], args[1]);
1559         break;
1560       case OPCODE_SGT:
1561          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1562         break;
1563       case OPCODE_SGE:
1564          emit_sge(p, dst, dst_flags, args[0], args[1]);
1565          break;
1566       case OPCODE_SEQ:
1567          emit_seq(p, dst, dst_flags, args[0], args[1]);
1568         break;
1569       case OPCODE_SNE:
1570          emit_sne(p, dst, dst_flags, args[0], args[1]);
1571         break;
1572
1573       case OPCODE_LIT:
1574          emit_lit(c, dst, dst_flags, args[0]);
1575          break;
1576
1577          /* Texturing operations:
1578           */
1579       case OPCODE_TEX:
1580          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1581                   inst->tex_idx, inst->tex_unit,
1582                   inst->tex_shadow);
1583          break;
1584
1585       case OPCODE_TXB:
1586          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1587                   inst->tex_idx, inst->tex_unit);
1588          break;
1589
1590       case OPCODE_KIL:
1591          emit_kil(c, args[0]);
1592          break;
1593
1594       case OPCODE_KIL_NV:
1595          emit_kil_nv(c);
1596          break;
1597
1598       default:
1599          _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
1600                       inst->opcode, inst->opcode < MAX_OPCODE ?
1601                                     _mesa_opcode_string(inst->opcode) :
1602                                     "unknown");
1603       }
1604
1605       for (i = 0; i < 4; i++)
1606         if (inst->dst[i] && inst->dst[i]->spill_slot)
1607            emit_spill(c,
1608                       inst->dst[i]->hw_reg,
1609                       inst->dst[i]->spill_slot);
1610    }
1611
1612    if (INTEL_DEBUG & DEBUG_WM) {
1613       int i;
1614
1615       _mesa_printf("wm-native:\n");
1616       for (i = 0; i < p->nr_insn; i++)
1617          brw_disasm(stderr, &p->store[i]);
1618       _mesa_printf("\n");
1619    }
1620 }