src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 /* Not quite sure how correct this is - need to understand horiz
  38  * vs. vertical strides a little better.
  39  */
  40 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  41 {
  42    if (reg.vstride)
  43       reg.nr++;
  44    return reg;
  45 }
  46
  47
  48 /* Payload R0:
  49  *
  50  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  51  *         corresponding to each of the 16 execution channels.
  52  * R0.1..8 -- ?
  53  * R1.0 -- triangle vertex 0.X
  54  * R1.1 -- triangle vertex 0.Y
  55  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  56  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  57  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  58  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  59  * R1.6 -- ?
  60  * R1.7 -- ?
  61  * R1.8 -- ?
  62  */
  63
  64 void emit_pixel_xy(struct brw_wm_compile *c,
  65                    const struct brw_reg *dst,
  66                    GLuint mask)
  67 {
  68    struct brw_compile *p = &c->func;
  69    struct brw_reg r1 = brw_vec1_grf(1, 0);
  70    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  71    struct brw_reg dst0_uw, dst1_uw;
  72
  73    brw_push_insn_state(p);
  74    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  75
  76    if (c->dispatch_width == 16) {
  77       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
  78       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
  79    } else {
  80       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
  81       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
  82    }
  83
  84    /* Calculate pixel centers by adding 1 or 0 to each of the
  85     * micro-tile coordinates passed in r1.
  86     */
  87    if (mask & WRITEMASK_X) {
  88       brw_ADD(p,
  89               dst0_uw,
  90               stride(suboffset(r1_uw, 4), 2, 4, 0),
  91               brw_imm_v(0x10101010));
  92    }
  93
  94    if (mask & WRITEMASK_Y) {
  95       brw_ADD(p,
  96               dst1_uw,
  97               stride(suboffset(r1_uw,5), 2, 4, 0),
  98               brw_imm_v(0x11001100));
  99    }
 100    brw_pop_insn_state(p);
 101 }
 102
 103
 104 void emit_delta_xy(struct brw_compile *p,
 105                    const struct brw_reg *dst,
 106                    GLuint mask,
 107                    const struct brw_reg *arg0)
 108 {
 109    struct brw_reg r1 = brw_vec1_grf(1, 0);
 110
 111    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 112     * centers.
 113     */
 114    if (mask & WRITEMASK_X) {
 115       brw_ADD(p,
 116               dst[0],
 117               retype(arg0[0], BRW_REGISTER_TYPE_UW),
 118               negate(r1));
 119    }
 120
 121    if (mask & WRITEMASK_Y) {
 122       brw_ADD(p,
 123               dst[1],
 124               retype(arg0[1], BRW_REGISTER_TYPE_UW),
 125               negate(suboffset(r1,1)));
 126
 127    }
 128 }
 129
 130 void emit_wpos_xy(struct brw_wm_compile *c,
 131                   const struct brw_reg *dst,
 132                   GLuint mask,
 133                   const struct brw_reg *arg0)
 134 {
 135    struct brw_compile *p = &c->func;
 136
 137    /* Calculate the pixel offset from window bottom left into destination
 138     * X and Y channels.
 139     */
 140    if (mask & WRITEMASK_X) {
 141       /* X' = X - origin */
 142       brw_ADD(p,
 143               dst[0],
 144               retype(arg0[0], BRW_REGISTER_TYPE_W),
 145               brw_imm_d(0 - c->key.origin_x));
 146    }
 147
 148    if (mask & WRITEMASK_Y) {
 149       /* Y' = height - (Y - origin_y) = height + origin_y - Y */
 150       brw_ADD(p,
 151               dst[1],
 152               negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 153               brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
 154    }
 155 }
 156
 157
 158 void emit_pixel_w(struct brw_wm_compile *c,
 159                   const struct brw_reg *dst,
 160                   GLuint mask,
 161                   const struct brw_reg *arg0,
 162                   const struct brw_reg *deltas)
 163 {
 164    struct brw_compile *p = &c->func;
 165
 166    /* Don't need this if all you are doing is interpolating color, for
 167     * instance.
 168     */
 169    if (mask & WRITEMASK_W) {
 170       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 171
 172       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 173        * result straight into a message reg.
 174        */
 175       brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 176       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 177
 178       /* Calc w */
 179       if (c->dispatch_width == 16) {
 180          brw_math_16(p, dst[3],
 181                      BRW_MATH_FUNCTION_INV,
 182                      BRW_MATH_SATURATE_NONE,
 183                      2, brw_null_reg(),
 184                      BRW_MATH_PRECISION_FULL);
 185       } else {
 186          brw_math(p, dst[3],
 187                   BRW_MATH_FUNCTION_INV,
 188                   BRW_MATH_SATURATE_NONE,
 189                   2, brw_null_reg(),
 190                   BRW_MATH_DATA_VECTOR,
 191                   BRW_MATH_PRECISION_FULL);
 192       }
 193    }
 194 }
 195
 196
 197 void emit_linterp(struct brw_compile *p,
 198                   const struct brw_reg *dst,
 199                   GLuint mask,
 200                   const struct brw_reg *arg0,
 201                   const struct brw_reg *deltas)
 202 {
 203    struct brw_reg interp[4];
 204    GLuint nr = arg0[0].nr;
 205    GLuint i;
 206
 207    interp[0] = brw_vec1_grf(nr, 0);
 208    interp[1] = brw_vec1_grf(nr, 4);
 209    interp[2] = brw_vec1_grf(nr+1, 0);
 210    interp[3] = brw_vec1_grf(nr+1, 4);
 211
 212    for (i = 0; i < 4; i++) {
 213       if (mask & (1<<i)) {
 214          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 215          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 216       }
 217    }
 218 }
 219
 220
 221 void emit_pinterp(struct brw_compile *p,
 222                   const struct brw_reg *dst,
 223                   GLuint mask,
 224                   const struct brw_reg *arg0,
 225                   const struct brw_reg *deltas,
 226                   const struct brw_reg *w)
 227 {
 228    struct brw_reg interp[4];
 229    GLuint nr = arg0[0].nr;
 230    GLuint i;
 231
 232    interp[0] = brw_vec1_grf(nr, 0);
 233    interp[1] = brw_vec1_grf(nr, 4);
 234    interp[2] = brw_vec1_grf(nr+1, 0);
 235    interp[3] = brw_vec1_grf(nr+1, 4);
 236
 237    for (i = 0; i < 4; i++) {
 238       if (mask & (1<<i)) {
 239          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 240          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 241       }
 242    }
 243    for (i = 0; i < 4; i++) {
 244       if (mask & (1<<i)) {
 245          brw_MUL(p, dst[i], dst[i], w[3]);
 246       }
 247    }
 248 }
 249
 250
 251 void emit_cinterp(struct brw_compile *p,
 252                   const struct brw_reg *dst,
 253                   GLuint mask,
 254                   const struct brw_reg *arg0)
 255 {
 256    struct brw_reg interp[4];
 257    GLuint nr = arg0[0].nr;
 258    GLuint i;
 259
 260    interp[0] = brw_vec1_grf(nr, 0);
 261    interp[1] = brw_vec1_grf(nr, 4);
 262    interp[2] = brw_vec1_grf(nr+1, 0);
 263    interp[3] = brw_vec1_grf(nr+1, 4);
 264
 265    for (i = 0; i < 4; i++) {
 266       if (mask & (1<<i)) {
 267          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 268       }
 269    }
 270 }
 271
 272 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 273 void emit_frontfacing(struct brw_compile *p,
 274                       const struct brw_reg *dst,
 275                       GLuint mask)
 276 {
 277    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 278    GLuint i;
 279
 280    if (!(mask & WRITEMASK_XYZW))
 281       return;
 282
 283    for (i = 0; i < 4; i++) {
 284       if (mask & (1<<i)) {
 285          brw_MOV(p, dst[i], brw_imm_f(0.0));
 286       }
 287    }
 288
 289    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 290     * us front face
 291     */
 292    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 293    for (i = 0; i < 4; i++) {
 294       if (mask & (1<<i)) {
 295          brw_MOV(p, dst[i], brw_imm_f(1.0));
 296       }
 297    }
 298    brw_set_predicate_control_flag_value(p, 0xff);
 299 }
 300
 301 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 302  * looking like:
 303  *
 304  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 305  *
 306  * and we're trying to produce:
 307  *
 308  *           DDX                     DDY
 309  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 310  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 311  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 312  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 313  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 314  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 315  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 316  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 317  *
 318  * and add another set of two more subspans if in 16-pixel dispatch mode.
 319  *
 320  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 321  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 322  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 323  * between each other.  We could probably do it like ddx and swizzle the right
 324  * order later, but bail for now and just produce
 325  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 326  */
 327 void emit_ddxy(struct brw_compile *p,
 328                const struct brw_reg *dst,
 329                GLuint mask,
 330                GLboolean is_ddx,
 331                const struct brw_reg *arg0)
 332 {
 333    int i;
 334    struct brw_reg src0, src1;
 335
 336    if (mask & SATURATE)
 337       brw_set_saturate(p, 1);
 338    for (i = 0; i < 4; i++ ) {
 339       if (mask & (1<<i)) {
 340          if (is_ddx) {
 341             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 342                            BRW_REGISTER_TYPE_F,
 343                            BRW_VERTICAL_STRIDE_2,
 344                            BRW_WIDTH_2,
 345                            BRW_HORIZONTAL_STRIDE_0,
 346                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 347             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 348                            BRW_REGISTER_TYPE_F,
 349                            BRW_VERTICAL_STRIDE_2,
 350                            BRW_WIDTH_2,
 351                            BRW_HORIZONTAL_STRIDE_0,
 352                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 353          } else {
 354             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 355                            BRW_REGISTER_TYPE_F,
 356                            BRW_VERTICAL_STRIDE_4,
 357                            BRW_WIDTH_4,
 358                            BRW_HORIZONTAL_STRIDE_0,
 359                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 360             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 361                            BRW_REGISTER_TYPE_F,
 362                            BRW_VERTICAL_STRIDE_4,
 363                            BRW_WIDTH_4,
 364                            BRW_HORIZONTAL_STRIDE_0,
 365                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 366          }
 367          brw_ADD(p, dst[i], src0, negate(src1));
 368       }
 369    }
 370    if (mask & SATURATE)
 371       brw_set_saturate(p, 0);
 372 }
 373
 374 void emit_alu1(struct brw_compile *p,
 375                struct brw_instruction *(*func)(struct brw_compile *,
 376                                                struct brw_reg,
 377                                                struct brw_reg),
 378                const struct brw_reg *dst,
 379                GLuint mask,
 380                const struct brw_reg *arg0)
 381 {
 382    GLuint i;
 383
 384    if (mask & SATURATE)
 385       brw_set_saturate(p, 1);
 386
 387    for (i = 0; i < 4; i++) {
 388       if (mask & (1<<i)) {
 389          func(p, dst[i], arg0[i]);
 390       }
 391    }
 392
 393    if (mask & SATURATE)
 394       brw_set_saturate(p, 0);
 395 }
 396
 397
 398 void emit_alu2(struct brw_compile *p,
 399                struct brw_instruction *(*func)(struct brw_compile *,
 400                                                struct brw_reg,
 401                                                struct brw_reg,
 402                                                struct brw_reg),
 403                const struct brw_reg *dst,
 404                GLuint mask,
 405                const struct brw_reg *arg0,
 406                const struct brw_reg *arg1)
 407 {
 408    GLuint i;
 409
 410    if (mask & SATURATE)
 411       brw_set_saturate(p, 1);
 412
 413    for (i = 0; i < 4; i++) {
 414       if (mask & (1<<i)) {
 415          func(p, dst[i], arg0[i], arg1[i]);
 416       }
 417    }
 418
 419    if (mask & SATURATE)
 420       brw_set_saturate(p, 0);
 421 }
 422
 423
 424 void emit_mad(struct brw_compile *p,
 425               const struct brw_reg *dst,
 426               GLuint mask,
 427               const struct brw_reg *arg0,
 428               const struct brw_reg *arg1,
 429               const struct brw_reg *arg2)
 430 {
 431    GLuint i;
 432
 433    for (i = 0; i < 4; i++) {
 434       if (mask & (1<<i)) {
 435          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 436
 437          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 438          brw_ADD(p, dst[i], dst[i], arg2[i]);
 439          brw_set_saturate(p, 0);
 440       }
 441    }
 442 }
 443
 444 void emit_lrp(struct brw_compile *p,
 445               const struct brw_reg *dst,
 446               GLuint mask,
 447               const struct brw_reg *arg0,
 448               const struct brw_reg *arg1,
 449               const struct brw_reg *arg2)
 450 {
 451    GLuint i;
 452
 453    /* Uses dst as a temporary:
 454     */
 455    for (i = 0; i < 4; i++) {
 456       if (mask & (1<<i)) {
 457          /* Can I use the LINE instruction for this?
 458           */
 459          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 460          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 461
 462          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 463          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 464          brw_set_saturate(p, 0);
 465       }
 466    }
 467 }
 468
 469 void emit_sop(struct brw_compile *p,
 470               const struct brw_reg *dst,
 471               GLuint mask,
 472               GLuint cond,
 473               const struct brw_reg *arg0,
 474               const struct brw_reg *arg1)
 475 {
 476    GLuint i;
 477
 478    for (i = 0; i < 4; i++) {
 479       if (mask & (1<<i)) {
 480          brw_push_insn_state(p);
 481          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 482          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 483          brw_MOV(p, dst[i], brw_imm_f(0));
 484          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 485          brw_MOV(p, dst[i], brw_imm_f(1.0));
 486          brw_pop_insn_state(p);
 487       }
 488    }
 489 }
 490
 491 static void emit_slt( struct brw_compile *p,
 492                       const struct brw_reg *dst,
 493                       GLuint mask,
 494                       const struct brw_reg *arg0,
 495                       const struct brw_reg *arg1 )
 496 {
 497    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 498 }
 499
 500 static void emit_sle( struct brw_compile *p,
 501                       const struct brw_reg *dst,
 502                       GLuint mask,
 503                       const struct brw_reg *arg0,
 504                       const struct brw_reg *arg1 )
 505 {
 506    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 507 }
 508
 509 static void emit_sgt( struct brw_compile *p,
 510                       const struct brw_reg *dst,
 511                       GLuint mask,
 512                       const struct brw_reg *arg0,
 513                       const struct brw_reg *arg1 )
 514 {
 515    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 516 }
 517
 518 static void emit_sge( struct brw_compile *p,
 519                       const struct brw_reg *dst,
 520                       GLuint mask,
 521                       const struct brw_reg *arg0,
 522                       const struct brw_reg *arg1 )
 523 {
 524    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 525 }
 526
 527 static void emit_seq( struct brw_compile *p,
 528                       const struct brw_reg *dst,
 529                       GLuint mask,
 530                       const struct brw_reg *arg0,
 531                       const struct brw_reg *arg1 )
 532 {
 533    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 534 }
 535
 536 static void emit_sne( struct brw_compile *p,
 537                       const struct brw_reg *dst,
 538                       GLuint mask,
 539                       const struct brw_reg *arg0,
 540                       const struct brw_reg *arg1 )
 541 {
 542    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 543 }
 544
 545 static void emit_cmp( struct brw_compile *p,
 546                       const struct brw_reg *dst,
 547                       GLuint mask,
 548                       const struct brw_reg *arg0,
 549                       const struct brw_reg *arg1,
 550                       const struct brw_reg *arg2 )
 551 {
 552    GLuint i;
 553
 554    for (i = 0; i < 4; i++) {
 555       if (mask & (1<<i)) {
 556          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 557          brw_MOV(p, dst[i], arg2[i]);
 558          brw_set_saturate(p, 0);
 559
 560          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 561
 562          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 563          brw_MOV(p, dst[i], arg1[i]);
 564          brw_set_saturate(p, 0);
 565          brw_set_predicate_control_flag_value(p, 0xff);
 566       }
 567    }
 568 }
 569
 570 void emit_max(struct brw_compile *p,
 571               const struct brw_reg *dst,
 572               GLuint mask,
 573               const struct brw_reg *arg0,
 574               const struct brw_reg *arg1)
 575 {
 576    GLuint i;
 577
 578    for (i = 0; i < 4; i++) {
 579       if (mask & (1<<i)) {
 580          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 581          brw_MOV(p, dst[i], arg0[i]);
 582          brw_set_saturate(p, 0);
 583
 584          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 585
 586          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 587          brw_MOV(p, dst[i], arg1[i]);
 588          brw_set_saturate(p, 0);
 589          brw_set_predicate_control_flag_value(p, 0xff);
 590       }
 591    }
 592 }
 593
 594 void emit_min(struct brw_compile *p,
 595               const struct brw_reg *dst,
 596               GLuint mask,
 597               const struct brw_reg *arg0,
 598               const struct brw_reg *arg1)
 599 {
 600    GLuint i;
 601
 602    for (i = 0; i < 4; i++) {
 603       if (mask & (1<<i)) {
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MOV(p, dst[i], arg1[i]);
 606          brw_set_saturate(p, 0);
 607
 608          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 609
 610          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 611          brw_MOV(p, dst[i], arg0[i]);
 612          brw_set_saturate(p, 0);
 613          brw_set_predicate_control_flag_value(p, 0xff);
 614       }
 615    }
 616 }
 617
 618
 619 void emit_dp3(struct brw_compile *p,
 620               const struct brw_reg *dst,
 621               GLuint mask,
 622               const struct brw_reg *arg0,
 623               const struct brw_reg *arg1)
 624 {
 625    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 626
 627    if (!(mask & WRITEMASK_XYZW))
 628       return; /* Do not emit dead code */
 629
 630    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 631
 632    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 633    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 634
 635    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 636    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 637    brw_set_saturate(p, 0);
 638 }
 639
 640
 641 void emit_dp4(struct brw_compile *p,
 642               const struct brw_reg *dst,
 643               GLuint mask,
 644               const struct brw_reg *arg0,
 645               const struct brw_reg *arg1)
 646 {
 647    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 648
 649    if (!(mask & WRITEMASK_XYZW))
 650       return; /* Do not emit dead code */
 651
 652    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 653
 654    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 655    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 656    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 657
 658    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 659    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 660    brw_set_saturate(p, 0);
 661 }
 662
 663
 664 void emit_dph(struct brw_compile *p,
 665               const struct brw_reg *dst,
 666               GLuint mask,
 667               const struct brw_reg *arg0,
 668               const struct brw_reg *arg1)
 669 {
 670    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 671
 672    if (!(mask & WRITEMASK_XYZW))
 673       return; /* Do not emit dead code */
 674
 675    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 676
 677    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 678    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 679    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 680
 681    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 682    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 683    brw_set_saturate(p, 0);
 684 }
 685
 686
 687 void emit_xpd(struct brw_compile *p,
 688               const struct brw_reg *dst,
 689               GLuint mask,
 690               const struct brw_reg *arg0,
 691               const struct brw_reg *arg1)
 692 {
 693    GLuint i;
 694
 695    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 696
 697    for (i = 0 ; i < 3; i++) {
 698       if (mask & (1<<i)) {
 699          GLuint i2 = (i+2)%3;
 700          GLuint i1 = (i+1)%3;
 701
 702          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 703
 704          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 705          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 706          brw_set_saturate(p, 0);
 707       }
 708    }
 709 }
 710
 711
 712 void emit_math1(struct brw_wm_compile *c,
 713                 GLuint function,
 714                 const struct brw_reg *dst,
 715                 GLuint mask,
 716                 const struct brw_reg *arg0)
 717 {
 718    struct brw_compile *p = &c->func;
 719    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 720    GLuint saturate = ((mask & SATURATE) ?
 721                       BRW_MATH_SATURATE_SATURATE :
 722                       BRW_MATH_SATURATE_NONE);
 723
 724    if (!(mask & WRITEMASK_XYZW))
 725       return; /* Do not emit dead code */
 726
 727    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 728
 729    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 730     * channels.
 731     */
 732    brw_MOV(p, brw_message_reg(2), arg0[0]);
 733
 734    /* Send two messages to perform all 16 operations:
 735     */
 736    brw_push_insn_state(p);
 737    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 738    brw_math(p,
 739             dst[dst_chan],
 740             function,
 741             saturate,
 742             2,
 743             brw_null_reg(),
 744             BRW_MATH_DATA_VECTOR,
 745             BRW_MATH_PRECISION_FULL);
 746
 747    if (c->dispatch_width == 16) {
 748       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 749       brw_math(p,
 750                offset(dst[dst_chan],1),
 751                function,
 752                saturate,
 753                3,
 754                brw_null_reg(),
 755                BRW_MATH_DATA_VECTOR,
 756                BRW_MATH_PRECISION_FULL);
 757    }
 758    brw_pop_insn_state(p);
 759 }
 760
 761
 762 void emit_math2(struct brw_wm_compile *c,
 763                 GLuint function,
 764                 const struct brw_reg *dst,
 765                 GLuint mask,
 766                 const struct brw_reg *arg0,
 767                 const struct brw_reg *arg1)
 768 {
 769    struct brw_compile *p = &c->func;
 770    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 771    GLuint saturate = ((mask & SATURATE) ?
 772                       BRW_MATH_SATURATE_SATURATE :
 773                       BRW_MATH_SATURATE_NONE);
 774
 775    if (!(mask & WRITEMASK_XYZW))
 776       return; /* Do not emit dead code */
 777
 778    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 779
 780    brw_push_insn_state(p);
 781
 782    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 783    brw_MOV(p, brw_message_reg(2), arg0[0]);
 784    if (c->dispatch_width == 16) {
 785       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 786       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 787    }
 788
 789    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 790    brw_MOV(p, brw_message_reg(3), arg1[0]);
 791    if (c->dispatch_width == 16) {
 792       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 793       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 794    }
 795
 796    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 797    brw_math(p,
 798             dst[dst_chan],
 799             function,
 800             saturate,
 801             2,
 802             brw_null_reg(),
 803             BRW_MATH_DATA_VECTOR,
 804             BRW_MATH_PRECISION_FULL);
 805
 806    /* Send two messages to perform all 16 operations:
 807     */
 808    if (c->dispatch_width == 16) {
 809       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 810       brw_math(p,
 811                offset(dst[dst_chan],1),
 812                function,
 813                saturate,
 814                4,
 815                brw_null_reg(),
 816                BRW_MATH_DATA_VECTOR,
 817                BRW_MATH_PRECISION_FULL);
 818    }
 819    brw_pop_insn_state(p);
 820 }
 821
 822
 823 void emit_tex(struct brw_wm_compile *c,
 824               struct brw_reg *dst,
 825               GLuint dst_flags,
 826               struct brw_reg *arg,
 827               struct brw_reg depth_payload,
 828               GLuint tex_idx,
 829               GLuint sampler,
 830               GLboolean shadow)
 831 {
 832    struct brw_compile *p = &c->func;
 833    struct intel_context *intel = &p->brw->intel;
 834    struct brw_reg dst_retyped;
 835    GLuint cur_mrf = 2, response_length;
 836    GLuint i, nr_texcoords;
 837    GLuint emit;
 838    GLuint msg_type;
 839    GLuint mrf_per_channel;
 840    GLuint simd_mode;
 841
 842    if (c->dispatch_width == 16) {
 843       mrf_per_channel = 2;
 844       response_length = 8;
 845       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 846       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 847    } else {
 848       mrf_per_channel = 1;
 849       response_length = 4;
 850       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 851       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 852    }
 853
 854    /* How many input regs are there?
 855     */
 856    switch (tex_idx) {
 857    case TEXTURE_1D_INDEX:
 858       emit = WRITEMASK_X;
 859       nr_texcoords = 1;
 860       break;
 861    case TEXTURE_2D_INDEX:
 862    case TEXTURE_RECT_INDEX:
 863       emit = WRITEMASK_XY;
 864       nr_texcoords = 2;
 865       break;
 866    case TEXTURE_3D_INDEX:
 867    case TEXTURE_CUBE_INDEX:
 868       emit = WRITEMASK_XYZ;
 869       nr_texcoords = 3;
 870       break;
 871    default:
 872       /* unexpected target */
 873       abort();
 874    }
 875
 876    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 877    if (!intel->is_ironlake && c->dispatch_width == 8)
 878       nr_texcoords = 3;
 879
 880    /* For shadow comparisons, we have to supply u,v,r. */
 881    if (shadow)
 882       nr_texcoords = 3;
 883
 884    /* Emit the texcoords. */
 885    for (i = 0; i < nr_texcoords; i++) {
 886       if (emit & (1<<i))
 887          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 888       else
 889          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 890       cur_mrf += mrf_per_channel;
 891    }
 892
 893    /* Fill in the shadow comparison reference value. */
 894    if (shadow) {
 895       if (intel->is_ironlake) {
 896          /* Fill in the cube map array index value. */
 897          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 898          cur_mrf += mrf_per_channel;
 899       } else if (c->dispatch_width == 8) {
 900          /* Fill in the LOD bias value. */
 901          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 902          cur_mrf += mrf_per_channel;
 903       }
 904       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 905       cur_mrf += mrf_per_channel;
 906    }
 907
 908    if (intel->is_ironlake) {
 909       if (shadow)
 910          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 911       else
 912          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 913    } else {
 914       /* Note that G45 and older determines shadow compare and dispatch width
 915        * from message length for most messages.
 916        */
 917       if (c->dispatch_width == 16 && shadow)
 918          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 919       else
 920          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 921    }
 922
 923    brw_SAMPLE(p,
 924               dst_retyped,
 925               1,
 926               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 927               SURF_INDEX_TEXTURE(sampler),
 928               sampler,
 929               dst_flags & WRITEMASK_XYZW,
 930               msg_type,
 931               response_length,
 932               cur_mrf - 1,
 933               0,
 934               1,
 935               simd_mode);
 936 }
 937
 938
 939 void emit_txb(struct brw_wm_compile *c,
 940               struct brw_reg *dst,
 941               GLuint dst_flags,
 942               struct brw_reg *arg,
 943               struct brw_reg depth_payload,
 944               GLuint tex_idx,
 945               GLuint sampler)
 946 {
 947    struct brw_compile *p = &c->func;
 948    struct intel_context *intel = &p->brw->intel;
 949    GLuint msgLength;
 950    GLuint msg_type;
 951    GLuint mrf_per_channel;
 952    GLuint response_length;
 953    struct brw_reg dst_retyped;
 954
 955    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
 956     * samples, so we'll use the 16-wide instruction, leave the second halves
 957     * undefined, and trust the execution mask to keep the undefined pixels
 958     * from mattering.
 959     */
 960    if (c->dispatch_width == 16 || !intel->is_ironlake) {
 961       if (intel->is_ironlake)
 962          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 963       else
 964          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 965       mrf_per_channel = 2;
 966       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 967       response_length = 8;
 968    } else {
 969       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 970       mrf_per_channel = 1;
 971       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 972       response_length = 4;
 973    }
 974
 975    /* Shadow ignored for txb. */
 976    switch (tex_idx) {
 977    case TEXTURE_1D_INDEX:
 978       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 979       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
 980       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 981       break;
 982    case TEXTURE_2D_INDEX:
 983    case TEXTURE_RECT_INDEX:
 984       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 985       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 986       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
 987       break;
 988    case TEXTURE_3D_INDEX:
 989    case TEXTURE_CUBE_INDEX:
 990       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
 991       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
 992       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
 993       break;
 994    default:
 995       /* unexpected target */
 996       abort();
 997    }
 998
 999    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1000    msgLength = 2 + 4 * mrf_per_channel - 1;
1001
1002    brw_SAMPLE(p,
1003               dst_retyped,
1004               1,
1005               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1006               SURF_INDEX_TEXTURE(sampler),
1007               sampler,
1008               dst_flags & WRITEMASK_XYZW,
1009               msg_type,
1010               response_length,
1011               msgLength,
1012               0,
1013               1,
1014               BRW_SAMPLER_SIMD_MODE_SIMD16);
1015 }
1016
1017
1018 static void emit_lit(struct brw_wm_compile *c,
1019                      const struct brw_reg *dst,
1020                      GLuint mask,
1021                      const struct brw_reg *arg0)
1022 {
1023    struct brw_compile *p = &c->func;
1024
1025    assert((mask & WRITEMASK_XW) == 0);
1026
1027    if (mask & WRITEMASK_Y) {
1028       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1029       brw_MOV(p, dst[1], arg0[0]);
1030       brw_set_saturate(p, 0);
1031    }
1032
1033    if (mask & WRITEMASK_Z) {
1034       emit_math2(c, BRW_MATH_FUNCTION_POW,
1035                  &dst[2],
1036                  WRITEMASK_X | (mask & SATURATE),
1037                  &arg0[1],
1038                  &arg0[3]);
1039    }
1040
1041    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1042     * some of the POW calculations above, but 16-wide iff statements
1043     * seem to lock c1 hardware, so this is a nasty workaround:
1044     */
1045    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1046    {
1047       if (mask & WRITEMASK_Y)
1048          brw_MOV(p, dst[1], brw_imm_f(0));
1049
1050       if (mask & WRITEMASK_Z)
1051          brw_MOV(p, dst[2], brw_imm_f(0));
1052    }
1053    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1054 }
1055
1056
1057 /* Kill pixel - set execution mask to zero for those pixels which
1058  * fail.
1059  */
1060 static void emit_kil( struct brw_wm_compile *c,
1061                       struct brw_reg *arg0)
1062 {
1063    struct brw_compile *p = &c->func;
1064    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1065    GLuint i;
1066
1067    /* XXX - usually won't need 4 compares!
1068     */
1069    for (i = 0; i < 4; i++) {
1070       brw_push_insn_state(p);
1071       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1072       brw_set_predicate_control_flag_value(p, 0xff);
1073       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1074       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1075       brw_pop_insn_state(p);
1076    }
1077 }
1078
1079 /* KIL_NV kills the pixels that are currently executing, not based on a test
1080  * of the arguments.
1081  */
1082 static void emit_kil_nv( struct brw_wm_compile *c )
1083 {
1084    struct brw_compile *p = &c->func;
1085    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1086
1087    brw_push_insn_state(p);
1088    brw_set_mask_control(p, BRW_MASK_DISABLE);
1089    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1090    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1091    brw_pop_insn_state(p);
1092 }
1093
1094 static void fire_fb_write( struct brw_wm_compile *c,
1095                            GLuint base_reg,
1096                            GLuint nr,
1097                            GLuint target,
1098                            GLuint eot )
1099 {
1100    struct brw_compile *p = &c->func;
1101    struct brw_reg dst;
1102
1103    if (c->dispatch_width == 16)
1104       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1105    else
1106       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1107
1108    /* Pass through control information:
1109     */
1110 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1111    {
1112       brw_push_insn_state(p);
1113       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1114       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1115       brw_MOV(p,
1116                brw_message_reg(base_reg + 1),
1117                brw_vec8_grf(1, 0));
1118       brw_pop_insn_state(p);
1119    }
1120
1121    /* Send framebuffer write message: */
1122 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1123    brw_fb_WRITE(p,
1124                 dst,
1125                 base_reg,
1126                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1127                 target,
1128                 nr,
1129                 0,
1130                 eot);
1131 }
1132
1133
1134 static void emit_aa( struct brw_wm_compile *c,
1135                      struct brw_reg *arg1,
1136                      GLuint reg )
1137 {
1138    struct brw_compile *p = &c->func;
1139    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1140    GLuint off = c->key.aa_dest_stencil_reg % 2;
1141    struct brw_reg aa = offset(arg1[comp], off);
1142
1143    brw_push_insn_state(p);
1144    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1145    brw_MOV(p, brw_message_reg(reg), aa);
1146    brw_pop_insn_state(p);
1147 }
1148
1149
1150 /* Post-fragment-program processing.  Send the results to the
1151  * framebuffer.
1152  * \param arg0  the fragment color
1153  * \param arg1  the pass-through depth value
1154  * \param arg2  the shader-computed depth value
1155  */
1156 void emit_fb_write(struct brw_wm_compile *c,
1157                    struct brw_reg *arg0,
1158                    struct brw_reg *arg1,
1159                    struct brw_reg *arg2,
1160                    GLuint target,
1161                    GLuint eot)
1162 {
1163    struct brw_compile *p = &c->func;
1164    struct brw_context *brw = p->brw;
1165    GLuint nr = 2;
1166    GLuint channel;
1167
1168    /* Reserve a space for AA - may not be needed:
1169     */
1170    if (c->key.aa_dest_stencil_reg)
1171       nr += 1;
1172
1173    /* I don't really understand how this achieves the color interleave
1174     * (ie RGBARGBA) in the result:  [Do the saturation here]
1175     */
1176    brw_push_insn_state(p);
1177
1178    for (channel = 0; channel < 4; channel++) {
1179       if (c->dispatch_width == 16 && brw->has_compr4) {
1180          /* By setting the high bit of the MRF register number, we indicate
1181           * that we want COMPR4 mode - instead of doing the usual destination
1182           * + 1 for the second half we get destination + 4.
1183           */
1184          brw_MOV(p,
1185                  brw_message_reg(nr + channel + (1 << 7)),
1186                  arg0[channel]);
1187       } else {
1188          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1189          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1190          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1191          brw_MOV(p,
1192                  brw_message_reg(nr + channel),
1193                  arg0[channel]);
1194
1195          if (c->dispatch_width == 16) {
1196             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1197             brw_MOV(p,
1198                     brw_message_reg(nr + channel + 4),
1199                     sechalf(arg0[channel]));
1200          }
1201       }
1202    }
1203    /* skip over the regs populated above:
1204     */
1205    nr += 8;
1206    brw_pop_insn_state(p);
1207
1208    if (c->key.source_depth_to_render_target)
1209    {
1210       if (c->key.computes_depth)
1211          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1212       else
1213          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1214
1215       nr += 2;
1216    }
1217
1218    if (c->key.dest_depth_reg)
1219    {
1220       GLuint comp = c->key.dest_depth_reg / 2;
1221       GLuint off = c->key.dest_depth_reg % 2;
1222
1223       if (off != 0) {
1224          brw_push_insn_state(p);
1225          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1226
1227          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1228          /* 2nd half? */
1229          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1230          brw_pop_insn_state(p);
1231       }
1232       else {
1233          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1234       }
1235       nr += 2;
1236    }
1237
1238    if (!c->key.runtime_check_aads_emit) {
1239       if (c->key.aa_dest_stencil_reg)
1240          emit_aa(c, arg1, 2);
1241
1242       fire_fb_write(c, 0, nr, target, eot);
1243    }
1244    else {
1245       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1246       struct brw_reg ip = brw_ip_reg();
1247       struct brw_instruction *jmp;
1248
1249       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1250       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1251       brw_AND(p,
1252               v1_null_ud,
1253               get_element_ud(brw_vec8_grf(1,0), 6),
1254               brw_imm_ud(1<<26));
1255
1256       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1257       {
1258          emit_aa(c, arg1, 2);
1259          fire_fb_write(c, 0, nr, target, eot);
1260          /* note - thread killed in subroutine */
1261       }
1262       brw_land_fwd_jump(p, jmp);
1263
1264       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1265        */
1266       fire_fb_write(c, 1, nr-1, target, eot);
1267    }
1268 }
1269
1270 /**
1271  * Move a GPR to scratch memory.
1272  */
1273 static void emit_spill( struct brw_wm_compile *c,
1274                         struct brw_reg reg,
1275                         GLuint slot )
1276 {
1277    struct brw_compile *p = &c->func;
1278
1279    /*
1280      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1281    */
1282    brw_MOV(p, brw_message_reg(2), reg);
1283
1284    /*
1285      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1286      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1287    */
1288    brw_dp_WRITE_16(p,
1289                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1290                    slot);
1291 }
1292
1293
1294 /**
1295  * Load a GPR from scratch memory.
1296  */
1297 static void emit_unspill( struct brw_wm_compile *c,
1298                           struct brw_reg reg,
1299                           GLuint slot )
1300 {
1301    struct brw_compile *p = &c->func;
1302
1303    /* Slot 0 is the undef value.
1304     */
1305    if (slot == 0) {
1306       brw_MOV(p, reg, brw_imm_f(0));
1307       return;
1308    }
1309
1310    /*
1311      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1312      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1313    */
1314
1315    brw_dp_READ_16(p,
1316                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1317                   slot);
1318 }
1319
1320
1321 /**
1322  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1323  * Args with unspill_reg != 0 will be loaded from scratch memory.
1324  */
1325 static void get_argument_regs( struct brw_wm_compile *c,
1326                                struct brw_wm_ref *arg[],
1327                                struct brw_reg *regs )
1328 {
1329    GLuint i;
1330
1331    for (i = 0; i < 4; i++) {
1332       if (arg[i]) {
1333          if (arg[i]->unspill_reg)
1334             emit_unspill(c,
1335                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1336                          arg[i]->value->spill_slot);
1337
1338          regs[i] = arg[i]->hw_reg;
1339       }
1340       else {
1341          regs[i] = brw_null_reg();
1342       }
1343    }
1344 }
1345
1346
1347 /**
1348  * For values that have a spill_slot!=0, write those regs to scratch memory.
1349  */
1350 static void spill_values( struct brw_wm_compile *c,
1351                           struct brw_wm_value *values,
1352                           GLuint nr )
1353 {
1354    GLuint i;
1355
1356    for (i = 0; i < nr; i++)
1357       if (values[i].spill_slot)
1358          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1359 }
1360
1361
1362 /* Emit the fragment program instructions here.
1363  */
1364 void brw_wm_emit( struct brw_wm_compile *c )
1365 {
1366    struct brw_compile *p = &c->func;
1367    GLuint insn;
1368
1369    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1370
1371    /* Check if any of the payload regs need to be spilled:
1372     */
1373    spill_values(c, c->payload.depth, 4);
1374    spill_values(c, c->creg, c->nr_creg);
1375    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1376
1377
1378    for (insn = 0; insn < c->nr_insns; insn++) {
1379
1380       struct brw_wm_instruction *inst = &c->instruction[insn];
1381       struct brw_reg args[3][4], dst[4];
1382       GLuint i, dst_flags;
1383
1384       /* Get argument regs:
1385        */
1386       for (i = 0; i < 3; i++)
1387          get_argument_regs(c, inst->src[i], args[i]);
1388
1389       /* Get dest regs:
1390        */
1391       for (i = 0; i < 4; i++)
1392          if (inst->dst[i])
1393             dst[i] = inst->dst[i]->hw_reg;
1394          else
1395             dst[i] = brw_null_reg();
1396
1397       /* Flags
1398        */
1399       dst_flags = inst->writemask;
1400       if (inst->saturate)
1401          dst_flags |= SATURATE;
1402
1403       switch (inst->opcode) {
1404          /* Generated instructions for calculating triangle interpolants:
1405           */
1406       case WM_PIXELXY:
1407          emit_pixel_xy(c, dst, dst_flags);
1408          break;
1409
1410       case WM_DELTAXY:
1411          emit_delta_xy(p, dst, dst_flags, args[0]);
1412          break;
1413
1414       case WM_WPOSXY:
1415          emit_wpos_xy(c, dst, dst_flags, args[0]);
1416          break;
1417
1418       case WM_PIXELW:
1419          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1420          break;
1421
1422       case WM_LINTERP:
1423          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1424          break;
1425
1426       case WM_PINTERP:
1427          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1428          break;
1429
1430       case WM_CINTERP:
1431          emit_cinterp(p, dst, dst_flags, args[0]);
1432          break;
1433
1434       case WM_FB_WRITE:
1435          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1436          break;
1437
1438       case WM_FRONTFACING:
1439          emit_frontfacing(p, dst, dst_flags);
1440          break;
1441
1442          /* Straightforward arithmetic:
1443           */
1444       case OPCODE_ADD:
1445          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1446          break;
1447
1448       case OPCODE_FRC:
1449          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1450          break;
1451
1452       case OPCODE_FLR:
1453          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1454          break;
1455
1456       case OPCODE_DDX:
1457          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1458          break;
1459
1460       case OPCODE_DDY:
1461          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1462          break;
1463
1464       case OPCODE_DP3:
1465          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1466          break;
1467
1468       case OPCODE_DP4:
1469          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1470          break;
1471
1472       case OPCODE_DPH:
1473          emit_dph(p, dst, dst_flags, args[0], args[1]);
1474          break;
1475
1476       case OPCODE_TRUNC:
1477          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1478          break;
1479
1480       case OPCODE_LRP:
1481          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1482          break;
1483
1484       case OPCODE_MAD:
1485          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1486          break;
1487
1488       case OPCODE_MOV:
1489       case OPCODE_SWZ:
1490          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1491          break;
1492
1493       case OPCODE_MUL:
1494          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1495          break;
1496
1497       case OPCODE_XPD:
1498          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1499          break;
1500
1501          /* Higher math functions:
1502           */
1503       case OPCODE_RCP:
1504          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1505          break;
1506
1507       case OPCODE_RSQ:
1508          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1509          break;
1510
1511       case OPCODE_SIN:
1512          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1513          break;
1514
1515       case OPCODE_COS:
1516          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1517          break;
1518
1519       case OPCODE_EX2:
1520          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1521          break;
1522
1523       case OPCODE_LG2:
1524          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1525          break;
1526
1527       case OPCODE_SCS:
1528          /* There is an scs math function, but it would need some
1529           * fixup for 16-element execution.
1530           */
1531          if (dst_flags & WRITEMASK_X)
1532             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1533          if (dst_flags & WRITEMASK_Y)
1534             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1535          break;
1536
1537       case OPCODE_POW:
1538          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1539          break;
1540
1541          /* Comparisons:
1542           */
1543       case OPCODE_CMP:
1544          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1545          break;
1546
1547       case OPCODE_MAX:
1548          emit_max(p, dst, dst_flags, args[0], args[1]);
1549          break;
1550
1551       case OPCODE_MIN:
1552          emit_min(p, dst, dst_flags, args[0], args[1]);
1553          break;
1554
1555       case OPCODE_SLT:
1556          emit_slt(p, dst, dst_flags, args[0], args[1]);
1557          break;
1558
1559       case OPCODE_SLE:
1560          emit_sle(p, dst, dst_flags, args[0], args[1]);
1561         break;
1562       case OPCODE_SGT:
1563          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1564         break;
1565       case OPCODE_SGE:
1566          emit_sge(p, dst, dst_flags, args[0], args[1]);
1567          break;
1568       case OPCODE_SEQ:
1569          emit_seq(p, dst, dst_flags, args[0], args[1]);
1570         break;
1571       case OPCODE_SNE:
1572          emit_sne(p, dst, dst_flags, args[0], args[1]);
1573         break;
1574
1575       case OPCODE_LIT:
1576          emit_lit(c, dst, dst_flags, args[0]);
1577          break;
1578
1579          /* Texturing operations:
1580           */
1581       case OPCODE_TEX:
1582          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1583                   inst->tex_idx, inst->tex_unit,
1584                   inst->tex_shadow);
1585          break;
1586
1587       case OPCODE_TXB:
1588          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1589                   inst->tex_idx, inst->tex_unit);
1590          break;
1591
1592       case OPCODE_KIL:
1593          emit_kil(c, args[0]);
1594          break;
1595
1596       case OPCODE_KIL_NV:
1597          emit_kil_nv(c);
1598          break;
1599
1600       default:
1601          _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
1602                       inst->opcode, inst->opcode < MAX_OPCODE ?
1603                                     _mesa_opcode_string(inst->opcode) :
1604                                     "unknown");
1605       }
1606
1607       for (i = 0; i < 4; i++)
1608         if (inst->dst[i] && inst->dst[i]->spill_slot)
1609            emit_spill(c,
1610                       inst->dst[i]->hw_reg,
1611                       inst->dst[i]->spill_slot);
1612    }
1613
1614    if (INTEL_DEBUG & DEBUG_WM) {
1615       int i;
1616
1617       _mesa_printf("wm-native:\n");
1618       for (i = 0; i < p->nr_insn; i++)
1619          brw_disasm(stderr, &p->store[i]);
1620       _mesa_printf("\n");
1621    }
1622 }