src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 /* Not quite sure how correct this is - need to understand horiz
  38  * vs. vertical strides a little better.
  39  */
  40 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  41 {
  42    if (reg.vstride)
  43       reg.nr++;
  44    return reg;
  45 }
  46
  47
  48 /* Payload R0:
  49  *
  50  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  51  *         corresponding to each of the 16 execution channels.
  52  * R0.1..8 -- ?
  53  * R1.0 -- triangle vertex 0.X
  54  * R1.1 -- triangle vertex 0.Y
  55  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  56  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  57  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  58  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  59  * R1.6 -- ?
  60  * R1.7 -- ?
  61  * R1.8 -- ?
  62  */
  63
  64 void emit_pixel_xy(struct brw_wm_compile *c,
  65                    const struct brw_reg *dst,
  66                    GLuint mask)
  67 {
  68    struct brw_compile *p = &c->func;
  69    struct brw_reg r1 = brw_vec1_grf(1, 0);
  70    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  71    struct brw_reg dst0_uw, dst1_uw;
  72
  73    brw_push_insn_state(p);
  74    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  75
  76    if (c->dispatch_width == 16) {
  77       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
  78       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
  79    } else {
  80       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
  81       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
  82    }
  83
  84    /* Calculate pixel centers by adding 1 or 0 to each of the
  85     * micro-tile coordinates passed in r1.
  86     */
  87    if (mask & WRITEMASK_X) {
  88       brw_ADD(p,
  89               dst0_uw,
  90               stride(suboffset(r1_uw, 4), 2, 4, 0),
  91               brw_imm_v(0x10101010));
  92    }
  93
  94    if (mask & WRITEMASK_Y) {
  95       brw_ADD(p,
  96               dst1_uw,
  97               stride(suboffset(r1_uw,5), 2, 4, 0),
  98               brw_imm_v(0x11001100));
  99    }
 100    brw_pop_insn_state(p);
 101 }
 102
 103
 104 void emit_delta_xy(struct brw_compile *p,
 105                    const struct brw_reg *dst,
 106                    GLuint mask,
 107                    const struct brw_reg *arg0)
 108 {
 109    struct brw_reg r1 = brw_vec1_grf(1, 0);
 110
 111    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 112     * centers.
 113     */
 114    if (mask & WRITEMASK_X) {
 115       brw_ADD(p,
 116               dst[0],
 117               retype(arg0[0], BRW_REGISTER_TYPE_UW),
 118               negate(r1));
 119    }
 120
 121    if (mask & WRITEMASK_Y) {
 122       brw_ADD(p,
 123               dst[1],
 124               retype(arg0[1], BRW_REGISTER_TYPE_UW),
 125               negate(suboffset(r1,1)));
 126
 127    }
 128 }
 129
 130 void emit_wpos_xy(struct brw_wm_compile *c,
 131                   const struct brw_reg *dst,
 132                   GLuint mask,
 133                   const struct brw_reg *arg0)
 134 {
 135    struct brw_compile *p = &c->func;
 136
 137    /* Calculate the pixel offset from window bottom left into destination
 138     * X and Y channels.
 139     */
 140    if (mask & WRITEMASK_X) {
 141       if (c->fp->program.PixelCenterInteger) {
 142          /* X' = X */
 143          brw_MOV(p,
 144                  dst[0],
 145                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 146       } else {
 147          /* X' = X + 0.5 */
 148          brw_ADD(p,
 149                  dst[0],
 150                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 151                  brw_imm_f(0.5));
 152       }
 153    }
 154
 155    if (mask & WRITEMASK_Y) {
 156       if (c->fp->program.OriginUpperLeft) {
 157          if (c->fp->program.PixelCenterInteger) {
 158             /* Y' = Y */
 159             brw_MOV(p,
 160                     dst[1],
 161                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 162          } else {
 163             /* Y' = Y + 0.5 */
 164             brw_ADD(p,
 165                     dst[1],
 166                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 167                     brw_imm_f(0.5));
 168          }
 169       } else {
 170          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 171
 172          /* Y' = (height - 1) - Y + center */
 173          brw_ADD(p,
 174                  dst[1],
 175                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 176                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 177       }
 178    }
 179 }
 180
 181
 182 void emit_pixel_w(struct brw_wm_compile *c,
 183                   const struct brw_reg *dst,
 184                   GLuint mask,
 185                   const struct brw_reg *arg0,
 186                   const struct brw_reg *deltas)
 187 {
 188    struct brw_compile *p = &c->func;
 189
 190    /* Don't need this if all you are doing is interpolating color, for
 191     * instance.
 192     */
 193    if (mask & WRITEMASK_W) {
 194       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 195
 196       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 197        * result straight into a message reg.
 198        */
 199       brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 200       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 201
 202       /* Calc w */
 203       if (c->dispatch_width == 16) {
 204          brw_math_16(p, dst[3],
 205                      BRW_MATH_FUNCTION_INV,
 206                      BRW_MATH_SATURATE_NONE,
 207                      2, brw_null_reg(),
 208                      BRW_MATH_PRECISION_FULL);
 209       } else {
 210          brw_math(p, dst[3],
 211                   BRW_MATH_FUNCTION_INV,
 212                   BRW_MATH_SATURATE_NONE,
 213                   2, brw_null_reg(),
 214                   BRW_MATH_DATA_VECTOR,
 215                   BRW_MATH_PRECISION_FULL);
 216       }
 217    }
 218 }
 219
 220
 221 void emit_linterp(struct brw_compile *p,
 222                   const struct brw_reg *dst,
 223                   GLuint mask,
 224                   const struct brw_reg *arg0,
 225                   const struct brw_reg *deltas)
 226 {
 227    struct brw_reg interp[4];
 228    GLuint nr = arg0[0].nr;
 229    GLuint i;
 230
 231    interp[0] = brw_vec1_grf(nr, 0);
 232    interp[1] = brw_vec1_grf(nr, 4);
 233    interp[2] = brw_vec1_grf(nr+1, 0);
 234    interp[3] = brw_vec1_grf(nr+1, 4);
 235
 236    for (i = 0; i < 4; i++) {
 237       if (mask & (1<<i)) {
 238          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 239          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 240       }
 241    }
 242 }
 243
 244
 245 void emit_pinterp(struct brw_compile *p,
 246                   const struct brw_reg *dst,
 247                   GLuint mask,
 248                   const struct brw_reg *arg0,
 249                   const struct brw_reg *deltas,
 250                   const struct brw_reg *w)
 251 {
 252    struct brw_reg interp[4];
 253    GLuint nr = arg0[0].nr;
 254    GLuint i;
 255
 256    interp[0] = brw_vec1_grf(nr, 0);
 257    interp[1] = brw_vec1_grf(nr, 4);
 258    interp[2] = brw_vec1_grf(nr+1, 0);
 259    interp[3] = brw_vec1_grf(nr+1, 4);
 260
 261    for (i = 0; i < 4; i++) {
 262       if (mask & (1<<i)) {
 263          brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 264          brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 265       }
 266    }
 267    for (i = 0; i < 4; i++) {
 268       if (mask & (1<<i)) {
 269          brw_MUL(p, dst[i], dst[i], w[3]);
 270       }
 271    }
 272 }
 273
 274
 275 void emit_cinterp(struct brw_compile *p,
 276                   const struct brw_reg *dst,
 277                   GLuint mask,
 278                   const struct brw_reg *arg0)
 279 {
 280    struct brw_reg interp[4];
 281    GLuint nr = arg0[0].nr;
 282    GLuint i;
 283
 284    interp[0] = brw_vec1_grf(nr, 0);
 285    interp[1] = brw_vec1_grf(nr, 4);
 286    interp[2] = brw_vec1_grf(nr+1, 0);
 287    interp[3] = brw_vec1_grf(nr+1, 4);
 288
 289    for (i = 0; i < 4; i++) {
 290       if (mask & (1<<i)) {
 291          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 292       }
 293    }
 294 }
 295
 296 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 297 void emit_frontfacing(struct brw_compile *p,
 298                       const struct brw_reg *dst,
 299                       GLuint mask)
 300 {
 301    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 302    GLuint i;
 303
 304    if (!(mask & WRITEMASK_XYZW))
 305       return;
 306
 307    for (i = 0; i < 4; i++) {
 308       if (mask & (1<<i)) {
 309          brw_MOV(p, dst[i], brw_imm_f(0.0));
 310       }
 311    }
 312
 313    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 314     * us front face
 315     */
 316    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 317    for (i = 0; i < 4; i++) {
 318       if (mask & (1<<i)) {
 319          brw_MOV(p, dst[i], brw_imm_f(1.0));
 320       }
 321    }
 322    brw_set_predicate_control_flag_value(p, 0xff);
 323 }
 324
 325 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 326  * looking like:
 327  *
 328  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 329  *
 330  * and we're trying to produce:
 331  *
 332  *           DDX                     DDY
 333  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 334  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 335  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 336  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 337  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 338  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 339  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 340  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 341  *
 342  * and add another set of two more subspans if in 16-pixel dispatch mode.
 343  *
 344  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 345  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 346  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 347  * between each other.  We could probably do it like ddx and swizzle the right
 348  * order later, but bail for now and just produce
 349  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 350  */
 351 void emit_ddxy(struct brw_compile *p,
 352                const struct brw_reg *dst,
 353                GLuint mask,
 354                GLboolean is_ddx,
 355                const struct brw_reg *arg0)
 356 {
 357    int i;
 358    struct brw_reg src0, src1;
 359
 360    if (mask & SATURATE)
 361       brw_set_saturate(p, 1);
 362    for (i = 0; i < 4; i++ ) {
 363       if (mask & (1<<i)) {
 364          if (is_ddx) {
 365             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 366                            BRW_REGISTER_TYPE_F,
 367                            BRW_VERTICAL_STRIDE_2,
 368                            BRW_WIDTH_2,
 369                            BRW_HORIZONTAL_STRIDE_0,
 370                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 371             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 372                            BRW_REGISTER_TYPE_F,
 373                            BRW_VERTICAL_STRIDE_2,
 374                            BRW_WIDTH_2,
 375                            BRW_HORIZONTAL_STRIDE_0,
 376                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 377          } else {
 378             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 379                            BRW_REGISTER_TYPE_F,
 380                            BRW_VERTICAL_STRIDE_4,
 381                            BRW_WIDTH_4,
 382                            BRW_HORIZONTAL_STRIDE_0,
 383                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 384             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 385                            BRW_REGISTER_TYPE_F,
 386                            BRW_VERTICAL_STRIDE_4,
 387                            BRW_WIDTH_4,
 388                            BRW_HORIZONTAL_STRIDE_0,
 389                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 390          }
 391          brw_ADD(p, dst[i], src0, negate(src1));
 392       }
 393    }
 394    if (mask & SATURATE)
 395       brw_set_saturate(p, 0);
 396 }
 397
 398 void emit_alu1(struct brw_compile *p,
 399                struct brw_instruction *(*func)(struct brw_compile *,
 400                                                struct brw_reg,
 401                                                struct brw_reg),
 402                const struct brw_reg *dst,
 403                GLuint mask,
 404                const struct brw_reg *arg0)
 405 {
 406    GLuint i;
 407
 408    if (mask & SATURATE)
 409       brw_set_saturate(p, 1);
 410
 411    for (i = 0; i < 4; i++) {
 412       if (mask & (1<<i)) {
 413          func(p, dst[i], arg0[i]);
 414       }
 415    }
 416
 417    if (mask & SATURATE)
 418       brw_set_saturate(p, 0);
 419 }
 420
 421
 422 void emit_alu2(struct brw_compile *p,
 423                struct brw_instruction *(*func)(struct brw_compile *,
 424                                                struct brw_reg,
 425                                                struct brw_reg,
 426                                                struct brw_reg),
 427                const struct brw_reg *dst,
 428                GLuint mask,
 429                const struct brw_reg *arg0,
 430                const struct brw_reg *arg1)
 431 {
 432    GLuint i;
 433
 434    if (mask & SATURATE)
 435       brw_set_saturate(p, 1);
 436
 437    for (i = 0; i < 4; i++) {
 438       if (mask & (1<<i)) {
 439          func(p, dst[i], arg0[i], arg1[i]);
 440       }
 441    }
 442
 443    if (mask & SATURATE)
 444       brw_set_saturate(p, 0);
 445 }
 446
 447
 448 void emit_mad(struct brw_compile *p,
 449               const struct brw_reg *dst,
 450               GLuint mask,
 451               const struct brw_reg *arg0,
 452               const struct brw_reg *arg1,
 453               const struct brw_reg *arg2)
 454 {
 455    GLuint i;
 456
 457    for (i = 0; i < 4; i++) {
 458       if (mask & (1<<i)) {
 459          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 460
 461          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 462          brw_ADD(p, dst[i], dst[i], arg2[i]);
 463          brw_set_saturate(p, 0);
 464       }
 465    }
 466 }
 467
 468 void emit_lrp(struct brw_compile *p,
 469               const struct brw_reg *dst,
 470               GLuint mask,
 471               const struct brw_reg *arg0,
 472               const struct brw_reg *arg1,
 473               const struct brw_reg *arg2)
 474 {
 475    GLuint i;
 476
 477    /* Uses dst as a temporary:
 478     */
 479    for (i = 0; i < 4; i++) {
 480       if (mask & (1<<i)) {
 481          /* Can I use the LINE instruction for this?
 482           */
 483          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 484          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 485
 486          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 487          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 488          brw_set_saturate(p, 0);
 489       }
 490    }
 491 }
 492
 493 void emit_sop(struct brw_compile *p,
 494               const struct brw_reg *dst,
 495               GLuint mask,
 496               GLuint cond,
 497               const struct brw_reg *arg0,
 498               const struct brw_reg *arg1)
 499 {
 500    GLuint i;
 501
 502    for (i = 0; i < 4; i++) {
 503       if (mask & (1<<i)) {
 504          brw_push_insn_state(p);
 505          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 506          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 507          brw_MOV(p, dst[i], brw_imm_f(0));
 508          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 509          brw_MOV(p, dst[i], brw_imm_f(1.0));
 510          brw_pop_insn_state(p);
 511       }
 512    }
 513 }
 514
 515 static void emit_slt( struct brw_compile *p,
 516                       const struct brw_reg *dst,
 517                       GLuint mask,
 518                       const struct brw_reg *arg0,
 519                       const struct brw_reg *arg1 )
 520 {
 521    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 522 }
 523
 524 static void emit_sle( struct brw_compile *p,
 525                       const struct brw_reg *dst,
 526                       GLuint mask,
 527                       const struct brw_reg *arg0,
 528                       const struct brw_reg *arg1 )
 529 {
 530    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 531 }
 532
 533 static void emit_sgt( struct brw_compile *p,
 534                       const struct brw_reg *dst,
 535                       GLuint mask,
 536                       const struct brw_reg *arg0,
 537                       const struct brw_reg *arg1 )
 538 {
 539    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 540 }
 541
 542 static void emit_sge( struct brw_compile *p,
 543                       const struct brw_reg *dst,
 544                       GLuint mask,
 545                       const struct brw_reg *arg0,
 546                       const struct brw_reg *arg1 )
 547 {
 548    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 549 }
 550
 551 static void emit_seq( struct brw_compile *p,
 552                       const struct brw_reg *dst,
 553                       GLuint mask,
 554                       const struct brw_reg *arg0,
 555                       const struct brw_reg *arg1 )
 556 {
 557    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 558 }
 559
 560 static void emit_sne( struct brw_compile *p,
 561                       const struct brw_reg *dst,
 562                       GLuint mask,
 563                       const struct brw_reg *arg0,
 564                       const struct brw_reg *arg1 )
 565 {
 566    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 567 }
 568
 569 void emit_cmp(struct brw_compile *p,
 570               const struct brw_reg *dst,
 571               GLuint mask,
 572               const struct brw_reg *arg0,
 573               const struct brw_reg *arg1,
 574               const struct brw_reg *arg2)
 575 {
 576    GLuint i;
 577
 578    for (i = 0; i < 4; i++) {
 579       if (mask & (1<<i)) {
 580          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 581          brw_MOV(p, dst[i], arg2[i]);
 582          brw_set_saturate(p, 0);
 583
 584          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 585
 586          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 587          brw_MOV(p, dst[i], arg1[i]);
 588          brw_set_saturate(p, 0);
 589          brw_set_predicate_control_flag_value(p, 0xff);
 590       }
 591    }
 592 }
 593
 594 void emit_max(struct brw_compile *p,
 595               const struct brw_reg *dst,
 596               GLuint mask,
 597               const struct brw_reg *arg0,
 598               const struct brw_reg *arg1)
 599 {
 600    GLuint i;
 601
 602    for (i = 0; i < 4; i++) {
 603       if (mask & (1<<i)) {
 604          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 605          brw_MOV(p, dst[i], arg0[i]);
 606          brw_set_saturate(p, 0);
 607
 608          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 609
 610          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 611          brw_MOV(p, dst[i], arg1[i]);
 612          brw_set_saturate(p, 0);
 613          brw_set_predicate_control_flag_value(p, 0xff);
 614       }
 615    }
 616 }
 617
 618 void emit_min(struct brw_compile *p,
 619               const struct brw_reg *dst,
 620               GLuint mask,
 621               const struct brw_reg *arg0,
 622               const struct brw_reg *arg1)
 623 {
 624    GLuint i;
 625
 626    for (i = 0; i < 4; i++) {
 627       if (mask & (1<<i)) {
 628          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 629          brw_MOV(p, dst[i], arg1[i]);
 630          brw_set_saturate(p, 0);
 631
 632          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 633
 634          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 635          brw_MOV(p, dst[i], arg0[i]);
 636          brw_set_saturate(p, 0);
 637          brw_set_predicate_control_flag_value(p, 0xff);
 638       }
 639    }
 640 }
 641
 642
 643 void emit_dp3(struct brw_compile *p,
 644               const struct brw_reg *dst,
 645               GLuint mask,
 646               const struct brw_reg *arg0,
 647               const struct brw_reg *arg1)
 648 {
 649    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 650
 651    if (!(mask & WRITEMASK_XYZW))
 652       return; /* Do not emit dead code */
 653
 654    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 655
 656    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 657    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 658
 659    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 660    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 661    brw_set_saturate(p, 0);
 662 }
 663
 664
 665 void emit_dp4(struct brw_compile *p,
 666               const struct brw_reg *dst,
 667               GLuint mask,
 668               const struct brw_reg *arg0,
 669               const struct brw_reg *arg1)
 670 {
 671    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 672
 673    if (!(mask & WRITEMASK_XYZW))
 674       return; /* Do not emit dead code */
 675
 676    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 677
 678    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 679    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 680    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 681
 682    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 683    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 684    brw_set_saturate(p, 0);
 685 }
 686
 687
 688 void emit_dph(struct brw_compile *p,
 689               const struct brw_reg *dst,
 690               GLuint mask,
 691               const struct brw_reg *arg0,
 692               const struct brw_reg *arg1)
 693 {
 694    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 695
 696    if (!(mask & WRITEMASK_XYZW))
 697       return; /* Do not emit dead code */
 698
 699    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 700
 701    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 702    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 703    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 704
 705    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 706    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 707    brw_set_saturate(p, 0);
 708 }
 709
 710
 711 void emit_xpd(struct brw_compile *p,
 712               const struct brw_reg *dst,
 713               GLuint mask,
 714               const struct brw_reg *arg0,
 715               const struct brw_reg *arg1)
 716 {
 717    GLuint i;
 718
 719    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 720
 721    for (i = 0 ; i < 3; i++) {
 722       if (mask & (1<<i)) {
 723          GLuint i2 = (i+2)%3;
 724          GLuint i1 = (i+1)%3;
 725
 726          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 727
 728          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 729          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 730          brw_set_saturate(p, 0);
 731       }
 732    }
 733 }
 734
 735
 736 void emit_math1(struct brw_wm_compile *c,
 737                 GLuint function,
 738                 const struct brw_reg *dst,
 739                 GLuint mask,
 740                 const struct brw_reg *arg0)
 741 {
 742    struct brw_compile *p = &c->func;
 743    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 744    GLuint saturate = ((mask & SATURATE) ?
 745                       BRW_MATH_SATURATE_SATURATE :
 746                       BRW_MATH_SATURATE_NONE);
 747
 748    if (!(mask & WRITEMASK_XYZW))
 749       return; /* Do not emit dead code */
 750
 751    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 752
 753    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 754     * channels.
 755     */
 756    brw_MOV(p, brw_message_reg(2), arg0[0]);
 757
 758    /* Send two messages to perform all 16 operations:
 759     */
 760    brw_push_insn_state(p);
 761    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 762    brw_math(p,
 763             dst[dst_chan],
 764             function,
 765             saturate,
 766             2,
 767             brw_null_reg(),
 768             BRW_MATH_DATA_VECTOR,
 769             BRW_MATH_PRECISION_FULL);
 770
 771    if (c->dispatch_width == 16) {
 772       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 773       brw_math(p,
 774                offset(dst[dst_chan],1),
 775                function,
 776                saturate,
 777                3,
 778                brw_null_reg(),
 779                BRW_MATH_DATA_VECTOR,
 780                BRW_MATH_PRECISION_FULL);
 781    }
 782    brw_pop_insn_state(p);
 783 }
 784
 785
 786 void emit_math2(struct brw_wm_compile *c,
 787                 GLuint function,
 788                 const struct brw_reg *dst,
 789                 GLuint mask,
 790                 const struct brw_reg *arg0,
 791                 const struct brw_reg *arg1)
 792 {
 793    struct brw_compile *p = &c->func;
 794    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 795    GLuint saturate = ((mask & SATURATE) ?
 796                       BRW_MATH_SATURATE_SATURATE :
 797                       BRW_MATH_SATURATE_NONE);
 798
 799    if (!(mask & WRITEMASK_XYZW))
 800       return; /* Do not emit dead code */
 801
 802    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 803
 804    brw_push_insn_state(p);
 805
 806    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 807    brw_MOV(p, brw_message_reg(2), arg0[0]);
 808    if (c->dispatch_width == 16) {
 809       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 810       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 811    }
 812
 813    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 814    brw_MOV(p, brw_message_reg(3), arg1[0]);
 815    if (c->dispatch_width == 16) {
 816       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 817       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 818    }
 819
 820    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 821    brw_math(p,
 822             dst[dst_chan],
 823             function,
 824             saturate,
 825             2,
 826             brw_null_reg(),
 827             BRW_MATH_DATA_VECTOR,
 828             BRW_MATH_PRECISION_FULL);
 829
 830    /* Send two messages to perform all 16 operations:
 831     */
 832    if (c->dispatch_width == 16) {
 833       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 834       brw_math(p,
 835                offset(dst[dst_chan],1),
 836                function,
 837                saturate,
 838                4,
 839                brw_null_reg(),
 840                BRW_MATH_DATA_VECTOR,
 841                BRW_MATH_PRECISION_FULL);
 842    }
 843    brw_pop_insn_state(p);
 844 }
 845
 846
 847 void emit_tex(struct brw_wm_compile *c,
 848               struct brw_reg *dst,
 849               GLuint dst_flags,
 850               struct brw_reg *arg,
 851               struct brw_reg depth_payload,
 852               GLuint tex_idx,
 853               GLuint sampler,
 854               GLboolean shadow)
 855 {
 856    struct brw_compile *p = &c->func;
 857    struct intel_context *intel = &p->brw->intel;
 858    struct brw_reg dst_retyped;
 859    GLuint cur_mrf = 2, response_length;
 860    GLuint i, nr_texcoords;
 861    GLuint emit;
 862    GLuint msg_type;
 863    GLuint mrf_per_channel;
 864    GLuint simd_mode;
 865
 866    if (c->dispatch_width == 16) {
 867       mrf_per_channel = 2;
 868       response_length = 8;
 869       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 870       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 871    } else {
 872       mrf_per_channel = 1;
 873       response_length = 4;
 874       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 875       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 876    }
 877
 878    /* How many input regs are there?
 879     */
 880    switch (tex_idx) {
 881    case TEXTURE_1D_INDEX:
 882       emit = WRITEMASK_X;
 883       nr_texcoords = 1;
 884       break;
 885    case TEXTURE_2D_INDEX:
 886    case TEXTURE_RECT_INDEX:
 887       emit = WRITEMASK_XY;
 888       nr_texcoords = 2;
 889       break;
 890    case TEXTURE_3D_INDEX:
 891    case TEXTURE_CUBE_INDEX:
 892       emit = WRITEMASK_XYZ;
 893       nr_texcoords = 3;
 894       break;
 895    default:
 896       /* unexpected target */
 897       abort();
 898    }
 899
 900    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 901    if (!intel->is_ironlake && c->dispatch_width == 8)
 902       nr_texcoords = 3;
 903
 904    /* For shadow comparisons, we have to supply u,v,r. */
 905    if (shadow)
 906       nr_texcoords = 3;
 907
 908    /* Emit the texcoords. */
 909    for (i = 0; i < nr_texcoords; i++) {
 910       if (emit & (1<<i))
 911          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 912       else
 913          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 914       cur_mrf += mrf_per_channel;
 915    }
 916
 917    /* Fill in the shadow comparison reference value. */
 918    if (shadow) {
 919       if (intel->is_ironlake) {
 920          /* Fill in the cube map array index value. */
 921          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 922          cur_mrf += mrf_per_channel;
 923       } else if (c->dispatch_width == 8) {
 924          /* Fill in the LOD bias value. */
 925          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 926          cur_mrf += mrf_per_channel;
 927       }
 928       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 929       cur_mrf += mrf_per_channel;
 930    }
 931
 932    if (intel->is_ironlake) {
 933       if (shadow)
 934          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 935       else
 936          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 937    } else {
 938       /* Note that G45 and older determines shadow compare and dispatch width
 939        * from message length for most messages.
 940        */
 941       if (c->dispatch_width == 16 && shadow)
 942          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 943       else
 944          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 945    }
 946
 947    brw_SAMPLE(p,
 948               dst_retyped,
 949               1,
 950               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 951               SURF_INDEX_TEXTURE(sampler),
 952               sampler,
 953               dst_flags & WRITEMASK_XYZW,
 954               msg_type,
 955               response_length,
 956               cur_mrf - 1,
 957               0,
 958               1,
 959               simd_mode);
 960 }
 961
 962
 963 void emit_txb(struct brw_wm_compile *c,
 964               struct brw_reg *dst,
 965               GLuint dst_flags,
 966               struct brw_reg *arg,
 967               struct brw_reg depth_payload,
 968               GLuint tex_idx,
 969               GLuint sampler)
 970 {
 971    struct brw_compile *p = &c->func;
 972    struct intel_context *intel = &p->brw->intel;
 973    GLuint msgLength;
 974    GLuint msg_type;
 975    GLuint mrf_per_channel;
 976    GLuint response_length;
 977    struct brw_reg dst_retyped;
 978
 979    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
 980     * samples, so we'll use the 16-wide instruction, leave the second halves
 981     * undefined, and trust the execution mask to keep the undefined pixels
 982     * from mattering.
 983     */
 984    if (c->dispatch_width == 16 || !intel->is_ironlake) {
 985       if (intel->is_ironlake)
 986          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 987       else
 988          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 989       mrf_per_channel = 2;
 990       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 991       response_length = 8;
 992    } else {
 993       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
 994       mrf_per_channel = 1;
 995       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 996       response_length = 4;
 997    }
 998
 999    /* Shadow ignored for txb. */
1000    switch (tex_idx) {
1001    case TEXTURE_1D_INDEX:
1002       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1003       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1004       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1005       break;
1006    case TEXTURE_2D_INDEX:
1007    case TEXTURE_RECT_INDEX:
1008       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1009       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1010       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1011       break;
1012    case TEXTURE_3D_INDEX:
1013    case TEXTURE_CUBE_INDEX:
1014       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1015       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1016       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1017       break;
1018    default:
1019       /* unexpected target */
1020       abort();
1021    }
1022
1023    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1024    msgLength = 2 + 4 * mrf_per_channel - 1;
1025
1026    brw_SAMPLE(p,
1027               dst_retyped,
1028               1,
1029               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1030               SURF_INDEX_TEXTURE(sampler),
1031               sampler,
1032               dst_flags & WRITEMASK_XYZW,
1033               msg_type,
1034               response_length,
1035               msgLength,
1036               0,
1037               1,
1038               BRW_SAMPLER_SIMD_MODE_SIMD16);
1039 }
1040
1041
1042 static void emit_lit(struct brw_wm_compile *c,
1043                      const struct brw_reg *dst,
1044                      GLuint mask,
1045                      const struct brw_reg *arg0)
1046 {
1047    struct brw_compile *p = &c->func;
1048
1049    assert((mask & WRITEMASK_XW) == 0);
1050
1051    if (mask & WRITEMASK_Y) {
1052       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1053       brw_MOV(p, dst[1], arg0[0]);
1054       brw_set_saturate(p, 0);
1055    }
1056
1057    if (mask & WRITEMASK_Z) {
1058       emit_math2(c, BRW_MATH_FUNCTION_POW,
1059                  &dst[2],
1060                  WRITEMASK_X | (mask & SATURATE),
1061                  &arg0[1],
1062                  &arg0[3]);
1063    }
1064
1065    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1066     * some of the POW calculations above, but 16-wide iff statements
1067     * seem to lock c1 hardware, so this is a nasty workaround:
1068     */
1069    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1070    {
1071       if (mask & WRITEMASK_Y)
1072          brw_MOV(p, dst[1], brw_imm_f(0));
1073
1074       if (mask & WRITEMASK_Z)
1075          brw_MOV(p, dst[2], brw_imm_f(0));
1076    }
1077    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1078 }
1079
1080
1081 /* Kill pixel - set execution mask to zero for those pixels which
1082  * fail.
1083  */
1084 static void emit_kil( struct brw_wm_compile *c,
1085                       struct brw_reg *arg0)
1086 {
1087    struct brw_compile *p = &c->func;
1088    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1089    GLuint i;
1090
1091    /* XXX - usually won't need 4 compares!
1092     */
1093    for (i = 0; i < 4; i++) {
1094       brw_push_insn_state(p);
1095       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1096       brw_set_predicate_control_flag_value(p, 0xff);
1097       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1098       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1099       brw_pop_insn_state(p);
1100    }
1101 }
1102
1103 /* KIL_NV kills the pixels that are currently executing, not based on a test
1104  * of the arguments.
1105  */
1106 static void emit_kil_nv( struct brw_wm_compile *c )
1107 {
1108    struct brw_compile *p = &c->func;
1109    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1110
1111    brw_push_insn_state(p);
1112    brw_set_mask_control(p, BRW_MASK_DISABLE);
1113    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1114    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1115    brw_pop_insn_state(p);
1116 }
1117
1118 static void fire_fb_write( struct brw_wm_compile *c,
1119                            GLuint base_reg,
1120                            GLuint nr,
1121                            GLuint target,
1122                            GLuint eot )
1123 {
1124    struct brw_compile *p = &c->func;
1125    struct brw_reg dst;
1126
1127    if (c->dispatch_width == 16)
1128       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1129    else
1130       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1131
1132    /* Pass through control information:
1133     */
1134 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1135    {
1136       brw_push_insn_state(p);
1137       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1138       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1139       brw_MOV(p,
1140                brw_message_reg(base_reg + 1),
1141                brw_vec8_grf(1, 0));
1142       brw_pop_insn_state(p);
1143    }
1144
1145    /* Send framebuffer write message: */
1146 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1147    brw_fb_WRITE(p,
1148                 dst,
1149                 base_reg,
1150                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1151                 target,
1152                 nr,
1153                 0,
1154                 eot);
1155 }
1156
1157
1158 static void emit_aa( struct brw_wm_compile *c,
1159                      struct brw_reg *arg1,
1160                      GLuint reg )
1161 {
1162    struct brw_compile *p = &c->func;
1163    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1164    GLuint off = c->key.aa_dest_stencil_reg % 2;
1165    struct brw_reg aa = offset(arg1[comp], off);
1166
1167    brw_push_insn_state(p);
1168    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1169    brw_MOV(p, brw_message_reg(reg), aa);
1170    brw_pop_insn_state(p);
1171 }
1172
1173
1174 /* Post-fragment-program processing.  Send the results to the
1175  * framebuffer.
1176  * \param arg0  the fragment color
1177  * \param arg1  the pass-through depth value
1178  * \param arg2  the shader-computed depth value
1179  */
1180 void emit_fb_write(struct brw_wm_compile *c,
1181                    struct brw_reg *arg0,
1182                    struct brw_reg *arg1,
1183                    struct brw_reg *arg2,
1184                    GLuint target,
1185                    GLuint eot)
1186 {
1187    struct brw_compile *p = &c->func;
1188    struct brw_context *brw = p->brw;
1189    GLuint nr = 2;
1190    GLuint channel;
1191
1192    /* Reserve a space for AA - may not be needed:
1193     */
1194    if (c->key.aa_dest_stencil_reg)
1195       nr += 1;
1196
1197    /* I don't really understand how this achieves the color interleave
1198     * (ie RGBARGBA) in the result:  [Do the saturation here]
1199     */
1200    brw_push_insn_state(p);
1201
1202    for (channel = 0; channel < 4; channel++) {
1203       if (c->dispatch_width == 16 && brw->has_compr4) {
1204          /* By setting the high bit of the MRF register number, we indicate
1205           * that we want COMPR4 mode - instead of doing the usual destination
1206           * + 1 for the second half we get destination + 4.
1207           */
1208          brw_MOV(p,
1209                  brw_message_reg(nr + channel + (1 << 7)),
1210                  arg0[channel]);
1211       } else {
1212          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1213          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1214          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1215          brw_MOV(p,
1216                  brw_message_reg(nr + channel),
1217                  arg0[channel]);
1218
1219          if (c->dispatch_width == 16) {
1220             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1221             brw_MOV(p,
1222                     brw_message_reg(nr + channel + 4),
1223                     sechalf(arg0[channel]));
1224          }
1225       }
1226    }
1227    /* skip over the regs populated above:
1228     */
1229    nr += 8;
1230    brw_pop_insn_state(p);
1231
1232    if (c->key.source_depth_to_render_target)
1233    {
1234       if (c->key.computes_depth)
1235          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1236       else
1237          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1238
1239       nr += 2;
1240    }
1241
1242    if (c->key.dest_depth_reg)
1243    {
1244       GLuint comp = c->key.dest_depth_reg / 2;
1245       GLuint off = c->key.dest_depth_reg % 2;
1246
1247       if (off != 0) {
1248          brw_push_insn_state(p);
1249          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1250
1251          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1252          /* 2nd half? */
1253          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1254          brw_pop_insn_state(p);
1255       }
1256       else {
1257          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1258       }
1259       nr += 2;
1260    }
1261
1262    if (!c->key.runtime_check_aads_emit) {
1263       if (c->key.aa_dest_stencil_reg)
1264          emit_aa(c, arg1, 2);
1265
1266       fire_fb_write(c, 0, nr, target, eot);
1267    }
1268    else {
1269       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1270       struct brw_reg ip = brw_ip_reg();
1271       struct brw_instruction *jmp;
1272
1273       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1274       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1275       brw_AND(p,
1276               v1_null_ud,
1277               get_element_ud(brw_vec8_grf(1,0), 6),
1278               brw_imm_ud(1<<26));
1279
1280       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1281       {
1282          emit_aa(c, arg1, 2);
1283          fire_fb_write(c, 0, nr, target, eot);
1284          /* note - thread killed in subroutine */
1285       }
1286       brw_land_fwd_jump(p, jmp);
1287
1288       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1289        */
1290       fire_fb_write(c, 1, nr-1, target, eot);
1291    }
1292 }
1293
1294 /**
1295  * Move a GPR to scratch memory.
1296  */
1297 static void emit_spill( struct brw_wm_compile *c,
1298                         struct brw_reg reg,
1299                         GLuint slot )
1300 {
1301    struct brw_compile *p = &c->func;
1302
1303    /*
1304      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1305    */
1306    brw_MOV(p, brw_message_reg(2), reg);
1307
1308    /*
1309      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1310      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1311    */
1312    brw_dp_WRITE_16(p,
1313                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1314                    slot);
1315 }
1316
1317
1318 /**
1319  * Load a GPR from scratch memory.
1320  */
1321 static void emit_unspill( struct brw_wm_compile *c,
1322                           struct brw_reg reg,
1323                           GLuint slot )
1324 {
1325    struct brw_compile *p = &c->func;
1326
1327    /* Slot 0 is the undef value.
1328     */
1329    if (slot == 0) {
1330       brw_MOV(p, reg, brw_imm_f(0));
1331       return;
1332    }
1333
1334    /*
1335      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1336      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1337    */
1338
1339    brw_dp_READ_16(p,
1340                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1341                   slot);
1342 }
1343
1344
1345 /**
1346  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1347  * Args with unspill_reg != 0 will be loaded from scratch memory.
1348  */
1349 static void get_argument_regs( struct brw_wm_compile *c,
1350                                struct brw_wm_ref *arg[],
1351                                struct brw_reg *regs )
1352 {
1353    GLuint i;
1354
1355    for (i = 0; i < 4; i++) {
1356       if (arg[i]) {
1357          if (arg[i]->unspill_reg)
1358             emit_unspill(c,
1359                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1360                          arg[i]->value->spill_slot);
1361
1362          regs[i] = arg[i]->hw_reg;
1363       }
1364       else {
1365          regs[i] = brw_null_reg();
1366       }
1367    }
1368 }
1369
1370
1371 /**
1372  * For values that have a spill_slot!=0, write those regs to scratch memory.
1373  */
1374 static void spill_values( struct brw_wm_compile *c,
1375                           struct brw_wm_value *values,
1376                           GLuint nr )
1377 {
1378    GLuint i;
1379
1380    for (i = 0; i < nr; i++)
1381       if (values[i].spill_slot)
1382          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1383 }
1384
1385
1386 /* Emit the fragment program instructions here.
1387  */
1388 void brw_wm_emit( struct brw_wm_compile *c )
1389 {
1390    struct brw_compile *p = &c->func;
1391    GLuint insn;
1392
1393    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1394
1395    /* Check if any of the payload regs need to be spilled:
1396     */
1397    spill_values(c, c->payload.depth, 4);
1398    spill_values(c, c->creg, c->nr_creg);
1399    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1400
1401
1402    for (insn = 0; insn < c->nr_insns; insn++) {
1403
1404       struct brw_wm_instruction *inst = &c->instruction[insn];
1405       struct brw_reg args[3][4], dst[4];
1406       GLuint i, dst_flags;
1407
1408       /* Get argument regs:
1409        */
1410       for (i = 0; i < 3; i++)
1411          get_argument_regs(c, inst->src[i], args[i]);
1412
1413       /* Get dest regs:
1414        */
1415       for (i = 0; i < 4; i++)
1416          if (inst->dst[i])
1417             dst[i] = inst->dst[i]->hw_reg;
1418          else
1419             dst[i] = brw_null_reg();
1420
1421       /* Flags
1422        */
1423       dst_flags = inst->writemask;
1424       if (inst->saturate)
1425          dst_flags |= SATURATE;
1426
1427       switch (inst->opcode) {
1428          /* Generated instructions for calculating triangle interpolants:
1429           */
1430       case WM_PIXELXY:
1431          emit_pixel_xy(c, dst, dst_flags);
1432          break;
1433
1434       case WM_DELTAXY:
1435          emit_delta_xy(p, dst, dst_flags, args[0]);
1436          break;
1437
1438       case WM_WPOSXY:
1439          emit_wpos_xy(c, dst, dst_flags, args[0]);
1440          break;
1441
1442       case WM_PIXELW:
1443          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1444          break;
1445
1446       case WM_LINTERP:
1447          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1448          break;
1449
1450       case WM_PINTERP:
1451          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1452          break;
1453
1454       case WM_CINTERP:
1455          emit_cinterp(p, dst, dst_flags, args[0]);
1456          break;
1457
1458       case WM_FB_WRITE:
1459          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1460          break;
1461
1462       case WM_FRONTFACING:
1463          emit_frontfacing(p, dst, dst_flags);
1464          break;
1465
1466          /* Straightforward arithmetic:
1467           */
1468       case OPCODE_ADD:
1469          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1470          break;
1471
1472       case OPCODE_FRC:
1473          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1474          break;
1475
1476       case OPCODE_FLR:
1477          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1478          break;
1479
1480       case OPCODE_DDX:
1481          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1482          break;
1483
1484       case OPCODE_DDY:
1485          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1486          break;
1487
1488       case OPCODE_DP3:
1489          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1490          break;
1491
1492       case OPCODE_DP4:
1493          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1494          break;
1495
1496       case OPCODE_DPH:
1497          emit_dph(p, dst, dst_flags, args[0], args[1]);
1498          break;
1499
1500       case OPCODE_TRUNC:
1501          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1502          break;
1503
1504       case OPCODE_LRP:
1505          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1506          break;
1507
1508       case OPCODE_MAD:
1509          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1510          break;
1511
1512       case OPCODE_MOV:
1513       case OPCODE_SWZ:
1514          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1515          break;
1516
1517       case OPCODE_MUL:
1518          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1519          break;
1520
1521       case OPCODE_XPD:
1522          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1523          break;
1524
1525          /* Higher math functions:
1526           */
1527       case OPCODE_RCP:
1528          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1529          break;
1530
1531       case OPCODE_RSQ:
1532          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1533          break;
1534
1535       case OPCODE_SIN:
1536          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1537          break;
1538
1539       case OPCODE_COS:
1540          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1541          break;
1542
1543       case OPCODE_EX2:
1544          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1545          break;
1546
1547       case OPCODE_LG2:
1548          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1549          break;
1550
1551       case OPCODE_SCS:
1552          /* There is an scs math function, but it would need some
1553           * fixup for 16-element execution.
1554           */
1555          if (dst_flags & WRITEMASK_X)
1556             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1557          if (dst_flags & WRITEMASK_Y)
1558             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1559          break;
1560
1561       case OPCODE_POW:
1562          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1563          break;
1564
1565          /* Comparisons:
1566           */
1567       case OPCODE_CMP:
1568          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1569          break;
1570
1571       case OPCODE_MAX:
1572          emit_max(p, dst, dst_flags, args[0], args[1]);
1573          break;
1574
1575       case OPCODE_MIN:
1576          emit_min(p, dst, dst_flags, args[0], args[1]);
1577          break;
1578
1579       case OPCODE_SLT:
1580          emit_slt(p, dst, dst_flags, args[0], args[1]);
1581          break;
1582
1583       case OPCODE_SLE:
1584          emit_sle(p, dst, dst_flags, args[0], args[1]);
1585         break;
1586       case OPCODE_SGT:
1587          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1588         break;
1589       case OPCODE_SGE:
1590          emit_sge(p, dst, dst_flags, args[0], args[1]);
1591          break;
1592       case OPCODE_SEQ:
1593          emit_seq(p, dst, dst_flags, args[0], args[1]);
1594         break;
1595       case OPCODE_SNE:
1596          emit_sne(p, dst, dst_flags, args[0], args[1]);
1597         break;
1598
1599       case OPCODE_LIT:
1600          emit_lit(c, dst, dst_flags, args[0]);
1601          break;
1602
1603          /* Texturing operations:
1604           */
1605       case OPCODE_TEX:
1606          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1607                   inst->tex_idx, inst->tex_unit,
1608                   inst->tex_shadow);
1609          break;
1610
1611       case OPCODE_TXB:
1612          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1613                   inst->tex_idx, inst->tex_unit);
1614          break;
1615
1616       case OPCODE_KIL:
1617          emit_kil(c, args[0]);
1618          break;
1619
1620       case OPCODE_KIL_NV:
1621          emit_kil_nv(c);
1622          break;
1623
1624       default:
1625          printf("Unsupported opcode %i (%s) in fragment shader\n",
1626                 inst->opcode, inst->opcode < MAX_OPCODE ?
1627                 _mesa_opcode_string(inst->opcode) :
1628                 "unknown");
1629       }
1630
1631       for (i = 0; i < 4; i++)
1632         if (inst->dst[i] && inst->dst[i]->spill_slot)
1633            emit_spill(c,
1634                       inst->dst[i]->hw_reg,
1635                       inst->dst[i]->spill_slot);
1636    }
1637
1638    if (INTEL_DEBUG & DEBUG_WM) {
1639       int i;
1640
1641       printf("wm-native:\n");
1642       for (i = 0; i < p->nr_insn; i++)
1643          brw_disasm(stderr, &p->store[i]);
1644       printf("\n");
1645    }
1646 }