src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64
  65 /**
  66  * Computes the screen-space x,y position of the pixels.
  67  *
  68  * This will be used by emit_delta_xy() or emit_wpos_xy() for
  69  * interpolation of attributes..
  70  *
  71  * Payload R0:
  72  *
  73  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  74  *         corresponding to each of the 16 execution channels.
  75  * R0.1..8 -- ?
  76  * R1.0 -- triangle vertex 0.X
  77  * R1.1 -- triangle vertex 0.Y
  78  * R1.2 -- tile 0 x,y coords (2 packed uwords)
  79  * R1.3 -- tile 1 x,y coords (2 packed uwords)
  80  * R1.4 -- tile 2 x,y coords (2 packed uwords)
  81  * R1.5 -- tile 3 x,y coords (2 packed uwords)
  82  * R1.6 -- ?
  83  * R1.7 -- ?
  84  * R1.8 -- ?
  85  */
  86 void emit_pixel_xy(struct brw_wm_compile *c,
  87                    const struct brw_reg *dst,
  88                    GLuint mask)
  89 {
  90    struct brw_compile *p = &c->func;
  91    struct brw_reg r1 = brw_vec1_grf(1, 0);
  92    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
  93    struct brw_reg dst0_uw, dst1_uw;
  94
  95    brw_push_insn_state(p);
  96    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  97
  98    if (c->dispatch_width == 16) {
  99       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 100       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 101    } else {
 102       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 103       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 104    }
 105
 106    /* Calculate pixel centers by adding 1 or 0 to each of the
 107     * micro-tile coordinates passed in r1.
 108     */
 109    if (mask & WRITEMASK_X) {
 110       brw_ADD(p,
 111               dst0_uw,
 112               stride(suboffset(r1_uw, 4), 2, 4, 0),
 113               brw_imm_v(0x10101010));
 114    }
 115
 116    if (mask & WRITEMASK_Y) {
 117       brw_ADD(p,
 118               dst1_uw,
 119               stride(suboffset(r1_uw,5), 2, 4, 0),
 120               brw_imm_v(0x11001100));
 121    }
 122    brw_pop_insn_state(p);
 123 }
 124
 125 /**
 126  * Computes the screen-space x,y distance of the pixels from the start
 127  * vertex.
 128  *
 129  * This will be used in linterp or pinterp with the start vertex value
 130  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 131  * to produce interpolated attribute values.
 132  */
 133 void emit_delta_xy(struct brw_compile *p,
 134                    const struct brw_reg *dst,
 135                    GLuint mask,
 136                    const struct brw_reg *arg0)
 137 {
 138    struct brw_reg r1 = brw_vec1_grf(1, 0);
 139
 140    if (mask == 0)
 141       return;
 142
 143    assert(mask == WRITEMASK_XY);
 144
 145    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 146     * centers produced by emit_pixel_xy().
 147     */
 148    brw_ADD(p,
 149            dst[0],
 150            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 151            negate(r1));
 152    brw_ADD(p,
 153            dst[1],
 154            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 155            negate(suboffset(r1,1)));
 156 }
 157
 158 /**
 159  * Computes the pixel offset from the window origin for gl_FragCoord().
 160  */
 161 void emit_wpos_xy(struct brw_wm_compile *c,
 162                   const struct brw_reg *dst,
 163                   GLuint mask,
 164                   const struct brw_reg *arg0)
 165 {
 166    struct brw_compile *p = &c->func;
 167
 168    if (mask & WRITEMASK_X) {
 169       if (c->fp->program.PixelCenterInteger) {
 170          /* X' = X */
 171          brw_MOV(p,
 172                  dst[0],
 173                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 174       } else {
 175          /* X' = X + 0.5 */
 176          brw_ADD(p,
 177                  dst[0],
 178                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 179                  brw_imm_f(0.5));
 180       }
 181    }
 182
 183    if (mask & WRITEMASK_Y) {
 184       if (c->fp->program.OriginUpperLeft) {
 185          if (c->fp->program.PixelCenterInteger) {
 186             /* Y' = Y */
 187             brw_MOV(p,
 188                     dst[1],
 189                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 190          } else {
 191             /* Y' = Y + 0.5 */
 192             brw_ADD(p,
 193                     dst[1],
 194                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 195                     brw_imm_f(0.5));
 196          }
 197       } else {
 198          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 199
 200          /* Y' = (height - 1) - Y + center */
 201          brw_ADD(p,
 202                  dst[1],
 203                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 204                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 205       }
 206    }
 207 }
 208
 209
 210 void emit_pixel_w(struct brw_wm_compile *c,
 211                   const struct brw_reg *dst,
 212                   GLuint mask,
 213                   const struct brw_reg *arg0,
 214                   const struct brw_reg *deltas)
 215 {
 216    struct brw_compile *p = &c->func;
 217    struct intel_context *intel = &p->brw->intel;
 218
 219    /* Don't need this if all you are doing is interpolating color, for
 220     * instance.
 221     */
 222    if (mask & WRITEMASK_W) {
 223       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 224
 225       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 226        * result straight into a message reg.
 227        */
 228       if (can_do_pln(intel, deltas)) {
 229          brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
 230       } else {
 231          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 232          brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 233       }
 234
 235       /* Calc w */
 236       if (c->dispatch_width == 16) {
 237          brw_math_16(p, dst[3],
 238                      BRW_MATH_FUNCTION_INV,
 239                      BRW_MATH_SATURATE_NONE,
 240                      2, brw_null_reg(),
 241                      BRW_MATH_PRECISION_FULL);
 242       } else {
 243          brw_math(p, dst[3],
 244                   BRW_MATH_FUNCTION_INV,
 245                   BRW_MATH_SATURATE_NONE,
 246                   2, brw_null_reg(),
 247                   BRW_MATH_DATA_VECTOR,
 248                   BRW_MATH_PRECISION_FULL);
 249       }
 250    }
 251 }
 252
 253
 254 void emit_linterp(struct brw_compile *p,
 255                   const struct brw_reg *dst,
 256                   GLuint mask,
 257                   const struct brw_reg *arg0,
 258                   const struct brw_reg *deltas)
 259 {
 260    struct intel_context *intel = &p->brw->intel;
 261    struct brw_reg interp[4];
 262    GLuint nr = arg0[0].nr;
 263    GLuint i;
 264
 265    interp[0] = brw_vec1_grf(nr, 0);
 266    interp[1] = brw_vec1_grf(nr, 4);
 267    interp[2] = brw_vec1_grf(nr+1, 0);
 268    interp[3] = brw_vec1_grf(nr+1, 4);
 269
 270    for (i = 0; i < 4; i++) {
 271       if (mask & (1<<i)) {
 272          if (can_do_pln(intel, deltas)) {
 273             brw_PLN(p, dst[i], interp[i], deltas[0]);
 274          } else {
 275             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 276             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 277          }
 278       }
 279    }
 280 }
 281
 282
 283 void emit_pinterp(struct brw_compile *p,
 284                   const struct brw_reg *dst,
 285                   GLuint mask,
 286                   const struct brw_reg *arg0,
 287                   const struct brw_reg *deltas,
 288                   const struct brw_reg *w)
 289 {
 290    struct intel_context *intel = &p->brw->intel;
 291    struct brw_reg interp[4];
 292    GLuint nr = arg0[0].nr;
 293    GLuint i;
 294
 295    interp[0] = brw_vec1_grf(nr, 0);
 296    interp[1] = brw_vec1_grf(nr, 4);
 297    interp[2] = brw_vec1_grf(nr+1, 0);
 298    interp[3] = brw_vec1_grf(nr+1, 4);
 299
 300    for (i = 0; i < 4; i++) {
 301       if (mask & (1<<i)) {
 302          if (can_do_pln(intel, deltas)) {
 303             brw_PLN(p, dst[i], interp[i], deltas[0]);
 304          } else {
 305             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 306             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 307          }
 308       }
 309    }
 310    for (i = 0; i < 4; i++) {
 311       if (mask & (1<<i)) {
 312          brw_MUL(p, dst[i], dst[i], w[3]);
 313       }
 314    }
 315 }
 316
 317
 318 void emit_cinterp(struct brw_compile *p,
 319                   const struct brw_reg *dst,
 320                   GLuint mask,
 321                   const struct brw_reg *arg0)
 322 {
 323    struct brw_reg interp[4];
 324    GLuint nr = arg0[0].nr;
 325    GLuint i;
 326
 327    interp[0] = brw_vec1_grf(nr, 0);
 328    interp[1] = brw_vec1_grf(nr, 4);
 329    interp[2] = brw_vec1_grf(nr+1, 0);
 330    interp[3] = brw_vec1_grf(nr+1, 4);
 331
 332    for (i = 0; i < 4; i++) {
 333       if (mask & (1<<i)) {
 334          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 335       }
 336    }
 337 }
 338
 339 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 340 void emit_frontfacing(struct brw_compile *p,
 341                       const struct brw_reg *dst,
 342                       GLuint mask)
 343 {
 344    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 345    GLuint i;
 346
 347    if (!(mask & WRITEMASK_XYZW))
 348       return;
 349
 350    for (i = 0; i < 4; i++) {
 351       if (mask & (1<<i)) {
 352          brw_MOV(p, dst[i], brw_imm_f(0.0));
 353       }
 354    }
 355
 356    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 357     * us front face
 358     */
 359    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 360    for (i = 0; i < 4; i++) {
 361       if (mask & (1<<i)) {
 362          brw_MOV(p, dst[i], brw_imm_f(1.0));
 363       }
 364    }
 365    brw_set_predicate_control_flag_value(p, 0xff);
 366 }
 367
 368 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 369  * looking like:
 370  *
 371  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 372  *
 373  * and we're trying to produce:
 374  *
 375  *           DDX                     DDY
 376  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 377  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 378  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 379  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 380  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 381  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 382  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 383  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 384  *
 385  * and add another set of two more subspans if in 16-pixel dispatch mode.
 386  *
 387  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 388  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 389  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 390  * between each other.  We could probably do it like ddx and swizzle the right
 391  * order later, but bail for now and just produce
 392  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 393  */
 394 void emit_ddxy(struct brw_compile *p,
 395                const struct brw_reg *dst,
 396                GLuint mask,
 397                GLboolean is_ddx,
 398                const struct brw_reg *arg0)
 399 {
 400    int i;
 401    struct brw_reg src0, src1;
 402
 403    if (mask & SATURATE)
 404       brw_set_saturate(p, 1);
 405    for (i = 0; i < 4; i++ ) {
 406       if (mask & (1<<i)) {
 407          if (is_ddx) {
 408             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 409                            BRW_REGISTER_TYPE_F,
 410                            BRW_VERTICAL_STRIDE_2,
 411                            BRW_WIDTH_2,
 412                            BRW_HORIZONTAL_STRIDE_0,
 413                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 414             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 415                            BRW_REGISTER_TYPE_F,
 416                            BRW_VERTICAL_STRIDE_2,
 417                            BRW_WIDTH_2,
 418                            BRW_HORIZONTAL_STRIDE_0,
 419                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 420          } else {
 421             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 422                            BRW_REGISTER_TYPE_F,
 423                            BRW_VERTICAL_STRIDE_4,
 424                            BRW_WIDTH_4,
 425                            BRW_HORIZONTAL_STRIDE_0,
 426                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 427             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 428                            BRW_REGISTER_TYPE_F,
 429                            BRW_VERTICAL_STRIDE_4,
 430                            BRW_WIDTH_4,
 431                            BRW_HORIZONTAL_STRIDE_0,
 432                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 433          }
 434          brw_ADD(p, dst[i], src0, negate(src1));
 435       }
 436    }
 437    if (mask & SATURATE)
 438       brw_set_saturate(p, 0);
 439 }
 440
 441 void emit_alu1(struct brw_compile *p,
 442                struct brw_instruction *(*func)(struct brw_compile *,
 443                                                struct brw_reg,
 444                                                struct brw_reg),
 445                const struct brw_reg *dst,
 446                GLuint mask,
 447                const struct brw_reg *arg0)
 448 {
 449    GLuint i;
 450
 451    if (mask & SATURATE)
 452       brw_set_saturate(p, 1);
 453
 454    for (i = 0; i < 4; i++) {
 455       if (mask & (1<<i)) {
 456          func(p, dst[i], arg0[i]);
 457       }
 458    }
 459
 460    if (mask & SATURATE)
 461       brw_set_saturate(p, 0);
 462 }
 463
 464
 465 void emit_alu2(struct brw_compile *p,
 466                struct brw_instruction *(*func)(struct brw_compile *,
 467                                                struct brw_reg,
 468                                                struct brw_reg,
 469                                                struct brw_reg),
 470                const struct brw_reg *dst,
 471                GLuint mask,
 472                const struct brw_reg *arg0,
 473                const struct brw_reg *arg1)
 474 {
 475    GLuint i;
 476
 477    if (mask & SATURATE)
 478       brw_set_saturate(p, 1);
 479
 480    for (i = 0; i < 4; i++) {
 481       if (mask & (1<<i)) {
 482          func(p, dst[i], arg0[i], arg1[i]);
 483       }
 484    }
 485
 486    if (mask & SATURATE)
 487       brw_set_saturate(p, 0);
 488 }
 489
 490
 491 void emit_mad(struct brw_compile *p,
 492               const struct brw_reg *dst,
 493               GLuint mask,
 494               const struct brw_reg *arg0,
 495               const struct brw_reg *arg1,
 496               const struct brw_reg *arg2)
 497 {
 498    GLuint i;
 499
 500    for (i = 0; i < 4; i++) {
 501       if (mask & (1<<i)) {
 502          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 503
 504          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 505          brw_ADD(p, dst[i], dst[i], arg2[i]);
 506          brw_set_saturate(p, 0);
 507       }
 508    }
 509 }
 510
 511 void emit_lrp(struct brw_compile *p,
 512               const struct brw_reg *dst,
 513               GLuint mask,
 514               const struct brw_reg *arg0,
 515               const struct brw_reg *arg1,
 516               const struct brw_reg *arg2)
 517 {
 518    GLuint i;
 519
 520    /* Uses dst as a temporary:
 521     */
 522    for (i = 0; i < 4; i++) {
 523       if (mask & (1<<i)) {
 524          /* Can I use the LINE instruction for this?
 525           */
 526          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 527          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 528
 529          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 530          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 531          brw_set_saturate(p, 0);
 532       }
 533    }
 534 }
 535
 536 void emit_sop(struct brw_compile *p,
 537               const struct brw_reg *dst,
 538               GLuint mask,
 539               GLuint cond,
 540               const struct brw_reg *arg0,
 541               const struct brw_reg *arg1)
 542 {
 543    GLuint i;
 544
 545    for (i = 0; i < 4; i++) {
 546       if (mask & (1<<i)) {
 547          brw_push_insn_state(p);
 548          brw_CMP(p, brw_null_reg(), cond, arg1[i], arg0[i]);
 549          brw_SEL(p, dst[i], brw_null_reg(), brw_imm_f(1.0));
 550          brw_pop_insn_state(p);
 551       }
 552    }
 553 }
 554
 555 static void emit_slt( struct brw_compile *p,
 556                       const struct brw_reg *dst,
 557                       GLuint mask,
 558                       const struct brw_reg *arg0,
 559                       const struct brw_reg *arg1 )
 560 {
 561    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 562 }
 563
 564 static void emit_sle( struct brw_compile *p,
 565                       const struct brw_reg *dst,
 566                       GLuint mask,
 567                       const struct brw_reg *arg0,
 568                       const struct brw_reg *arg1 )
 569 {
 570    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 571 }
 572
 573 static void emit_sgt( struct brw_compile *p,
 574                       const struct brw_reg *dst,
 575                       GLuint mask,
 576                       const struct brw_reg *arg0,
 577                       const struct brw_reg *arg1 )
 578 {
 579    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 580 }
 581
 582 static void emit_sge( struct brw_compile *p,
 583                       const struct brw_reg *dst,
 584                       GLuint mask,
 585                       const struct brw_reg *arg0,
 586                       const struct brw_reg *arg1 )
 587 {
 588    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 589 }
 590
 591 static void emit_seq( struct brw_compile *p,
 592                       const struct brw_reg *dst,
 593                       GLuint mask,
 594                       const struct brw_reg *arg0,
 595                       const struct brw_reg *arg1 )
 596 {
 597    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 598 }
 599
 600 static void emit_sne( struct brw_compile *p,
 601                       const struct brw_reg *dst,
 602                       GLuint mask,
 603                       const struct brw_reg *arg0,
 604                       const struct brw_reg *arg1 )
 605 {
 606    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 607 }
 608
 609 void emit_cmp(struct brw_compile *p,
 610               const struct brw_reg *dst,
 611               GLuint mask,
 612               const struct brw_reg *arg0,
 613               const struct brw_reg *arg1,
 614               const struct brw_reg *arg2)
 615 {
 616    GLuint i;
 617
 618    for (i = 0; i < 4; i++) {
 619       if (mask & (1<<i)) {
 620          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 621          brw_MOV(p, dst[i], arg2[i]);
 622          brw_set_saturate(p, 0);
 623
 624          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 625
 626          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 627          brw_MOV(p, dst[i], arg1[i]);
 628          brw_set_saturate(p, 0);
 629          brw_set_predicate_control_flag_value(p, 0xff);
 630       }
 631    }
 632 }
 633
 634 void emit_max(struct brw_compile *p,
 635               const struct brw_reg *dst,
 636               GLuint mask,
 637               const struct brw_reg *arg0,
 638               const struct brw_reg *arg1)
 639 {
 640    GLuint i;
 641
 642    for (i = 0; i < 4; i++) {
 643       if (mask & (1<<i)) {
 644          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 645
 646          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 647          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 648          brw_set_saturate(p, 0);
 649          brw_set_predicate_control_flag_value(p, 0xff);
 650       }
 651    }
 652 }
 653
 654 void emit_min(struct brw_compile *p,
 655               const struct brw_reg *dst,
 656               GLuint mask,
 657               const struct brw_reg *arg0,
 658               const struct brw_reg *arg1)
 659 {
 660    GLuint i;
 661
 662    for (i = 0; i < 4; i++) {
 663       if (mask & (1<<i)) {
 664          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 665
 666          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 667          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 668          brw_set_saturate(p, 0);
 669          brw_set_predicate_control_flag_value(p, 0xff);
 670       }
 671    }
 672 }
 673
 674
 675 void emit_dp3(struct brw_compile *p,
 676               const struct brw_reg *dst,
 677               GLuint mask,
 678               const struct brw_reg *arg0,
 679               const struct brw_reg *arg1)
 680 {
 681    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 682
 683    if (!(mask & WRITEMASK_XYZW))
 684       return; /* Do not emit dead code */
 685
 686    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 687
 688    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 689    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 690
 691    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 692    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 693    brw_set_saturate(p, 0);
 694 }
 695
 696
 697 void emit_dp4(struct brw_compile *p,
 698               const struct brw_reg *dst,
 699               GLuint mask,
 700               const struct brw_reg *arg0,
 701               const struct brw_reg *arg1)
 702 {
 703    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 704
 705    if (!(mask & WRITEMASK_XYZW))
 706       return; /* Do not emit dead code */
 707
 708    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 709
 710    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 711    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 712    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 713
 714    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 715    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 716    brw_set_saturate(p, 0);
 717 }
 718
 719
 720 void emit_dph(struct brw_compile *p,
 721               const struct brw_reg *dst,
 722               GLuint mask,
 723               const struct brw_reg *arg0,
 724               const struct brw_reg *arg1)
 725 {
 726    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 727
 728    if (!(mask & WRITEMASK_XYZW))
 729       return; /* Do not emit dead code */
 730
 731    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 732
 733    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 734    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 735    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 736
 737    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 738    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 739    brw_set_saturate(p, 0);
 740 }
 741
 742
 743 void emit_xpd(struct brw_compile *p,
 744               const struct brw_reg *dst,
 745               GLuint mask,
 746               const struct brw_reg *arg0,
 747               const struct brw_reg *arg1)
 748 {
 749    GLuint i;
 750
 751    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 752
 753    for (i = 0 ; i < 3; i++) {
 754       if (mask & (1<<i)) {
 755          GLuint i2 = (i+2)%3;
 756          GLuint i1 = (i+1)%3;
 757
 758          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 759
 760          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 761          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 762          brw_set_saturate(p, 0);
 763       }
 764    }
 765 }
 766
 767
 768 void emit_math1(struct brw_wm_compile *c,
 769                 GLuint function,
 770                 const struct brw_reg *dst,
 771                 GLuint mask,
 772                 const struct brw_reg *arg0)
 773 {
 774    struct brw_compile *p = &c->func;
 775    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 776    GLuint saturate = ((mask & SATURATE) ?
 777                       BRW_MATH_SATURATE_SATURATE :
 778                       BRW_MATH_SATURATE_NONE);
 779
 780    if (!(mask & WRITEMASK_XYZW))
 781       return; /* Do not emit dead code */
 782
 783    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 784
 785    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 786     * channels.
 787     */
 788    brw_MOV(p, brw_message_reg(2), arg0[0]);
 789
 790    /* Send two messages to perform all 16 operations:
 791     */
 792    brw_push_insn_state(p);
 793    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 794    brw_math(p,
 795             dst[dst_chan],
 796             function,
 797             saturate,
 798             2,
 799             brw_null_reg(),
 800             BRW_MATH_DATA_VECTOR,
 801             BRW_MATH_PRECISION_FULL);
 802
 803    if (c->dispatch_width == 16) {
 804       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 805       brw_math(p,
 806                offset(dst[dst_chan],1),
 807                function,
 808                saturate,
 809                3,
 810                brw_null_reg(),
 811                BRW_MATH_DATA_VECTOR,
 812                BRW_MATH_PRECISION_FULL);
 813    }
 814    brw_pop_insn_state(p);
 815 }
 816
 817
 818 void emit_math2(struct brw_wm_compile *c,
 819                 GLuint function,
 820                 const struct brw_reg *dst,
 821                 GLuint mask,
 822                 const struct brw_reg *arg0,
 823                 const struct brw_reg *arg1)
 824 {
 825    struct brw_compile *p = &c->func;
 826    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 827    GLuint saturate = ((mask & SATURATE) ?
 828                       BRW_MATH_SATURATE_SATURATE :
 829                       BRW_MATH_SATURATE_NONE);
 830
 831    if (!(mask & WRITEMASK_XYZW))
 832       return; /* Do not emit dead code */
 833
 834    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 835
 836    brw_push_insn_state(p);
 837
 838    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 839    brw_MOV(p, brw_message_reg(2), arg0[0]);
 840    if (c->dispatch_width == 16) {
 841       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 842       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 843    }
 844
 845    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 846    brw_MOV(p, brw_message_reg(3), arg1[0]);
 847    if (c->dispatch_width == 16) {
 848       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 849       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 850    }
 851
 852    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 853    brw_math(p,
 854             dst[dst_chan],
 855             function,
 856             saturate,
 857             2,
 858             brw_null_reg(),
 859             BRW_MATH_DATA_VECTOR,
 860             BRW_MATH_PRECISION_FULL);
 861
 862    /* Send two messages to perform all 16 operations:
 863     */
 864    if (c->dispatch_width == 16) {
 865       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 866       brw_math(p,
 867                offset(dst[dst_chan],1),
 868                function,
 869                saturate,
 870                4,
 871                brw_null_reg(),
 872                BRW_MATH_DATA_VECTOR,
 873                BRW_MATH_PRECISION_FULL);
 874    }
 875    brw_pop_insn_state(p);
 876 }
 877
 878
 879 void emit_tex(struct brw_wm_compile *c,
 880               struct brw_reg *dst,
 881               GLuint dst_flags,
 882               struct brw_reg *arg,
 883               struct brw_reg depth_payload,
 884               GLuint tex_idx,
 885               GLuint sampler,
 886               GLboolean shadow)
 887 {
 888    struct brw_compile *p = &c->func;
 889    struct intel_context *intel = &p->brw->intel;
 890    struct brw_reg dst_retyped;
 891    GLuint cur_mrf = 2, response_length;
 892    GLuint i, nr_texcoords;
 893    GLuint emit;
 894    GLuint msg_type;
 895    GLuint mrf_per_channel;
 896    GLuint simd_mode;
 897
 898    if (c->dispatch_width == 16) {
 899       mrf_per_channel = 2;
 900       response_length = 8;
 901       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 902       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 903    } else {
 904       mrf_per_channel = 1;
 905       response_length = 4;
 906       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 907       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 908    }
 909
 910    /* How many input regs are there?
 911     */
 912    switch (tex_idx) {
 913    case TEXTURE_1D_INDEX:
 914       emit = WRITEMASK_X;
 915       nr_texcoords = 1;
 916       break;
 917    case TEXTURE_2D_INDEX:
 918    case TEXTURE_RECT_INDEX:
 919       emit = WRITEMASK_XY;
 920       nr_texcoords = 2;
 921       break;
 922    case TEXTURE_3D_INDEX:
 923    case TEXTURE_CUBE_INDEX:
 924       emit = WRITEMASK_XYZ;
 925       nr_texcoords = 3;
 926       break;
 927    default:
 928       /* unexpected target */
 929       abort();
 930    }
 931
 932    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 933    if (!intel->is_ironlake && c->dispatch_width == 8)
 934       nr_texcoords = 3;
 935
 936    /* For shadow comparisons, we have to supply u,v,r. */
 937    if (shadow)
 938       nr_texcoords = 3;
 939
 940    /* Emit the texcoords. */
 941    for (i = 0; i < nr_texcoords; i++) {
 942       if (emit & (1<<i))
 943          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 944       else
 945          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 946       cur_mrf += mrf_per_channel;
 947    }
 948
 949    /* Fill in the shadow comparison reference value. */
 950    if (shadow) {
 951       if (intel->is_ironlake) {
 952          /* Fill in the cube map array index value. */
 953          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 954          cur_mrf += mrf_per_channel;
 955       } else if (c->dispatch_width == 8) {
 956          /* Fill in the LOD bias value. */
 957          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 958          cur_mrf += mrf_per_channel;
 959       }
 960       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 961       cur_mrf += mrf_per_channel;
 962    }
 963
 964    if (intel->is_ironlake) {
 965       if (shadow)
 966          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
 967       else
 968          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
 969    } else {
 970       /* Note that G45 and older determines shadow compare and dispatch width
 971        * from message length for most messages.
 972        */
 973       if (c->dispatch_width == 16 && shadow)
 974          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 975       else
 976          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 977    }
 978
 979    brw_SAMPLE(p,
 980               dst_retyped,
 981               1,
 982               retype(depth_payload, BRW_REGISTER_TYPE_UW),
 983               SURF_INDEX_TEXTURE(sampler),
 984               sampler,
 985               dst_flags & WRITEMASK_XYZW,
 986               msg_type,
 987               response_length,
 988               cur_mrf - 1,
 989               0,
 990               1,
 991               simd_mode);
 992 }
 993
 994
 995 void emit_txb(struct brw_wm_compile *c,
 996               struct brw_reg *dst,
 997               GLuint dst_flags,
 998               struct brw_reg *arg,
 999               struct brw_reg depth_payload,
1000               GLuint tex_idx,
1001               GLuint sampler)
1002 {
1003    struct brw_compile *p = &c->func;
1004    struct intel_context *intel = &p->brw->intel;
1005    GLuint msgLength;
1006    GLuint msg_type;
1007    GLuint mrf_per_channel;
1008    GLuint response_length;
1009    struct brw_reg dst_retyped;
1010
1011    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1012     * samples, so we'll use the 16-wide instruction, leave the second halves
1013     * undefined, and trust the execution mask to keep the undefined pixels
1014     * from mattering.
1015     */
1016    if (c->dispatch_width == 16 || !intel->is_ironlake) {
1017       if (intel->is_ironlake)
1018          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1019       else
1020          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1021       mrf_per_channel = 2;
1022       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1023       response_length = 8;
1024    } else {
1025       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
1026       mrf_per_channel = 1;
1027       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1028       response_length = 4;
1029    }
1030
1031    /* Shadow ignored for txb. */
1032    switch (tex_idx) {
1033    case TEXTURE_1D_INDEX:
1034       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1035       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1036       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1037       break;
1038    case TEXTURE_2D_INDEX:
1039    case TEXTURE_RECT_INDEX:
1040       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1041       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1042       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1043       break;
1044    case TEXTURE_3D_INDEX:
1045    case TEXTURE_CUBE_INDEX:
1046       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1047       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1048       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1049       break;
1050    default:
1051       /* unexpected target */
1052       abort();
1053    }
1054
1055    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1056    msgLength = 2 + 4 * mrf_per_channel - 1;
1057
1058    brw_SAMPLE(p,
1059               dst_retyped,
1060               1,
1061               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1062               SURF_INDEX_TEXTURE(sampler),
1063               sampler,
1064               dst_flags & WRITEMASK_XYZW,
1065               msg_type,
1066               response_length,
1067               msgLength,
1068               0,
1069               1,
1070               BRW_SAMPLER_SIMD_MODE_SIMD16);
1071 }
1072
1073
1074 static void emit_lit(struct brw_wm_compile *c,
1075                      const struct brw_reg *dst,
1076                      GLuint mask,
1077                      const struct brw_reg *arg0)
1078 {
1079    struct brw_compile *p = &c->func;
1080
1081    assert((mask & WRITEMASK_XW) == 0);
1082
1083    if (mask & WRITEMASK_Y) {
1084       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1085       brw_MOV(p, dst[1], arg0[0]);
1086       brw_set_saturate(p, 0);
1087    }
1088
1089    if (mask & WRITEMASK_Z) {
1090       emit_math2(c, BRW_MATH_FUNCTION_POW,
1091                  &dst[2],
1092                  WRITEMASK_X | (mask & SATURATE),
1093                  &arg0[1],
1094                  &arg0[3]);
1095    }
1096
1097    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1098     * some of the POW calculations above, but 16-wide iff statements
1099     * seem to lock c1 hardware, so this is a nasty workaround:
1100     */
1101    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1102    {
1103       if (mask & WRITEMASK_Y)
1104          brw_MOV(p, dst[1], brw_imm_f(0));
1105
1106       if (mask & WRITEMASK_Z)
1107          brw_MOV(p, dst[2], brw_imm_f(0));
1108    }
1109    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1110 }
1111
1112
1113 /* Kill pixel - set execution mask to zero for those pixels which
1114  * fail.
1115  */
1116 static void emit_kil( struct brw_wm_compile *c,
1117                       struct brw_reg *arg0)
1118 {
1119    struct brw_compile *p = &c->func;
1120    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1121    GLuint i, j;
1122
1123    for (i = 0; i < 4; i++) {
1124       /* Check if we've already done the comparison for this reg
1125        * -- common when someone does KIL TEMP.wwww.
1126        */
1127       for (j = 0; j < i; j++) {
1128          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1129             break;
1130       }
1131       if (j != i)
1132          continue;
1133
1134       brw_push_insn_state(p);
1135       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1136       brw_set_predicate_control_flag_value(p, 0xff);
1137       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1138       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1139       brw_pop_insn_state(p);
1140    }
1141 }
1142
1143 /* KIL_NV kills the pixels that are currently executing, not based on a test
1144  * of the arguments.
1145  */
1146 static void emit_kil_nv( struct brw_wm_compile *c )
1147 {
1148    struct brw_compile *p = &c->func;
1149    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1150
1151    brw_push_insn_state(p);
1152    brw_set_mask_control(p, BRW_MASK_DISABLE);
1153    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1154    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1155    brw_pop_insn_state(p);
1156 }
1157
1158 static void fire_fb_write( struct brw_wm_compile *c,
1159                            GLuint base_reg,
1160                            GLuint nr,
1161                            GLuint target,
1162                            GLuint eot )
1163 {
1164    struct brw_compile *p = &c->func;
1165    struct brw_reg dst;
1166
1167    if (c->dispatch_width == 16)
1168       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1169    else
1170       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1171
1172    /* Pass through control information:
1173     */
1174 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1175    {
1176       brw_push_insn_state(p);
1177       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1178       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1179       brw_MOV(p,
1180                brw_message_reg(base_reg + 1),
1181                brw_vec8_grf(1, 0));
1182       brw_pop_insn_state(p);
1183    }
1184
1185    /* Send framebuffer write message: */
1186 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1187    brw_fb_WRITE(p,
1188                 dst,
1189                 base_reg,
1190                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1191                 target,
1192                 nr,
1193                 0,
1194                 eot);
1195 }
1196
1197
1198 static void emit_aa( struct brw_wm_compile *c,
1199                      struct brw_reg *arg1,
1200                      GLuint reg )
1201 {
1202    struct brw_compile *p = &c->func;
1203    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1204    GLuint off = c->key.aa_dest_stencil_reg % 2;
1205    struct brw_reg aa = offset(arg1[comp], off);
1206
1207    brw_push_insn_state(p);
1208    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1209    brw_MOV(p, brw_message_reg(reg), aa);
1210    brw_pop_insn_state(p);
1211 }
1212
1213
1214 /* Post-fragment-program processing.  Send the results to the
1215  * framebuffer.
1216  * \param arg0  the fragment color
1217  * \param arg1  the pass-through depth value
1218  * \param arg2  the shader-computed depth value
1219  */
1220 void emit_fb_write(struct brw_wm_compile *c,
1221                    struct brw_reg *arg0,
1222                    struct brw_reg *arg1,
1223                    struct brw_reg *arg2,
1224                    GLuint target,
1225                    GLuint eot)
1226 {
1227    struct brw_compile *p = &c->func;
1228    struct brw_context *brw = p->brw;
1229    GLuint nr = 2;
1230    GLuint channel;
1231
1232    /* Reserve a space for AA - may not be needed:
1233     */
1234    if (c->key.aa_dest_stencil_reg)
1235       nr += 1;
1236
1237    /* I don't really understand how this achieves the color interleave
1238     * (ie RGBARGBA) in the result:  [Do the saturation here]
1239     */
1240    brw_push_insn_state(p);
1241
1242    for (channel = 0; channel < 4; channel++) {
1243       if (c->dispatch_width == 16 && brw->has_compr4) {
1244          /* By setting the high bit of the MRF register number, we indicate
1245           * that we want COMPR4 mode - instead of doing the usual destination
1246           * + 1 for the second half we get destination + 4.
1247           */
1248          brw_MOV(p,
1249                  brw_message_reg(nr + channel + (1 << 7)),
1250                  arg0[channel]);
1251       } else {
1252          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1253          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1254          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1255          brw_MOV(p,
1256                  brw_message_reg(nr + channel),
1257                  arg0[channel]);
1258
1259          if (c->dispatch_width == 16) {
1260             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1261             brw_MOV(p,
1262                     brw_message_reg(nr + channel + 4),
1263                     sechalf(arg0[channel]));
1264          }
1265       }
1266    }
1267    /* skip over the regs populated above:
1268     */
1269    nr += 8;
1270    brw_pop_insn_state(p);
1271
1272    if (c->key.source_depth_to_render_target)
1273    {
1274       if (c->key.computes_depth)
1275          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1276       else
1277          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1278
1279       nr += 2;
1280    }
1281
1282    if (c->key.dest_depth_reg)
1283    {
1284       GLuint comp = c->key.dest_depth_reg / 2;
1285       GLuint off = c->key.dest_depth_reg % 2;
1286
1287       if (off != 0) {
1288          brw_push_insn_state(p);
1289          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1290
1291          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1292          /* 2nd half? */
1293          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1294          brw_pop_insn_state(p);
1295       }
1296       else {
1297          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1298       }
1299       nr += 2;
1300    }
1301
1302    if (!c->key.runtime_check_aads_emit) {
1303       if (c->key.aa_dest_stencil_reg)
1304          emit_aa(c, arg1, 2);
1305
1306       fire_fb_write(c, 0, nr, target, eot);
1307    }
1308    else {
1309       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1310       struct brw_reg ip = brw_ip_reg();
1311       struct brw_instruction *jmp;
1312
1313       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1314       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1315       brw_AND(p,
1316               v1_null_ud,
1317               get_element_ud(brw_vec8_grf(1,0), 6),
1318               brw_imm_ud(1<<26));
1319
1320       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1321       {
1322          emit_aa(c, arg1, 2);
1323          fire_fb_write(c, 0, nr, target, eot);
1324          /* note - thread killed in subroutine */
1325       }
1326       brw_land_fwd_jump(p, jmp);
1327
1328       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1329        */
1330       fire_fb_write(c, 1, nr-1, target, eot);
1331    }
1332 }
1333
1334 /**
1335  * Move a GPR to scratch memory.
1336  */
1337 static void emit_spill( struct brw_wm_compile *c,
1338                         struct brw_reg reg,
1339                         GLuint slot )
1340 {
1341    struct brw_compile *p = &c->func;
1342
1343    /*
1344      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1345    */
1346    brw_MOV(p, brw_message_reg(2), reg);
1347
1348    /*
1349      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1350      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1351    */
1352    brw_dp_WRITE_16(p,
1353                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1354                    slot);
1355 }
1356
1357
1358 /**
1359  * Load a GPR from scratch memory.
1360  */
1361 static void emit_unspill( struct brw_wm_compile *c,
1362                           struct brw_reg reg,
1363                           GLuint slot )
1364 {
1365    struct brw_compile *p = &c->func;
1366
1367    /* Slot 0 is the undef value.
1368     */
1369    if (slot == 0) {
1370       brw_MOV(p, reg, brw_imm_f(0));
1371       return;
1372    }
1373
1374    /*
1375      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1376      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1377    */
1378
1379    brw_dp_READ_16(p,
1380                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1381                   slot);
1382 }
1383
1384
1385 /**
1386  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1387  * Args with unspill_reg != 0 will be loaded from scratch memory.
1388  */
1389 static void get_argument_regs( struct brw_wm_compile *c,
1390                                struct brw_wm_ref *arg[],
1391                                struct brw_reg *regs )
1392 {
1393    GLuint i;
1394
1395    for (i = 0; i < 4; i++) {
1396       if (arg[i]) {
1397          if (arg[i]->unspill_reg)
1398             emit_unspill(c,
1399                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1400                          arg[i]->value->spill_slot);
1401
1402          regs[i] = arg[i]->hw_reg;
1403       }
1404       else {
1405          regs[i] = brw_null_reg();
1406       }
1407    }
1408 }
1409
1410
1411 /**
1412  * For values that have a spill_slot!=0, write those regs to scratch memory.
1413  */
1414 static void spill_values( struct brw_wm_compile *c,
1415                           struct brw_wm_value *values,
1416                           GLuint nr )
1417 {
1418    GLuint i;
1419
1420    for (i = 0; i < nr; i++)
1421       if (values[i].spill_slot)
1422          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1423 }
1424
1425
1426 /* Emit the fragment program instructions here.
1427  */
1428 void brw_wm_emit( struct brw_wm_compile *c )
1429 {
1430    struct brw_compile *p = &c->func;
1431    GLuint insn;
1432
1433    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1434
1435    /* Check if any of the payload regs need to be spilled:
1436     */
1437    spill_values(c, c->payload.depth, 4);
1438    spill_values(c, c->creg, c->nr_creg);
1439    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1440
1441
1442    for (insn = 0; insn < c->nr_insns; insn++) {
1443
1444       struct brw_wm_instruction *inst = &c->instruction[insn];
1445       struct brw_reg args[3][4], dst[4];
1446       GLuint i, dst_flags;
1447
1448       /* Get argument regs:
1449        */
1450       for (i = 0; i < 3; i++)
1451          get_argument_regs(c, inst->src[i], args[i]);
1452
1453       /* Get dest regs:
1454        */
1455       for (i = 0; i < 4; i++)
1456          if (inst->dst[i])
1457             dst[i] = inst->dst[i]->hw_reg;
1458          else
1459             dst[i] = brw_null_reg();
1460
1461       /* Flags
1462        */
1463       dst_flags = inst->writemask;
1464       if (inst->saturate)
1465          dst_flags |= SATURATE;
1466
1467       switch (inst->opcode) {
1468          /* Generated instructions for calculating triangle interpolants:
1469           */
1470       case WM_PIXELXY:
1471          emit_pixel_xy(c, dst, dst_flags);
1472          break;
1473
1474       case WM_DELTAXY:
1475          emit_delta_xy(p, dst, dst_flags, args[0]);
1476          break;
1477
1478       case WM_WPOSXY:
1479          emit_wpos_xy(c, dst, dst_flags, args[0]);
1480          break;
1481
1482       case WM_PIXELW:
1483          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1484          break;
1485
1486       case WM_LINTERP:
1487          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1488          break;
1489
1490       case WM_PINTERP:
1491          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1492          break;
1493
1494       case WM_CINTERP:
1495          emit_cinterp(p, dst, dst_flags, args[0]);
1496          break;
1497
1498       case WM_FB_WRITE:
1499          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1500          break;
1501
1502       case WM_FRONTFACING:
1503          emit_frontfacing(p, dst, dst_flags);
1504          break;
1505
1506          /* Straightforward arithmetic:
1507           */
1508       case OPCODE_ADD:
1509          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1510          break;
1511
1512       case OPCODE_FRC:
1513          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1514          break;
1515
1516       case OPCODE_FLR:
1517          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1518          break;
1519
1520       case OPCODE_DDX:
1521          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1522          break;
1523
1524       case OPCODE_DDY:
1525          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1526          break;
1527
1528       case OPCODE_DP3:
1529          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1530          break;
1531
1532       case OPCODE_DP4:
1533          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1534          break;
1535
1536       case OPCODE_DPH:
1537          emit_dph(p, dst, dst_flags, args[0], args[1]);
1538          break;
1539
1540       case OPCODE_TRUNC:
1541          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1542          break;
1543
1544       case OPCODE_LRP:
1545          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1546          break;
1547
1548       case OPCODE_MAD:
1549          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1550          break;
1551
1552       case OPCODE_MOV:
1553       case OPCODE_SWZ:
1554          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1555          break;
1556
1557       case OPCODE_MUL:
1558          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1559          break;
1560
1561       case OPCODE_XPD:
1562          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1563          break;
1564
1565          /* Higher math functions:
1566           */
1567       case OPCODE_RCP:
1568          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1569          break;
1570
1571       case OPCODE_RSQ:
1572          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1573          break;
1574
1575       case OPCODE_SIN:
1576          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1577          break;
1578
1579       case OPCODE_COS:
1580          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1581          break;
1582
1583       case OPCODE_EX2:
1584          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1585          break;
1586
1587       case OPCODE_LG2:
1588          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1589          break;
1590
1591       case OPCODE_SCS:
1592          /* There is an scs math function, but it would need some
1593           * fixup for 16-element execution.
1594           */
1595          if (dst_flags & WRITEMASK_X)
1596             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1597          if (dst_flags & WRITEMASK_Y)
1598             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1599          break;
1600
1601       case OPCODE_POW:
1602          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1603          break;
1604
1605          /* Comparisons:
1606           */
1607       case OPCODE_CMP:
1608          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1609          break;
1610
1611       case OPCODE_MAX:
1612          emit_max(p, dst, dst_flags, args[0], args[1]);
1613          break;
1614
1615       case OPCODE_MIN:
1616          emit_min(p, dst, dst_flags, args[0], args[1]);
1617          break;
1618
1619       case OPCODE_SLT:
1620          emit_slt(p, dst, dst_flags, args[0], args[1]);
1621          break;
1622
1623       case OPCODE_SLE:
1624          emit_sle(p, dst, dst_flags, args[0], args[1]);
1625         break;
1626       case OPCODE_SGT:
1627          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1628         break;
1629       case OPCODE_SGE:
1630          emit_sge(p, dst, dst_flags, args[0], args[1]);
1631          break;
1632       case OPCODE_SEQ:
1633          emit_seq(p, dst, dst_flags, args[0], args[1]);
1634         break;
1635       case OPCODE_SNE:
1636          emit_sne(p, dst, dst_flags, args[0], args[1]);
1637         break;
1638
1639       case OPCODE_LIT:
1640          emit_lit(c, dst, dst_flags, args[0]);
1641          break;
1642
1643          /* Texturing operations:
1644           */
1645       case OPCODE_TEX:
1646          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1647                   inst->tex_idx, inst->tex_unit,
1648                   inst->tex_shadow);
1649          break;
1650
1651       case OPCODE_TXB:
1652          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1653                   inst->tex_idx, inst->tex_unit);
1654          break;
1655
1656       case OPCODE_KIL:
1657          emit_kil(c, args[0]);
1658          break;
1659
1660       case OPCODE_KIL_NV:
1661          emit_kil_nv(c);
1662          break;
1663
1664       default:
1665          printf("Unsupported opcode %i (%s) in fragment shader\n",
1666                 inst->opcode, inst->opcode < MAX_OPCODE ?
1667                 _mesa_opcode_string(inst->opcode) :
1668                 "unknown");
1669       }
1670
1671       for (i = 0; i < 4; i++)
1672         if (inst->dst[i] && inst->dst[i]->spill_slot)
1673            emit_spill(c,
1674                       inst->dst[i]->hw_reg,
1675                       inst->dst[i]->spill_slot);
1676    }
1677
1678    if (INTEL_DEBUG & DEBUG_WM) {
1679       int i;
1680
1681       printf("wm-native:\n");
1682       for (i = 0; i < p->nr_insn; i++)
1683          brw_disasm(stderr, &p->store[i]);
1684       printf("\n");
1685    }
1686 }