src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_XPD] = 2,
  87    };
  88
  89    /* These opcodes get broken down in a way that allow two
  90     * args to be immediates.
  91     */
  92    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  93       if (arg == 1 || arg == 2)
  94          return GL_TRUE;
  95    }
  96
  97    if (opcode > ARRAY_SIZE(opcode_array))
  98       return GL_FALSE;
  99
 100    return arg == opcode_array[opcode] - 1;
 101 }
 102
 103 /**
 104  * Computes the screen-space x,y position of the pixels.
 105  *
 106  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 107  * interpolation of attributes..
 108  *
 109  * Payload R0:
 110  *
 111  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 112  *         corresponding to each of the 16 execution channels.
 113  * R0.1..8 -- ?
 114  * R1.0 -- triangle vertex 0.X
 115  * R1.1 -- triangle vertex 0.Y
 116  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 117  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 118  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 119  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 120  * R1.6 -- ?
 121  * R1.7 -- ?
 122  * R1.8 -- ?
 123  */
 124 void emit_pixel_xy(struct brw_wm_compile *c,
 125                    const struct brw_reg *dst,
 126                    GLuint mask)
 127 {
 128    struct brw_compile *p = &c->func;
 129    struct brw_reg r1 = brw_vec1_grf(1, 0);
 130    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 131    struct brw_reg dst0_uw, dst1_uw;
 132
 133    brw_push_insn_state(p);
 134    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 135
 136    if (c->dispatch_width == 16) {
 137       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 138       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 139    } else {
 140       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 141       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 142    }
 143
 144    /* Calculate pixel centers by adding 1 or 0 to each of the
 145     * micro-tile coordinates passed in r1.
 146     */
 147    if (mask & WRITEMASK_X) {
 148       brw_ADD(p,
 149               dst0_uw,
 150               stride(suboffset(r1_uw, 4), 2, 4, 0),
 151               brw_imm_v(0x10101010));
 152    }
 153
 154    if (mask & WRITEMASK_Y) {
 155       brw_ADD(p,
 156               dst1_uw,
 157               stride(suboffset(r1_uw,5), 2, 4, 0),
 158               brw_imm_v(0x11001100));
 159    }
 160    brw_pop_insn_state(p);
 161 }
 162
 163 /**
 164  * Computes the screen-space x,y distance of the pixels from the start
 165  * vertex.
 166  *
 167  * This will be used in linterp or pinterp with the start vertex value
 168  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 169  * to produce interpolated attribute values.
 170  */
 171 void emit_delta_xy(struct brw_compile *p,
 172                    const struct brw_reg *dst,
 173                    GLuint mask,
 174                    const struct brw_reg *arg0)
 175 {
 176    struct brw_reg r1 = brw_vec1_grf(1, 0);
 177
 178    if (mask == 0)
 179       return;
 180
 181    assert(mask == WRITEMASK_XY);
 182
 183    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 184     * centers produced by emit_pixel_xy().
 185     */
 186    brw_ADD(p,
 187            dst[0],
 188            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 189            negate(r1));
 190    brw_ADD(p,
 191            dst[1],
 192            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 193            negate(suboffset(r1,1)));
 194 }
 195
 196 /**
 197  * Computes the pixel offset from the window origin for gl_FragCoord().
 198  */
 199 void emit_wpos_xy(struct brw_wm_compile *c,
 200                   const struct brw_reg *dst,
 201                   GLuint mask,
 202                   const struct brw_reg *arg0)
 203 {
 204    struct brw_compile *p = &c->func;
 205
 206    if (mask & WRITEMASK_X) {
 207       if (c->fp->program.PixelCenterInteger) {
 208          /* X' = X */
 209          brw_MOV(p,
 210                  dst[0],
 211                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 212       } else {
 213          /* X' = X + 0.5 */
 214          brw_ADD(p,
 215                  dst[0],
 216                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 217                  brw_imm_f(0.5));
 218       }
 219    }
 220
 221    if (mask & WRITEMASK_Y) {
 222       if (c->fp->program.OriginUpperLeft) {
 223          if (c->fp->program.PixelCenterInteger) {
 224             /* Y' = Y */
 225             brw_MOV(p,
 226                     dst[1],
 227                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 228          } else {
 229             /* Y' = Y + 0.5 */
 230             brw_ADD(p,
 231                     dst[1],
 232                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 233                     brw_imm_f(0.5));
 234          }
 235       } else {
 236          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 237
 238          /* Y' = (height - 1) - Y + center */
 239          brw_ADD(p,
 240                  dst[1],
 241                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 242                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 243       }
 244    }
 245 }
 246
 247
 248 void emit_pixel_w(struct brw_wm_compile *c,
 249                   const struct brw_reg *dst,
 250                   GLuint mask,
 251                   const struct brw_reg *arg0,
 252                   const struct brw_reg *deltas)
 253 {
 254    struct brw_compile *p = &c->func;
 255    struct intel_context *intel = &p->brw->intel;
 256
 257    /* Don't need this if all you are doing is interpolating color, for
 258     * instance.
 259     */
 260    if (mask & WRITEMASK_W) {
 261       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 262
 263       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 264        * result straight into a message reg.
 265        */
 266       if (can_do_pln(intel, deltas)) {
 267          brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
 268       } else {
 269          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 270          brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 271       }
 272
 273       /* Calc w */
 274       if (c->dispatch_width == 16) {
 275          brw_math_16(p, dst[3],
 276                      BRW_MATH_FUNCTION_INV,
 277                      BRW_MATH_SATURATE_NONE,
 278                      2, brw_null_reg(),
 279                      BRW_MATH_PRECISION_FULL);
 280       } else {
 281          brw_math(p, dst[3],
 282                   BRW_MATH_FUNCTION_INV,
 283                   BRW_MATH_SATURATE_NONE,
 284                   2, brw_null_reg(),
 285                   BRW_MATH_DATA_VECTOR,
 286                   BRW_MATH_PRECISION_FULL);
 287       }
 288    }
 289 }
 290
 291
 292 void emit_linterp(struct brw_compile *p,
 293                   const struct brw_reg *dst,
 294                   GLuint mask,
 295                   const struct brw_reg *arg0,
 296                   const struct brw_reg *deltas)
 297 {
 298    struct intel_context *intel = &p->brw->intel;
 299    struct brw_reg interp[4];
 300    GLuint nr = arg0[0].nr;
 301    GLuint i;
 302
 303    interp[0] = brw_vec1_grf(nr, 0);
 304    interp[1] = brw_vec1_grf(nr, 4);
 305    interp[2] = brw_vec1_grf(nr+1, 0);
 306    interp[3] = brw_vec1_grf(nr+1, 4);
 307
 308    for (i = 0; i < 4; i++) {
 309       if (mask & (1<<i)) {
 310          if (can_do_pln(intel, deltas)) {
 311             brw_PLN(p, dst[i], interp[i], deltas[0]);
 312          } else {
 313             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 314             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 315          }
 316       }
 317    }
 318 }
 319
 320
 321 void emit_pinterp(struct brw_compile *p,
 322                   const struct brw_reg *dst,
 323                   GLuint mask,
 324                   const struct brw_reg *arg0,
 325                   const struct brw_reg *deltas,
 326                   const struct brw_reg *w)
 327 {
 328    struct intel_context *intel = &p->brw->intel;
 329    struct brw_reg interp[4];
 330    GLuint nr = arg0[0].nr;
 331    GLuint i;
 332
 333    interp[0] = brw_vec1_grf(nr, 0);
 334    interp[1] = brw_vec1_grf(nr, 4);
 335    interp[2] = brw_vec1_grf(nr+1, 0);
 336    interp[3] = brw_vec1_grf(nr+1, 4);
 337
 338    for (i = 0; i < 4; i++) {
 339       if (mask & (1<<i)) {
 340          if (can_do_pln(intel, deltas)) {
 341             brw_PLN(p, dst[i], interp[i], deltas[0]);
 342          } else {
 343             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 344             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 345          }
 346       }
 347    }
 348    for (i = 0; i < 4; i++) {
 349       if (mask & (1<<i)) {
 350          brw_MUL(p, dst[i], dst[i], w[3]);
 351       }
 352    }
 353 }
 354
 355
 356 void emit_cinterp(struct brw_compile *p,
 357                   const struct brw_reg *dst,
 358                   GLuint mask,
 359                   const struct brw_reg *arg0)
 360 {
 361    struct brw_reg interp[4];
 362    GLuint nr = arg0[0].nr;
 363    GLuint i;
 364
 365    interp[0] = brw_vec1_grf(nr, 0);
 366    interp[1] = brw_vec1_grf(nr, 4);
 367    interp[2] = brw_vec1_grf(nr+1, 0);
 368    interp[3] = brw_vec1_grf(nr+1, 4);
 369
 370    for (i = 0; i < 4; i++) {
 371       if (mask & (1<<i)) {
 372          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 373       }
 374    }
 375 }
 376
 377 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 378 void emit_frontfacing(struct brw_compile *p,
 379                       const struct brw_reg *dst,
 380                       GLuint mask)
 381 {
 382    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 383    GLuint i;
 384
 385    if (!(mask & WRITEMASK_XYZW))
 386       return;
 387
 388    for (i = 0; i < 4; i++) {
 389       if (mask & (1<<i)) {
 390          brw_MOV(p, dst[i], brw_imm_f(0.0));
 391       }
 392    }
 393
 394    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 395     * us front face
 396     */
 397    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 398    for (i = 0; i < 4; i++) {
 399       if (mask & (1<<i)) {
 400          brw_MOV(p, dst[i], brw_imm_f(1.0));
 401       }
 402    }
 403    brw_set_predicate_control_flag_value(p, 0xff);
 404 }
 405
 406 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 407  * looking like:
 408  *
 409  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 410  *
 411  * and we're trying to produce:
 412  *
 413  *           DDX                     DDY
 414  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 415  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 416  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 417  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 418  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 419  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 420  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 421  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 422  *
 423  * and add another set of two more subspans if in 16-pixel dispatch mode.
 424  *
 425  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 426  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 427  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 428  * between each other.  We could probably do it like ddx and swizzle the right
 429  * order later, but bail for now and just produce
 430  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 431  */
 432 void emit_ddxy(struct brw_compile *p,
 433                const struct brw_reg *dst,
 434                GLuint mask,
 435                GLboolean is_ddx,
 436                const struct brw_reg *arg0)
 437 {
 438    int i;
 439    struct brw_reg src0, src1;
 440
 441    if (mask & SATURATE)
 442       brw_set_saturate(p, 1);
 443    for (i = 0; i < 4; i++ ) {
 444       if (mask & (1<<i)) {
 445          if (is_ddx) {
 446             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 447                            BRW_REGISTER_TYPE_F,
 448                            BRW_VERTICAL_STRIDE_2,
 449                            BRW_WIDTH_2,
 450                            BRW_HORIZONTAL_STRIDE_0,
 451                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 452             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 453                            BRW_REGISTER_TYPE_F,
 454                            BRW_VERTICAL_STRIDE_2,
 455                            BRW_WIDTH_2,
 456                            BRW_HORIZONTAL_STRIDE_0,
 457                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 458          } else {
 459             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 460                            BRW_REGISTER_TYPE_F,
 461                            BRW_VERTICAL_STRIDE_4,
 462                            BRW_WIDTH_4,
 463                            BRW_HORIZONTAL_STRIDE_0,
 464                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 465             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 466                            BRW_REGISTER_TYPE_F,
 467                            BRW_VERTICAL_STRIDE_4,
 468                            BRW_WIDTH_4,
 469                            BRW_HORIZONTAL_STRIDE_0,
 470                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 471          }
 472          brw_ADD(p, dst[i], src0, negate(src1));
 473       }
 474    }
 475    if (mask & SATURATE)
 476       brw_set_saturate(p, 0);
 477 }
 478
 479 void emit_alu1(struct brw_compile *p,
 480                struct brw_instruction *(*func)(struct brw_compile *,
 481                                                struct brw_reg,
 482                                                struct brw_reg),
 483                const struct brw_reg *dst,
 484                GLuint mask,
 485                const struct brw_reg *arg0)
 486 {
 487    GLuint i;
 488
 489    if (mask & SATURATE)
 490       brw_set_saturate(p, 1);
 491
 492    for (i = 0; i < 4; i++) {
 493       if (mask & (1<<i)) {
 494          func(p, dst[i], arg0[i]);
 495       }
 496    }
 497
 498    if (mask & SATURATE)
 499       brw_set_saturate(p, 0);
 500 }
 501
 502
 503 void emit_alu2(struct brw_compile *p,
 504                struct brw_instruction *(*func)(struct brw_compile *,
 505                                                struct brw_reg,
 506                                                struct brw_reg,
 507                                                struct brw_reg),
 508                const struct brw_reg *dst,
 509                GLuint mask,
 510                const struct brw_reg *arg0,
 511                const struct brw_reg *arg1)
 512 {
 513    GLuint i;
 514
 515    if (mask & SATURATE)
 516       brw_set_saturate(p, 1);
 517
 518    for (i = 0; i < 4; i++) {
 519       if (mask & (1<<i)) {
 520          func(p, dst[i], arg0[i], arg1[i]);
 521       }
 522    }
 523
 524    if (mask & SATURATE)
 525       brw_set_saturate(p, 0);
 526 }
 527
 528
 529 void emit_mad(struct brw_compile *p,
 530               const struct brw_reg *dst,
 531               GLuint mask,
 532               const struct brw_reg *arg0,
 533               const struct brw_reg *arg1,
 534               const struct brw_reg *arg2)
 535 {
 536    GLuint i;
 537
 538    for (i = 0; i < 4; i++) {
 539       if (mask & (1<<i)) {
 540          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 541
 542          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 543          brw_ADD(p, dst[i], dst[i], arg2[i]);
 544          brw_set_saturate(p, 0);
 545       }
 546    }
 547 }
 548
 549 void emit_lrp(struct brw_compile *p,
 550               const struct brw_reg *dst,
 551               GLuint mask,
 552               const struct brw_reg *arg0,
 553               const struct brw_reg *arg1,
 554               const struct brw_reg *arg2)
 555 {
 556    GLuint i;
 557
 558    /* Uses dst as a temporary:
 559     */
 560    for (i = 0; i < 4; i++) {
 561       if (mask & (1<<i)) {
 562          /* Can I use the LINE instruction for this?
 563           */
 564          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 565          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 566
 567          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 568          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 569          brw_set_saturate(p, 0);
 570       }
 571    }
 572 }
 573
 574 void emit_sop(struct brw_compile *p,
 575               const struct brw_reg *dst,
 576               GLuint mask,
 577               GLuint cond,
 578               const struct brw_reg *arg0,
 579               const struct brw_reg *arg1)
 580 {
 581    GLuint i;
 582
 583    for (i = 0; i < 4; i++) {
 584       if (mask & (1<<i)) {
 585          brw_push_insn_state(p);
 586          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 587          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 588          brw_MOV(p, dst[i], brw_imm_f(0));
 589          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 590          brw_MOV(p, dst[i], brw_imm_f(1.0));
 591          brw_pop_insn_state(p);
 592       }
 593    }
 594 }
 595
 596 static void emit_slt( struct brw_compile *p,
 597                       const struct brw_reg *dst,
 598                       GLuint mask,
 599                       const struct brw_reg *arg0,
 600                       const struct brw_reg *arg1 )
 601 {
 602    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 603 }
 604
 605 static void emit_sle( struct brw_compile *p,
 606                       const struct brw_reg *dst,
 607                       GLuint mask,
 608                       const struct brw_reg *arg0,
 609                       const struct brw_reg *arg1 )
 610 {
 611    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 612 }
 613
 614 static void emit_sgt( struct brw_compile *p,
 615                       const struct brw_reg *dst,
 616                       GLuint mask,
 617                       const struct brw_reg *arg0,
 618                       const struct brw_reg *arg1 )
 619 {
 620    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 621 }
 622
 623 static void emit_sge( struct brw_compile *p,
 624                       const struct brw_reg *dst,
 625                       GLuint mask,
 626                       const struct brw_reg *arg0,
 627                       const struct brw_reg *arg1 )
 628 {
 629    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 630 }
 631
 632 static void emit_seq( struct brw_compile *p,
 633                       const struct brw_reg *dst,
 634                       GLuint mask,
 635                       const struct brw_reg *arg0,
 636                       const struct brw_reg *arg1 )
 637 {
 638    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 639 }
 640
 641 static void emit_sne( struct brw_compile *p,
 642                       const struct brw_reg *dst,
 643                       GLuint mask,
 644                       const struct brw_reg *arg0,
 645                       const struct brw_reg *arg1 )
 646 {
 647    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 648 }
 649
 650 void emit_cmp(struct brw_compile *p,
 651               const struct brw_reg *dst,
 652               GLuint mask,
 653               const struct brw_reg *arg0,
 654               const struct brw_reg *arg1,
 655               const struct brw_reg *arg2)
 656 {
 657    GLuint i;
 658
 659    for (i = 0; i < 4; i++) {
 660       if (mask & (1<<i)) {
 661          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 662
 663          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 664          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 665          brw_set_saturate(p, 0);
 666          brw_set_predicate_control_flag_value(p, 0xff);
 667       }
 668    }
 669 }
 670
 671 void emit_sign(struct brw_compile *p,
 672                const struct brw_reg *dst,
 673                GLuint mask,
 674                const struct brw_reg *arg0)
 675 {
 676    GLuint i;
 677
 678    for (i = 0; i < 4; i++) {
 679       if (mask & (1<<i)) {
 680          brw_MOV(p, dst[i], brw_imm_f(0.0));
 681
 682          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 683          brw_MOV(p, dst[i], brw_imm_f(-1.0));
 684          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 685
 686          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
 687          brw_MOV(p, dst[i], brw_imm_f(1.0));
 688          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 689       }
 690    }
 691 }
 692
 693 void emit_max(struct brw_compile *p,
 694               const struct brw_reg *dst,
 695               GLuint mask,
 696               const struct brw_reg *arg0,
 697               const struct brw_reg *arg1)
 698 {
 699    GLuint i;
 700
 701    for (i = 0; i < 4; i++) {
 702       if (mask & (1<<i)) {
 703          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 704
 705          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 706          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 707          brw_set_saturate(p, 0);
 708          brw_set_predicate_control_flag_value(p, 0xff);
 709       }
 710    }
 711 }
 712
 713 void emit_min(struct brw_compile *p,
 714               const struct brw_reg *dst,
 715               GLuint mask,
 716               const struct brw_reg *arg0,
 717               const struct brw_reg *arg1)
 718 {
 719    GLuint i;
 720
 721    for (i = 0; i < 4; i++) {
 722       if (mask & (1<<i)) {
 723          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 724
 725          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 726          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 727          brw_set_saturate(p, 0);
 728          brw_set_predicate_control_flag_value(p, 0xff);
 729       }
 730    }
 731 }
 732
 733
 734 void emit_dp3(struct brw_compile *p,
 735               const struct brw_reg *dst,
 736               GLuint mask,
 737               const struct brw_reg *arg0,
 738               const struct brw_reg *arg1)
 739 {
 740    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 741
 742    if (!(mask & WRITEMASK_XYZW))
 743       return; /* Do not emit dead code */
 744
 745    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 746
 747    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 748    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 749
 750    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 751    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 752    brw_set_saturate(p, 0);
 753 }
 754
 755
 756 void emit_dp4(struct brw_compile *p,
 757               const struct brw_reg *dst,
 758               GLuint mask,
 759               const struct brw_reg *arg0,
 760               const struct brw_reg *arg1)
 761 {
 762    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 763
 764    if (!(mask & WRITEMASK_XYZW))
 765       return; /* Do not emit dead code */
 766
 767    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 768
 769    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 770    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 771    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 772
 773    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 774    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 775    brw_set_saturate(p, 0);
 776 }
 777
 778
 779 void emit_dph(struct brw_compile *p,
 780               const struct brw_reg *dst,
 781               GLuint mask,
 782               const struct brw_reg *arg0,
 783               const struct brw_reg *arg1)
 784 {
 785    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 786
 787    if (!(mask & WRITEMASK_XYZW))
 788       return; /* Do not emit dead code */
 789
 790    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 791
 792    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 793    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 794    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 795
 796    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 797    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 798    brw_set_saturate(p, 0);
 799 }
 800
 801
 802 void emit_xpd(struct brw_compile *p,
 803               const struct brw_reg *dst,
 804               GLuint mask,
 805               const struct brw_reg *arg0,
 806               const struct brw_reg *arg1)
 807 {
 808    GLuint i;
 809
 810    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 811
 812    for (i = 0 ; i < 3; i++) {
 813       if (mask & (1<<i)) {
 814          GLuint i2 = (i+2)%3;
 815          GLuint i1 = (i+1)%3;
 816
 817          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 818
 819          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 820          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 821          brw_set_saturate(p, 0);
 822       }
 823    }
 824 }
 825
 826
 827 void emit_math1(struct brw_wm_compile *c,
 828                 GLuint function,
 829                 const struct brw_reg *dst,
 830                 GLuint mask,
 831                 const struct brw_reg *arg0)
 832 {
 833    struct brw_compile *p = &c->func;
 834    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 835    GLuint saturate = ((mask & SATURATE) ?
 836                       BRW_MATH_SATURATE_SATURATE :
 837                       BRW_MATH_SATURATE_NONE);
 838
 839    if (!(mask & WRITEMASK_XYZW))
 840       return; /* Do not emit dead code */
 841
 842    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 843
 844    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 845     * channels.
 846     */
 847    brw_MOV(p, brw_message_reg(2), arg0[0]);
 848
 849    /* Send two messages to perform all 16 operations:
 850     */
 851    brw_push_insn_state(p);
 852    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 853    brw_math(p,
 854             dst[dst_chan],
 855             function,
 856             saturate,
 857             2,
 858             brw_null_reg(),
 859             BRW_MATH_DATA_VECTOR,
 860             BRW_MATH_PRECISION_FULL);
 861
 862    if (c->dispatch_width == 16) {
 863       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 864       brw_math(p,
 865                offset(dst[dst_chan],1),
 866                function,
 867                saturate,
 868                3,
 869                brw_null_reg(),
 870                BRW_MATH_DATA_VECTOR,
 871                BRW_MATH_PRECISION_FULL);
 872    }
 873    brw_pop_insn_state(p);
 874 }
 875
 876
 877 void emit_math2(struct brw_wm_compile *c,
 878                 GLuint function,
 879                 const struct brw_reg *dst,
 880                 GLuint mask,
 881                 const struct brw_reg *arg0,
 882                 const struct brw_reg *arg1)
 883 {
 884    struct brw_compile *p = &c->func;
 885    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 886    GLuint saturate = ((mask & SATURATE) ?
 887                       BRW_MATH_SATURATE_SATURATE :
 888                       BRW_MATH_SATURATE_NONE);
 889
 890    if (!(mask & WRITEMASK_XYZW))
 891       return; /* Do not emit dead code */
 892
 893    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 894
 895    brw_push_insn_state(p);
 896
 897    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 898    brw_MOV(p, brw_message_reg(2), arg0[0]);
 899    if (c->dispatch_width == 16) {
 900       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 901       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 902    }
 903
 904    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 905    brw_MOV(p, brw_message_reg(3), arg1[0]);
 906    if (c->dispatch_width == 16) {
 907       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 908       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 909    }
 910
 911    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 912    brw_math(p,
 913             dst[dst_chan],
 914             function,
 915             saturate,
 916             2,
 917             brw_null_reg(),
 918             BRW_MATH_DATA_VECTOR,
 919             BRW_MATH_PRECISION_FULL);
 920
 921    /* Send two messages to perform all 16 operations:
 922     */
 923    if (c->dispatch_width == 16) {
 924       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 925       brw_math(p,
 926                offset(dst[dst_chan],1),
 927                function,
 928                saturate,
 929                4,
 930                brw_null_reg(),
 931                BRW_MATH_DATA_VECTOR,
 932                BRW_MATH_PRECISION_FULL);
 933    }
 934    brw_pop_insn_state(p);
 935 }
 936
 937
 938 void emit_tex(struct brw_wm_compile *c,
 939               struct brw_reg *dst,
 940               GLuint dst_flags,
 941               struct brw_reg *arg,
 942               struct brw_reg depth_payload,
 943               GLuint tex_idx,
 944               GLuint sampler,
 945               GLboolean shadow)
 946 {
 947    struct brw_compile *p = &c->func;
 948    struct intel_context *intel = &p->brw->intel;
 949    struct brw_reg dst_retyped;
 950    GLuint cur_mrf = 2, response_length;
 951    GLuint i, nr_texcoords;
 952    GLuint emit;
 953    GLuint msg_type;
 954    GLuint mrf_per_channel;
 955    GLuint simd_mode;
 956
 957    if (c->dispatch_width == 16) {
 958       mrf_per_channel = 2;
 959       response_length = 8;
 960       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 961       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 962    } else {
 963       mrf_per_channel = 1;
 964       response_length = 4;
 965       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 966       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 967    }
 968
 969    /* How many input regs are there?
 970     */
 971    switch (tex_idx) {
 972    case TEXTURE_1D_INDEX:
 973       emit = WRITEMASK_X;
 974       nr_texcoords = 1;
 975       break;
 976    case TEXTURE_2D_INDEX:
 977    case TEXTURE_RECT_INDEX:
 978       emit = WRITEMASK_XY;
 979       nr_texcoords = 2;
 980       break;
 981    case TEXTURE_3D_INDEX:
 982    case TEXTURE_CUBE_INDEX:
 983       emit = WRITEMASK_XYZ;
 984       nr_texcoords = 3;
 985       break;
 986    default:
 987       /* unexpected target */
 988       abort();
 989    }
 990
 991    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 992    if (intel->gen < 5 && c->dispatch_width == 8)
 993       nr_texcoords = 3;
 994
 995    /* For shadow comparisons, we have to supply u,v,r. */
 996    if (shadow)
 997       nr_texcoords = 3;
 998
 999    /* Emit the texcoords. */
1000    for (i = 0; i < nr_texcoords; i++) {
1001       if (emit & (1<<i))
1002          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1003       else
1004          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1005       cur_mrf += mrf_per_channel;
1006    }
1007
1008    /* Fill in the shadow comparison reference value. */
1009    if (shadow) {
1010       if (intel->gen == 5) {
1011          /* Fill in the cube map array index value. */
1012          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1013          cur_mrf += mrf_per_channel;
1014       } else if (c->dispatch_width == 8) {
1015          /* Fill in the LOD bias value. */
1016          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1017          cur_mrf += mrf_per_channel;
1018       }
1019       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1020       cur_mrf += mrf_per_channel;
1021    }
1022
1023    if (intel->gen == 5) {
1024       if (shadow)
1025          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1026       else
1027          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1028    } else {
1029       /* Note that G45 and older determines shadow compare and dispatch width
1030        * from message length for most messages.
1031        */
1032       if (c->dispatch_width == 16 && shadow)
1033          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1034       else
1035          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1036    }
1037
1038    brw_SAMPLE(p,
1039               dst_retyped,
1040               1,
1041               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1042               SURF_INDEX_TEXTURE(sampler),
1043               sampler,
1044               dst_flags & WRITEMASK_XYZW,
1045               msg_type,
1046               response_length,
1047               cur_mrf - 1,
1048               0,
1049               1,
1050               simd_mode);
1051 }
1052
1053
1054 void emit_txb(struct brw_wm_compile *c,
1055               struct brw_reg *dst,
1056               GLuint dst_flags,
1057               struct brw_reg *arg,
1058               struct brw_reg depth_payload,
1059               GLuint tex_idx,
1060               GLuint sampler)
1061 {
1062    struct brw_compile *p = &c->func;
1063    struct intel_context *intel = &p->brw->intel;
1064    GLuint msgLength;
1065    GLuint msg_type;
1066    GLuint mrf_per_channel;
1067    GLuint response_length;
1068    struct brw_reg dst_retyped;
1069
1070    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1071     * samples, so we'll use the 16-wide instruction, leave the second halves
1072     * undefined, and trust the execution mask to keep the undefined pixels
1073     * from mattering.
1074     */
1075    if (c->dispatch_width == 16 || intel->gen < 5) {
1076       if (intel->gen == 5)
1077          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1078       else
1079          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1080       mrf_per_channel = 2;
1081       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1082       response_length = 8;
1083    } else {
1084       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1085       mrf_per_channel = 1;
1086       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1087       response_length = 4;
1088    }
1089
1090    /* Shadow ignored for txb. */
1091    switch (tex_idx) {
1092    case TEXTURE_1D_INDEX:
1093       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1094       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1095       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1096       break;
1097    case TEXTURE_2D_INDEX:
1098    case TEXTURE_RECT_INDEX:
1099       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1100       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1101       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1102       break;
1103    case TEXTURE_3D_INDEX:
1104    case TEXTURE_CUBE_INDEX:
1105       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1106       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1107       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1108       break;
1109    default:
1110       /* unexpected target */
1111       abort();
1112    }
1113
1114    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1115    msgLength = 2 + 4 * mrf_per_channel - 1;
1116
1117    brw_SAMPLE(p,
1118               dst_retyped,
1119               1,
1120               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1121               SURF_INDEX_TEXTURE(sampler),
1122               sampler,
1123               dst_flags & WRITEMASK_XYZW,
1124               msg_type,
1125               response_length,
1126               msgLength,
1127               0,
1128               1,
1129               BRW_SAMPLER_SIMD_MODE_SIMD16);
1130 }
1131
1132
1133 static void emit_lit(struct brw_wm_compile *c,
1134                      const struct brw_reg *dst,
1135                      GLuint mask,
1136                      const struct brw_reg *arg0)
1137 {
1138    struct brw_compile *p = &c->func;
1139
1140    assert((mask & WRITEMASK_XW) == 0);
1141
1142    if (mask & WRITEMASK_Y) {
1143       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1144       brw_MOV(p, dst[1], arg0[0]);
1145       brw_set_saturate(p, 0);
1146    }
1147
1148    if (mask & WRITEMASK_Z) {
1149       emit_math2(c, BRW_MATH_FUNCTION_POW,
1150                  &dst[2],
1151                  WRITEMASK_X | (mask & SATURATE),
1152                  &arg0[1],
1153                  &arg0[3]);
1154    }
1155
1156    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1157     * some of the POW calculations above, but 16-wide iff statements
1158     * seem to lock c1 hardware, so this is a nasty workaround:
1159     */
1160    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1161    {
1162       if (mask & WRITEMASK_Y)
1163          brw_MOV(p, dst[1], brw_imm_f(0));
1164
1165       if (mask & WRITEMASK_Z)
1166          brw_MOV(p, dst[2], brw_imm_f(0));
1167    }
1168    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1169 }
1170
1171
1172 /* Kill pixel - set execution mask to zero for those pixels which
1173  * fail.
1174  */
1175 static void emit_kil( struct brw_wm_compile *c,
1176                       struct brw_reg *arg0)
1177 {
1178    struct brw_compile *p = &c->func;
1179    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1180    GLuint i, j;
1181
1182    for (i = 0; i < 4; i++) {
1183       /* Check if we've already done the comparison for this reg
1184        * -- common when someone does KIL TEMP.wwww.
1185        */
1186       for (j = 0; j < i; j++) {
1187          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1188             break;
1189       }
1190       if (j != i)
1191          continue;
1192
1193       brw_push_insn_state(p);
1194       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1195       brw_set_predicate_control_flag_value(p, 0xff);
1196       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1197       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1198       brw_pop_insn_state(p);
1199    }
1200 }
1201
1202 /* KIL_NV kills the pixels that are currently executing, not based on a test
1203  * of the arguments.
1204  */
1205 static void emit_kil_nv( struct brw_wm_compile *c )
1206 {
1207    struct brw_compile *p = &c->func;
1208    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1209
1210    brw_push_insn_state(p);
1211    brw_set_mask_control(p, BRW_MASK_DISABLE);
1212    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1213    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1214    brw_pop_insn_state(p);
1215 }
1216
1217 static void fire_fb_write( struct brw_wm_compile *c,
1218                            GLuint base_reg,
1219                            GLuint nr,
1220                            GLuint target,
1221                            GLuint eot )
1222 {
1223    struct brw_compile *p = &c->func;
1224    struct brw_reg dst;
1225
1226    if (c->dispatch_width == 16)
1227       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1228    else
1229       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1230
1231    /* Pass through control information:
1232     */
1233 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1234    {
1235       brw_push_insn_state(p);
1236       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1237       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1238       brw_MOV(p,
1239                brw_message_reg(base_reg + 1),
1240                brw_vec8_grf(1, 0));
1241       brw_pop_insn_state(p);
1242    }
1243
1244    /* Send framebuffer write message: */
1245 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1246    brw_fb_WRITE(p,
1247                 dst,
1248                 base_reg,
1249                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1250                 target,
1251                 nr,
1252                 0,
1253                 eot);
1254 }
1255
1256
1257 static void emit_aa( struct brw_wm_compile *c,
1258                      struct brw_reg *arg1,
1259                      GLuint reg )
1260 {
1261    struct brw_compile *p = &c->func;
1262    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1263    GLuint off = c->key.aa_dest_stencil_reg % 2;
1264    struct brw_reg aa = offset(arg1[comp], off);
1265
1266    brw_push_insn_state(p);
1267    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1268    brw_MOV(p, brw_message_reg(reg), aa);
1269    brw_pop_insn_state(p);
1270 }
1271
1272
1273 /* Post-fragment-program processing.  Send the results to the
1274  * framebuffer.
1275  * \param arg0  the fragment color
1276  * \param arg1  the pass-through depth value
1277  * \param arg2  the shader-computed depth value
1278  */
1279 void emit_fb_write(struct brw_wm_compile *c,
1280                    struct brw_reg *arg0,
1281                    struct brw_reg *arg1,
1282                    struct brw_reg *arg2,
1283                    GLuint target,
1284                    GLuint eot)
1285 {
1286    struct brw_compile *p = &c->func;
1287    struct brw_context *brw = p->brw;
1288    GLuint nr = 2;
1289    GLuint channel;
1290
1291    /* Reserve a space for AA - may not be needed:
1292     */
1293    if (c->key.aa_dest_stencil_reg)
1294       nr += 1;
1295
1296    /* I don't really understand how this achieves the color interleave
1297     * (ie RGBARGBA) in the result:  [Do the saturation here]
1298     */
1299    brw_push_insn_state(p);
1300
1301    for (channel = 0; channel < 4; channel++) {
1302       if (c->dispatch_width == 16 && brw->has_compr4) {
1303          /* By setting the high bit of the MRF register number, we indicate
1304           * that we want COMPR4 mode - instead of doing the usual destination
1305           * + 1 for the second half we get destination + 4.
1306           */
1307          brw_MOV(p,
1308                  brw_message_reg(nr + channel + (1 << 7)),
1309                  arg0[channel]);
1310       } else {
1311          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1312          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1313          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1314          brw_MOV(p,
1315                  brw_message_reg(nr + channel),
1316                  arg0[channel]);
1317
1318          if (c->dispatch_width == 16) {
1319             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1320             brw_MOV(p,
1321                     brw_message_reg(nr + channel + 4),
1322                     sechalf(arg0[channel]));
1323          }
1324       }
1325    }
1326    /* skip over the regs populated above:
1327     */
1328    nr += 8;
1329    brw_pop_insn_state(p);
1330
1331    if (c->key.source_depth_to_render_target)
1332    {
1333       if (c->key.computes_depth)
1334          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1335       else
1336          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1337
1338       nr += 2;
1339    }
1340
1341    if (c->key.dest_depth_reg)
1342    {
1343       GLuint comp = c->key.dest_depth_reg / 2;
1344       GLuint off = c->key.dest_depth_reg % 2;
1345
1346       if (off != 0) {
1347          brw_push_insn_state(p);
1348          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1349
1350          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1351          /* 2nd half? */
1352          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1353          brw_pop_insn_state(p);
1354       }
1355       else {
1356          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1357       }
1358       nr += 2;
1359    }
1360
1361    if (!c->key.runtime_check_aads_emit) {
1362       if (c->key.aa_dest_stencil_reg)
1363          emit_aa(c, arg1, 2);
1364
1365       fire_fb_write(c, 0, nr, target, eot);
1366    }
1367    else {
1368       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1369       struct brw_reg ip = brw_ip_reg();
1370       struct brw_instruction *jmp;
1371
1372       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1373       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1374       brw_AND(p,
1375               v1_null_ud,
1376               get_element_ud(brw_vec8_grf(1,0), 6),
1377               brw_imm_ud(1<<26));
1378
1379       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1380       {
1381          emit_aa(c, arg1, 2);
1382          fire_fb_write(c, 0, nr, target, eot);
1383          /* note - thread killed in subroutine */
1384       }
1385       brw_land_fwd_jump(p, jmp);
1386
1387       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1388        */
1389       fire_fb_write(c, 1, nr-1, target, eot);
1390    }
1391 }
1392
1393 /**
1394  * Move a GPR to scratch memory.
1395  */
1396 static void emit_spill( struct brw_wm_compile *c,
1397                         struct brw_reg reg,
1398                         GLuint slot )
1399 {
1400    struct brw_compile *p = &c->func;
1401
1402    /*
1403      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1404    */
1405    brw_MOV(p, brw_message_reg(2), reg);
1406
1407    /*
1408      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1409      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1410    */
1411    brw_dp_WRITE_16(p,
1412                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1413                    slot);
1414 }
1415
1416
1417 /**
1418  * Load a GPR from scratch memory.
1419  */
1420 static void emit_unspill( struct brw_wm_compile *c,
1421                           struct brw_reg reg,
1422                           GLuint slot )
1423 {
1424    struct brw_compile *p = &c->func;
1425
1426    /* Slot 0 is the undef value.
1427     */
1428    if (slot == 0) {
1429       brw_MOV(p, reg, brw_imm_f(0));
1430       return;
1431    }
1432
1433    /*
1434      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1435      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1436    */
1437
1438    brw_dp_READ_16(p,
1439                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1440                   slot);
1441 }
1442
1443
1444 /**
1445  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1446  * Args with unspill_reg != 0 will be loaded from scratch memory.
1447  */
1448 static void get_argument_regs( struct brw_wm_compile *c,
1449                                struct brw_wm_ref *arg[],
1450                                struct brw_reg *regs )
1451 {
1452    GLuint i;
1453
1454    for (i = 0; i < 4; i++) {
1455       if (arg[i]) {
1456          if (arg[i]->unspill_reg)
1457             emit_unspill(c,
1458                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1459                          arg[i]->value->spill_slot);
1460
1461          regs[i] = arg[i]->hw_reg;
1462       }
1463       else {
1464          regs[i] = brw_null_reg();
1465       }
1466    }
1467 }
1468
1469
1470 /**
1471  * For values that have a spill_slot!=0, write those regs to scratch memory.
1472  */
1473 static void spill_values( struct brw_wm_compile *c,
1474                           struct brw_wm_value *values,
1475                           GLuint nr )
1476 {
1477    GLuint i;
1478
1479    for (i = 0; i < nr; i++)
1480       if (values[i].spill_slot)
1481          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1482 }
1483
1484
1485 /* Emit the fragment program instructions here.
1486  */
1487 void brw_wm_emit( struct brw_wm_compile *c )
1488 {
1489    struct brw_compile *p = &c->func;
1490    GLuint insn;
1491
1492    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1493
1494    /* Check if any of the payload regs need to be spilled:
1495     */
1496    spill_values(c, c->payload.depth, 4);
1497    spill_values(c, c->creg, c->nr_creg);
1498    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1499
1500
1501    for (insn = 0; insn < c->nr_insns; insn++) {
1502
1503       struct brw_wm_instruction *inst = &c->instruction[insn];
1504       struct brw_reg args[3][4], dst[4];
1505       GLuint i, dst_flags;
1506
1507       /* Get argument regs:
1508        */
1509       for (i = 0; i < 3; i++)
1510          get_argument_regs(c, inst->src[i], args[i]);
1511
1512       /* Get dest regs:
1513        */
1514       for (i = 0; i < 4; i++)
1515          if (inst->dst[i])
1516             dst[i] = inst->dst[i]->hw_reg;
1517          else
1518             dst[i] = brw_null_reg();
1519
1520       /* Flags
1521        */
1522       dst_flags = inst->writemask;
1523       if (inst->saturate)
1524          dst_flags |= SATURATE;
1525
1526       switch (inst->opcode) {
1527          /* Generated instructions for calculating triangle interpolants:
1528           */
1529       case WM_PIXELXY:
1530          emit_pixel_xy(c, dst, dst_flags);
1531          break;
1532
1533       case WM_DELTAXY:
1534          emit_delta_xy(p, dst, dst_flags, args[0]);
1535          break;
1536
1537       case WM_WPOSXY:
1538          emit_wpos_xy(c, dst, dst_flags, args[0]);
1539          break;
1540
1541       case WM_PIXELW:
1542          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1543          break;
1544
1545       case WM_LINTERP:
1546          emit_linterp(p, dst, dst_flags, args[0], args[1]);
1547          break;
1548
1549       case WM_PINTERP:
1550          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1551          break;
1552
1553       case WM_CINTERP:
1554          emit_cinterp(p, dst, dst_flags, args[0]);
1555          break;
1556
1557       case WM_FB_WRITE:
1558          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1559          break;
1560
1561       case WM_FRONTFACING:
1562          emit_frontfacing(p, dst, dst_flags);
1563          break;
1564
1565          /* Straightforward arithmetic:
1566           */
1567       case OPCODE_ADD:
1568          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1569          break;
1570
1571       case OPCODE_FRC:
1572          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1573          break;
1574
1575       case OPCODE_FLR:
1576          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1577          break;
1578
1579       case OPCODE_DDX:
1580          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
1581          break;
1582
1583       case OPCODE_DDY:
1584          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
1585          break;
1586
1587       case OPCODE_DP3:
1588          emit_dp3(p, dst, dst_flags, args[0], args[1]);
1589          break;
1590
1591       case OPCODE_DP4:
1592          emit_dp4(p, dst, dst_flags, args[0], args[1]);
1593          break;
1594
1595       case OPCODE_DPH:
1596          emit_dph(p, dst, dst_flags, args[0], args[1]);
1597          break;
1598
1599       case OPCODE_TRUNC:
1600          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1601          break;
1602
1603       case OPCODE_LRP:
1604          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1605          break;
1606
1607       case OPCODE_MAD:
1608          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1609          break;
1610
1611       case OPCODE_MOV:
1612       case OPCODE_SWZ:
1613          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1614          break;
1615
1616       case OPCODE_MUL:
1617          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1618          break;
1619
1620       case OPCODE_XPD:
1621          emit_xpd(p, dst, dst_flags, args[0], args[1]);
1622          break;
1623
1624          /* Higher math functions:
1625           */
1626       case OPCODE_RCP:
1627          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1628          break;
1629
1630       case OPCODE_RSQ:
1631          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1632          break;
1633
1634       case OPCODE_SIN:
1635          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1636          break;
1637
1638       case OPCODE_COS:
1639          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1640          break;
1641
1642       case OPCODE_EX2:
1643          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1644          break;
1645
1646       case OPCODE_LG2:
1647          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1648          break;
1649
1650       case OPCODE_SCS:
1651          /* There is an scs math function, but it would need some
1652           * fixup for 16-element execution.
1653           */
1654          if (dst_flags & WRITEMASK_X)
1655             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1656          if (dst_flags & WRITEMASK_Y)
1657             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1658          break;
1659
1660       case OPCODE_POW:
1661          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1662          break;
1663
1664          /* Comparisons:
1665           */
1666       case OPCODE_CMP:
1667          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1668          break;
1669
1670       case OPCODE_MAX:
1671          emit_max(p, dst, dst_flags, args[0], args[1]);
1672          break;
1673
1674       case OPCODE_MIN:
1675          emit_min(p, dst, dst_flags, args[0], args[1]);
1676          break;
1677
1678       case OPCODE_SLT:
1679          emit_slt(p, dst, dst_flags, args[0], args[1]);
1680          break;
1681
1682       case OPCODE_SLE:
1683          emit_sle(p, dst, dst_flags, args[0], args[1]);
1684         break;
1685       case OPCODE_SGT:
1686          emit_sgt(p, dst, dst_flags, args[0], args[1]);
1687         break;
1688       case OPCODE_SGE:
1689          emit_sge(p, dst, dst_flags, args[0], args[1]);
1690          break;
1691       case OPCODE_SEQ:
1692          emit_seq(p, dst, dst_flags, args[0], args[1]);
1693         break;
1694       case OPCODE_SNE:
1695          emit_sne(p, dst, dst_flags, args[0], args[1]);
1696         break;
1697
1698       case OPCODE_SSG:
1699          emit_sign(p, dst, dst_flags, args[0]);
1700          break;
1701
1702       case OPCODE_LIT:
1703          emit_lit(c, dst, dst_flags, args[0]);
1704          break;
1705
1706          /* Texturing operations:
1707           */
1708       case OPCODE_TEX:
1709          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1710                   inst->tex_idx, inst->tex_unit,
1711                   inst->tex_shadow);
1712          break;
1713
1714       case OPCODE_TXB:
1715          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1716                   inst->tex_idx, inst->tex_unit);
1717          break;
1718
1719       case OPCODE_KIL:
1720          emit_kil(c, args[0]);
1721          break;
1722
1723       case OPCODE_KIL_NV:
1724          emit_kil_nv(c);
1725          break;
1726
1727       default:
1728          printf("Unsupported opcode %i (%s) in fragment shader\n",
1729                 inst->opcode, inst->opcode < MAX_OPCODE ?
1730                 _mesa_opcode_string(inst->opcode) :
1731                 "unknown");
1732       }
1733
1734       for (i = 0; i < 4; i++)
1735         if (inst->dst[i] && inst->dst[i]->spill_slot)
1736            emit_spill(c,
1737                       inst->dst[i]->hw_reg,
1738                       inst->dst[i]->spill_slot);
1739    }
1740
1741    if (INTEL_DEBUG & DEBUG_WM) {
1742       int i;
1743
1744       printf("wm-native:\n");
1745       for (i = 0; i < p->nr_insn; i++)
1746          brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
1747       printf("\n");
1748    }
1749 }