src/mesa/drivers/dri/i965/brw_wm_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "brw_context.h"
  35 #include "brw_wm.h"
  36
  37 static GLboolean can_do_pln(struct intel_context *intel,
  38                             const struct brw_reg *deltas)
  39 {
  40    struct brw_context *brw = brw_context(&intel->ctx);
  41
  42    if (!brw->has_pln)
  43       return GL_FALSE;
  44
  45    if (deltas[1].nr != deltas[0].nr + 1)
  46       return GL_FALSE;
  47
  48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
  49       return GL_FALSE;
  50
  51    return GL_TRUE;
  52 }
  53
  54 /* Not quite sure how correct this is - need to understand horiz
  55  * vs. vertical strides a little better.
  56  */
  57 static INLINE struct brw_reg sechalf( struct brw_reg reg )
  58 {
  59    if (reg.vstride)
  60       reg.nr++;
  61    return reg;
  62 }
  63
  64 /* Return the SrcReg index of the channels that can be immediate float operands
  65  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  66  */
  67 GLboolean
  68 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
  69 {
  70    int opcode_array[] = {
  71       [OPCODE_ADD] = 2,
  72       [OPCODE_CMP] = 3,
  73       [OPCODE_DP3] = 2,
  74       [OPCODE_DP4] = 2,
  75       [OPCODE_DPH] = 2,
  76       [OPCODE_MAX] = 2,
  77       [OPCODE_MIN] = 2,
  78       [OPCODE_MOV] = 1,
  79       [OPCODE_MUL] = 2,
  80       [OPCODE_SEQ] = 2,
  81       [OPCODE_SGE] = 2,
  82       [OPCODE_SGT] = 2,
  83       [OPCODE_SLE] = 2,
  84       [OPCODE_SLT] = 2,
  85       [OPCODE_SNE] = 2,
  86       [OPCODE_XPD] = 2,
  87    };
  88
  89    /* These opcodes get broken down in a way that allow two
  90     * args to be immediates.
  91     */
  92    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
  93       if (arg == 1 || arg == 2)
  94          return GL_TRUE;
  95    }
  96
  97    if (opcode > ARRAY_SIZE(opcode_array))
  98       return GL_FALSE;
  99
 100    return arg == opcode_array[opcode] - 1;
 101 }
 102
 103 /**
 104  * Computes the screen-space x,y position of the pixels.
 105  *
 106  * This will be used by emit_delta_xy() or emit_wpos_xy() for
 107  * interpolation of attributes..
 108  *
 109  * Payload R0:
 110  *
 111  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
 112  *         corresponding to each of the 16 execution channels.
 113  * R0.1..8 -- ?
 114  * R1.0 -- triangle vertex 0.X
 115  * R1.1 -- triangle vertex 0.Y
 116  * R1.2 -- tile 0 x,y coords (2 packed uwords)
 117  * R1.3 -- tile 1 x,y coords (2 packed uwords)
 118  * R1.4 -- tile 2 x,y coords (2 packed uwords)
 119  * R1.5 -- tile 3 x,y coords (2 packed uwords)
 120  * R1.6 -- ?
 121  * R1.7 -- ?
 122  * R1.8 -- ?
 123  */
 124 void emit_pixel_xy(struct brw_wm_compile *c,
 125                    const struct brw_reg *dst,
 126                    GLuint mask)
 127 {
 128    struct brw_compile *p = &c->func;
 129    struct brw_reg r1 = brw_vec1_grf(1, 0);
 130    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 131    struct brw_reg dst0_uw, dst1_uw;
 132
 133    brw_push_insn_state(p);
 134    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 135
 136    if (c->dispatch_width == 16) {
 137       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
 138       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
 139    } else {
 140       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
 141       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
 142    }
 143
 144    /* Calculate pixel centers by adding 1 or 0 to each of the
 145     * micro-tile coordinates passed in r1.
 146     */
 147    if (mask & WRITEMASK_X) {
 148       brw_ADD(p,
 149               dst0_uw,
 150               stride(suboffset(r1_uw, 4), 2, 4, 0),
 151               brw_imm_v(0x10101010));
 152    }
 153
 154    if (mask & WRITEMASK_Y) {
 155       brw_ADD(p,
 156               dst1_uw,
 157               stride(suboffset(r1_uw,5), 2, 4, 0),
 158               brw_imm_v(0x11001100));
 159    }
 160    brw_pop_insn_state(p);
 161 }
 162
 163 /**
 164  * Computes the screen-space x,y distance of the pixels from the start
 165  * vertex.
 166  *
 167  * This will be used in linterp or pinterp with the start vertex value
 168  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
 169  * to produce interpolated attribute values.
 170  */
 171 void emit_delta_xy(struct brw_compile *p,
 172                    const struct brw_reg *dst,
 173                    GLuint mask,
 174                    const struct brw_reg *arg0)
 175 {
 176    struct brw_reg r1 = brw_vec1_grf(1, 0);
 177
 178    if (mask == 0)
 179       return;
 180
 181    assert(mask == WRITEMASK_XY);
 182
 183    /* Calc delta X,Y by subtracting origin in r1 from the pixel
 184     * centers produced by emit_pixel_xy().
 185     */
 186    brw_ADD(p,
 187            dst[0],
 188            retype(arg0[0], BRW_REGISTER_TYPE_UW),
 189            negate(r1));
 190    brw_ADD(p,
 191            dst[1],
 192            retype(arg0[1], BRW_REGISTER_TYPE_UW),
 193            negate(suboffset(r1,1)));
 194 }
 195
 196 /**
 197  * Computes the pixel offset from the window origin for gl_FragCoord().
 198  */
 199 void emit_wpos_xy(struct brw_wm_compile *c,
 200                   const struct brw_reg *dst,
 201                   GLuint mask,
 202                   const struct brw_reg *arg0)
 203 {
 204    struct brw_compile *p = &c->func;
 205
 206    if (mask & WRITEMASK_X) {
 207       if (c->fp->program.PixelCenterInteger) {
 208          /* X' = X */
 209          brw_MOV(p,
 210                  dst[0],
 211                  retype(arg0[0], BRW_REGISTER_TYPE_W));
 212       } else {
 213          /* X' = X + 0.5 */
 214          brw_ADD(p,
 215                  dst[0],
 216                  retype(arg0[0], BRW_REGISTER_TYPE_W),
 217                  brw_imm_f(0.5));
 218       }
 219    }
 220
 221    if (mask & WRITEMASK_Y) {
 222       if (c->fp->program.OriginUpperLeft) {
 223          if (c->fp->program.PixelCenterInteger) {
 224             /* Y' = Y */
 225             brw_MOV(p,
 226                     dst[1],
 227                     retype(arg0[1], BRW_REGISTER_TYPE_W));
 228          } else {
 229             /* Y' = Y + 0.5 */
 230             brw_ADD(p,
 231                     dst[1],
 232                     retype(arg0[1], BRW_REGISTER_TYPE_W),
 233                     brw_imm_f(0.5));
 234          }
 235       } else {
 236          float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 237
 238          /* Y' = (height - 1) - Y + center */
 239          brw_ADD(p,
 240                  dst[1],
 241                  negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
 242                  brw_imm_f(c->key.drawable_height - 1 + center_offset));
 243       }
 244    }
 245 }
 246
 247
 248 void emit_pixel_w(struct brw_wm_compile *c,
 249                   const struct brw_reg *dst,
 250                   GLuint mask,
 251                   const struct brw_reg *arg0,
 252                   const struct brw_reg *deltas)
 253 {
 254    struct brw_compile *p = &c->func;
 255    struct intel_context *intel = &p->brw->intel;
 256
 257    /* Don't need this if all you are doing is interpolating color, for
 258     * instance.
 259     */
 260    if (mask & WRITEMASK_W) {
 261       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 262
 263       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 264        * result straight into a message reg.
 265        */
 266       if (can_do_pln(intel, deltas)) {
 267          brw_PLN(p, brw_message_reg(2), interp3, deltas[0]);
 268       } else {
 269          brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
 270          brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 271       }
 272
 273       /* Calc w */
 274       if (c->dispatch_width == 16) {
 275          brw_math_16(p, dst[3],
 276                      BRW_MATH_FUNCTION_INV,
 277                      BRW_MATH_SATURATE_NONE,
 278                      2, brw_null_reg(),
 279                      BRW_MATH_PRECISION_FULL);
 280       } else {
 281          brw_math(p, dst[3],
 282                   BRW_MATH_FUNCTION_INV,
 283                   BRW_MATH_SATURATE_NONE,
 284                   2, brw_null_reg(),
 285                   BRW_MATH_DATA_VECTOR,
 286                   BRW_MATH_PRECISION_FULL);
 287       }
 288    }
 289 }
 290
 291
 292 void emit_linterp(struct brw_compile *p,
 293                   const struct brw_reg *dst,
 294                   GLuint mask,
 295                   const struct brw_reg *arg0,
 296                   const struct brw_reg *deltas)
 297 {
 298    struct intel_context *intel = &p->brw->intel;
 299    struct brw_reg interp[4];
 300    GLuint nr = arg0[0].nr;
 301    GLuint i;
 302
 303    interp[0] = brw_vec1_grf(nr, 0);
 304    interp[1] = brw_vec1_grf(nr, 4);
 305    interp[2] = brw_vec1_grf(nr+1, 0);
 306    interp[3] = brw_vec1_grf(nr+1, 4);
 307
 308    for (i = 0; i < 4; i++) {
 309       if (mask & (1<<i)) {
 310          if (can_do_pln(intel, deltas)) {
 311             brw_PLN(p, dst[i], interp[i], deltas[0]);
 312          } else {
 313             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 314             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 315          }
 316       }
 317    }
 318 }
 319
 320
 321 void emit_pinterp(struct brw_compile *p,
 322                   const struct brw_reg *dst,
 323                   GLuint mask,
 324                   const struct brw_reg *arg0,
 325                   const struct brw_reg *deltas,
 326                   const struct brw_reg *w)
 327 {
 328    struct intel_context *intel = &p->brw->intel;
 329    struct brw_reg interp[4];
 330    GLuint nr = arg0[0].nr;
 331    GLuint i;
 332
 333    interp[0] = brw_vec1_grf(nr, 0);
 334    interp[1] = brw_vec1_grf(nr, 4);
 335    interp[2] = brw_vec1_grf(nr+1, 0);
 336    interp[3] = brw_vec1_grf(nr+1, 4);
 337
 338    for (i = 0; i < 4; i++) {
 339       if (mask & (1<<i)) {
 340          if (can_do_pln(intel, deltas)) {
 341             brw_PLN(p, dst[i], interp[i], deltas[0]);
 342          } else {
 343             brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 344             brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
 345          }
 346       }
 347    }
 348    for (i = 0; i < 4; i++) {
 349       if (mask & (1<<i)) {
 350          brw_MUL(p, dst[i], dst[i], w[3]);
 351       }
 352    }
 353 }
 354
 355
 356 void emit_cinterp(struct brw_compile *p,
 357                   const struct brw_reg *dst,
 358                   GLuint mask,
 359                   const struct brw_reg *arg0)
 360 {
 361    struct brw_reg interp[4];
 362    GLuint nr = arg0[0].nr;
 363    GLuint i;
 364
 365    interp[0] = brw_vec1_grf(nr, 0);
 366    interp[1] = brw_vec1_grf(nr, 4);
 367    interp[2] = brw_vec1_grf(nr+1, 0);
 368    interp[3] = brw_vec1_grf(nr+1, 4);
 369
 370    for (i = 0; i < 4; i++) {
 371       if (mask & (1<<i)) {
 372          brw_MOV(p, dst[i], suboffset(interp[i],3));    /* TODO: optimize away like other moves */
 373       }
 374    }
 375 }
 376
 377 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 378 void emit_frontfacing(struct brw_compile *p,
 379                       const struct brw_reg *dst,
 380                       GLuint mask)
 381 {
 382    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 383    GLuint i;
 384
 385    if (!(mask & WRITEMASK_XYZW))
 386       return;
 387
 388    for (i = 0; i < 4; i++) {
 389       if (mask & (1<<i)) {
 390          brw_MOV(p, dst[i], brw_imm_f(0.0));
 391       }
 392    }
 393
 394    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 395     * us front face
 396     */
 397    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 398    for (i = 0; i < 4; i++) {
 399       if (mask & (1<<i)) {
 400          brw_MOV(p, dst[i], brw_imm_f(1.0));
 401       }
 402    }
 403    brw_set_predicate_control_flag_value(p, 0xff);
 404 }
 405
 406 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 407  * looking like:
 408  *
 409  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 410  *
 411  * and we're trying to produce:
 412  *
 413  *           DDX                     DDY
 414  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 415  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 416  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 417  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 418  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 419  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 420  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 421  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 422  *
 423  * and add another set of two more subspans if in 16-pixel dispatch mode.
 424  *
 425  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 426  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 427  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 428  * between each other.  We could probably do it like ddx and swizzle the right
 429  * order later, but bail for now and just produce
 430  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 431  */
 432 void emit_ddxy(struct brw_compile *p,
 433                const struct brw_reg *dst,
 434                GLuint mask,
 435                GLboolean is_ddx,
 436                const struct brw_reg *arg0)
 437 {
 438    int i;
 439    struct brw_reg src0, src1;
 440
 441    if (mask & SATURATE)
 442       brw_set_saturate(p, 1);
 443    for (i = 0; i < 4; i++ ) {
 444       if (mask & (1<<i)) {
 445          if (is_ddx) {
 446             src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
 447                            BRW_REGISTER_TYPE_F,
 448                            BRW_VERTICAL_STRIDE_2,
 449                            BRW_WIDTH_2,
 450                            BRW_HORIZONTAL_STRIDE_0,
 451                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 452             src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 453                            BRW_REGISTER_TYPE_F,
 454                            BRW_VERTICAL_STRIDE_2,
 455                            BRW_WIDTH_2,
 456                            BRW_HORIZONTAL_STRIDE_0,
 457                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 458          } else {
 459             src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 460                            BRW_REGISTER_TYPE_F,
 461                            BRW_VERTICAL_STRIDE_4,
 462                            BRW_WIDTH_4,
 463                            BRW_HORIZONTAL_STRIDE_0,
 464                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 465             src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 466                            BRW_REGISTER_TYPE_F,
 467                            BRW_VERTICAL_STRIDE_4,
 468                            BRW_WIDTH_4,
 469                            BRW_HORIZONTAL_STRIDE_0,
 470                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 471          }
 472          brw_ADD(p, dst[i], src0, negate(src1));
 473       }
 474    }
 475    if (mask & SATURATE)
 476       brw_set_saturate(p, 0);
 477 }
 478
 479 void emit_alu1(struct brw_compile *p,
 480                struct brw_instruction *(*func)(struct brw_compile *,
 481                                                struct brw_reg,
 482                                                struct brw_reg),
 483                const struct brw_reg *dst,
 484                GLuint mask,
 485                const struct brw_reg *arg0)
 486 {
 487    GLuint i;
 488
 489    if (mask & SATURATE)
 490       brw_set_saturate(p, 1);
 491
 492    for (i = 0; i < 4; i++) {
 493       if (mask & (1<<i)) {
 494          func(p, dst[i], arg0[i]);
 495       }
 496    }
 497
 498    if (mask & SATURATE)
 499       brw_set_saturate(p, 0);
 500 }
 501
 502
 503 void emit_alu2(struct brw_compile *p,
 504                struct brw_instruction *(*func)(struct brw_compile *,
 505                                                struct brw_reg,
 506                                                struct brw_reg,
 507                                                struct brw_reg),
 508                const struct brw_reg *dst,
 509                GLuint mask,
 510                const struct brw_reg *arg0,
 511                const struct brw_reg *arg1)
 512 {
 513    GLuint i;
 514
 515    if (mask & SATURATE)
 516       brw_set_saturate(p, 1);
 517
 518    for (i = 0; i < 4; i++) {
 519       if (mask & (1<<i)) {
 520          func(p, dst[i], arg0[i], arg1[i]);
 521       }
 522    }
 523
 524    if (mask & SATURATE)
 525       brw_set_saturate(p, 0);
 526 }
 527
 528
 529 void emit_mad(struct brw_compile *p,
 530               const struct brw_reg *dst,
 531               GLuint mask,
 532               const struct brw_reg *arg0,
 533               const struct brw_reg *arg1,
 534               const struct brw_reg *arg2)
 535 {
 536    GLuint i;
 537
 538    for (i = 0; i < 4; i++) {
 539       if (mask & (1<<i)) {
 540          brw_MUL(p, dst[i], arg0[i], arg1[i]);
 541
 542          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 543          brw_ADD(p, dst[i], dst[i], arg2[i]);
 544          brw_set_saturate(p, 0);
 545       }
 546    }
 547 }
 548
 549 void emit_lrp(struct brw_compile *p,
 550               const struct brw_reg *dst,
 551               GLuint mask,
 552               const struct brw_reg *arg0,
 553               const struct brw_reg *arg1,
 554               const struct brw_reg *arg2)
 555 {
 556    GLuint i;
 557
 558    /* Uses dst as a temporary:
 559     */
 560    for (i = 0; i < 4; i++) {
 561       if (mask & (1<<i)) {
 562          /* Can I use the LINE instruction for this?
 563           */
 564          brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
 565          brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
 566
 567          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 568          brw_MAC(p, dst[i], arg0[i], arg1[i]);
 569          brw_set_saturate(p, 0);
 570       }
 571    }
 572 }
 573
 574 void emit_sop(struct brw_compile *p,
 575               const struct brw_reg *dst,
 576               GLuint mask,
 577               GLuint cond,
 578               const struct brw_reg *arg0,
 579               const struct brw_reg *arg1)
 580 {
 581    GLuint i;
 582
 583    for (i = 0; i < 4; i++) {
 584       if (mask & (1<<i)) {
 585          brw_push_insn_state(p);
 586          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 587          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 588          brw_MOV(p, dst[i], brw_imm_f(0));
 589          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 590          brw_MOV(p, dst[i], brw_imm_f(1.0));
 591          brw_pop_insn_state(p);
 592       }
 593    }
 594 }
 595
 596 static void emit_slt( struct brw_compile *p,
 597                       const struct brw_reg *dst,
 598                       GLuint mask,
 599                       const struct brw_reg *arg0,
 600                       const struct brw_reg *arg1 )
 601 {
 602    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
 603 }
 604
 605 static void emit_sle( struct brw_compile *p,
 606                       const struct brw_reg *dst,
 607                       GLuint mask,
 608                       const struct brw_reg *arg0,
 609                       const struct brw_reg *arg1 )
 610 {
 611    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
 612 }
 613
 614 static void emit_sgt( struct brw_compile *p,
 615                       const struct brw_reg *dst,
 616                       GLuint mask,
 617                       const struct brw_reg *arg0,
 618                       const struct brw_reg *arg1 )
 619 {
 620    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
 621 }
 622
 623 static void emit_sge( struct brw_compile *p,
 624                       const struct brw_reg *dst,
 625                       GLuint mask,
 626                       const struct brw_reg *arg0,
 627                       const struct brw_reg *arg1 )
 628 {
 629    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 630 }
 631
 632 static void emit_seq( struct brw_compile *p,
 633                       const struct brw_reg *dst,
 634                       GLuint mask,
 635                       const struct brw_reg *arg0,
 636                       const struct brw_reg *arg1 )
 637 {
 638    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
 639 }
 640
 641 static void emit_sne( struct brw_compile *p,
 642                       const struct brw_reg *dst,
 643                       GLuint mask,
 644                       const struct brw_reg *arg0,
 645                       const struct brw_reg *arg1 )
 646 {
 647    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 648 }
 649
 650 void emit_cmp(struct brw_compile *p,
 651               const struct brw_reg *dst,
 652               GLuint mask,
 653               const struct brw_reg *arg0,
 654               const struct brw_reg *arg1,
 655               const struct brw_reg *arg2)
 656 {
 657    GLuint i;
 658
 659    for (i = 0; i < 4; i++) {
 660       if (mask & (1<<i)) {
 661          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 662
 663          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 664          brw_SEL(p, dst[i], arg1[i], arg2[i]);
 665          brw_set_saturate(p, 0);
 666          brw_set_predicate_control_flag_value(p, 0xff);
 667       }
 668    }
 669 }
 670
 671 void emit_max(struct brw_compile *p,
 672               const struct brw_reg *dst,
 673               GLuint mask,
 674               const struct brw_reg *arg0,
 675               const struct brw_reg *arg1)
 676 {
 677    GLuint i;
 678
 679    for (i = 0; i < 4; i++) {
 680       if (mask & (1<<i)) {
 681          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 682
 683          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 684          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 685          brw_set_saturate(p, 0);
 686          brw_set_predicate_control_flag_value(p, 0xff);
 687       }
 688    }
 689 }
 690
 691 void emit_min(struct brw_compile *p,
 692               const struct brw_reg *dst,
 693               GLuint mask,
 694               const struct brw_reg *arg0,
 695               const struct brw_reg *arg1)
 696 {
 697    GLuint i;
 698
 699    for (i = 0; i < 4; i++) {
 700       if (mask & (1<<i)) {
 701          brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 702
 703          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 704          brw_SEL(p, dst[i], arg0[i], arg1[i]);
 705          brw_set_saturate(p, 0);
 706          brw_set_predicate_control_flag_value(p, 0xff);
 707       }
 708    }
 709 }
 710
 711
 712 void emit_dp3(struct brw_compile *p,
 713               const struct brw_reg *dst,
 714               GLuint mask,
 715               const struct brw_reg *arg0,
 716               const struct brw_reg *arg1)
 717 {
 718    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 719
 720    if (!(mask & WRITEMASK_XYZW))
 721       return; /* Do not emit dead code */
 722
 723    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 724
 725    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 726    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 727
 728    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 729    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 730    brw_set_saturate(p, 0);
 731 }
 732
 733
 734 void emit_dp4(struct brw_compile *p,
 735               const struct brw_reg *dst,
 736               GLuint mask,
 737               const struct brw_reg *arg0,
 738               const struct brw_reg *arg1)
 739 {
 740    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 741
 742    if (!(mask & WRITEMASK_XYZW))
 743       return; /* Do not emit dead code */
 744
 745    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 746
 747    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 748    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 749    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 750
 751    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 752    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
 753    brw_set_saturate(p, 0);
 754 }
 755
 756
 757 void emit_dph(struct brw_compile *p,
 758               const struct brw_reg *dst,
 759               GLuint mask,
 760               const struct brw_reg *arg0,
 761               const struct brw_reg *arg1)
 762 {
 763    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 764
 765    if (!(mask & WRITEMASK_XYZW))
 766       return; /* Do not emit dead code */
 767
 768    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 769
 770    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
 771    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 772    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 773
 774    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 775    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
 776    brw_set_saturate(p, 0);
 777 }
 778
 779
 780 void emit_xpd(struct brw_compile *p,
 781               const struct brw_reg *dst,
 782               GLuint mask,
 783               const struct brw_reg *arg0,
 784               const struct brw_reg *arg1)
 785 {
 786    GLuint i;
 787
 788    assert((mask & WRITEMASK_W) != WRITEMASK_W);
 789
 790    for (i = 0 ; i < 3; i++) {
 791       if (mask & (1<<i)) {
 792          GLuint i2 = (i+2)%3;
 793          GLuint i1 = (i+1)%3;
 794
 795          brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
 796
 797          brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
 798          brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
 799          brw_set_saturate(p, 0);
 800       }
 801    }
 802 }
 803
 804
 805 void emit_math1(struct brw_wm_compile *c,
 806                 GLuint function,
 807                 const struct brw_reg *dst,
 808                 GLuint mask,
 809                 const struct brw_reg *arg0)
 810 {
 811    struct brw_compile *p = &c->func;
 812    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 813    GLuint saturate = ((mask & SATURATE) ?
 814                       BRW_MATH_SATURATE_SATURATE :
 815                       BRW_MATH_SATURATE_NONE);
 816
 817    if (!(mask & WRITEMASK_XYZW))
 818       return; /* Do not emit dead code */
 819
 820    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 821
 822    /* If compressed, this will write message reg 2,3 from arg0.x's 16
 823     * channels.
 824     */
 825    brw_MOV(p, brw_message_reg(2), arg0[0]);
 826
 827    /* Send two messages to perform all 16 operations:
 828     */
 829    brw_push_insn_state(p);
 830    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 831    brw_math(p,
 832             dst[dst_chan],
 833             function,
 834             saturate,
 835             2,
 836             brw_null_reg(),
 837             BRW_MATH_DATA_VECTOR,
 838             BRW_MATH_PRECISION_FULL);
 839
 840    if (c->dispatch_width == 16) {
 841       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 842       brw_math(p,
 843                offset(dst[dst_chan],1),
 844                function,
 845                saturate,
 846                3,
 847                brw_null_reg(),
 848                BRW_MATH_DATA_VECTOR,
 849                BRW_MATH_PRECISION_FULL);
 850    }
 851    brw_pop_insn_state(p);
 852 }
 853
 854
 855 void emit_math2(struct brw_wm_compile *c,
 856                 GLuint function,
 857                 const struct brw_reg *dst,
 858                 GLuint mask,
 859                 const struct brw_reg *arg0,
 860                 const struct brw_reg *arg1)
 861 {
 862    struct brw_compile *p = &c->func;
 863    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 864    GLuint saturate = ((mask & SATURATE) ?
 865                       BRW_MATH_SATURATE_SATURATE :
 866                       BRW_MATH_SATURATE_NONE);
 867
 868    if (!(mask & WRITEMASK_XYZW))
 869       return; /* Do not emit dead code */
 870
 871    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 872
 873    brw_push_insn_state(p);
 874
 875    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 876    brw_MOV(p, brw_message_reg(2), arg0[0]);
 877    if (c->dispatch_width == 16) {
 878       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 879       brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
 880    }
 881
 882    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 883    brw_MOV(p, brw_message_reg(3), arg1[0]);
 884    if (c->dispatch_width == 16) {
 885       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 886       brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
 887    }
 888
 889    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 890    brw_math(p,
 891             dst[dst_chan],
 892             function,
 893             saturate,
 894             2,
 895             brw_null_reg(),
 896             BRW_MATH_DATA_VECTOR,
 897             BRW_MATH_PRECISION_FULL);
 898
 899    /* Send two messages to perform all 16 operations:
 900     */
 901    if (c->dispatch_width == 16) {
 902       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 903       brw_math(p,
 904                offset(dst[dst_chan],1),
 905                function,
 906                saturate,
 907                4,
 908                brw_null_reg(),
 909                BRW_MATH_DATA_VECTOR,
 910                BRW_MATH_PRECISION_FULL);
 911    }
 912    brw_pop_insn_state(p);
 913 }
 914
 915
 916 void emit_tex(struct brw_wm_compile *c,
 917               struct brw_reg *dst,
 918               GLuint dst_flags,
 919               struct brw_reg *arg,
 920               struct brw_reg depth_payload,
 921               GLuint tex_idx,
 922               GLuint sampler,
 923               GLboolean shadow)
 924 {
 925    struct brw_compile *p = &c->func;
 926    struct intel_context *intel = &p->brw->intel;
 927    struct brw_reg dst_retyped;
 928    GLuint cur_mrf = 2, response_length;
 929    GLuint i, nr_texcoords;
 930    GLuint emit;
 931    GLuint msg_type;
 932    GLuint mrf_per_channel;
 933    GLuint simd_mode;
 934
 935    if (c->dispatch_width == 16) {
 936       mrf_per_channel = 2;
 937       response_length = 8;
 938       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
 939       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 940    } else {
 941       mrf_per_channel = 1;
 942       response_length = 4;
 943       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
 944       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 945    }
 946
 947    /* How many input regs are there?
 948     */
 949    switch (tex_idx) {
 950    case TEXTURE_1D_INDEX:
 951       emit = WRITEMASK_X;
 952       nr_texcoords = 1;
 953       break;
 954    case TEXTURE_2D_INDEX:
 955    case TEXTURE_RECT_INDEX:
 956       emit = WRITEMASK_XY;
 957       nr_texcoords = 2;
 958       break;
 959    case TEXTURE_3D_INDEX:
 960    case TEXTURE_CUBE_INDEX:
 961       emit = WRITEMASK_XYZ;
 962       nr_texcoords = 3;
 963       break;
 964    default:
 965       /* unexpected target */
 966       abort();
 967    }
 968
 969    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
 970    if (intel->gen < 5 && c->dispatch_width == 8)
 971       nr_texcoords = 3;
 972
 973    /* For shadow comparisons, we have to supply u,v,r. */
 974    if (shadow)
 975       nr_texcoords = 3;
 976
 977    /* Emit the texcoords. */
 978    for (i = 0; i < nr_texcoords; i++) {
 979       if (emit & (1<<i))
 980          brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
 981       else
 982          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 983       cur_mrf += mrf_per_channel;
 984    }
 985
 986    /* Fill in the shadow comparison reference value. */
 987    if (shadow) {
 988       if (intel->gen == 5) {
 989          /* Fill in the cube map array index value. */
 990          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 991          cur_mrf += mrf_per_channel;
 992       } else if (c->dispatch_width == 8) {
 993          /* Fill in the LOD bias value. */
 994          brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 995          cur_mrf += mrf_per_channel;
 996       }
 997       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
 998       cur_mrf += mrf_per_channel;
 999    }
1000
1001    if (intel->gen == 5) {
1002       if (shadow)
1003          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
1004       else
1005          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
1006    } else {
1007       /* Note that G45 and older determines shadow compare and dispatch width
1008        * from message length for most messages.
1009        */
1010       if (c->dispatch_width == 16 && shadow)
1011          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1012       else
1013          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1014    }
1015
1016    brw_SAMPLE(p,
1017               dst_retyped,
1018               1,
1019               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1020               SURF_INDEX_TEXTURE(sampler),
1021               sampler,
1022               dst_flags & WRITEMASK_XYZW,
1023               msg_type,
1024               response_length,
1025               cur_mrf - 1,
1026               0,
1027               1,
1028               simd_mode);
1029 }
1030
1031
1032 void emit_txb(struct brw_wm_compile *c,
1033               struct brw_reg *dst,
1034               GLuint dst_flags,
1035               struct brw_reg *arg,
1036               struct brw_reg depth_payload,
1037               GLuint tex_idx,
1038               GLuint sampler)
1039 {
1040    struct brw_compile *p = &c->func;
1041    struct intel_context *intel = &p->brw->intel;
1042    GLuint msgLength;
1043    GLuint msg_type;
1044    GLuint mrf_per_channel;
1045    GLuint response_length;
1046    struct brw_reg dst_retyped;
1047
1048    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1049     * samples, so we'll use the 16-wide instruction, leave the second halves
1050     * undefined, and trust the execution mask to keep the undefined pixels
1051     * from mattering.
1052     */
1053    if (c->dispatch_width == 16 || intel->gen < 5) {
1054       if (intel->gen == 5)
1055          msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1056       else
1057          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1058       mrf_per_channel = 2;
1059       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1060       response_length = 8;
1061    } else {
1062       msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
1063       mrf_per_channel = 1;
1064       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1065       response_length = 4;
1066    }
1067
1068    /* Shadow ignored for txb. */
1069    switch (tex_idx) {
1070    case TEXTURE_1D_INDEX:
1071       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1072       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1073       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1074       break;
1075    case TEXTURE_2D_INDEX:
1076    case TEXTURE_RECT_INDEX:
1077       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1078       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1079       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1080       break;
1081    case TEXTURE_3D_INDEX:
1082    case TEXTURE_CUBE_INDEX:
1083       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1084       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1085       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1086       break;
1087    default:
1088       /* unexpected target */
1089       abort();
1090    }
1091
1092    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1093    msgLength = 2 + 4 * mrf_per_channel - 1;
1094
1095    brw_SAMPLE(p,
1096               dst_retyped,
1097               1,
1098               retype(depth_payload, BRW_REGISTER_TYPE_UW),
1099               SURF_INDEX_TEXTURE(sampler),
1100               sampler,
1101               dst_flags & WRITEMASK_XYZW,
1102               msg_type,
1103               response_length,
1104               msgLength,
1105               0,
1106               1,
1107               BRW_SAMPLER_SIMD_MODE_SIMD16);
1108 }
1109
1110
1111 static void emit_lit(struct brw_wm_compile *c,
1112                      const struct brw_reg *dst,
1113                      GLuint mask,
1114                      const struct brw_reg *arg0)
1115 {
1116    struct brw_compile *p = &c->func;
1117
1118    assert((mask & WRITEMASK_XW) == 0);
1119
1120    if (mask & WRITEMASK_Y) {
1121       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1122       brw_MOV(p, dst[1], arg0[0]);
1123       brw_set_saturate(p, 0);
1124    }
1125
1126    if (mask & WRITEMASK_Z) {
1127       emit_math2(c, BRW_MATH_FUNCTION_POW,
1128                  &dst[2],
1129                  WRITEMASK_X | (mask & SATURATE),
1130                  &arg0[1],
1131                  &arg0[3]);
1132    }
1133
1134    /* Ordinarily you'd use an iff statement to skip or shortcircuit
1135     * some of the POW calculations above, but 16-wide iff statements
1136     * seem to lock c1 hardware, so this is a nasty workaround:
1137     */
1138    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1139    {
1140       if (mask & WRITEMASK_Y)
1141          brw_MOV(p, dst[1], brw_imm_f(0));
1142
1143       if (mask & WRITEMASK_Z)
1144          brw_MOV(p, dst[2], brw_imm_f(0));
1145    }
1146    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1147 }
1148
1149
1150 /* Kill pixel - set execution mask to zero for those pixels which
1151  * fail.
1152  */
1153 static void emit_kil( struct brw_wm_compile *c,
1154                       struct brw_reg *arg0)
1155 {
1156    struct brw_compile *p = &c->func;
1157    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1158    GLuint i, j;
1159
1160    for (i = 0; i < 4; i++) {
1161       /* Check if we've already done the comparison for this reg
1162        * -- common when someone does KIL TEMP.wwww.
1163        */
1164       for (j = 0; j < i; j++) {
1165          if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1166             break;
1167       }
1168       if (j != i)
1169          continue;
1170
1171       brw_push_insn_state(p);
1172       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1173       brw_set_predicate_control_flag_value(p, 0xff);
1174       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1175       brw_AND(p, r0uw, brw_flag_reg(), r0uw);
1176       brw_pop_insn_state(p);
1177    }
1178 }
1179
1180 /* KIL_NV kills the pixels that are currently executing, not based on a test
1181  * of the arguments.
1182  */
1183 static void emit_kil_nv( struct brw_wm_compile *c )
1184 {
1185    struct brw_compile *p = &c->func;
1186    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1187
1188    brw_push_insn_state(p);
1189    brw_set_mask_control(p, BRW_MASK_DISABLE);
1190    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
1191    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
1192    brw_pop_insn_state(p);
1193 }
1194
1195 static void fire_fb_write( struct brw_wm_compile *c,
1196                            GLuint base_reg,
1197                            GLuint nr,
1198                            GLuint target,
1199                            GLuint eot )
1200 {
1201    struct brw_compile *p = &c->func;
1202    struct brw_reg dst;
1203
1204    if (c->dispatch_width == 16)
1205       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1206    else
1207       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1208
1209    /* Pass through control information:
1210     */
1211 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1212    {
1213       brw_push_insn_state(p);
1214       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1215       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1216       brw_MOV(p,
1217                brw_message_reg(base_reg + 1),
1218                brw_vec8_grf(1, 0));
1219       brw_pop_insn_state(p);
1220    }
1221
1222    /* Send framebuffer write message: */
1223 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1224    brw_fb_WRITE(p,
1225                 dst,
1226                 base_reg,
1227                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1228                 target,
1229                 nr,
1230                 0,
1231                 eot);
1232 }
1233
1234
1235 static void emit_aa( struct brw_wm_compile *c,
1236                      struct brw_reg *arg1,
1237                      GLuint reg )
1238 {
1239    struct brw_compile *p = &c->func;
1240    GLuint comp = c->key.aa_dest_stencil_reg / 2;
1241    GLuint off = c->key.aa_dest_stencil_reg % 2;
1242    struct brw_reg aa = offset(arg1[comp], off);
1243
1244    brw_push_insn_state(p);
1245    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1246    brw_MOV(p, brw_message_reg(reg), aa);
1247    brw_pop_insn_state(p);
1248 }
1249
1250
1251 /* Post-fragment-program processing.  Send the results to the
1252  * framebuffer.
1253  * \param arg0  the fragment color
1254  * \param arg1  the pass-through depth value
1255  * \param arg2  the shader-computed depth value
1256  */
1257 void emit_fb_write(struct brw_wm_compile *c,
1258                    struct brw_reg *arg0,
1259                    struct brw_reg *arg1,
1260                    struct brw_reg *arg2,
1261                    GLuint target,
1262                    GLuint eot)
1263 {
1264    struct brw_compile *p = &c->func;
1265    struct brw_context *brw = p->brw;
1266    GLuint nr = 2;
1267    GLuint channel;
1268
1269    /* Reserve a space for AA - may not be needed:
1270     */
1271    if (c->key.aa_dest_stencil_reg)
1272       nr += 1;
1273
1274    /* I don't really understand how this achieves the color interleave
1275     * (ie RGBARGBA) in the result:  [Do the saturation here]
1276     */
1277    brw_push_insn_state(p);
1278
1279    for (channel = 0; channel < 4; channel++) {
1280       if (c->dispatch_width == 16 && brw->has_compr4) {
1281          /* By setting the high bit of the MRF register number, we indicate
1282           * that we want COMPR4 mode - instead of doing the usual destination
1283           * + 1 for the second half we get destination + 4.
1284           */
1285          brw_MOV(p,
1286                  brw_message_reg(nr + channel + (1 << 7)),
1287                  arg0[channel]);
1288       } else {
1289          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1290          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1291          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1292          brw_MOV(p,
1293                  brw_message_reg(nr + channel),
1294                  arg0[channel]);
1295
1296          if (c->dispatch_width == 16) {
1297             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1298             brw_MOV(p,
1299                     brw_message_reg(nr + channel + 4),
1300                     sechalf(arg0[channel]));
1301          }
1302       }
1303    }
1304    /* skip over the regs populated above:
1305     */
1306    nr += 8;
1307    brw_pop_insn_state(p);
1308
1309    if (c->key.source_depth_to_render_target)
1310    {
1311       if (c->key.computes_depth)
1312          brw_MOV(p, brw_message_reg(nr), arg2[2]);
1313       else
1314          brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1315
1316       nr += 2;
1317    }
1318
1319    if (c->key.dest_depth_reg)
1320    {
1321       GLuint comp = c->key.dest_depth_reg / 2;
1322       GLuint off = c->key.dest_depth_reg % 2;
1323
1324       if (off != 0) {
1325          brw_push_insn_state(p);
1326          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1327
1328          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1329          /* 2nd half? */
1330          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1331          brw_pop_insn_state(p);
1332       }
1333       else {
1334          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1335       }
1336       nr += 2;
1337    }
1338
1339    if (!c->key.runtime_check_aads_emit) {
1340       if (c->key.aa_dest_stencil_reg)
1341          emit_aa(c, arg1, 2);
1342
1343       fire_fb_write(c, 0, nr, target, eot);
1344    }
1345    else {
1346       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1347       struct brw_reg ip = brw_ip_reg();
1348       struct brw_instruction *jmp;
1349
1350       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1351       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1352       brw_AND(p,
1353               v1_null_ud,
1354               get_element_ud(brw_vec8_grf(1,0), 6),
1355               brw_imm_ud(1<<26));
1356
1357       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
1358       {
1359          emit_aa(c, arg1, 2);
1360          fire_fb_write(c, 0, nr, target, eot);
1361          /* note - thread killed in subroutine */
1362       }
1363       brw_land_fwd_jump(p, jmp);
1364
1365       /* ELSE: Shuffle up one register to fill in the hole left for AA:
1366        */
1367       fire_fb_write(c, 1, nr-1, target, eot);
1368    }
1369 }
1370
1371 /**
1372  * Move a GPR to scratch memory.
1373  */
1374 static void emit_spill( struct brw_wm_compile *c,
1375                         struct brw_reg reg,
1376                         GLuint slot )
1377 {
1378    struct brw_compile *p = &c->func;
1379
1380    /*
1381      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1382    */
1383    brw_MOV(p, brw_message_reg(2), reg);
1384
1385    /*
1386      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1387      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1388    */
1389    brw_dp_WRITE_16(p,
1390                    retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
1391                    slot);
1392 }
1393
1394
1395 /**
1396  * Load a GPR from scratch memory.
1397  */
1398 static void emit_unspill( struct brw_wm_compile *c,
1399                           struct brw_reg reg,
1400                           GLuint slot )
1401 {
1402    struct brw_compile *p = &c->func;
1403
1404    /* Slot 0 is the undef value.
1405     */
1406    if (slot == 0) {
1407       brw_MOV(p, reg, brw_imm_f(0));
1408       return;
1409    }
1410
1411    /*
1412      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1413      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1414    */
1415
1416    brw_dp_READ_16(p,
1417                   retype(vec16(reg), BRW_REGISTER_TYPE_UW),
1418                   slot);
1419 }
1420
1421
1422 /**
1423  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1424  * Args with unspill_reg != 0 will be loaded from scratch memory.
1425  */
1426 static void get_argument_regs( struct brw_wm_compile *c,
1427                                struct brw_wm_ref *arg[],
1428                                struct brw_reg *regs )
1429 {
1430    GLuint i;
1431
1432    for (i = 0; i < 4; i++) {
1433       if (arg[i]) {
1434          if (arg[i]->unspill_reg)
1435             emit_unspill(c,
1436                          brw_vec8_grf(arg[i]->unspill_reg, 0),
1437                          arg[i]->value->spill_slot);
1438
1439          regs[i] = arg[i]->hw_reg;
1440       }
1441       else {
1442          regs[i] = brw_null_reg();
1443       }
1444    }
1445 }
1446
1447
1448 /**
1449  * For values that have a spill_slot!=0, write those regs to scratch memory.
1450  */
1451 static void spill_values( struct brw_wm_compile *c,
1452                           struct brw_wm_value *values,
1453                           GLuint nr )
1454 {
1455    GLuint i;
1456
1457    for (i = 0; i < nr; i++)
1458       if (values[i].spill_slot)
1459          emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1460 }
1461
/* Presumably the number of message registers -- TODO confirm against users. */
#define BRW_MRF_NUM 16
/* Size of one hardware register in bytes; the helpers below multiply register
 * numbers by it to obtain byte offsets.
 */
#define BRW_SIZE_OF_REG 32
1464
1465 static INLINE
1466 GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
1467 {
1468    switch (inst->header.opcode) {
1469       case BRW_OPCODE_MOV:
1470       case BRW_OPCODE_SEL:
1471       case BRW_OPCODE_NOT:
1472       case BRW_OPCODE_AND:
1473       case BRW_OPCODE_OR:
1474       case BRW_OPCODE_XOR:
1475       case BRW_OPCODE_SHR:
1476       case BRW_OPCODE_SHL:
1477       case BRW_OPCODE_RSR:
1478       case BRW_OPCODE_RSL:
1479       case BRW_OPCODE_ADD:
1480       case BRW_OPCODE_MUL:
1481       case BRW_OPCODE_AVG:
1482       case BRW_OPCODE_FRC:
1483       case BRW_OPCODE_RNDU:
1484       case BRW_OPCODE_RNDD:
1485       case BRW_OPCODE_RNDE:
1486       case BRW_OPCODE_RNDZ:
1487       case BRW_OPCODE_MAC:
1488       case BRW_OPCODE_MACH:
1489       case BRW_OPCODE_LINE:
1490          return GL_TRUE;
1491       default:
1492          return GL_FALSE;
1493    }
1494 }
1495
1496 static const struct {
1497     char    *name;
1498     int     nsrc;
1499     int     ndst;
1500 } inst_opcode[128] = {
1501     [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
1502     [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
1503     [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
1504     [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
1505     [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
1506     [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
1507     [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
1508     [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
1509
1510     [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
1511     [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
1512     [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
1513     [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
1514     [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
1515     [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
1516     [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
1517     [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
1518     [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
1519     [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
1520     [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
1521     [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
1522
1523     [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
1524     [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
1525     [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
1526     [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
1527     [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
1528     [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
1529     [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
1530     [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
1531     [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
1532     [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
1533     [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
1534
1535     [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
1536     [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
1537     [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
1538     [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
1539     [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
1540     [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
1541     [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
1542     [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
1543     [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
1544     [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
1545     [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
1546     [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
1547     [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
1548     [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
1549     [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
1550     [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
1551     [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
1552 };
1553
1554 static const GLuint inst_stride[7] = {
1555     [0] = 0,
1556     [1] = 1,
1557     [2] = 2,
1558     [3] = 4,
1559     [4] = 8,
1560     [5] = 16,
1561     [6] = 32
1562 };
1563
1564 static const GLuint inst_type_size[8] = {
1565     [0] = 4,
1566     [1] = 4,
1567     [2] = 2,
1568     [3] = 2,
1569     [4] = 1,
1570     [5] = 1,
1571     [7] = 4
1572 };
1573
/* Larger / smaller of two byte offsets.  Both are plain expressions (no
 * trailing semicolon), so they can be used inside larger expressions; note
 * that each argument may be evaluated twice.
 */
#define BRW_MAX_OFFSET(x0,x1) ((x0) > (x1) ? (x0) : (x1))
#define BRW_MIN_OFFSET(x0,x1) ((x0) < (x1) ? (x0) : (x1))
1576
1577 static INLINE GLboolean
1578 brw_is_grf_written(const struct brw_instruction *inst,
1579                    int reg_index, int size,
1580                    int gen)
1581 {
1582    if (inst_opcode[inst->header.opcode].ndst == 0)
1583       return GL_FALSE;
1584
1585    if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
1586       if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
1587          return GL_TRUE;
1588
1589    if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
1590       return GL_FALSE;
1591
1592    const int reg_start = reg_index * BRW_SIZE_OF_REG;
1593    const int reg_end = reg_start + size;
1594
1595    const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
1596    const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
1597                          + inst->bits1.da1.dest_subreg_nr;
1598    int length, write_end;
1599
1600    /* SEND is specific */
1601    if (inst->header.opcode == BRW_OPCODE_SEND) {
1602       if (gen >= 5)
1603          length = inst->bits3.generic_gen5.response_length*BRW_SIZE_OF_REG;
1604       else
1605          length = inst->bits3.generic.response_length*BRW_SIZE_OF_REG;
1606    }
1607    else {
1608       length = 1 << inst->header.execution_size;
1609       length *= type_size;
1610       length *= inst->bits1.da1.dest_horiz_stride;
1611    }
1612
1613    /* If the two intervals intersect, we overwrite the register */
1614    write_end = write_start + length;
1615    const int left = BRW_MAX_OFFSET(write_start, reg_start);
1616    const int right = BRW_MIN_OFFSET(write_end, reg_end);
1617
1618    return left < right;
1619 }
1620
1621 /* Specific path for message register since we need to handle the compr4 case */
1622 static INLINE GLboolean
1623 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
1624 {
1625    if (inst_opcode[inst->header.opcode].ndst == 0)
1626       return GL_FALSE;
1627
1628    if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
1629       if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
1630          return GL_TRUE;
1631
1632    if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
1633       return GL_FALSE;
1634
1635    const int reg_start = reg_index * BRW_SIZE_OF_REG;
1636    const int reg_end = reg_start + size;
1637
1638    const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
1639    const int is_compr4 = inst->bits1.da1.dest_reg_nr & 0xf0;
1640    const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
1641
1642    /* We use compr4 with a size != 16 elements. Strange, we conservatively
1643     * consider that we are writing the register.
1644     */
1645    if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
1646       return GL_TRUE;
1647
1648    GLboolean is_written = GL_FALSE;
1649
1650    /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
1651    if (is_compr4) {
1652       const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
1653
1654       /* First 8-way register */
1655       const int write_start0 = mrf_index*BRW_SIZE_OF_REG
1656                              + inst->bits1.da1.dest_subreg_nr;
1657       const int write_end0 = write_start0 + length;
1658
1659       /* Second 8-way register */
1660       const int write_start1 = (mrf_index+4)*BRW_SIZE_OF_REG
1661                              + inst->bits1.da1.dest_subreg_nr;
1662       const int write_end1 = write_start1 + length;
1663
1664       /* If the two intervals intersect, we overwrite the register */
1665       const int left0 = BRW_MAX_OFFSET(write_start0, reg_start);
1666       const int right0 = BRW_MIN_OFFSET(write_end0, reg_end);
1667       const int left1 = BRW_MAX_OFFSET(write_start1, reg_start);
1668       const int right1 = BRW_MIN_OFFSET(write_end1, reg_end);
1669
1670       is_written = left0 < right0 || left1 < right1;
1671    }
1672    else {
1673       int length;
1674       length = 1 << inst->header.execution_size;
1675       length *= type_size;
1676       length *= inst->bits1.da1.dest_horiz_stride;
1677
1678       /* If the two intervals intersect, we write into the register */
1679       const int write_start = inst->bits1.da1.dest_reg_nr*BRW_SIZE_OF_REG
1680                             + inst->bits1.da1.dest_subreg_nr;
1681       const int write_end = write_start + length;
1682       const int left = BRW_MAX_OFFSET(write_start, reg_start);
1683       const int right = BRW_MIN_OFFSET(write_end, reg_end);;
1684
1685       is_written = left < right;
1686    }
1687
1688    /* SEND may perform an implicit mov to a mrf register */
1689    if (is_written == GL_FALSE &&
1690        inst->header.opcode == BRW_OPCODE_SEND &&
1691        inst->bits1.da1.src0_reg_file != 0) {
1692
1693       const int mrf_start = inst->header.destreg__conditionalmod;
1694       const int write_start = mrf_start * BRW_SIZE_OF_REG;
1695       const int write_end = write_start + BRW_SIZE_OF_REG;
1696       const int left = BRW_MAX_OFFSET(write_start, reg_start);
1697       const int right = BRW_MIN_OFFSET(write_end, reg_end);;
1698       is_written = left < right;
1699    }
1700
1701    return is_written;
1702 }
1703
1704 static INLINE GLboolean
1705 brw_is_mrf_read(const struct brw_instruction *inst,
1706                 int reg_index, int size, int gen)
1707 {
1708    if (inst->header.opcode != BRW_OPCODE_SEND)
1709       return GL_FALSE;
1710    if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
1711       return GL_TRUE;
1712
1713    const int reg_start = reg_index*BRW_SIZE_OF_REG;
1714    const int reg_end = reg_start + size;
1715
1716    int length, read_start, read_end;
1717    if (gen >= 5)
1718       length = inst->bits3.generic_gen5.msg_length*BRW_SIZE_OF_REG;
1719    else
1720       length = inst->bits3.generic.msg_length*BRW_SIZE_OF_REG;
1721
1722    /* Look if SEND uses an implicit mov. In that case, we read one less register
1723     * (but we write it)
1724     */
1725    if (inst->bits1.da1.src0_reg_file != 0)
1726       read_start = inst->header.destreg__conditionalmod;
1727    else {
1728       length--;
1729       read_start = inst->header.destreg__conditionalmod + 1;
1730    }
1731    read_start *= BRW_SIZE_OF_REG;
1732    read_end = read_start + length;
1733
1734    const int left = BRW_MAX_OFFSET(read_start, reg_start);
1735    const int right = BRW_MIN_OFFSET(read_end, reg_end);
1736
1737    return left < right;
1738 }
1739
1740 static INLINE GLboolean
1741 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
1742 {
1743    int i, j;
1744    if (inst_opcode[inst->header.opcode].nsrc == 0)
1745       return GL_FALSE;
1746
1747    /* Look at first source. We must take into account register regions to
1748     * monitor carefully the read. Note that we are a bit too conservative here
1749     * since we do not take into account the fact that some complete registers
1750     * may be skipped
1751     */
1752    if (inst_opcode[inst->header.opcode].nsrc >= 1) {
1753
1754       if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
1755          if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
1756             return GL_TRUE;
1757       if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
1758          return GL_FALSE;
1759
1760       const int reg_start = reg_index*BRW_SIZE_OF_REG;
1761       const int reg_end = reg_start + size;
1762
1763       /* See if at least one of this element intersects the interval */
1764       const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
1765       const int elem_num = 1 << inst->header.execution_size;
1766       const int width = 1 << inst->bits2.da1.src0_width;
1767       const int row_num = elem_num >> inst->bits2.da1.src0_width;
1768       const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
1769       const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
1770       int row_start = inst->bits2.da1.src0_reg_nr*BRW_SIZE_OF_REG
1771                     + inst->bits2.da1.src0_subreg_nr;
1772       for (j = 0; j < row_num; ++j) {
1773          int write_start = row_start;
1774          for (i = 0; i < width; ++i) {
1775             const int write_end = write_start + type_size;
1776             const int left = write_start > reg_start ? write_start : reg_start;
1777             const int right = write_end < reg_end ? write_end : reg_end;
1778             if (left < right)
1779                return GL_TRUE;
1780             write_start += hs;
1781          }
1782          row_start += vs;
1783       }
1784    }
1785
1786    /* Second src register */
1787    if (inst_opcode[inst->header.opcode].nsrc >= 2) {
1788
1789       if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
1790          if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
1791             return GL_TRUE;
1792       if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
1793          return GL_FALSE;
1794
1795       const int reg_start = reg_index*BRW_SIZE_OF_REG;
1796       const int reg_end = reg_start + size;
1797
1798       /* See if at least one of this element intersects the interval */
1799       const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
1800       const int elem_num = 1 << inst->header.execution_size;
1801       const int width = 1 << inst->bits3.da1.src1_width;
1802       const int row_num = elem_num >> inst->bits3.da1.src1_width;
1803       const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
1804       const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
1805       int row_start = inst->bits3.da1.src1_reg_nr*BRW_SIZE_OF_REG
1806                     + inst->bits3.da1.src1_subreg_nr;
1807       for (j = 0; j < row_num; ++j) {
1808          int write_start = row_start;
1809          for (i = 0; i < width; ++i) {
1810             const int write_end = write_start + type_size;
1811             const int left = write_start > reg_start ? write_start : reg_start;
1812             const int right = write_end < reg_end ? write_end : reg_end;
1813             if (left < right)
1814                return GL_TRUE;
1815             write_start += hs;
1816          }
1817          row_start += vs;
1818       }
1819    }
1820
1821    return GL_FALSE;
1822 }
1823
1824 static INLINE GLboolean
1825 brw_is_control_done(const struct brw_instruction *mov) {
1826    return
1827        mov->header.dependency_control != 0 ||
1828        mov->header.thread_control != 0 ||
1829        mov->header.mask_control != 0 ||
1830        mov->header.saturate != 0 ||
1831        mov->header.debug_control != 0;
1832 }
1833
1834 static INLINE GLboolean
1835 brw_is_predicated(const struct brw_instruction *mov) {
1836    return mov->header.predicate_control != 0;
1837 }
1838
1839 static INLINE GLboolean
1840 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
1841                       int *mrf_index,
1842                       int *grf_index,
1843                       GLboolean *is_compr4)
1844 {
1845    if (brw_is_predicated(mov) ||
1846        brw_is_control_done(mov) ||
1847        mov->header.debug_control != 0)
1848       return GL_FALSE;
1849
1850    if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
1851        mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
1852        mov->bits1.da1.dest_reg_type != 7 ||
1853        mov->bits1.da1.dest_horiz_stride != 1 ||
1854        mov->bits1.da1.dest_subreg_nr != 0)
1855       return GL_FALSE;
1856
1857    if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
1858        mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
1859        mov->bits1.da1.src0_reg_type != 7 ||
1860        mov->bits2.da1.src0_width != 3 ||
1861        mov->bits2.da1.src0_horiz_stride != 1 ||
1862        mov->bits2.da1.src0_vert_stride != 4 ||
1863        mov->bits2.da1.src0_subreg_nr != 0 ||
1864        mov->bits2.da1.src0_abs != 0 ||
1865        mov->bits2.da1.src0_negate != 0)
1866       return GL_FALSE;
1867
1868    *grf_index = mov->bits2.da1.src0_reg_nr;
1869    *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
1870    *is_compr4 = (mov->bits1.da1.dest_reg_nr & 0xf0) != 0;
1871    return GL_TRUE;
1872 }
1873
1874 static INLINE GLboolean
1875 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
1876 {
1877    /* remark: no problem to predicate a SEL instruction */
1878    if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
1879        brw_is_control_done(inst) == GL_FALSE &&
1880        inst->header.execution_size == 4 &&
1881        inst->header.access_mode == BRW_ALIGN_1 &&
1882        inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
1883        inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
1884        inst->bits1.da1.dest_reg_type == 7 &&
1885        inst->bits1.da1.dest_horiz_stride == 1 &&
1886        inst->bits1.da1.dest_reg_nr == grf_index &&
1887        inst->bits1.da1.dest_subreg_nr == 0 &&
1888        brw_is_arithmetic_inst(inst))
1889       return GL_TRUE;
1890
1891    return GL_FALSE;
1892 }
1893
1894 static INLINE GLboolean
1895 brw_inst_are_equal(const struct brw_instruction *src0,
1896                    const struct brw_instruction *src1)
1897 {
1898    const GLuint *field0 = (GLuint *) src0;
1899    const GLuint *field1 = (GLuint *) src1;
1900    return field0[0] == field1[0] &&
1901           field0[1] == field1[1] &&
1902           field0[2] == field1[2] &&
1903           field0[3] == field1[3];
1904 }
1905
1906 static INLINE void
1907 brw_inst_copy(struct brw_instruction *dst,
1908               const struct brw_instruction *src)
1909 {
1910    GLuint *field_dst = (GLuint *) dst;
1911    const GLuint *field_src = (GLuint *) src;
1912    field_dst[0] = field_src[0];
1913    field_dst[1] = field_src[1];
1914    field_dst[2] = field_src[2];
1915    field_dst[3] = field_src[3];
1916 }
1917
1918 static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
1919 {
1920    int i, nr_insn = 0, to = 0, from = 0;
1921
1922    for (from = 0; from < p->nr_insn; ++from) {
1923       if (removeInst[from])
1924          continue;
1925       if(to != from)
1926          brw_inst_copy(p->store + to, p->store + from);
1927       to++;
1928    }
1929
1930    for (i = 0; i < p->nr_insn; ++i)
1931       if (removeInst[i] == GL_FALSE)
1932          nr_insn++;
1933    p->nr_insn = nr_insn;
1934 }
1935
1936 /* The gen code emitter generates a lot of duplications in the mrf-to-grf moves.
1937  * Here, we monitor same mov mrf-to-grf instrutions and remove them as soon as
1938  * none of the two operands have been written
1939  */
1940 static void brw_remove_duplicate_mrf_moves(struct brw_wm_compile *c)
1941 {
1942    struct brw_compile *p = &c->func;
1943    const int gen = p->brw->intel.gen;
1944    int i, j;
1945
1946    GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
1947    for (i = 0; i < p->nr_insn; i++) {
1948       if (removeInst[i])
1949          continue;
1950
1951       const struct brw_instruction *mov = p->store + i;
1952       int mrf_index, grf_index;
1953       GLboolean is_compr4;
1954
1955       /* Only consider _straight_ grf-to-mrf moves */
1956       if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
1957          continue;
1958
1959       const int mrf_index0 = mrf_index;
1960       const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
1961       const int simd16_size = 2 * BRW_SIZE_OF_REG;
1962
1963       for (j = i + 1; j < p->nr_insn; j++) {
1964          const struct brw_instruction *inst = p->store + j;
1965
1966          if (brw_inst_are_equal(mov, inst)) {
1967             removeInst[j] = GL_TRUE;
1968             continue;
1969          }
1970
1971          if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
1972              brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG) ||
1973              brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG))
1974             break;
1975       }
1976    }
1977
1978    brw_remove_inst(p, removeInst);
1979    free(removeInst);
1980 }
1981
1982 static void brw_remove_mrf_to_grf_moves(struct brw_wm_compile *c)
1983 {
1984    int i, j, prev;
1985    struct brw_compile *p = &c->func;
1986    struct brw_context *brw = p->brw;
1987    const int gen = brw->intel.gen;
1988    const int simd16_size = 2*BRW_SIZE_OF_REG;
1989
1990    if (c->dispatch_width != 16 || brw->has_compr4 == GL_FALSE)
1991       return;
1992
1993    GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
1994    assert(removeInst);
1995
1996    for (i = 0; i < p->nr_insn; i++) {
1997       if (removeInst[i])
1998          continue;
1999
2000       struct brw_instruction *grf_inst = NULL;
2001       const struct brw_instruction *mov = p->store + i;
2002       int mrf_index, grf_index;
2003       GLboolean is_compr4;
2004
2005       /* Only consider _straight_ grf-to-mrf moves */
2006       if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
2007          continue;
2008
2009       /* Using comp4 enables a stride of 4 for this instruction */
2010       const int mrf_index0 = mrf_index;
2011       const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
2012
2013       /* Look where the register has been set */
2014       prev = i;
2015       GLboolean potential_remove = GL_FALSE;
2016       while (prev--) {
2017
2018          /* If _one_ instruction writes the grf, we try to remove the mov */
2019          struct brw_instruction *inst = p->store + prev;
2020          if (brw_is_grf_straight_write(inst, grf_index)) {
2021             potential_remove = GL_TRUE;
2022             grf_inst = inst;
2023             break;
2024          }
2025
2026       }
2027
2028       if (potential_remove == GL_FALSE)
2029          continue;
2030       removeInst[i] = GL_TRUE;
2031
2032       /* Monitor first the section of code between the grf computation and the
2033        * mov. Here we cannot read or write both mrf and grf register
2034        */
2035       for (j = prev + 1; j < i; ++j) {
2036          struct brw_instruction *inst = p->store + j;
2037          if (removeInst[j])
2038             continue;
2039          if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
2040              brw_is_grf_read(inst, grf_index, simd16_size)           ||
2041              brw_is_mrf_written(inst, mrf_index0, BRW_SIZE_OF_REG)   ||
2042              brw_is_mrf_written(inst, mrf_index1, BRW_SIZE_OF_REG)   ||
2043              brw_is_mrf_read(inst, mrf_index0, BRW_SIZE_OF_REG, gen) ||
2044              brw_is_mrf_read(inst, mrf_index1, BRW_SIZE_OF_REG, gen)) {
2045             removeInst[i] = GL_FALSE;
2046             break;
2047          }
2048       }
2049
2050       /* After the mov, we can read or write the mrf. If the grf is overwritten,
2051        * we are done
2052        */
2053       for (j = i + 1; j < p->nr_insn; ++j) {
2054          struct brw_instruction *inst = p->store + j;
2055          if (removeInst[j])
2056             continue;
2057
2058          if (brw_is_grf_read(inst, grf_index, simd16_size)) {
2059             removeInst[i] = GL_FALSE;
2060             break;
2061          }
2062
2063          if (brw_is_grf_straight_write(inst, grf_index))
2064             break;
2065       }
2066
2067       /* Note that with the top down traversal, we can safely pacth the mov
2068        * instruction
2069        */
2070       if (removeInst[i]) {
2071          grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
2072          grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
2073       }
2074    }
2075
2076    brw_remove_inst(p, removeInst);
2077    free(removeInst);
2078 }
2079
2080 /* Emit the fragment program instructions here.
2081  */
2082 void brw_wm_emit( struct brw_wm_compile *c )
2083 {
2084    struct brw_compile *p = &c->func;
2085    GLuint insn;
2086
2087    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2088
2089    /* Check if any of the payload regs need to be spilled:
2090     */
2091    spill_values(c, c->payload.depth, 4);
2092    spill_values(c, c->creg, c->nr_creg);
2093    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
2094
2095
2096    for (insn = 0; insn < c->nr_insns; insn++) {
2097
2098       struct brw_wm_instruction *inst = &c->instruction[insn];
2099       struct brw_reg args[3][4], dst[4];
2100       GLuint i, dst_flags;
2101
2102       /* Get argument regs:
2103        */
2104       for (i = 0; i < 3; i++)
2105          get_argument_regs(c, inst->src[i], args[i]);
2106
2107       /* Get dest regs:
2108        */
2109       for (i = 0; i < 4; i++)
2110          if (inst->dst[i])
2111             dst[i] = inst->dst[i]->hw_reg;
2112          else
2113             dst[i] = brw_null_reg();
2114
2115       /* Flags
2116        */
2117       dst_flags = inst->writemask;
2118       if (inst->saturate)
2119          dst_flags |= SATURATE;
2120
2121       switch (inst->opcode) {
2122          /* Generated instructions for calculating triangle interpolants:
2123           */
2124       case WM_PIXELXY:
2125          emit_pixel_xy(c, dst, dst_flags);
2126          break;
2127
2128       case WM_DELTAXY:
2129          emit_delta_xy(p, dst, dst_flags, args[0]);
2130          break;
2131
2132       case WM_WPOSXY:
2133          emit_wpos_xy(c, dst, dst_flags, args[0]);
2134          break;
2135
2136       case WM_PIXELW:
2137          emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
2138          break;
2139
2140       case WM_LINTERP:
2141          emit_linterp(p, dst, dst_flags, args[0], args[1]);
2142          break;
2143
2144       case WM_PINTERP:
2145          emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
2146          break;
2147
2148       case WM_CINTERP:
2149          emit_cinterp(p, dst, dst_flags, args[0]);
2150          break;
2151
2152       case WM_FB_WRITE:
2153          emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
2154          break;
2155
2156       case WM_FRONTFACING:
2157          emit_frontfacing(p, dst, dst_flags);
2158          break;
2159
2160          /* Straightforward arithmetic:
2161           */
2162       case OPCODE_ADD:
2163          emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2164          break;
2165
2166       case OPCODE_FRC:
2167          emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2168          break;
2169
2170       case OPCODE_FLR:
2171          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2172          break;
2173
2174       case OPCODE_DDX:
2175          emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
2176          break;
2177
2178       case OPCODE_DDY:
2179          emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
2180          break;
2181
2182       case OPCODE_DP3:
2183          emit_dp3(p, dst, dst_flags, args[0], args[1]);
2184          break;
2185
2186       case OPCODE_DP4:
2187          emit_dp4(p, dst, dst_flags, args[0], args[1]);
2188          break;
2189
2190       case OPCODE_DPH:
2191          emit_dph(p, dst, dst_flags, args[0], args[1]);
2192          break;
2193
2194       case OPCODE_TRUNC:
2195          emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2196          break;
2197
2198       case OPCODE_LRP:
2199          emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
2200          break;
2201
2202       case OPCODE_MAD:
2203          emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2204          break;
2205
2206       case OPCODE_MOV:
2207       case OPCODE_SWZ:
2208          emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2209          break;
2210
2211       case OPCODE_MUL:
2212          emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2213          break;
2214
2215       case OPCODE_XPD:
2216          emit_xpd(p, dst, dst_flags, args[0], args[1]);
2217          break;
2218
2219          /* Higher math functions:
2220           */
2221       case OPCODE_RCP:
2222          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
2223          break;
2224
2225       case OPCODE_RSQ:
2226          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
2227          break;
2228
2229       case OPCODE_SIN:
2230          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
2231          break;
2232
2233       case OPCODE_COS:
2234          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
2235          break;
2236
2237       case OPCODE_EX2:
2238          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
2239          break;
2240
2241       case OPCODE_LG2:
2242          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
2243          break;
2244
2245       case OPCODE_SCS:
2246          /* There is an scs math function, but it would need some
2247           * fixup for 16-element execution.
2248           */
2249          if (dst_flags & WRITEMASK_X)
2250             emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
2251          if (dst_flags & WRITEMASK_Y)
2252             emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
2253          break;
2254
2255       case OPCODE_POW:
2256          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
2257          break;
2258
2259          /* Comparisons:
2260           */
2261       case OPCODE_CMP:
2262          emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
2263          break;
2264
2265       case OPCODE_MAX:
2266          emit_max(p, dst, dst_flags, args[0], args[1]);
2267          break;
2268
2269       case OPCODE_MIN:
2270          emit_min(p, dst, dst_flags, args[0], args[1]);
2271          break;
2272
2273       case OPCODE_SLT:
2274          emit_slt(p, dst, dst_flags, args[0], args[1]);
2275          break;
2276
2277       case OPCODE_SLE:
2278          emit_sle(p, dst, dst_flags, args[0], args[1]);
2279         break;
2280       case OPCODE_SGT:
2281          emit_sgt(p, dst, dst_flags, args[0], args[1]);
2282         break;
2283       case OPCODE_SGE:
2284          emit_sge(p, dst, dst_flags, args[0], args[1]);
2285          break;
2286       case OPCODE_SEQ:
2287          emit_seq(p, dst, dst_flags, args[0], args[1]);
2288         break;
2289       case OPCODE_SNE:
2290          emit_sne(p, dst, dst_flags, args[0], args[1]);
2291         break;
2292
2293       case OPCODE_LIT:
2294          emit_lit(c, dst, dst_flags, args[0]);
2295          break;
2296
2297          /* Texturing operations:
2298           */
2299       case OPCODE_TEX:
2300          emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
2301                   inst->tex_idx, inst->tex_unit,
2302                   inst->tex_shadow);
2303          break;
2304
2305       case OPCODE_TXB:
2306          emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
2307                   inst->tex_idx, inst->tex_unit);
2308          break;
2309
2310       case OPCODE_KIL:
2311          emit_kil(c, args[0]);
2312          break;
2313
2314       case OPCODE_KIL_NV:
2315          emit_kil_nv(c);
2316          break;
2317
2318       default:
2319          printf("Unsupported opcode %i (%s) in fragment shader\n",
2320                 inst->opcode, inst->opcode < MAX_OPCODE ?
2321                 _mesa_opcode_string(inst->opcode) :
2322                 "unknown");
2323       }
2324
2325       for (i = 0; i < 4; i++)
2326         if (inst->dst[i] && inst->dst[i]->spill_slot)
2327            emit_spill(c,
2328                       inst->dst[i]->hw_reg,
2329                       inst->dst[i]->spill_slot);
2330    }
2331
2332    /* Only properly tested on ILK */
2333    if (p->brw->intel.gen == 5) {
2334      brw_remove_duplicate_mrf_moves(c);
2335      brw_remove_mrf_to_grf_moves(c);
2336    }
2337
2338    if (INTEL_DEBUG & DEBUG_WM) {
2339       int i;
2340
2341      printf("wm-native:\n");
2342      for (i = 0; i < p->nr_insn; i++)
2343          brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
2344       printf("\n");
2345    }
2346 }
2347