src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "brw_context.h"
   4 #include "brw_eu.h"
   5 #include "brw_wm.h"
   6
   7 enum _subroutine {
   8     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
   9 };
  10
  11
  12 /**
  13  * Determine if the given fragment program uses GLSL features such
  14  * as flow conditionals, loops, subroutines.
  15  * Some GLSL shaders may use these features, others might not.
  16  */
  17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  18 {
  19     int i;
  20     for (i = 0; i < fp->Base.NumInstructions; i++) {
  21         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  22         switch (inst->Opcode) {
  23             case OPCODE_IF:
  24             case OPCODE_TRUNC:
  25             case OPCODE_ENDIF:
  26             case OPCODE_CAL:
  27             case OPCODE_BRK:
  28             case OPCODE_RET:
  29             case OPCODE_DDX:
  30             case OPCODE_DDY:
  31             case OPCODE_NOISE1:
  32             case OPCODE_NOISE2:
  33             case OPCODE_NOISE3:
  34             case OPCODE_NOISE4:
  35             case OPCODE_BGNLOOP:
  36                 return GL_TRUE;
  37             default:
  38                 break;
  39         }
  40     }
  41     return GL_FALSE;
  42 }
  43
  44
  45 /**
  46  * Record the mapping of a Mesa register to a hardware register.
  47  */
  48 static void set_reg(struct brw_wm_compile *c, int file, int index,
  49         int component, struct brw_reg reg)
  50 {
  51     c->wm_regs[file][index][component].reg = reg;
  52     c->wm_regs[file][index][component].inited = GL_TRUE;
  53 }
  54
  55 /**
  56  * Examine instruction's write mask to find index of first component
  57  * enabled for writing.
  58  */
  59 static int get_scalar_dst_index(struct prog_instruction *inst)
  60 {
  61     int i;
  62     for (i = 0; i < 4; i++)
  63         if (inst->DstReg.WriteMask & (1<<i))
  64             break;
  65     return i;
  66 }
  67
  68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
  69 {
  70     struct brw_reg reg;
  71     if(c->tmp_index == c->tmp_max)
  72         c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
  73
  74     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
  75     return reg;
  76 }
  77
  78 /**
  79  * Save current temp register info.
  80  * There must be a matching call to release_tmps().
  81  */
  82 static int mark_tmps(struct brw_wm_compile *c)
  83 {
  84     return c->tmp_index;
  85 }
  86
  87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
  88 {
  89     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
  90 }
  91
  92 static void release_tmps(struct brw_wm_compile *c, int mark)
  93 {
  94     c->tmp_index = mark;
  95 }
  96
  97 /**
  98  * Convert Mesa src register to brw register.
  99  *
 100  * Since we're running in SOA mode each Mesa register corresponds to four
 101  * hardware registers.  We allocate the hardware registers as needed here.
 102  *
 103  * \param file  register file, one of PROGRAM_x
 104  * \param index  register number
 105  * \param component  src component (X=0, Y=1, Z=2, W=3)
 106  * \param nr  not used?!?
 107  * \param neg  negate value?
 108  * \param abs  take absolute value?
 109  */
 110 static struct brw_reg
 111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 112         int nr, GLuint neg, GLuint abs)
 113 {
 114     struct brw_reg reg;
 115     switch (file) {
 116         case PROGRAM_STATE_VAR:
 117         case PROGRAM_CONSTANT:
 118         case PROGRAM_UNIFORM:
 119             file = PROGRAM_STATE_VAR;
 120             break;
 121         case PROGRAM_UNDEFINED:
 122             return brw_null_reg();
 123         case PROGRAM_TEMPORARY:
 124         case PROGRAM_INPUT:
 125         case PROGRAM_OUTPUT:
 126         case PROGRAM_PAYLOAD:
 127             break;
 128         default:
 129             _mesa_problem(NULL, "Unexpected file in get_reg()");
 130             return brw_null_reg();
 131     }
 132
 133     /* see if we've already allocated a HW register for this Mesa register */
 134     if (c->wm_regs[file][index][component].inited) {
 135         /* yes, re-use */
 136         reg = c->wm_regs[file][index][component].reg;
 137     }
 138     else {
 139         /* no, allocate new register */
 140         reg = brw_vec8_grf(c->reg_index, 0);
 141     }
 142
 143     /* if this is a new register allocation, record it in the table */
 144     if (!c->wm_regs[file][index][component].inited) {
 145         set_reg(c, file, index, component, reg);
 146         c->reg_index++;
 147     }
 148
 149     if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
 150         /* ran out of temporary registers! */
 151 #if 1
 152         /* This is a big hack for now.
 153          * Return bad register index, just don't hang the GPU.
 154          */
 155         _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
 156         c->reg_index = BRW_WM_MAX_GRF - 13;
 157 #else
 158         return brw_null_reg();
 159 #endif
 160     }
 161
 162     if (neg & (1 << component)) {
 163         reg = negate(reg);
 164     }
 165     if (abs)
 166         reg = brw_abs(reg);
 167     return reg;
 168 }
 169
 170
 171 /**
 172  * Preallocate registers.  This sets up the Mesa to hardware register
 173  * mapping for certain registers, such as constants (uniforms/state vars)
 174  * and shader inputs.
 175  */
 176 static void prealloc_reg(struct brw_wm_compile *c)
 177 {
 178     int i, j;
 179     struct brw_reg reg;
 180     int nr_interp_regs = 0;
 181     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 182
 183     for (i = 0; i < 4; i++) {
 184         if (i < c->key.nr_depth_regs)
 185             reg = brw_vec8_grf(i * 2, 0);
 186         else
 187             reg = brw_vec8_grf(0, 0);
 188         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 189     }
 190     c->reg_index += 2 * c->key.nr_depth_regs;
 191
 192     /* constants */
 193     {
 194         const int nr_params = c->fp->program.Base.Parameters->NumParameters;
 195
 196         if (1 /* XXX threshold: nr_params <= 8 */) {
 197            const struct gl_program_parameter_list *plist =
 198               c->fp->program.Base.Parameters;
 199            int index = 0;
 200
 201            /* number of float constants in CURBE */
 202            c->prog_data.nr_params = 4 * nr_params;
 203
 204            /* loop over program constants (float[4]) */
 205            for (i = 0; i < nr_params; i++) {
 206               /* loop over XYZW channels */
 207               for (j = 0; j < 4; j++, index++) {
 208                  reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
 209                  /* Save pointer to parameter/constant value.
 210                   * Constants will be copied in prepare_constant_buffer()
 211                   */
 212                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 213                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 214               }
 215            }
 216            /* number of constant regs used (each reg is float[8]) */
 217            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 218            c->reg_index += c->nr_creg;
 219         }
 220         else {
 221            /* number of float constants in CURBE */
 222            c->prog_data.nr_params = 0;
 223
 224            /* When there's a lot of FP constanst we'll store them in a
 225             * texture-like buffer instead of using the CURBE buffer.
 226             * This means we won't use GRF registers for constants and we'll
 227             * have to fetch constants with a dataport read.
 228             */
 229         }
 230     }
 231
 232     /* fragment shader inputs */
 233     for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
 234         if (inputs & (1<<i)) {
 235             nr_interp_regs++;
 236             reg = brw_vec8_grf(c->reg_index, 0);
 237             for (j = 0; j < 4; j++)
 238                 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
 239             c->reg_index += 2;
 240         }
 241     }
 242
 243     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 244     c->prog_data.urb_read_length = nr_interp_regs * 2;
 245     c->prog_data.curb_read_length = c->nr_creg;
 246     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
 247     c->reg_index++;
 248     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
 249     c->reg_index += 2;
 250 }
 251
 252
 253 /**
 254  * Convert Mesa dst register to brw register.
 255  */
 256 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 257                                   const struct prog_instruction *inst,
 258                                   GLuint component)
 259 {
 260     const int nr = 1;
 261     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 262             0, 0);
 263 }
 264
 265
 266 /**
 267  * Convert Mesa src register to brw register.
 268  */
 269 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 270                                   const struct prog_instruction *inst,
 271                                   GLuint srcRegIndex, GLuint channel)
 272 {
 273     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 274     const GLuint nr = 1;
 275     const GLuint component = GET_SWZ(src->Swizzle, channel);
 276
 277     return get_reg(c, src->File, src->Index, component, nr,
 278                    src->NegateBase, src->Abs);
 279 }
 280
 281
 282 /**
 283  * Same as \sa get_src_reg() but if the register is a literal, emit
 284  * a brw_reg encoding the literal.
 285  * Note that a brw instruction only allows one src operand to be a literal.
 286  * For instructions with more than one operand, only the second can be a literal.
 287  */
 288 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 289                                       const struct prog_instruction *inst,
 290                                       GLuint srcRegIndex, GLuint channel)
 291 {
 292     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 293     if (src->File == PROGRAM_CONSTANT) {
 294        /* a literal */
 295        const int component = GET_SWZ(src->Swizzle, channel);
 296        const GLfloat *param =
 297           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 298        GLfloat value = param[component];
 299        if (src->NegateBase)
 300           value = -value;
 301        if (src->Abs)
 302           value = FABSF(value);
 303        return brw_imm_f(value);
 304     }
 305     else {
 306        return get_src_reg(c, inst, srcRegIndex, channel);
 307     }
 308 }
 309
 310
 311 /**
 312  * Subroutines are minimal support for resusable instruction sequences.
 313  * They are implemented as simply as possible to minimise overhead: there
 314  * is no explicit support for communication between the caller and callee
 315  * other than saving the return address in a temporary register, nor is
 316  * there any automatic local storage.  This implies that great care is
 317  * required before attempting reentrancy or any kind of nested
 318  * subroutine invocations.
 319  */
 320 static void invoke_subroutine( struct brw_wm_compile *c,
 321                                enum _subroutine subroutine,
 322                                void (*emit)( struct brw_wm_compile * ) )
 323 {
 324     struct brw_compile *p = &c->func;
 325
 326     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 327
 328     if( c->subroutines[ subroutine ] ) {
 329         /* subroutine previously emitted: reuse existing instructions */
 330
 331         int mark = mark_tmps( c );
 332         struct brw_reg return_address = retype( alloc_tmp( c ),
 333                                                 BRW_REGISTER_TYPE_UD );
 334         int here = p->nr_insn;
 335
 336         brw_push_insn_state(p);
 337         brw_set_mask_control(p, BRW_MASK_DISABLE);
 338         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 339
 340         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 341                  brw_imm_d( ( c->subroutines[ subroutine ] -
 342                               here - 1 ) << 4 ) );
 343         brw_pop_insn_state(p);
 344
 345         release_tmps( c, mark );
 346     } else {
 347         /* previously unused subroutine: emit, and mark for later reuse */
 348
 349         int mark = mark_tmps( c );
 350         struct brw_reg return_address = retype( alloc_tmp( c ),
 351                                                 BRW_REGISTER_TYPE_UD );
 352         struct brw_instruction *calc;
 353         int base = p->nr_insn;
 354
 355         brw_push_insn_state(p);
 356         brw_set_mask_control(p, BRW_MASK_DISABLE);
 357         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 358         brw_pop_insn_state(p);
 359
 360         c->subroutines[ subroutine ] = p->nr_insn;
 361
 362         emit( c );
 363
 364         brw_push_insn_state(p);
 365         brw_set_mask_control(p, BRW_MASK_DISABLE);
 366         brw_MOV( p, brw_ip_reg(), return_address );
 367         brw_pop_insn_state(p);
 368
 369         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 370
 371         release_tmps( c, mark );
 372     }
 373 }
 374
 375 static void emit_abs( struct brw_wm_compile *c,
 376                 struct prog_instruction *inst)
 377 {
 378     int i;
 379     struct brw_compile *p = &c->func;
 380     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 381     for (i = 0; i < 4; i++) {
 382         if (inst->DstReg.WriteMask & (1<<i)) {
 383             struct brw_reg src, dst;
 384             dst = get_dst_reg(c, inst, i);
 385             src = get_src_reg(c, inst, 0, i);
 386             brw_MOV(p, dst, brw_abs(src));
 387         }
 388     }
 389     brw_set_saturate(p, 0);
 390 }
 391
 392 static void emit_trunc( struct brw_wm_compile *c,
 393                 struct prog_instruction *inst)
 394 {
 395     int i;
 396     struct brw_compile *p = &c->func;
 397     GLuint mask = inst->DstReg.WriteMask;
 398     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 399     for (i = 0; i < 4; i++) {
 400         if (mask & (1<<i)) {
 401             struct brw_reg src, dst;
 402             dst = get_dst_reg(c, inst, i);
 403             src = get_src_reg(c, inst, 0, i);
 404             brw_RNDZ(p, dst, src);
 405         }
 406     }
 407     brw_set_saturate(p, 0);
 408 }
 409
 410 static void emit_mov( struct brw_wm_compile *c,
 411                 struct prog_instruction *inst)
 412 {
 413     int i;
 414     struct brw_compile *p = &c->func;
 415     GLuint mask = inst->DstReg.WriteMask;
 416     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 417     for (i = 0; i < 4; i++) {
 418         if (mask & (1<<i)) {
 419             struct brw_reg src, dst;
 420             dst = get_dst_reg(c, inst, i);
 421             src = get_src_reg_imm(c, inst, 0, i);
 422             brw_MOV(p, dst, src);
 423         }
 424     }
 425     brw_set_saturate(p, 0);
 426 }
 427
 428 static void emit_pixel_xy(struct brw_wm_compile *c,
 429                 struct prog_instruction *inst)
 430 {
 431     struct brw_reg r1 = brw_vec1_grf(1, 0);
 432     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 433
 434     struct brw_reg dst0, dst1;
 435     struct brw_compile *p = &c->func;
 436     GLuint mask = inst->DstReg.WriteMask;
 437
 438     dst0 = get_dst_reg(c, inst, 0);
 439     dst1 = get_dst_reg(c, inst, 1);
 440     /* Calculate pixel centers by adding 1 or 0 to each of the
 441      * micro-tile coordinates passed in r1.
 442      */
 443     if (mask & WRITEMASK_X) {
 444         brw_ADD(p,
 445                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 446                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 447                 brw_imm_v(0x10101010));
 448     }
 449
 450     if (mask & WRITEMASK_Y) {
 451         brw_ADD(p,
 452                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 453                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 454                 brw_imm_v(0x11001100));
 455     }
 456 }
 457
 458 static void emit_delta_xy(struct brw_wm_compile *c,
 459                           struct prog_instruction *inst)
 460 {
 461     struct brw_reg r1 = brw_vec1_grf(1, 0);
 462     struct brw_reg dst0, dst1, src0, src1;
 463     struct brw_compile *p = &c->func;
 464     GLuint mask = inst->DstReg.WriteMask;
 465
 466     dst0 = get_dst_reg(c, inst, 0);
 467     dst1 = get_dst_reg(c, inst, 1);
 468     src0 = get_src_reg(c, inst, 0, 0);
 469     src1 = get_src_reg(c, inst, 0, 1);
 470     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 471      * centers.
 472      */
 473     if (mask & WRITEMASK_X) {
 474         brw_ADD(p,
 475                 dst0,
 476                 retype(src0, BRW_REGISTER_TYPE_UW),
 477                 negate(r1));
 478     }
 479
 480     if (mask & WRITEMASK_Y) {
 481         brw_ADD(p,
 482                 dst1,
 483                 retype(src1, BRW_REGISTER_TYPE_UW),
 484                 negate(suboffset(r1,1)));
 485
 486     }
 487 }
 488
 489 static void fire_fb_write( struct brw_wm_compile *c,
 490                            GLuint base_reg,
 491                            GLuint nr,
 492                            GLuint target,
 493                            GLuint eot)
 494 {
 495     struct brw_compile *p = &c->func;
 496     /* Pass through control information:
 497      */
 498     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 499     {
 500         brw_push_insn_state(p);
 501         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 502         brw_MOV(p,
 503                 brw_message_reg(base_reg + 1),
 504                 brw_vec8_grf(1, 0));
 505         brw_pop_insn_state(p);
 506     }
 507     /* Send framebuffer write message: */
 508     brw_fb_WRITE(p,
 509             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 510             base_reg,
 511             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 512             target,
 513             nr,
 514             0,
 515             eot);
 516 }
 517
 518 static void emit_fb_write(struct brw_wm_compile *c,
 519                 struct prog_instruction *inst)
 520 {
 521     struct brw_compile *p = &c->func;
 522     int nr = 2;
 523     int channel;
 524     GLuint target, eot;
 525     struct brw_reg src0;
 526
 527     /* Reserve a space for AA - may not be needed:
 528      */
 529     if (c->key.aa_dest_stencil_reg)
 530         nr += 1;
 531
 532     brw_push_insn_state(p);
 533     for (channel = 0; channel < 4; channel++) {
 534         src0 = get_src_reg(c,  inst, 0, channel);
 535         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 536         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 537         brw_MOV(p, brw_message_reg(nr + channel), src0);
 538     }
 539     /* skip over the regs populated above: */
 540     nr += 8;
 541     brw_pop_insn_state(p);
 542
 543     if (c->key.source_depth_to_render_target) {
 544        if (c->key.computes_depth) {
 545           src0 = get_src_reg(c, inst, 2, 2);
 546           brw_MOV(p, brw_message_reg(nr), src0);
 547        }
 548        else {
 549           src0 = get_src_reg(c, inst, 1, 1);
 550           brw_MOV(p, brw_message_reg(nr), src0);
 551        }
 552
 553        nr += 2;
 554     }
 555
 556     if (c->key.dest_depth_reg) {
 557         GLuint comp = c->key.dest_depth_reg / 2;
 558         GLuint off = c->key.dest_depth_reg % 2;
 559
 560         assert(comp == 1);
 561         assert(off == 0);
 562 #if 0
 563         /* XXX do we need this code?   comp always 1, off always 0, it seems */
 564         if (off != 0) {
 565             brw_push_insn_state(p);
 566             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 567
 568             brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
 569             /* 2nd half? */
 570             brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
 571             brw_pop_insn_state(p);
 572         }
 573         else
 574 #endif
 575         {
 576            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 577            brw_MOV(p, brw_message_reg(nr), src);
 578         }
 579         nr += 2;
 580    }
 581
 582     target = inst->Aux >> 1;
 583     eot = inst->Aux & 1;
 584     fire_fb_write(c, 0, nr, target, eot);
 585 }
 586
 587 static void emit_pixel_w( struct brw_wm_compile *c,
 588                 struct prog_instruction *inst)
 589 {
 590     struct brw_compile *p = &c->func;
 591     GLuint mask = inst->DstReg.WriteMask;
 592     if (mask & WRITEMASK_W) {
 593         struct brw_reg dst, src0, delta0, delta1;
 594         struct brw_reg interp3;
 595
 596         dst = get_dst_reg(c, inst, 3);
 597         src0 = get_src_reg(c, inst, 0, 0);
 598         delta0 = get_src_reg(c, inst, 1, 0);
 599         delta1 = get_src_reg(c, inst, 1, 1);
 600
 601         interp3 = brw_vec1_grf(src0.nr+1, 4);
 602         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 603          * result straight into a message reg.
 604          */
 605         brw_LINE(p, brw_null_reg(), interp3, delta0);
 606         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 607
 608         /* Calc w */
 609         brw_math_16( p, dst,
 610                 BRW_MATH_FUNCTION_INV,
 611                 BRW_MATH_SATURATE_NONE,
 612                 2, brw_null_reg(),
 613                 BRW_MATH_PRECISION_FULL);
 614     }
 615 }
 616
 617 static void emit_linterp(struct brw_wm_compile *c,
 618                 struct prog_instruction *inst)
 619 {
 620     struct brw_compile *p = &c->func;
 621     GLuint mask = inst->DstReg.WriteMask;
 622     struct brw_reg interp[4];
 623     struct brw_reg dst, delta0, delta1;
 624     struct brw_reg src0;
 625     GLuint nr, i;
 626
 627     src0 = get_src_reg(c, inst, 0, 0);
 628     delta0 = get_src_reg(c, inst, 1, 0);
 629     delta1 = get_src_reg(c, inst, 1, 1);
 630     nr = src0.nr;
 631
 632     interp[0] = brw_vec1_grf(nr, 0);
 633     interp[1] = brw_vec1_grf(nr, 4);
 634     interp[2] = brw_vec1_grf(nr+1, 0);
 635     interp[3] = brw_vec1_grf(nr+1, 4);
 636
 637     for(i = 0; i < 4; i++ ) {
 638         if (mask & (1<<i)) {
 639             dst = get_dst_reg(c, inst, i);
 640             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 641             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 642         }
 643     }
 644 }
 645
 646 static void emit_cinterp(struct brw_wm_compile *c,
 647                 struct prog_instruction *inst)
 648 {
 649     struct brw_compile *p = &c->func;
 650     GLuint mask = inst->DstReg.WriteMask;
 651
 652     struct brw_reg interp[4];
 653     struct brw_reg dst, src0;
 654     GLuint nr, i;
 655
 656     src0 = get_src_reg(c, inst, 0, 0);
 657     nr = src0.nr;
 658
 659     interp[0] = brw_vec1_grf(nr, 0);
 660     interp[1] = brw_vec1_grf(nr, 4);
 661     interp[2] = brw_vec1_grf(nr+1, 0);
 662     interp[3] = brw_vec1_grf(nr+1, 4);
 663
 664     for(i = 0; i < 4; i++ ) {
 665         if (mask & (1<<i)) {
 666             dst = get_dst_reg(c, inst, i);
 667             brw_MOV(p, dst, suboffset(interp[i],3));
 668         }
 669     }
 670 }
 671
 672 static void emit_pinterp(struct brw_wm_compile *c,
 673                 struct prog_instruction *inst)
 674 {
 675     struct brw_compile *p = &c->func;
 676     GLuint mask = inst->DstReg.WriteMask;
 677
 678     struct brw_reg interp[4];
 679     struct brw_reg dst, delta0, delta1;
 680     struct brw_reg src0, w;
 681     GLuint nr, i;
 682
 683     src0 = get_src_reg(c, inst, 0, 0);
 684     delta0 = get_src_reg(c, inst, 1, 0);
 685     delta1 = get_src_reg(c, inst, 1, 1);
 686     w = get_src_reg(c, inst, 2, 3);
 687     nr = src0.nr;
 688
 689     interp[0] = brw_vec1_grf(nr, 0);
 690     interp[1] = brw_vec1_grf(nr, 4);
 691     interp[2] = brw_vec1_grf(nr+1, 0);
 692     interp[3] = brw_vec1_grf(nr+1, 4);
 693
 694     for(i = 0; i < 4; i++ ) {
 695         if (mask & (1<<i)) {
 696             dst = get_dst_reg(c, inst, i);
 697             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 698             brw_MAC(p, dst, suboffset(interp[i],1),
 699                     delta1);
 700             brw_MUL(p, dst, dst, w);
 701         }
 702     }
 703 }
 704
 705 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 706 static void emit_frontfacing(struct brw_wm_compile *c,
 707                              struct prog_instruction *inst)
 708 {
 709     struct brw_compile *p = &c->func;
 710     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 711     struct brw_reg dst;
 712     GLuint mask = inst->DstReg.WriteMask;
 713     int i;
 714
 715     for (i = 0; i < 4; i++) {
 716         if (mask & (1<<i)) {
 717             dst = get_dst_reg(c, inst, i);
 718             brw_MOV(p, dst, brw_imm_f(0.0));
 719         }
 720     }
 721
 722     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 723      * us front face
 724      */
 725     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 726     for (i = 0; i < 4; i++) {
 727         if (mask & (1<<i)) {
 728             dst = get_dst_reg(c, inst, i);
 729             brw_MOV(p, dst, brw_imm_f(1.0));
 730         }
 731     }
 732     brw_set_predicate_control_flag_value(p, 0xff);
 733 }
 734
 735 static void emit_xpd(struct brw_wm_compile *c,
 736                 struct prog_instruction *inst)
 737 {
 738     int i;
 739     struct brw_compile *p = &c->func;
 740     GLuint mask = inst->DstReg.WriteMask;
 741     for (i = 0; i < 4; i++) {
 742         GLuint i2 = (i+2)%3;
 743         GLuint i1 = (i+1)%3;
 744         if (mask & (1<<i)) {
 745             struct brw_reg src0, src1, dst;
 746             dst = get_dst_reg(c, inst, i);
 747             src0 = negate(get_src_reg(c, inst, 0, i2));
 748             src1 = get_src_reg_imm(c, inst, 1, i1);
 749             brw_MUL(p, brw_null_reg(), src0, src1);
 750             src0 = get_src_reg(c, inst, 0, i1);
 751             src1 = get_src_reg_imm(c, inst, 1, i2);
 752             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 753             brw_MAC(p, dst, src0, src1);
 754             brw_set_saturate(p, 0);
 755         }
 756     }
 757     brw_set_saturate(p, 0);
 758 }
 759
 760 static void emit_dp3(struct brw_wm_compile *c,
 761                 struct prog_instruction *inst)
 762 {
 763     struct brw_reg src0[3], src1[3], dst;
 764     int i;
 765     struct brw_compile *p = &c->func;
 766     for (i = 0; i < 3; i++) {
 767         src0[i] = get_src_reg(c, inst, 0, i);
 768         src1[i] = get_src_reg_imm(c, inst, 1, i);
 769     }
 770
 771     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
 772     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
 773     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
 774     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 775     brw_MAC(p, dst, src0[2], src1[2]);
 776     brw_set_saturate(p, 0);
 777 }
 778
 779 static void emit_dp4(struct brw_wm_compile *c,
 780                 struct prog_instruction *inst)
 781 {
 782     struct brw_reg src0[4], src1[4], dst;
 783     int i;
 784     struct brw_compile *p = &c->func;
 785     for (i = 0; i < 4; i++) {
 786         src0[i] = get_src_reg(c, inst, 0, i);
 787         src1[i] = get_src_reg_imm(c, inst, 1, i);
 788     }
 789     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
 790     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
 791     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
 792     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
 793     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 794     brw_MAC(p, dst, src0[3], src1[3]);
 795     brw_set_saturate(p, 0);
 796 }
 797
 798 static void emit_dph(struct brw_wm_compile *c,
 799                 struct prog_instruction *inst)
 800 {
 801     struct brw_reg src0[4], src1[4], dst;
 802     int i;
 803     struct brw_compile *p = &c->func;
 804     for (i = 0; i < 4; i++) {
 805         src0[i] = get_src_reg(c, inst, 0, i);
 806         src1[i] = get_src_reg_imm(c, inst, 1, i);
 807     }
 808     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
 809     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
 810     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
 811     brw_MAC(p, dst, src0[2], src1[2]);
 812     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 813     brw_ADD(p, dst, dst, src1[3]);
 814     brw_set_saturate(p, 0);
 815 }
 816
 817 /**
 818  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
 819  * Note that the result of the function is smeared across the dest
 820  * register's X, Y, Z and W channels (subject to writemasking of course).
 821  */
 822 static void emit_math1(struct brw_wm_compile *c,
 823                 struct prog_instruction *inst, GLuint func)
 824 {
 825     struct brw_compile *p = &c->func;
 826     struct brw_reg src0, dst, tmp;
 827     const int mark = mark_tmps( c );
 828     int i;
 829
 830     tmp = alloc_tmp(c);
 831
 832     /* Get first component of source register */
 833     src0 = get_src_reg(c, inst, 0, 0);
 834
 835     /* tmp = func(src0) */
 836     brw_MOV(p, brw_message_reg(2), src0);
 837     brw_math(p,
 838              tmp,
 839              func,
 840              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 841              2,
 842              brw_null_reg(),
 843              BRW_MATH_DATA_VECTOR,
 844              BRW_MATH_PRECISION_FULL);
 845
 846     /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
 847
 848     /* replicate tmp value across enabled dest channels */
 849     for (i = 0; i < 4; i++) {
 850        if (inst->DstReg.WriteMask & (1 << i)) {
 851           dst = get_dst_reg(c, inst, i);
 852           brw_MOV(p, dst, tmp);
 853        }
 854     }
 855
 856     release_tmps(c, mark);
 857 }
 858
 859 static void emit_rcp(struct brw_wm_compile *c,
 860                 struct prog_instruction *inst)
 861 {
 862     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
 863 }
 864
 865 static void emit_rsq(struct brw_wm_compile *c,
 866                 struct prog_instruction *inst)
 867 {
 868     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
 869 }
 870
 871 static void emit_sin(struct brw_wm_compile *c,
 872                 struct prog_instruction *inst)
 873 {
 874     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
 875 }
 876
 877 static void emit_cos(struct brw_wm_compile *c,
 878                 struct prog_instruction *inst)
 879 {
 880     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
 881 }
 882
 883 static void emit_ex2(struct brw_wm_compile *c,
 884                 struct prog_instruction *inst)
 885 {
 886     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
 887 }
 888
 889 static void emit_lg2(struct brw_wm_compile *c,
 890                 struct prog_instruction *inst)
 891 {
 892     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
 893 }
 894
 895 static void emit_add(struct brw_wm_compile *c,
 896                 struct prog_instruction *inst)
 897 {
 898     struct brw_compile *p = &c->func;
 899     struct brw_reg src0, src1, dst;
 900     GLuint mask = inst->DstReg.WriteMask;
 901     int i;
 902     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 903     for (i = 0 ; i < 4; i++) {
 904         if (mask & (1<<i)) {
 905             dst = get_dst_reg(c, inst, i);
 906             src0 = get_src_reg(c, inst, 0, i);
 907             src1 = get_src_reg_imm(c, inst, 1, i);
 908             brw_ADD(p, dst, src0, src1);
 909         }
 910     }
 911     brw_set_saturate(p, 0);
 912 }
 913
 914 static void emit_arl(struct brw_wm_compile *c,
 915                      struct prog_instruction *inst)
 916 {
 917     struct brw_compile *p = &c->func;
 918     struct brw_reg src0, addr_reg;
 919     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 920     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 921                            BRW_ARF_ADDRESS, 0);
 922     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
 923     brw_MOV(p, addr_reg, src0);
 924     brw_set_saturate(p, 0);
 925 }
 926
 927 static void emit_sub(struct brw_wm_compile *c,
 928                 struct prog_instruction *inst)
 929 {
 930     struct brw_compile *p = &c->func;
 931     struct brw_reg src0, src1, dst;
 932     GLuint mask = inst->DstReg.WriteMask;
 933     int i;
 934     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 935     for (i = 0 ; i < 4; i++) {
 936         if (mask & (1<<i)) {
 937             dst = get_dst_reg(c, inst, i);
 938             src0 = get_src_reg(c, inst, 0, i);
 939             src1 = get_src_reg_imm(c, inst, 1, i);
 940             brw_ADD(p, dst, src0, negate(src1));
 941         }
 942     }
 943     brw_set_saturate(p, 0);
 944 }
 945
 946 static void emit_mul(struct brw_wm_compile *c,
 947                 struct prog_instruction *inst)
 948 {
 949     struct brw_compile *p = &c->func;
 950     struct brw_reg src0, src1, dst;
 951     GLuint mask = inst->DstReg.WriteMask;
 952     int i;
 953     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 954     for (i = 0 ; i < 4; i++) {
 955         if (mask & (1<<i)) {
 956             dst = get_dst_reg(c, inst, i);
 957             src0 = get_src_reg(c, inst, 0, i);
 958             src1 = get_src_reg_imm(c, inst, 1, i);
 959             brw_MUL(p, dst, src0, src1);
 960         }
 961     }
 962     brw_set_saturate(p, 0);
 963 }
 964
 965 static void emit_frc(struct brw_wm_compile *c,
 966                 struct prog_instruction *inst)
 967 {
 968     struct brw_compile *p = &c->func;
 969     struct brw_reg src0, dst;
 970     GLuint mask = inst->DstReg.WriteMask;
 971     int i;
 972     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 973     for (i = 0 ; i < 4; i++) {
 974         if (mask & (1<<i)) {
 975             dst = get_dst_reg(c, inst, i);
 976             src0 = get_src_reg_imm(c, inst, 0, i);
 977             brw_FRC(p, dst, src0);
 978         }
 979     }
 980     if (inst->SaturateMode != SATURATE_OFF)
 981         brw_set_saturate(p, 0);
 982 }
 983
 984 static void emit_flr(struct brw_wm_compile *c,
 985                 struct prog_instruction *inst)
 986 {
 987     struct brw_compile *p = &c->func;
 988     struct brw_reg src0, dst;
 989     GLuint mask = inst->DstReg.WriteMask;
 990     int i;
 991     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 992     for (i = 0 ; i < 4; i++) {
 993         if (mask & (1<<i)) {
 994             dst = get_dst_reg(c, inst, i);
 995             src0 = get_src_reg_imm(c, inst, 0, i);
 996             brw_RNDD(p, dst, src0);
 997         }
 998     }
 999     brw_set_saturate(p, 0);
1000 }
1001
1002 static void emit_max(struct brw_wm_compile *c,
1003                 struct prog_instruction *inst)
1004 {
1005     struct brw_compile *p = &c->func;
1006     GLuint mask = inst->DstReg.WriteMask;
1007     struct brw_reg src0, src1, dst;
1008     int i;
1009     brw_push_insn_state(p);
1010     for (i = 0; i < 4; i++) {
1011         if (mask & (1<<i)) {
1012             dst = get_dst_reg(c, inst, i);
1013             src0 = get_src_reg(c, inst, 0, i);
1014             src1 = get_src_reg_imm(c, inst, 1, i);
1015             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1016             brw_MOV(p, dst, src0);
1017             brw_set_saturate(p, 0);
1018
1019             brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
1020             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1021             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1022             brw_MOV(p, dst, src1);
1023             brw_set_saturate(p, 0);
1024             brw_set_predicate_control_flag_value(p, 0xff);
1025         }
1026     }
1027     brw_pop_insn_state(p);
1028 }
1029
1030 static void emit_min(struct brw_wm_compile *c,
1031                 struct prog_instruction *inst)
1032 {
1033     struct brw_compile *p = &c->func;
1034     GLuint mask = inst->DstReg.WriteMask;
1035     struct brw_reg src0, src1, dst;
1036     int i;
1037     brw_push_insn_state(p);
1038     for (i = 0; i < 4; i++) {
1039         if (mask & (1<<i)) {
1040             dst = get_dst_reg(c, inst, i);
1041             src0 = get_src_reg_imm(c, inst, 0, i);
1042             src1 = get_src_reg(c, inst, 1, i);
1043             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1044             brw_MOV(p, dst, src0);
1045             brw_set_saturate(p, 0);
1046
1047             brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1048             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1049             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1050             brw_MOV(p, dst, src1);
1051             brw_set_saturate(p, 0);
1052             brw_set_predicate_control_flag_value(p, 0xff);
1053         }
1054     }
1055     brw_pop_insn_state(p);
1056 }
1057
1058 static void emit_pow(struct brw_wm_compile *c,
1059                 struct prog_instruction *inst)
1060 {
1061     struct brw_compile *p = &c->func;
1062     struct brw_reg dst, src0, src1;
1063     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1064     src0 = get_src_reg_imm(c, inst, 0, 0);
1065     src1 = get_src_reg_imm(c, inst, 1, 0);
1066
1067     brw_MOV(p, brw_message_reg(2), src0);
1068     brw_MOV(p, brw_message_reg(3), src1);
1069
1070     brw_math(p,
1071             dst,
1072             BRW_MATH_FUNCTION_POW,
1073             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1074             2,
1075             brw_null_reg(),
1076             BRW_MATH_DATA_VECTOR,
1077             BRW_MATH_PRECISION_FULL);
1078 }
1079
1080 static void emit_lrp(struct brw_wm_compile *c,
1081                 struct prog_instruction *inst)
1082 {
1083     struct brw_compile *p = &c->func;
1084     GLuint mask = inst->DstReg.WriteMask;
1085     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1086     int i;
1087     int mark = mark_tmps(c);
1088     for (i = 0; i < 4; i++) {
1089         if (mask & (1<<i)) {
1090             dst = get_dst_reg(c, inst, i);
1091             src0 = get_src_reg(c, inst, 0, i);
1092
1093             src1 = get_src_reg_imm(c, inst, 1, i);
1094
1095             if (src1.nr == dst.nr) {
1096                 tmp1 = alloc_tmp(c);
1097                 brw_MOV(p, tmp1, src1);
1098             } else
1099                 tmp1 = src1;
1100
1101             src2 = get_src_reg(c, inst, 2, i);
1102             if (src2.nr == dst.nr) {
1103                 tmp2 = alloc_tmp(c);
1104                 brw_MOV(p, tmp2, src2);
1105             } else
1106                 tmp2 = src2;
1107
1108             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1109             brw_MUL(p, brw_null_reg(), dst, tmp2);
1110             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1111             brw_MAC(p, dst, src0, tmp1);
1112             brw_set_saturate(p, 0);
1113         }
1114         release_tmps(c, mark);
1115     }
1116 }
1117
1118 /**
1119  * For GLSL shaders, this KIL will be unconditional.
1120  * It may be contained inside an IF/ENDIF structure of course.
1121  */
1122 static void emit_kil(struct brw_wm_compile *c)
1123 {
1124     struct brw_compile *p = &c->func;
1125     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1126     brw_push_insn_state(p);
1127     brw_set_mask_control(p, BRW_MASK_DISABLE);
1128     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1129     brw_AND(p, depth, c->emit_mask_reg, depth);
1130     brw_pop_insn_state(p);
1131 }
1132
1133 static void emit_mad(struct brw_wm_compile *c,
1134                 struct prog_instruction *inst)
1135 {
1136     struct brw_compile *p = &c->func;
1137     GLuint mask = inst->DstReg.WriteMask;
1138     struct brw_reg dst, src0, src1, src2;
1139     int i;
1140
1141     for (i = 0; i < 4; i++) {
1142         if (mask & (1<<i)) {
1143             dst = get_dst_reg(c, inst, i);
1144             src0 = get_src_reg(c, inst, 0, i);
1145             src1 = get_src_reg_imm(c, inst, 1, i);
1146             src2 = get_src_reg_imm(c, inst, 2, i);
1147             brw_MUL(p, dst, src0, src1);
1148
1149             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1150             brw_ADD(p, dst, dst, src2);
1151             brw_set_saturate(p, 0);
1152         }
1153     }
1154 }
1155
1156 static void emit_sop(struct brw_wm_compile *c,
1157                 struct prog_instruction *inst, GLuint cond)
1158 {
1159     struct brw_compile *p = &c->func;
1160     GLuint mask = inst->DstReg.WriteMask;
1161     struct brw_reg dst, src0, src1;
1162     int i;
1163
1164     for (i = 0; i < 4; i++) {
1165         if (mask & (1<<i)) {
1166             dst = get_dst_reg(c, inst, i);
1167             src0 = get_src_reg(c, inst, 0, i);
1168             src1 = get_src_reg_imm(c, inst, 1, i);
1169             brw_push_insn_state(p);
1170             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1171             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1172             brw_MOV(p, dst, brw_imm_f(0.0));
1173             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1174             brw_MOV(p, dst, brw_imm_f(1.0));
1175             brw_pop_insn_state(p);
1176         }
1177     }
1178 }
1179
1180 static void emit_slt(struct brw_wm_compile *c,
1181                 struct prog_instruction *inst)
1182 {
1183     emit_sop(c, inst, BRW_CONDITIONAL_L);
1184 }
1185
1186 static void emit_sle(struct brw_wm_compile *c,
1187                 struct prog_instruction *inst)
1188 {
1189     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1190 }
1191
1192 static void emit_sgt(struct brw_wm_compile *c,
1193                 struct prog_instruction *inst)
1194 {
1195     emit_sop(c, inst, BRW_CONDITIONAL_G);
1196 }
1197
1198 static void emit_sge(struct brw_wm_compile *c,
1199                 struct prog_instruction *inst)
1200 {
1201     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1202 }
1203
1204 static void emit_seq(struct brw_wm_compile *c,
1205                 struct prog_instruction *inst)
1206 {
1207     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1208 }
1209
1210 static void emit_sne(struct brw_wm_compile *c,
1211                 struct prog_instruction *inst)
1212 {
1213     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1214 }
1215
1216 static void emit_ddx(struct brw_wm_compile *c,
1217                 struct prog_instruction *inst)
1218 {
1219     struct brw_compile *p = &c->func;
1220     GLuint mask = inst->DstReg.WriteMask;
1221     struct brw_reg interp[4];
1222     struct brw_reg dst;
1223     struct brw_reg src0, w;
1224     GLuint nr, i;
1225     src0 = get_src_reg(c, inst, 0, 0);
1226     w = get_src_reg(c, inst, 1, 3);
1227     nr = src0.nr;
1228     interp[0] = brw_vec1_grf(nr, 0);
1229     interp[1] = brw_vec1_grf(nr, 4);
1230     interp[2] = brw_vec1_grf(nr+1, 0);
1231     interp[3] = brw_vec1_grf(nr+1, 4);
1232     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1233     for(i = 0; i < 4; i++ ) {
1234         if (mask & (1<<i)) {
1235             dst = get_dst_reg(c, inst, i);
1236             brw_MOV(p, dst, interp[i]);
1237             brw_MUL(p, dst, dst, w);
1238         }
1239     }
1240     brw_set_saturate(p, 0);
1241 }
1242
1243 static void emit_ddy(struct brw_wm_compile *c,
1244                 struct prog_instruction *inst)
1245 {
1246     struct brw_compile *p = &c->func;
1247     GLuint mask = inst->DstReg.WriteMask;
1248     struct brw_reg interp[4];
1249     struct brw_reg dst;
1250     struct brw_reg src0, w;
1251     GLuint nr, i;
1252
1253     src0 = get_src_reg(c, inst, 0, 0);
1254     nr = src0.nr;
1255     w = get_src_reg(c, inst, 1, 3);
1256     interp[0] = brw_vec1_grf(nr, 0);
1257     interp[1] = brw_vec1_grf(nr, 4);
1258     interp[2] = brw_vec1_grf(nr+1, 0);
1259     interp[3] = brw_vec1_grf(nr+1, 4);
1260     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1261     for(i = 0; i < 4; i++ ) {
1262         if (mask & (1<<i)) {
1263             dst = get_dst_reg(c, inst, i);
1264             brw_MOV(p, dst, suboffset(interp[i], 1));
1265             brw_MUL(p, dst, dst, w);
1266         }
1267     }
1268     brw_set_saturate(p, 0);
1269 }
1270
1271 static INLINE struct brw_reg high_words( struct brw_reg reg )
1272 {
1273     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1274                    0, 8, 2 );
1275 }
1276
1277 static INLINE struct brw_reg low_words( struct brw_reg reg )
1278 {
1279     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1280 }
1281
1282 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1283 {
1284     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1285 }
1286
1287 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1288 {
1289     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1290                    0, 16, 2 );
1291 }
1292
1293 /* One-, two- and three-dimensional Perlin noise, similar to the description
1294    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1295 static void noise1_sub( struct brw_wm_compile *c ) {
1296
1297     struct brw_compile *p = &c->func;
1298     struct brw_reg param,
1299         x0, x1, /* gradients at each end */
1300         t, tmp[ 2 ], /* float temporaries */
1301         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1302     int i;
1303     int mark = mark_tmps( c );
1304
1305     x0 = alloc_tmp( c );
1306     x1 = alloc_tmp( c );
1307     t = alloc_tmp( c );
1308     tmp[ 0 ] = alloc_tmp( c );
1309     tmp[ 1 ] = alloc_tmp( c );
1310     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1311     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1312     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1313     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1314     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1315
1316     param = lookup_tmp( c, mark - 2 );
1317
1318     brw_set_access_mode( p, BRW_ALIGN_1 );
1319
1320     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1321
1322     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1323        be hashed.  Also compute the remainder (offset within the unit
1324        length), interleaved to reduce register dependency penalties. */
1325     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1326     brw_FRC( p, param, param );
1327     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1328     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1329     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1330
1331     /* We're now ready to perform the hashing.  The two hashes are
1332        interleaved for performance.  The hash function used is
1333        designed to rapidly achieve avalanche and require only 32x16
1334        bit multiplication, and 16-bit swizzles (which we get for
1335        free).  We can't use immediate operands in the multiplies,
1336        because immediates are permitted only in src1 and the 16-bit
1337        factor is permitted only in src0. */
1338     for( i = 0; i < 2; i++ )
1339         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1340     for( i = 0; i < 2; i++ )
1341        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1342                 high_words( itmp[ i ] ) );
1343     for( i = 0; i < 2; i++ )
1344         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1345     for( i = 0; i < 2; i++ )
1346        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1347                 high_words( itmp[ i ] ) );
1348     for( i = 0; i < 2; i++ )
1349         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1350     for( i = 0; i < 2; i++ )
1351        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1352                 high_words( itmp[ i ] ) );
1353
1354     /* Now we want to initialise the two gradients based on the
1355        hashes.  Format conversion from signed integer to float leaves
1356        everything scaled too high by a factor of pow( 2, 31 ), but
1357        we correct for that right at the end. */
1358     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1359     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1360     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1361
1362     brw_MUL( p, x0, x0, param );
1363     brw_MUL( p, x1, x1, t );
1364
1365     /* We interpolate between the gradients using the polynomial
1366        6t^5 - 15t^4 + 10t^3 (Perlin). */
1367     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1368     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1369     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1370     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1371     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1372     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1373                                            pipeline */
1374     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1375     brw_MUL( p, param, tmp[ 0 ], param );
1376     brw_MUL( p, x1, x1, param );
1377     brw_ADD( p, x0, x0, x1 );
1378     /* scale by pow( 2, -30 ), to compensate for the format conversion
1379        above and an extra factor of 2 so that a single gradient covers
1380        the [-1,1] range */
1381     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1382
1383     release_tmps( c, mark );
1384 }
1385
1386 static void emit_noise1( struct brw_wm_compile *c,
1387                          struct prog_instruction *inst )
1388 {
1389     struct brw_compile *p = &c->func;
1390     struct brw_reg src, param, dst;
1391     GLuint mask = inst->DstReg.WriteMask;
1392     int i;
1393     int mark = mark_tmps( c );
1394
1395     assert( mark == 0 );
1396
1397     src = get_src_reg( c, inst, 0, 0 );
1398
1399     param = alloc_tmp( c );
1400
1401     brw_MOV( p, param, src );
1402
1403     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1404
1405     /* Fill in the result: */
1406     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1407     for (i = 0 ; i < 4; i++) {
1408         if (mask & (1<<i)) {
1409             dst = get_dst_reg(c, inst, i);
1410             brw_MOV( p, dst, param );
1411         }
1412     }
1413     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1414         brw_set_saturate( p, 0 );
1415
1416     release_tmps( c, mark );
1417 }
1418
1419 static void noise2_sub( struct brw_wm_compile *c ) {
1420
1421     struct brw_compile *p = &c->func;
1422     struct brw_reg param0, param1,
1423         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1424         t, tmp[ 4 ], /* float temporaries */
1425         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1426     int i;
1427     int mark = mark_tmps( c );
1428
1429     x0y0 = alloc_tmp( c );
1430     x0y1 = alloc_tmp( c );
1431     x1y0 = alloc_tmp( c );
1432     x1y1 = alloc_tmp( c );
1433     t = alloc_tmp( c );
1434     for( i = 0; i < 4; i++ ) {
1435         tmp[ i ] = alloc_tmp( c );
1436         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1437     }
1438     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1439     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1440     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1441
1442     param0 = lookup_tmp( c, mark - 3 );
1443     param1 = lookup_tmp( c, mark - 2 );
1444
1445     brw_set_access_mode( p, BRW_ALIGN_1 );
1446
1447     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1448        be hashed.  Also compute the remainders (offsets within the unit
1449        square), interleaved to reduce register dependency penalties. */
1450     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1451     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1452     brw_FRC( p, param0, param0 );
1453     brw_FRC( p, param1, param1 );
1454     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1455     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1456              low_words( itmp[ 1 ] ) );
1457     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1458     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1459     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1460     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1461     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1462
1463     /* We're now ready to perform the hashing.  The four hashes are
1464        interleaved for performance.  The hash function used is
1465        designed to rapidly achieve avalanche and require only 32x16
1466        bit multiplication, and 16-bit swizzles (which we get for
1467        free).  We can't use immediate operands in the multiplies,
1468        because immediates are permitted only in src1 and the 16-bit
1469        factor is permitted only in src0. */
1470     for( i = 0; i < 4; i++ )
1471         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1472     for( i = 0; i < 4; i++ )
1473         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1474                  high_words( itmp[ i ] ) );
1475     for( i = 0; i < 4; i++ )
1476         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1477     for( i = 0; i < 4; i++ )
1478         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1479                  high_words( itmp[ i ] ) );
1480     for( i = 0; i < 4; i++ )
1481         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1482     for( i = 0; i < 4; i++ )
1483         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1484                  high_words( itmp[ i ] ) );
1485
1486     /* Now we want to initialise the four gradients based on the
1487        hashes.  Format conversion from signed integer to float leaves
1488        everything scaled too high by a factor of pow( 2, 15 ), but
1489        we correct for that right at the end. */
1490     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1491     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1492     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1493     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1494     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1495
1496     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1497     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1498     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1499     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1500
1501     brw_MUL( p, x1y0, x1y0, t );
1502     brw_MUL( p, x1y1, x1y1, t );
1503     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1504     brw_MUL( p, x0y0, x0y0, param0 );
1505     brw_MUL( p, x0y1, x0y1, param0 );
1506
1507     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1508     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1509     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1510     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1511
1512     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1513     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1514     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1515     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1516
1517     /* We interpolate between the gradients using the polynomial
1518        6t^5 - 15t^4 + 10t^3 (Perlin). */
1519     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1520     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1521     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1522     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1523     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1524     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1525     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1526                                                  pipeline */
1527     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1528     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1529     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1530     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1531     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1532                                                  pipeline */
1533     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1534     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1535     brw_MUL( p, param0, tmp[ 0 ], param0 );
1536     brw_MUL( p, param1, tmp[ 1 ], param1 );
1537
1538     /* Here we interpolate in the y dimension... */
1539     brw_MUL( p, x0y1, x0y1, param1 );
1540     brw_MUL( p, x1y1, x1y1, param1 );
1541     brw_ADD( p, x0y0, x0y0, x0y1 );
1542     brw_ADD( p, x1y0, x1y0, x1y1 );
1543
1544     /* And now in x.  There are horrible register dependencies here,
1545        but we have nothing else to do. */
1546     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1547     brw_MUL( p, x1y0, x1y0, param0 );
1548     brw_ADD( p, x0y0, x0y0, x1y0 );
1549
1550     /* scale by pow( 2, -15 ), as described above */
1551     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1552
1553     release_tmps( c, mark );
1554 }
1555
1556 static void emit_noise2( struct brw_wm_compile *c,
1557                          struct prog_instruction *inst )
1558 {
1559     struct brw_compile *p = &c->func;
1560     struct brw_reg src0, src1, param0, param1, dst;
1561     GLuint mask = inst->DstReg.WriteMask;
1562     int i;
1563     int mark = mark_tmps( c );
1564
1565     assert( mark == 0 );
1566
1567     src0 = get_src_reg( c, inst, 0, 0 );
1568     src1 = get_src_reg( c, inst, 0, 1 );
1569
1570     param0 = alloc_tmp( c );
1571     param1 = alloc_tmp( c );
1572
1573     brw_MOV( p, param0, src0 );
1574     brw_MOV( p, param1, src1 );
1575
1576     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1577
1578     /* Fill in the result: */
1579     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1580     for (i = 0 ; i < 4; i++) {
1581         if (mask & (1<<i)) {
1582             dst = get_dst_reg(c, inst, i);
1583             brw_MOV( p, dst, param0 );
1584         }
1585     }
1586     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1587         brw_set_saturate( p, 0 );
1588
1589     release_tmps( c, mark );
1590 }
1591
1592 /**
1593  * The three-dimensional case is much like the one- and two- versions above,
1594  * but since the number of corners is rapidly growing we now pack 16 16-bit
1595  * hashes into each register to extract more parallelism from the EUs.
1596  */
1597 static void noise3_sub( struct brw_wm_compile *c ) {
1598
1599     struct brw_compile *p = &c->func;
1600     struct brw_reg param0, param1, param2,
1601         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1602         xi, yi, zi, /* interpolation coefficients */
1603         t, tmp[ 8 ], /* float temporaries */
1604         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1605         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1606     int i;
1607     int mark = mark_tmps( c );
1608
1609     x0y0 = alloc_tmp( c );
1610     x0y1 = alloc_tmp( c );
1611     x1y0 = alloc_tmp( c );
1612     x1y1 = alloc_tmp( c );
1613     xi = alloc_tmp( c );
1614     yi = alloc_tmp( c );
1615     zi = alloc_tmp( c );
1616     t = alloc_tmp( c );
1617     for( i = 0; i < 8; i++ ) {
1618         tmp[ i ] = alloc_tmp( c );
1619         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1620         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1621     }
1622
1623     param0 = lookup_tmp( c, mark - 4 );
1624     param1 = lookup_tmp( c, mark - 3 );
1625     param2 = lookup_tmp( c, mark - 2 );
1626
1627     brw_set_access_mode( p, BRW_ALIGN_1 );
1628
1629     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1630        be hashed.  Also compute the remainders (offsets within the unit
1631        cube), interleaved to reduce register dependency penalties. */
1632     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1633     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1634     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1635     brw_FRC( p, param0, param0 );
1636     brw_FRC( p, param1, param1 );
1637     brw_FRC( p, param2, param2 );
1638     /* Since we now have only 16 bits of precision in the hash, we must
1639        be more careful about thorough mixing to maintain entropy as we
1640        squash the input vector into a small scalar. */
1641     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1642     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1643     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1644              brw_imm_uw( 0x9B93 ) );
1645     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1646              brw_imm_uw( 0xBC8F ) );
1647
1648     /* Temporarily disable the execution mask while we work with ExecSize=16
1649        channels (the mask is set for ExecSize=8 and is probably incorrect).
1650        Although this might cause execution of unwanted channels, the code
1651        writes only to temporary registers and has no side effects, so
1652        disabling the mask is harmless. */
1653     brw_push_insn_state( p );
1654     brw_set_mask_control( p, BRW_MASK_DISABLE );
1655     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1656     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1657     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1658
1659     /* We're now ready to perform the hashing.  The eight hashes are
1660        interleaved for performance.  The hash function used is
1661        designed to rapidly achieve avalanche and require only 16x16
1662        bit multiplication, and 8-bit swizzles (which we get for
1663        free). */
1664     for( i = 0; i < 4; i++ )
1665         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1666     for( i = 0; i < 4; i++ )
1667         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1668                  odd_bytes( wtmp[ i ] ) );
1669     for( i = 0; i < 4; i++ )
1670         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1671     for( i = 0; i < 4; i++ )
1672         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1673                  odd_bytes( wtmp[ i ] ) );
1674     brw_pop_insn_state( p );
1675
1676     /* Now we want to initialise the four rear gradients based on the
1677        hashes.  Format conversion from signed integer to float leaves
1678        everything scaled too high by a factor of pow( 2, 15 ), but
1679        we correct for that right at the end. */
1680     /* x component */
1681     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1682     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1683     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1684     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1685     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1686
1687     brw_push_insn_state( p );
1688     brw_set_mask_control( p, BRW_MASK_DISABLE );
1689     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1690     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1691     brw_pop_insn_state( p );
1692
1693     brw_MUL( p, x1y0, x1y0, t );
1694     brw_MUL( p, x1y1, x1y1, t );
1695     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1696     brw_MUL( p, x0y0, x0y0, param0 );
1697     brw_MUL( p, x0y1, x0y1, param0 );
1698
1699     /* y component */
1700     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1701     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1702     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1703     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1704
1705     brw_push_insn_state( p );
1706     brw_set_mask_control( p, BRW_MASK_DISABLE );
1707     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1708     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1709     brw_pop_insn_state( p );
1710
1711     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1712     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1713     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1714     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1715     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1716
1717     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1718     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1719     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1720     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1721
1722     /* z component */
1723     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1724     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1725     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1726     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1727
1728     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1729     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1730     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1731     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1732
1733     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1734     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1735     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1736     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1737
1738     /* We interpolate between the gradients using the polynomial
1739        6t^5 - 15t^4 + 10t^3 (Perlin). */
1740     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1741     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1742     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1743     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1744     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1745     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1746     brw_MUL( p, xi, xi, param0 );
1747     brw_MUL( p, yi, yi, param1 );
1748     brw_MUL( p, zi, zi, param2 );
1749     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1750     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1751     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1752     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1753     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1754     brw_MUL( p, xi, xi, param0 );
1755     brw_MUL( p, yi, yi, param1 );
1756     brw_MUL( p, zi, zi, param2 );
1757     brw_MUL( p, xi, xi, param0 );
1758     brw_MUL( p, yi, yi, param1 );
1759     brw_MUL( p, zi, zi, param2 );
1760     brw_MUL( p, xi, xi, param0 );
1761     brw_MUL( p, yi, yi, param1 );
1762     brw_MUL( p, zi, zi, param2 );
1763
1764     /* Here we interpolate in the y dimension... */
1765     brw_MUL( p, x0y1, x0y1, yi );
1766     brw_MUL( p, x1y1, x1y1, yi );
1767     brw_ADD( p, x0y0, x0y0, x0y1 );
1768     brw_ADD( p, x1y0, x1y0, x1y1 );
1769
1770     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
1771     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1772     brw_MUL( p, x1y0, x1y0, xi );
1773     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1774
1775     /* Now do the same thing for the front four gradients... */
1776     /* x component */
1777     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1778     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1779     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1780     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1781
1782     brw_push_insn_state( p );
1783     brw_set_mask_control( p, BRW_MASK_DISABLE );
1784     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1785     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1786     brw_pop_insn_state( p );
1787
1788     brw_MUL( p, x1y0, x1y0, t );
1789     brw_MUL( p, x1y1, x1y1, t );
1790     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1791     brw_MUL( p, x0y0, x0y0, param0 );
1792     brw_MUL( p, x0y1, x0y1, param0 );
1793
1794     /* y component */
1795     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1796     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1797     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1798     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1799
1800     brw_push_insn_state( p );
1801     brw_set_mask_control( p, BRW_MASK_DISABLE );
1802     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1803     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1804     brw_pop_insn_state( p );
1805
1806     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1807     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1808     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1809     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1810     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1811
1812     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1813     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1814     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1815     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1816
1817     /* z component */
1818     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1819     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1820     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1821     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1822
1823     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1824     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1825     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1826     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1827
1828     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1829     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1830     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1831     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1832
1833     /* The interpolation coefficients are still around from last time, so
1834        again interpolate in the y dimension... */
1835     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1836     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1837     brw_MUL( p, x0y1, x0y1, yi );
1838     brw_MUL( p, x1y1, x1y1, yi );
1839     brw_ADD( p, x0y0, x0y0, x0y1 );
1840     brw_ADD( p, x1y0, x1y0, x1y1 );
1841
1842     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
1843        time put the front face in tmp[ 1 ] and we're nearly there... */
1844     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1845     brw_MUL( p, x1y0, x1y0, xi );
1846     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1847
1848     /* The final interpolation, in the z dimension: */
1849     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1850     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1851     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1852
1853     /* scale by pow( 2, -15 ), as described above */
1854     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1855
1856     release_tmps( c, mark );
1857 }
1858
1859 static void emit_noise3( struct brw_wm_compile *c,
1860                          struct prog_instruction *inst )
1861 {
1862     struct brw_compile *p = &c->func;
1863     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1864     GLuint mask = inst->DstReg.WriteMask;
1865     int i;
1866     int mark = mark_tmps( c );
1867
1868     assert( mark == 0 );
1869
1870     src0 = get_src_reg( c, inst, 0, 0 );
1871     src1 = get_src_reg( c, inst, 0, 1 );
1872     src2 = get_src_reg( c, inst, 0, 2 );
1873
1874     param0 = alloc_tmp( c );
1875     param1 = alloc_tmp( c );
1876     param2 = alloc_tmp( c );
1877
1878     brw_MOV( p, param0, src0 );
1879     brw_MOV( p, param1, src1 );
1880     brw_MOV( p, param2, src2 );
1881
1882     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1883
1884     /* Fill in the result: */
1885     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1886     for (i = 0 ; i < 4; i++) {
1887         if (mask & (1<<i)) {
1888             dst = get_dst_reg(c, inst, i);
1889             brw_MOV( p, dst, param0 );
1890         }
1891     }
1892     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1893         brw_set_saturate( p, 0 );
1894
1895     release_tmps( c, mark );
1896 }
1897
/**
 * Emit the body of the four-dimensional noise subroutine.
 *
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * The four input coordinates are read from already-existing temporaries
 * located just below this function's temporary mark (see the lookup_tmp()
 * calls below); the final result is written over the first of them
 * (param[ 0 ]).  release_tmps( c, mark ) is called before returning.
 *
 * NOTE: this is a hand-scheduled instruction stream.  The register t is
 * reused for several different values and the flag register carries the
 * loop state, so the order of the emitted instructions matters.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg param[ 4 ],
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        w0, /* noise for the w=0 cube */
        floors[ 2 ], /* integer coordinates of base corner of hypercube */
        interp[ 4 ], /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i, j;
    int mark = mark_tmps( c );
    /* Instruction indices: "loop" is the first instruction of the repeated
       section, "origin" is the conditional exit branch that gets patched
       once its destination is known. */
    GLuint loop, origin;

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    w0 = alloc_tmp( c );
    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

    /* param[ i ] is looked up (not allocated) at temporary index
       mark - 5 + i -- presumably the argument temporaries set up by the
       caller before invoking the subroutine (confirm against emit_noise4
       and invoke_subroutine).  interp[ i ] is a fresh temporary. */
    for( i = 0; i < 4; i++ ) {
        param[ i ] = lookup_tmp( c, mark - 5 + i );
        interp[ i ] = alloc_tmp( c );
    }

    /* itmp[ i ] and wtmp[ i ] are views of the register allocated for
       tmp[ i ]: the former retyped to UD, the latter built from the same
       register number with brw_uw16_grf(). */
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* We only want 16 bits of precision from the integral part of each
       co-ordinate, but unfortunately the RNDD semantics would saturate
       at 16 bits if we performed the operation directly to a 16-bit
       destination.  Therefore, we round to 32-bit temporaries where
       appropriate, and then store only the lower 16 bits. */
    /* After these six instructions floors[ 0 ] holds the x floor in its
       low words and the y floor in its high words; floors[ 1 ] likewise
       holds z (low) and w (high). */
    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

    /* Modify the flag register here, because the side effect is useful
       later (see below).  We know for certain that all flags will be
       cleared, since the FRC instruction cannot possibly generate
       negative results.  Even for exceptional inputs (infinities, denormals,
       NaNs), the architecture guarantees that the L conditional is false. */
    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
    brw_FRC( p, param[ 0 ], param[ 0 ] );
    /* Explicitly select unpredicated execution for the instructions that
       follow the flag-setting FRC. */
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    for( i = 1; i < 4; i++ )
        brw_FRC( p, param[ i ], param[ i ] );

    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
       of all. */
    /* Evaluated as ((6t - 15)t + 10) * t * t * t, one step at a time for
       all four coordinates. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
    for( i = 0; i < 4; i++ )
        brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
    for( i = 0; i < 4; i++ )
        brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
    for( j = 0; j < 3; j++ )
        for( i = 0; i < 4; i++ )
            brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

    /* Mark the current address, as it will be a jump destination.  The
       following code will be executed twice: first, with the flag
       register clear indicating the w=0 case, and second with flags
       set for w=1. */
    loop = p->nr_insn;

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Since we have only 16 bits of precision in the hash, we
       must be careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
             brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
             brw_imm_uw( 0x9B93 ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
             brw_imm_uw( 0xA359 ) );
    /* The x+1 neighbour differs from the base corner by the x multiplier. */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Derive the y+1, z+1 and (y+1,z+1) corners by adding the matching
       multipliers used above. */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hashes left so that the next gradient component is
       converted from different hash bits. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    /* t = param[ 1 ] - 1, consumed by the y component below */
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop, w - 1 the second time.  The ADD is predicated on the flags
       and the MOV on their inverse, so exactly one of the two takes
       effect. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = param[ 0 ] - 1 again, ready for the x component of the front
       gradients further down */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Here we interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = param[ 2 ] - 1: unlike the rear gradients, the front ones are
       multiplied by this rather than by param[ 2 ] itself */
    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop, w - 1 the second time (same predicated ADD / inverse
       predicated MOV pair as for the rear gradients) */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Interpolate in the y dimension: */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* Another interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* Exit the loop if we've computed both cubes... */
    /* The jump is a predicated add to the instruction pointer.  Its
       immediate offset is a placeholder (0) for now; it is patched with
       brw_set_src1() below. */
    origin = p->nr_insn;
    brw_push_insn_state( p );
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
    brw_pop_insn_state( p );

    /* Save the result for the w=0 case, and increment the w coordinate: */
    brw_MOV( p, w0, tmp[ 0 ] );
    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
             brw_imm_uw( 1 ) );

    /* Loop around for the other cube.  Explicitly set the flag register
       (unfortunately we must spend an extra instruction to do this: we
       can't rely on a side effect of the previous MOV or ADD because
       conditional modifiers which are normally true might be false in
       exceptional circumstances, e.g. given a NaN input; the add to
       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
    /* The backward offset is the (negative) distance in instructions from
       this jump to "loop", shifted left by 4. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
             brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
    brw_pop_insn_state( p );

    /* Patch the previous conditional branch now that we know the
       destination address. */
    brw_set_src1( p->store + origin,
                  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

    /* The very last interpolation. */
    /* tmp[ 0 ] holds the w=1 result here and w0 the w=0 result. */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

    /* scale by pow( 2, -15 ), as described above */
    /* The scaled result is returned in param[ 0 ]. */
    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
2281
2282 static void emit_noise4( struct brw_wm_compile *c,
2283                          struct prog_instruction *inst )
2284 {
2285     struct brw_compile *p = &c->func;
2286     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2287     GLuint mask = inst->DstReg.WriteMask;
2288     int i;
2289     int mark = mark_tmps( c );
2290
2291     assert( mark == 0 );
2292
2293     src0 = get_src_reg( c, inst, 0, 0 );
2294     src1 = get_src_reg( c, inst, 0, 1 );
2295     src2 = get_src_reg( c, inst, 0, 2 );
2296     src3 = get_src_reg( c, inst, 0, 3 );
2297
2298     param0 = alloc_tmp( c );
2299     param1 = alloc_tmp( c );
2300     param2 = alloc_tmp( c );
2301     param3 = alloc_tmp( c );
2302
2303     brw_MOV( p, param0, src0 );
2304     brw_MOV( p, param1, src1 );
2305     brw_MOV( p, param2, src2 );
2306     brw_MOV( p, param3, src3 );
2307
2308     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2309
2310     /* Fill in the result: */
2311     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2312     for (i = 0 ; i < 4; i++) {
2313         if (mask & (1<<i)) {
2314             dst = get_dst_reg(c, inst, i);
2315             brw_MOV( p, dst, param0 );
2316         }
2317     }
2318     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2319         brw_set_saturate( p, 0 );
2320
2321     release_tmps( c, mark );
2322 }
2323
2324 static void emit_wpos_xy(struct brw_wm_compile *c,
2325                 struct prog_instruction *inst)
2326 {
2327     struct brw_compile *p = &c->func;
2328     GLuint mask = inst->DstReg.WriteMask;
2329     struct brw_reg src0[2], dst[2];
2330
2331     dst[0] = get_dst_reg(c, inst, 0);
2332     dst[1] = get_dst_reg(c, inst, 1);
2333
2334     src0[0] = get_src_reg(c, inst, 0, 0);
2335     src0[1] = get_src_reg(c, inst, 0, 1);
2336
2337     /* Calculate the pixel offset from window bottom left into destination
2338      * X and Y channels.
2339      */
2340     if (mask & WRITEMASK_X) {
2341         /* X' = X - origin_x */
2342         brw_ADD(p,
2343                 dst[0],
2344                 retype(src0[0], BRW_REGISTER_TYPE_W),
2345                 brw_imm_d(0 - c->key.origin_x));
2346     }
2347
2348     if (mask & WRITEMASK_Y) {
2349         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2350         brw_ADD(p,
2351                 dst[1],
2352                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2353                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2354     }
2355 }
2356
2357 /* TODO
2358    BIAS on SIMD8 not working yet...
2359  */
2360 static void emit_txb(struct brw_wm_compile *c,
2361                 struct prog_instruction *inst)
2362 {
2363     struct brw_compile *p = &c->func;
2364     struct brw_reg dst[4], src[4], payload_reg;
2365     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2366     GLuint i;
2367
2368     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2369
2370     for (i = 0; i < 4; i++)
2371         dst[i] = get_dst_reg(c, inst, i);
2372     for (i = 0; i < 4; i++)
2373         src[i] = get_src_reg(c, inst, 0, i);
2374
2375     switch (inst->TexSrcTarget) {
2376         case TEXTURE_1D_INDEX:
2377             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2378             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2379             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2380             break;
2381         case TEXTURE_2D_INDEX:
2382         case TEXTURE_RECT_INDEX:
2383             brw_MOV(p, brw_message_reg(2), src[0]);
2384             brw_MOV(p, brw_message_reg(3), src[1]);
2385             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2386             break;
2387         default:
2388             brw_MOV(p, brw_message_reg(2), src[0]);
2389             brw_MOV(p, brw_message_reg(3), src[1]);
2390             brw_MOV(p, brw_message_reg(4), src[2]);
2391             break;
2392     }
2393     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2394     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2395     brw_SAMPLE(p,
2396                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2397                1,                                           /* msg_reg_nr */
2398                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2399                unit + MAX_DRAW_BUFFERS,                     /* surface */
2400                unit,                                        /* sampler */
2401                inst->DstReg.WriteMask,                      /* writemask */
2402                BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
2403                4,                                           /* response_length */
2404                4,                                           /* msg_length */
2405                0);                                          /* eot */
2406 }
2407
2408
2409 static void emit_tex(struct brw_wm_compile *c,
2410                 struct prog_instruction *inst)
2411 {
2412     struct brw_compile *p = &c->func;
2413     struct brw_reg dst[4], src[4], payload_reg;
2414     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2415     GLuint msg_len;
2416     GLuint i, nr;
2417     GLuint emit;
2418     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2419
2420     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2421
2422     for (i = 0; i < 4; i++)
2423         dst[i] = get_dst_reg(c, inst, i);
2424     for (i = 0; i < 4; i++)
2425         src[i] = get_src_reg(c, inst, 0, i);
2426
2427     switch (inst->TexSrcTarget) {
2428         case TEXTURE_1D_INDEX:
2429             emit = WRITEMASK_X;
2430             nr = 1;
2431             break;
2432         case TEXTURE_2D_INDEX:
2433         case TEXTURE_RECT_INDEX:
2434             emit = WRITEMASK_XY;
2435             nr = 2;
2436             break;
2437         default:
2438             emit = WRITEMASK_XYZ;
2439             nr = 3;
2440             break;
2441     }
2442     msg_len = 1;
2443
2444     /* move/load S, T, R coords */
2445     for (i = 0; i < nr; i++) {
2446         static const GLuint swz[4] = {0,1,2,2};
2447         if (emit & (1<<i))
2448             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2449         else
2450             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2451         msg_len += 1;
2452     }
2453
2454     if (shadow) {
2455        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2456        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2457     }
2458
2459     brw_SAMPLE(p,
2460                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2461                1,                                          /* msg_reg_nr */
2462                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2463                unit + MAX_DRAW_BUFFERS,                    /* surface */
2464                unit,                                       /* sampler */
2465                inst->DstReg.WriteMask,                     /* writemask */
2466                BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
2467                4,                                          /* response_length */
2468                shadow ? 6 : 4,                             /* msg_length */
2469                0);                                         /* eot */
2470
2471     if (shadow)
2472         brw_MOV(p, dst[3], brw_imm_f(1.0));
2473 }
2474
2475
2476 static void emit_get_constant(struct brw_context *brw,
2477                               struct brw_wm_compile *c,
2478                               struct prog_instruction *inst,
2479                               GLuint constIndex)
2480 {
2481    struct brw_compile *p = &c->func;
2482    struct brw_reg dst[4];
2483    GLuint i;
2484    const int mark = mark_tmps( c );
2485    struct brw_reg writeback_reg[4];
2486
2487    /* XXX only need 1 temp reg??? */
2488    for (i = 0; i < 4; i++) {
2489       writeback_reg[i] = alloc_tmp(c);
2490    }
2491
2492    for (i = 0; i < 4; i++) {
2493       dst[i] = get_dst_reg(c, inst, i);
2494    }
2495
2496    /* Get float[4] vector from constant buffer */
2497    brw_dp_READ_4(p,
2498                  writeback_reg[0],     /* first writeback dest */
2499                  1,                    /* msg_reg */
2500                  GL_FALSE,             /* rel addr? */
2501                  16 * constIndex,      /* byte offset */
2502                  BRW_WM_MAX_SURF - 1   /* surface, binding table index */
2503                  );
2504
2505    /* Extract the four channel values, smear across dest registers */
2506    for (i = 0; i < 4; i++) {
2507       /* extract 1 float from the writeback reg */
2508       struct brw_reg new_src = stride(writeback_reg[0], 0, 1, 0);
2509       new_src.subnr = i * 4;
2510       /* and smear it into the dest register */
2511       brw_MOV(p, dst[i], new_src);
2512    }
2513
2514    release_tmps( c, mark );
2515 }
2516
2517
2518 /**
2519  * Resolve subroutine calls after code emit is done.
2520  */
2521 static void post_wm_emit( struct brw_wm_compile *c )
2522 {
2523     brw_resolve_cals(&c->func);
2524 }
2525
2526 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2527 {
2528 #define MAX_IFSN 32
2529 #define MAX_LOOP_DEPTH 32
2530     struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2531     struct brw_instruction *inst0, *inst1;
2532     int i, if_insn = 0, loop_insn = 0;
2533     struct brw_compile *p = &c->func;
2534     struct brw_indirect stack_index = brw_indirect(0, 0);
2535
2536     c->reg_index = 0;
2537     prealloc_reg(c);
2538     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2539     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2540
2541     for (i = 0; i < c->nr_fp_insns; i++) {
2542         struct prog_instruction *inst = &c->prog_instructions[i];
2543
2544         if (inst->CondUpdate)
2545             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2546         else
2547             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2548
2549         switch (inst->Opcode) {
2550             case WM_PIXELXY:
2551                 emit_pixel_xy(c, inst);
2552                 break;
2553             case WM_DELTAXY:
2554                 emit_delta_xy(c, inst);
2555                 break;
2556             case WM_PIXELW:
2557                 emit_pixel_w(c, inst);
2558                 break;
2559             case WM_LINTERP:
2560                 emit_linterp(c, inst);
2561                 break;
2562             case WM_PINTERP:
2563                 emit_pinterp(c, inst);
2564                 break;
2565             case WM_CINTERP:
2566                 emit_cinterp(c, inst);
2567                 break;
2568             case WM_WPOSXY:
2569                 emit_wpos_xy(c, inst);
2570                 break;
2571             case WM_FB_WRITE:
2572                 emit_fb_write(c, inst);
2573                 break;
2574             case WM_FRONTFACING:
2575                 emit_frontfacing(c, inst);
2576                 break;
2577             case OPCODE_ABS:
2578                 emit_abs(c, inst);
2579                 break;
2580             case OPCODE_ADD:
2581                 emit_add(c, inst);
2582                 break;
2583             case OPCODE_ARL:
2584                 emit_arl(c, inst);
2585                 break;
2586             case OPCODE_SUB:
2587                 emit_sub(c, inst);
2588                 break;
2589             case OPCODE_FRC:
2590                 emit_frc(c, inst);
2591                 break;
2592             case OPCODE_FLR:
2593                 emit_flr(c, inst);
2594                 break;
2595             case OPCODE_LRP:
2596                 emit_lrp(c, inst);
2597                 break;
2598             case OPCODE_TRUNC:
2599                 emit_trunc(c, inst);
2600                 break;
2601             case OPCODE_MOV:
2602 #if 0
2603                 /* test hook for new constant buffer code */
2604                 if (inst->SrcReg[0].File == PROGRAM_UNIFORM) {
2605                    emit_get_constant(brw, c, inst, inst->SrcReg[0].Index);
2606                 }
2607                 else {
2608                    emit_mov(c, inst);
2609                 }
2610 #else
2611                 emit_mov(c, inst);
2612 #endif
2613                 break;
2614             case OPCODE_DP3:
2615                 emit_dp3(c, inst);
2616                 break;
2617             case OPCODE_DP4:
2618                 emit_dp4(c, inst);
2619                 break;
2620             case OPCODE_XPD:
2621                 emit_xpd(c, inst);
2622                 break;
2623             case OPCODE_DPH:
2624                 emit_dph(c, inst);
2625                 break;
2626             case OPCODE_RCP:
2627                 emit_rcp(c, inst);
2628                 break;
2629             case OPCODE_RSQ:
2630                 emit_rsq(c, inst);
2631                 break;
2632             case OPCODE_SIN:
2633                 emit_sin(c, inst);
2634                 break;
2635             case OPCODE_COS:
2636                 emit_cos(c, inst);
2637                 break;
2638             case OPCODE_EX2:
2639                 emit_ex2(c, inst);
2640                 break;
2641             case OPCODE_LG2:
2642                 emit_lg2(c, inst);
2643                 break;
2644             case OPCODE_MAX:
2645                 emit_max(c, inst);
2646                 break;
2647             case OPCODE_MIN:
2648                 emit_min(c, inst);
2649                 break;
2650             case OPCODE_DDX:
2651                 emit_ddx(c, inst);
2652                 break;
2653             case OPCODE_DDY:
2654                 emit_ddy(c, inst);
2655                 break;
2656             case OPCODE_SLT:
2657                 emit_slt(c, inst);
2658                 break;
2659             case OPCODE_SLE:
2660                 emit_sle(c, inst);
2661                 break;
2662             case OPCODE_SGT:
2663                 emit_sgt(c, inst);
2664                 break;
2665             case OPCODE_SGE:
2666                 emit_sge(c, inst);
2667                 break;
2668             case OPCODE_SEQ:
2669                 emit_seq(c, inst);
2670                 break;
2671             case OPCODE_SNE:
2672                 emit_sne(c, inst);
2673                 break;
2674             case OPCODE_MUL:
2675                 emit_mul(c, inst);
2676                 break;
2677             case OPCODE_POW:
2678                 emit_pow(c, inst);
2679                 break;
2680             case OPCODE_MAD:
2681                 emit_mad(c, inst);
2682                 break;
2683             case OPCODE_NOISE1:
2684                 emit_noise1(c, inst);
2685                 break;
2686             case OPCODE_NOISE2:
2687                 emit_noise2(c, inst);
2688                 break;
2689             case OPCODE_NOISE3:
2690                 emit_noise3(c, inst);
2691                 break;
2692             case OPCODE_NOISE4:
2693                 emit_noise4(c, inst);
2694                 break;
2695             case OPCODE_TEX:
2696                 emit_tex(c, inst);
2697                 break;
2698             case OPCODE_TXB:
2699                 emit_txb(c, inst);
2700                 break;
2701             case OPCODE_KIL_NV:
2702                 emit_kil(c);
2703                 break;
2704             case OPCODE_IF:
2705                 assert(if_insn < MAX_IFSN);
2706                 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2707                 break;
2708             case OPCODE_ELSE:
2709                 if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
2710                 break;
2711             case OPCODE_ENDIF:
2712                 assert(if_insn > 0);
2713                 brw_ENDIF(p, if_inst[--if_insn]);
2714                 break;
2715             case OPCODE_BGNSUB:
2716                 brw_save_label(p, inst->Comment, p->nr_insn);
2717                 break;
2718             case OPCODE_ENDSUB:
2719                 /* no-op */
2720                 break;
2721             case OPCODE_CAL:
2722                 brw_push_insn_state(p);
2723                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2724                 brw_set_access_mode(p, BRW_ALIGN_1);
2725                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2726                 brw_set_access_mode(p, BRW_ALIGN_16);
2727                 brw_ADD(p, get_addr_reg(stack_index),
2728                          get_addr_reg(stack_index), brw_imm_d(4));
2729                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2730                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2731                 brw_pop_insn_state(p);
2732                 break;
2733
2734             case OPCODE_RET:
2735                 brw_push_insn_state(p);
2736                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2737                 brw_ADD(p, get_addr_reg(stack_index),
2738                         get_addr_reg(stack_index), brw_imm_d(-4));
2739                 brw_set_access_mode(p, BRW_ALIGN_1);
2740                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2741                 brw_set_access_mode(p, BRW_ALIGN_16);
2742                 brw_pop_insn_state(p);
2743
2744                 break;
2745             case OPCODE_BGNLOOP:
2746                 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2747                 break;
2748             case OPCODE_BRK:
2749                 brw_BREAK(p);
2750                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2751                 break;
2752             case OPCODE_CONT:
2753                 brw_CONT(p);
2754                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2755                 break;
2756             case OPCODE_ENDLOOP:
2757                 loop_insn--;
2758                 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2759                 /* patch all the BREAK instructions from
2760                    last BEGINLOOP */
2761                 while (inst0 > loop_inst[loop_insn]) {
2762                     inst0--;
2763                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2764                         inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2765                         inst0->bits3.if_else.pop_count = 0;
2766                     } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2767                         inst0->bits3.if_else.jump_count = inst1 - inst0;
2768                         inst0->bits3.if_else.pop_count = 0;
2769                     }
2770                 }
2771                 break;
2772             default:
2773                 _mesa_printf("unsupported IR in fragment shader %d\n",
2774                         inst->Opcode);
2775         }
2776         if (inst->CondUpdate)
2777             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2778         else
2779             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2780     }
2781     post_wm_emit(c);
2782
2783     if (c->reg_index >= BRW_WM_MAX_GRF) {
2784         _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2785         /* XXX we need to do some proper error recovery here */
2786     }
2787 }
2788
2789
2790 /**
2791  * Do GPU code generation for shaders that use GLSL features such as
2792  * flow control.  Other shaders will be compiled with the
2793  */
2794 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2795 {
2796     if (INTEL_DEBUG & DEBUG_WM) {
2797         _mesa_printf("brw_wm_glsl_emit:\n");
2798     }
2799
2800     /* initial instruction translation/simplification */
2801     brw_wm_pass_fp(c);
2802
2803     /* actual code generation */
2804     brw_wm_emit_glsl(brw, c);
2805
2806     if (INTEL_DEBUG & DEBUG_WM) {
2807         brw_wm_print_program(c, "brw_wm_glsl_emit done");
2808     }
2809
2810     c->prog_data.total_grf = c->reg_index;
2811     c->prog_data.total_scratch = 0;
2812 }