Merge commit 'origin/7.8'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the shared helper subroutines emitted via invoke_subroutine(). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
/** Mark GRF register \p r as in use (does not touch first_free_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
63
64
/**
 * Mark given GRF register as not in use.
 * Also lowers first_free_grf so the next allocation scan starts
 * no later than the freed register.
 */
static void
release_grf(struct brw_wm_compile *c, int r)
{
   /*assert(c->used_grf[r]);*/
   c->used_grf[r] = GL_FALSE;
   c->first_free_grf = MIN2(c->first_free_grf, r);
}
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
/**
 * Record the mapping of a Mesa register to a hardware register.
 *
 * \param file       Mesa register file (PROGRAM_x)
 * \param index      register index within that file
 * \param component  channel (X=0, Y=1, Z=2, W=3)
 * \param reg        the hardware register to associate with the slot
 */
static void set_reg(struct brw_wm_compile *c, int file, int index, 
        int component, struct brw_reg reg)
{
    c->wm_regs[file][index][component].reg = reg;
    c->wm_regs[file][index][component].inited = GL_TRUE;
}
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info (a watermark into the tmp pool).
 * There must be a matching call to release_tmps().
 * \return the current tmp_index, to be passed back to release_tmps()
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
169
/** Return the vec8 GRF for the temp at the given pool index (no allocation). */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
174
/** Roll the temp pool back to a watermark previously taken with mark_tmps(). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
180 /**
181 * Convert Mesa src register to brw register.
182 *
183 * Since we're running in SOA mode each Mesa register corresponds to four
184 * hardware registers. We allocate the hardware registers as needed here.
185 *
186 * \param file register file, one of PROGRAM_x
187 * \param index register number
188 * \param component src component (X=0, Y=1, Z=2, W=3)
189 * \param nr not used?!?
190 * \param neg negate value?
191 * \param abs take absolute value?
192 */
193 static struct brw_reg
194 get_reg(struct brw_wm_compile *c, int file, int index, int component,
195 int nr, GLuint neg, GLuint abs)
196 {
197 struct brw_reg reg;
198 switch (file) {
199 case PROGRAM_STATE_VAR:
200 case PROGRAM_CONSTANT:
201 case PROGRAM_UNIFORM:
202 file = PROGRAM_STATE_VAR;
203 break;
204 case PROGRAM_UNDEFINED:
205 return brw_null_reg();
206 case PROGRAM_TEMPORARY:
207 case PROGRAM_INPUT:
208 case PROGRAM_OUTPUT:
209 case PROGRAM_PAYLOAD:
210 break;
211 default:
212 _mesa_problem(NULL, "Unexpected file in get_reg()");
213 return brw_null_reg();
214 }
215
216 assert(index < 256);
217 assert(component < 4);
218
219 /* see if we've already allocated a HW register for this Mesa register */
220 if (c->wm_regs[file][index][component].inited) {
221 /* yes, re-use */
222 reg = c->wm_regs[file][index][component].reg;
223 }
224 else {
225 /* no, allocate new register */
226 int grf = alloc_grf(c);
227 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
228 if (grf < 0) {
229 /* totally out of temps */
230 grf = 51; /* XXX random register! */
231 }
232
233 reg = brw_vec8_grf(grf, 0);
234 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
235
236 set_reg(c, file, index, component, reg);
237 }
238
239 if (neg & (1 << component)) {
240 reg = negate(reg);
241 }
242 if (abs)
243 reg = brw_abs(reg);
244 return reg;
245 }
246
247
248
249 /**
250 * This is called if we run out of GRF registers. Examine the live intervals
251 * of temp regs in the program and free those which won't be used again.
252 */
253 static void
254 reclaim_temps(struct brw_wm_compile *c)
255 {
256 GLint intBegin[MAX_PROGRAM_TEMPS];
257 GLint intEnd[MAX_PROGRAM_TEMPS];
258 int index;
259
260 /*printf("Reclaim temps:\n");*/
261
262 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
263 intBegin, intEnd);
264
265 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
266 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
267 /* program temp[i] can be freed */
268 int component;
269 /*printf(" temp[%d] is dead\n", index);*/
270 for (component = 0; component < 4; component++) {
271 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
272 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
273 release_grf(c, r);
274 /*
275 printf(" Reclaim temp %d, reg %d at inst %d\n",
276 index, r, c->cur_inst);
277 */
278 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
279 }
280 }
281 }
282 }
283 }
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Layout built here (in GRF order): depth payload regs, then either the
 * CURBE constants or nothing (when a real constant buffer is used), then
 * interpolated FS inputs, then the execution-mask reg and the call stack.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload: one reg pair per depth reg; unused slots alias g0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist = 
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels; 8 scalar constants pack per GRF */
            for (j = 0; j < 4; j++, index++) {
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map each written VS output to the FS input
    * payload register it arrives in
    */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;   /* VS-only outputs with no FS counterpart */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      /* every written VS output occupies a reg pair, used by the FS or not */
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-numbered GRFs for a free aligned pair */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* FALLTHROUGH -- benign, default case is a no-op */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
474 /**
475 * Check if any of the instruction's src registers are constants, uniforms,
476 * or statevars. If so, fetch any constants that we don't already have in
477 * the three GRF slots.
478 */
479 static void fetch_constants(struct brw_wm_compile *c,
480 const struct prog_instruction *inst)
481 {
482 struct brw_compile *p = &c->func;
483 GLuint i;
484
485 /* loop over instruction src regs */
486 for (i = 0; i < 3; i++) {
487 const struct prog_src_register *src = &inst->SrcReg[i];
488 if (src->File == PROGRAM_STATE_VAR ||
489 src->File == PROGRAM_CONSTANT ||
490 src->File == PROGRAM_UNIFORM) {
491 c->current_const[i].index = src->Index;
492
493 #if 0
494 printf(" fetch const[%d] for arg %d into reg %d\n",
495 src->Index, i, c->current_const[i].reg.nr);
496 #endif
497
498 /* need to fetch the constant now */
499 brw_dp_READ_4(p,
500 c->current_const[i].reg, /* writeback dest */
501 src->RelAddr, /* relative indexing? */
502 16 * src->Index, /* byte offset */
503 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
504 );
505 }
506 }
507 }
508
509
/**
 * Convert Mesa dst register to brw register.
 * Thin wrapper over get_reg() with negate/abs disabled.
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
    const int nr = 1;
    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                   0, 0);
}
521
522
523 static struct brw_reg
524 get_src_reg_const(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint component)
527 {
528 /* We should have already fetched the constant from the constant
529 * buffer in fetch_constants(). Now we just have to return a
530 * register description that extracts the needed component and
531 * smears it across all eight vector components.
532 */
533 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
534 struct brw_reg const_reg;
535
536 assert(component < 4);
537 assert(srcRegIndex < 3);
538 assert(c->current_const[srcRegIndex].index != -1);
539 const_reg = c->current_const[srcRegIndex].reg;
540
541 /* extract desired float from the const_reg, and smear */
542 const_reg = stride(const_reg, 0, 1, 0);
543 const_reg.subnr = component * 4;
544
545 if (src->Negate & (1 << component))
546 const_reg = negate(const_reg);
547 if (src->Abs)
548 const_reg = brw_abs(const_reg);
549
550 #if 0
551 printf(" form const[%d].%d for arg %d, reg %d\n",
552 c->current_const[srcRegIndex].index,
553 component,
554 srcRegIndex,
555 const_reg.nr);
556 #endif
557
558 return const_reg;
559 }
560
561
562 /**
563 * Convert Mesa src register to brw register.
564 */
565 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
566 const struct prog_instruction *inst,
567 GLuint srcRegIndex, GLuint channel)
568 {
569 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
570 const GLuint nr = 1;
571 const GLuint component = GET_SWZ(src->Swizzle, channel);
572
573 /* Only one immediate value can be used per native opcode, and it
574 * has be in the src1 slot, so not all Mesa instructions will get
575 * to take advantage of immediate constants.
576 */
577 if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
578 const struct gl_program_parameter_list *params;
579
580 params = c->fp->program.Base.Parameters;
581
582 /* Extended swizzle terms */
583 if (component == SWIZZLE_ZERO) {
584 return brw_imm_f(0.0F);
585 } else if (component == SWIZZLE_ONE) {
586 if (src->Negate)
587 return brw_imm_f(-1.0F);
588 else
589 return brw_imm_f(1.0F);
590 }
591
592 if (src->File == PROGRAM_CONSTANT) {
593 float f = params->ParameterValues[src->Index][component];
594
595 if (src->Abs)
596 f = fabs(f);
597 if (src->Negate)
598 f = -f;
599
600 return brw_imm_f(f);
601 }
602 }
603
604 if (c->fp->use_const_buffer &&
605 (src->File == PROGRAM_STATE_VAR ||
606 src->File == PROGRAM_CONSTANT ||
607 src->File == PROGRAM_UNIFORM)) {
608 return get_src_reg_const(c, inst, srcRegIndex, component);
609 }
610 else {
611 /* other type of source register */
612 return get_reg(c, src->File, src->Index, component, nr,
613 src->Negate, src->Abs);
614 }
615 }
616
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * Control transfer is done by arithmetic on the IP register; all
 * offsets are in units of 16-byte native instructions (hence the << 4
 * scaling of instruction-count deltas below).
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = two instructions past this ADD, i.e. the
       * instruction following the jump below
       */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump backward (or forward) to the stored subroutine entry */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
               brw_imm_d( ( c->subroutines[ subroutine ] -
                            here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* placeholder: src1 is patched below once the body length is known,
       * so the fall-through path skips over the inlined subroutine
       */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return to the caller-supplied address */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* back-patch the return address now that the body size is known */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
680
681 static void emit_arl(struct brw_wm_compile *c,
682 const struct prog_instruction *inst)
683 {
684 struct brw_compile *p = &c->func;
685 struct brw_reg src0, addr_reg;
686 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
687 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
688 BRW_ARF_ADDRESS, 0);
689 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
690 brw_MOV(p, addr_reg, src0);
691 brw_set_saturate(p, 0);
692 }
693
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 *
 * NOTE(review): 'depth' aliases a UW in payload register g0, which
 * presumably holds the pixel enable mask -- ANDing with the inverted
 * IMASK clears (kills) the channels disabled by flow control; confirm
 * against the hardware PRM / payload layout.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
708
/** View 'reg' as the high (odd-index) 16-bit word of each dword, 8-wide. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
                  0, 8, 2 );
}
714
/** View 'reg' as the low (even-index) 16-bit word of each dword, 8-wide. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
719
/** View 'reg' as its even-index bytes, 16-wide. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
724
/** View 'reg' as its odd-index bytes, 16-wide. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
                  0, 16, 2 );
}
730
731 /* One-, two- and three-dimensional Perlin noise, similar to the description
732 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Body of the 1D noise subroutine, emitted via invoke_subroutine().
 * Takes its input coordinate in the caller's temp at pool index
 * (mark - 2) ('param') and overwrites that same temp with the result.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* input argument lives in the caller's temp slot */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                          pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
823
824 static void emit_noise1( struct brw_wm_compile *c,
825 const struct prog_instruction *inst )
826 {
827 struct brw_compile *p = &c->func;
828 struct brw_reg src, param, dst;
829 GLuint mask = inst->DstReg.WriteMask;
830 int i;
831 int mark = mark_tmps( c );
832
833 assert( mark == 0 );
834
835 src = get_src_reg( c, inst, 0, 0 );
836
837 param = alloc_tmp( c );
838
839 brw_MOV( p, param, src );
840
841 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
842
843 /* Fill in the result: */
844 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
845 for (i = 0 ; i < 4; i++) {
846 if (mask & (1<<i)) {
847 dst = get_dst_reg(c, inst, i);
848 brw_MOV( p, dst, param );
849 }
850 }
851 if( inst->SaturateMode == SATURATE_ZERO_ONE )
852 brw_set_saturate( p, 0 );
853
854 release_tmps( c, mark );
855 }
856
/**
 * Body of the 2D noise subroutine, emitted via invoke_subroutine().
 * Takes its input coordinates in the caller's temps at pool indices
 * (mark - 3) and (mark - 2) and leaves the result in the (mark - 3)
 * slot ('param0').
 */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* input arguments live in the caller's temp slots */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
            low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
993
994 static void emit_noise2( struct brw_wm_compile *c,
995 const struct prog_instruction *inst )
996 {
997 struct brw_compile *p = &c->func;
998 struct brw_reg src0, src1, param0, param1, dst;
999 GLuint mask = inst->DstReg.WriteMask;
1000 int i;
1001 int mark = mark_tmps( c );
1002
1003 assert( mark == 0 );
1004
1005 src0 = get_src_reg( c, inst, 0, 0 );
1006 src1 = get_src_reg( c, inst, 0, 1 );
1007
1008 param0 = alloc_tmp( c );
1009 param1 = alloc_tmp( c );
1010
1011 brw_MOV( p, param0, src0 );
1012 brw_MOV( p, param1, src1 );
1013
1014 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1015
1016 /* Fill in the result: */
1017 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1018 for (i = 0 ; i < 4; i++) {
1019 if (mask & (1<<i)) {
1020 dst = get_dst_reg(c, inst, i);
1021 brw_MOV( p, dst, param0 );
1022 }
1023 }
1024 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1025 brw_set_saturate( p, 0 );
1026
1027 release_tmps( c, mark );
1028 }
1029
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * On exit the scalar noise value (already rescaled by pow( 2, -15 )) is
 * left in the first of the caller-provided parameter temporaries.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
       x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
       xi, yi, zi, /* interpolation coefficients */
       t, tmp[ 8 ], /* float temporaries */
       itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
       wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) copied its three operands into the most
      recently allocated temporaries before invoking this subroutine.
      NOTE(review): the offsets start at mark - 4 rather than mark - 3,
      presumably because one extra temporary slot is consumed by the
      subroutine-call mechanism -- confirm against invoke_subroutine. */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes so the next gradient component is derived from
      different pseudo-random bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* t still holds param2 - 1 from the x/y passes above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1296
1297 static void emit_noise3( struct brw_wm_compile *c,
1298 const struct prog_instruction *inst )
1299 {
1300 struct brw_compile *p = &c->func;
1301 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1302 GLuint mask = inst->DstReg.WriteMask;
1303 int i;
1304 int mark = mark_tmps( c );
1305
1306 assert( mark == 0 );
1307
1308 src0 = get_src_reg( c, inst, 0, 0 );
1309 src1 = get_src_reg( c, inst, 0, 1 );
1310 src2 = get_src_reg( c, inst, 0, 2 );
1311
1312 param0 = alloc_tmp( c );
1313 param1 = alloc_tmp( c );
1314 param2 = alloc_tmp( c );
1315
1316 brw_MOV( p, param0, src0 );
1317 brw_MOV( p, param1, src1 );
1318 brw_MOV( p, param2, src2 );
1319
1320 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1321
1322 /* Fill in the result: */
1323 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1324 for (i = 0 ; i < 4; i++) {
1325 if (mask & (1<<i)) {
1326 dst = get_dst_reg(c, inst, i);
1327 brw_MOV( p, dst, param0 );
1328 }
1329 }
1330 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1331 brw_set_saturate( p, 0 );
1332
1333 release_tmps( c, mark );
1334 }
1335
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * The two passes are driven by the flag register: flags clear selects the
 * w=0 cube, flags set selects w=1.  On exit the scalar noise value
 * (already rescaled by pow( 2, -15 )) is left in the first of the
 * caller-provided parameter temporaries.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
       x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
       w0, /* noise for the w=0 cube */
       floors[ 2 ], /* integer coordinates of base corner of hypercube */
       interp[ 4 ], /* interpolation coefficients */
       t, tmp[ 8 ], /* float temporaries */
       itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
       wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) copied its four operands into the most
      recently allocated temporaries before invoking this subroutine.
      NOTE(review): the offsets start at mark - 5 rather than mark - 4,
      presumably because one extra temporary slot is consumed by the
      subroutine-call mechanism -- confirm against invoke_subroutine. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes so the next gradient component is derived from
      different pseudo-random bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) -- selected via the flag register
      side effect set up before the loop (w=0 pass) or by the explicit
      flag MOV at the bottom of the loop (w=1 pass). */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset is
      unknown at this point; it is patched in below once the loop body
      has been fully emitted. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1719
1720 static void emit_noise4( struct brw_wm_compile *c,
1721 const struct prog_instruction *inst )
1722 {
1723 struct brw_compile *p = &c->func;
1724 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1725 GLuint mask = inst->DstReg.WriteMask;
1726 int i;
1727 int mark = mark_tmps( c );
1728
1729 assert( mark == 0 );
1730
1731 src0 = get_src_reg( c, inst, 0, 0 );
1732 src1 = get_src_reg( c, inst, 0, 1 );
1733 src2 = get_src_reg( c, inst, 0, 2 );
1734 src3 = get_src_reg( c, inst, 0, 3 );
1735
1736 param0 = alloc_tmp( c );
1737 param1 = alloc_tmp( c );
1738 param2 = alloc_tmp( c );
1739 param3 = alloc_tmp( c );
1740
1741 brw_MOV( p, param0, src0 );
1742 brw_MOV( p, param1, src1 );
1743 brw_MOV( p, param2, src2 );
1744 brw_MOV( p, param3, src3 );
1745
1746 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1747
1748 /* Fill in the result: */
1749 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1750 for (i = 0 ; i < 4; i++) {
1751 if (mask & (1<<i)) {
1752 dst = get_dst_reg(c, inst, i);
1753 brw_MOV( p, dst, param0 );
1754 }
1755 }
1756 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1757 brw_set_saturate( p, 0 );
1758
1759 release_tmps( c, mark );
1760 }
1761
/**
 * Resolve subroutine calls after code emit is done.
 *
 * Fix up the recorded CAL sites now that the final location of every
 * subroutine within the program store is known (see brw_resolve_cals).
 */
static void post_wm_emit( struct brw_wm_compile *c )
{
   brw_resolve_cals(&c->func);
}
1769
1770 static void
1771 get_argument_regs(struct brw_wm_compile *c,
1772 const struct prog_instruction *inst,
1773 int index,
1774 struct brw_reg *dst,
1775 struct brw_reg *regs,
1776 int mask)
1777 {
1778 struct brw_compile *p = &c->func;
1779 int i, j;
1780
1781 for (i = 0; i < 4; i++) {
1782 if (mask & (1 << i)) {
1783 regs[i] = get_src_reg(c, inst, index, i);
1784
1785 /* Unalias destination registers from our sources. */
1786 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1787 for (j = 0; j < 4; j++) {
1788 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1789 struct brw_reg tmp = alloc_tmp(c);
1790 brw_MOV(p, tmp, regs[i]);
1791 regs[i] = tmp;
1792 break;
1793 }
1794 }
1795 }
1796 }
1797 }
1798 }
1799
1800 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1801 {
1802 struct intel_context *intel = &brw->intel;
1803 #define MAX_IF_DEPTH 32
1804 #define MAX_LOOP_DEPTH 32
1805 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1806 GLuint i, if_depth = 0, loop_depth = 0;
1807 struct brw_compile *p = &c->func;
1808 struct brw_indirect stack_index = brw_indirect(0, 0);
1809
1810 c->out_of_regs = GL_FALSE;
1811
1812 prealloc_reg(c);
1813 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1814 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1815
1816 for (i = 0; i < c->nr_fp_insns; i++) {
1817 const struct prog_instruction *inst = &c->prog_instructions[i];
1818 int dst_flags;
1819 struct brw_reg args[3][4], dst[4];
1820 int j;
1821 int mark = mark_tmps( c );
1822
1823 c->cur_inst = i;
1824
1825 #if 0
1826 printf("Inst %d: ", i);
1827 _mesa_print_instruction(inst);
1828 #endif
1829
1830 /* fetch any constants that this instruction needs */
1831 if (c->fp->use_const_buffer)
1832 fetch_constants(c, inst);
1833
1834 if (inst->Opcode != OPCODE_ARL) {
1835 for (j = 0; j < 4; j++) {
1836 if (inst->DstReg.WriteMask & (1 << j))
1837 dst[j] = get_dst_reg(c, inst, j);
1838 else
1839 dst[j] = brw_null_reg();
1840 }
1841 }
1842 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1843 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1844
1845 dst_flags = inst->DstReg.WriteMask;
1846 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1847 dst_flags |= SATURATE;
1848
1849 if (inst->CondUpdate)
1850 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1851 else
1852 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1853
1854 switch (inst->Opcode) {
1855 case WM_PIXELXY:
1856 emit_pixel_xy(c, dst, dst_flags);
1857 break;
1858 case WM_DELTAXY:
1859 emit_delta_xy(p, dst, dst_flags, args[0]);
1860 break;
1861 case WM_PIXELW:
1862 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1863 break;
1864 case WM_LINTERP:
1865 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1866 break;
1867 case WM_PINTERP:
1868 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1869 break;
1870 case WM_CINTERP:
1871 emit_cinterp(p, dst, dst_flags, args[0]);
1872 break;
1873 case WM_WPOSXY:
1874 emit_wpos_xy(c, dst, dst_flags, args[0]);
1875 break;
1876 case WM_FB_WRITE:
1877 emit_fb_write(c, args[0], args[1], args[2],
1878 INST_AUX_GET_TARGET(inst->Aux),
1879 inst->Aux & INST_AUX_EOT);
1880 break;
1881 case WM_FRONTFACING:
1882 emit_frontfacing(p, dst, dst_flags);
1883 break;
1884 case OPCODE_ADD:
1885 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1886 break;
1887 case OPCODE_ARL:
1888 emit_arl(c, inst);
1889 break;
1890 case OPCODE_FRC:
1891 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1892 break;
1893 case OPCODE_FLR:
1894 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1895 break;
1896 case OPCODE_LRP:
1897 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1898 break;
1899 case OPCODE_TRUNC:
1900 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1901 break;
1902 case OPCODE_MOV:
1903 case OPCODE_SWZ:
1904 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1905 break;
1906 case OPCODE_DP3:
1907 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1908 break;
1909 case OPCODE_DP4:
1910 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1911 break;
1912 case OPCODE_XPD:
1913 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1914 break;
1915 case OPCODE_DPH:
1916 emit_dph(p, dst, dst_flags, args[0], args[1]);
1917 break;
1918 case OPCODE_RCP:
1919 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1920 break;
1921 case OPCODE_RSQ:
1922 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1923 break;
1924 case OPCODE_SIN:
1925 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1926 break;
1927 case OPCODE_COS:
1928 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1929 break;
1930 case OPCODE_EX2:
1931 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1932 break;
1933 case OPCODE_LG2:
1934 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1935 break;
1936 case OPCODE_CMP:
1937 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1938 break;
1939 case OPCODE_MIN:
1940 emit_min(p, dst, dst_flags, args[0], args[1]);
1941 break;
1942 case OPCODE_MAX:
1943 emit_max(p, dst, dst_flags, args[0], args[1]);
1944 break;
1945 case OPCODE_DDX:
1946 case OPCODE_DDY:
1947 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1948 args[0]);
1949 break;
1950 case OPCODE_SLT:
1951 emit_sop(p, dst, dst_flags,
1952 BRW_CONDITIONAL_L, args[0], args[1]);
1953 break;
1954 case OPCODE_SLE:
1955 emit_sop(p, dst, dst_flags,
1956 BRW_CONDITIONAL_LE, args[0], args[1]);
1957 break;
1958 case OPCODE_SGT:
1959 emit_sop(p, dst, dst_flags,
1960 BRW_CONDITIONAL_G, args[0], args[1]);
1961 break;
1962 case OPCODE_SGE:
1963 emit_sop(p, dst, dst_flags,
1964 BRW_CONDITIONAL_GE, args[0], args[1]);
1965 break;
1966 case OPCODE_SEQ:
1967 emit_sop(p, dst, dst_flags,
1968 BRW_CONDITIONAL_EQ, args[0], args[1]);
1969 break;
1970 case OPCODE_SNE:
1971 emit_sop(p, dst, dst_flags,
1972 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1973 break;
1974 case OPCODE_MUL:
1975 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1976 break;
1977 case OPCODE_POW:
1978 emit_math2(c, BRW_MATH_FUNCTION_POW,
1979 dst, dst_flags, args[0], args[1]);
1980 break;
1981 case OPCODE_MAD:
1982 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1983 break;
1984 case OPCODE_NOISE1:
1985 emit_noise1(c, inst);
1986 break;
1987 case OPCODE_NOISE2:
1988 emit_noise2(c, inst);
1989 break;
1990 case OPCODE_NOISE3:
1991 emit_noise3(c, inst);
1992 break;
1993 case OPCODE_NOISE4:
1994 emit_noise4(c, inst);
1995 break;
1996 case OPCODE_TEX:
1997 emit_tex(c, dst, dst_flags, args[0],
1998 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1999 0, 1, 0, 0),
2000 inst->TexSrcTarget,
2001 inst->TexSrcUnit,
2002 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
2003 break;
2004 case OPCODE_TXB:
2005 emit_txb(c, dst, dst_flags, args[0],
2006 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2007 0, 1, 0, 0),
2008 inst->TexSrcTarget,
2009 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2010 break;
2011 case OPCODE_KIL_NV:
2012 emit_kil(c);
2013 break;
2014 case OPCODE_IF:
2015 assert(if_depth < MAX_IF_DEPTH);
2016 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2017 break;
2018 case OPCODE_ELSE:
2019 assert(if_depth > 0);
2020 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2021 break;
2022 case OPCODE_ENDIF:
2023 assert(if_depth > 0);
2024 brw_ENDIF(p, if_inst[--if_depth]);
2025 break;
2026 case OPCODE_BGNSUB:
2027 brw_save_label(p, inst->Comment, p->nr_insn);
2028 break;
2029 case OPCODE_ENDSUB:
2030 /* no-op */
2031 break;
2032 case OPCODE_CAL:
2033 brw_push_insn_state(p);
2034 brw_set_mask_control(p, BRW_MASK_DISABLE);
2035 brw_set_access_mode(p, BRW_ALIGN_1);
2036 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2037 brw_set_access_mode(p, BRW_ALIGN_16);
2038 brw_ADD(p, get_addr_reg(stack_index),
2039 get_addr_reg(stack_index), brw_imm_d(4));
2040 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2041 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2042 brw_pop_insn_state(p);
2043 break;
2044
2045 case OPCODE_RET:
2046 brw_push_insn_state(p);
2047 brw_set_mask_control(p, BRW_MASK_DISABLE);
2048 brw_ADD(p, get_addr_reg(stack_index),
2049 get_addr_reg(stack_index), brw_imm_d(-4));
2050 brw_set_access_mode(p, BRW_ALIGN_1);
2051 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2052 brw_set_access_mode(p, BRW_ALIGN_16);
2053 brw_pop_insn_state(p);
2054
2055 break;
2056 case OPCODE_BGNLOOP:
2057 /* XXX may need to invalidate the current_constant regs */
2058 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2059 break;
2060 case OPCODE_BRK:
2061 brw_BREAK(p);
2062 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2063 break;
2064 case OPCODE_CONT:
2065 brw_CONT(p);
2066 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2067 break;
2068 case OPCODE_ENDLOOP:
2069 {
2070 struct brw_instruction *inst0, *inst1;
2071 GLuint br = 1;
2072
2073 if (intel->is_ironlake)
2074 br = 2;
2075
2076 assert(loop_depth > 0);
2077 loop_depth--;
2078 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2079 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2080 while (inst0 > loop_inst[loop_depth]) {
2081 inst0--;
2082 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2083 inst0->bits3.if_else.jump_count == 0) {
2084 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2085 inst0->bits3.if_else.pop_count = 0;
2086 }
2087 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2088 inst0->bits3.if_else.jump_count == 0) {
2089 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2090 inst0->bits3.if_else.pop_count = 0;
2091 }
2092 }
2093 }
2094 break;
2095 default:
2096 printf("unsupported opcode %d (%s) in fragment shader\n",
2097 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2098 _mesa_opcode_string(inst->Opcode) : "unknown");
2099 }
2100
2101 /* Release temporaries containing any unaliased source regs. */
2102 release_tmps( c, mark );
2103
2104 if (inst->CondUpdate)
2105 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2106 else
2107 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2108 }
2109 post_wm_emit(c);
2110
2111 if (INTEL_DEBUG & DEBUG_WM) {
2112 printf("wm-native:\n");
2113 for (i = 0; i < p->nr_insn; i++)
2114 brw_disasm(stderr, &p->store[i]);
2115 printf("\n");
2116 }
2117 }
2118
2119 /**
2120 * Do GPU code generation for shaders that use GLSL features such as
2121 * flow control. Other shaders will be compiled with the
2122 */
2123 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2124 {
2125 if (INTEL_DEBUG & DEBUG_WM) {
2126 printf("brw_wm_glsl_emit:\n");
2127 }
2128
2129 /* initial instruction translation/simplification */
2130 brw_wm_pass_fp(c);
2131
2132 /* actual code generation */
2133 brw_wm_emit_glsl(brw, c);
2134
2135 if (INTEL_DEBUG & DEBUG_WM) {
2136 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2137 }
2138
2139 c->prog_data.total_grf = num_grf_used(c);
2140 c->prog_data.total_scratch = 0;
2141 }