/* Merge branch '7.8'
 * [mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
 */
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/* Identifiers for shared subroutines that are emitted once and then
 * re-invoked via invoke_subroutine().
 */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
/** Mark GRF register \p r as used (no-op if it already was). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
63
64
65 /** Mark given GRF register as not in use. */
66 static void
67 release_grf(struct brw_wm_compile *c, int r)
68 {
69 /*assert(c->used_grf[r]);*/
70 c->used_grf[r] = GL_FALSE;
71 c->first_free_grf = MIN2(c->first_free_grf, r);
72 }
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
/**
 * Record the mapping of a Mesa register to a hardware register.
 *
 * \param file       Mesa register file (one of PROGRAM_x)
 * \param index      register index within that file
 * \param component  XYZW channel (0..3)
 * \param reg        hardware register now backing that channel
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current temp-pool position, to be passed to release_tmps()
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
169
/** Return the vec8 GRF backing the temp at pool position \p index. */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
174
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
180 /**
181 * Convert Mesa src register to brw register.
182 *
183 * Since we're running in SOA mode each Mesa register corresponds to four
184 * hardware registers. We allocate the hardware registers as needed here.
185 *
186 * \param file register file, one of PROGRAM_x
187 * \param index register number
188 * \param component src component (X=0, Y=1, Z=2, W=3)
189 * \param nr not used?!?
190 * \param neg negate value?
191 * \param abs take absolute value?
192 */
193 static struct brw_reg
194 get_reg(struct brw_wm_compile *c, int file, int index, int component,
195 int nr, GLuint neg, GLuint abs)
196 {
197 struct brw_reg reg;
198 switch (file) {
199 case PROGRAM_STATE_VAR:
200 case PROGRAM_CONSTANT:
201 case PROGRAM_UNIFORM:
202 file = PROGRAM_STATE_VAR;
203 break;
204 case PROGRAM_UNDEFINED:
205 return brw_null_reg();
206 case PROGRAM_TEMPORARY:
207 case PROGRAM_INPUT:
208 case PROGRAM_OUTPUT:
209 case PROGRAM_PAYLOAD:
210 break;
211 default:
212 _mesa_problem(NULL, "Unexpected file in get_reg()");
213 return brw_null_reg();
214 }
215
216 assert(index < 256);
217 assert(component < 4);
218
219 /* see if we've already allocated a HW register for this Mesa register */
220 if (c->wm_regs[file][index][component].inited) {
221 /* yes, re-use */
222 reg = c->wm_regs[file][index][component].reg;
223 }
224 else {
225 /* no, allocate new register */
226 int grf = alloc_grf(c);
227 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
228 if (grf < 0) {
229 /* totally out of temps */
230 grf = 51; /* XXX random register! */
231 }
232
233 reg = brw_vec8_grf(grf, 0);
234 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
235
236 set_reg(c, file, index, component, reg);
237 }
238
239 if (neg & (1 << component)) {
240 reg = negate(reg);
241 }
242 if (abs)
243 reg = brw_abs(reg);
244 return reg;
245 }
246
247
248
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /*printf("Reclaim temps:\n");*/

   /* compute first/last instruction index of each temp's live interval */
   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* live interval ended before the instruction currently being emitted? */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         /*printf("  temp[%d] is dead\n", index);*/
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /*
               printf("  Reclaim temp %d, reg %d at inst %d\n",
                      index, r, c->cur_inst);
               */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload registers: two GRFs per depth reg, starting at g0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist =
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               /* eight floats fit per GRF, so advance a reg every 8 */
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map each written VP output to the payload
    * GRF where the URB data for it lands */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;  /* no corresponding FP input */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         /* each VP output occupies two GRFs in the payload */
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-numbered GRFs for an adjacent free pair */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* fall through */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
474 /**
475 * Check if any of the instruction's src registers are constants, uniforms,
476 * or statevars. If so, fetch any constants that we don't already have in
477 * the three GRF slots.
478 */
479 static void fetch_constants(struct brw_wm_compile *c,
480 const struct prog_instruction *inst)
481 {
482 struct brw_compile *p = &c->func;
483 GLuint i;
484
485 /* loop over instruction src regs */
486 for (i = 0; i < 3; i++) {
487 const struct prog_src_register *src = &inst->SrcReg[i];
488 if (src->File == PROGRAM_STATE_VAR ||
489 src->File == PROGRAM_CONSTANT ||
490 src->File == PROGRAM_UNIFORM) {
491 c->current_const[i].index = src->Index;
492
493 #if 0
494 printf(" fetch const[%d] for arg %d into reg %d\n",
495 src->Index, i, c->current_const[i].reg.nr);
496 #endif
497
498 /* need to fetch the constant now */
499 brw_dp_READ_4(p,
500 c->current_const[i].reg, /* writeback dest */
501 src->RelAddr, /* relative indexing? */
502 16 * src->Index, /* byte offset */
503 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
504 );
505 }
506 }
507 }
508
509
/**
 * Convert Mesa dst register to brw register.
 * \param component  XYZW channel of the destination (0..3)
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
   const int nr = 1;  /* unused by get_reg() */
   /* destinations never carry negate/abs modifiers */
   return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                  0, 0);
}
521
522
/**
 * Return a register describing one component of a constant that was
 * previously fetched into a GRF by fetch_constants().
 *
 * \param srcRegIndex  which of the instruction's three src slots (0..2)
 * \param component    XYZW channel to extract (0..3)
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   /* We should have already fetched the constant from the constant
    * buffer in fetch_constants().  Now we just have to return a
    * register description that extracts the needed component and
    * smears it across all eight vector components.
    */
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;  /* byte offset of the wanted float */

   /* apply the source's per-component negate / absolute-value modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
560
561
/**
 * Convert Mesa src register to brw register, applying the source's
 * swizzle to pick the channel.
 *
 * \param srcRegIndex  which of the instruction's src slots (0..2)
 * \param channel      XYZW channel before swizzling (0..3)
 */
static struct brw_reg get_src_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint srcRegIndex, GLuint channel)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   const GLuint nr = 1;  /* unused by get_reg() */
   /* resolve the swizzle: which component this channel actually reads */
   const GLuint component = GET_SWZ(src->Swizzle, channel);

   /* Only one immediate value can be used per native opcode, and it
    * has be in the src1 slot, so not all Mesa instructions will get
    * to take advantage of immediate constants.
    */
   if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
      const struct gl_program_parameter_list *params;

      params = c->fp->program.Base.Parameters;

      /* Extended swizzle terms */
      if (component == SWIZZLE_ZERO) {
         return brw_imm_f(0.0F);
      } else if (component == SWIZZLE_ONE) {
         return brw_imm_f(1.0F);
      }

      if (src->File == PROGRAM_CONSTANT) {
         /* bake the constant's value directly into the instruction */
         return brw_imm_f(params->ParameterValues[src->Index][component]);
      }
   }

   if (c->fp->use_const_buffer &&
       (src->File == PROGRAM_STATE_VAR ||
        src->File == PROGRAM_CONSTANT ||
        src->File == PROGRAM_UNIFORM)) {
      /* the constant was fetched into a GRF slot by fetch_constants() */
      return get_src_reg_const(c, inst, srcRegIndex, component);
   }
   else {
      /* other type of source register */
      return get_reg(c, src->File, src->Index, component, nr,
                     src->Negate, src->Abs);
   }
}
606
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
					      BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = IP + 2 instructions (each insn is 16 bytes) */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump to the already-emitted body: offset in instructions << 4 */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	       brw_imm_d( ( c->subroutines[ subroutine ] -
			    here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
					      BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* src1 of this ADD is a placeholder, patched below once the
       * length of the subroutine body is known */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return: jump back through the saved return address */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* patch the placeholder so return_address points just past the
       * return MOV emitted above */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
670
/**
 * Emit Mesa OPCODE_ARL: load the address register (a0) from channel 0
 * of the instruction's first source operand.
 */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, addr_reg;
   brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
   addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                          BRW_ARF_ADDRESS, 0);
   src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
   brw_MOV(p, addr_reg, src0);
   brw_set_saturate(p, 0);  /* restore default saturate state */
}
683
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   /* g0.0:UW in the thread payload (named "depth" here; NOTE(review):
    * appears to be the live-pixel mask rather than depth -- confirm
    * against the WM thread payload layout) */
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   /* clear payload bits for the channels currently executing */
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
698
/** View \p reg as 16 words, selecting the high word of each dword. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
		  0, 8, 2 );
}
704
/** View \p reg as 16 words, selecting the low word of each dword. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
709
/** View \p reg as bytes, selecting the even-indexed byte of each word. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
714
/** View \p reg as bytes, selecting the odd-indexed byte of each word. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
		  0, 16, 2 );
}
720
721 /* One-, two- and three-dimensional Perlin noise, similar to the description
722 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/* One-dimensional noise subroutine: hashes the two integer lattice ends
 * of the input coordinate, forms gradients from the hashes, and blends
 * them with Perlin's quintic polynomial.  The input coordinate (and the
 * result) live in the caller's temp at pool slot mark-2 (see
 * emit_noise1()). */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* the caller stored the input coordinate just below our mark */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					  pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
813
814 static void emit_noise1( struct brw_wm_compile *c,
815 const struct prog_instruction *inst )
816 {
817 struct brw_compile *p = &c->func;
818 struct brw_reg src, param, dst;
819 GLuint mask = inst->DstReg.WriteMask;
820 int i;
821 int mark = mark_tmps( c );
822
823 assert( mark == 0 );
824
825 src = get_src_reg( c, inst, 0, 0 );
826
827 param = alloc_tmp( c );
828
829 brw_MOV( p, param, src );
830
831 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
832
833 /* Fill in the result: */
834 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
835 for (i = 0 ; i < 4; i++) {
836 if (mask & (1<<i)) {
837 dst = get_dst_reg(c, inst, i);
838 brw_MOV( p, dst, param );
839 }
840 }
841 if( inst->SaturateMode == SATURATE_ZERO_ONE )
842 brw_set_saturate( p, 0 );
843
844 release_tmps( c, mark );
845 }
846
/* Two-dimensional noise subroutine: hashes the four corners of the unit
 * square containing the input point, forms a gradient at each corner,
 * then interpolates in y and then x with Perlin's quintic polynomial.
 * Inputs arrive in the caller's temps at pool slots mark-3/mark-2; the
 * result is left in the mark-3 slot (see emit_noise2()). */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* the caller stored the two input coordinates just below our mark */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	    low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
983
984 static void emit_noise2( struct brw_wm_compile *c,
985 const struct prog_instruction *inst )
986 {
987 struct brw_compile *p = &c->func;
988 struct brw_reg src0, src1, param0, param1, dst;
989 GLuint mask = inst->DstReg.WriteMask;
990 int i;
991 int mark = mark_tmps( c );
992
993 assert( mark == 0 );
994
995 src0 = get_src_reg( c, inst, 0, 0 );
996 src1 = get_src_reg( c, inst, 0, 1 );
997
998 param0 = alloc_tmp( c );
999 param1 = alloc_tmp( c );
1000
1001 brw_MOV( p, param0, src0 );
1002 brw_MOV( p, param1, src1 );
1003
1004 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1005
1006 /* Fill in the result: */
1007 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1008 for (i = 0 ; i < 4; i++) {
1009 if (mask & (1<<i)) {
1010 dst = get_dst_reg(c, inst, i);
1011 brw_MOV( p, dst, param0 );
1012 }
1013 }
1014 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1015 brw_set_saturate( p, 0 );
1016
1017 release_tmps( c, mark );
1018 }
1019
1020 /**
1021 * The three-dimensional case is much like the one- and two- versions above,
1022 * but since the number of corners is rapidly growing we now pack 16 16-bit
1023 * hashes into each register to extract more parallelism from the EUs.
1024 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   /* Scratch registers for the per-corner gradients and the smoothed
      interpolation coefficients. */
   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) passed the three input coordinates in the
      most recently allocated temporaries; locate them relative to our own
      mark.  The scalar result is handed back to the caller in param0
      (see the final MUL below). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words so that the next gradient component is taken
      from different bits.  NOTE(review): the shift amount here is 5,
      while the 4D variant (noise4_sub) uses 4 — presumably tuned per
      case; confirm against the original derivation. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* t switches back to an x-based factor here, ready for the front
      face below. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin).  The three repeated MUL groups
      below supply the remaining t^3 factor of each term. */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above; the scaled result is
      returned to the caller in param0 */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1286
1287 static void emit_noise3( struct brw_wm_compile *c,
1288 const struct prog_instruction *inst )
1289 {
1290 struct brw_compile *p = &c->func;
1291 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1292 GLuint mask = inst->DstReg.WriteMask;
1293 int i;
1294 int mark = mark_tmps( c );
1295
1296 assert( mark == 0 );
1297
1298 src0 = get_src_reg( c, inst, 0, 0 );
1299 src1 = get_src_reg( c, inst, 0, 1 );
1300 src2 = get_src_reg( c, inst, 0, 2 );
1301
1302 param0 = alloc_tmp( c );
1303 param1 = alloc_tmp( c );
1304 param2 = alloc_tmp( c );
1305
1306 brw_MOV( p, param0, src0 );
1307 brw_MOV( p, param1, src1 );
1308 brw_MOV( p, param2, src2 );
1309
1310 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1311
1312 /* Fill in the result: */
1313 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1314 for (i = 0 ; i < 4; i++) {
1315 if (mask & (1<<i)) {
1316 dst = get_dst_reg(c, inst, i);
1317 brw_MOV( p, dst, param0 );
1318 }
1319 }
1320 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1321 brw_set_saturate( p, 0 );
1322
1323 release_tmps( c, mark );
1324 }
1325
1326 /**
1327 * For the four-dimensional case, the little micro-optimisation benefits
1328 * we obtain by unrolling all the loops aren't worth the massive bloat it
1329 * now causes. Instead, we loop twice around performing a similar operation
1330 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1331 * code to glue it all together.
1332 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) passed the four input coordinates in the
      most recently allocated temporaries; locate them relative to our
      own mark.  The scalar result is handed back in param[ 0 ]. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   /* ...and the remaining t^3 factor of each term. */
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words so that the next gradient component is taken
      from different bits (shift amount 4 here versus 5 in noise3_sub,
      since four components must be extracted from each hash). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time.  The predicate selects between the
      two using the flag register set up before/at the bottom of the
      loop. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset is
      a placeholder (0) here; it is patched below via brw_set_src1 once
      the destination address is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector).
      NOTE(review): jump distances are scaled by << 4, i.e. expressed in
      16-byte (one instruction) units — confirm against the EU ISA. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above; the scaled result is
      returned to the caller in param[ 0 ] */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1709
1710 static void emit_noise4( struct brw_wm_compile *c,
1711 const struct prog_instruction *inst )
1712 {
1713 struct brw_compile *p = &c->func;
1714 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1715 GLuint mask = inst->DstReg.WriteMask;
1716 int i;
1717 int mark = mark_tmps( c );
1718
1719 assert( mark == 0 );
1720
1721 src0 = get_src_reg( c, inst, 0, 0 );
1722 src1 = get_src_reg( c, inst, 0, 1 );
1723 src2 = get_src_reg( c, inst, 0, 2 );
1724 src3 = get_src_reg( c, inst, 0, 3 );
1725
1726 param0 = alloc_tmp( c );
1727 param1 = alloc_tmp( c );
1728 param2 = alloc_tmp( c );
1729 param3 = alloc_tmp( c );
1730
1731 brw_MOV( p, param0, src0 );
1732 brw_MOV( p, param1, src1 );
1733 brw_MOV( p, param2, src2 );
1734 brw_MOV( p, param3, src3 );
1735
1736 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1737
1738 /* Fill in the result: */
1739 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1740 for (i = 0 ; i < 4; i++) {
1741 if (mask & (1<<i)) {
1742 dst = get_dst_reg(c, inst, i);
1743 brw_MOV( p, dst, param0 );
1744 }
1745 }
1746 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1747 brw_set_saturate( p, 0 );
1748
1749 release_tmps( c, mark );
1750 }
1751
1752 /**
1753 * Resolve subroutine calls after code emit is done.
1754 */
1755 static void post_wm_emit( struct brw_wm_compile *c )
1756 {
1757 brw_resolve_cals(&c->func);
1758 }
1759
1760 static void
1761 get_argument_regs(struct brw_wm_compile *c,
1762 const struct prog_instruction *inst,
1763 int index,
1764 struct brw_reg *dst,
1765 struct brw_reg *regs,
1766 int mask)
1767 {
1768 struct brw_compile *p = &c->func;
1769 int i, j;
1770
1771 for (i = 0; i < 4; i++) {
1772 if (mask & (1 << i)) {
1773 regs[i] = get_src_reg(c, inst, index, i);
1774
1775 /* Unalias destination registers from our sources. */
1776 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1777 for (j = 0; j < 4; j++) {
1778 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1779 struct brw_reg tmp = alloc_tmp(c);
1780 brw_MOV(p, tmp, regs[i]);
1781 regs[i] = tmp;
1782 break;
1783 }
1784 }
1785 }
1786 }
1787 }
1788 }
1789
1790 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1791 {
1792 struct intel_context *intel = &brw->intel;
1793 #define MAX_IF_DEPTH 32
1794 #define MAX_LOOP_DEPTH 32
1795 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1796 GLuint i, if_depth = 0, loop_depth = 0;
1797 struct brw_compile *p = &c->func;
1798 struct brw_indirect stack_index = brw_indirect(0, 0);
1799
1800 c->out_of_regs = GL_FALSE;
1801
1802 prealloc_reg(c);
1803 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1804 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1805
1806 for (i = 0; i < c->nr_fp_insns; i++) {
1807 const struct prog_instruction *inst = &c->prog_instructions[i];
1808 int dst_flags;
1809 struct brw_reg args[3][4], dst[4];
1810 int j;
1811 int mark = mark_tmps( c );
1812
1813 c->cur_inst = i;
1814
1815 #if 0
1816 printf("Inst %d: ", i);
1817 _mesa_print_instruction(inst);
1818 #endif
1819
1820 /* fetch any constants that this instruction needs */
1821 if (c->fp->use_const_buffer)
1822 fetch_constants(c, inst);
1823
1824 if (inst->Opcode != OPCODE_ARL) {
1825 for (j = 0; j < 4; j++) {
1826 if (inst->DstReg.WriteMask & (1 << j))
1827 dst[j] = get_dst_reg(c, inst, j);
1828 else
1829 dst[j] = brw_null_reg();
1830 }
1831 }
1832 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1833 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1834
1835 dst_flags = inst->DstReg.WriteMask;
1836 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1837 dst_flags |= SATURATE;
1838
1839 if (inst->CondUpdate)
1840 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1841 else
1842 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1843
1844 switch (inst->Opcode) {
1845 case WM_PIXELXY:
1846 emit_pixel_xy(c, dst, dst_flags);
1847 break;
1848 case WM_DELTAXY:
1849 emit_delta_xy(p, dst, dst_flags, args[0]);
1850 break;
1851 case WM_PIXELW:
1852 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1853 break;
1854 case WM_LINTERP:
1855 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1856 break;
1857 case WM_PINTERP:
1858 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1859 break;
1860 case WM_CINTERP:
1861 emit_cinterp(p, dst, dst_flags, args[0]);
1862 break;
1863 case WM_WPOSXY:
1864 emit_wpos_xy(c, dst, dst_flags, args[0]);
1865 break;
1866 case WM_FB_WRITE:
1867 emit_fb_write(c, args[0], args[1], args[2],
1868 INST_AUX_GET_TARGET(inst->Aux),
1869 inst->Aux & INST_AUX_EOT);
1870 break;
1871 case WM_FRONTFACING:
1872 emit_frontfacing(p, dst, dst_flags);
1873 break;
1874 case OPCODE_ADD:
1875 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1876 break;
1877 case OPCODE_ARL:
1878 emit_arl(c, inst);
1879 break;
1880 case OPCODE_FRC:
1881 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1882 break;
1883 case OPCODE_FLR:
1884 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1885 break;
1886 case OPCODE_LRP:
1887 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1888 break;
1889 case OPCODE_TRUNC:
1890 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1891 break;
1892 case OPCODE_MOV:
1893 case OPCODE_SWZ:
1894 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1895 break;
1896 case OPCODE_DP3:
1897 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1898 break;
1899 case OPCODE_DP4:
1900 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1901 break;
1902 case OPCODE_XPD:
1903 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1904 break;
1905 case OPCODE_DPH:
1906 emit_dph(p, dst, dst_flags, args[0], args[1]);
1907 break;
1908 case OPCODE_RCP:
1909 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1910 break;
1911 case OPCODE_RSQ:
1912 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1913 break;
1914 case OPCODE_SIN:
1915 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1916 break;
1917 case OPCODE_COS:
1918 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1919 break;
1920 case OPCODE_EX2:
1921 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1922 break;
1923 case OPCODE_LG2:
1924 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1925 break;
1926 case OPCODE_CMP:
1927 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1928 break;
1929 case OPCODE_MIN:
1930 emit_min(p, dst, dst_flags, args[0], args[1]);
1931 break;
1932 case OPCODE_MAX:
1933 emit_max(p, dst, dst_flags, args[0], args[1]);
1934 break;
1935 case OPCODE_DDX:
1936 case OPCODE_DDY:
1937 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1938 args[0]);
1939 break;
1940 case OPCODE_SLT:
1941 emit_sop(p, dst, dst_flags,
1942 BRW_CONDITIONAL_L, args[0], args[1]);
1943 break;
1944 case OPCODE_SLE:
1945 emit_sop(p, dst, dst_flags,
1946 BRW_CONDITIONAL_LE, args[0], args[1]);
1947 break;
1948 case OPCODE_SGT:
1949 emit_sop(p, dst, dst_flags,
1950 BRW_CONDITIONAL_G, args[0], args[1]);
1951 break;
1952 case OPCODE_SGE:
1953 emit_sop(p, dst, dst_flags,
1954 BRW_CONDITIONAL_GE, args[0], args[1]);
1955 break;
1956 case OPCODE_SEQ:
1957 emit_sop(p, dst, dst_flags,
1958 BRW_CONDITIONAL_EQ, args[0], args[1]);
1959 break;
1960 case OPCODE_SNE:
1961 emit_sop(p, dst, dst_flags,
1962 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1963 break;
1964 case OPCODE_MUL:
1965 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1966 break;
1967 case OPCODE_POW:
1968 emit_math2(c, BRW_MATH_FUNCTION_POW,
1969 dst, dst_flags, args[0], args[1]);
1970 break;
1971 case OPCODE_MAD:
1972 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1973 break;
1974 case OPCODE_NOISE1:
1975 emit_noise1(c, inst);
1976 break;
1977 case OPCODE_NOISE2:
1978 emit_noise2(c, inst);
1979 break;
1980 case OPCODE_NOISE3:
1981 emit_noise3(c, inst);
1982 break;
1983 case OPCODE_NOISE4:
1984 emit_noise4(c, inst);
1985 break;
1986 case OPCODE_TEX:
1987 emit_tex(c, dst, dst_flags, args[0],
1988 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1989 0, 1, 0, 0),
1990 inst->TexSrcTarget,
1991 inst->TexSrcUnit,
1992 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1993 break;
1994 case OPCODE_TXB:
1995 emit_txb(c, dst, dst_flags, args[0],
1996 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1997 0, 1, 0, 0),
1998 inst->TexSrcTarget,
1999 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2000 break;
2001 case OPCODE_KIL_NV:
2002 emit_kil(c);
2003 break;
2004 case OPCODE_IF:
2005 assert(if_depth < MAX_IF_DEPTH);
2006 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2007 break;
2008 case OPCODE_ELSE:
2009 assert(if_depth > 0);
2010 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2011 break;
2012 case OPCODE_ENDIF:
2013 assert(if_depth > 0);
2014 brw_ENDIF(p, if_inst[--if_depth]);
2015 break;
2016 case OPCODE_BGNSUB:
2017 brw_save_label(p, inst->Comment, p->nr_insn);
2018 break;
2019 case OPCODE_ENDSUB:
2020 /* no-op */
2021 break;
2022 case OPCODE_CAL:
2023 brw_push_insn_state(p);
2024 brw_set_mask_control(p, BRW_MASK_DISABLE);
2025 brw_set_access_mode(p, BRW_ALIGN_1);
2026 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2027 brw_set_access_mode(p, BRW_ALIGN_16);
2028 brw_ADD(p, get_addr_reg(stack_index),
2029 get_addr_reg(stack_index), brw_imm_d(4));
2030 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2031 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2032 brw_pop_insn_state(p);
2033 break;
2034
2035 case OPCODE_RET:
2036 brw_push_insn_state(p);
2037 brw_set_mask_control(p, BRW_MASK_DISABLE);
2038 brw_ADD(p, get_addr_reg(stack_index),
2039 get_addr_reg(stack_index), brw_imm_d(-4));
2040 brw_set_access_mode(p, BRW_ALIGN_1);
2041 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2042 brw_set_access_mode(p, BRW_ALIGN_16);
2043 brw_pop_insn_state(p);
2044
2045 break;
2046 case OPCODE_BGNLOOP:
2047 /* XXX may need to invalidate the current_constant regs */
2048 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2049 break;
2050 case OPCODE_BRK:
2051 brw_BREAK(p);
2052 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2053 break;
2054 case OPCODE_CONT:
2055 brw_CONT(p);
2056 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2057 break;
2058 case OPCODE_ENDLOOP:
2059 {
2060 struct brw_instruction *inst0, *inst1;
2061 GLuint br = 1;
2062
2063 if (intel->is_ironlake)
2064 br = 2;
2065
2066 assert(loop_depth > 0);
2067 loop_depth--;
2068 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2069 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2070 while (inst0 > loop_inst[loop_depth]) {
2071 inst0--;
2072 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2073 inst0->bits3.if_else.jump_count == 0) {
2074 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2075 inst0->bits3.if_else.pop_count = 0;
2076 }
2077 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2078 inst0->bits3.if_else.jump_count == 0) {
2079 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2080 inst0->bits3.if_else.pop_count = 0;
2081 }
2082 }
2083 }
2084 break;
2085 default:
2086 printf("unsupported opcode %d (%s) in fragment shader\n",
2087 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2088 _mesa_opcode_string(inst->Opcode) : "unknown");
2089 }
2090
2091 /* Release temporaries containing any unaliased source regs. */
2092 release_tmps( c, mark );
2093
2094 if (inst->CondUpdate)
2095 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2096 else
2097 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2098 }
2099 post_wm_emit(c);
2100
2101 if (INTEL_DEBUG & DEBUG_WM) {
2102 printf("wm-native:\n");
2103 for (i = 0; i < p->nr_insn; i++)
2104 brw_disasm(stderr, &p->store[i]);
2105 printf("\n");
2106 }
2107 }
2108
2109 /**
2110 * Do GPU code generation for shaders that use GLSL features such as
2111 * flow control. Other shaders will be compiled with the
2112 */
2113 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2114 {
2115 if (INTEL_DEBUG & DEBUG_WM) {
2116 printf("brw_wm_glsl_emit:\n");
2117 }
2118
2119 /* initial instruction translation/simplification */
2120 brw_wm_pass_fp(c);
2121
2122 /* actual code generation */
2123 brw_wm_emit_glsl(brw, c);
2124
2125 if (INTEL_DEBUG & DEBUG_WM) {
2126 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2127 }
2128
2129 c->prog_data.total_grf = num_grf_used(c);
2130 c->prog_data.total_scratch = 0;
2131 }