i965: Add INTEL_DEBUG=glsl_force to force brw_wm_glsl.c.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
57 /** Mark GRF register as used. */
58 static void
59 prealloc_grf(struct brw_wm_compile *c, int r)
60 {
61 c->used_grf[r] = GL_TRUE;
62 }
63
64
65 /** Mark given GRF register as not in use. */
66 static void
67 release_grf(struct brw_wm_compile *c, int r)
68 {
69 /*assert(c->used_grf[r]);*/
70 c->used_grf[r] = GL_FALSE;
71 c->first_free_grf = MIN2(c->first_free_grf, r);
72 }
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
129 /**
130 * Record the mapping of a Mesa register to a hardware register.
131 */
132 static void set_reg(struct brw_wm_compile *c, int file, int index,
133 int component, struct brw_reg reg)
134 {
135 c->wm_regs[file][index][component].reg = reg;
136 c->wm_regs[file][index][component].inited = GL_TRUE;
137 }
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   /* The mark is simply the current depth of the temp stack. */
   return c->tmp_index;
}
169
170 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
171 {
172 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
173 }
174
/** Pop the temp stack back to the state saved by mark_tmps(). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
/**
 * Convert Mesa src register to brw register.
 *
 * Since we're running in SOA mode each Mesa register corresponds to four
 * hardware registers.  We allocate the hardware registers as needed here.
 *
 * \param file register file, one of PROGRAM_x
 * \param index register number
 * \param component src component (X=0, Y=1, Z=2, W=3)
 * \param nr not used?!?
 * \param neg per-component negate bitmask
 * \param abs take absolute value?
 */
static struct brw_reg
get_reg(struct brw_wm_compile *c, int file, int index, int component,
        int nr, GLuint neg, GLuint abs)
{
   struct brw_reg reg;
   switch (file) {
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* all constant-like files share one slot in the register map */
      file = PROGRAM_STATE_VAR;
      break;
   case PROGRAM_UNDEFINED:
      return brw_null_reg();
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
   case PROGRAM_PAYLOAD:
      break;
   default:
      _mesa_problem(NULL, "Unexpected file in get_reg()");
      return brw_null_reg();
   }

   assert(index < 256);
   assert(component < 4);

   /* see if we've already allocated a HW register for this Mesa register */
   if (c->wm_regs[file][index][component].inited) {
      /* yes, re-use */
      reg = c->wm_regs[file][index][component].reg;
   }
   else {
      /* no, allocate new register */
      int grf = alloc_grf(c);
      if (grf < 0) {
         /* totally out of temps */
         grf = 51; /* XXX random register! */
      }

      reg = brw_vec8_grf(grf, 0);

      /* remember the mapping for subsequent lookups */
      set_reg(c, file, index, component, reg);
   }

   /* apply source modifiers */
   if (neg & (1 << component)) {
      reg = negate(reg);
   }
   if (abs)
      reg = brw_abs(reg);
   return reg;
}
246
247
248
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /* compute [first use, last use] instruction intervals per temp */
   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* a temp is dead if its last use precedes the current instruction */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /* invalidate the mapping so the slot can be re-allocated */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   /* start with an empty allocation map */
   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* Depth payload: each depth reg spans two GRFs (hence i * 2);
    * unused components just alias g0. */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist =
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map vertex results onto FP attributes */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;   /* vertex result with no FP counterpart */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         /* NOTE(review): this records the GRF offset of the last used
          * input, not a cumulative count — confirm intended semantics. */
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         /* every written VP output advances the payload by two GRFs,
          * even if the FP doesn't read it */
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-aligned GRF pairs */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* falls through to default, which is a no-op */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
/**
 * Check if any of the instruction's src registers are constants, uniforms,
 * or statevars.  If so, fetch any constants that we don't already have in
 * the three GRF slots.
 */
static void fetch_constants(struct brw_wm_compile *c,
                            const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   GLuint i;

   /* loop over instruction src regs */
   for (i = 0; i < 3; i++) {
      const struct prog_src_register *src = &inst->SrcReg[i];
      if (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
          src->File == PROGRAM_UNIFORM) {
         /* NOTE(review): the fetch is issued even when current_const[i].index
          * already equals src->Index — possibly a redundant read; confirm. */
         c->current_const[i].index = src->Index;

#if 0
         printf("  fetch const[%d] for arg %d into reg %d\n",
                src->Index, i, c->current_const[i].reg.nr);
#endif

         /* need to fetch the constant now */
         brw_dp_READ_4(p,
                       c->current_const[i].reg,  /* writeback dest */
                       src->RelAddr,             /* relative indexing? */
                       16 * src->Index,          /* byte offset */
                       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
                       );
      }
   }
}
508
509
510 /**
511 * Convert Mesa dst register to brw register.
512 */
513 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
514 const struct prog_instruction *inst,
515 GLuint component)
516 {
517 const int nr = 1;
518 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
519 0, 0);
520 }
521
522
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   /* We should have already fetched the constant from the constant
    * buffer in fetch_constants().  Now we just have to return a
    * register description that extracts the needed component and
    * smears it across all eight vector components.
    */
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   /* fetch_constants() must have populated this slot */
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;   /* byte offset of the float */

   /* apply per-component source modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
560
561
562 /**
563 * Convert Mesa src register to brw register.
564 */
565 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
566 const struct prog_instruction *inst,
567 GLuint srcRegIndex, GLuint channel)
568 {
569 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
570 const GLuint nr = 1;
571 const GLuint component = GET_SWZ(src->Swizzle, channel);
572
573 /* Extended swizzle terms */
574 if (component == SWIZZLE_ZERO) {
575 return brw_imm_f(0.0F);
576 }
577 else if (component == SWIZZLE_ONE) {
578 return brw_imm_f(1.0F);
579 }
580
581 if (c->fp->use_const_buffer &&
582 (src->File == PROGRAM_STATE_VAR ||
583 src->File == PROGRAM_CONSTANT ||
584 src->File == PROGRAM_UNIFORM)) {
585 return get_src_reg_const(c, inst, srcRegIndex, component);
586 }
587 else {
588 /* other type of source register */
589 return get_reg(c, src->File, src->Index, component, nr,
590 src->Negate, src->Abs);
591 }
592 }
593
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = IP + 2 instructions (this ADD plus the jump
       * below); offsets appear to be in 16-byte instruction units,
       * hence the << 4 */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump into the previously-emitted body via a relative IP add */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
               brw_imm_d( ( c->subroutines[ subroutine ] -
                            here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* The skip distance past the inline body isn't known yet, so
       * emit with a 0 immediate and patch it below via brw_set_src1(). */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      /* remember where this subroutine's body begins */
      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return: restore the saved IP */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* patch the ADD so inline execution falls past the emitted body */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
657
/** ARL: move channel 0 of src into the address register (a0). */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, addr_reg;
   /* honour the instruction's saturate mode for the MOV below */
   brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
   addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                          BRW_ARF_ADDRESS, 0);
   src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
   brw_MOV(p, addr_reg, src0);
   brw_set_saturate(p, 0);
}
670
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   /* NOTE(review): g0.0 is treated here as the live-pixel mask word in
    * the thread payload — confirm against the payload layout. */
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   /* invert the invocation mask, then AND it into the payload word so
    * channels active at this point are cleared (killed) */
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
685
686 static INLINE struct brw_reg high_words( struct brw_reg reg )
687 {
688 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
689 0, 8, 2 );
690 }
691
692 static INLINE struct brw_reg low_words( struct brw_reg reg )
693 {
694 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
695 }
696
697 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
698 {
699 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
700 }
701
702 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
703 {
704 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
705 0, 16, 2 );
706 }
707
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/** Body of the SUB_NOISE1 subroutine.  The input coordinate (and, on
 *  exit, the result) lives in the temp slot two below this function's
 *  mark — see emit_noise1, which stores it there before the call. */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* the caller's input coordinate, stored just below our mark */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                          pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
800
/** NOISE1: emit a call to the shared 1-D noise subroutine and fan the
 *  scalar result out to the write-masked destination channels. */
static void emit_noise1( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src, param, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* the subroutine locates its argument relative to an empty temp stack */
   assert( mark == 0 );

   src = get_src_reg( c, inst, 0, 0 );

   /* pass the coordinate in a temp; noise1_sub also returns its result here */
   param = alloc_tmp( c );

   brw_MOV( p, param, src );

   invoke_subroutine( c, SUB_NOISE1, noise1_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
833
/** Body of the SUB_NOISE2 subroutine.  The two input coordinates live in
 *  the temp slots three and two below this function's mark (stored there
 *  by emit_noise2); the result is left in the first of them. */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* the caller's input coordinates, stored just below our mark */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
            low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
970
/** NOISE2: emit a call to the shared 2-D noise subroutine and fan the
 *  scalar result out to the write-masked destination channels. */
static void emit_noise2( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, param0, param1, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* the subroutine locates its arguments relative to an empty temp stack */
   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );

   /* pass the coordinates in temps; noise2_sub returns its result in param0 */
   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );

   invoke_subroutine( c, SUB_NOISE2, noise2_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
1006
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * Subroutine body used by emit_noise3 via invoke_subroutine: the three
 * input coordinates are read from the caller's temporaries located at
 * mark - 4 .. mark - 2 (NOTE(review): the off-by-one slack relative to
 * the three allocations in emit_noise3 presumably accounts for a temp
 * taken by invoke_subroutine itself — confirm against that helper),
 * and the scalar result is written back into param0, pre-scaled so the
 * 2^15 fixed-point factor introduced by the 16-bit hashes cancels out.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      /* itmp/wtmp alias the same registers as tmp, just reinterpreted
         as dwords and 16-way words respectively. */
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* Pick up the caller's parameter temporaries (see header comment). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes left so the next component sees fresh bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three further multiplies by the parameter complete the quintic. */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* Front face sits at z=1, so the z weight is param2 - 1 from here on. */
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1273
1274 static void emit_noise3( struct brw_wm_compile *c,
1275 const struct prog_instruction *inst )
1276 {
1277 struct brw_compile *p = &c->func;
1278 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1279 GLuint mask = inst->DstReg.WriteMask;
1280 int i;
1281 int mark = mark_tmps( c );
1282
1283 assert( mark == 0 );
1284
1285 src0 = get_src_reg( c, inst, 0, 0 );
1286 src1 = get_src_reg( c, inst, 0, 1 );
1287 src2 = get_src_reg( c, inst, 0, 2 );
1288
1289 param0 = alloc_tmp( c );
1290 param1 = alloc_tmp( c );
1291 param2 = alloc_tmp( c );
1292
1293 brw_MOV( p, param0, src0 );
1294 brw_MOV( p, param1, src1 );
1295 brw_MOV( p, param2, src2 );
1296
1297 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1298
1299 /* Fill in the result: */
1300 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1301 for (i = 0 ; i < 4; i++) {
1302 if (mask & (1<<i)) {
1303 dst = get_dst_reg(c, inst, i);
1304 brw_MOV( p, dst, param0 );
1305 }
1306 }
1307 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1308 brw_set_saturate( p, 0 );
1309
1310 release_tmps( c, mark );
1311 }
1312
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes. Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * Subroutine body used by emit_noise4 via invoke_subroutine: the four
 * input coordinates are read from the caller's temporaries located at
 * mark - 5 .. mark - 2 (NOTE(review): the extra slot relative to the
 * four allocations in emit_noise4 presumably belongs to
 * invoke_subroutine — confirm against that helper), and the scalar
 * result is written back into param[ 0 ], pre-scaled so the 2^15
 * fixed-point factor introduced by the 16-bit hashes cancels out.
 *
 * The two-pass loop is built by hand out of IP-register arithmetic:
 * the flag register distinguishes the w=0 pass (flags clear) from the
 * w=1 pass (flags set), and the conditional forward branch at `origin`
 * is patched once the loop end address is known.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin; /* instruction indices: loop head and branch to patch */

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      /* itmp/wtmp alias the same registers as tmp, just reinterpreted
         as dwords and 16-way words respectively. */
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
            brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes left so the next component sees fresh bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time).  The flags set up by the FRC above
      (or by the explicit flag MOV at the loop end) select which. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes: a predicated forward
      jump whose offset is patched below once the end is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
            brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address.  (Offsets are in bytes; each instruction
      is 16 bytes, hence the << 4.) */
   brw_set_src1( p->store + origin,
                 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1696
1697 static void emit_noise4( struct brw_wm_compile *c,
1698 const struct prog_instruction *inst )
1699 {
1700 struct brw_compile *p = &c->func;
1701 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1702 GLuint mask = inst->DstReg.WriteMask;
1703 int i;
1704 int mark = mark_tmps( c );
1705
1706 assert( mark == 0 );
1707
1708 src0 = get_src_reg( c, inst, 0, 0 );
1709 src1 = get_src_reg( c, inst, 0, 1 );
1710 src2 = get_src_reg( c, inst, 0, 2 );
1711 src3 = get_src_reg( c, inst, 0, 3 );
1712
1713 param0 = alloc_tmp( c );
1714 param1 = alloc_tmp( c );
1715 param2 = alloc_tmp( c );
1716 param3 = alloc_tmp( c );
1717
1718 brw_MOV( p, param0, src0 );
1719 brw_MOV( p, param1, src1 );
1720 brw_MOV( p, param2, src2 );
1721 brw_MOV( p, param3, src3 );
1722
1723 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1724
1725 /* Fill in the result: */
1726 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1727 for (i = 0 ; i < 4; i++) {
1728 if (mask & (1<<i)) {
1729 dst = get_dst_reg(c, inst, i);
1730 brw_MOV( p, dst, param0 );
1731 }
1732 }
1733 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1734 brw_set_saturate( p, 0 );
1735
1736 release_tmps( c, mark );
1737 }
1738
1739 /**
1740 * Resolve subroutine calls after code emit is done.
1741 */
1742 static void post_wm_emit( struct brw_wm_compile *c )
1743 {
1744 brw_resolve_cals(&c->func);
1745 }
1746
1747 static void
1748 get_argument_regs(struct brw_wm_compile *c,
1749 const struct prog_instruction *inst,
1750 int index,
1751 struct brw_reg *dst,
1752 struct brw_reg *regs,
1753 int mask)
1754 {
1755 struct brw_compile *p = &c->func;
1756 int i, j;
1757
1758 for (i = 0; i < 4; i++) {
1759 if (mask & (1 << i)) {
1760 regs[i] = get_src_reg(c, inst, index, i);
1761
1762 /* Unalias destination registers from our sources. */
1763 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1764 for (j = 0; j < 4; j++) {
1765 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1766 struct brw_reg tmp = alloc_tmp(c);
1767 brw_MOV(p, tmp, regs[i]);
1768 regs[i] = tmp;
1769 break;
1770 }
1771 }
1772 }
1773 }
1774 }
1775 }
1776
1777 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1778 {
1779 struct intel_context *intel = &brw->intel;
1780 #define MAX_IF_DEPTH 32
1781 #define MAX_LOOP_DEPTH 32
1782 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1783 GLuint i, if_depth = 0, loop_depth = 0;
1784 struct brw_compile *p = &c->func;
1785 struct brw_indirect stack_index = brw_indirect(0, 0);
1786
1787 c->out_of_regs = GL_FALSE;
1788
1789 prealloc_reg(c);
1790 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1791 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1792
1793 for (i = 0; i < c->nr_fp_insns; i++) {
1794 const struct prog_instruction *inst = &c->prog_instructions[i];
1795 int dst_flags;
1796 struct brw_reg args[3][4], dst[4];
1797 int j;
1798 int mark = mark_tmps( c );
1799
1800 c->cur_inst = i;
1801
1802 #if 0
1803 printf("Inst %d: ", i);
1804 _mesa_print_instruction(inst);
1805 #endif
1806
1807 /* fetch any constants that this instruction needs */
1808 if (c->fp->use_const_buffer)
1809 fetch_constants(c, inst);
1810
1811 if (inst->Opcode != OPCODE_ARL) {
1812 for (j = 0; j < 4; j++) {
1813 if (inst->DstReg.WriteMask & (1 << j))
1814 dst[j] = get_dst_reg(c, inst, j);
1815 else
1816 dst[j] = brw_null_reg();
1817 }
1818 }
1819 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1820 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1821
1822 dst_flags = inst->DstReg.WriteMask;
1823 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1824 dst_flags |= SATURATE;
1825
1826 if (inst->CondUpdate)
1827 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1828 else
1829 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1830
1831 switch (inst->Opcode) {
1832 case WM_PIXELXY:
1833 emit_pixel_xy(c, dst, dst_flags);
1834 break;
1835 case WM_DELTAXY:
1836 emit_delta_xy(p, dst, dst_flags, args[0]);
1837 break;
1838 case WM_PIXELW:
1839 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1840 break;
1841 case WM_LINTERP:
1842 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1843 break;
1844 case WM_PINTERP:
1845 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1846 break;
1847 case WM_CINTERP:
1848 emit_cinterp(p, dst, dst_flags, args[0]);
1849 break;
1850 case WM_WPOSXY:
1851 emit_wpos_xy(c, dst, dst_flags, args[0]);
1852 break;
1853 case WM_FB_WRITE:
1854 emit_fb_write(c, args[0], args[1], args[2],
1855 INST_AUX_GET_TARGET(inst->Aux),
1856 inst->Aux & INST_AUX_EOT);
1857 break;
1858 case WM_FRONTFACING:
1859 emit_frontfacing(p, dst, dst_flags);
1860 break;
1861 case OPCODE_ADD:
1862 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1863 break;
1864 case OPCODE_ARL:
1865 emit_arl(c, inst);
1866 break;
1867 case OPCODE_FRC:
1868 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1869 break;
1870 case OPCODE_FLR:
1871 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1872 break;
1873 case OPCODE_LRP:
1874 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1875 break;
1876 case OPCODE_TRUNC:
1877 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1878 break;
1879 case OPCODE_MOV:
1880 case OPCODE_SWZ:
1881 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1882 break;
1883 case OPCODE_DP3:
1884 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1885 break;
1886 case OPCODE_DP4:
1887 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1888 break;
1889 case OPCODE_XPD:
1890 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1891 break;
1892 case OPCODE_DPH:
1893 emit_dph(p, dst, dst_flags, args[0], args[1]);
1894 break;
1895 case OPCODE_RCP:
1896 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1897 break;
1898 case OPCODE_RSQ:
1899 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1900 break;
1901 case OPCODE_SIN:
1902 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1903 break;
1904 case OPCODE_COS:
1905 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1906 break;
1907 case OPCODE_EX2:
1908 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1909 break;
1910 case OPCODE_LG2:
1911 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1912 break;
1913 case OPCODE_CMP:
1914 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1915 break;
1916 case OPCODE_MIN:
1917 emit_min(p, dst, dst_flags, args[0], args[1]);
1918 break;
1919 case OPCODE_MAX:
1920 emit_max(p, dst, dst_flags, args[0], args[1]);
1921 break;
1922 case OPCODE_DDX:
1923 case OPCODE_DDY:
1924 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1925 args[0]);
1926 break;
1927 case OPCODE_SLT:
1928 emit_sop(p, dst, dst_flags,
1929 BRW_CONDITIONAL_L, args[0], args[1]);
1930 break;
1931 case OPCODE_SLE:
1932 emit_sop(p, dst, dst_flags,
1933 BRW_CONDITIONAL_LE, args[0], args[1]);
1934 break;
1935 case OPCODE_SGT:
1936 emit_sop(p, dst, dst_flags,
1937 BRW_CONDITIONAL_G, args[0], args[1]);
1938 break;
1939 case OPCODE_SGE:
1940 emit_sop(p, dst, dst_flags,
1941 BRW_CONDITIONAL_GE, args[0], args[1]);
1942 break;
1943 case OPCODE_SEQ:
1944 emit_sop(p, dst, dst_flags,
1945 BRW_CONDITIONAL_EQ, args[0], args[1]);
1946 break;
1947 case OPCODE_SNE:
1948 emit_sop(p, dst, dst_flags,
1949 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1950 break;
1951 case OPCODE_MUL:
1952 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1953 break;
1954 case OPCODE_POW:
1955 emit_math2(c, BRW_MATH_FUNCTION_POW,
1956 dst, dst_flags, args[0], args[1]);
1957 break;
1958 case OPCODE_MAD:
1959 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1960 break;
1961 case OPCODE_NOISE1:
1962 emit_noise1(c, inst);
1963 break;
1964 case OPCODE_NOISE2:
1965 emit_noise2(c, inst);
1966 break;
1967 case OPCODE_NOISE3:
1968 emit_noise3(c, inst);
1969 break;
1970 case OPCODE_NOISE4:
1971 emit_noise4(c, inst);
1972 break;
1973 case OPCODE_TEX:
1974 emit_tex(c, dst, dst_flags, args[0],
1975 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1976 0, 1, 0, 0),
1977 inst->TexSrcTarget,
1978 inst->TexSrcUnit,
1979 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1980 break;
1981 case OPCODE_TXB:
1982 emit_txb(c, dst, dst_flags, args[0],
1983 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1984 0, 1, 0, 0),
1985 inst->TexSrcTarget,
1986 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1987 break;
1988 case OPCODE_KIL_NV:
1989 emit_kil(c);
1990 break;
1991 case OPCODE_IF:
1992 assert(if_depth < MAX_IF_DEPTH);
1993 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1994 break;
1995 case OPCODE_ELSE:
1996 assert(if_depth > 0);
1997 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1998 break;
1999 case OPCODE_ENDIF:
2000 assert(if_depth > 0);
2001 brw_ENDIF(p, if_inst[--if_depth]);
2002 break;
2003 case OPCODE_BGNSUB:
2004 brw_save_label(p, inst->Comment, p->nr_insn);
2005 break;
2006 case OPCODE_ENDSUB:
2007 /* no-op */
2008 break;
2009 case OPCODE_CAL:
2010 brw_push_insn_state(p);
2011 brw_set_mask_control(p, BRW_MASK_DISABLE);
2012 brw_set_access_mode(p, BRW_ALIGN_1);
2013 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2014 brw_set_access_mode(p, BRW_ALIGN_16);
2015 brw_ADD(p, get_addr_reg(stack_index),
2016 get_addr_reg(stack_index), brw_imm_d(4));
2017 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2018 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2019 brw_pop_insn_state(p);
2020 break;
2021
2022 case OPCODE_RET:
2023 brw_push_insn_state(p);
2024 brw_set_mask_control(p, BRW_MASK_DISABLE);
2025 brw_ADD(p, get_addr_reg(stack_index),
2026 get_addr_reg(stack_index), brw_imm_d(-4));
2027 brw_set_access_mode(p, BRW_ALIGN_1);
2028 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2029 brw_set_access_mode(p, BRW_ALIGN_16);
2030 brw_pop_insn_state(p);
2031
2032 break;
2033 case OPCODE_BGNLOOP:
2034 /* XXX may need to invalidate the current_constant regs */
2035 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2036 break;
2037 case OPCODE_BRK:
2038 brw_BREAK(p);
2039 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2040 break;
2041 case OPCODE_CONT:
2042 brw_CONT(p);
2043 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2044 break;
2045 case OPCODE_ENDLOOP:
2046 {
2047 struct brw_instruction *inst0, *inst1;
2048 GLuint br = 1;
2049
2050 if (intel->is_ironlake)
2051 br = 2;
2052
2053 assert(loop_depth > 0);
2054 loop_depth--;
2055 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2056 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2057 while (inst0 > loop_inst[loop_depth]) {
2058 inst0--;
2059 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2060 inst0->bits3.if_else.jump_count == 0) {
2061 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2062 inst0->bits3.if_else.pop_count = 0;
2063 }
2064 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2065 inst0->bits3.if_else.jump_count == 0) {
2066 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2067 inst0->bits3.if_else.pop_count = 0;
2068 }
2069 }
2070 }
2071 break;
2072 default:
2073 printf("unsupported opcode %d (%s) in fragment shader\n",
2074 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2075 _mesa_opcode_string(inst->Opcode) : "unknown");
2076 }
2077
2078 /* Release temporaries containing any unaliased source regs. */
2079 release_tmps( c, mark );
2080
2081 if (inst->CondUpdate)
2082 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2083 else
2084 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2085 }
2086 post_wm_emit(c);
2087
2088 if (INTEL_DEBUG & DEBUG_WM) {
2089 printf("wm-native:\n");
2090 for (i = 0; i < p->nr_insn; i++)
2091 brw_disasm(stderr, &p->store[i]);
2092 printf("\n");
2093 }
2094 }
2095
2096 /**
2097 * Do GPU code generation for shaders that use GLSL features such as
2098 * flow control. Other shaders will be compiled with the
2099 */
2100 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2101 {
2102 if (INTEL_DEBUG & DEBUG_WM) {
2103 printf("brw_wm_glsl_emit:\n");
2104 }
2105
2106 /* initial instruction translation/simplification */
2107 brw_wm_pass_fp(c);
2108
2109 /* actual code generation */
2110 brw_wm_emit_glsl(brw, c);
2111
2112 if (INTEL_DEBUG & DEBUG_WM) {
2113 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2114 }
2115
2116 c->prog_data.total_grf = num_grf_used(c);
2117 c->prog_data.total_scratch = 0;
2118 }