/*
 * src/mesa/drivers/dri/i965/brw_wm_glsl.c
 * (from merge of branch '7.8' into master, mesa.git)
 */
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the shared subroutines emitted via invoke_subroutine(). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
/** Mark GRF register \p r as used (preallocated; never given out by alloc_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
/**
 * Record the mapping of a Mesa register to a hardware register.
 * One entry per (file, index, component); marks the slot initialized
 * so get_reg() will re-use it.
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
135
/**
 * Allocate the next temporary GRF in stack order.
 * Temps are tracked in c->tmp_regs[]; the array grows on demand via
 * alloc_grf().  Scope temp lifetimes with mark_tmps()/release_tmps().
 */
static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
{
   struct brw_reg reg;

   /* if we need to allocate another temp, grow the tmp_regs[] array */
   if (c->tmp_index == c->tmp_max) {
      int r = alloc_grf(c);
      if (r < 0) {
         /*printf("Out of temps in %s\n", __FUNCTION__);*/
         r = 50; /* XXX random register! */
      }
      c->tmp_regs[ c->tmp_max++ ] = r;
   }

   /* form the GRF register */
   reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
   /*printf("alloc_temp %d\n", reg.nr);*/
   assert(reg.nr < BRW_WM_MAX_GRF);
   return reg;

}
157
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current temp stack depth (c->tmp_index).
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
166
/** Return the GRF backing temp stack slot \p index (as a vec8 register). */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Pop the temp stack back to \p mark (a value returned by mark_tmps()). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
176
/**
 * Convert Mesa src register to brw register.
 *
 * Since we're running in SOA mode each Mesa register corresponds to four
 * hardware registers.  We allocate the hardware registers as needed here.
 *
 * \param file register file, one of PROGRAM_x
 * \param index register number
 * \param component src component (X=0, Y=1, Z=2, W=3)
 * \param nr not used?!?
 * \param neg per-component negate bitmask (bit \p component tested)
 * \param abs take absolute value?
 */
static struct brw_reg
get_reg(struct brw_wm_compile *c, int file, int index, int component,
        int nr, GLuint neg, GLuint abs)
{
   struct brw_reg reg;
   switch (file) {
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* all constant-like files share the PROGRAM_STATE_VAR mapping table */
      file = PROGRAM_STATE_VAR;
      break;
   case PROGRAM_UNDEFINED:
      return brw_null_reg();
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
   case PROGRAM_PAYLOAD:
      break;
   default:
      _mesa_problem(NULL, "Unexpected file in get_reg()");
      return brw_null_reg();
   }

   assert(index < 256);
   assert(component < 4);

   /* see if we've already allocated a HW register for this Mesa register */
   if (c->wm_regs[file][index][component].inited) {
      /* yes, re-use */
      reg = c->wm_regs[file][index][component].reg;
   }
   else {
      /* no, allocate new register */
      int grf = alloc_grf(c);
      /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
      if (grf < 0) {
         /* totally out of temps */
         grf = 51; /* XXX random register! */
      }

      reg = brw_vec8_grf(grf, 0);
      /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/

      set_reg(c, file, index, component, reg);
   }

   /* apply source modifiers to the register description (not the data) */
   if (neg & (1 << component)) {
      reg = negate(reg);
   }
   if (abs)
      reg = brw_abs(reg);
   return reg;
}
243
244
245
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 * Uses _mesa_find_temp_intervals() over the emitted instruction list and
 * releases every component GRF of any temp whose interval ends before
 * c->cur_inst.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /*printf("Reclaim temps:\n");*/

   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* intEnd == -1 means the temp is never used at all */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         /*printf("  temp[%d] is dead\n", index);*/
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /*
               printf("  Reclaim temp %d, reg %d at inst %d\n",
                      index, r, c->cur_inst);
               */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
281
282
283
284
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Layout established here (in ascending GRF order): payload depth regs,
 * optional CURBE constants, URB-delivered FS inputs, the emit-mask
 * register and the call stack.  All of those GRFs are then marked
 * in-use so alloc_grf() won't hand them out.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload: two GRFs per depth reg; unused slots alias GRF 0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist = 
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               /* 8 floats per GRF; map each scalar to a vec1 view */
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: walk VS output slots and map the ones the
    * FS actually reads; reg_index tracks the URB delivery position.
    */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1; /* VS-only slot with no FS equivalent */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2; /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* fall through (default only breaks, so this is harmless) */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
469
470
/**
 * Check if any of the instruction's src registers are constants, uniforms,
 * or statevars.  If so, fetch any constants that we don't already have in
 * the three GRF slots.
 *
 * Emits a dataport read for each constant-like source; the value lands
 * in c->current_const[i].reg and is later extracted by
 * get_src_reg_const().
 */
static void fetch_constants(struct brw_wm_compile *c,
                            const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   GLuint i;

   /* loop over instruction src regs */
   for (i = 0; i < 3; i++) {
      const struct prog_src_register *src = &inst->SrcReg[i];
      if (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
          src->File == PROGRAM_UNIFORM) {
         c->current_const[i].index = src->Index;

#if 0
         printf("  fetch const[%d] for arg %d into reg %d\n",
                src->Index, i, c->current_const[i].reg.nr);
#endif

         /* need to fetch the constant now */
         brw_dp_READ_4(p,
                       c->current_const[i].reg,  /* writeback dest */
                       src->RelAddr,             /* relative indexing? */
                       16 * src->Index,          /* byte offset */
                       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
                       );
      }
   }
}
505
506
/**
 * Convert Mesa dst register to brw register.
 * Thin wrapper around get_reg() with no negate/abs modifiers.
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
   const int nr = 1;
   return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                  0, 0);
}
518
519
/**
 * Build a register description for one component of a constant that was
 * fetched from the constant buffer.
 *
 * We should have already fetched the constant from the constant
 * buffer in fetch_constants().  Now we just have to return a
 * register description that extracts the needed component and
 * smears it across all eight vector components.
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;   /* byte offset of the float */

   /* apply per-component source modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
557
558
/**
 * Convert Mesa src register to brw register.
 *
 * Resolves the swizzle for \p channel first; SWIZZLE_ZERO/ONE become
 * float immediates, constant-buffer sources go through
 * get_src_reg_const(), everything else through get_reg().
 */
static struct brw_reg get_src_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint srcRegIndex, GLuint channel)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   const GLuint nr = 1;
   const GLuint component = GET_SWZ(src->Swizzle, channel);

   /* Extended swizzle terms */
   if (component == SWIZZLE_ZERO) {
      return brw_imm_f(0.0F);
   }
   else if (component == SWIZZLE_ONE) {
      return brw_imm_f(1.0F);
   }

   if (c->fp->use_const_buffer &&
       (src->File == PROGRAM_STATE_VAR ||
        src->File == PROGRAM_CONSTANT ||
        src->File == PROGRAM_UNIFORM)) {
      return get_src_reg_const(c, inst, srcRegIndex, component);
   }
   else {
      /* other type of source register */
      return get_reg(c, src->File, src->Index, component, nr,
                     src->Negate, src->Abs);
   }
}
590
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * On first use the subroutine body is emitted inline via \p emit and its
 * start recorded in c->subroutines[]; later uses jump to that copy by
 * adding a relative offset to the IP register.  All IP math is in bytes
 * (each instruction is 16 bytes, hence the "<< 4" shifts).
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );
    
    if( c->subroutines[ subroutine ] ) {
	/* subroutine previously emitted: reuse existing instructions */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	int here = p->nr_insn;
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return address = IP + 2 instructions (skip the jump below) */
	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

	/* jump backwards to the previously-emitted subroutine body */
	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
		 brw_imm_d( ( c->subroutines[ subroutine ] -
			      here - 1 ) << 4 ) );
	brw_pop_insn_state(p);

	release_tmps( c, mark );
    } else {
	/* previously unused subroutine: emit, and mark for later reuse */
	
	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	struct brw_instruction *calc;
	int base = p->nr_insn;
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* placeholder: src1 is patched below once the body length is known */
	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
	brw_pop_insn_state(p);
	
	c->subroutines[ subroutine ] = p->nr_insn;

	emit( c );
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return: restore the saved IP */
	brw_MOV( p, brw_ip_reg(), return_address );
	brw_pop_insn_state(p);
	
	/* patch the return-address calculation to skip over the body */
	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
	
	release_tmps( c, mark );
    }
}
654
/**
 * Emit OPCODE_ARL: move src channel 0 into the hardware address register.
 */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
    struct brw_compile *p = &c->func;
    struct brw_reg src0, addr_reg;
    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
                           BRW_ARF_ADDRESS, 0);
    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
    brw_MOV(p, addr_reg, src0);
    brw_set_saturate(p, 0);
}
667
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 *
 * Kills the currently-active channels by ANDing the inverted execution
 * mask into the pixel-valid bits in the R0 payload.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
682
/** View \p reg as the upper 16-bit word of each 32-bit element. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
		  0, 8, 2 );
}
688
/** View \p reg as the lower 16-bit word of each 32-bit element. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
693
/** View \p reg as the even-indexed bytes (stride 2). */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
698
/** View \p reg as the odd-indexed bytes (stride 2, offset 1). */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
		  0, 16, 2 );
}
704
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/** Subroutine body for 1D noise.
 * Calling convention (see emit_noise1): the caller allocates one temp
 * holding the input coordinate; we find it via lookup_tmp(mark - 2) and
 * overwrite it with the result.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */       
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
   
   /* the input parameter lives just below our mark on the temp stack */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );
   
   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					  pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
797
/** Emit OPCODE_NOISE1: 1D Perlin-style noise.
 * Copies the input into a temp (the subroutine's calling convention),
 * invokes the shared noise1_sub body, then broadcasts the scalar result
 * to every write-enabled destination channel.
 */
static void emit_noise1( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src, param, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   assert( mark == 0 );
   
   src = get_src_reg( c, inst, 0, 0 );

   param = alloc_tmp( c );

   brw_MOV( p, param, src );

   invoke_subroutine( c, SUB_NOISE1, noise1_sub );
   
   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
	 dst = get_dst_reg(c, inst, i);
	 brw_MOV( p, dst, param );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );
   
   release_tmps( c, mark );
}
830
/** Subroutine body for 2D noise.
 * Calling convention (see emit_noise2): the caller allocates two temps
 * holding the (x, y) input; we find them via lookup_tmp(mark - 3 / - 2)
 * and leave the result in the first one.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
   
   /* the input parameters live just below our mark on the temp stack */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );
   
   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	    low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
   
   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
   
   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
   
   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
967
/** Emit OPCODE_NOISE2: 2D Perlin-style noise.
 * Copies the x/y channels of src 0 into two temps (the subroutine's
 * calling convention), invokes the shared noise2_sub body, then
 * broadcasts the scalar result (left in param0) to every write-enabled
 * destination channel.
 */
static void emit_noise2( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, param0, param1, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );

   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );

   invoke_subroutine( c, SUB_NOISE2, noise2_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
	 dst = get_dst_reg(c, inst, i);
	 brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
1003
1004 /**
1005 * The three-dimensional case is much like the one- and two- versions above,
1006 * but since the number of corners is rapidly growing we now pack 16 16-bit
1007 * hashes into each register to extract more parallelism from the EUs.
1008 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   /* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views (float, dword,
      16-way word) of the SAME registers -- writes through one alias are
      visible through the others. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) copied the three input coordinates into
      temporaries just below this subroutine's mark; the remaining slot
      at mark - 1 is presumably consumed by invoke_subroutine's
      bookkeeping -- TODO confirm against invoke_subroutine. */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words left so the following component reads fresh
      pseudo-random bits from the same registers. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three more multiplies by the coordinate complete t^3 * (6t^2 - 15t + 10). */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* t still holds param2 - 1 from the y-component section above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above; the result is returned
      to the caller in param0 (0.000030517578125 == 2^-15). */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1270
1271 static void emit_noise3( struct brw_wm_compile *c,
1272 const struct prog_instruction *inst )
1273 {
1274 struct brw_compile *p = &c->func;
1275 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1276 GLuint mask = inst->DstReg.WriteMask;
1277 int i;
1278 int mark = mark_tmps( c );
1279
1280 assert( mark == 0 );
1281
1282 src0 = get_src_reg( c, inst, 0, 0 );
1283 src1 = get_src_reg( c, inst, 0, 1 );
1284 src2 = get_src_reg( c, inst, 0, 2 );
1285
1286 param0 = alloc_tmp( c );
1287 param1 = alloc_tmp( c );
1288 param2 = alloc_tmp( c );
1289
1290 brw_MOV( p, param0, src0 );
1291 brw_MOV( p, param1, src1 );
1292 brw_MOV( p, param2, src2 );
1293
1294 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1295
1296 /* Fill in the result: */
1297 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1298 for (i = 0 ; i < 4; i++) {
1299 if (mask & (1<<i)) {
1300 dst = get_dst_reg(c, inst, i);
1301 brw_MOV( p, dst, param0 );
1302 }
1303 }
1304 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1305 brw_set_saturate( p, 0 );
1306
1307 release_tmps( c, mark );
1308 }
1309
1310 /**
1311 * For the four-dimensional case, the little micro-optimisation benefits
1312 * we obtain by unrolling all the loops aren't worth the massive bloat it
1313 * now causes. Instead, we loop twice around performing a similar operation
1314 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1315 * code to glue it all together.
1316 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin; /* instruction indices for the w-loop back-branch */

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) copied the four input coordinates into
      temporaries just below this subroutine's mark; the remaining slot
      at mark - 1 is presumably consumed by invoke_subroutine's
      bookkeeping -- TODO confirm against invoke_subroutine. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   /* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views (float, dword,
      16-way word) of the SAME registers -- writes through one alias are
      visible through the others. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words left so the following component reads fresh
      pseudo-random bits from the same registers (4 bits per component
      here, since four components share each 16-bit hash). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time)  The flag register set before the
      loop (clear on the first pass, set on the second) selects between
      the two predicated instructions. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   /* t holds w (first pass) or w - 1 (second pass); see above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset
      (src1) is a placeholder here; it is patched once the exit target
      is known (see brw_set_src1 below). */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above; the result is returned
      to the caller in param[ 0 ] (0.000030517578125 == 2^-15). */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1693
1694 static void emit_noise4( struct brw_wm_compile *c,
1695 const struct prog_instruction *inst )
1696 {
1697 struct brw_compile *p = &c->func;
1698 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1699 GLuint mask = inst->DstReg.WriteMask;
1700 int i;
1701 int mark = mark_tmps( c );
1702
1703 assert( mark == 0 );
1704
1705 src0 = get_src_reg( c, inst, 0, 0 );
1706 src1 = get_src_reg( c, inst, 0, 1 );
1707 src2 = get_src_reg( c, inst, 0, 2 );
1708 src3 = get_src_reg( c, inst, 0, 3 );
1709
1710 param0 = alloc_tmp( c );
1711 param1 = alloc_tmp( c );
1712 param2 = alloc_tmp( c );
1713 param3 = alloc_tmp( c );
1714
1715 brw_MOV( p, param0, src0 );
1716 brw_MOV( p, param1, src1 );
1717 brw_MOV( p, param2, src2 );
1718 brw_MOV( p, param3, src3 );
1719
1720 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1721
1722 /* Fill in the result: */
1723 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1724 for (i = 0 ; i < 4; i++) {
1725 if (mask & (1<<i)) {
1726 dst = get_dst_reg(c, inst, i);
1727 brw_MOV( p, dst, param0 );
1728 }
1729 }
1730 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1731 brw_set_saturate( p, 0 );
1732
1733 release_tmps( c, mark );
1734 }
1735
1736 /**
1737 * Resolve subroutine calls after code emit is done.
1738 */
1739 static void post_wm_emit( struct brw_wm_compile *c )
1740 {
1741 brw_resolve_cals(&c->func);
1742 }
1743
1744 static void
1745 get_argument_regs(struct brw_wm_compile *c,
1746 const struct prog_instruction *inst,
1747 int index,
1748 struct brw_reg *dst,
1749 struct brw_reg *regs,
1750 int mask)
1751 {
1752 struct brw_compile *p = &c->func;
1753 int i, j;
1754
1755 for (i = 0; i < 4; i++) {
1756 if (mask & (1 << i)) {
1757 regs[i] = get_src_reg(c, inst, index, i);
1758
1759 /* Unalias destination registers from our sources. */
1760 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1761 for (j = 0; j < 4; j++) {
1762 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1763 struct brw_reg tmp = alloc_tmp(c);
1764 brw_MOV(p, tmp, regs[i]);
1765 regs[i] = tmp;
1766 break;
1767 }
1768 }
1769 }
1770 }
1771 }
1772 }
1773
1774 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1775 {
1776 struct intel_context *intel = &brw->intel;
1777 #define MAX_IF_DEPTH 32
1778 #define MAX_LOOP_DEPTH 32
1779 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1780 GLuint i, if_depth = 0, loop_depth = 0;
1781 struct brw_compile *p = &c->func;
1782 struct brw_indirect stack_index = brw_indirect(0, 0);
1783
1784 c->out_of_regs = GL_FALSE;
1785
1786 prealloc_reg(c);
1787 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1788 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1789
1790 for (i = 0; i < c->nr_fp_insns; i++) {
1791 const struct prog_instruction *inst = &c->prog_instructions[i];
1792 int dst_flags;
1793 struct brw_reg args[3][4], dst[4];
1794 int j;
1795 int mark = mark_tmps( c );
1796
1797 c->cur_inst = i;
1798
1799 #if 0
1800 printf("Inst %d: ", i);
1801 _mesa_print_instruction(inst);
1802 #endif
1803
1804 /* fetch any constants that this instruction needs */
1805 if (c->fp->use_const_buffer)
1806 fetch_constants(c, inst);
1807
1808 if (inst->Opcode != OPCODE_ARL) {
1809 for (j = 0; j < 4; j++) {
1810 if (inst->DstReg.WriteMask & (1 << j))
1811 dst[j] = get_dst_reg(c, inst, j);
1812 else
1813 dst[j] = brw_null_reg();
1814 }
1815 }
1816 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1817 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1818
1819 dst_flags = inst->DstReg.WriteMask;
1820 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1821 dst_flags |= SATURATE;
1822
1823 if (inst->CondUpdate)
1824 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1825 else
1826 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1827
1828 switch (inst->Opcode) {
1829 case WM_PIXELXY:
1830 emit_pixel_xy(c, dst, dst_flags);
1831 break;
1832 case WM_DELTAXY:
1833 emit_delta_xy(p, dst, dst_flags, args[0]);
1834 break;
1835 case WM_PIXELW:
1836 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1837 break;
1838 case WM_LINTERP:
1839 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1840 break;
1841 case WM_PINTERP:
1842 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1843 break;
1844 case WM_CINTERP:
1845 emit_cinterp(p, dst, dst_flags, args[0]);
1846 break;
1847 case WM_WPOSXY:
1848 emit_wpos_xy(c, dst, dst_flags, args[0]);
1849 break;
1850 case WM_FB_WRITE:
1851 emit_fb_write(c, args[0], args[1], args[2],
1852 INST_AUX_GET_TARGET(inst->Aux),
1853 inst->Aux & INST_AUX_EOT);
1854 break;
1855 case WM_FRONTFACING:
1856 emit_frontfacing(p, dst, dst_flags);
1857 break;
1858 case OPCODE_ADD:
1859 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1860 break;
1861 case OPCODE_ARL:
1862 emit_arl(c, inst);
1863 break;
1864 case OPCODE_FRC:
1865 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1866 break;
1867 case OPCODE_FLR:
1868 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1869 break;
1870 case OPCODE_LRP:
1871 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1872 break;
1873 case OPCODE_TRUNC:
1874 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1875 break;
1876 case OPCODE_MOV:
1877 case OPCODE_SWZ:
1878 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1879 break;
1880 case OPCODE_DP3:
1881 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1882 break;
1883 case OPCODE_DP4:
1884 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1885 break;
1886 case OPCODE_XPD:
1887 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1888 break;
1889 case OPCODE_DPH:
1890 emit_dph(p, dst, dst_flags, args[0], args[1]);
1891 break;
1892 case OPCODE_RCP:
1893 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1894 break;
1895 case OPCODE_RSQ:
1896 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1897 break;
1898 case OPCODE_SIN:
1899 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1900 break;
1901 case OPCODE_COS:
1902 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1903 break;
1904 case OPCODE_EX2:
1905 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1906 break;
1907 case OPCODE_LG2:
1908 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1909 break;
1910 case OPCODE_CMP:
1911 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1912 break;
1913 case OPCODE_MIN:
1914 emit_min(p, dst, dst_flags, args[0], args[1]);
1915 break;
1916 case OPCODE_MAX:
1917 emit_max(p, dst, dst_flags, args[0], args[1]);
1918 break;
1919 case OPCODE_DDX:
1920 case OPCODE_DDY:
1921 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1922 args[0]);
1923 break;
1924 case OPCODE_SLT:
1925 emit_sop(p, dst, dst_flags,
1926 BRW_CONDITIONAL_L, args[0], args[1]);
1927 break;
1928 case OPCODE_SLE:
1929 emit_sop(p, dst, dst_flags,
1930 BRW_CONDITIONAL_LE, args[0], args[1]);
1931 break;
1932 case OPCODE_SGT:
1933 emit_sop(p, dst, dst_flags,
1934 BRW_CONDITIONAL_G, args[0], args[1]);
1935 break;
1936 case OPCODE_SGE:
1937 emit_sop(p, dst, dst_flags,
1938 BRW_CONDITIONAL_GE, args[0], args[1]);
1939 break;
1940 case OPCODE_SEQ:
1941 emit_sop(p, dst, dst_flags,
1942 BRW_CONDITIONAL_EQ, args[0], args[1]);
1943 break;
1944 case OPCODE_SNE:
1945 emit_sop(p, dst, dst_flags,
1946 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1947 break;
1948 case OPCODE_MUL:
1949 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1950 break;
1951 case OPCODE_POW:
1952 emit_math2(c, BRW_MATH_FUNCTION_POW,
1953 dst, dst_flags, args[0], args[1]);
1954 break;
1955 case OPCODE_MAD:
1956 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1957 break;
1958 case OPCODE_NOISE1:
1959 emit_noise1(c, inst);
1960 break;
1961 case OPCODE_NOISE2:
1962 emit_noise2(c, inst);
1963 break;
1964 case OPCODE_NOISE3:
1965 emit_noise3(c, inst);
1966 break;
1967 case OPCODE_NOISE4:
1968 emit_noise4(c, inst);
1969 break;
1970 case OPCODE_TEX:
1971 emit_tex(c, dst, dst_flags, args[0],
1972 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1973 0, 1, 0, 0),
1974 inst->TexSrcTarget,
1975 inst->TexSrcUnit,
1976 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1977 break;
1978 case OPCODE_TXB:
1979 emit_txb(c, dst, dst_flags, args[0],
1980 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1981 0, 1, 0, 0),
1982 inst->TexSrcTarget,
1983 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1984 break;
1985 case OPCODE_KIL_NV:
1986 emit_kil(c);
1987 break;
1988 case OPCODE_IF:
1989 assert(if_depth < MAX_IF_DEPTH);
1990 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1991 break;
1992 case OPCODE_ELSE:
1993 assert(if_depth > 0);
1994 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1995 break;
1996 case OPCODE_ENDIF:
1997 assert(if_depth > 0);
1998 brw_ENDIF(p, if_inst[--if_depth]);
1999 break;
2000 case OPCODE_BGNSUB:
2001 brw_save_label(p, inst->Comment, p->nr_insn);
2002 break;
2003 case OPCODE_ENDSUB:
2004 /* no-op */
2005 break;
2006 case OPCODE_CAL:
2007 brw_push_insn_state(p);
2008 brw_set_mask_control(p, BRW_MASK_DISABLE);
2009 brw_set_access_mode(p, BRW_ALIGN_1);
2010 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2011 brw_set_access_mode(p, BRW_ALIGN_16);
2012 brw_ADD(p, get_addr_reg(stack_index),
2013 get_addr_reg(stack_index), brw_imm_d(4));
2014 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2015 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2016 brw_pop_insn_state(p);
2017 break;
2018
2019 case OPCODE_RET:
2020 brw_push_insn_state(p);
2021 brw_set_mask_control(p, BRW_MASK_DISABLE);
2022 brw_ADD(p, get_addr_reg(stack_index),
2023 get_addr_reg(stack_index), brw_imm_d(-4));
2024 brw_set_access_mode(p, BRW_ALIGN_1);
2025 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2026 brw_set_access_mode(p, BRW_ALIGN_16);
2027 brw_pop_insn_state(p);
2028
2029 break;
2030 case OPCODE_BGNLOOP:
2031 /* XXX may need to invalidate the current_constant regs */
2032 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2033 break;
2034 case OPCODE_BRK:
2035 brw_BREAK(p);
2036 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2037 break;
2038 case OPCODE_CONT:
2039 brw_CONT(p);
2040 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2041 break;
2042 case OPCODE_ENDLOOP:
2043 {
2044 struct brw_instruction *inst0, *inst1;
2045 GLuint br = 1;
2046
2047 if (intel->is_ironlake)
2048 br = 2;
2049
2050 assert(loop_depth > 0);
2051 loop_depth--;
2052 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2053 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2054 while (inst0 > loop_inst[loop_depth]) {
2055 inst0--;
2056 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2057 inst0->bits3.if_else.jump_count == 0) {
2058 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2059 inst0->bits3.if_else.pop_count = 0;
2060 }
2061 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2062 inst0->bits3.if_else.jump_count == 0) {
2063 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2064 inst0->bits3.if_else.pop_count = 0;
2065 }
2066 }
2067 }
2068 break;
2069 default:
2070 printf("unsupported opcode %d (%s) in fragment shader\n",
2071 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2072 _mesa_opcode_string(inst->Opcode) : "unknown");
2073 }
2074
2075 /* Release temporaries containing any unaliased source regs. */
2076 release_tmps( c, mark );
2077
2078 if (inst->CondUpdate)
2079 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2080 else
2081 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2082 }
2083 post_wm_emit(c);
2084
2085 if (INTEL_DEBUG & DEBUG_WM) {
2086 printf("wm-native:\n");
2087 for (i = 0; i < p->nr_insn; i++)
2088 brw_disasm(stderr, &p->store[i]);
2089 printf("\n");
2090 }
2091 }
2092
/**
 * Do GPU code generation for shaders that use GLSL features such as
 * flow control.  Other (simpler) shaders will be compiled with the
 * non-GLSL code path in brw_wm_emit.c instead.
 */
2097 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2098 {
2099 if (INTEL_DEBUG & DEBUG_WM) {
2100 printf("brw_wm_glsl_emit:\n");
2101 }
2102
2103 /* initial instruction translation/simplification */
2104 brw_wm_pass_fp(c);
2105
2106 /* actual code generation */
2107 brw_wm_emit_glsl(brw, c);
2108
2109 if (INTEL_DEBUG & DEBUG_WM) {
2110 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2111 }
2112
2113 c->prog_data.total_grf = num_grf_used(c);
2114 c->prog_data.total_scratch = 0;
2115 }