i965: Clean up double initialization of dst_flags from a rebase resolve.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the emit-once shared subroutines (index into c->subroutines[]). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
/** Mark GRF register as used (reserved; won't be handed out by alloc_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
    c->used_grf[r] = GL_TRUE;
}
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
/**
 * Allocate a temporary GRF.  Temps are handed out stack-style from the
 * tmp_regs[] pool (see mark_tmps()/release_tmps()); the pool grows on
 * demand via alloc_grf().
 */
static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
{
    struct brw_reg reg;

    /* if we need to allocate another temp, grow the tmp_regs[] array */
    if (c->tmp_index == c->tmp_max) {
       int r = alloc_grf(c);
       if (r < 0) {
          /*printf("Out of temps in %s\n", __FUNCTION__);*/
          /* out of registers: fall back to an arbitrary reg rather than crash */
          r = 50; /* XXX random register! */
       }
       c->tmp_regs[ c->tmp_max++ ] = r;
    }

    /* form the GRF register */
    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
    /*printf("alloc_temp %d\n", reg.nr);*/
    assert(reg.nr < BRW_WM_MAX_GRF);
    return reg;

}
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
/** Return the GRF vec8 register for the temp at the given stack position. */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
    return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
    c->tmp_index = mark;
}
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
/**
 * Return a register description for a constant that fetch_constants()
 * already read into the GRF (const-buffer mode only).
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
    /* We should have already fetched the constant from the constant
     * buffer in fetch_constants().  Now we just have to return a
     * register description that extracts the needed component and
     * smears it across all eight vector components.
     */
    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
    struct brw_reg const_reg;

    assert(component < 4);
    assert(srcRegIndex < 3);
    assert(c->current_const[srcRegIndex].index != -1);
    const_reg = c->current_const[srcRegIndex].reg;

    /* extract desired float from the const_reg, and smear */
    const_reg = stride(const_reg, 0, 1, 0);
    const_reg.subnr = component * 4;  /* byte offset: 4 bytes per float */

    if (src->Negate & (1 << component))
       const_reg = negate(const_reg);
    if (src->Abs)
       const_reg = brw_abs(const_reg);

#if 0
    printf("  form const[%d].%d for arg %d, reg %d\n",
           c->current_const[srcRegIndex].index,
           component,
           srcRegIndex,
           const_reg.nr);
#endif

    return const_reg;
}
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553 /**
554 * Subroutines are minimal support for resusable instruction sequences.
555 * They are implemented as simply as possible to minimise overhead: there
556 * is no explicit support for communication between the caller and callee
557 * other than saving the return address in a temporary register, nor is
558 * there any automatic local storage. This implies that great care is
559 * required before attempting reentrancy or any kind of nested
560 * subroutine invocations.
561 */
562 static void invoke_subroutine( struct brw_wm_compile *c,
563 enum _subroutine subroutine,
564 void (*emit)( struct brw_wm_compile * ) )
565 {
566 struct brw_compile *p = &c->func;
567
568 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
569
570 if( c->subroutines[ subroutine ] ) {
571 /* subroutine previously emitted: reuse existing instructions */
572
573 int mark = mark_tmps( c );
574 struct brw_reg return_address = retype( alloc_tmp( c ),
575 BRW_REGISTER_TYPE_UD );
576 int here = p->nr_insn;
577
578 brw_push_insn_state(p);
579 brw_set_mask_control(p, BRW_MASK_DISABLE);
580 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
581
582 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
583 brw_imm_d( ( c->subroutines[ subroutine ] -
584 here - 1 ) << 4 ) );
585 brw_pop_insn_state(p);
586
587 release_tmps( c, mark );
588 } else {
589 /* previously unused subroutine: emit, and mark for later reuse */
590
591 int mark = mark_tmps( c );
592 struct brw_reg return_address = retype( alloc_tmp( c ),
593 BRW_REGISTER_TYPE_UD );
594 struct brw_instruction *calc;
595 int base = p->nr_insn;
596
597 brw_push_insn_state(p);
598 brw_set_mask_control(p, BRW_MASK_DISABLE);
599 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
600 brw_pop_insn_state(p);
601
602 c->subroutines[ subroutine ] = p->nr_insn;
603
604 emit( c );
605
606 brw_push_insn_state(p);
607 brw_set_mask_control(p, BRW_MASK_DISABLE);
608 brw_MOV( p, brw_ip_reg(), return_address );
609 brw_pop_insn_state(p);
610
611 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
612
613 release_tmps( c, mark );
614 }
615 }
616
617 /* Workaround for using brw_wm_emit.c's emit functions, which expect
618 * destination regs to be uniquely written. Moves arguments out to
619 * temporaries as necessary for instructions which use their destination as
620 * a temporary.
621 */
622 static void
623 unalias3(struct brw_wm_compile *c,
624 void (*func)(struct brw_compile *c,
625 const struct brw_reg *dst,
626 GLuint mask,
627 const struct brw_reg *arg0,
628 const struct brw_reg *arg1,
629 const struct brw_reg *arg2),
630 const struct brw_reg *dst,
631 GLuint mask,
632 const struct brw_reg *arg0,
633 const struct brw_reg *arg1,
634 const struct brw_reg *arg2)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
638 int i, j;
639 int mark = mark_tmps(c);
640
641 for (j = 0; j < 4; j++) {
642 tmp_arg0[j] = arg0[j];
643 tmp_arg1[j] = arg1[j];
644 tmp_arg2[j] = arg2[j];
645 }
646
647 for (i = 0; i < 4; i++) {
648 if (mask & (1<<i)) {
649 for (j = 0; j < 4; j++) {
650 if (arg0[j].file == dst[i].file &&
651 dst[i].nr == arg0[j].nr) {
652 tmp_arg0[j] = alloc_tmp(c);
653 brw_MOV(p, tmp_arg0[j], arg0[j]);
654 }
655 if (arg1[j].file == dst[i].file &&
656 dst[i].nr == arg1[j].nr) {
657 tmp_arg1[j] = alloc_tmp(c);
658 brw_MOV(p, tmp_arg1[j], arg1[j]);
659 }
660 if (arg2[j].file == dst[i].file &&
661 dst[i].nr == arg2[j].nr) {
662 tmp_arg2[j] = alloc_tmp(c);
663 brw_MOV(p, tmp_arg2[j], arg2[j]);
664 }
665 }
666 }
667 }
668
669 func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
670
671 release_tmps(c, mark);
672 }
673
674 /* Workaround for using brw_wm_emit.c's emit functions, which expect
675 * destination regs to be uniquely written. Moves arguments out to
676 * temporaries as necessary for instructions which use their destination as
677 * a temporary.
678 */
679 static void
680 unalias2(struct brw_wm_compile *c,
681 void (*func)(struct brw_compile *c,
682 const struct brw_reg *dst,
683 GLuint mask,
684 const struct brw_reg *arg0,
685 const struct brw_reg *arg1),
686 const struct brw_reg *dst,
687 GLuint mask,
688 const struct brw_reg *arg0,
689 const struct brw_reg *arg1)
690 {
691 struct brw_compile *p = &c->func;
692 struct brw_reg tmp_arg0[4], tmp_arg1[4];
693 int i, j;
694 int mark = mark_tmps(c);
695
696 for (j = 0; j < 4; j++) {
697 tmp_arg0[j] = arg0[j];
698 tmp_arg1[j] = arg1[j];
699 }
700
701 for (i = 0; i < 4; i++) {
702 if (mask & (1<<i)) {
703 for (j = 0; j < 4; j++) {
704 if (arg0[j].file == dst[i].file &&
705 dst[i].nr == arg0[j].nr) {
706 tmp_arg0[j] = alloc_tmp(c);
707 brw_MOV(p, tmp_arg0[j], arg0[j]);
708 }
709 if (arg1[j].file == dst[i].file &&
710 dst[i].nr == arg1[j].nr) {
711 tmp_arg1[j] = alloc_tmp(c);
712 brw_MOV(p, tmp_arg1[j], arg1[j]);
713 }
714 }
715 }
716 }
717
718 func(p, dst, mask, tmp_arg0, tmp_arg1);
719
720 release_tmps(c, mark);
721 }
722
723 static void emit_arl(struct brw_wm_compile *c,
724 const struct prog_instruction *inst)
725 {
726 struct brw_compile *p = &c->func;
727 struct brw_reg src0, addr_reg;
728 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
729 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
730 BRW_ARF_ADDRESS, 0);
731 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
732 brw_MOV(p, addr_reg, src0);
733 brw_set_saturate(p, 0);
734 }
735
736 /**
737 * For GLSL shaders, this KIL will be unconditional.
738 * It may be contained inside an IF/ENDIF structure of course.
739 */
740 static void emit_kil(struct brw_wm_compile *c)
741 {
742 struct brw_compile *p = &c->func;
743 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
744 brw_push_insn_state(p);
745 brw_set_mask_control(p, BRW_MASK_DISABLE);
746 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
747 brw_AND(p, depth, c->emit_mask_reg, depth);
748 brw_pop_insn_state(p);
749 }
750
/** View 'reg' as a W vector of the upper 16 bits of each 32-bit dword. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
                   0, 8, 2 );
}
756
/** View 'reg' as a W vector of the lower 16 bits of each 32-bit dword. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
761
/** View 'reg' as a B vector of the even-offset bytes. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
766
/** View 'reg' as a B vector of the odd-offset bytes. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
                   0, 16, 2 );
}
772
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/* Calling convention (see emit_noise1/invoke_subroutine): the caller's
   coordinate sits in the temp at stack position mark-2; the scalar result
   is written back to that same temp. */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
       x0, x1, /* gradients at each end */
       t, tmp[ 2 ], /* float temporaries */
       itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* the caller's input/output temp */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
865
/**
 * Emit OPCODE_NOISE1: invoke the shared noise1_sub subroutine and smear
 * its scalar result across all write-enabled dst channels.
 */
static void emit_noise1( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
    struct brw_compile *p = &c->func;
    struct brw_reg src, param, dst;
    GLuint mask = inst->DstReg.WriteMask;
    int i;
    int mark = mark_tmps( c );

    /* noise1_sub locates param via lookup_tmp(mark - 2), which requires
       the temp stack to be empty here */
    assert( mark == 0 );

    src = get_src_reg( c, inst, 0, 0 );

    param = alloc_tmp( c );

    brw_MOV( p, param, src );

    invoke_subroutine( c, SUB_NOISE1, noise1_sub );

    /* Fill in the result: noise1_sub left it in param */
    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
    for (i = 0 ; i < 4; i++) {
       if (mask & (1<<i)) {
          dst = get_dst_reg(c, inst, i);
          brw_MOV( p, dst, param );
       }
    }
    if( inst->SaturateMode == SATURATE_ZERO_ONE )
       brw_set_saturate( p, 0 );

    release_tmps( c, mark );
}
898
/* Two-dimensional Perlin-style noise; see the comment above noise1_sub.
   Calling convention (see emit_noise2): the x/y coordinates sit in the
   temps at stack positions mark-3 and mark-2; the scalar result is left
   in the mark-3 temp (param0). */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
       x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
       t, tmp[ 4 ], /* float temporaries */
       itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
       tmp[ i ] = alloc_tmp( c );
       itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* the caller's input/output temps */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1035
1036 static void emit_noise2( struct brw_wm_compile *c,
1037 const struct prog_instruction *inst )
1038 {
1039 struct brw_compile *p = &c->func;
1040 struct brw_reg src0, src1, param0, param1, dst;
1041 GLuint mask = inst->DstReg.WriteMask;
1042 int i;
1043 int mark = mark_tmps( c );
1044
1045 assert( mark == 0 );
1046
1047 src0 = get_src_reg( c, inst, 0, 0 );
1048 src1 = get_src_reg( c, inst, 0, 1 );
1049
1050 param0 = alloc_tmp( c );
1051 param1 = alloc_tmp( c );
1052
1053 brw_MOV( p, param0, src0 );
1054 brw_MOV( p, param1, src1 );
1055
1056 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1057
1058 /* Fill in the result: */
1059 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1060 for (i = 0 ; i < 4; i++) {
1061 if (mask & (1<<i)) {
1062 dst = get_dst_reg(c, inst, i);
1063 brw_MOV( p, dst, param0 );
1064 }
1065 }
1066 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1067 brw_set_saturate( p, 0 );
1068
1069 release_tmps( c, mark );
1070 }
1071
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * On entry the three noise coordinates sit in the temporaries allocated by
 * emit_noise3() just before the subroutine call; the result is returned in
 * the first of those slots.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
	xi, yi, zi, /* interpolation coefficients */
	t, tmp[ 8 ], /* float temporaries */
	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 8; i++ ) {
	/* Each tmp register is viewed three ways: as floats, as unsigned
	   dwords, and as a 16-wide vector of unsigned words. */
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The coordinates were placed in the temps just below our mark by
       emit_noise3(); the extra slot at mark - 1 is presumably consumed by
       the subroutine-call mechanism — TODO confirm against
       invoke_subroutine(). */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    /* Align1 mode so we can address individual words within registers
       (see the low_words()/high_words() accesses below). */
    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	     brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift each hash so a fresh field of bits will determine the next
       gradient component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param0 - 1.0 here; it is not consumed until the
       front-face x component far below (no intervening writes to t). */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component (rear face: z offset is param2 itself) */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    /* Three more multiplies by t complete the t^3 factor of
       ((6t - 15)t + 10) * t^3. */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do.  Leave the result in tmp[ 0 ]
       (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t still holds param0 - 1.0, preloaded in the y-component section
       above. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param2 - 1.0 for the front-face z component below. */
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component (front face: z offset is param2 - 1) */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1338
1339 static void emit_noise3( struct brw_wm_compile *c,
1340 const struct prog_instruction *inst )
1341 {
1342 struct brw_compile *p = &c->func;
1343 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1344 GLuint mask = inst->DstReg.WriteMask;
1345 int i;
1346 int mark = mark_tmps( c );
1347
1348 assert( mark == 0 );
1349
1350 src0 = get_src_reg( c, inst, 0, 0 );
1351 src1 = get_src_reg( c, inst, 0, 1 );
1352 src2 = get_src_reg( c, inst, 0, 2 );
1353
1354 param0 = alloc_tmp( c );
1355 param1 = alloc_tmp( c );
1356 param2 = alloc_tmp( c );
1357
1358 brw_MOV( p, param0, src0 );
1359 brw_MOV( p, param1, src1 );
1360 brw_MOV( p, param2, src2 );
1361
1362 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1363
1364 /* Fill in the result: */
1365 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1366 for (i = 0 ; i < 4; i++) {
1367 if (mask & (1<<i)) {
1368 dst = get_dst_reg(c, inst, i);
1369 brw_MOV( p, dst, param0 );
1370 }
1371 }
1372 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1373 brw_set_saturate( p, 0 );
1374
1375 release_tmps( c, mark );
1376 }
1377
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg param[ 4 ],
	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
	w0, /* noise for the w=0 cube */
	floors[ 2 ], /* integer coordinates of base corner of hypercube */
	interp[ 4 ], /* interpolation coefficients */
	t, tmp[ 8 ], /* float temporaries */
	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i, j;
    int mark = mark_tmps( c );
    GLuint loop, origin;

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    w0 = alloc_tmp( c );
    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

    /* The four coordinates were placed in the temps just below our mark
       by emit_noise4(); the extra slot at mark - 1 is presumably consumed
       by the subroutine-call mechanism — TODO confirm against
       invoke_subroutine(). */
    for( i = 0; i < 4; i++ ) {
	param[ i ] = lookup_tmp( c, mark - 5 + i );
	interp[ i ] = alloc_tmp( c );
    }

    for( i = 0; i < 8; i++ ) {
	/* Each tmp register is viewed three ways: as floats, as unsigned
	   dwords, and as a 16-wide vector of unsigned words. */
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* We only want 16 bits of precision from the integral part of each
       co-ordinate, but unfortunately the RNDD semantics would saturate
       at 16 bits if we performed the operation directly to a 16-bit
       destination.  Therefore, we round to 32-bit temporaries where
       appropriate, and then store only the lower 16 bits. */
    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

    /* Modify the flag register here, because the side effect is useful
       later (see below).  We know for certain that all flags will be
       cleared, since the FRC instruction cannot possibly generate
       negative results.  Even for exceptional inputs (infinities, denormals,
       NaNs), the architecture guarantees that the L conditional is false. */
    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
    brw_FRC( p, param[ 0 ], param[ 0 ] );
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    for( i = 1; i < 4; i++ )
	brw_FRC( p, param[ i ], param[ i ] );

    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
       of all. */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
    /* ...and three more multiplies by t complete the t^3 factor of
       ((6t - 15)t + 10) * t^3. */
    for( j = 0; j < 3; j++ )
	for( i = 0; i < 4; i++ )
	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

    /* Mark the current address, as it will be a jump destination.  The
       following code will be executed twice: first, with the flag
       register clear indicating the w=0 case, and second with flags
       set for w=1. */
    loop = p->nr_insn;

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Since we have only 16 bits of precision in the hash, we
       must be careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	     brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	     brw_imm_uw( 0x9B93 ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 0xA359 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift each hash so a fresh field of bits will determine the next
       gradient component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time.  The flags were cleared by the FRC
       above for the first pass and set explicitly before the second pass,
       so the predicated pair below selects the correct value. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param[ 0 ] - 1.0 for the front-face x component below. */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Here we interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Interpolate in the y dimension: */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* Another interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* Exit the loop if we've computed both cubes...  The branch offset is
       a placeholder (0) here; it is patched below once the loop-exit
       address is known. */
    origin = p->nr_insn;
    brw_push_insn_state( p );
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
    brw_pop_insn_state( p );

    /* Save the result for the w=0 case, and increment the w coordinate: */
    brw_MOV( p, w0, tmp[ 0 ] );
    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 1 ) );

    /* Loop around for the other cube.  Explicitly set the flag register
       (unfortunately we must spend an extra instruction to do this: we
       can't rely on a side effect of the previous MOV or ADD because
       conditional modifiers which are normally true might be false in
       exceptional circumstances, e.g. given a NaN input; the add to
       brw_ip_reg() is not suitable because the IP is not an 8-vector).
       Branch offsets are in bytes and each instruction is 16 bytes,
       hence the << 4. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
    brw_pop_insn_state( p );

    /* Patch the previous conditional branch now that we know the
       destination address. */
    brw_set_src1( p->store + origin,
		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

    /* The very last interpolation. */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1761
1762 static void emit_noise4( struct brw_wm_compile *c,
1763 const struct prog_instruction *inst )
1764 {
1765 struct brw_compile *p = &c->func;
1766 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1767 GLuint mask = inst->DstReg.WriteMask;
1768 int i;
1769 int mark = mark_tmps( c );
1770
1771 assert( mark == 0 );
1772
1773 src0 = get_src_reg( c, inst, 0, 0 );
1774 src1 = get_src_reg( c, inst, 0, 1 );
1775 src2 = get_src_reg( c, inst, 0, 2 );
1776 src3 = get_src_reg( c, inst, 0, 3 );
1777
1778 param0 = alloc_tmp( c );
1779 param1 = alloc_tmp( c );
1780 param2 = alloc_tmp( c );
1781 param3 = alloc_tmp( c );
1782
1783 brw_MOV( p, param0, src0 );
1784 brw_MOV( p, param1, src1 );
1785 brw_MOV( p, param2, src2 );
1786 brw_MOV( p, param3, src3 );
1787
1788 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1789
1790 /* Fill in the result: */
1791 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1792 for (i = 0 ; i < 4; i++) {
1793 if (mask & (1<<i)) {
1794 dst = get_dst_reg(c, inst, i);
1795 brw_MOV( p, dst, param0 );
1796 }
1797 }
1798 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1799 brw_set_saturate( p, 0 );
1800
1801 release_tmps( c, mark );
1802 }
1803
/**
 * Resolve subroutine calls after code emit is done.
 *
 * CAL instructions are emitted before the addresses of their targets are
 * known; brw_resolve_cals() walks the finished instruction store and
 * patches in the real offsets (see brw_eu.h).
 */
static void post_wm_emit( struct brw_wm_compile *c )
{
   brw_resolve_cals(&c->func);
}
1811
1812 static void
1813 get_argument_regs(struct brw_wm_compile *c,
1814 const struct prog_instruction *inst,
1815 int index,
1816 struct brw_reg *regs,
1817 int mask)
1818 {
1819 int i;
1820
1821 for (i = 0; i < 4; i++) {
1822 if (mask & (1 << i))
1823 regs[i] = get_src_reg(c, inst, index, i);
1824 }
1825 }
1826
1827 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1828 {
1829 struct intel_context *intel = &brw->intel;
1830 #define MAX_IF_DEPTH 32
1831 #define MAX_LOOP_DEPTH 32
1832 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1833 GLuint i, if_depth = 0, loop_depth = 0;
1834 struct brw_compile *p = &c->func;
1835 struct brw_indirect stack_index = brw_indirect(0, 0);
1836
1837 c->out_of_regs = GL_FALSE;
1838
1839 prealloc_reg(c);
1840 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1841 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1842
1843 for (i = 0; i < c->nr_fp_insns; i++) {
1844 const struct prog_instruction *inst = &c->prog_instructions[i];
1845 int dst_flags;
1846 struct brw_reg args[3][4], dst[4];
1847 int j;
1848
1849 c->cur_inst = i;
1850
1851 #if 0
1852 _mesa_printf("Inst %d: ", i);
1853 _mesa_print_instruction(inst);
1854 #endif
1855
1856 /* fetch any constants that this instruction needs */
1857 if (c->fp->use_const_buffer)
1858 fetch_constants(c, inst);
1859
1860 if (inst->Opcode != OPCODE_ARL) {
1861 for (j = 0; j < 4; j++) {
1862 if (inst->DstReg.WriteMask & (1 << j))
1863 dst[j] = get_dst_reg(c, inst, j);
1864 else
1865 dst[j] = brw_null_reg();
1866 }
1867 }
1868 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1869 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
1870
1871 dst_flags = inst->DstReg.WriteMask;
1872 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1873 dst_flags |= SATURATE;
1874
1875 if (inst->CondUpdate)
1876 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1877 else
1878 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1879
1880 switch (inst->Opcode) {
1881 case WM_PIXELXY:
1882 emit_pixel_xy(c, dst, dst_flags);
1883 break;
1884 case WM_DELTAXY:
1885 emit_delta_xy(p, dst, dst_flags, args[0]);
1886 break;
1887 case WM_PIXELW:
1888 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1889 break;
1890 case WM_LINTERP:
1891 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1892 break;
1893 case WM_PINTERP:
1894 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1895 break;
1896 case WM_CINTERP:
1897 emit_cinterp(p, dst, dst_flags, args[0]);
1898 break;
1899 case WM_WPOSXY:
1900 emit_wpos_xy(c, dst, dst_flags, args[0]);
1901 break;
1902 case WM_FB_WRITE:
1903 emit_fb_write(c, args[0], args[1], args[2],
1904 INST_AUX_GET_TARGET(inst->Aux),
1905 inst->Aux & INST_AUX_EOT);
1906 break;
1907 case WM_FRONTFACING:
1908 emit_frontfacing(p, dst, dst_flags);
1909 break;
1910 case OPCODE_ADD:
1911 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1912 break;
1913 case OPCODE_ARL:
1914 emit_arl(c, inst);
1915 break;
1916 case OPCODE_FRC:
1917 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1918 break;
1919 case OPCODE_FLR:
1920 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1921 break;
1922 case OPCODE_LRP:
1923 unalias3(c, emit_lrp,
1924 dst, dst_flags, args[0], args[1], args[2]);
1925 break;
1926 case OPCODE_TRUNC:
1927 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1928 break;
1929 case OPCODE_MOV:
1930 case OPCODE_SWZ:
1931 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1932 break;
1933 case OPCODE_DP3:
1934 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1935 break;
1936 case OPCODE_DP4:
1937 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1938 break;
1939 case OPCODE_XPD:
1940 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1941 break;
1942 case OPCODE_DPH:
1943 emit_dph(p, dst, dst_flags, args[0], args[1]);
1944 break;
1945 case OPCODE_RCP:
1946 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1947 break;
1948 case OPCODE_RSQ:
1949 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1950 break;
1951 case OPCODE_SIN:
1952 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1953 break;
1954 case OPCODE_COS:
1955 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1956 break;
1957 case OPCODE_EX2:
1958 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1959 break;
1960 case OPCODE_LG2:
1961 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1962 break;
1963 case OPCODE_MIN:
1964 unalias2(c, emit_min, dst, dst_flags, args[0], args[1]);
1965 break;
1966 case OPCODE_MAX:
1967 unalias2(c, emit_max, dst, dst_flags, args[0], args[1]);
1968 break;
1969 case OPCODE_DDX:
1970 case OPCODE_DDY:
1971 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1972 args[0]);
1973 break;
1974 case OPCODE_SLT:
1975 emit_sop(p, dst, dst_flags,
1976 BRW_CONDITIONAL_L, args[0], args[1]);
1977 break;
1978 case OPCODE_SLE:
1979 emit_sop(p, dst, dst_flags,
1980 BRW_CONDITIONAL_LE, args[0], args[1]);
1981 break;
1982 case OPCODE_SGT:
1983 emit_sop(p, dst, dst_flags,
1984 BRW_CONDITIONAL_G, args[0], args[1]);
1985 break;
1986 case OPCODE_SGE:
1987 emit_sop(p, dst, dst_flags,
1988 BRW_CONDITIONAL_GE, args[0], args[1]);
1989 break;
1990 case OPCODE_SEQ:
1991 emit_sop(p, dst, dst_flags,
1992 BRW_CONDITIONAL_EQ, args[0], args[1]);
1993 break;
1994 case OPCODE_SNE:
1995 emit_sop(p, dst, dst_flags,
1996 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1997 break;
1998 case OPCODE_MUL:
1999 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2000 break;
2001 case OPCODE_POW:
2002 emit_math2(c, BRW_MATH_FUNCTION_POW,
2003 dst, dst_flags, args[0], args[1]);
2004 break;
2005 case OPCODE_MAD:
2006 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2007 break;
2008 case OPCODE_NOISE1:
2009 emit_noise1(c, inst);
2010 break;
2011 case OPCODE_NOISE2:
2012 emit_noise2(c, inst);
2013 break;
2014 case OPCODE_NOISE3:
2015 emit_noise3(c, inst);
2016 break;
2017 case OPCODE_NOISE4:
2018 emit_noise4(c, inst);
2019 break;
2020 case OPCODE_TEX:
2021 emit_tex(c, dst, dst_flags, args[0],
2022 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2023 0, 1, 0, 0),
2024 inst->TexSrcTarget,
2025 inst->TexSrcUnit,
2026 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
2027 break;
2028 case OPCODE_TXB:
2029 emit_txb(c, dst, dst_flags, args[0],
2030 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2031 0, 1, 0, 0),
2032 inst->TexSrcTarget,
2033 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2034 break;
2035 case OPCODE_KIL_NV:
2036 emit_kil(c);
2037 break;
2038 case OPCODE_IF:
2039 assert(if_depth < MAX_IF_DEPTH);
2040 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2041 break;
2042 case OPCODE_ELSE:
2043 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2044 break;
2045 case OPCODE_ENDIF:
2046 assert(if_depth > 0);
2047 brw_ENDIF(p, if_inst[--if_depth]);
2048 break;
2049 case OPCODE_BGNSUB:
2050 brw_save_label(p, inst->Comment, p->nr_insn);
2051 break;
2052 case OPCODE_ENDSUB:
2053 /* no-op */
2054 break;
2055 case OPCODE_CAL:
2056 brw_push_insn_state(p);
2057 brw_set_mask_control(p, BRW_MASK_DISABLE);
2058 brw_set_access_mode(p, BRW_ALIGN_1);
2059 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2060 brw_set_access_mode(p, BRW_ALIGN_16);
2061 brw_ADD(p, get_addr_reg(stack_index),
2062 get_addr_reg(stack_index), brw_imm_d(4));
2063 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2064 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2065 brw_pop_insn_state(p);
2066 break;
2067
2068 case OPCODE_RET:
2069 brw_push_insn_state(p);
2070 brw_set_mask_control(p, BRW_MASK_DISABLE);
2071 brw_ADD(p, get_addr_reg(stack_index),
2072 get_addr_reg(stack_index), brw_imm_d(-4));
2073 brw_set_access_mode(p, BRW_ALIGN_1);
2074 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2075 brw_set_access_mode(p, BRW_ALIGN_16);
2076 brw_pop_insn_state(p);
2077
2078 break;
2079 case OPCODE_BGNLOOP:
2080 /* XXX may need to invalidate the current_constant regs */
2081 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2082 break;
2083 case OPCODE_BRK:
2084 brw_BREAK(p);
2085 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2086 break;
2087 case OPCODE_CONT:
2088 brw_CONT(p);
2089 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2090 break;
2091 case OPCODE_ENDLOOP:
2092 {
2093 struct brw_instruction *inst0, *inst1;
2094 GLuint br = 1;
2095
2096 if (intel->is_ironlake)
2097 br = 2;
2098
2099 loop_depth--;
2100 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2101 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2102 while (inst0 > loop_inst[loop_depth]) {
2103 inst0--;
2104 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2105 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2106 inst0->bits3.if_else.pop_count = 0;
2107 }
2108 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2109 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2110 inst0->bits3.if_else.pop_count = 0;
2111 }
2112 }
2113 }
2114 break;
2115 default:
2116 _mesa_printf("unsupported IR in fragment shader %d\n",
2117 inst->Opcode);
2118 }
2119
2120 if (inst->CondUpdate)
2121 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2122 else
2123 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2124 }
2125 post_wm_emit(c);
2126
2127 if (INTEL_DEBUG & DEBUG_WM) {
2128 _mesa_printf("wm-native:\n");
2129 for (i = 0; i < p->nr_insn; i++)
2130 brw_disasm(stderr, &p->store[i]);
2131 _mesa_printf("\n");
2132 }
2133 }
2134
2135 /**
2136 * Do GPU code generation for shaders that use GLSL features such as
2137 * flow control. Other shaders will be compiled with the
2138 */
2139 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2140 {
2141 if (INTEL_DEBUG & DEBUG_WM) {
2142 _mesa_printf("brw_wm_glsl_emit:\n");
2143 }
2144
2145 /* initial instruction translation/simplification */
2146 brw_wm_pass_fp(c);
2147
2148 /* actual code generation */
2149 brw_wm_emit_glsl(brw, c);
2150
2151 if (INTEL_DEBUG & DEBUG_WM) {
2152 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2153 }
2154
2155 c->prog_data.total_grf = num_grf_used(c);
2156 c->prog_data.total_scratch = 0;
2157 }