src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13
  14 /**
  15  * Determine if the given fragment program uses GLSL features such
  16  * as flow conditionals, loops, subroutines.
  17  * Some GLSL shaders may use these features, others might not.
  18  */
  19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  20 {
  21     int i;
  22     for (i = 0; i < fp->Base.NumInstructions; i++) {
  23         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  24         switch (inst->Opcode) {
  25             case OPCODE_ARL:
  26             case OPCODE_IF:
  27             case OPCODE_ENDIF:
  28             case OPCODE_CAL:
  29             case OPCODE_BRK:
  30             case OPCODE_RET:
  31             case OPCODE_DDX:
  32             case OPCODE_DDY:
  33             case OPCODE_NOISE1:
  34             case OPCODE_NOISE2:
  35             case OPCODE_NOISE3:
  36             case OPCODE_NOISE4:
  37             case OPCODE_BGNLOOP:
  38                 return GL_TRUE;
  39             default:
  40                 break;
  41         }
  42     }
  43     return GL_FALSE;
  44 }
  45
  46
  47
  48 static void
  49 reclaim_temps(struct brw_wm_compile *c);
  50
  51
  52 /** Mark GRF register as used. */
  53 static void
  54 prealloc_grf(struct brw_wm_compile *c, int r)
  55 {
  56    c->used_grf[r] = GL_TRUE;
  57 }
  58
  59
  60 /** Mark given GRF register as not in use. */
  61 static void
  62 release_grf(struct brw_wm_compile *c, int r)
  63 {
  64    /*assert(c->used_grf[r]);*/
  65    c->used_grf[r] = GL_FALSE;
  66    c->first_free_grf = MIN2(c->first_free_grf, r);
  67 }
  68
  69
  70 /** Return index of a free GRF, mark it as used. */
  71 static int
  72 alloc_grf(struct brw_wm_compile *c)
  73 {
  74    GLuint r;
  75    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  76       if (!c->used_grf[r]) {
  77          c->used_grf[r] = GL_TRUE;
  78          c->first_free_grf = r + 1;  /* a guess */
  79          return r;
  80       }
  81    }
  82
  83    /* no free temps, try to reclaim some */
  84    reclaim_temps(c);
  85    c->first_free_grf = 0;
  86
  87    /* try alloc again */
  88    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  89       if (!c->used_grf[r]) {
  90          c->used_grf[r] = GL_TRUE;
  91          c->first_free_grf = r + 1;  /* a guess */
  92          return r;
  93       }
  94    }
  95
  96    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  97       assert(c->used_grf[r]);
  98    }
  99
 100    /* really, no free GRF regs found */
 101    if (!c->out_of_regs) {
 102       /* print warning once per compilation */
 103       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 104       c->out_of_regs = GL_TRUE;
 105    }
 106
 107    return -1;
 108 }
 109
 110
 111 /** Return number of GRF registers used */
 112 static int
 113 num_grf_used(const struct brw_wm_compile *c)
 114 {
 115    int r;
 116    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 117       if (c->used_grf[r])
 118          return r + 1;
 119    return 0;
 120 }
 121
 122
 123
 124 /**
 125  * Record the mapping of a Mesa register to a hardware register.
 126  */
 127 static void set_reg(struct brw_wm_compile *c, int file, int index,
 128         int component, struct brw_reg reg)
 129 {
 130     c->wm_regs[file][index][component].reg = reg;
 131     c->wm_regs[file][index][component].inited = GL_TRUE;
 132 }
 133
 134 /**
 135  * Examine instruction's write mask to find index of first component
 136  * enabled for writing.
 137  */
 138 static int get_scalar_dst_index(const struct prog_instruction *inst)
 139 {
 140     int i;
 141     for (i = 0; i < 4; i++)
 142         if (inst->DstReg.WriteMask & (1<<i))
 143             break;
 144     return i;
 145 }
 146
 147 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 148 {
 149     struct brw_reg reg;
 150
 151     /* if we need to allocate another temp, grow the tmp_regs[] array */
 152     if (c->tmp_index == c->tmp_max) {
 153        int r = alloc_grf(c);
 154        if (r < 0) {
 155           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 156           r = 50; /* XXX random register! */
 157        }
 158        c->tmp_regs[ c->tmp_max++ ] = r;
 159     }
 160
 161     /* form the GRF register */
 162     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 163     /*printf("alloc_temp %d\n", reg.nr);*/
 164     assert(reg.nr < BRW_WM_MAX_GRF);
 165     return reg;
 166
 167 }
 168
 169 /**
 170  * Save current temp register info.
 171  * There must be a matching call to release_tmps().
 172  */
 173 static int mark_tmps(struct brw_wm_compile *c)
 174 {
 175     return c->tmp_index;
 176 }
 177
 178 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 179 {
 180     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 181 }
 182
 183 static void release_tmps(struct brw_wm_compile *c, int mark)
 184 {
 185     c->tmp_index = mark;
 186 }
 187
 188 /**
 189  * Convert Mesa src register to brw register.
 190  *
 191  * Since we're running in SOA mode each Mesa register corresponds to four
 192  * hardware registers.  We allocate the hardware registers as needed here.
 193  *
 194  * \param file  register file, one of PROGRAM_x
 195  * \param index  register number
 196  * \param component  src component (X=0, Y=1, Z=2, W=3)
 197  * \param nr  not used?!?
 198  * \param neg  negate value?
 199  * \param abs  take absolute value?
 200  */
 201 static struct brw_reg
 202 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 203         int nr, GLuint neg, GLuint abs)
 204 {
 205     struct brw_reg reg;
 206     switch (file) {
 207         case PROGRAM_STATE_VAR:
 208         case PROGRAM_CONSTANT:
 209         case PROGRAM_UNIFORM:
 210             file = PROGRAM_STATE_VAR;
 211             break;
 212         case PROGRAM_UNDEFINED:
 213             return brw_null_reg();
 214         case PROGRAM_TEMPORARY:
 215         case PROGRAM_INPUT:
 216         case PROGRAM_OUTPUT:
 217         case PROGRAM_PAYLOAD:
 218             break;
 219         default:
 220             _mesa_problem(NULL, "Unexpected file in get_reg()");
 221             return brw_null_reg();
 222     }
 223
 224     assert(index < 256);
 225     assert(component < 4);
 226
 227     /* see if we've already allocated a HW register for this Mesa register */
 228     if (c->wm_regs[file][index][component].inited) {
 229        /* yes, re-use */
 230        reg = c->wm_regs[file][index][component].reg;
 231     }
 232     else {
 233         /* no, allocate new register */
 234        int grf = alloc_grf(c);
 235        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 236        if (grf < 0) {
 237           /* totally out of temps */
 238           grf = 51; /* XXX random register! */
 239        }
 240
 241        reg = brw_vec8_grf(grf, 0);
 242        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 243
 244        set_reg(c, file, index, component, reg);
 245     }
 246
 247     if (neg & (1 << component)) {
 248         reg = negate(reg);
 249     }
 250     if (abs)
 251         reg = brw_abs(reg);
 252     return reg;
 253 }
 254
 255
 256
 257 /**
 258  * This is called if we run out of GRF registers.  Examine the live intervals
 259  * of temp regs in the program and free those which won't be used again.
 260  */
 261 static void
 262 reclaim_temps(struct brw_wm_compile *c)
 263 {
 264    GLint intBegin[MAX_PROGRAM_TEMPS];
 265    GLint intEnd[MAX_PROGRAM_TEMPS];
 266    int index;
 267
 268    /*printf("Reclaim temps:\n");*/
 269
 270    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 271                              intBegin, intEnd);
 272
 273    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 274       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 275          /* program temp[i] can be freed */
 276          int component;
 277          /*printf("  temp[%d] is dead\n", index);*/
 278          for (component = 0; component < 4; component++) {
 279             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 280                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 281                release_grf(c, r);
 282                /*
 283                printf("  Reclaim temp %d, reg %d at inst %d\n",
 284                       index, r, c->cur_inst);
 285                */
 286                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 287             }
 288          }
 289       }
 290    }
 291 }
 292
 293
 294
 295
 296 /**
 297  * Preallocate registers.  This sets up the Mesa to hardware register
 298  * mapping for certain registers, such as constants (uniforms/state vars)
 299  * and shader inputs.
 300  */
 301 static void prealloc_reg(struct brw_wm_compile *c)
 302 {
 303     int i, j;
 304     struct brw_reg reg;
 305     int urb_read_length = 0;
 306     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 307     GLuint reg_index = 0;
 308
 309     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 310     c->first_free_grf = 0;
 311
 312     for (i = 0; i < 4; i++) {
 313         if (i < c->key.nr_depth_regs)
 314             reg = brw_vec8_grf(i * 2, 0);
 315         else
 316             reg = brw_vec8_grf(0, 0);
 317         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 318     }
 319     reg_index += 2 * c->key.nr_depth_regs;
 320
 321     /* constants */
 322     {
 323         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 324         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 325
 326         /* use a real constant buffer, or just use a section of the GRF? */
 327         /* XXX this heuristic may need adjustment... */
 328         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 329            c->fp->use_const_buffer = GL_TRUE;
 330         else
 331            c->fp->use_const_buffer = GL_FALSE;
 332         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 333
 334         if (c->fp->use_const_buffer) {
 335            /* We'll use a real constant buffer and fetch constants from
 336             * it with a dataport read message.
 337             */
 338
 339            /* number of float constants in CURBE */
 340            c->prog_data.nr_params = 0;
 341         }
 342         else {
 343            const struct gl_program_parameter_list *plist =
 344               c->fp->program.Base.Parameters;
 345            int index = 0;
 346
 347            /* number of float constants in CURBE */
 348            c->prog_data.nr_params = 4 * nr_params;
 349
 350            /* loop over program constants (float[4]) */
 351            for (i = 0; i < nr_params; i++) {
 352               /* loop over XYZW channels */
 353               for (j = 0; j < 4; j++, index++) {
 354                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 355                  /* Save pointer to parameter/constant value.
 356                   * Constants will be copied in prepare_constant_buffer()
 357                   */
 358                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 359                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 360               }
 361            }
 362            /* number of constant regs used (each reg is float[8]) */
 363            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 364            reg_index += c->nr_creg;
 365         }
 366     }
 367
 368     /* fragment shader inputs */
 369     for (i = 0; i < VERT_RESULT_MAX; i++) {
 370        int fp_input;
 371
 372        if (i >= VERT_RESULT_VAR0)
 373           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 374        else if (i <= VERT_RESULT_TEX7)
 375           fp_input = i;
 376        else
 377           fp_input = -1;
 378
 379        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 380           urb_read_length = reg_index;
 381           reg = brw_vec8_grf(reg_index, 0);
 382           for (j = 0; j < 4; j++)
 383              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 384        }
 385        if (c->key.vp_outputs_written & (1 << i)) {
 386           reg_index += 2;
 387        }
 388     }
 389
 390     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 391     c->prog_data.urb_read_length = urb_read_length;
 392     c->prog_data.curb_read_length = c->nr_creg;
 393     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 394     reg_index++;
 395     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 396     reg_index += 2;
 397
 398     /* mark GRF regs [0..reg_index-1] as in-use */
 399     for (i = 0; i < reg_index; i++)
 400        prealloc_grf(c, i);
 401
 402     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 403     prealloc_grf(c, 126);
 404     prealloc_grf(c, 127);
 405
 406     /* An instruction may reference up to three constants.
 407      * They'll be found in these registers.
 408      * XXX alloc these on demand!
 409      */
 410     if (c->fp->use_const_buffer) {
 411        for (i = 0; i < 3; i++) {
 412           c->current_const[i].index = -1;
 413           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 414        }
 415     }
 416 #if 0
 417     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 418     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 419 #endif
 420 }
 421
 422
 423 /**
 424  * Check if any of the instruction's src registers are constants, uniforms,
 425  * or statevars.  If so, fetch any constants that we don't already have in
 426  * the three GRF slots.
 427  */
 428 static void fetch_constants(struct brw_wm_compile *c,
 429                             const struct prog_instruction *inst)
 430 {
 431    struct brw_compile *p = &c->func;
 432    GLuint i;
 433
 434    /* loop over instruction src regs */
 435    for (i = 0; i < 3; i++) {
 436       const struct prog_src_register *src = &inst->SrcReg[i];
 437       if (src->File == PROGRAM_STATE_VAR ||
 438           src->File == PROGRAM_CONSTANT ||
 439           src->File == PROGRAM_UNIFORM) {
 440          c->current_const[i].index = src->Index;
 441
 442 #if 0
 443          printf("  fetch const[%d] for arg %d into reg %d\n",
 444                 src->Index, i, c->current_const[i].reg.nr);
 445 #endif
 446
 447          /* need to fetch the constant now */
 448          brw_dp_READ_4(p,
 449                        c->current_const[i].reg,  /* writeback dest */
 450                        src->RelAddr,             /* relative indexing? */
 451                        16 * src->Index,          /* byte offset */
 452                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 453                        );
 454       }
 455    }
 456 }
 457
 458
 459 /**
 460  * Convert Mesa dst register to brw register.
 461  */
 462 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 463                                   const struct prog_instruction *inst,
 464                                   GLuint component)
 465 {
 466     const int nr = 1;
 467     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 468             0, 0);
 469 }
 470
 471
 472 static struct brw_reg
 473 get_src_reg_const(struct brw_wm_compile *c,
 474                   const struct prog_instruction *inst,
 475                   GLuint srcRegIndex, GLuint component)
 476 {
 477    /* We should have already fetched the constant from the constant
 478     * buffer in fetch_constants().  Now we just have to return a
 479     * register description that extracts the needed component and
 480     * smears it across all eight vector components.
 481     */
 482    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 483    struct brw_reg const_reg;
 484
 485    assert(component < 4);
 486    assert(srcRegIndex < 3);
 487    assert(c->current_const[srcRegIndex].index != -1);
 488    const_reg = c->current_const[srcRegIndex].reg;
 489
 490    /* extract desired float from the const_reg, and smear */
 491    const_reg = stride(const_reg, 0, 1, 0);
 492    const_reg.subnr = component * 4;
 493
 494    if (src->Negate & (1 << component))
 495       const_reg = negate(const_reg);
 496    if (src->Abs)
 497       const_reg = brw_abs(const_reg);
 498
 499 #if 0
 500    printf("  form const[%d].%d for arg %d, reg %d\n",
 501           c->current_const[srcRegIndex].index,
 502           component,
 503           srcRegIndex,
 504           const_reg.nr);
 505 #endif
 506
 507    return const_reg;
 508 }
 509
 510
 511 /**
 512  * Convert Mesa src register to brw register.
 513  */
 514 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 515                                   const struct prog_instruction *inst,
 516                                   GLuint srcRegIndex, GLuint channel)
 517 {
 518     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 519     const GLuint nr = 1;
 520     const GLuint component = GET_SWZ(src->Swizzle, channel);
 521
 522     /* Extended swizzle terms */
 523     if (component == SWIZZLE_ZERO) {
 524        return brw_imm_f(0.0F);
 525     }
 526     else if (component == SWIZZLE_ONE) {
 527        return brw_imm_f(1.0F);
 528     }
 529
 530     if (c->fp->use_const_buffer &&
 531         (src->File == PROGRAM_STATE_VAR ||
 532          src->File == PROGRAM_CONSTANT ||
 533          src->File == PROGRAM_UNIFORM)) {
 534        return get_src_reg_const(c, inst, srcRegIndex, component);
 535     }
 536     else {
 537        /* other type of source register */
 538        return get_reg(c, src->File, src->Index, component, nr,
 539                       src->Negate, src->Abs);
 540     }
 541 }
 542
 543
 544 /**
 545  * Same as \sa get_src_reg() but if the register is a literal, emit
 546  * a brw_reg encoding the literal.
 547  * Note that a brw instruction only allows one src operand to be a literal.
 548  * For instructions with more than one operand, only the second can be a
 549  * literal.  This means that we treat some literals as constants/uniforms
 550  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 551  *
 552  */
 553 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 554                                       const struct prog_instruction *inst,
 555                                       GLuint srcRegIndex, GLuint channel)
 556 {
 557     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 558     if (src->File == PROGRAM_CONSTANT) {
 559        /* a literal */
 560        const int component = GET_SWZ(src->Swizzle, channel);
 561        const GLfloat *param =
 562           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 563        GLfloat value = param[component];
 564        if (src->Negate & (1 << channel))
 565           value = -value;
 566        if (src->Abs)
 567           value = FABSF(value);
 568 #if 0
 569        printf("  form immed value %f for chan %d\n", value, channel);
 570 #endif
 571        return brw_imm_f(value);
 572     }
 573     else {
 574        return get_src_reg(c, inst, srcRegIndex, channel);
 575     }
 576 }
 577
 578
 579 /**
 580  * Subroutines are minimal support for resusable instruction sequences.
 581  * They are implemented as simply as possible to minimise overhead: there
 582  * is no explicit support for communication between the caller and callee
 583  * other than saving the return address in a temporary register, nor is
 584  * there any automatic local storage.  This implies that great care is
 585  * required before attempting reentrancy or any kind of nested
 586  * subroutine invocations.
 587  */
 588 static void invoke_subroutine( struct brw_wm_compile *c,
 589                                enum _subroutine subroutine,
 590                                void (*emit)( struct brw_wm_compile * ) )
 591 {
 592     struct brw_compile *p = &c->func;
 593
 594     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 595
 596     if( c->subroutines[ subroutine ] ) {
 597         /* subroutine previously emitted: reuse existing instructions */
 598
 599         int mark = mark_tmps( c );
 600         struct brw_reg return_address = retype( alloc_tmp( c ),
 601                                                 BRW_REGISTER_TYPE_UD );
 602         int here = p->nr_insn;
 603
 604         brw_push_insn_state(p);
 605         brw_set_mask_control(p, BRW_MASK_DISABLE);
 606         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 607
 608         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 609                  brw_imm_d( ( c->subroutines[ subroutine ] -
 610                               here - 1 ) << 4 ) );
 611         brw_pop_insn_state(p);
 612
 613         release_tmps( c, mark );
 614     } else {
 615         /* previously unused subroutine: emit, and mark for later reuse */
 616
 617         int mark = mark_tmps( c );
 618         struct brw_reg return_address = retype( alloc_tmp( c ),
 619                                                 BRW_REGISTER_TYPE_UD );
 620         struct brw_instruction *calc;
 621         int base = p->nr_insn;
 622
 623         brw_push_insn_state(p);
 624         brw_set_mask_control(p, BRW_MASK_DISABLE);
 625         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 626         brw_pop_insn_state(p);
 627
 628         c->subroutines[ subroutine ] = p->nr_insn;
 629
 630         emit( c );
 631
 632         brw_push_insn_state(p);
 633         brw_set_mask_control(p, BRW_MASK_DISABLE);
 634         brw_MOV( p, brw_ip_reg(), return_address );
 635         brw_pop_insn_state(p);
 636
 637         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 638
 639         release_tmps( c, mark );
 640     }
 641 }
 642
 643 static void emit_trunc( struct brw_wm_compile *c,
 644                         const struct prog_instruction *inst)
 645 {
 646     int i;
 647     struct brw_compile *p = &c->func;
 648     GLuint mask = inst->DstReg.WriteMask;
 649     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 650     for (i = 0; i < 4; i++) {
 651         if (mask & (1<<i)) {
 652             struct brw_reg src, dst;
 653             dst = get_dst_reg(c, inst, i);
 654             src = get_src_reg(c, inst, 0, i);
 655             brw_RNDZ(p, dst, src);
 656         }
 657     }
 658     brw_set_saturate(p, 0);
 659 }
 660
 661 static void emit_mov( struct brw_wm_compile *c,
 662                       const struct prog_instruction *inst)
 663 {
 664     int i;
 665     struct brw_compile *p = &c->func;
 666     GLuint mask = inst->DstReg.WriteMask;
 667     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 668     for (i = 0; i < 4; i++) {
 669         if (mask & (1<<i)) {
 670             struct brw_reg src, dst;
 671             dst = get_dst_reg(c, inst, i);
 672             /* XXX some moves from immediate value don't work reliably!!! */
 673             /*src = get_src_reg_imm(c, inst, 0, i);*/
 674             src = get_src_reg(c, inst, 0, i);
 675             brw_MOV(p, dst, src);
 676         }
 677     }
 678     brw_set_saturate(p, 0);
 679 }
 680
 681 static void emit_pixel_xy(struct brw_wm_compile *c,
 682                           const struct prog_instruction *inst)
 683 {
 684     struct brw_reg r1 = brw_vec1_grf(1, 0);
 685     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 686
 687     struct brw_reg dst0, dst1;
 688     struct brw_compile *p = &c->func;
 689     GLuint mask = inst->DstReg.WriteMask;
 690
 691     dst0 = get_dst_reg(c, inst, 0);
 692     dst1 = get_dst_reg(c, inst, 1);
 693     /* Calculate pixel centers by adding 1 or 0 to each of the
 694      * micro-tile coordinates passed in r1.
 695      */
 696     if (mask & WRITEMASK_X) {
 697         brw_ADD(p,
 698                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 699                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 700                 brw_imm_v(0x10101010));
 701     }
 702
 703     if (mask & WRITEMASK_Y) {
 704         brw_ADD(p,
 705                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 706                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 707                 brw_imm_v(0x11001100));
 708     }
 709 }
 710
 711 static void emit_delta_xy(struct brw_wm_compile *c,
 712                           const struct prog_instruction *inst)
 713 {
 714     struct brw_reg r1 = brw_vec1_grf(1, 0);
 715     struct brw_reg dst0, dst1, src0, src1;
 716     struct brw_compile *p = &c->func;
 717     GLuint mask = inst->DstReg.WriteMask;
 718
 719     dst0 = get_dst_reg(c, inst, 0);
 720     dst1 = get_dst_reg(c, inst, 1);
 721     src0 = get_src_reg(c, inst, 0, 0);
 722     src1 = get_src_reg(c, inst, 0, 1);
 723     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 724      * centers.
 725      */
 726     if (mask & WRITEMASK_X) {
 727         brw_ADD(p,
 728                 dst0,
 729                 retype(src0, BRW_REGISTER_TYPE_UW),
 730                 negate(r1));
 731     }
 732
 733     if (mask & WRITEMASK_Y) {
 734         brw_ADD(p,
 735                 dst1,
 736                 retype(src1, BRW_REGISTER_TYPE_UW),
 737                 negate(suboffset(r1,1)));
 738
 739     }
 740 }
 741
 742 static void fire_fb_write( struct brw_wm_compile *c,
 743                            GLuint base_reg,
 744                            GLuint nr,
 745                            GLuint target,
 746                            GLuint eot)
 747 {
 748     struct brw_compile *p = &c->func;
 749     /* Pass through control information:
 750      */
 751     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 752     {
 753         brw_push_insn_state(p);
 754         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 755         brw_MOV(p,
 756                 brw_message_reg(base_reg + 1),
 757                 brw_vec8_grf(1, 0));
 758         brw_pop_insn_state(p);
 759     }
 760     /* Send framebuffer write message: */
 761     brw_fb_WRITE(p,
 762             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 763             base_reg,
 764             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 765             target,
 766             nr,
 767             0,
 768             eot);
 769 }
 770
 771 static void emit_fb_write(struct brw_wm_compile *c,
 772                           const struct prog_instruction *inst)
 773 {
 774     struct brw_compile *p = &c->func;
 775     int nr = 2;
 776     int channel;
 777     GLuint target, eot;
 778     struct brw_reg src0;
 779
 780     /* Reserve a space for AA - may not be needed:
 781      */
 782     if (c->key.aa_dest_stencil_reg)
 783         nr += 1;
 784
 785     brw_push_insn_state(p);
 786     for (channel = 0; channel < 4; channel++) {
 787         src0 = get_src_reg(c,  inst, 0, channel);
 788         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 789         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 790         brw_MOV(p, brw_message_reg(nr + channel), src0);
 791     }
 792     /* skip over the regs populated above: */
 793     nr += 8;
 794     brw_pop_insn_state(p);
 795
 796     if (c->key.source_depth_to_render_target) {
 797        if (c->key.computes_depth) {
 798           src0 = get_src_reg(c, inst, 2, 2);
 799           brw_MOV(p, brw_message_reg(nr), src0);
 800        }
 801        else {
 802           src0 = get_src_reg(c, inst, 1, 1);
 803           brw_MOV(p, brw_message_reg(nr), src0);
 804        }
 805
 806        nr += 2;
 807     }
 808
 809     if (c->key.dest_depth_reg) {
 810         const GLuint comp = c->key.dest_depth_reg / 2;
 811         const GLuint off = c->key.dest_depth_reg % 2;
 812
 813         if (off != 0) {
 814             /* XXX this code needs review/testing */
 815             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 816             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 817
 818             brw_push_insn_state(p);
 819             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 820
 821             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 822             /* 2nd half? */
 823             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 824             brw_pop_insn_state(p);
 825         }
 826         else
 827         {
 828             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 829             brw_MOV(p, brw_message_reg(nr), src);
 830         }
 831         nr += 2;
 832    }
 833
 834     target = inst->Aux >> 1;
 835     eot = inst->Aux & 1;
 836     fire_fb_write(c, 0, nr, target, eot);
 837 }
 838
 839 static void emit_pixel_w( struct brw_wm_compile *c,
 840                           const struct prog_instruction *inst)
 841 {
 842     struct brw_compile *p = &c->func;
 843     GLuint mask = inst->DstReg.WriteMask;
 844     if (mask & WRITEMASK_W) {
 845         struct brw_reg dst, src0, delta0, delta1;
 846         struct brw_reg interp3;
 847
 848         dst = get_dst_reg(c, inst, 3);
 849         src0 = get_src_reg(c, inst, 0, 0);
 850         delta0 = get_src_reg(c, inst, 1, 0);
 851         delta1 = get_src_reg(c, inst, 1, 1);
 852
 853         interp3 = brw_vec1_grf(src0.nr+1, 4);
 854         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 855          * result straight into a message reg.
 856          */
 857         brw_LINE(p, brw_null_reg(), interp3, delta0);
 858         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 859
 860         /* Calc w */
 861         brw_math_16( p, dst,
 862                 BRW_MATH_FUNCTION_INV,
 863                 BRW_MATH_SATURATE_NONE,
 864                 2, brw_null_reg(),
 865                 BRW_MATH_PRECISION_FULL);
 866     }
 867 }
 868
 869 static void emit_linterp(struct brw_wm_compile *c,
 870                          const struct prog_instruction *inst)
 871 {
 872     struct brw_compile *p = &c->func;
 873     GLuint mask = inst->DstReg.WriteMask;
 874     struct brw_reg interp[4];
 875     struct brw_reg dst, delta0, delta1;
 876     struct brw_reg src0;
 877     GLuint nr, i;
 878
 879     src0 = get_src_reg(c, inst, 0, 0);
 880     delta0 = get_src_reg(c, inst, 1, 0);
 881     delta1 = get_src_reg(c, inst, 1, 1);
 882     nr = src0.nr;
 883
 884     interp[0] = brw_vec1_grf(nr, 0);
 885     interp[1] = brw_vec1_grf(nr, 4);
 886     interp[2] = brw_vec1_grf(nr+1, 0);
 887     interp[3] = brw_vec1_grf(nr+1, 4);
 888
 889     for(i = 0; i < 4; i++ ) {
 890         if (mask & (1<<i)) {
 891             dst = get_dst_reg(c, inst, i);
 892             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 893             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 894         }
 895     }
 896 }
 897
 898 static void emit_cinterp(struct brw_wm_compile *c,
 899                          const struct prog_instruction *inst)
 900 {
 901     struct brw_compile *p = &c->func;
 902     GLuint mask = inst->DstReg.WriteMask;
 903
 904     struct brw_reg interp[4];
 905     struct brw_reg dst, src0;
 906     GLuint nr, i;
 907
 908     src0 = get_src_reg(c, inst, 0, 0);
 909     nr = src0.nr;
 910
 911     interp[0] = brw_vec1_grf(nr, 0);
 912     interp[1] = brw_vec1_grf(nr, 4);
 913     interp[2] = brw_vec1_grf(nr+1, 0);
 914     interp[3] = brw_vec1_grf(nr+1, 4);
 915
 916     for(i = 0; i < 4; i++ ) {
 917         if (mask & (1<<i)) {
 918             dst = get_dst_reg(c, inst, i);
 919             brw_MOV(p, dst, suboffset(interp[i],3));
 920         }
 921     }
 922 }
 923
 924 static void emit_pinterp(struct brw_wm_compile *c,
 925                          const struct prog_instruction *inst)
 926 {
 927     struct brw_compile *p = &c->func;
 928     GLuint mask = inst->DstReg.WriteMask;
 929
 930     struct brw_reg interp[4];
 931     struct brw_reg dst, delta0, delta1;
 932     struct brw_reg src0, w;
 933     GLuint nr, i;
 934
 935     src0 = get_src_reg(c, inst, 0, 0);
 936     delta0 = get_src_reg(c, inst, 1, 0);
 937     delta1 = get_src_reg(c, inst, 1, 1);
 938     w = get_src_reg(c, inst, 2, 3);
 939     nr = src0.nr;
 940
 941     interp[0] = brw_vec1_grf(nr, 0);
 942     interp[1] = brw_vec1_grf(nr, 4);
 943     interp[2] = brw_vec1_grf(nr+1, 0);
 944     interp[3] = brw_vec1_grf(nr+1, 4);
 945
 946     for(i = 0; i < 4; i++ ) {
 947         if (mask & (1<<i)) {
 948             dst = get_dst_reg(c, inst, i);
 949             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 950             brw_MAC(p, dst, suboffset(interp[i],1),
 951                     delta1);
 952             brw_MUL(p, dst, dst, w);
 953         }
 954     }
 955 }
 956
 957 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 958 static void emit_frontfacing(struct brw_wm_compile *c,
 959                              const struct prog_instruction *inst)
 960 {
 961     struct brw_compile *p = &c->func;
 962     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 963     struct brw_reg dst;
 964     GLuint mask = inst->DstReg.WriteMask;
 965     int i;
 966
 967     for (i = 0; i < 4; i++) {
 968         if (mask & (1<<i)) {
 969             dst = get_dst_reg(c, inst, i);
 970             brw_MOV(p, dst, brw_imm_f(0.0));
 971         }
 972     }
 973
 974     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 975      * us front face
 976      */
 977     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 978     for (i = 0; i < 4; i++) {
 979         if (mask & (1<<i)) {
 980             dst = get_dst_reg(c, inst, i);
 981             brw_MOV(p, dst, brw_imm_f(1.0));
 982         }
 983     }
 984     brw_set_predicate_control_flag_value(p, 0xff);
 985 }
 986
 987 static void emit_xpd(struct brw_wm_compile *c,
 988                      const struct prog_instruction *inst)
 989 {
 990     int i;
 991     struct brw_compile *p = &c->func;
 992     GLuint mask = inst->DstReg.WriteMask;
 993     for (i = 0; i < 4; i++) {
 994         GLuint i2 = (i+2)%3;
 995         GLuint i1 = (i+1)%3;
 996         if (mask & (1<<i)) {
 997             struct brw_reg src0, src1, dst;
 998             dst = get_dst_reg(c, inst, i);
 999             src0 = negate(get_src_reg(c, inst, 0, i2));
1000             src1 = get_src_reg_imm(c, inst, 1, i1);
1001             brw_MUL(p, brw_null_reg(), src0, src1);
1002             src0 = get_src_reg(c, inst, 0, i1);
1003             src1 = get_src_reg_imm(c, inst, 1, i2);
1004             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1005             brw_MAC(p, dst, src0, src1);
1006             brw_set_saturate(p, 0);
1007         }
1008     }
1009     brw_set_saturate(p, 0);
1010 }
1011
1012 static void emit_dp3(struct brw_wm_compile *c,
1013                      const struct prog_instruction *inst)
1014 {
1015     struct brw_reg src0[3], src1[3], dst;
1016     int i;
1017     struct brw_compile *p = &c->func;
1018     GLuint mask = inst->DstReg.WriteMask;
1019     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1020
1021     if (!(mask & WRITEMASK_XYZW))
1022         return;
1023
1024     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1025
1026     for (i = 0; i < 3; i++) {
1027         src0[i] = get_src_reg(c, inst, 0, i);
1028         src1[i] = get_src_reg_imm(c, inst, 1, i);
1029     }
1030
1031     dst = get_dst_reg(c, inst, dst_chan);
1032     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1033     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1034     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1035     brw_MAC(p, dst, src0[2], src1[2]);
1036     brw_set_saturate(p, 0);
1037 }
1038
1039 static void emit_dp4(struct brw_wm_compile *c,
1040                      const struct prog_instruction *inst)
1041 {
1042     struct brw_reg src0[4], src1[4], dst;
1043     int i;
1044     struct brw_compile *p = &c->func;
1045     GLuint mask = inst->DstReg.WriteMask;
1046     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1047
1048     if (!(mask & WRITEMASK_XYZW))
1049         return;
1050
1051     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1052
1053     for (i = 0; i < 4; i++) {
1054         src0[i] = get_src_reg(c, inst, 0, i);
1055         src1[i] = get_src_reg_imm(c, inst, 1, i);
1056     }
1057     dst = get_dst_reg(c, inst, dst_chan);
1058     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1059     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1060     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1061     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1062     brw_MAC(p, dst, src0[3], src1[3]);
1063     brw_set_saturate(p, 0);
1064 }
1065
1066 static void emit_dph(struct brw_wm_compile *c,
1067                      const struct prog_instruction *inst)
1068 {
1069     struct brw_reg src0[4], src1[4], dst;
1070     int i;
1071     struct brw_compile *p = &c->func;
1072     GLuint mask = inst->DstReg.WriteMask;
1073     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1074
1075     if (!(mask & WRITEMASK_XYZW))
1076         return;
1077
1078     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1079
1080     for (i = 0; i < 4; i++) {
1081         src0[i] = get_src_reg(c, inst, 0, i);
1082         src1[i] = get_src_reg_imm(c, inst, 1, i);
1083     }
1084     dst = get_dst_reg(c, inst, dst_chan);
1085     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1086     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1087     brw_MAC(p, dst, src0[2], src1[2]);
1088     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1089     brw_ADD(p, dst, dst, src1[3]);
1090     brw_set_saturate(p, 0);
1091 }
1092
1093 /**
1094  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1095  * Note that the result of the function is smeared across the dest
1096  * register's X, Y, Z and W channels (subject to writemasking of course).
1097  */
1098 static void emit_math1(struct brw_wm_compile *c,
1099                        const struct prog_instruction *inst, GLuint func)
1100 {
1101     struct brw_compile *p = &c->func;
1102     struct brw_reg src0, dst;
1103     GLuint mask = inst->DstReg.WriteMask;
1104     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1105
1106     if (!(mask & WRITEMASK_XYZW))
1107         return;
1108
1109     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1110
1111     /* Get first component of source register */
1112     dst = get_dst_reg(c, inst, dst_chan);
1113     src0 = get_src_reg(c, inst, 0, 0);
1114
1115     brw_MOV(p, brw_message_reg(2), src0);
1116     brw_math(p,
1117              dst,
1118              func,
1119              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1120              2,
1121              brw_null_reg(),
1122              BRW_MATH_DATA_VECTOR,
1123              BRW_MATH_PRECISION_FULL);
1124 }
1125
1126 static void emit_rcp(struct brw_wm_compile *c,
1127                      const struct prog_instruction *inst)
1128 {
1129     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1130 }
1131
1132 static void emit_rsq(struct brw_wm_compile *c,
1133                      const struct prog_instruction *inst)
1134 {
1135     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1136 }
1137
1138 static void emit_sin(struct brw_wm_compile *c,
1139                      const struct prog_instruction *inst)
1140 {
1141     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1142 }
1143
1144 static void emit_cos(struct brw_wm_compile *c,
1145                      const struct prog_instruction *inst)
1146 {
1147     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1148 }
1149
1150 static void emit_ex2(struct brw_wm_compile *c,
1151                      const struct prog_instruction *inst)
1152 {
1153     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1154 }
1155
1156 static void emit_lg2(struct brw_wm_compile *c,
1157                      const struct prog_instruction *inst)
1158 {
1159     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1160 }
1161
1162 static void emit_add(struct brw_wm_compile *c,
1163                      const struct prog_instruction *inst)
1164 {
1165     struct brw_compile *p = &c->func;
1166     struct brw_reg src0, src1, dst;
1167     GLuint mask = inst->DstReg.WriteMask;
1168     int i;
1169     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1170     for (i = 0 ; i < 4; i++) {
1171         if (mask & (1<<i)) {
1172             dst = get_dst_reg(c, inst, i);
1173             src0 = get_src_reg(c, inst, 0, i);
1174             src1 = get_src_reg_imm(c, inst, 1, i);
1175             brw_ADD(p, dst, src0, src1);
1176         }
1177     }
1178     brw_set_saturate(p, 0);
1179 }
1180
1181 static void emit_arl(struct brw_wm_compile *c,
1182                      const struct prog_instruction *inst)
1183 {
1184     struct brw_compile *p = &c->func;
1185     struct brw_reg src0, addr_reg;
1186     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1187     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1188                            BRW_ARF_ADDRESS, 0);
1189     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1190     brw_MOV(p, addr_reg, src0);
1191     brw_set_saturate(p, 0);
1192 }
1193
1194
1195 static void emit_mul(struct brw_wm_compile *c,
1196                      const struct prog_instruction *inst)
1197 {
1198     struct brw_compile *p = &c->func;
1199     struct brw_reg src0, src1, dst;
1200     GLuint mask = inst->DstReg.WriteMask;
1201     int i;
1202     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1203     for (i = 0 ; i < 4; i++) {
1204         if (mask & (1<<i)) {
1205             dst = get_dst_reg(c, inst, i);
1206             src0 = get_src_reg(c, inst, 0, i);
1207             src1 = get_src_reg_imm(c, inst, 1, i);
1208             brw_MUL(p, dst, src0, src1);
1209         }
1210     }
1211     brw_set_saturate(p, 0);
1212 }
1213
1214 static void emit_frc(struct brw_wm_compile *c,
1215                      const struct prog_instruction *inst)
1216 {
1217     struct brw_compile *p = &c->func;
1218     struct brw_reg src0, dst;
1219     GLuint mask = inst->DstReg.WriteMask;
1220     int i;
1221     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1222     for (i = 0 ; i < 4; i++) {
1223         if (mask & (1<<i)) {
1224             dst = get_dst_reg(c, inst, i);
1225             src0 = get_src_reg_imm(c, inst, 0, i);
1226             brw_FRC(p, dst, src0);
1227         }
1228     }
1229     if (inst->SaturateMode != SATURATE_OFF)
1230         brw_set_saturate(p, 0);
1231 }
1232
1233 static void emit_flr(struct brw_wm_compile *c,
1234                      const struct prog_instruction *inst)
1235 {
1236     struct brw_compile *p = &c->func;
1237     struct brw_reg src0, dst;
1238     GLuint mask = inst->DstReg.WriteMask;
1239     int i;
1240     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1241     for (i = 0 ; i < 4; i++) {
1242         if (mask & (1<<i)) {
1243             dst = get_dst_reg(c, inst, i);
1244             src0 = get_src_reg_imm(c, inst, 0, i);
1245             brw_RNDD(p, dst, src0);
1246         }
1247     }
1248     brw_set_saturate(p, 0);
1249 }
1250
1251
1252 static void emit_min_max(struct brw_wm_compile *c,
1253                          const struct prog_instruction *inst)
1254 {
1255     struct brw_compile *p = &c->func;
1256     const GLuint mask = inst->DstReg.WriteMask;
1257     const int mark = mark_tmps(c);
1258     int i;
1259     brw_push_insn_state(p);
1260     for (i = 0; i < 4; i++) {
1261         if (mask & (1<<i)) {
1262             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1263             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1264             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1265             struct brw_reg dst;
1266             /* if dst==src0 or dst==src1 we need to use a temp reg */
1267             GLboolean use_temp = brw_same_reg(dst, src0) ||
1268                                  brw_same_reg(dst, src1);
1269             if (use_temp)
1270                dst = alloc_tmp(c);
1271             else
1272                dst = real_dst;
1273
1274             /*
1275             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1276                    dst.nr, src0.nr, src1.nr);
1277             */
1278             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1279             brw_MOV(p, dst, src0);
1280             brw_set_saturate(p, 0);
1281
1282             if (inst->Opcode == OPCODE_MIN)
1283                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1284             else
1285                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1286
1287             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1288             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1289             brw_MOV(p, dst, src1);
1290             brw_set_saturate(p, 0);
1291             brw_set_predicate_control_flag_value(p, 0xff);
1292             if (use_temp)
1293                brw_MOV(p, real_dst, dst);
1294         }
1295     }
1296     brw_pop_insn_state(p);
1297     release_tmps(c, mark);
1298 }
1299
1300 static void emit_pow(struct brw_wm_compile *c,
1301                      const struct prog_instruction *inst)
1302 {
1303     struct brw_compile *p = &c->func;
1304     struct brw_reg dst, src0, src1;
1305     GLuint mask = inst->DstReg.WriteMask;
1306     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1307
1308     if (!(mask & WRITEMASK_XYZW))
1309         return;
1310
1311     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1312
1313     dst = get_dst_reg(c, inst, dst_chan);
1314     src0 = get_src_reg_imm(c, inst, 0, 0);
1315     src1 = get_src_reg_imm(c, inst, 1, 0);
1316
1317     brw_MOV(p, brw_message_reg(2), src0);
1318     brw_MOV(p, brw_message_reg(3), src1);
1319
1320     brw_math(p,
1321             dst,
1322             BRW_MATH_FUNCTION_POW,
1323             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1324             2,
1325             brw_null_reg(),
1326             BRW_MATH_DATA_VECTOR,
1327             BRW_MATH_PRECISION_FULL);
1328 }
1329
1330 static void emit_lrp(struct brw_wm_compile *c,
1331                      const struct prog_instruction *inst)
1332 {
1333     struct brw_compile *p = &c->func;
1334     GLuint mask = inst->DstReg.WriteMask;
1335     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1336     int i;
1337     int mark = mark_tmps(c);
1338     for (i = 0; i < 4; i++) {
1339         if (mask & (1<<i)) {
1340             dst = get_dst_reg(c, inst, i);
1341             src0 = get_src_reg(c, inst, 0, i);
1342
1343             src1 = get_src_reg_imm(c, inst, 1, i);
1344
1345             if (src1.nr == dst.nr) {
1346                 tmp1 = alloc_tmp(c);
1347                 brw_MOV(p, tmp1, src1);
1348             } else
1349                 tmp1 = src1;
1350
1351             src2 = get_src_reg(c, inst, 2, i);
1352             if (src2.nr == dst.nr) {
1353                 tmp2 = alloc_tmp(c);
1354                 brw_MOV(p, tmp2, src2);
1355             } else
1356                 tmp2 = src2;
1357
1358             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1359             brw_MUL(p, brw_null_reg(), dst, tmp2);
1360             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1361             brw_MAC(p, dst, src0, tmp1);
1362             brw_set_saturate(p, 0);
1363         }
1364         release_tmps(c, mark);
1365     }
1366 }
1367
1368 /**
1369  * For GLSL shaders, this KIL will be unconditional.
1370  * It may be contained inside an IF/ENDIF structure of course.
1371  */
1372 static void emit_kil(struct brw_wm_compile *c)
1373 {
1374     struct brw_compile *p = &c->func;
1375     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1376     brw_push_insn_state(p);
1377     brw_set_mask_control(p, BRW_MASK_DISABLE);
1378     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1379     brw_AND(p, depth, c->emit_mask_reg, depth);
1380     brw_pop_insn_state(p);
1381 }
1382
1383 static void emit_mad(struct brw_wm_compile *c,
1384                      const struct prog_instruction *inst)
1385 {
1386     struct brw_compile *p = &c->func;
1387     GLuint mask = inst->DstReg.WriteMask;
1388     struct brw_reg dst, src0, src1, src2;
1389     int i;
1390
1391     for (i = 0; i < 4; i++) {
1392         if (mask & (1<<i)) {
1393             dst = get_dst_reg(c, inst, i);
1394             src0 = get_src_reg(c, inst, 0, i);
1395             src1 = get_src_reg_imm(c, inst, 1, i);
1396             src2 = get_src_reg_imm(c, inst, 2, i);
1397             brw_MUL(p, dst, src0, src1);
1398
1399             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1400             brw_ADD(p, dst, dst, src2);
1401             brw_set_saturate(p, 0);
1402         }
1403     }
1404 }
1405
1406 static void emit_sop(struct brw_wm_compile *c,
1407                      const struct prog_instruction *inst, GLuint cond)
1408 {
1409     struct brw_compile *p = &c->func;
1410     GLuint mask = inst->DstReg.WriteMask;
1411     struct brw_reg dst, src0, src1;
1412     int i;
1413
1414     for (i = 0; i < 4; i++) {
1415         if (mask & (1<<i)) {
1416             dst = get_dst_reg(c, inst, i);
1417             src0 = get_src_reg(c, inst, 0, i);
1418             src1 = get_src_reg_imm(c, inst, 1, i);
1419             brw_push_insn_state(p);
1420             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1421             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1422             brw_MOV(p, dst, brw_imm_f(0.0));
1423             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1424             brw_MOV(p, dst, brw_imm_f(1.0));
1425             brw_pop_insn_state(p);
1426         }
1427     }
1428 }
1429
1430 static void emit_slt(struct brw_wm_compile *c,
1431                      const struct prog_instruction *inst)
1432 {
1433     emit_sop(c, inst, BRW_CONDITIONAL_L);
1434 }
1435
1436 static void emit_sle(struct brw_wm_compile *c,
1437                      const struct prog_instruction *inst)
1438 {
1439     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1440 }
1441
1442 static void emit_sgt(struct brw_wm_compile *c,
1443                      const struct prog_instruction *inst)
1444 {
1445     emit_sop(c, inst, BRW_CONDITIONAL_G);
1446 }
1447
1448 static void emit_sge(struct brw_wm_compile *c,
1449                      const struct prog_instruction *inst)
1450 {
1451     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1452 }
1453
1454 static void emit_seq(struct brw_wm_compile *c,
1455                      const struct prog_instruction *inst)
1456 {
1457     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1458 }
1459
1460 static void emit_sne(struct brw_wm_compile *c,
1461                      const struct prog_instruction *inst)
1462 {
1463     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1464 }
1465
1466 static void emit_ddx(struct brw_wm_compile *c,
1467                      const struct prog_instruction *inst)
1468 {
1469     struct brw_compile *p = &c->func;
1470     GLuint mask = inst->DstReg.WriteMask;
1471     struct brw_reg interp[4];
1472     struct brw_reg dst;
1473     struct brw_reg src0, w;
1474     GLuint nr, i;
1475     src0 = get_src_reg(c, inst, 0, 0);
1476     w = get_src_reg(c, inst, 1, 3);
1477     nr = src0.nr;
1478     interp[0] = brw_vec1_grf(nr, 0);
1479     interp[1] = brw_vec1_grf(nr, 4);
1480     interp[2] = brw_vec1_grf(nr+1, 0);
1481     interp[3] = brw_vec1_grf(nr+1, 4);
1482     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1483     for(i = 0; i < 4; i++ ) {
1484         if (mask & (1<<i)) {
1485             dst = get_dst_reg(c, inst, i);
1486             brw_MOV(p, dst, interp[i]);
1487             brw_MUL(p, dst, dst, w);
1488         }
1489     }
1490     brw_set_saturate(p, 0);
1491 }
1492
1493 static void emit_ddy(struct brw_wm_compile *c,
1494                      const struct prog_instruction *inst)
1495 {
1496     struct brw_compile *p = &c->func;
1497     GLuint mask = inst->DstReg.WriteMask;
1498     struct brw_reg interp[4];
1499     struct brw_reg dst;
1500     struct brw_reg src0, w;
1501     GLuint nr, i;
1502
1503     src0 = get_src_reg(c, inst, 0, 0);
1504     nr = src0.nr;
1505     w = get_src_reg(c, inst, 1, 3);
1506     interp[0] = brw_vec1_grf(nr, 0);
1507     interp[1] = brw_vec1_grf(nr, 4);
1508     interp[2] = brw_vec1_grf(nr+1, 0);
1509     interp[3] = brw_vec1_grf(nr+1, 4);
1510     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1511     for(i = 0; i < 4; i++ ) {
1512         if (mask & (1<<i)) {
1513             dst = get_dst_reg(c, inst, i);
1514             brw_MOV(p, dst, suboffset(interp[i], 1));
1515             brw_MUL(p, dst, dst, w);
1516         }
1517     }
1518     brw_set_saturate(p, 0);
1519 }
1520
1521 static INLINE struct brw_reg high_words( struct brw_reg reg )
1522 {
1523     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1524                    0, 8, 2 );
1525 }
1526
1527 static INLINE struct brw_reg low_words( struct brw_reg reg )
1528 {
1529     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1530 }
1531
1532 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1533 {
1534     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1535 }
1536
1537 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1538 {
1539     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1540                    0, 16, 2 );
1541 }
1542
1543 /* One-, two- and three-dimensional Perlin noise, similar to the description
1544    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1545 static void noise1_sub( struct brw_wm_compile *c ) {
1546
1547     struct brw_compile *p = &c->func;
1548     struct brw_reg param,
1549         x0, x1, /* gradients at each end */
1550         t, tmp[ 2 ], /* float temporaries */
1551         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1552     int i;
1553     int mark = mark_tmps( c );
1554
1555     x0 = alloc_tmp( c );
1556     x1 = alloc_tmp( c );
1557     t = alloc_tmp( c );
1558     tmp[ 0 ] = alloc_tmp( c );
1559     tmp[ 1 ] = alloc_tmp( c );
1560     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1561     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1562     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1563     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1564     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1565
1566     param = lookup_tmp( c, mark - 2 );
1567
1568     brw_set_access_mode( p, BRW_ALIGN_1 );
1569
1570     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1571
1572     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1573        be hashed.  Also compute the remainder (offset within the unit
1574        length), interleaved to reduce register dependency penalties. */
1575     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1576     brw_FRC( p, param, param );
1577     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1578     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1579     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1580
1581     /* We're now ready to perform the hashing.  The two hashes are
1582        interleaved for performance.  The hash function used is
1583        designed to rapidly achieve avalanche and require only 32x16
1584        bit multiplication, and 16-bit swizzles (which we get for
1585        free).  We can't use immediate operands in the multiplies,
1586        because immediates are permitted only in src1 and the 16-bit
1587        factor is permitted only in src0. */
1588     for( i = 0; i < 2; i++ )
1589         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1590     for( i = 0; i < 2; i++ )
1591        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1592                 high_words( itmp[ i ] ) );
1593     for( i = 0; i < 2; i++ )
1594         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1595     for( i = 0; i < 2; i++ )
1596        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1597                 high_words( itmp[ i ] ) );
1598     for( i = 0; i < 2; i++ )
1599         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1600     for( i = 0; i < 2; i++ )
1601        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1602                 high_words( itmp[ i ] ) );
1603
1604     /* Now we want to initialise the two gradients based on the
1605        hashes.  Format conversion from signed integer to float leaves
1606        everything scaled too high by a factor of pow( 2, 31 ), but
1607        we correct for that right at the end. */
1608     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1609     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1610     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1611
1612     brw_MUL( p, x0, x0, param );
1613     brw_MUL( p, x1, x1, t );
1614
1615     /* We interpolate between the gradients using the polynomial
1616        6t^5 - 15t^4 + 10t^3 (Perlin). */
1617     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1618     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1619     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1620     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1621     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1622     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1623                                            pipeline */
1624     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1625     brw_MUL( p, param, tmp[ 0 ], param );
1626     brw_MUL( p, x1, x1, param );
1627     brw_ADD( p, x0, x0, x1 );
1628     /* scale by pow( 2, -30 ), to compensate for the format conversion
1629        above and an extra factor of 2 so that a single gradient covers
1630        the [-1,1] range */
1631     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1632
1633     release_tmps( c, mark );
1634 }
1635
1636 static void emit_noise1( struct brw_wm_compile *c,
1637                          const struct prog_instruction *inst )
1638 {
1639     struct brw_compile *p = &c->func;
1640     struct brw_reg src, param, dst;
1641     GLuint mask = inst->DstReg.WriteMask;
1642     int i;
1643     int mark = mark_tmps( c );
1644
1645     assert( mark == 0 );
1646
1647     src = get_src_reg( c, inst, 0, 0 );
1648
1649     param = alloc_tmp( c );
1650
1651     brw_MOV( p, param, src );
1652
1653     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1654
1655     /* Fill in the result: */
1656     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1657     for (i = 0 ; i < 4; i++) {
1658         if (mask & (1<<i)) {
1659             dst = get_dst_reg(c, inst, i);
1660             brw_MOV( p, dst, param );
1661         }
1662     }
1663     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1664         brw_set_saturate( p, 0 );
1665
1666     release_tmps( c, mark );
1667 }
1668
1669 static void noise2_sub( struct brw_wm_compile *c ) {
1670
1671     struct brw_compile *p = &c->func;
1672     struct brw_reg param0, param1,
1673         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1674         t, tmp[ 4 ], /* float temporaries */
1675         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1676     int i;
1677     int mark = mark_tmps( c );
1678
1679     x0y0 = alloc_tmp( c );
1680     x0y1 = alloc_tmp( c );
1681     x1y0 = alloc_tmp( c );
1682     x1y1 = alloc_tmp( c );
1683     t = alloc_tmp( c );
1684     for( i = 0; i < 4; i++ ) {
1685         tmp[ i ] = alloc_tmp( c );
1686         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1687     }
1688     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1689     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1690     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1691
1692     param0 = lookup_tmp( c, mark - 3 );
1693     param1 = lookup_tmp( c, mark - 2 );
1694
1695     brw_set_access_mode( p, BRW_ALIGN_1 );
1696
1697     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1698        be hashed.  Also compute the remainders (offsets within the unit
1699        square), interleaved to reduce register dependency penalties. */
1700     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1701     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1702     brw_FRC( p, param0, param0 );
1703     brw_FRC( p, param1, param1 );
1704     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1705     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1706              low_words( itmp[ 1 ] ) );
1707     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1708     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1709     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1710     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1711     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1712
1713     /* We're now ready to perform the hashing.  The four hashes are
1714        interleaved for performance.  The hash function used is
1715        designed to rapidly achieve avalanche and require only 32x16
1716        bit multiplication, and 16-bit swizzles (which we get for
1717        free).  We can't use immediate operands in the multiplies,
1718        because immediates are permitted only in src1 and the 16-bit
1719        factor is permitted only in src0. */
1720     for( i = 0; i < 4; i++ )
1721         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1722     for( i = 0; i < 4; i++ )
1723         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1724                  high_words( itmp[ i ] ) );
1725     for( i = 0; i < 4; i++ )
1726         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1727     for( i = 0; i < 4; i++ )
1728         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1729                  high_words( itmp[ i ] ) );
1730     for( i = 0; i < 4; i++ )
1731         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1732     for( i = 0; i < 4; i++ )
1733         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1734                  high_words( itmp[ i ] ) );
1735
1736     /* Now we want to initialise the four gradients based on the
1737        hashes.  Format conversion from signed integer to float leaves
1738        everything scaled too high by a factor of pow( 2, 15 ), but
1739        we correct for that right at the end. */
1740     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1741     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1742     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1743     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1744     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1745
1746     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1747     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1748     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1749     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1750
1751     brw_MUL( p, x1y0, x1y0, t );
1752     brw_MUL( p, x1y1, x1y1, t );
1753     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1754     brw_MUL( p, x0y0, x0y0, param0 );
1755     brw_MUL( p, x0y1, x0y1, param0 );
1756
1757     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1758     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1759     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1760     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1761
1762     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1763     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1764     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1765     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1766
1767     /* We interpolate between the gradients using the polynomial
1768        6t^5 - 15t^4 + 10t^3 (Perlin). */
1769     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1770     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1771     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1772     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1773     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1774     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1775     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1776                                                  pipeline */
1777     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1778     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1779     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1780     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1781     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1782                                                  pipeline */
1783     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1784     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1785     brw_MUL( p, param0, tmp[ 0 ], param0 );
1786     brw_MUL( p, param1, tmp[ 1 ], param1 );
1787
1788     /* Here we interpolate in the y dimension... */
1789     brw_MUL( p, x0y1, x0y1, param1 );
1790     brw_MUL( p, x1y1, x1y1, param1 );
1791     brw_ADD( p, x0y0, x0y0, x0y1 );
1792     brw_ADD( p, x1y0, x1y0, x1y1 );
1793
1794     /* And now in x.  There are horrible register dependencies here,
1795        but we have nothing else to do. */
1796     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1797     brw_MUL( p, x1y0, x1y0, param0 );
1798     brw_ADD( p, x0y0, x0y0, x1y0 );
1799
1800     /* scale by pow( 2, -15 ), as described above */
1801     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1802
1803     release_tmps( c, mark );
1804 }
1805
1806 /** Emit NOISE2: stage both inputs in temporaries, invoke noise2_sub, then copy the result to each write-masked dst channel. */
1807 static void emit_noise2( struct brw_wm_compile *c, const struct prog_instruction *inst )
1808 {
1809     struct brw_compile *p = &c->func;
1810     const GLuint writemask = inst->DstReg.WriteMask;
1811     const int saturate = ( inst->SaturateMode == SATURATE_ZERO_ONE );
1812     struct brw_reg coord_x, coord_y, arg0, arg1;
1813     int chan;
1814     int base = mark_tmps( c );
1815
1816     /* mark_tmps() must report 0 here -- presumably noise2_sub relies on where the staged arguments sit. */
1817     assert( base == 0 );
1818
1819     /* Fetch the two inputs (get_src_reg with last argument 0 and 1) and copy them into fresh temporaries. */
1820     coord_x = get_src_reg( c, inst, 0, 0 );
1821     coord_y = get_src_reg( c, inst, 0, 1 );
1822     arg0 = alloc_tmp( c );
1823     arg1 = alloc_tmp( c );
1824     brw_MOV( p, arg0, coord_x );
1825     brw_MOV( p, arg1, coord_y );
1826
1827     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1828
1829     /* The result is read back from arg0; saturation is switched on only around these moves. */
1830     brw_set_saturate( p, saturate );
1831     for( chan = 0; chan < 4; chan++ ) {
1832         if( !( writemask & ( 1 << chan ) ) )
1833             continue;
1834         brw_MOV( p, get_dst_reg( c, inst, chan ), arg0 );
1835     }
1836     if( saturate )
1837         brw_set_saturate( p, 0 );
1838
1839     release_tmps( c, base );
1840 }
1841
1842 /**
1843  * Emit the body of the 3D noise subroutine: it reads three coordinate temporaries and
1844  * leaves the noise value in the first of them.  The three-dimensional case is much like
1845  * the one- and two- versions above, but since the number of corners is rapidly growing we
1846  * now pack 16 16-bit hashes into each register to extract more parallelism from the EUs. */
1847 static void noise3_sub( struct brw_wm_compile *c ) {
1848     /* Every brw_* call below goes through p, i.e. c->func. */
1849     struct brw_compile *p = &c->func;
1850     struct brw_reg param0, param1, param2,
1851         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1852         xi, yi, zi, /* interpolation coefficients */
1853         t, tmp[ 8 ], /* float temporaries */
1854         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1855         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1856     int i;
1857     int mark = mark_tmps( c ); /* handed back to release_tmps() at the end */
1858     /* Scratch registers from alloc_tmp(); tmp, itmp and wtmp are three typed views of the same eight registers. */
1859     x0y0 = alloc_tmp( c );
1860     x0y1 = alloc_tmp( c );
1861     x1y0 = alloc_tmp( c );
1862     x1y1 = alloc_tmp( c );
1863     xi = alloc_tmp( c );
1864     yi = alloc_tmp( c );
1865     zi = alloc_tmp( c );
1866     t = alloc_tmp( c );
1867     for( i = 0; i < 8; i++ ) {
1868         tmp[ i ] = alloc_tmp( c );
1869         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1870         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1871     }
1872     /* Inputs live below our own mark -- presumably the temporaries staged by emit_noise3(); TODO confirm offsets against invoke_subroutine(). */
1873     param0 = lookup_tmp( c, mark - 4 );
1874     param1 = lookup_tmp( c, mark - 3 );
1875     param2 = lookup_tmp( c, mark - 2 );
1876     /* NOTE(review): set outside the push/pop pairs used further down, so nothing in this function restores it -- confirm callers expect that. */
1877     brw_set_access_mode( p, BRW_ALIGN_1 );
1878
1879     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1880        be hashed.  Also compute the remainders (offsets within the unit
1881        cube), interleaved to reduce register dependency penalties. */
1882     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1883     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1884     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1885     brw_FRC( p, param0, param0 ); /* param0 becomes the x remainder */
1886     brw_FRC( p, param1, param1 ); /* param1 becomes the y remainder */
1887     brw_FRC( p, param2, param2 ); /* param2 becomes the z remainder */
1888     /* Since we now have only 16 bits of precision in the hash, we must
1889        be more careful about thorough mixing to maintain entropy as we
1890        squash the input vector into a small scalar.  (Presumably the MUL/MACs to the null register sum implicitly and only the last MAC stores the total -- confirm against the EU ISA.) */
1891     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1892     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1893     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1894              brw_imm_uw( 0x9B93 ) );
1895     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1896              brw_imm_uw( 0xBC8F ) );
1897     /* High words = low words + 0xBC8F (the multiplier applied to the first coordinate), i.e. presumably the hash seed for the x + 1 corners. */
1898     /* Temporarily disable the execution mask while we work with ExecSize=16
1899        channels (the mask is set for ExecSize=8 and is probably incorrect).
1900        Although this might cause execution of unwanted channels, the code
1901        writes only to temporary registers and has no side effects, so
1902        disabling the mask is harmless. */
1903     brw_push_insn_state( p );
1904     brw_set_mask_control( p, BRW_MASK_DISABLE );
1905     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1906     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1907     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1908     /* wtmp[ 1 ] adds the second coordinate's multiplier, wtmp[ 2 ] the third's, wtmp[ 3 ] both -- presumably the y + 1, z + 1 and (y + 1, z + 1) corners. */
1909     /* We're now ready to perform the hashing.  The eight hashes are
1910        interleaved for performance.  The hash function used is
1911        designed to rapidly achieve avalanche and require only 16x16
1912        bit multiplication, and 8-bit swizzles (which we get for
1913        free). */
1914     for( i = 0; i < 4; i++ )
1915         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1916     for( i = 0; i < 4; i++ )
1917         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1918                  odd_bytes( wtmp[ i ] ) );
1919     for( i = 0; i < 4; i++ )
1920         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1921     for( i = 0; i < 4; i++ )
1922         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1923                  odd_bytes( wtmp[ i ] ) );
1924     brw_pop_insn_state( p );
1925
1926     /* Now we want to initialise the four rear gradients based on the
1927        hashes.  Format conversion from signed integer to float leaves
1928        everything scaled too high by a factor of pow( 2, 15 ), but
1929        we correct for that right at the end. */
1930     /* x component */
1931     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1932     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1933     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1934     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1935     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1936     /* Shift every hash word left by 5 bits so that the y-component conversions below see the hashes moved along by 5 bits. */
1937     brw_push_insn_state( p );
1938     brw_set_mask_control( p, BRW_MASK_DISABLE );
1939     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1940     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1941     brw_pop_insn_state( p );
1942     /* x contribution: the x1 corners are weighted by x - 1 (in t), the x0 corners by x. */
1943     brw_MUL( p, x1y0, x1y0, t );
1944     brw_MUL( p, x1y1, x1y1, t );
1945     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = y - 1 */
1946     brw_MUL( p, x0y0, x0y0, param0 );
1947     brw_MUL( p, x0y1, x0y1, param0 );
1948
1949     /* y component */
1950     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1951     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1952     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1953     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1954     /* Shift by another 5 bits, this time ahead of the z component. */
1955     brw_push_insn_state( p );
1956     brw_set_mask_control( p, BRW_MASK_DISABLE );
1957     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1958     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1959     brw_pop_insn_state( p );
1960     /* y contribution: tmp[ 5 ] and tmp[ 7 ] (from tmp[ 1 ]) take y - 1, tmp[ 4 ] and tmp[ 6 ] take y. */
1961     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1962     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1963     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); /* t = x - 1 again; next read by the front gradients' x component */
1964     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1965     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1966
1967     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1968     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1969     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1970     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1971
1972     /* z component */
1973     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1974     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1975     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1976     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1977     /* All four rear gradients take z itself (param2) as their z weight. */
1978     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1979     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1980     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1981     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1982
1983     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1984     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1985     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1986     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1987
1988     /* We interpolate between the gradients using the polynomial
1989        6t^5 - 15t^4 + 10t^3 (Perlin), evaluated per axis as ( ( 6t - 15 )t + 10 )ttt. */
1990     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1991     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1992     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1993     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1994     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1995     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1996     brw_MUL( p, xi, xi, param0 );
1997     brw_MUL( p, yi, yi, param1 );
1998     brw_MUL( p, zi, zi, param2 );
1999     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2000     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2001     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2002     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2003     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2004     brw_MUL( p, xi, xi, param0 );
2005     brw_MUL( p, yi, yi, param1 );
2006     brw_MUL( p, zi, zi, param2 );
2007     brw_MUL( p, xi, xi, param0 );
2008     brw_MUL( p, yi, yi, param1 );
2009     brw_MUL( p, zi, zi, param2 );
2010     brw_MUL( p, xi, xi, param0 );
2011     brw_MUL( p, yi, yi, param1 );
2012     brw_MUL( p, zi, zi, param2 );
2013
2014     /* Here we interpolate in the y dimension... */
2015     brw_MUL( p, x0y1, x0y1, yi );
2016     brw_MUL( p, x1y1, x1y1, yi );
2017     brw_ADD( p, x0y0, x0y0, x0y1 );
2018     brw_ADD( p, x1y0, x1y0, x1y1 );
2019
2020     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2021     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2022     brw_MUL( p, x1y0, x1y0, xi );
2023     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2024
2025     /* Now do the same thing for the front four gradients... */
2026     /* x component */
2027     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2028     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2029     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2030     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2031     /* As for the rear gradients: the hash words of tmp[ 2 ] and tmp[ 3 ] are shifted by 5 bits between components. */
2032     brw_push_insn_state( p );
2033     brw_set_mask_control( p, BRW_MASK_DISABLE );
2034     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2035     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2036     brw_pop_insn_state( p );
2037
2038     brw_MUL( p, x1y0, x1y0, t );
2039     brw_MUL( p, x1y1, x1y1, t );
2040     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = y - 1 */
2041     brw_MUL( p, x0y0, x0y0, param0 );
2042     brw_MUL( p, x0y1, x0y1, param0 );
2043
2044     /* y component */
2045     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2046     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2047     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2048     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2049
2050     brw_push_insn_state( p );
2051     brw_set_mask_control( p, BRW_MASK_DISABLE );
2052     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2053     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2054     brw_pop_insn_state( p );
2055
2056     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2057     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2058     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); /* t = z - 1 */
2059     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2060     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2061
2062     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2063     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2064     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2065     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2066
2067     /* z component */
2068     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2069     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2070     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2071     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2072     /* The front gradients take z - 1 (now in t) as their z weight. */
2073     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2074     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2075     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2076     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2077
2078     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2079     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2080     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2081     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2082
2083     /* The interpolation coefficients are still around from last time, so
2084        again interpolate in the y dimension... */
2085     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2086     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2087     brw_MUL( p, x0y1, x0y1, yi );
2088     brw_MUL( p, x1y1, x1y1, yi );
2089     brw_ADD( p, x0y0, x0y0, x0y1 );
2090     brw_ADD( p, x1y0, x1y0, x1y1 );
2091
2092     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2093        time put the front face in tmp[ 1 ] and we're nearly there... */
2094     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2095     brw_MUL( p, x1y0, x1y0, xi );
2096     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2097
2098     /* The final interpolation, in the z dimension: */
2099     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2100     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2101     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2102
2103     /* scale by pow( 2, -15 ), as described above; the result is left in param0 */
2104     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2105
2106     release_tmps( c, mark );
2107 }
2108
2109 /** Emit NOISE3: stage the three inputs in temporaries, invoke noise3_sub, then copy the result to each write-masked dst channel. */
2110 static void emit_noise3( struct brw_wm_compile *c, const struct prog_instruction *inst )
2111 {
2112     struct brw_compile *p = &c->func;
2113     const GLuint writemask = inst->DstReg.WriteMask;
2114     const int saturate = ( inst->SaturateMode == SATURATE_ZERO_ONE );
2115     struct brw_reg coord_x, coord_y, coord_z, arg0, arg1, arg2;
2116     int chan;
2117     int base = mark_tmps( c );
2118
2119     /* mark_tmps() must report 0 here; noise3_sub looks its arguments up at fixed offsets below its own mark. */
2120     assert( base == 0 );
2121
2122     /* Fetch the three inputs (get_src_reg with last argument 0, 1 and 2) and copy them into fresh temporaries. */
2123     coord_x = get_src_reg( c, inst, 0, 0 );
2124     coord_y = get_src_reg( c, inst, 0, 1 );
2125     coord_z = get_src_reg( c, inst, 0, 2 );
2126     arg0 = alloc_tmp( c );
2127     arg1 = alloc_tmp( c );
2128     arg2 = alloc_tmp( c );
2129     brw_MOV( p, arg0, coord_x );
2130     brw_MOV( p, arg1, coord_y );
2131     brw_MOV( p, arg2, coord_z );
2132
2133     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2134
2135     /* The result is read back from arg0; saturation is switched on only around these moves. */
2136     brw_set_saturate( p, saturate );
2137     for( chan = 0; chan < 4; chan++ ) {
2138         if( !( writemask & ( 1 << chan ) ) )
2139             continue;
2140         brw_MOV( p, get_dst_reg( c, inst, chan ), arg0 );
2141     }
2142     if( saturate )
2143         brw_set_saturate( p, 0 );
2144
2145     release_tmps( c, base );
2146 }
2147
2148 /**
2149  * For the four-dimensional case, the little micro-optimisation benefits
2150  * we obtain by unrolling all the loops aren't worth the massive bloat it
2151  * now causes.  Instead, we loop twice around performing a similar operation
2152  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2153  * code to glue it all together.
2154  */
2155 static void noise4_sub( struct brw_wm_compile *c )
2156 {
2157     struct brw_compile *p = &c->func;
2158     struct brw_reg param[ 4 ],
2159         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2160         w0, /* noise for the w=0 cube */
2161         floors[ 2 ], /* integer coordinates of base corner of hypercube */
2162         interp[ 4 ], /* interpolation coefficients */
2163         t, tmp[ 8 ], /* float temporaries */
2164         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2165         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2166     int i, j;
2167     int mark = mark_tmps( c );
2168     GLuint loop, origin;
2169
2170     x0y0 = alloc_tmp( c );
2171     x0y1 = alloc_tmp( c );
2172     x1y0 = alloc_tmp( c );
2173     x1y1 = alloc_tmp( c );
2174     t = alloc_tmp( c );
2175     w0 = alloc_tmp( c );
2176     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2177     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2178
2179     for( i = 0; i < 4; i++ ) {
2180         param[ i ] = lookup_tmp( c, mark - 5 + i );
2181         interp[ i ] = alloc_tmp( c );
2182     }
2183
2184     for( i = 0; i < 8; i++ ) {
2185         tmp[ i ] = alloc_tmp( c );
2186         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2187         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2188     }
2189
2190     brw_set_access_mode( p, BRW_ALIGN_1 );
2191
2192     /* We only want 16 bits of precision from the integral part of each
2193        co-ordinate, but unfortunately the RNDD semantics would saturate
2194        at 16 bits if we performed the operation directly to a 16-bit
2195        destination.  Therefore, we round to 32-bit temporaries where
2196        appropriate, and then store only the lower 16 bits. */
2197     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2198     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2199     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2200     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2201     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2202     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2203
2204     /* Modify the flag register here, because the side effect is useful
2205        later (see below).  We know for certain that all flags will be
2206        cleared, since the FRC instruction cannot possibly generate
2207        negative results.  Even for exceptional inputs (infinities, denormals,
2208        NaNs), the architecture guarantees that the L conditional is false. */
2209     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2210     brw_FRC( p, param[ 0 ], param[ 0 ] );
2211     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2212     for( i = 1; i < 4; i++ )
2213         brw_FRC( p, param[ i ], param[ i ] );
2214
2215     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2216        of all. */
2217     for( i = 0; i < 4; i++ )
2218         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2219     for( i = 0; i < 4; i++ )
2220         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2221     for( i = 0; i < 4; i++ )
2222         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2223     for( i = 0; i < 4; i++ )
2224         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2225     for( j = 0; j < 3; j++ )
2226         for( i = 0; i < 4; i++ )
2227             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2228
2229     /* Mark the current address, as it will be a jump destination.  The
2230        following code will be executed twice: first, with the flag
2231        register clear indicating the w=0 case, and second with flags
2232        set for w=1. */
2233     loop = p->nr_insn;
2234
2235     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2236        be hashed.  Since we have only 16 bits of precision in the hash, we
2237        must be careful about thorough mixing to maintain entropy as we
2238        squash the input vector into a small scalar. */
2239     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2240              brw_imm_uw( 0xBC8F ) );
2241     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2242              brw_imm_uw( 0xD0BD ) );
2243     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2244              brw_imm_uw( 0x9B93 ) );
2245     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2246              brw_imm_uw( 0xA359 ) );
2247     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2248              brw_imm_uw( 0xBC8F ) );
2249
2250     /* Temporarily disable the execution mask while we work with ExecSize=16
2251        channels (the mask is set for ExecSize=8 and is probably incorrect).
2252        Although this might cause execution of unwanted channels, the code
2253        writes only to temporary registers and has no side effects, so
2254        disabling the mask is harmless. */
2255     brw_push_insn_state( p );
2256     brw_set_mask_control( p, BRW_MASK_DISABLE );
2257     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2258     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2259     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2260
2261     /* We're now ready to perform the hashing.  The eight hashes are
2262        interleaved for performance.  The hash function used is
2263        designed to rapidly achieve avalanche and require only 16x16
2264        bit multiplication, and 8-bit swizzles (which we get for
2265        free). */
2266     for( i = 0; i < 4; i++ )
2267         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2268     for( i = 0; i < 4; i++ )
2269         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2270                  odd_bytes( wtmp[ i ] ) );
2271     for( i = 0; i < 4; i++ )
2272         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2273     for( i = 0; i < 4; i++ )
2274         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2275                  odd_bytes( wtmp[ i ] ) );
2276     brw_pop_insn_state( p );
2277
2278     /* Now we want to initialise the four rear gradients based on the
2279        hashes.  Format conversion from signed integer to float leaves
2280        everything scaled too high by a factor of pow( 2, 15 ), but
2281        we correct for that right at the end. */
2282     /* x component */
2283     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2284     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2285     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2286     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2287     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2288
2289     brw_push_insn_state( p );
2290     brw_set_mask_control( p, BRW_MASK_DISABLE );
2291     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2292     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2293     brw_pop_insn_state( p );
2294
2295     brw_MUL( p, x1y0, x1y0, t );
2296     brw_MUL( p, x1y1, x1y1, t );
2297     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2298     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2299     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2300
2301     /* y component */
2302     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2303     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2304     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2305     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2306
2307     brw_push_insn_state( p );
2308     brw_set_mask_control( p, BRW_MASK_DISABLE );
2309     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2310     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2311     brw_pop_insn_state( p );
2312
2313     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2314     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2315     /* prepare t for the w component (used below): w the first time through
2316        the loop; w - 1 the second time) */
2317     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2318     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2319     p->current->header.predicate_inverse = 1;
2320     brw_MOV( p, t, param[ 3 ] );
2321     p->current->header.predicate_inverse = 0;
2322     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2323     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2324     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2325
2326     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2327     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2328     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2329     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2330
2331     /* z component */
2332     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2333     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2334     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2335     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2336
2337     brw_push_insn_state( p );
2338     brw_set_mask_control( p, BRW_MASK_DISABLE );
2339     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2340     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2341     brw_pop_insn_state( p );
2342
2343     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2344     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2345     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2346     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2347
2348     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2349     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2350     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2351     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2352
2353     /* w component */
2354     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2355     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2356     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2357     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2358
2359     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2360     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2361     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2362     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2363     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2364
2365     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2366     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2367     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2368     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2369
2370     /* Here we interpolate in the y dimension... */
2371     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2372     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2373     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2374     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2375     brw_ADD( p, x0y0, x0y0, x0y1 );
2376     brw_ADD( p, x1y0, x1y0, x1y1 );
2377
2378     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2379     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2380     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2381     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2382
2383     /* Now do the same thing for the front four gradients... */
2384     /* x component */
2385     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2386     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2387     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2388     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2389
2390     brw_push_insn_state( p );
2391     brw_set_mask_control( p, BRW_MASK_DISABLE );
2392     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2393     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2394     brw_pop_insn_state( p );
2395
2396     brw_MUL( p, x1y0, x1y0, t );
2397     brw_MUL( p, x1y1, x1y1, t );
2398     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2399     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2400     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2401
2402     /* y component */
2403     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2404     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2405     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2406     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2407
2408     brw_push_insn_state( p );
2409     brw_set_mask_control( p, BRW_MASK_DISABLE );
2410     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2411     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2412     brw_pop_insn_state( p );
2413
2414     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2415     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2416     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2417     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2418     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2419
2420     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2421     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2422     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2423     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2424
2425     /* z component */
2426     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2427     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2428     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2429     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2430
2431     brw_push_insn_state( p );
2432     brw_set_mask_control( p, BRW_MASK_DISABLE );
2433     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2434     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2435     brw_pop_insn_state( p );
2436
2437     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2438     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2439     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2440     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2441     /* prepare t for the w component (used below): w the first time through
2442        the loop; w - 1 the second time) */
2443     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2444     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2445     p->current->header.predicate_inverse = 1;
2446     brw_MOV( p, t, param[ 3 ] );
2447     p->current->header.predicate_inverse = 0;
2448     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2449
2450     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2451     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2452     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2453     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2454
2455     /* w component */
2456     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2457     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2458     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2459     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2460
2461     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2462     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2463     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2464     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2465
2466     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2467     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2468     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2469     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2470
2471     /* Interpolate in the y dimension: */
2472     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2473     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2474     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2475     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2476     brw_ADD( p, x0y0, x0y0, x0y1 );
2477     brw_ADD( p, x1y0, x1y0, x1y1 );
2478
2479     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2480        time put the front face in tmp[ 1 ] and we're nearly there... */
2481     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2482     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2483     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2484
2485     /* Another interpolation, in the z dimension: */
2486     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2487     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2488     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2489
2490     /* Exit the loop if we've computed both cubes... */
2491     origin = p->nr_insn;
2492     brw_push_insn_state( p );
2493     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2494     brw_set_mask_control( p, BRW_MASK_DISABLE );
2495     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2496     brw_pop_insn_state( p );
2497
2498     /* Save the result for the w=0 case, and increment the w coordinate: */
2499     brw_MOV( p, w0, tmp[ 0 ] );
2500     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2501              brw_imm_uw( 1 ) );
2502
2503     /* Loop around for the other cube.  Explicitly set the flag register
2504        (unfortunately we must spend an extra instruction to do this: we
2505        can't rely on a side effect of the previous MOV or ADD because
2506        conditional modifiers which are normally true might be false in
2507        exceptional circumstances, e.g. given a NaN input; the add to
2508        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2509     brw_push_insn_state( p );
2510     brw_set_mask_control( p, BRW_MASK_DISABLE );
2511     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2512     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2513              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2514     brw_pop_insn_state( p );
2515
2516     /* Patch the previous conditional branch now that we know the
2517        destination address. */
2518     brw_set_src1( p->store + origin,
2519                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2520
2521     /* The very last interpolation. */
2522     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2523     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2524     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2525
2526     /* scale by pow( 2, -15 ), as described above */
2527     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2528
2529     release_tmps( c, mark );
2530 }
2531
2532 static void emit_noise4( struct brw_wm_compile *c,
2533                          const struct prog_instruction *inst )
2534 {
2535     struct brw_compile *p = &c->func;
2536     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2537     GLuint mask = inst->DstReg.WriteMask;
2538     int i;
2539     int mark = mark_tmps( c );
2540
2541     assert( mark == 0 );
2542
2543     src0 = get_src_reg( c, inst, 0, 0 );
2544     src1 = get_src_reg( c, inst, 0, 1 );
2545     src2 = get_src_reg( c, inst, 0, 2 );
2546     src3 = get_src_reg( c, inst, 0, 3 );
2547
2548     param0 = alloc_tmp( c );
2549     param1 = alloc_tmp( c );
2550     param2 = alloc_tmp( c );
2551     param3 = alloc_tmp( c );
2552
2553     brw_MOV( p, param0, src0 );
2554     brw_MOV( p, param1, src1 );
2555     brw_MOV( p, param2, src2 );
2556     brw_MOV( p, param3, src3 );
2557
2558     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2559
2560     /* Fill in the result: */
2561     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2562     for (i = 0 ; i < 4; i++) {
2563         if (mask & (1<<i)) {
2564             dst = get_dst_reg(c, inst, i);
2565             brw_MOV( p, dst, param0 );
2566         }
2567     }
2568     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2569         brw_set_saturate( p, 0 );
2570
2571     release_tmps( c, mark );
2572 }
2573
2574 static void emit_wpos_xy(struct brw_wm_compile *c,
2575                          const struct prog_instruction *inst)
2576 {
2577     struct brw_compile *p = &c->func;
2578     GLuint mask = inst->DstReg.WriteMask;
2579     struct brw_reg src0[2], dst[2];
2580
2581     dst[0] = get_dst_reg(c, inst, 0);
2582     dst[1] = get_dst_reg(c, inst, 1);
2583
2584     src0[0] = get_src_reg(c, inst, 0, 0);
2585     src0[1] = get_src_reg(c, inst, 0, 1);
2586
2587     /* Calculate the pixel offset from window bottom left into destination
2588      * X and Y channels.
2589      */
2590     if (mask & WRITEMASK_X) {
2591         /* X' = X - origin_x */
2592         brw_ADD(p,
2593                 dst[0],
2594                 retype(src0[0], BRW_REGISTER_TYPE_W),
2595                 brw_imm_d(0 - c->key.origin_x));
2596     }
2597
2598     if (mask & WRITEMASK_Y) {
2599         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2600         brw_ADD(p,
2601                 dst[1],
2602                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2603                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2604     }
2605 }
2606
2607 /* TODO
2608    BIAS on SIMD8 not working yet...
2609  */
2610 static void emit_txb(struct brw_wm_compile *c,
2611                      const struct prog_instruction *inst)
2612 {
2613     struct brw_compile *p = &c->func;
2614     struct brw_reg dst[4], src[4], payload_reg;
2615     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2616     GLuint i;
2617     GLuint msg_type;
2618
2619     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2620
2621     for (i = 0; i < 4; i++)
2622         dst[i] = get_dst_reg(c, inst, i);
2623     for (i = 0; i < 4; i++)
2624         src[i] = get_src_reg(c, inst, 0, i);
2625
2626     switch (inst->TexSrcTarget) {
2627         case TEXTURE_1D_INDEX:
2628             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2629             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2630             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2631             break;
2632         case TEXTURE_2D_INDEX:
2633         case TEXTURE_RECT_INDEX:
2634             brw_MOV(p, brw_message_reg(2), src[0]);
2635             brw_MOV(p, brw_message_reg(3), src[1]);
2636             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2637             break;
2638         default:
2639             brw_MOV(p, brw_message_reg(2), src[0]);
2640             brw_MOV(p, brw_message_reg(3), src[1]);
2641             brw_MOV(p, brw_message_reg(4), src[2]);
2642             break;
2643     }
2644     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2645     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2646
2647     if (BRW_IS_IGDNG(p->brw)) {
2648         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2649     } else {
2650         /* Does it work well on SIMD8? */
2651         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2652     }
2653
2654     brw_SAMPLE(p,
2655                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2656                1,                                           /* msg_reg_nr */
2657                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2658                SURF_INDEX_TEXTURE(unit),
2659                unit,                                        /* sampler */
2660                inst->DstReg.WriteMask,                      /* writemask */
2661                msg_type,                                    /* msg_type */
2662                4,                                           /* response_length */
2663                4,                                           /* msg_length */
2664                0,                                           /* eot */
2665                1,
2666                BRW_SAMPLER_SIMD_MODE_SIMD8);
2667 }
2668
2669
2670 static void emit_tex(struct brw_wm_compile *c,
2671                      const struct prog_instruction *inst)
2672 {
2673     struct brw_compile *p = &c->func;
2674     struct brw_reg dst[4], src[4], payload_reg;
2675     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2676     GLuint msg_len;
2677     GLuint i, nr;
2678     GLuint emit;
2679     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2680     GLuint msg_type;
2681
2682     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2683
2684     for (i = 0; i < 4; i++)
2685         dst[i] = get_dst_reg(c, inst, i);
2686     for (i = 0; i < 4; i++)
2687         src[i] = get_src_reg(c, inst, 0, i);
2688
2689     switch (inst->TexSrcTarget) {
2690         case TEXTURE_1D_INDEX:
2691             emit = WRITEMASK_X;
2692             nr = 1;
2693             break;
2694         case TEXTURE_2D_INDEX:
2695         case TEXTURE_RECT_INDEX:
2696             emit = WRITEMASK_XY;
2697             nr = 2;
2698             break;
2699         default:
2700             emit = WRITEMASK_XYZ;
2701             nr = 3;
2702             break;
2703     }
2704     msg_len = 1;
2705
2706     /* move/load S, T, R coords */
2707     for (i = 0; i < nr; i++) {
2708         static const GLuint swz[4] = {0,1,2,2};
2709         if (emit & (1<<i))
2710             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2711         else
2712             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2713         msg_len += 1;
2714     }
2715
2716     if (shadow) {
2717        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2718        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2719     }
2720
2721     if (BRW_IS_IGDNG(p->brw)) {
2722         if (shadow)
2723             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2724         else
2725             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2726     } else {
2727         /* Does it work for shadow on SIMD8 ? */
2728         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2729     }
2730
2731     brw_SAMPLE(p,
2732                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2733                1,                                          /* msg_reg_nr */
2734                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2735                SURF_INDEX_TEXTURE(unit),
2736                unit,                                       /* sampler */
2737                inst->DstReg.WriteMask,                     /* writemask */
2738                msg_type,                                   /* msg_type */
2739                4,                                          /* response_length */
2740                shadow ? 6 : 4,                             /* msg_length */
2741                0,                                          /* eot */
2742                1,
2743                BRW_SAMPLER_SIMD_MODE_SIMD8);
2744
2745     if (shadow)
2746         brw_MOV(p, dst[3], brw_imm_f(1.0));
2747 }
2748
2749
2750 /**
2751  * Resolve subroutine calls after code emit is done.
2752  */
2753 static void post_wm_emit( struct brw_wm_compile *c )
2754 {
2755     brw_resolve_cals(&c->func);
2756 }
2757
2758 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2759 {
2760 #define MAX_IF_DEPTH 32
2761 #define MAX_LOOP_DEPTH 32
2762     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2763     GLuint i, if_depth = 0, loop_depth = 0;
2764     struct brw_compile *p = &c->func;
2765     struct brw_indirect stack_index = brw_indirect(0, 0);
2766
2767     c->out_of_regs = GL_FALSE;
2768
2769     prealloc_reg(c);
2770     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2771     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2772
2773     for (i = 0; i < c->nr_fp_insns; i++) {
2774         const struct prog_instruction *inst = &c->prog_instructions[i];
2775
2776         c->cur_inst = i;
2777
2778 #if 0
2779         _mesa_printf("Inst %d: ", i);
2780         _mesa_print_instruction(inst);
2781 #endif
2782
2783         /* fetch any constants that this instruction needs */
2784         if (c->fp->use_const_buffer)
2785            fetch_constants(c, inst);
2786
2787         if (inst->CondUpdate)
2788             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2789         else
2790             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2791
2792         switch (inst->Opcode) {
2793             case WM_PIXELXY:
2794                 emit_pixel_xy(c, inst);
2795                 break;
2796             case WM_DELTAXY:
2797                 emit_delta_xy(c, inst);
2798                 break;
2799             case WM_PIXELW:
2800                 emit_pixel_w(c, inst);
2801                 break;
2802             case WM_LINTERP:
2803                 emit_linterp(c, inst);
2804                 break;
2805             case WM_PINTERP:
2806                 emit_pinterp(c, inst);
2807                 break;
2808             case WM_CINTERP:
2809                 emit_cinterp(c, inst);
2810                 break;
2811             case WM_WPOSXY:
2812                 emit_wpos_xy(c, inst);
2813                 break;
2814             case WM_FB_WRITE:
2815                 emit_fb_write(c, inst);
2816                 break;
2817             case WM_FRONTFACING:
2818                 emit_frontfacing(c, inst);
2819                 break;
2820             case OPCODE_ADD:
2821                 emit_add(c, inst);
2822                 break;
2823             case OPCODE_ARL:
2824                 emit_arl(c, inst);
2825                 break;
2826             case OPCODE_FRC:
2827                 emit_frc(c, inst);
2828                 break;
2829             case OPCODE_FLR:
2830                 emit_flr(c, inst);
2831                 break;
2832             case OPCODE_LRP:
2833                 emit_lrp(c, inst);
2834                 break;
2835             case OPCODE_TRUNC:
2836                 emit_trunc(c, inst);
2837                 break;
2838             case OPCODE_MOV:
2839             case OPCODE_SWZ:
2840                 emit_mov(c, inst);
2841                 break;
2842             case OPCODE_DP3:
2843                 emit_dp3(c, inst);
2844                 break;
2845             case OPCODE_DP4:
2846                 emit_dp4(c, inst);
2847                 break;
2848             case OPCODE_XPD:
2849                 emit_xpd(c, inst);
2850                 break;
2851             case OPCODE_DPH:
2852                 emit_dph(c, inst);
2853                 break;
2854             case OPCODE_RCP:
2855                 emit_rcp(c, inst);
2856                 break;
2857             case OPCODE_RSQ:
2858                 emit_rsq(c, inst);
2859                 break;
2860             case OPCODE_SIN:
2861                 emit_sin(c, inst);
2862                 break;
2863             case OPCODE_COS:
2864                 emit_cos(c, inst);
2865                 break;
2866             case OPCODE_EX2:
2867                 emit_ex2(c, inst);
2868                 break;
2869             case OPCODE_LG2:
2870                 emit_lg2(c, inst);
2871                 break;
2872             case OPCODE_MIN:
2873             case OPCODE_MAX:
2874                 emit_min_max(c, inst);
2875                 break;
2876             case OPCODE_DDX:
2877                 emit_ddx(c, inst);
2878                 break;
2879             case OPCODE_DDY:
2880                 emit_ddy(c, inst);
2881                 break;
2882             case OPCODE_SLT:
2883                 emit_slt(c, inst);
2884                 break;
2885             case OPCODE_SLE:
2886                 emit_sle(c, inst);
2887                 break;
2888             case OPCODE_SGT:
2889                 emit_sgt(c, inst);
2890                 break;
2891             case OPCODE_SGE:
2892                 emit_sge(c, inst);
2893                 break;
2894             case OPCODE_SEQ:
2895                 emit_seq(c, inst);
2896                 break;
2897             case OPCODE_SNE:
2898                 emit_sne(c, inst);
2899                 break;
2900             case OPCODE_MUL:
2901                 emit_mul(c, inst);
2902                 break;
2903             case OPCODE_POW:
2904                 emit_pow(c, inst);
2905                 break;
2906             case OPCODE_MAD:
2907                 emit_mad(c, inst);
2908                 break;
2909             case OPCODE_NOISE1:
2910                 emit_noise1(c, inst);
2911                 break;
2912             case OPCODE_NOISE2:
2913                 emit_noise2(c, inst);
2914                 break;
2915             case OPCODE_NOISE3:
2916                 emit_noise3(c, inst);
2917                 break;
2918             case OPCODE_NOISE4:
2919                 emit_noise4(c, inst);
2920                 break;
2921             case OPCODE_TEX:
2922                 emit_tex(c, inst);
2923                 break;
2924             case OPCODE_TXB:
2925                 emit_txb(c, inst);
2926                 break;
2927             case OPCODE_KIL_NV:
2928                 emit_kil(c);
2929                 break;
2930             case OPCODE_IF:
2931                 assert(if_depth < MAX_IF_DEPTH);
2932                 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2933                 break;
2934             case OPCODE_ELSE:
2935                 if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
2936                 break;
2937             case OPCODE_ENDIF:
2938                 assert(if_depth > 0);
2939                 brw_ENDIF(p, if_inst[--if_depth]);
2940                 break;
2941             case OPCODE_BGNSUB:
2942                 brw_save_label(p, inst->Comment, p->nr_insn);
2943                 break;
2944             case OPCODE_ENDSUB:
2945                 /* no-op */
2946                 break;
2947             case OPCODE_CAL:
2948                 brw_push_insn_state(p);
2949                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2950                 brw_set_access_mode(p, BRW_ALIGN_1);
2951                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2952                 brw_set_access_mode(p, BRW_ALIGN_16);
2953                 brw_ADD(p, get_addr_reg(stack_index),
2954                          get_addr_reg(stack_index), brw_imm_d(4));
2955                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2956                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2957                 brw_pop_insn_state(p);
2958                 break;
2959
2960             case OPCODE_RET:
2961                 brw_push_insn_state(p);
2962                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2963                 brw_ADD(p, get_addr_reg(stack_index),
2964                         get_addr_reg(stack_index), brw_imm_d(-4));
2965                 brw_set_access_mode(p, BRW_ALIGN_1);
2966                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2967                 brw_set_access_mode(p, BRW_ALIGN_16);
2968                 brw_pop_insn_state(p);
2969
2970                 break;
2971             case OPCODE_BGNLOOP:
2972                 /* XXX may need to invalidate the current_constant regs */
2973                 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2974                 break;
2975             case OPCODE_BRK:
2976                 brw_BREAK(p);
2977                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2978                 break;
2979             case OPCODE_CONT:
2980                 brw_CONT(p);
2981                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2982                 break;
2983             case OPCODE_ENDLOOP:
2984                {
2985                   struct brw_instruction *inst0, *inst1;
2986                   GLuint br = 1;
2987
2988                   if (BRW_IS_IGDNG(brw))
2989                      br = 2;
2990
2991                   loop_depth--;
2992                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2993                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
2994                   while (inst0 > loop_inst[loop_depth]) {
2995                      inst0--;
2996                      if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2997                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2998                         inst0->bits3.if_else.pop_count = 0;
2999                      }
3000                      else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3001                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3002                         inst0->bits3.if_else.pop_count = 0;
3003                      }
3004                   }
3005                }
3006                break;
3007             default:
3008                 _mesa_printf("unsupported IR in fragment shader %d\n",
3009                         inst->Opcode);
3010         }
3011
3012         if (inst->CondUpdate)
3013             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3014         else
3015             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3016     }
3017     post_wm_emit(c);
3018
3019     if (INTEL_DEBUG & DEBUG_WM) {
3020       _mesa_printf("wm-native:\n");
3021       for (i = 0; i < p->nr_insn; i++)
3022          brw_disasm(stderr, &p->store[i]);
3023       _mesa_printf("\n");
3024     }
3025 }
3026
3027 /**
3028  * Do GPU code generation for shaders that use GLSL features such as
3029  * flow control.  Other shaders will be compiled with the
3030  */
3031 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3032 {
3033     if (INTEL_DEBUG & DEBUG_WM) {
3034         _mesa_printf("brw_wm_glsl_emit:\n");
3035     }
3036
3037     /* initial instruction translation/simplification */
3038     brw_wm_pass_fp(c);
3039
3040     /* actual code generation */
3041     brw_wm_emit_glsl(brw, c);
3042
3043     if (INTEL_DEBUG & DEBUG_WM) {
3044         brw_wm_print_program(c, "brw_wm_glsl_emit done");
3045     }
3046
3047     c->prog_data.total_grf = num_grf_used(c);
3048     c->prog_data.total_scratch = 0;
3049 }