src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13
  14 /**
  15  * Determine if the given fragment program uses GLSL features such
  16  * as flow conditionals, loops, subroutines.
  17  * Some GLSL shaders may use these features, others might not.
  18  */
  19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  20 {
  21     int i;
  22     for (i = 0; i < fp->Base.NumInstructions; i++) {
  23         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  24         switch (inst->Opcode) {
  25             case OPCODE_IF:
  26             case OPCODE_TRUNC:
  27             case OPCODE_ENDIF:
  28             case OPCODE_CAL:
  29             case OPCODE_BRK:
  30             case OPCODE_RET:
  31             case OPCODE_DDX:
  32             case OPCODE_DDY:
  33             case OPCODE_NOISE1:
  34             case OPCODE_NOISE2:
  35             case OPCODE_NOISE3:
  36             case OPCODE_NOISE4:
  37             case OPCODE_BGNLOOP:
  38                 return GL_TRUE;
  39             default:
  40                 break;
  41         }
  42     }
  43     return GL_FALSE;
  44 }
  45
  46
  47
  48 static void
  49 reclaim_temps(struct brw_wm_compile *c);
  50
  51
  52 /** Mark GRF register as used. */
  53 static void
  54 prealloc_grf(struct brw_wm_compile *c, int r)
  55 {
  56    c->used_grf[r] = GL_TRUE;
  57 }
  58
  59
  60 /** Mark given GRF register as not in use. */
  61 static void
  62 release_grf(struct brw_wm_compile *c, int r)
  63 {
  64    /*assert(c->used_grf[r]);*/
  65    c->used_grf[r] = GL_FALSE;
  66    c->first_free_grf = MIN2(c->first_free_grf, r);
  67 }
  68
  69
  70 /** Return index of a free GRF, mark it as used. */
  71 static int
  72 alloc_grf(struct brw_wm_compile *c)
  73 {
  74    GLuint r;
  75    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  76       if (!c->used_grf[r]) {
  77          c->used_grf[r] = GL_TRUE;
  78          c->first_free_grf = r + 1;  /* a guess */
  79          return r;
  80       }
  81    }
  82
  83    /* no free temps, try to reclaim some */
  84    reclaim_temps(c);
  85    c->first_free_grf = 0;
  86
  87    /* try alloc again */
  88    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  89       if (!c->used_grf[r]) {
  90          c->used_grf[r] = GL_TRUE;
  91          c->first_free_grf = r + 1;  /* a guess */
  92          return r;
  93       }
  94    }
  95
  96    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  97       assert(c->used_grf[r]);
  98    }
  99
 100    /* really, no free GRF regs found */
 101    if (!c->out_of_regs) {
 102       /* print warning once per compilation */
 103       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 104       c->out_of_regs = GL_TRUE;
 105    }
 106
 107    return -1;
 108 }
 109
 110
 111 /** Return number of GRF registers used */
 112 static int
 113 num_grf_used(const struct brw_wm_compile *c)
 114 {
 115    int r;
 116    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 117       if (c->used_grf[r])
 118          return r + 1;
 119    return 0;
 120 }
 121
 122
 123
 124 /**
 125  * Record the mapping of a Mesa register to a hardware register.
 126  */
 127 static void set_reg(struct brw_wm_compile *c, int file, int index,
 128         int component, struct brw_reg reg)
 129 {
 130     c->wm_regs[file][index][component].reg = reg;
 131     c->wm_regs[file][index][component].inited = GL_TRUE;
 132 }
 133
 134 /**
 135  * Examine instruction's write mask to find index of first component
 136  * enabled for writing.
 137  */
 138 static int get_scalar_dst_index(const struct prog_instruction *inst)
 139 {
 140     int i;
 141     for (i = 0; i < 4; i++)
 142         if (inst->DstReg.WriteMask & (1<<i))
 143             break;
 144     return i;
 145 }
 146
 147 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 148 {
 149     struct brw_reg reg;
 150
 151     /* if we need to allocate another temp, grow the tmp_regs[] array */
 152     if (c->tmp_index == c->tmp_max) {
 153        int r = alloc_grf(c);
 154        if (r < 0) {
 155           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 156           r = 50; /* XXX random register! */
 157        }
 158        c->tmp_regs[ c->tmp_max++ ] = r;
 159     }
 160
 161     /* form the GRF register */
 162     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 163     /*printf("alloc_temp %d\n", reg.nr);*/
 164     assert(reg.nr < BRW_WM_MAX_GRF);
 165     return reg;
 166
 167 }
 168
 169 /**
 170  * Save current temp register info.
 171  * There must be a matching call to release_tmps().
 172  */
 173 static int mark_tmps(struct brw_wm_compile *c)
 174 {
 175     return c->tmp_index;
 176 }
 177
 178 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 179 {
 180     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 181 }
 182
 183 static void release_tmps(struct brw_wm_compile *c, int mark)
 184 {
 185     c->tmp_index = mark;
 186 }
 187
 188 /**
 189  * Convert Mesa src register to brw register.
 190  *
 191  * Since we're running in SOA mode each Mesa register corresponds to four
 192  * hardware registers.  We allocate the hardware registers as needed here.
 193  *
 194  * \param file  register file, one of PROGRAM_x
 195  * \param index  register number
 196  * \param component  src component (X=0, Y=1, Z=2, W=3)
 197  * \param nr  not used?!?
 198  * \param neg  negate value?
 199  * \param abs  take absolute value?
 200  */
 201 static struct brw_reg
 202 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 203         int nr, GLuint neg, GLuint abs)
 204 {
 205     struct brw_reg reg;
 206     switch (file) {
 207         case PROGRAM_STATE_VAR:
 208         case PROGRAM_CONSTANT:
 209         case PROGRAM_UNIFORM:
 210             file = PROGRAM_STATE_VAR;
 211             break;
 212         case PROGRAM_UNDEFINED:
 213             return brw_null_reg();
 214         case PROGRAM_TEMPORARY:
 215         case PROGRAM_INPUT:
 216         case PROGRAM_OUTPUT:
 217         case PROGRAM_PAYLOAD:
 218             break;
 219         default:
 220             _mesa_problem(NULL, "Unexpected file in get_reg()");
 221             return brw_null_reg();
 222     }
 223
 224     assert(index < 256);
 225     assert(component < 4);
 226
 227     /* see if we've already allocated a HW register for this Mesa register */
 228     if (c->wm_regs[file][index][component].inited) {
 229        /* yes, re-use */
 230        reg = c->wm_regs[file][index][component].reg;
 231     }
 232     else {
 233         /* no, allocate new register */
 234        int grf = alloc_grf(c);
 235        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 236        if (grf < 0) {
 237           /* totally out of temps */
 238           grf = 51; /* XXX random register! */
 239        }
 240
 241        reg = brw_vec8_grf(grf, 0);
 242        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 243
 244        set_reg(c, file, index, component, reg);
 245     }
 246
 247     if (neg & (1 << component)) {
 248         reg = negate(reg);
 249     }
 250     if (abs)
 251         reg = brw_abs(reg);
 252     return reg;
 253 }
 254
 255
 256
 257 /**
 258  * This is called if we run out of GRF registers.  Examine the live intervals
 259  * of temp regs in the program and free those which won't be used again.
 260  */
 261 static void
 262 reclaim_temps(struct brw_wm_compile *c)
 263 {
 264    GLint intBegin[MAX_PROGRAM_TEMPS];
 265    GLint intEnd[MAX_PROGRAM_TEMPS];
 266    int index;
 267
 268    /*printf("Reclaim temps:\n");*/
 269
 270    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 271                              intBegin, intEnd);
 272
 273    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 274       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 275          /* program temp[i] can be freed */
 276          int component;
 277          /*printf("  temp[%d] is dead\n", index);*/
 278          for (component = 0; component < 4; component++) {
 279             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 280                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 281                release_grf(c, r);
 282                /*
 283                printf("  Reclaim temp %d, reg %d at inst %d\n",
 284                       index, r, c->cur_inst);
 285                */
 286                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 287             }
 288          }
 289       }
 290    }
 291 }
 292
 293
 294
 295
 296 /**
 297  * Preallocate registers.  This sets up the Mesa to hardware register
 298  * mapping for certain registers, such as constants (uniforms/state vars)
 299  * and shader inputs.
 300  */
 301 static void prealloc_reg(struct brw_wm_compile *c)
 302 {
 303     int i, j;
 304     struct brw_reg reg;
 305     int urb_read_length = 0;
 306     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 307     GLuint reg_index = 0;
 308
 309     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 310     c->first_free_grf = 0;
 311
 312     for (i = 0; i < 4; i++) {
 313         if (i < c->key.nr_depth_regs)
 314             reg = brw_vec8_grf(i * 2, 0);
 315         else
 316             reg = brw_vec8_grf(0, 0);
 317         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 318     }
 319     reg_index += 2 * c->key.nr_depth_regs;
 320
 321     /* constants */
 322     {
 323         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 324         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 325
 326         /* use a real constant buffer, or just use a section of the GRF? */
 327         /* XXX this heuristic may need adjustment... */
 328         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 329            c->fp->use_const_buffer = GL_TRUE;
 330         else
 331            c->fp->use_const_buffer = GL_FALSE;
 332         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 333
 334         if (c->fp->use_const_buffer) {
 335            /* We'll use a real constant buffer and fetch constants from
 336             * it with a dataport read message.
 337             */
 338
 339            /* number of float constants in CURBE */
 340            c->prog_data.nr_params = 0;
 341         }
 342         else {
 343            const struct gl_program_parameter_list *plist =
 344               c->fp->program.Base.Parameters;
 345            int index = 0;
 346
 347            /* number of float constants in CURBE */
 348            c->prog_data.nr_params = 4 * nr_params;
 349
 350            /* loop over program constants (float[4]) */
 351            for (i = 0; i < nr_params; i++) {
 352               /* loop over XYZW channels */
 353               for (j = 0; j < 4; j++, index++) {
 354                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 355                  /* Save pointer to parameter/constant value.
 356                   * Constants will be copied in prepare_constant_buffer()
 357                   */
 358                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 359                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 360               }
 361            }
 362            /* number of constant regs used (each reg is float[8]) */
 363            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 364            reg_index += c->nr_creg;
 365         }
 366     }
 367
 368     /* fragment shader inputs */
 369     for (i = 0; i < VERT_RESULT_MAX; i++) {
 370        int fp_input;
 371
 372        if (i >= VERT_RESULT_VAR0)
 373           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 374        else if (i <= VERT_RESULT_TEX7)
 375           fp_input = i;
 376        else
 377           fp_input = -1;
 378
 379        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 380           urb_read_length = reg_index;
 381           reg = brw_vec8_grf(reg_index, 0);
 382           for (j = 0; j < 4; j++)
 383              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 384        }
 385        if (c->key.vp_outputs_written & (1 << i)) {
 386           reg_index += 2;
 387        }
 388     }
 389
 390     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 391     c->prog_data.urb_read_length = urb_read_length;
 392     c->prog_data.curb_read_length = c->nr_creg;
 393     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 394     reg_index++;
 395     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 396     reg_index += 2;
 397
 398     /* mark GRF regs [0..reg_index-1] as in-use */
 399     for (i = 0; i < reg_index; i++)
 400        prealloc_grf(c, i);
 401
 402     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 403     prealloc_grf(c, 126);
 404     prealloc_grf(c, 127);
 405
 406     /* An instruction may reference up to three constants.
 407      * They'll be found in these registers.
 408      * XXX alloc these on demand!
 409      */
 410     if (c->fp->use_const_buffer) {
 411        for (i = 0; i < 3; i++) {
 412           c->current_const[i].index = -1;
 413           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 414        }
 415     }
 416 #if 0
 417     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 418     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 419 #endif
 420 }
 421
 422
 423 /**
 424  * Check if any of the instruction's src registers are constants, uniforms,
 425  * or statevars.  If so, fetch any constants that we don't already have in
 426  * the three GRF slots.
 427  */
 428 static void fetch_constants(struct brw_wm_compile *c,
 429                             const struct prog_instruction *inst)
 430 {
 431    struct brw_compile *p = &c->func;
 432    GLuint i;
 433
 434    /* loop over instruction src regs */
 435    for (i = 0; i < 3; i++) {
 436       const struct prog_src_register *src = &inst->SrcReg[i];
 437       if (src->File == PROGRAM_STATE_VAR ||
 438           src->File == PROGRAM_CONSTANT ||
 439           src->File == PROGRAM_UNIFORM) {
 440          c->current_const[i].index = src->Index;
 441
 442 #if 0
 443          printf("  fetch const[%d] for arg %d into reg %d\n",
 444                 src->Index, i, c->current_const[i].reg.nr);
 445 #endif
 446
 447          /* need to fetch the constant now */
 448          brw_dp_READ_4(p,
 449                        c->current_const[i].reg,  /* writeback dest */
 450                        1,                        /* msg_reg */
 451                        src->RelAddr,             /* relative indexing? */
 452                        16 * src->Index,          /* byte offset */
 453                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 454                        );
 455       }
 456    }
 457 }
 458
 459
 460 /**
 461  * Convert Mesa dst register to brw register.
 462  */
 463 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 464                                   const struct prog_instruction *inst,
 465                                   GLuint component)
 466 {
 467     const int nr = 1;
 468     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 469             0, 0);
 470 }
 471
 472
 473 static struct brw_reg
 474 get_src_reg_const(struct brw_wm_compile *c,
 475                   const struct prog_instruction *inst,
 476                   GLuint srcRegIndex, GLuint component)
 477 {
 478    /* We should have already fetched the constant from the constant
 479     * buffer in fetch_constants().  Now we just have to return a
 480     * register description that extracts the needed component and
 481     * smears it across all eight vector components.
 482     */
 483    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 484    struct brw_reg const_reg;
 485
 486    assert(component < 4);
 487    assert(srcRegIndex < 3);
 488    assert(c->current_const[srcRegIndex].index != -1);
 489    const_reg = c->current_const[srcRegIndex].reg;
 490
 491    /* extract desired float from the const_reg, and smear */
 492    const_reg = stride(const_reg, 0, 1, 0);
 493    const_reg.subnr = component * 4;
 494
 495    if (src->Negate & (1 << component))
 496       const_reg = negate(const_reg);
 497    if (src->Abs)
 498       const_reg = brw_abs(const_reg);
 499
 500 #if 0
 501    printf("  form const[%d].%d for arg %d, reg %d\n",
 502           c->current_const[srcRegIndex].index,
 503           component,
 504           srcRegIndex,
 505           const_reg.nr);
 506 #endif
 507
 508    return const_reg;
 509 }
 510
 511
 512 /**
 513  * Convert Mesa src register to brw register.
 514  */
 515 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 516                                   const struct prog_instruction *inst,
 517                                   GLuint srcRegIndex, GLuint channel)
 518 {
 519     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 520     const GLuint nr = 1;
 521     const GLuint component = GET_SWZ(src->Swizzle, channel);
 522
 523     /* Extended swizzle terms */
 524     if (component == SWIZZLE_ZERO) {
 525        return brw_imm_f(0.0F);
 526     }
 527     else if (component == SWIZZLE_ONE) {
 528        return brw_imm_f(1.0F);
 529     }
 530
 531     if (c->fp->use_const_buffer &&
 532         (src->File == PROGRAM_STATE_VAR ||
 533          src->File == PROGRAM_CONSTANT ||
 534          src->File == PROGRAM_UNIFORM)) {
 535        return get_src_reg_const(c, inst, srcRegIndex, component);
 536     }
 537     else {
 538        /* other type of source register */
 539        return get_reg(c, src->File, src->Index, component, nr,
 540                       src->Negate, src->Abs);
 541     }
 542 }
 543
 544
 545 /**
 546  * Same as \sa get_src_reg() but if the register is a literal, emit
 547  * a brw_reg encoding the literal.
 548  * Note that a brw instruction only allows one src operand to be a literal.
 549  * For instructions with more than one operand, only the second can be a
 550  * literal.  This means that we treat some literals as constants/uniforms
 551  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 552  *
 553  */
 554 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 555                                       const struct prog_instruction *inst,
 556                                       GLuint srcRegIndex, GLuint channel)
 557 {
 558     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 559     if (src->File == PROGRAM_CONSTANT) {
 560        /* a literal */
 561        const int component = GET_SWZ(src->Swizzle, channel);
 562        const GLfloat *param =
 563           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 564        GLfloat value = param[component];
 565        if (src->Negate & (1 << channel))
 566           value = -value;
 567        if (src->Abs)
 568           value = FABSF(value);
 569 #if 0
 570        printf("  form immed value %f for chan %d\n", value, channel);
 571 #endif
 572        return brw_imm_f(value);
 573     }
 574     else {
 575        return get_src_reg(c, inst, srcRegIndex, channel);
 576     }
 577 }
 578
 579
 580 /**
 581  * Subroutines are minimal support for resusable instruction sequences.
 582  * They are implemented as simply as possible to minimise overhead: there
 583  * is no explicit support for communication between the caller and callee
 584  * other than saving the return address in a temporary register, nor is
 585  * there any automatic local storage.  This implies that great care is
 586  * required before attempting reentrancy or any kind of nested
 587  * subroutine invocations.
 588  */
 589 static void invoke_subroutine( struct brw_wm_compile *c,
 590                                enum _subroutine subroutine,
 591                                void (*emit)( struct brw_wm_compile * ) )
 592 {
 593     struct brw_compile *p = &c->func;
 594
 595     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 596
 597     if( c->subroutines[ subroutine ] ) {
 598         /* subroutine previously emitted: reuse existing instructions */
 599
 600         int mark = mark_tmps( c );
 601         struct brw_reg return_address = retype( alloc_tmp( c ),
 602                                                 BRW_REGISTER_TYPE_UD );
 603         int here = p->nr_insn;
 604
 605         brw_push_insn_state(p);
 606         brw_set_mask_control(p, BRW_MASK_DISABLE);
 607         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 608
 609         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 610                  brw_imm_d( ( c->subroutines[ subroutine ] -
 611                               here - 1 ) << 4 ) );
 612         brw_pop_insn_state(p);
 613
 614         release_tmps( c, mark );
 615     } else {
 616         /* previously unused subroutine: emit, and mark for later reuse */
 617
 618         int mark = mark_tmps( c );
 619         struct brw_reg return_address = retype( alloc_tmp( c ),
 620                                                 BRW_REGISTER_TYPE_UD );
 621         struct brw_instruction *calc;
 622         int base = p->nr_insn;
 623
 624         brw_push_insn_state(p);
 625         brw_set_mask_control(p, BRW_MASK_DISABLE);
 626         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 627         brw_pop_insn_state(p);
 628
 629         c->subroutines[ subroutine ] = p->nr_insn;
 630
 631         emit( c );
 632
 633         brw_push_insn_state(p);
 634         brw_set_mask_control(p, BRW_MASK_DISABLE);
 635         brw_MOV( p, brw_ip_reg(), return_address );
 636         brw_pop_insn_state(p);
 637
 638         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 639
 640         release_tmps( c, mark );
 641     }
 642 }
 643
 644 static void emit_abs( struct brw_wm_compile *c,
 645                       const struct prog_instruction *inst)
 646 {
 647     int i;
 648     struct brw_compile *p = &c->func;
 649     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 650     for (i = 0; i < 4; i++) {
 651         if (inst->DstReg.WriteMask & (1<<i)) {
 652             struct brw_reg src, dst;
 653             dst = get_dst_reg(c, inst, i);
 654             src = get_src_reg(c, inst, 0, i);
 655             brw_MOV(p, dst, brw_abs(src));
 656         }
 657     }
 658     brw_set_saturate(p, 0);
 659 }
 660
 661 static void emit_trunc( struct brw_wm_compile *c,
 662                         const struct prog_instruction *inst)
 663 {
 664     int i;
 665     struct brw_compile *p = &c->func;
 666     GLuint mask = inst->DstReg.WriteMask;
 667     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 668     for (i = 0; i < 4; i++) {
 669         if (mask & (1<<i)) {
 670             struct brw_reg src, dst;
 671             dst = get_dst_reg(c, inst, i);
 672             src = get_src_reg(c, inst, 0, i);
 673             brw_RNDZ(p, dst, src);
 674         }
 675     }
 676     brw_set_saturate(p, 0);
 677 }
 678
 679 static void emit_mov( struct brw_wm_compile *c,
 680                       const struct prog_instruction *inst)
 681 {
 682     int i;
 683     struct brw_compile *p = &c->func;
 684     GLuint mask = inst->DstReg.WriteMask;
 685     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 686     for (i = 0; i < 4; i++) {
 687         if (mask & (1<<i)) {
 688             struct brw_reg src, dst;
 689             dst = get_dst_reg(c, inst, i);
 690             /* XXX some moves from immediate value don't work reliably!!! */
 691             /*src = get_src_reg_imm(c, inst, 0, i);*/
 692             src = get_src_reg(c, inst, 0, i);
 693             brw_MOV(p, dst, src);
 694         }
 695     }
 696     brw_set_saturate(p, 0);
 697 }
 698
 699 static void emit_pixel_xy(struct brw_wm_compile *c,
 700                           const struct prog_instruction *inst)
 701 {
 702     struct brw_reg r1 = brw_vec1_grf(1, 0);
 703     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 704
 705     struct brw_reg dst0, dst1;
 706     struct brw_compile *p = &c->func;
 707     GLuint mask = inst->DstReg.WriteMask;
 708
 709     dst0 = get_dst_reg(c, inst, 0);
 710     dst1 = get_dst_reg(c, inst, 1);
 711     /* Calculate pixel centers by adding 1 or 0 to each of the
 712      * micro-tile coordinates passed in r1.
 713      */
 714     if (mask & WRITEMASK_X) {
 715         brw_ADD(p,
 716                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 717                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 718                 brw_imm_v(0x10101010));
 719     }
 720
 721     if (mask & WRITEMASK_Y) {
 722         brw_ADD(p,
 723                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 724                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 725                 brw_imm_v(0x11001100));
 726     }
 727 }
 728
 729 static void emit_delta_xy(struct brw_wm_compile *c,
 730                           const struct prog_instruction *inst)
 731 {
 732     struct brw_reg r1 = brw_vec1_grf(1, 0);
 733     struct brw_reg dst0, dst1, src0, src1;
 734     struct brw_compile *p = &c->func;
 735     GLuint mask = inst->DstReg.WriteMask;
 736
 737     dst0 = get_dst_reg(c, inst, 0);
 738     dst1 = get_dst_reg(c, inst, 1);
 739     src0 = get_src_reg(c, inst, 0, 0);
 740     src1 = get_src_reg(c, inst, 0, 1);
 741     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 742      * centers.
 743      */
 744     if (mask & WRITEMASK_X) {
 745         brw_ADD(p,
 746                 dst0,
 747                 retype(src0, BRW_REGISTER_TYPE_UW),
 748                 negate(r1));
 749     }
 750
 751     if (mask & WRITEMASK_Y) {
 752         brw_ADD(p,
 753                 dst1,
 754                 retype(src1, BRW_REGISTER_TYPE_UW),
 755                 negate(suboffset(r1,1)));
 756
 757     }
 758 }
 759
 760 static void fire_fb_write( struct brw_wm_compile *c,
 761                            GLuint base_reg,
 762                            GLuint nr,
 763                            GLuint target,
 764                            GLuint eot)
 765 {
 766     struct brw_compile *p = &c->func;
 767     /* Pass through control information:
 768      */
 769     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 770     {
 771         brw_push_insn_state(p);
 772         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 773         brw_MOV(p,
 774                 brw_message_reg(base_reg + 1),
 775                 brw_vec8_grf(1, 0));
 776         brw_pop_insn_state(p);
 777     }
 778     /* Send framebuffer write message: */
 779     brw_fb_WRITE(p,
 780             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 781             base_reg,
 782             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 783             target,
 784             nr,
 785             0,
 786             eot);
 787 }
 788
 789 static void emit_fb_write(struct brw_wm_compile *c,
 790                           const struct prog_instruction *inst)
 791 {
 792     struct brw_compile *p = &c->func;
 793     int nr = 2;
 794     int channel;
 795     GLuint target, eot;
 796     struct brw_reg src0;
 797
 798     /* Reserve a space for AA - may not be needed:
 799      */
 800     if (c->key.aa_dest_stencil_reg)
 801         nr += 1;
 802
 803     brw_push_insn_state(p);
 804     for (channel = 0; channel < 4; channel++) {
 805         src0 = get_src_reg(c,  inst, 0, channel);
 806         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 807         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 808         brw_MOV(p, brw_message_reg(nr + channel), src0);
 809     }
 810     /* skip over the regs populated above: */
 811     nr += 8;
 812     brw_pop_insn_state(p);
 813
 814     if (c->key.source_depth_to_render_target) {
 815        if (c->key.computes_depth) {
 816           src0 = get_src_reg(c, inst, 2, 2);
 817           brw_MOV(p, brw_message_reg(nr), src0);
 818        }
 819        else {
 820           src0 = get_src_reg(c, inst, 1, 1);
 821           brw_MOV(p, brw_message_reg(nr), src0);
 822        }
 823
 824        nr += 2;
 825     }
 826
 827     if (c->key.dest_depth_reg) {
 828         const GLuint comp = c->key.dest_depth_reg / 2;
 829         const GLuint off = c->key.dest_depth_reg % 2;
 830
 831         if (off != 0) {
 832             /* XXX this code needs review/testing */
 833             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 834             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 835
 836             brw_push_insn_state(p);
 837             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 838
 839             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 840             /* 2nd half? */
 841             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 842             brw_pop_insn_state(p);
 843         }
 844         else
 845         {
 846             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 847             brw_MOV(p, brw_message_reg(nr), src);
 848         }
 849         nr += 2;
 850    }
 851
 852     target = inst->Aux >> 1;
 853     eot = inst->Aux & 1;
 854     fire_fb_write(c, 0, nr, target, eot);
 855 }
 856
 857 static void emit_pixel_w( struct brw_wm_compile *c,
 858                           const struct prog_instruction *inst)
 859 {
 860     struct brw_compile *p = &c->func;
 861     GLuint mask = inst->DstReg.WriteMask;
 862     if (mask & WRITEMASK_W) {
 863         struct brw_reg dst, src0, delta0, delta1;
 864         struct brw_reg interp3;
 865
 866         dst = get_dst_reg(c, inst, 3);
 867         src0 = get_src_reg(c, inst, 0, 0);
 868         delta0 = get_src_reg(c, inst, 1, 0);
 869         delta1 = get_src_reg(c, inst, 1, 1);
 870
 871         interp3 = brw_vec1_grf(src0.nr+1, 4);
 872         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 873          * result straight into a message reg.
 874          */
 875         brw_LINE(p, brw_null_reg(), interp3, delta0);
 876         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 877
 878         /* Calc w */
 879         brw_math_16( p, dst,
 880                 BRW_MATH_FUNCTION_INV,
 881                 BRW_MATH_SATURATE_NONE,
 882                 2, brw_null_reg(),
 883                 BRW_MATH_PRECISION_FULL);
 884     }
 885 }
 886
 887 static void emit_linterp(struct brw_wm_compile *c,
 888                          const struct prog_instruction *inst)
 889 {
 890     struct brw_compile *p = &c->func;
 891     GLuint mask = inst->DstReg.WriteMask;
 892     struct brw_reg interp[4];
 893     struct brw_reg dst, delta0, delta1;
 894     struct brw_reg src0;
 895     GLuint nr, i;
 896
 897     src0 = get_src_reg(c, inst, 0, 0);
 898     delta0 = get_src_reg(c, inst, 1, 0);
 899     delta1 = get_src_reg(c, inst, 1, 1);
 900     nr = src0.nr;
 901
 902     interp[0] = brw_vec1_grf(nr, 0);
 903     interp[1] = brw_vec1_grf(nr, 4);
 904     interp[2] = brw_vec1_grf(nr+1, 0);
 905     interp[3] = brw_vec1_grf(nr+1, 4);
 906
 907     for(i = 0; i < 4; i++ ) {
 908         if (mask & (1<<i)) {
 909             dst = get_dst_reg(c, inst, i);
 910             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 911             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 912         }
 913     }
 914 }
 915
 916 static void emit_cinterp(struct brw_wm_compile *c,
 917                          const struct prog_instruction *inst)
 918 {
 919     struct brw_compile *p = &c->func;
 920     GLuint mask = inst->DstReg.WriteMask;
 921
 922     struct brw_reg interp[4];
 923     struct brw_reg dst, src0;
 924     GLuint nr, i;
 925
 926     src0 = get_src_reg(c, inst, 0, 0);
 927     nr = src0.nr;
 928
 929     interp[0] = brw_vec1_grf(nr, 0);
 930     interp[1] = brw_vec1_grf(nr, 4);
 931     interp[2] = brw_vec1_grf(nr+1, 0);
 932     interp[3] = brw_vec1_grf(nr+1, 4);
 933
 934     for(i = 0; i < 4; i++ ) {
 935         if (mask & (1<<i)) {
 936             dst = get_dst_reg(c, inst, i);
 937             brw_MOV(p, dst, suboffset(interp[i],3));
 938         }
 939     }
 940 }
 941
 942 static void emit_pinterp(struct brw_wm_compile *c,
 943                          const struct prog_instruction *inst)
 944 {
 945     struct brw_compile *p = &c->func;
 946     GLuint mask = inst->DstReg.WriteMask;
 947
 948     struct brw_reg interp[4];
 949     struct brw_reg dst, delta0, delta1;
 950     struct brw_reg src0, w;
 951     GLuint nr, i;
 952
 953     src0 = get_src_reg(c, inst, 0, 0);
 954     delta0 = get_src_reg(c, inst, 1, 0);
 955     delta1 = get_src_reg(c, inst, 1, 1);
 956     w = get_src_reg(c, inst, 2, 3);
 957     nr = src0.nr;
 958
 959     interp[0] = brw_vec1_grf(nr, 0);
 960     interp[1] = brw_vec1_grf(nr, 4);
 961     interp[2] = brw_vec1_grf(nr+1, 0);
 962     interp[3] = brw_vec1_grf(nr+1, 4);
 963
 964     for(i = 0; i < 4; i++ ) {
 965         if (mask & (1<<i)) {
 966             dst = get_dst_reg(c, inst, i);
 967             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 968             brw_MAC(p, dst, suboffset(interp[i],1),
 969                     delta1);
 970             brw_MUL(p, dst, dst, w);
 971         }
 972     }
 973 }
 974
 975 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 976 static void emit_frontfacing(struct brw_wm_compile *c,
 977                              const struct prog_instruction *inst)
 978 {
 979     struct brw_compile *p = &c->func;
 980     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 981     struct brw_reg dst;
 982     GLuint mask = inst->DstReg.WriteMask;
 983     int i;
 984
 985     for (i = 0; i < 4; i++) {
 986         if (mask & (1<<i)) {
 987             dst = get_dst_reg(c, inst, i);
 988             brw_MOV(p, dst, brw_imm_f(0.0));
 989         }
 990     }
 991
 992     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 993      * us front face
 994      */
 995     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 996     for (i = 0; i < 4; i++) {
 997         if (mask & (1<<i)) {
 998             dst = get_dst_reg(c, inst, i);
 999             brw_MOV(p, dst, brw_imm_f(1.0));
1000         }
1001     }
1002     brw_set_predicate_control_flag_value(p, 0xff);
1003 }
1004
1005 static void emit_xpd(struct brw_wm_compile *c,
1006                      const struct prog_instruction *inst)
1007 {
1008     int i;
1009     struct brw_compile *p = &c->func;
1010     GLuint mask = inst->DstReg.WriteMask;
1011     for (i = 0; i < 4; i++) {
1012         GLuint i2 = (i+2)%3;
1013         GLuint i1 = (i+1)%3;
1014         if (mask & (1<<i)) {
1015             struct brw_reg src0, src1, dst;
1016             dst = get_dst_reg(c, inst, i);
1017             src0 = negate(get_src_reg(c, inst, 0, i2));
1018             src1 = get_src_reg_imm(c, inst, 1, i1);
1019             brw_MUL(p, brw_null_reg(), src0, src1);
1020             src0 = get_src_reg(c, inst, 0, i1);
1021             src1 = get_src_reg_imm(c, inst, 1, i2);
1022             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1023             brw_MAC(p, dst, src0, src1);
1024             brw_set_saturate(p, 0);
1025         }
1026     }
1027     brw_set_saturate(p, 0);
1028 }
1029
1030 static void emit_dp3(struct brw_wm_compile *c,
1031                      const struct prog_instruction *inst)
1032 {
1033     struct brw_reg src0[3], src1[3], dst;
1034     int i;
1035     struct brw_compile *p = &c->func;
1036     for (i = 0; i < 3; i++) {
1037         src0[i] = get_src_reg(c, inst, 0, i);
1038         src1[i] = get_src_reg_imm(c, inst, 1, i);
1039     }
1040
1041     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1042     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1043     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1044     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1045     brw_MAC(p, dst, src0[2], src1[2]);
1046     brw_set_saturate(p, 0);
1047 }
1048
1049 static void emit_dp4(struct brw_wm_compile *c,
1050                      const struct prog_instruction *inst)
1051 {
1052     struct brw_reg src0[4], src1[4], dst;
1053     int i;
1054     struct brw_compile *p = &c->func;
1055     for (i = 0; i < 4; i++) {
1056         src0[i] = get_src_reg(c, inst, 0, i);
1057         src1[i] = get_src_reg_imm(c, inst, 1, i);
1058     }
1059     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1060     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1061     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1062     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1063     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1064     brw_MAC(p, dst, src0[3], src1[3]);
1065     brw_set_saturate(p, 0);
1066 }
1067
1068 static void emit_dph(struct brw_wm_compile *c,
1069                      const struct prog_instruction *inst)
1070 {
1071     struct brw_reg src0[4], src1[4], dst;
1072     int i;
1073     struct brw_compile *p = &c->func;
1074     for (i = 0; i < 4; i++) {
1075         src0[i] = get_src_reg(c, inst, 0, i);
1076         src1[i] = get_src_reg_imm(c, inst, 1, i);
1077     }
1078     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1079     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1080     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1081     brw_MAC(p, dst, src0[2], src1[2]);
1082     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1083     brw_ADD(p, dst, dst, src1[3]);
1084     brw_set_saturate(p, 0);
1085 }
1086
1087 /**
1088  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1089  * Note that the result of the function is smeared across the dest
1090  * register's X, Y, Z and W channels (subject to writemasking of course).
1091  */
1092 static void emit_math1(struct brw_wm_compile *c,
1093                        const struct prog_instruction *inst, GLuint func)
1094 {
1095     struct brw_compile *p = &c->func;
1096     struct brw_reg src0, dst, tmp;
1097     const int mark = mark_tmps( c );
1098     int i;
1099
1100     tmp = alloc_tmp(c);
1101
1102     /* Get first component of source register */
1103     src0 = get_src_reg(c, inst, 0, 0);
1104
1105     /* tmp = func(src0) */
1106     brw_MOV(p, brw_message_reg(2), src0);
1107     brw_math(p,
1108              tmp,
1109              func,
1110              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1111              2,
1112              brw_null_reg(),
1113              BRW_MATH_DATA_VECTOR,
1114              BRW_MATH_PRECISION_FULL);
1115
1116     /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1117
1118     /* replicate tmp value across enabled dest channels */
1119     for (i = 0; i < 4; i++) {
1120        if (inst->DstReg.WriteMask & (1 << i)) {
1121           dst = get_dst_reg(c, inst, i);
1122           brw_MOV(p, dst, tmp);
1123        }
1124     }
1125
1126     release_tmps(c, mark);
1127 }
1128
1129 static void emit_rcp(struct brw_wm_compile *c,
1130                      const struct prog_instruction *inst)
1131 {
1132     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1133 }
1134
1135 static void emit_rsq(struct brw_wm_compile *c,
1136                      const struct prog_instruction *inst)
1137 {
1138     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1139 }
1140
1141 static void emit_sin(struct brw_wm_compile *c,
1142                      const struct prog_instruction *inst)
1143 {
1144     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1145 }
1146
1147 static void emit_cos(struct brw_wm_compile *c,
1148                      const struct prog_instruction *inst)
1149 {
1150     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1151 }
1152
1153 static void emit_ex2(struct brw_wm_compile *c,
1154                      const struct prog_instruction *inst)
1155 {
1156     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1157 }
1158
1159 static void emit_lg2(struct brw_wm_compile *c,
1160                      const struct prog_instruction *inst)
1161 {
1162     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1163 }
1164
1165 static void emit_add(struct brw_wm_compile *c,
1166                      const struct prog_instruction *inst)
1167 {
1168     struct brw_compile *p = &c->func;
1169     struct brw_reg src0, src1, dst;
1170     GLuint mask = inst->DstReg.WriteMask;
1171     int i;
1172     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1173     for (i = 0 ; i < 4; i++) {
1174         if (mask & (1<<i)) {
1175             dst = get_dst_reg(c, inst, i);
1176             src0 = get_src_reg(c, inst, 0, i);
1177             src1 = get_src_reg_imm(c, inst, 1, i);
1178             brw_ADD(p, dst, src0, src1);
1179         }
1180     }
1181     brw_set_saturate(p, 0);
1182 }
1183
1184 static void emit_arl(struct brw_wm_compile *c,
1185                      const struct prog_instruction *inst)
1186 {
1187     struct brw_compile *p = &c->func;
1188     struct brw_reg src0, addr_reg;
1189     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1190     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1191                            BRW_ARF_ADDRESS, 0);
1192     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1193     brw_MOV(p, addr_reg, src0);
1194     brw_set_saturate(p, 0);
1195 }
1196
1197 static void emit_sub(struct brw_wm_compile *c,
1198                      const struct prog_instruction *inst)
1199 {
1200     struct brw_compile *p = &c->func;
1201     struct brw_reg src0, src1, dst;
1202     GLuint mask = inst->DstReg.WriteMask;
1203     int i;
1204     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1205     for (i = 0 ; i < 4; i++) {
1206         if (mask & (1<<i)) {
1207             dst = get_dst_reg(c, inst, i);
1208             src0 = get_src_reg(c, inst, 0, i);
1209             src1 = get_src_reg_imm(c, inst, 1, i);
1210             brw_ADD(p, dst, src0, negate(src1));
1211         }
1212     }
1213     brw_set_saturate(p, 0);
1214 }
1215
1216 static void emit_mul(struct brw_wm_compile *c,
1217                      const struct prog_instruction *inst)
1218 {
1219     struct brw_compile *p = &c->func;
1220     struct brw_reg src0, src1, dst;
1221     GLuint mask = inst->DstReg.WriteMask;
1222     int i;
1223     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1224     for (i = 0 ; i < 4; i++) {
1225         if (mask & (1<<i)) {
1226             dst = get_dst_reg(c, inst, i);
1227             src0 = get_src_reg(c, inst, 0, i);
1228             src1 = get_src_reg_imm(c, inst, 1, i);
1229             brw_MUL(p, dst, src0, src1);
1230         }
1231     }
1232     brw_set_saturate(p, 0);
1233 }
1234
1235 static void emit_frc(struct brw_wm_compile *c,
1236                      const struct prog_instruction *inst)
1237 {
1238     struct brw_compile *p = &c->func;
1239     struct brw_reg src0, dst;
1240     GLuint mask = inst->DstReg.WriteMask;
1241     int i;
1242     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1243     for (i = 0 ; i < 4; i++) {
1244         if (mask & (1<<i)) {
1245             dst = get_dst_reg(c, inst, i);
1246             src0 = get_src_reg_imm(c, inst, 0, i);
1247             brw_FRC(p, dst, src0);
1248         }
1249     }
1250     if (inst->SaturateMode != SATURATE_OFF)
1251         brw_set_saturate(p, 0);
1252 }
1253
1254 static void emit_flr(struct brw_wm_compile *c,
1255                      const struct prog_instruction *inst)
1256 {
1257     struct brw_compile *p = &c->func;
1258     struct brw_reg src0, dst;
1259     GLuint mask = inst->DstReg.WriteMask;
1260     int i;
1261     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1262     for (i = 0 ; i < 4; i++) {
1263         if (mask & (1<<i)) {
1264             dst = get_dst_reg(c, inst, i);
1265             src0 = get_src_reg_imm(c, inst, 0, i);
1266             brw_RNDD(p, dst, src0);
1267         }
1268     }
1269     brw_set_saturate(p, 0);
1270 }
1271
1272
1273 static void emit_min_max(struct brw_wm_compile *c,
1274                          const struct prog_instruction *inst)
1275 {
1276     struct brw_compile *p = &c->func;
1277     const GLuint mask = inst->DstReg.WriteMask;
1278     const int mark = mark_tmps(c);
1279     int i;
1280     brw_push_insn_state(p);
1281     for (i = 0; i < 4; i++) {
1282         if (mask & (1<<i)) {
1283             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1284             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1285             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1286             struct brw_reg dst;
1287             /* if dst==src0 or dst==src1 we need to use a temp reg */
1288             GLboolean use_temp = brw_same_reg(dst, src0) ||
1289                                  brw_same_reg(dst, src1);
1290             if (use_temp)
1291                dst = alloc_tmp(c);
1292             else
1293                dst = real_dst;
1294
1295             /*
1296             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1297                    dst.nr, src0.nr, src1.nr);
1298             */
1299             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1300             brw_MOV(p, dst, src0);
1301             brw_set_saturate(p, 0);
1302
1303             if (inst->Opcode == OPCODE_MIN)
1304                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1305             else
1306                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1307
1308             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1309             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1310             brw_MOV(p, dst, src1);
1311             brw_set_saturate(p, 0);
1312             brw_set_predicate_control_flag_value(p, 0xff);
1313             if (use_temp)
1314                brw_MOV(p, real_dst, dst);
1315         }
1316     }
1317     brw_pop_insn_state(p);
1318     release_tmps(c, mark);
1319 }
1320
1321 static void emit_pow(struct brw_wm_compile *c,
1322                      const struct prog_instruction *inst)
1323 {
1324     struct brw_compile *p = &c->func;
1325     struct brw_reg dst, src0, src1;
1326     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1327     src0 = get_src_reg_imm(c, inst, 0, 0);
1328     src1 = get_src_reg_imm(c, inst, 1, 0);
1329
1330     brw_MOV(p, brw_message_reg(2), src0);
1331     brw_MOV(p, brw_message_reg(3), src1);
1332
1333     brw_math(p,
1334             dst,
1335             BRW_MATH_FUNCTION_POW,
1336             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1337             2,
1338             brw_null_reg(),
1339             BRW_MATH_DATA_VECTOR,
1340             BRW_MATH_PRECISION_FULL);
1341 }
1342
1343 static void emit_lrp(struct brw_wm_compile *c,
1344                      const struct prog_instruction *inst)
1345 {
1346     struct brw_compile *p = &c->func;
1347     GLuint mask = inst->DstReg.WriteMask;
1348     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1349     int i;
1350     int mark = mark_tmps(c);
1351     for (i = 0; i < 4; i++) {
1352         if (mask & (1<<i)) {
1353             dst = get_dst_reg(c, inst, i);
1354             src0 = get_src_reg(c, inst, 0, i);
1355
1356             src1 = get_src_reg_imm(c, inst, 1, i);
1357
1358             if (src1.nr == dst.nr) {
1359                 tmp1 = alloc_tmp(c);
1360                 brw_MOV(p, tmp1, src1);
1361             } else
1362                 tmp1 = src1;
1363
1364             src2 = get_src_reg(c, inst, 2, i);
1365             if (src2.nr == dst.nr) {
1366                 tmp2 = alloc_tmp(c);
1367                 brw_MOV(p, tmp2, src2);
1368             } else
1369                 tmp2 = src2;
1370
1371             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1372             brw_MUL(p, brw_null_reg(), dst, tmp2);
1373             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1374             brw_MAC(p, dst, src0, tmp1);
1375             brw_set_saturate(p, 0);
1376         }
1377         release_tmps(c, mark);
1378     }
1379 }
1380
1381 /**
1382  * For GLSL shaders, this KIL will be unconditional.
1383  * It may be contained inside an IF/ENDIF structure of course.
1384  */
1385 static void emit_kil(struct brw_wm_compile *c)
1386 {
1387     struct brw_compile *p = &c->func;
1388     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1389     brw_push_insn_state(p);
1390     brw_set_mask_control(p, BRW_MASK_DISABLE);
1391     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1392     brw_AND(p, depth, c->emit_mask_reg, depth);
1393     brw_pop_insn_state(p);
1394 }
1395
1396 static void emit_mad(struct brw_wm_compile *c,
1397                      const struct prog_instruction *inst)
1398 {
1399     struct brw_compile *p = &c->func;
1400     GLuint mask = inst->DstReg.WriteMask;
1401     struct brw_reg dst, src0, src1, src2;
1402     int i;
1403
1404     for (i = 0; i < 4; i++) {
1405         if (mask & (1<<i)) {
1406             dst = get_dst_reg(c, inst, i);
1407             src0 = get_src_reg(c, inst, 0, i);
1408             src1 = get_src_reg_imm(c, inst, 1, i);
1409             src2 = get_src_reg_imm(c, inst, 2, i);
1410             brw_MUL(p, dst, src0, src1);
1411
1412             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1413             brw_ADD(p, dst, dst, src2);
1414             brw_set_saturate(p, 0);
1415         }
1416     }
1417 }
1418
1419 static void emit_sop(struct brw_wm_compile *c,
1420                      const struct prog_instruction *inst, GLuint cond)
1421 {
1422     struct brw_compile *p = &c->func;
1423     GLuint mask = inst->DstReg.WriteMask;
1424     struct brw_reg dst, src0, src1;
1425     int i;
1426
1427     for (i = 0; i < 4; i++) {
1428         if (mask & (1<<i)) {
1429             dst = get_dst_reg(c, inst, i);
1430             src0 = get_src_reg(c, inst, 0, i);
1431             src1 = get_src_reg_imm(c, inst, 1, i);
1432             brw_push_insn_state(p);
1433             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1434             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1435             brw_MOV(p, dst, brw_imm_f(0.0));
1436             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1437             brw_MOV(p, dst, brw_imm_f(1.0));
1438             brw_pop_insn_state(p);
1439         }
1440     }
1441 }
1442
1443 static void emit_slt(struct brw_wm_compile *c,
1444                      const struct prog_instruction *inst)
1445 {
1446     emit_sop(c, inst, BRW_CONDITIONAL_L);
1447 }
1448
1449 static void emit_sle(struct brw_wm_compile *c,
1450                      const struct prog_instruction *inst)
1451 {
1452     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1453 }
1454
1455 static void emit_sgt(struct brw_wm_compile *c,
1456                      const struct prog_instruction *inst)
1457 {
1458     emit_sop(c, inst, BRW_CONDITIONAL_G);
1459 }
1460
1461 static void emit_sge(struct brw_wm_compile *c,
1462                      const struct prog_instruction *inst)
1463 {
1464     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1465 }
1466
1467 static void emit_seq(struct brw_wm_compile *c,
1468                      const struct prog_instruction *inst)
1469 {
1470     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1471 }
1472
1473 static void emit_sne(struct brw_wm_compile *c,
1474                      const struct prog_instruction *inst)
1475 {
1476     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1477 }
1478
1479 static void emit_ddx(struct brw_wm_compile *c,
1480                      const struct prog_instruction *inst)
1481 {
1482     struct brw_compile *p = &c->func;
1483     GLuint mask = inst->DstReg.WriteMask;
1484     struct brw_reg interp[4];
1485     struct brw_reg dst;
1486     struct brw_reg src0, w;
1487     GLuint nr, i;
1488     src0 = get_src_reg(c, inst, 0, 0);
1489     w = get_src_reg(c, inst, 1, 3);
1490     nr = src0.nr;
1491     interp[0] = brw_vec1_grf(nr, 0);
1492     interp[1] = brw_vec1_grf(nr, 4);
1493     interp[2] = brw_vec1_grf(nr+1, 0);
1494     interp[3] = brw_vec1_grf(nr+1, 4);
1495     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1496     for(i = 0; i < 4; i++ ) {
1497         if (mask & (1<<i)) {
1498             dst = get_dst_reg(c, inst, i);
1499             brw_MOV(p, dst, interp[i]);
1500             brw_MUL(p, dst, dst, w);
1501         }
1502     }
1503     brw_set_saturate(p, 0);
1504 }
1505
1506 static void emit_ddy(struct brw_wm_compile *c,
1507                      const struct prog_instruction *inst)
1508 {
1509     struct brw_compile *p = &c->func;
1510     GLuint mask = inst->DstReg.WriteMask;
1511     struct brw_reg interp[4];
1512     struct brw_reg dst;
1513     struct brw_reg src0, w;
1514     GLuint nr, i;
1515
1516     src0 = get_src_reg(c, inst, 0, 0);
1517     nr = src0.nr;
1518     w = get_src_reg(c, inst, 1, 3);
1519     interp[0] = brw_vec1_grf(nr, 0);
1520     interp[1] = brw_vec1_grf(nr, 4);
1521     interp[2] = brw_vec1_grf(nr+1, 0);
1522     interp[3] = brw_vec1_grf(nr+1, 4);
1523     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1524     for(i = 0; i < 4; i++ ) {
1525         if (mask & (1<<i)) {
1526             dst = get_dst_reg(c, inst, i);
1527             brw_MOV(p, dst, suboffset(interp[i], 1));
1528             brw_MUL(p, dst, dst, w);
1529         }
1530     }
1531     brw_set_saturate(p, 0);
1532 }
1533
1534 static INLINE struct brw_reg high_words( struct brw_reg reg )
1535 {
1536     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1537                    0, 8, 2 );
1538 }
1539
1540 static INLINE struct brw_reg low_words( struct brw_reg reg )
1541 {
1542     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1543 }
1544
1545 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1546 {
1547     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1548 }
1549
1550 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1551 {
1552     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1553                    0, 16, 2 );
1554 }
1555
1556 /* One-, two- and three-dimensional Perlin noise, similar to the description
1557    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1558 static void noise1_sub( struct brw_wm_compile *c ) {
1559
1560     struct brw_compile *p = &c->func;
1561     struct brw_reg param,
1562         x0, x1, /* gradients at each end */
1563         t, tmp[ 2 ], /* float temporaries */
1564         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1565     int i;
1566     int mark = mark_tmps( c );
1567
1568     x0 = alloc_tmp( c );
1569     x1 = alloc_tmp( c );
1570     t = alloc_tmp( c );
1571     tmp[ 0 ] = alloc_tmp( c );
1572     tmp[ 1 ] = alloc_tmp( c );
1573     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1574     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1575     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1576     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1577     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1578
1579     param = lookup_tmp( c, mark - 2 );
1580
1581     brw_set_access_mode( p, BRW_ALIGN_1 );
1582
1583     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1584
1585     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1586        be hashed.  Also compute the remainder (offset within the unit
1587        length), interleaved to reduce register dependency penalties. */
1588     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1589     brw_FRC( p, param, param );
1590     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1591     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1592     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1593
1594     /* We're now ready to perform the hashing.  The two hashes are
1595        interleaved for performance.  The hash function used is
1596        designed to rapidly achieve avalanche and require only 32x16
1597        bit multiplication, and 16-bit swizzles (which we get for
1598        free).  We can't use immediate operands in the multiplies,
1599        because immediates are permitted only in src1 and the 16-bit
1600        factor is permitted only in src0. */
1601     for( i = 0; i < 2; i++ )
1602         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1603     for( i = 0; i < 2; i++ )
1604        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1605                 high_words( itmp[ i ] ) );
1606     for( i = 0; i < 2; i++ )
1607         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1608     for( i = 0; i < 2; i++ )
1609        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1610                 high_words( itmp[ i ] ) );
1611     for( i = 0; i < 2; i++ )
1612         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1613     for( i = 0; i < 2; i++ )
1614        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1615                 high_words( itmp[ i ] ) );
1616
1617     /* Now we want to initialise the two gradients based on the
1618        hashes.  Format conversion from signed integer to float leaves
1619        everything scaled too high by a factor of pow( 2, 31 ), but
1620        we correct for that right at the end. */
1621     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1622     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1623     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1624
1625     brw_MUL( p, x0, x0, param );
1626     brw_MUL( p, x1, x1, t );
1627
1628     /* We interpolate between the gradients using the polynomial
1629        6t^5 - 15t^4 + 10t^3 (Perlin). */
1630     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1631     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1632     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1633     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1634     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1635     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1636                                            pipeline */
1637     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1638     brw_MUL( p, param, tmp[ 0 ], param );
1639     brw_MUL( p, x1, x1, param );
1640     brw_ADD( p, x0, x0, x1 );
1641     /* scale by pow( 2, -30 ), to compensate for the format conversion
1642        above and an extra factor of 2 so that a single gradient covers
1643        the [-1,1] range */
1644     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1645
1646     release_tmps( c, mark );
1647 }
1648
1649 static void emit_noise1( struct brw_wm_compile *c,
1650                          const struct prog_instruction *inst )
1651 {
1652     struct brw_compile *p = &c->func;
1653     struct brw_reg src, param, dst;
1654     GLuint mask = inst->DstReg.WriteMask;
1655     int i;
1656     int mark = mark_tmps( c );
1657
1658     assert( mark == 0 );
1659
1660     src = get_src_reg( c, inst, 0, 0 );
1661
1662     param = alloc_tmp( c );
1663
1664     brw_MOV( p, param, src );
1665
1666     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1667
1668     /* Fill in the result: */
1669     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1670     for (i = 0 ; i < 4; i++) {
1671         if (mask & (1<<i)) {
1672             dst = get_dst_reg(c, inst, i);
1673             brw_MOV( p, dst, param );
1674         }
1675     }
1676     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1677         brw_set_saturate( p, 0 );
1678
1679     release_tmps( c, mark );
1680 }
1681
1682 static void noise2_sub( struct brw_wm_compile *c ) {
1683
1684     struct brw_compile *p = &c->func;
1685     struct brw_reg param0, param1,
1686         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1687         t, tmp[ 4 ], /* float temporaries */
1688         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1689     int i;
1690     int mark = mark_tmps( c );
1691
1692     x0y0 = alloc_tmp( c );
1693     x0y1 = alloc_tmp( c );
1694     x1y0 = alloc_tmp( c );
1695     x1y1 = alloc_tmp( c );
1696     t = alloc_tmp( c );
1697     for( i = 0; i < 4; i++ ) {
1698         tmp[ i ] = alloc_tmp( c );
1699         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1700     }
1701     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1702     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1703     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1704
1705     param0 = lookup_tmp( c, mark - 3 );
1706     param1 = lookup_tmp( c, mark - 2 );
1707
1708     brw_set_access_mode( p, BRW_ALIGN_1 );
1709
1710     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1711        be hashed.  Also compute the remainders (offsets within the unit
1712        square), interleaved to reduce register dependency penalties. */
1713     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1714     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1715     brw_FRC( p, param0, param0 );
1716     brw_FRC( p, param1, param1 );
1717     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1718     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1719              low_words( itmp[ 1 ] ) );
1720     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1721     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1722     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1723     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1724     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1725
1726     /* We're now ready to perform the hashing.  The four hashes are
1727        interleaved for performance.  The hash function used is
1728        designed to rapidly achieve avalanche and require only 32x16
1729        bit multiplication, and 16-bit swizzles (which we get for
1730        free).  We can't use immediate operands in the multiplies,
1731        because immediates are permitted only in src1 and the 16-bit
1732        factor is permitted only in src0. */
1733     for( i = 0; i < 4; i++ )
1734         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1735     for( i = 0; i < 4; i++ )
1736         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1737                  high_words( itmp[ i ] ) );
1738     for( i = 0; i < 4; i++ )
1739         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1740     for( i = 0; i < 4; i++ )
1741         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1742                  high_words( itmp[ i ] ) );
1743     for( i = 0; i < 4; i++ )
1744         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1745     for( i = 0; i < 4; i++ )
1746         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1747                  high_words( itmp[ i ] ) );
1748
1749     /* Now we want to initialise the four gradients based on the
1750        hashes.  Format conversion from signed integer to float leaves
1751        everything scaled too high by a factor of pow( 2, 15 ), but
1752        we correct for that right at the end. */
1753     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1754     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1755     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1756     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1757     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1758
1759     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1760     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1761     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1762     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1763
1764     brw_MUL( p, x1y0, x1y0, t );
1765     brw_MUL( p, x1y1, x1y1, t );
1766     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1767     brw_MUL( p, x0y0, x0y0, param0 );
1768     brw_MUL( p, x0y1, x0y1, param0 );
1769
1770     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1771     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1772     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1773     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1774
1775     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1776     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1777     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1778     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1779
1780     /* We interpolate between the gradients using the polynomial
1781        6t^5 - 15t^4 + 10t^3 (Perlin). */
1782     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1783     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1784     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1785     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1786     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1787     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1788     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1789                                                  pipeline */
1790     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1791     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1792     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1793     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1794     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1795                                                  pipeline */
1796     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1797     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1798     brw_MUL( p, param0, tmp[ 0 ], param0 );
1799     brw_MUL( p, param1, tmp[ 1 ], param1 );
1800
1801     /* Here we interpolate in the y dimension... */
1802     brw_MUL( p, x0y1, x0y1, param1 );
1803     brw_MUL( p, x1y1, x1y1, param1 );
1804     brw_ADD( p, x0y0, x0y0, x0y1 );
1805     brw_ADD( p, x1y0, x1y0, x1y1 );
1806
1807     /* And now in x.  There are horrible register dependencies here,
1808        but we have nothing else to do. */
1809     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1810     brw_MUL( p, x1y0, x1y0, param0 );
1811     brw_ADD( p, x0y0, x0y0, x1y0 );
1812
1813     /* scale by pow( 2, -15 ), as described above */
1814     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1815
1816     release_tmps( c, mark );
1817 }
1818
1819 static void emit_noise2( struct brw_wm_compile *c,
1820                          const struct prog_instruction *inst )
1821 {
         /* Emit the code for one two-coordinate noise instruction.
          *
          * c    - compile state; temporaries are allocated from it and the
          *        generated instructions are appended to c->func.
          * inst - instruction being translated; its source operand,
          *        DstReg.WriteMask and SaturateMode are consulted.
          *
          * The coordinates are copied into two fresh temporaries, the
          * SUB_NOISE2 subroutine (noise2_sub) is invoked, and param0 is then
          * copied into every destination channel enabled in the write mask.
          * Nothing is returned. */
1822     struct brw_compile *p = &c->func;
1823     struct brw_reg src0, src1, param0, param1, dst;
1824     GLuint mask = inst->DstReg.WriteMask;
1825     int i;
1826     int mark = mark_tmps( c );
1827
         /* The temporary mark must be 0 on entry.  NOTE(review): presumably
            this is because the subroutine finds its arguments by temporary
            index rather than by register -- confirm in noise2_sub. */
1828     assert( mark == 0 );
1829
         /* NOTE(review): the last two arguments look like (source operand
            index, channel), i.e. channels 0 and 1 of operand 0 -- confirm
            against get_src_reg. */
1830     src0 = get_src_reg( c, inst, 0, 0 );
1831     src1 = get_src_reg( c, inst, 0, 1 );
1832
         /* Keep this allocation order: param0 first, then param1. */
1833     param0 = alloc_tmp( c );
1834     param1 = alloc_tmp( c );
1835
         /* Work on copies, so the source registers themselves are never used
            as scratch space. */
1836     brw_MOV( p, param0, src0 );
1837     brw_MOV( p, param1, src1 );
1838
1839     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1840
1841     /* Fill in the result: */
         /* param0 is read back as the result.  NOTE(review): presumably
            noise2_sub overwrites its first argument with the noise value.
            Saturation is enabled for the result MOVs only when the
            instruction asks for SATURATE_ZERO_ONE. */
1842     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1843     for (i = 0 ; i < 4; i++) {
1844         if (mask & (1<<i)) {
1845             dst = get_dst_reg(c, inst, i);
1846             brw_MOV( p, dst, param0 );
1847         }
1848     }
         /* Switch saturation back off if it was enabled above. */
1849     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1850         brw_set_saturate( p, 0 );
1851
         /* Drop param0/param1 by rewinding to the mark taken on entry. */
1852     release_tmps( c, mark );
1853 }
1854
1855 /**
1856  * The three-dimensional case is much like the one- and two- versions above,
1857  * but since the number of corners is rapidly growing we now pack 16 16-bit
1858  * hashes into each register to extract more parallelism from the EUs.
      *
      * The three coordinates are read from the temporaries at indices
      * mark - 4, mark - 3 and mark - 2 (relative to the mark taken on entry).
      * Each one is overwritten in place by FRC along the way, and the final
      * noise value is written over the first of them (param0).  Every
      * temporary allocated here is released again before returning.
      *
      * NOTE(review): the access mode is switched to BRW_ALIGN_1 and is not
      * switched back inside this function -- confirm that callers expect it.
1859  */
1860 static void noise3_sub( struct brw_wm_compile *c ) {
1861
1862     struct brw_compile *p = &c->func;
1863     struct brw_reg param0, param1, param2,
1864         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1865         xi, yi, zi, /* interpolation coefficients */
1866         t, tmp[ 8 ], /* float temporaries */
1867         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1868         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1869     int i;
1870     int mark = mark_tmps( c );
1871
1872     x0y0 = alloc_tmp( c );
1873     x0y1 = alloc_tmp( c );
1874     x1y0 = alloc_tmp( c );
1875     x1y1 = alloc_tmp( c );
1876     xi = alloc_tmp( c );
1877     yi = alloc_tmp( c );
1878     zi = alloc_tmp( c );
1879     t = alloc_tmp( c );
         /* itmp[ i ] and wtmp[ i ] are not separate storage: they are tmp[ i ]
            retyped as unsigned dwords, and the same register number viewed as
            unsigned words.  A write through one view changes the others. */
1880     for( i = 0; i < 8; i++ ) {
1881         tmp[ i ] = alloc_tmp( c );
1882         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1883         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1884     }
1885
         /* The arguments are found by position relative to the entry mark.
            NOTE(review): index mark - 1 is skipped; presumably that slot is
            taken by invoke_subroutine (e.g. for a return address) -- confirm. */
1886     param0 = lookup_tmp( c, mark - 4 );
1887     param1 = lookup_tmp( c, mark - 3 );
1888     param2 = lookup_tmp( c, mark - 2 );
1889
1890     brw_set_access_mode( p, BRW_ALIGN_1 );
1891
1892     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1893        be hashed.  Also compute the remainders (offsets within the unit
1894        cube), interleaved to reduce register dependency penalties. */
1895     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1896     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1897     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1898     brw_FRC( p, param0, param0 );
1899     brw_FRC( p, param1, param1 );
1900     brw_FRC( p, param2, param2 );
1901     /* Since we now have only 16 bits of precision in the hash, we must
1902        be more careful about thorough mixing to maintain entropy as we
1903        squash the input vector into a small scalar. */
         /* NOTE(review): the first two instructions write only the null
            register; presumably the MUL/MAC chain runs through the accumulator
            so that only the last MAC needs a real destination -- confirm
            against the ISA documentation.  The ADD then stores the low word
            plus 0xBC8F (the multiplier applied to x above, so presumably the
            seed for x + 1) in the high word of itmp[ 0 ]. */
1904     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1905     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1906     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1907              brw_imm_uw( 0x9B93 ) );
1908     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1909              brw_imm_uw( 0xBC8F ) );
1910
1911     /* Temporarily disable the execution mask while we work with ExecSize=16
1912        channels (the mask is set for ExecSize=8 and is probably incorrect).
1913        Although this might cause execution of unwanted channels, the code
1914        writes only to temporary registers and has no side effects, so
1915        disabling the mask is harmless. */
1916     brw_push_insn_state( p );
1917     brw_set_mask_control( p, BRW_MASK_DISABLE );
         /* Derive the other three seed registers by adding the y multiplier
            (0xD0BD) and/or the z multiplier (0x9B93) used above: wtmp[ 1 ]
            adds y, wtmp[ 2 ] adds z, wtmp[ 3 ] adds both.  wtmp[ 0..1 ] feed
            the "rear" face below and wtmp[ 2..3 ] the "front" face. */
1918     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1919     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1920     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1921
1922     /* We're now ready to perform the hashing.  The eight hashes are
1923        interleaved for performance.  The hash function used is
1924        designed to rapidly achieve avalanche and require only 16x16
1925        bit multiplication, and 8-bit swizzles (which we get for
1926        free). */
1927     for( i = 0; i < 4; i++ )
1928         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1929     for( i = 0; i < 4; i++ )
1930         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1931                  odd_bytes( wtmp[ i ] ) );
1932     for( i = 0; i < 4; i++ )
1933         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1934     for( i = 0; i < 4; i++ )
1935         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1936                  odd_bytes( wtmp[ i ] ) );
1937     brw_pop_insn_state( p );
1938
1939     /* Now we want to initialise the four rear gradients based on the
1940        hashes.  Format conversion from signed integer to float leaves
1941        everything scaled too high by a factor of pow( 2, 15 ), but
1942        we correct for that right at the end. */
1943     /* x component */
         /* t = x - 1, the x offset used for the two x1 corners. */
1944     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1945     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1946     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1947     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1948     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1949
         /* Shift the packed hashes in place so that the next read of the same
            registers (the y component) sees different hash bits. */
1950     brw_push_insn_state( p );
1951     brw_set_mask_control( p, BRW_MASK_DISABLE );
1952     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1953     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1954     brw_pop_insn_state( p );
1955
1956     brw_MUL( p, x1y0, x1y0, t );
1957     brw_MUL( p, x1y1, x1y1, t );
         /* t is finished with for x; reload it with y - 1 for the y component. */
1958     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1959     brw_MUL( p, x0y0, x0y0, param0 );
1960     brw_MUL( p, x0y1, x0y1, param0 );
1961
1962     /* y component */
1963     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1964     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1965     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1966     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1967
         /* Shift again so the z component below reads yet another set of bits. */
1968     brw_push_insn_state( p );
1969     brw_set_mask_control( p, BRW_MASK_DISABLE );
1970     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1971     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1972     brw_pop_insn_state( p );
1973
1974     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1975     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* Reload t with x - 1 now: it is not needed again for the rear face
            (whose z component uses param2 directly) and is next read by the
            x component of the front face further down. */
1976     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1977     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1978     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1979
1980     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1981     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1982     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1983     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1984
1985     /* z component */
1986     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1987     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1988     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1989     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1990
         /* All four rear corners use the z offset itself (param2), not z - 1. */
1991     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1992     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1993     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1994     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1995
1996     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1997     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1998     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1999     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2000
2001     /* We interpolate between the gradients using the polynomial
2002        6t^5 - 15t^4 + 10t^3 (Perlin). */
         /* Evaluated as ((6t - 15) * t + 10) * t * t * t, for x, y and z in
            step.  The results stay in xi, yi and zi and are used again for
            the front face. */
2003     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
2004     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2005     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2006     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
2007     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2008     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2009     brw_MUL( p, xi, xi, param0 );
2010     brw_MUL( p, yi, yi, param1 );
2011     brw_MUL( p, zi, zi, param2 );
2012     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2013     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2014     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
         /* These two subtractions belong to the y interpolation below; after
            them x0y1 and x1y1 hold differences rather than corner values. */
2015     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2016     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2017     brw_MUL( p, xi, xi, param0 );
2018     brw_MUL( p, yi, yi, param1 );
2019     brw_MUL( p, zi, zi, param2 );
2020     brw_MUL( p, xi, xi, param0 );
2021     brw_MUL( p, yi, yi, param1 );
2022     brw_MUL( p, zi, zi, param2 );
2023     brw_MUL( p, xi, xi, param0 );
2024     brw_MUL( p, yi, yi, param1 );
2025     brw_MUL( p, zi, zi, param2 );
2026
2027     /* Here we interpolate in the y dimension... */
         /* (the y differences were already formed among the polynomial
            instructions above, so only the scale and add remain) */
2028     brw_MUL( p, x0y1, x0y1, yi );
2029     brw_MUL( p, x1y1, x1y1, yi );
2030     brw_ADD( p, x0y0, x0y0, x0y1 );
2031     brw_ADD( p, x1y0, x1y0, x1y1 );
2032
2033     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
         /* tmp[ 0 ] aliases wtmp[ 0 ], so the rear-face hashes are destroyed
            here; they are not read again. */
2034     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2035     brw_MUL( p, x1y0, x1y0, xi );
2036     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2037
2038     /* Now do the same thing for the front four gradients... */
2039     /* x component */
         /* t still holds x - 1 from the reload during the rear y component. */
2040     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2041     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2042     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2043     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2044
2045     brw_push_insn_state( p );
2046     brw_set_mask_control( p, BRW_MASK_DISABLE );
2047     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2048     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2049     brw_pop_insn_state( p );
2050
2051     brw_MUL( p, x1y0, x1y0, t );
2052     brw_MUL( p, x1y1, x1y1, t );
         /* t = y - 1 for the y component. */
2053     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2054     brw_MUL( p, x0y0, x0y0, param0 );
2055     brw_MUL( p, x0y1, x0y1, param0 );
2056
2057     /* y component */
2058     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2059     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2060     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2061     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2062
2063     brw_push_insn_state( p );
2064     brw_set_mask_control( p, BRW_MASK_DISABLE );
2065     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2066     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2067     brw_pop_insn_state( p );
2068
2069     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2070     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* t = z - 1: unlike the rear face, every front corner uses z - 1. */
2071     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2072     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2073     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2074
2075     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2076     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2077     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2078     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2079
2080     /* z component */
2081     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2082     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2083     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2084     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2085
2086     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2087     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2088     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2089     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2090
2091     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2092     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2093     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2094     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2095
2096     /* The interpolation coefficients are still around from last time, so
2097        again interpolate in the y dimension... */
2098     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2099     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2100     brw_MUL( p, x0y1, x0y1, yi );
2101     brw_MUL( p, x1y1, x1y1, yi );
2102     brw_ADD( p, x0y0, x0y0, x0y1 );
2103     brw_ADD( p, x1y0, x1y0, x1y1 );
2104
2105     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2106        time put the front face in tmp[ 1 ] and we're nearly there... */
2107     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2108     brw_MUL( p, x1y0, x1y0, xi );
2109     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2110
2111     /* The final interpolation, in the z dimension: */
2112     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2113     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2114     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2115
2116     /* scale by pow( 2, -15 ), as described above */
         /* The scaled value overwrites param0, which is where the result is
            left once the temporaries below are released. */
2117     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2118
2119     release_tmps( c, mark );
2120 }
2121
2122 static void emit_noise3( struct brw_wm_compile *c,
2123                          const struct prog_instruction *inst )
2124 {
         /* Emit the code for one three-coordinate noise instruction.
          *
          * c    - compile state; temporaries are allocated from it and the
          *        generated instructions are appended to c->func.
          * inst - instruction being translated; its source operand,
          *        DstReg.WriteMask and SaturateMode are consulted.
          *
          * The coordinates are copied into three fresh temporaries, the
          * SUB_NOISE3 subroutine (noise3_sub) is invoked, and param0 is then
          * copied into every destination channel enabled in the write mask.
          * Nothing is returned. */
2125     struct brw_compile *p = &c->func;
2126     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2127     GLuint mask = inst->DstReg.WriteMask;
2128     int i;
2129     int mark = mark_tmps( c );
2130
         /* The temporary mark must be 0 on entry: noise3_sub looks its
            arguments up by temporary index relative to its own mark. */
2131     assert( mark == 0 );
2132
         /* NOTE(review): the last two arguments look like (source operand
            index, channel), i.e. channels 0..2 of operand 0 -- confirm
            against get_src_reg. */
2133     src0 = get_src_reg( c, inst, 0, 0 );
2134     src1 = get_src_reg( c, inst, 0, 1 );
2135     src2 = get_src_reg( c, inst, 0, 2 );
2136
         /* Keep this allocation order: noise3_sub reads the arguments back
            from indices mark - 4, mark - 3 and mark - 2 of its own mark. */
2137     param0 = alloc_tmp( c );
2138     param1 = alloc_tmp( c );
2139     param2 = alloc_tmp( c );
2140
         /* Work on copies: noise3_sub overwrites its argument registers. */
2141     brw_MOV( p, param0, src0 );
2142     brw_MOV( p, param1, src1 );
2143     brw_MOV( p, param2, src2 );
2144
2145     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2146
2147     /* Fill in the result: */
         /* noise3_sub writes its final value to its first argument, so param0
            is read back as the result.  Saturation is enabled for the result
            MOVs only when the instruction asks for SATURATE_ZERO_ONE. */
2148     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2149     for (i = 0 ; i < 4; i++) {
2150         if (mask & (1<<i)) {
2151             dst = get_dst_reg(c, inst, i);
2152             brw_MOV( p, dst, param0 );
2153         }
2154     }
         /* Switch saturation back off if it was enabled above. */
2155     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2156         brw_set_saturate( p, 0 );
2157
         /* Drop param0..param2 by rewinding to the mark taken on entry. */
2158     release_tmps( c, mark );
2159 }
2160
2161 /**
2162  * For the four-dimensional case, the little micro-optimisation benefits
2163  * we obtain by unrolling all the loops aren't worth the massive bloat it
2164  * now causes.  Instead, we loop twice around performing a similar operation
2165  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2166  * code to glue it all together.
2167  */
2168 static void noise4_sub( struct brw_wm_compile *c )
2169 {
2170     struct brw_compile *p = &c->func;
2171     struct brw_reg param[ 4 ],
2172         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2173         w0, /* noise for the w=0 cube */
2174         floors[ 2 ], /* integer coordinates of base corner of hypercube */
2175         interp[ 4 ], /* interpolation coefficients */
2176         t, tmp[ 8 ], /* float temporaries */
2177         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2178         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2179     int i, j;
2180     int mark = mark_tmps( c );
2181     GLuint loop, origin;
2182
2183     x0y0 = alloc_tmp( c );
2184     x0y1 = alloc_tmp( c );
2185     x1y0 = alloc_tmp( c );
2186     x1y1 = alloc_tmp( c );
2187     t = alloc_tmp( c );
2188     w0 = alloc_tmp( c );
2189     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2190     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2191
2192     for( i = 0; i < 4; i++ ) {
2193         param[ i ] = lookup_tmp( c, mark - 5 + i );
2194         interp[ i ] = alloc_tmp( c );
2195     }
2196
2197     for( i = 0; i < 8; i++ ) {
2198         tmp[ i ] = alloc_tmp( c );
2199         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2200         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2201     }
2202
2203     brw_set_access_mode( p, BRW_ALIGN_1 );
2204
2205     /* We only want 16 bits of precision from the integral part of each
2206        co-ordinate, but unfortunately the RNDD semantics would saturate
2207        at 16 bits if we performed the operation directly to a 16-bit
2208        destination.  Therefore, we round to 32-bit temporaries where
2209        appropriate, and then store only the lower 16 bits. */
2210     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2211     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2212     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2213     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2214     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2215     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2216
2217     /* Modify the flag register here, because the side effect is useful
2218        later (see below).  We know for certain that all flags will be
2219        cleared, since the FRC instruction cannot possibly generate
2220        negative results.  Even for exceptional inputs (infinities, denormals,
2221        NaNs), the architecture guarantees that the L conditional is false. */
2222     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2223     brw_FRC( p, param[ 0 ], param[ 0 ] );
2224     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2225     for( i = 1; i < 4; i++ )
2226         brw_FRC( p, param[ i ], param[ i ] );
2227
2228     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2229        of all. */
2230     for( i = 0; i < 4; i++ )
2231         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2232     for( i = 0; i < 4; i++ )
2233         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2234     for( i = 0; i < 4; i++ )
2235         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2236     for( i = 0; i < 4; i++ )
2237         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2238     for( j = 0; j < 3; j++ )
2239         for( i = 0; i < 4; i++ )
2240             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2241
2242     /* Mark the current address, as it will be a jump destination.  The
2243        following code will be executed twice: first, with the flag
2244        register clear indicating the w=0 case, and second with flags
2245        set for w=1. */
2246     loop = p->nr_insn;
2247
2248     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2249        be hashed.  Since we have only 16 bits of precision in the hash, we
2250        must be careful about thorough mixing to maintain entropy as we
2251        squash the input vector into a small scalar. */
2252     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2253              brw_imm_uw( 0xBC8F ) );
2254     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2255              brw_imm_uw( 0xD0BD ) );
2256     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2257              brw_imm_uw( 0x9B93 ) );
2258     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2259              brw_imm_uw( 0xA359 ) );
2260     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2261              brw_imm_uw( 0xBC8F ) );
2262
2263     /* Temporarily disable the execution mask while we work with ExecSize=16
2264        channels (the mask is set for ExecSize=8 and is probably incorrect).
2265        Although this might cause execution of unwanted channels, the code
2266        writes only to temporary registers and has no side effects, so
2267        disabling the mask is harmless. */
2268     brw_push_insn_state( p );
2269     brw_set_mask_control( p, BRW_MASK_DISABLE );
2270     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2271     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2272     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2273
2274     /* We're now ready to perform the hashing.  The eight hashes are
2275        interleaved for performance.  The hash function used is
2276        designed to rapidly achieve avalanche and require only 16x16
2277        bit multiplication, and 8-bit swizzles (which we get for
2278        free). */
2279     for( i = 0; i < 4; i++ )
2280         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2281     for( i = 0; i < 4; i++ )
2282         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2283                  odd_bytes( wtmp[ i ] ) );
2284     for( i = 0; i < 4; i++ )
2285         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2286     for( i = 0; i < 4; i++ )
2287         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2288                  odd_bytes( wtmp[ i ] ) );
2289     brw_pop_insn_state( p );
2290
2291     /* Now we want to initialise the four rear gradients based on the
2292        hashes.  Format conversion from signed integer to float leaves
2293        everything scaled too high by a factor of pow( 2, 15 ), but
2294        we correct for that right at the end. */
2295     /* x component */
2296     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2297     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2298     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2299     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2300     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2301
2302     brw_push_insn_state( p );
2303     brw_set_mask_control( p, BRW_MASK_DISABLE );
2304     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2305     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2306     brw_pop_insn_state( p );
2307
2308     brw_MUL( p, x1y0, x1y0, t );
2309     brw_MUL( p, x1y1, x1y1, t );
2310     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2311     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2312     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2313
2314     /* y component */
2315     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2316     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2317     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2318     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2319
2320     brw_push_insn_state( p );
2321     brw_set_mask_control( p, BRW_MASK_DISABLE );
2322     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2323     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2324     brw_pop_insn_state( p );
2325
2326     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2327     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2328     /* prepare t for the w component (used below): w the first time through
2329        the loop; w - 1 the second time) */
2330     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2331     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2332     p->current->header.predicate_inverse = 1;
2333     brw_MOV( p, t, param[ 3 ] );
2334     p->current->header.predicate_inverse = 0;
2335     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2336     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2337     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2338
2339     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2340     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2341     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2342     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2343
2344     /* z component */
2345     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2346     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2347     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2348     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2349
2350     brw_push_insn_state( p );
2351     brw_set_mask_control( p, BRW_MASK_DISABLE );
2352     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2353     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2354     brw_pop_insn_state( p );
2355
2356     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2357     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2358     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2359     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2360
2361     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2362     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2363     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2364     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2365
2366     /* w component */
2367     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2368     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2369     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2370     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2371
2372     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2373     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2374     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2375     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2376     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2377
2378     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2379     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2380     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2381     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2382
2383     /* Here we interpolate in the y dimension... */
2384     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2385     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2386     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2387     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2388     brw_ADD( p, x0y0, x0y0, x0y1 );
2389     brw_ADD( p, x1y0, x1y0, x1y1 );
2390
2391     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2392     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2393     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2394     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2395
2396     /* Now do the same thing for the front four gradients... */
2397     /* x component */
2398     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2399     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2400     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2401     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2402
2403     brw_push_insn_state( p );
2404     brw_set_mask_control( p, BRW_MASK_DISABLE );
2405     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2406     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2407     brw_pop_insn_state( p );
2408
2409     brw_MUL( p, x1y0, x1y0, t );
2410     brw_MUL( p, x1y1, x1y1, t );
2411     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2412     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2413     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2414
2415     /* y component */
2416     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2417     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2418     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2419     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2420
2421     brw_push_insn_state( p );
2422     brw_set_mask_control( p, BRW_MASK_DISABLE );
2423     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2424     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2425     brw_pop_insn_state( p );
2426
2427     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2428     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2429     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2430     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2431     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2432
2433     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2434     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2435     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2436     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2437
2438     /* z component */
2439     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2440     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2441     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2442     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2443
2444     brw_push_insn_state( p );
2445     brw_set_mask_control( p, BRW_MASK_DISABLE );
2446     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2447     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2448     brw_pop_insn_state( p );
2449
2450     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2451     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2452     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2453     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2454     /* prepare t for the w component (used below): w the first time through
2455        the loop; w - 1 the second time) */
2456     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2457     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2458     p->current->header.predicate_inverse = 1;
2459     brw_MOV( p, t, param[ 3 ] );
2460     p->current->header.predicate_inverse = 0;
2461     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2462
2463     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2464     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2465     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2466     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2467
2468     /* w component */
2469     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2470     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2471     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2472     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2473
2474     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2475     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2476     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2477     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2478
2479     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2480     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2481     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2482     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2483
2484     /* Interpolate in the y dimension: */
2485     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2486     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2487     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2488     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2489     brw_ADD( p, x0y0, x0y0, x0y1 );
2490     brw_ADD( p, x1y0, x1y0, x1y1 );
2491
2492     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2493        time put the front face in tmp[ 1 ] and we're nearly there... */
2494     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2495     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2496     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2497
2498     /* Another interpolation, in the z dimension: */
2499     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2500     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2501     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2502
2503     /* Exit the loop if we've computed both cubes... */
2504     origin = p->nr_insn;
2505     brw_push_insn_state( p );
2506     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2507     brw_set_mask_control( p, BRW_MASK_DISABLE );
2508     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2509     brw_pop_insn_state( p );
2510
2511     /* Save the result for the w=0 case, and increment the w coordinate: */
2512     brw_MOV( p, w0, tmp[ 0 ] );
2513     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2514              brw_imm_uw( 1 ) );
2515
2516     /* Loop around for the other cube.  Explicitly set the flag register
2517        (unfortunately we must spend an extra instruction to do this: we
2518        can't rely on a side effect of the previous MOV or ADD because
2519        conditional modifiers which are normally true might be false in
2520        exceptional circumstances, e.g. given a NaN input; the add to
2521        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2522     brw_push_insn_state( p );
2523     brw_set_mask_control( p, BRW_MASK_DISABLE );
2524     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2525     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2526              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2527     brw_pop_insn_state( p );
2528
2529     /* Patch the previous conditional branch now that we know the
2530        destination address. */
2531     brw_set_src1( p->store + origin,
2532                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2533
2534     /* The very last interpolation. */
2535     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2536     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2537     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2538
2539     /* scale by pow( 2, -15 ), as described above */
2540     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2541
2542     release_tmps( c, mark );
2543 }
2544
2545 static void emit_noise4( struct brw_wm_compile *c,
2546                          const struct prog_instruction *inst )
2547 {
2548     struct brw_compile *p = &c->func;
2549     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2550     GLuint mask = inst->DstReg.WriteMask;
2551     int i;
2552     int mark = mark_tmps( c );
2553
2554     assert( mark == 0 );
2555
2556     src0 = get_src_reg( c, inst, 0, 0 );
2557     src1 = get_src_reg( c, inst, 0, 1 );
2558     src2 = get_src_reg( c, inst, 0, 2 );
2559     src3 = get_src_reg( c, inst, 0, 3 );
2560
2561     param0 = alloc_tmp( c );
2562     param1 = alloc_tmp( c );
2563     param2 = alloc_tmp( c );
2564     param3 = alloc_tmp( c );
2565
2566     brw_MOV( p, param0, src0 );
2567     brw_MOV( p, param1, src1 );
2568     brw_MOV( p, param2, src2 );
2569     brw_MOV( p, param3, src3 );
2570
2571     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2572
2573     /* Fill in the result: */
2574     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2575     for (i = 0 ; i < 4; i++) {
2576         if (mask & (1<<i)) {
2577             dst = get_dst_reg(c, inst, i);
2578             brw_MOV( p, dst, param0 );
2579         }
2580     }
2581     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2582         brw_set_saturate( p, 0 );
2583
2584     release_tmps( c, mark );
2585 }
2586
2587 static void emit_wpos_xy(struct brw_wm_compile *c,
2588                          const struct prog_instruction *inst)
2589 {
2590     struct brw_compile *p = &c->func;
2591     GLuint mask = inst->DstReg.WriteMask;
2592     struct brw_reg src0[2], dst[2];
2593
2594     dst[0] = get_dst_reg(c, inst, 0);
2595     dst[1] = get_dst_reg(c, inst, 1);
2596
2597     src0[0] = get_src_reg(c, inst, 0, 0);
2598     src0[1] = get_src_reg(c, inst, 0, 1);
2599
2600     /* Calculate the pixel offset from window bottom left into destination
2601      * X and Y channels.
2602      */
2603     if (mask & WRITEMASK_X) {
2604         /* X' = X - origin_x */
2605         brw_ADD(p,
2606                 dst[0],
2607                 retype(src0[0], BRW_REGISTER_TYPE_W),
2608                 brw_imm_d(0 - c->key.origin_x));
2609     }
2610
2611     if (mask & WRITEMASK_Y) {
2612         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2613         brw_ADD(p,
2614                 dst[1],
2615                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2616                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2617     }
2618 }
2619
2620 /* TODO
2621    BIAS on SIMD8 not working yet...
2622  */
2623 static void emit_txb(struct brw_wm_compile *c,
2624                      const struct prog_instruction *inst)
2625 {
2626     struct brw_compile *p = &c->func;
2627     struct brw_reg dst[4], src[4], payload_reg;
2628     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2629     GLuint i;
2630
2631     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2632
2633     for (i = 0; i < 4; i++)
2634         dst[i] = get_dst_reg(c, inst, i);
2635     for (i = 0; i < 4; i++)
2636         src[i] = get_src_reg(c, inst, 0, i);
2637
2638     switch (inst->TexSrcTarget) {
2639         case TEXTURE_1D_INDEX:
2640             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2641             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2642             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2643             break;
2644         case TEXTURE_2D_INDEX:
2645         case TEXTURE_RECT_INDEX:
2646             brw_MOV(p, brw_message_reg(2), src[0]);
2647             brw_MOV(p, brw_message_reg(3), src[1]);
2648             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2649             break;
2650         default:
2651             brw_MOV(p, brw_message_reg(2), src[0]);
2652             brw_MOV(p, brw_message_reg(3), src[1]);
2653             brw_MOV(p, brw_message_reg(4), src[2]);
2654             break;
2655     }
2656     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2657     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2658     brw_SAMPLE(p,
2659                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2660                1,                                           /* msg_reg_nr */
2661                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2662                SURF_INDEX_TEXTURE(unit),
2663                unit,                                        /* sampler */
2664                inst->DstReg.WriteMask,                      /* writemask */
2665                BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
2666                4,                                           /* response_length */
2667                4,                                           /* msg_length */
2668                0);                                          /* eot */
2669 }
2670
2671
2672 static void emit_tex(struct brw_wm_compile *c,
2673                      const struct prog_instruction *inst)
2674 {
2675     struct brw_compile *p = &c->func;
2676     struct brw_reg dst[4], src[4], payload_reg;
2677     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2678     GLuint msg_len;
2679     GLuint i, nr;
2680     GLuint emit;
2681     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2682
2683     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2684
2685     for (i = 0; i < 4; i++)
2686         dst[i] = get_dst_reg(c, inst, i);
2687     for (i = 0; i < 4; i++)
2688         src[i] = get_src_reg(c, inst, 0, i);
2689
2690     switch (inst->TexSrcTarget) {
2691         case TEXTURE_1D_INDEX:
2692             emit = WRITEMASK_X;
2693             nr = 1;
2694             break;
2695         case TEXTURE_2D_INDEX:
2696         case TEXTURE_RECT_INDEX:
2697             emit = WRITEMASK_XY;
2698             nr = 2;
2699             break;
2700         default:
2701             emit = WRITEMASK_XYZ;
2702             nr = 3;
2703             break;
2704     }
2705     msg_len = 1;
2706
2707     /* move/load S, T, R coords */
2708     for (i = 0; i < nr; i++) {
2709         static const GLuint swz[4] = {0,1,2,2};
2710         if (emit & (1<<i))
2711             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2712         else
2713             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2714         msg_len += 1;
2715     }
2716
2717     if (shadow) {
2718        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2719        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2720     }
2721
2722     brw_SAMPLE(p,
2723                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2724                1,                                          /* msg_reg_nr */
2725                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2726                SURF_INDEX_TEXTURE(unit),
2727                unit,                                       /* sampler */
2728                inst->DstReg.WriteMask,                     /* writemask */
2729                BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
2730                4,                                          /* response_length */
2731                shadow ? 6 : 4,                             /* msg_length */
2732                0);                                         /* eot */
2733
2734     if (shadow)
2735         brw_MOV(p, dst[3], brw_imm_f(1.0));
2736 }
2737
2738
2739 /**
2740  * Resolve subroutine calls after code emit is done.
2741  */
2742 static void post_wm_emit( struct brw_wm_compile *c )
2743 {
2744     brw_resolve_cals(&c->func);
2745 }
2746
2747 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2748 {
2749 #define MAX_IFSN 32
2750 #define MAX_LOOP_DEPTH 32
2751     struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2752     struct brw_instruction *inst0, *inst1;
2753     int i, if_insn = 0, loop_insn = 0;
2754     struct brw_compile *p = &c->func;
2755     struct brw_indirect stack_index = brw_indirect(0, 0);
2756
2757     c->out_of_regs = GL_FALSE;
2758
2759     prealloc_reg(c);
2760     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2761     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2762
2763     for (i = 0; i < c->nr_fp_insns; i++) {
2764         const struct prog_instruction *inst = &c->prog_instructions[i];
2765
2766         c->cur_inst = i;
2767
2768 #if 0
2769         _mesa_printf("Inst %d: ", i);
2770         _mesa_print_instruction(inst);
2771 #endif
2772
2773         /* fetch any constants that this instruction needs */
2774         if (c->fp->use_const_buffer)
2775            fetch_constants(c, inst);
2776
2777         if (inst->CondUpdate)
2778             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2779         else
2780             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2781
2782         switch (inst->Opcode) {
2783             case WM_PIXELXY:
2784                 emit_pixel_xy(c, inst);
2785                 break;
2786             case WM_DELTAXY:
2787                 emit_delta_xy(c, inst);
2788                 break;
2789             case WM_PIXELW:
2790                 emit_pixel_w(c, inst);
2791                 break;
2792             case WM_LINTERP:
2793                 emit_linterp(c, inst);
2794                 break;
2795             case WM_PINTERP:
2796                 emit_pinterp(c, inst);
2797                 break;
2798             case WM_CINTERP:
2799                 emit_cinterp(c, inst);
2800                 break;
2801             case WM_WPOSXY:
2802                 emit_wpos_xy(c, inst);
2803                 break;
2804             case WM_FB_WRITE:
2805                 emit_fb_write(c, inst);
2806                 break;
2807             case WM_FRONTFACING:
2808                 emit_frontfacing(c, inst);
2809                 break;
2810             case OPCODE_ABS:
2811                 emit_abs(c, inst);
2812                 break;
2813             case OPCODE_ADD:
2814                 emit_add(c, inst);
2815                 break;
2816             case OPCODE_ARL:
2817                 emit_arl(c, inst);
2818                 break;
2819             case OPCODE_SUB:
2820                 emit_sub(c, inst);
2821                 break;
2822             case OPCODE_FRC:
2823                 emit_frc(c, inst);
2824                 break;
2825             case OPCODE_FLR:
2826                 emit_flr(c, inst);
2827                 break;
2828             case OPCODE_LRP:
2829                 emit_lrp(c, inst);
2830                 break;
2831             case OPCODE_TRUNC:
2832                 emit_trunc(c, inst);
2833                 break;
2834             case OPCODE_MOV:
2835                 emit_mov(c, inst);
2836                 break;
2837             case OPCODE_DP3:
2838                 emit_dp3(c, inst);
2839                 break;
2840             case OPCODE_DP4:
2841                 emit_dp4(c, inst);
2842                 break;
2843             case OPCODE_XPD:
2844                 emit_xpd(c, inst);
2845                 break;
2846             case OPCODE_DPH:
2847                 emit_dph(c, inst);
2848                 break;
2849             case OPCODE_RCP:
2850                 emit_rcp(c, inst);
2851                 break;
2852             case OPCODE_RSQ:
2853                 emit_rsq(c, inst);
2854                 break;
2855             case OPCODE_SIN:
2856                 emit_sin(c, inst);
2857                 break;
2858             case OPCODE_COS:
2859                 emit_cos(c, inst);
2860                 break;
2861             case OPCODE_EX2:
2862                 emit_ex2(c, inst);
2863                 break;
2864             case OPCODE_LG2:
2865                 emit_lg2(c, inst);
2866                 break;
2867             case OPCODE_MIN:
2868             case OPCODE_MAX:
2869                 emit_min_max(c, inst);
2870                 break;
2871             case OPCODE_DDX:
2872                 emit_ddx(c, inst);
2873                 break;
2874             case OPCODE_DDY:
2875                 emit_ddy(c, inst);
2876                 break;
2877             case OPCODE_SLT:
2878                 emit_slt(c, inst);
2879                 break;
2880             case OPCODE_SLE:
2881                 emit_sle(c, inst);
2882                 break;
2883             case OPCODE_SGT:
2884                 emit_sgt(c, inst);
2885                 break;
2886             case OPCODE_SGE:
2887                 emit_sge(c, inst);
2888                 break;
2889             case OPCODE_SEQ:
2890                 emit_seq(c, inst);
2891                 break;
2892             case OPCODE_SNE:
2893                 emit_sne(c, inst);
2894                 break;
2895             case OPCODE_MUL:
2896                 emit_mul(c, inst);
2897                 break;
2898             case OPCODE_POW:
2899                 emit_pow(c, inst);
2900                 break;
2901             case OPCODE_MAD:
2902                 emit_mad(c, inst);
2903                 break;
2904             case OPCODE_NOISE1:
2905                 emit_noise1(c, inst);
2906                 break;
2907             case OPCODE_NOISE2:
2908                 emit_noise2(c, inst);
2909                 break;
2910             case OPCODE_NOISE3:
2911                 emit_noise3(c, inst);
2912                 break;
2913             case OPCODE_NOISE4:
2914                 emit_noise4(c, inst);
2915                 break;
2916             case OPCODE_TEX:
2917                 emit_tex(c, inst);
2918                 break;
2919             case OPCODE_TXB:
2920                 emit_txb(c, inst);
2921                 break;
2922             case OPCODE_KIL_NV:
2923                 emit_kil(c);
2924                 break;
2925             case OPCODE_IF:
2926                 assert(if_insn < MAX_IFSN);
2927                 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2928                 break;
2929             case OPCODE_ELSE:
2930                 if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
2931                 break;
2932             case OPCODE_ENDIF:
2933                 assert(if_insn > 0);
2934                 brw_ENDIF(p, if_inst[--if_insn]);
2935                 break;
2936             case OPCODE_BGNSUB:
2937                 brw_save_label(p, inst->Comment, p->nr_insn);
2938                 break;
2939             case OPCODE_ENDSUB:
2940                 /* no-op */
2941                 break;
2942             case OPCODE_CAL:
2943                 brw_push_insn_state(p);
2944                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2945                 brw_set_access_mode(p, BRW_ALIGN_1);
2946                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2947                 brw_set_access_mode(p, BRW_ALIGN_16);
2948                 brw_ADD(p, get_addr_reg(stack_index),
2949                          get_addr_reg(stack_index), brw_imm_d(4));
2950                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2951                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2952                 brw_pop_insn_state(p);
2953                 break;
2954
2955             case OPCODE_RET:
2956                 brw_push_insn_state(p);
2957                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2958                 brw_ADD(p, get_addr_reg(stack_index),
2959                         get_addr_reg(stack_index), brw_imm_d(-4));
2960                 brw_set_access_mode(p, BRW_ALIGN_1);
2961                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2962                 brw_set_access_mode(p, BRW_ALIGN_16);
2963                 brw_pop_insn_state(p);
2964
2965                 break;
2966             case OPCODE_BGNLOOP:
2967                 /* XXX may need to invalidate the current_constant regs */
2968                 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2969                 break;
2970             case OPCODE_BRK:
2971                 brw_BREAK(p);
2972                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2973                 break;
2974             case OPCODE_CONT:
2975                 brw_CONT(p);
2976                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2977                 break;
2978             case OPCODE_ENDLOOP:
2979                 loop_insn--;
2980                 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2981                 /* patch all the BREAK instructions from
2982                    last BEGINLOOP */
2983                 while (inst0 > loop_inst[loop_insn]) {
2984                     inst0--;
2985                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2986                         inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2987                         inst0->bits3.if_else.pop_count = 0;
2988                     } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2989                         inst0->bits3.if_else.jump_count = inst1 - inst0;
2990                         inst0->bits3.if_else.pop_count = 0;
2991                     }
2992                 }
2993                 break;
2994             default:
2995                 _mesa_printf("unsupported IR in fragment shader %d\n",
2996                         inst->Opcode);
2997         }
2998
2999         if (inst->CondUpdate)
3000             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3001         else
3002             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3003     }
3004     post_wm_emit(c);
3005 }
3006
3007
3008 /**
3009  * Do GPU code generation for shaders that use GLSL features such as
3010  * flow control.  Other shaders will be compiled with the
3011  */
3012 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3013 {
3014     if (INTEL_DEBUG & DEBUG_WM) {
3015         _mesa_printf("brw_wm_glsl_emit:\n");
3016     }
3017
3018     /* initial instruction translation/simplification */
3019     brw_wm_pass_fp(c);
3020
3021     /* actual code generation */
3022     brw_wm_emit_glsl(brw, c);
3023
3024     if (INTEL_DEBUG & DEBUG_WM) {
3025         brw_wm_print_program(c, "brw_wm_glsl_emit done");
3026     }
3027
3028     c->prog_data.total_grf = num_grf_used(c);
3029     c->prog_data.total_scratch = 0;
3030 }