src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
  14                                   const struct prog_instruction *inst,
  15                                   GLuint component);
  16
  17 /**
  18  * Determine if the given fragment program uses GLSL features such
  19  * as flow conditionals, loops, subroutines.
  20  * Some GLSL shaders may use these features, others might not.
  21  */
  22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  23 {
  24     int i;
  25
  26     for (i = 0; i < fp->Base.NumInstructions; i++) {
  27         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  28         switch (inst->Opcode) {
  29             case OPCODE_ARL:
  30             case OPCODE_IF:
  31             case OPCODE_ENDIF:
  32             case OPCODE_CAL:
  33             case OPCODE_BRK:
  34             case OPCODE_RET:
  35             case OPCODE_NOISE1:
  36             case OPCODE_NOISE2:
  37             case OPCODE_NOISE3:
  38             case OPCODE_NOISE4:
  39             case OPCODE_BGNLOOP:
  40                 return GL_TRUE;
  41             default:
  42                 break;
  43         }
  44     }
  45     return GL_FALSE;
  46 }
  47
  48
  49
  50 static void
  51 reclaim_temps(struct brw_wm_compile *c);
  52
  53
  54 /** Mark GRF register as used. */
  55 static void
  56 prealloc_grf(struct brw_wm_compile *c, int r)
  57 {
  58    c->used_grf[r] = GL_TRUE;
  59 }
  60
  61
  62 /** Mark given GRF register as not in use. */
  63 static void
  64 release_grf(struct brw_wm_compile *c, int r)
  65 {
  66    /*assert(c->used_grf[r]);*/
  67    c->used_grf[r] = GL_FALSE;
  68    c->first_free_grf = MIN2(c->first_free_grf, r);
  69 }
  70
  71
  72 /** Return index of a free GRF, mark it as used. */
  73 static int
  74 alloc_grf(struct brw_wm_compile *c)
  75 {
  76    GLuint r;
  77    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  78       if (!c->used_grf[r]) {
  79          c->used_grf[r] = GL_TRUE;
  80          c->first_free_grf = r + 1;  /* a guess */
  81          return r;
  82       }
  83    }
  84
  85    /* no free temps, try to reclaim some */
  86    reclaim_temps(c);
  87    c->first_free_grf = 0;
  88
  89    /* try alloc again */
  90    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  91       if (!c->used_grf[r]) {
  92          c->used_grf[r] = GL_TRUE;
  93          c->first_free_grf = r + 1;  /* a guess */
  94          return r;
  95       }
  96    }
  97
  98    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  99       assert(c->used_grf[r]);
 100    }
 101
 102    /* really, no free GRF regs found */
 103    if (!c->out_of_regs) {
 104       /* print warning once per compilation */
 105       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 106       c->out_of_regs = GL_TRUE;
 107    }
 108
 109    return -1;
 110 }
 111
 112
 113 /** Return number of GRF registers used */
 114 static int
 115 num_grf_used(const struct brw_wm_compile *c)
 116 {
 117    int r;
 118    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 119       if (c->used_grf[r])
 120          return r + 1;
 121    return 0;
 122 }
 123
 124
 125
 126 /**
 127  * Record the mapping of a Mesa register to a hardware register.
 128  */
 129 static void set_reg(struct brw_wm_compile *c, int file, int index,
 130         int component, struct brw_reg reg)
 131 {
 132     c->wm_regs[file][index][component].reg = reg;
 133     c->wm_regs[file][index][component].inited = GL_TRUE;
 134 }
 135
 136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 137 {
 138     struct brw_reg reg;
 139
 140     /* if we need to allocate another temp, grow the tmp_regs[] array */
 141     if (c->tmp_index == c->tmp_max) {
 142        int r = alloc_grf(c);
 143        if (r < 0) {
 144           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 145           r = 50; /* XXX random register! */
 146        }
 147        c->tmp_regs[ c->tmp_max++ ] = r;
 148     }
 149
 150     /* form the GRF register */
 151     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 152     /*printf("alloc_temp %d\n", reg.nr);*/
 153     assert(reg.nr < BRW_WM_MAX_GRF);
 154     return reg;
 155
 156 }
 157
 158 /**
 159  * Save current temp register info.
 160  * There must be a matching call to release_tmps().
 161  */
 162 static int mark_tmps(struct brw_wm_compile *c)
 163 {
 164     return c->tmp_index;
 165 }
 166
 167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 168 {
 169     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 170 }
 171
 172 static void release_tmps(struct brw_wm_compile *c, int mark)
 173 {
 174     c->tmp_index = mark;
 175 }
 176
 177 /**
 178  * Convert Mesa src register to brw register.
 179  *
 180  * Since we're running in SOA mode each Mesa register corresponds to four
 181  * hardware registers.  We allocate the hardware registers as needed here.
 182  *
 183  * \param file  register file, one of PROGRAM_x
 184  * \param index  register number
 185  * \param component  src component (X=0, Y=1, Z=2, W=3)
 186  * \param nr  not used?!?
 187  * \param neg  negate value?
 188  * \param abs  take absolute value?
 189  */
 190 static struct brw_reg
 191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 192         int nr, GLuint neg, GLuint abs)
 193 {
 194     struct brw_reg reg;
 195     switch (file) {
 196         case PROGRAM_STATE_VAR:
 197         case PROGRAM_CONSTANT:
 198         case PROGRAM_UNIFORM:
 199             file = PROGRAM_STATE_VAR;
 200             break;
 201         case PROGRAM_UNDEFINED:
 202             return brw_null_reg();
 203         case PROGRAM_TEMPORARY:
 204         case PROGRAM_INPUT:
 205         case PROGRAM_OUTPUT:
 206         case PROGRAM_PAYLOAD:
 207             break;
 208         default:
 209             _mesa_problem(NULL, "Unexpected file in get_reg()");
 210             return brw_null_reg();
 211     }
 212
 213     assert(index < 256);
 214     assert(component < 4);
 215
 216     /* see if we've already allocated a HW register for this Mesa register */
 217     if (c->wm_regs[file][index][component].inited) {
 218        /* yes, re-use */
 219        reg = c->wm_regs[file][index][component].reg;
 220     }
 221     else {
 222         /* no, allocate new register */
 223        int grf = alloc_grf(c);
 224        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 225        if (grf < 0) {
 226           /* totally out of temps */
 227           grf = 51; /* XXX random register! */
 228        }
 229
 230        reg = brw_vec8_grf(grf, 0);
 231        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 232
 233        set_reg(c, file, index, component, reg);
 234     }
 235
 236     if (neg & (1 << component)) {
 237         reg = negate(reg);
 238     }
 239     if (abs)
 240         reg = brw_abs(reg);
 241     return reg;
 242 }
 243
 244
 245
 246 /**
 247  * This is called if we run out of GRF registers.  Examine the live intervals
 248  * of temp regs in the program and free those which won't be used again.
 249  */
 250 static void
 251 reclaim_temps(struct brw_wm_compile *c)
 252 {
 253    GLint intBegin[MAX_PROGRAM_TEMPS];
 254    GLint intEnd[MAX_PROGRAM_TEMPS];
 255    int index;
 256
 257    /*printf("Reclaim temps:\n");*/
 258
 259    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 260                              intBegin, intEnd);
 261
 262    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 263       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 264          /* program temp[i] can be freed */
 265          int component;
 266          /*printf("  temp[%d] is dead\n", index);*/
 267          for (component = 0; component < 4; component++) {
 268             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 269                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 270                release_grf(c, r);
 271                /*
 272                printf("  Reclaim temp %d, reg %d at inst %d\n",
 273                       index, r, c->cur_inst);
 274                */
 275                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 276             }
 277          }
 278       }
 279    }
 280 }
 281
 282
 283
 284
 285 /**
 286  * Preallocate registers.  This sets up the Mesa to hardware register
 287  * mapping for certain registers, such as constants (uniforms/state vars)
 288  * and shader inputs.
 289  */
 290 static void prealloc_reg(struct brw_wm_compile *c)
 291 {
 292     int i, j;
 293     struct brw_reg reg;
 294     int urb_read_length = 0;
 295     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
 296     GLuint reg_index = 0;
 297
 298     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 299     c->first_free_grf = 0;
 300
 301     for (i = 0; i < 4; i++) {
 302         if (i < c->key.nr_depth_regs)
 303             reg = brw_vec8_grf(i * 2, 0);
 304         else
 305             reg = brw_vec8_grf(0, 0);
 306         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 307     }
 308     reg_index += 2 * c->key.nr_depth_regs;
 309
 310     /* constants */
 311     {
 312         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 313         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 314
 315         /* use a real constant buffer, or just use a section of the GRF? */
 316         /* XXX this heuristic may need adjustment... */
 317         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 318            c->fp->use_const_buffer = GL_TRUE;
 319         else
 320            c->fp->use_const_buffer = GL_FALSE;
 321         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 322
 323         if (c->fp->use_const_buffer) {
 324            /* We'll use a real constant buffer and fetch constants from
 325             * it with a dataport read message.
 326             */
 327
 328            /* number of float constants in CURBE */
 329            c->prog_data.nr_params = 0;
 330         }
 331         else {
 332            const struct gl_program_parameter_list *plist =
 333               c->fp->program.Base.Parameters;
 334            int index = 0;
 335
 336            /* number of float constants in CURBE */
 337            c->prog_data.nr_params = 4 * nr_params;
 338
 339            /* loop over program constants (float[4]) */
 340            for (i = 0; i < nr_params; i++) {
 341               /* loop over XYZW channels */
 342               for (j = 0; j < 4; j++, index++) {
 343                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 344                  /* Save pointer to parameter/constant value.
 345                   * Constants will be copied in prepare_constant_buffer()
 346                   */
 347                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 348                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 349               }
 350            }
 351            /* number of constant regs used (each reg is float[8]) */
 352            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 353            reg_index += c->nr_creg;
 354         }
 355     }
 356
 357     /* fragment shader inputs */
 358     for (i = 0; i < VERT_RESULT_MAX; i++) {
 359        int fp_input;
 360
 361        if (i >= VERT_RESULT_VAR0)
 362           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 363        else if (i <= VERT_RESULT_TEX7)
 364           fp_input = i;
 365        else
 366           fp_input = -1;
 367
 368        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 369           urb_read_length = reg_index;
 370           reg = brw_vec8_grf(reg_index, 0);
 371           for (j = 0; j < 4; j++)
 372              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 373        }
 374        if (c->key.vp_outputs_written & (1 << i)) {
 375           reg_index += 2;
 376        }
 377     }
 378
 379     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 380     c->prog_data.urb_read_length = urb_read_length;
 381     c->prog_data.curb_read_length = c->nr_creg;
 382     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 383     reg_index++;
 384     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 385     reg_index += 2;
 386
 387     /* mark GRF regs [0..reg_index-1] as in-use */
 388     for (i = 0; i < reg_index; i++)
 389        prealloc_grf(c, i);
 390
 391     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 392     prealloc_grf(c, 126);
 393     prealloc_grf(c, 127);
 394
 395     for (i = 0; i < c->nr_fp_insns; i++) {
 396         const struct prog_instruction *inst = &c->prog_instructions[i];
 397         struct brw_reg dst[4];
 398
 399         switch (inst->Opcode) {
 400         case OPCODE_TEX:
 401         case OPCODE_TXB:
 402             /* Allocate the channels of texture results contiguously,
 403              * since they are written out that way by the sampler unit.
 404              */
 405             for (j = 0; j < 4; j++) {
 406                 dst[j] = get_dst_reg(c, inst, j);
 407                 if (j != 0)
 408                     assert(dst[j].nr == dst[j - 1].nr + 1);
 409             }
 410             break;
 411         default:
 412             break;
 413         }
 414     }
 415
 416     /* An instruction may reference up to three constants.
 417      * They'll be found in these registers.
 418      * XXX alloc these on demand!
 419      */
 420     if (c->fp->use_const_buffer) {
 421        for (i = 0; i < 3; i++) {
 422           c->current_const[i].index = -1;
 423           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 424        }
 425     }
 426 #if 0
 427     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 428     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 429 #endif
 430 }
 431
 432
 433 /**
 434  * Check if any of the instruction's src registers are constants, uniforms,
 435  * or statevars.  If so, fetch any constants that we don't already have in
 436  * the three GRF slots.
 437  */
 438 static void fetch_constants(struct brw_wm_compile *c,
 439                             const struct prog_instruction *inst)
 440 {
 441    struct brw_compile *p = &c->func;
 442    GLuint i;
 443
 444    /* loop over instruction src regs */
 445    for (i = 0; i < 3; i++) {
 446       const struct prog_src_register *src = &inst->SrcReg[i];
 447       if (src->File == PROGRAM_STATE_VAR ||
 448           src->File == PROGRAM_CONSTANT ||
 449           src->File == PROGRAM_UNIFORM) {
 450          c->current_const[i].index = src->Index;
 451
 452 #if 0
 453          printf("  fetch const[%d] for arg %d into reg %d\n",
 454                 src->Index, i, c->current_const[i].reg.nr);
 455 #endif
 456
 457          /* need to fetch the constant now */
 458          brw_dp_READ_4(p,
 459                        c->current_const[i].reg,  /* writeback dest */
 460                        src->RelAddr,             /* relative indexing? */
 461                        16 * src->Index,          /* byte offset */
 462                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 463                        );
 464       }
 465    }
 466 }
 467
 468
 469 /**
 470  * Convert Mesa dst register to brw register.
 471  */
 472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 473                                   const struct prog_instruction *inst,
 474                                   GLuint component)
 475 {
 476     const int nr = 1;
 477     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 478             0, 0);
 479 }
 480
 481
 482 static struct brw_reg
 483 get_src_reg_const(struct brw_wm_compile *c,
 484                   const struct prog_instruction *inst,
 485                   GLuint srcRegIndex, GLuint component)
 486 {
 487    /* We should have already fetched the constant from the constant
 488     * buffer in fetch_constants().  Now we just have to return a
 489     * register description that extracts the needed component and
 490     * smears it across all eight vector components.
 491     */
 492    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 493    struct brw_reg const_reg;
 494
 495    assert(component < 4);
 496    assert(srcRegIndex < 3);
 497    assert(c->current_const[srcRegIndex].index != -1);
 498    const_reg = c->current_const[srcRegIndex].reg;
 499
 500    /* extract desired float from the const_reg, and smear */
 501    const_reg = stride(const_reg, 0, 1, 0);
 502    const_reg.subnr = component * 4;
 503
 504    if (src->Negate & (1 << component))
 505       const_reg = negate(const_reg);
 506    if (src->Abs)
 507       const_reg = brw_abs(const_reg);
 508
 509 #if 0
 510    printf("  form const[%d].%d for arg %d, reg %d\n",
 511           c->current_const[srcRegIndex].index,
 512           component,
 513           srcRegIndex,
 514           const_reg.nr);
 515 #endif
 516
 517    return const_reg;
 518 }
 519
 520
 521 /**
 522  * Convert Mesa src register to brw register.
 523  */
 524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 525                                   const struct prog_instruction *inst,
 526                                   GLuint srcRegIndex, GLuint channel)
 527 {
 528     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 529     const GLuint nr = 1;
 530     const GLuint component = GET_SWZ(src->Swizzle, channel);
 531
 532     /* Extended swizzle terms */
 533     if (component == SWIZZLE_ZERO) {
 534        return brw_imm_f(0.0F);
 535     }
 536     else if (component == SWIZZLE_ONE) {
 537        return brw_imm_f(1.0F);
 538     }
 539
 540     if (c->fp->use_const_buffer &&
 541         (src->File == PROGRAM_STATE_VAR ||
 542          src->File == PROGRAM_CONSTANT ||
 543          src->File == PROGRAM_UNIFORM)) {
 544        return get_src_reg_const(c, inst, srcRegIndex, component);
 545     }
 546     else {
 547        /* other type of source register */
 548        return get_reg(c, src->File, src->Index, component, nr,
 549                       src->Negate, src->Abs);
 550     }
 551 }
 552
 553
 554 /**
 555  * Same as \sa get_src_reg() but if the register is a literal, emit
 556  * a brw_reg encoding the literal.
 557  * Note that a brw instruction only allows one src operand to be a literal.
 558  * For instructions with more than one operand, only the second can be a
 559  * literal.  This means that we treat some literals as constants/uniforms
 560  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 561  *
 562  */
 563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 564                                       const struct prog_instruction *inst,
 565                                       GLuint srcRegIndex, GLuint channel)
 566 {
 567     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 568     if (src->File == PROGRAM_CONSTANT) {
 569        /* a literal */
 570        const int component = GET_SWZ(src->Swizzle, channel);
 571        const GLfloat *param =
 572           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 573        GLfloat value = param[component];
 574        if (src->Negate & (1 << channel))
 575           value = -value;
 576        if (src->Abs)
 577           value = FABSF(value);
 578 #if 0
 579        printf("  form immed value %f for chan %d\n", value, channel);
 580 #endif
 581        return brw_imm_f(value);
 582     }
 583     else {
 584        return get_src_reg(c, inst, srcRegIndex, channel);
 585     }
 586 }
 587
 588
 589 /**
 590  * Subroutines are minimal support for resusable instruction sequences.
 591  * They are implemented as simply as possible to minimise overhead: there
 592  * is no explicit support for communication between the caller and callee
 593  * other than saving the return address in a temporary register, nor is
 594  * there any automatic local storage.  This implies that great care is
 595  * required before attempting reentrancy or any kind of nested
 596  * subroutine invocations.
 597  */
 598 static void invoke_subroutine( struct brw_wm_compile *c,
 599                                enum _subroutine subroutine,
 600                                void (*emit)( struct brw_wm_compile * ) )
 601 {
 602     struct brw_compile *p = &c->func;
 603
 604     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 605
 606     if( c->subroutines[ subroutine ] ) {
 607         /* subroutine previously emitted: reuse existing instructions */
 608
 609         int mark = mark_tmps( c );
 610         struct brw_reg return_address = retype( alloc_tmp( c ),
 611                                                 BRW_REGISTER_TYPE_UD );
 612         int here = p->nr_insn;
 613
 614         brw_push_insn_state(p);
 615         brw_set_mask_control(p, BRW_MASK_DISABLE);
 616         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 617
 618         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 619                  brw_imm_d( ( c->subroutines[ subroutine ] -
 620                               here - 1 ) << 4 ) );
 621         brw_pop_insn_state(p);
 622
 623         release_tmps( c, mark );
 624     } else {
 625         /* previously unused subroutine: emit, and mark for later reuse */
 626
 627         int mark = mark_tmps( c );
 628         struct brw_reg return_address = retype( alloc_tmp( c ),
 629                                                 BRW_REGISTER_TYPE_UD );
 630         struct brw_instruction *calc;
 631         int base = p->nr_insn;
 632
 633         brw_push_insn_state(p);
 634         brw_set_mask_control(p, BRW_MASK_DISABLE);
 635         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 636         brw_pop_insn_state(p);
 637
 638         c->subroutines[ subroutine ] = p->nr_insn;
 639
 640         emit( c );
 641
 642         brw_push_insn_state(p);
 643         brw_set_mask_control(p, BRW_MASK_DISABLE);
 644         brw_MOV( p, brw_ip_reg(), return_address );
 645         brw_pop_insn_state(p);
 646
 647         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 648
 649         release_tmps( c, mark );
 650     }
 651 }
 652
 653 /* Workaround for using brw_wm_emit.c's emit functions, which expect
 654  * destination regs to be uniquely written.  Moves arguments out to
 655  * temporaries as necessary for instructions which use their destination as
 656  * a temporary.
 657  */
 658 static void
 659 unalias3(struct brw_wm_compile *c,
 660          void (*func)(struct brw_compile *c,
 661                       const struct brw_reg *dst,
 662                       GLuint mask,
 663                       const struct brw_reg *arg0,
 664                       const struct brw_reg *arg1,
 665                       const struct brw_reg *arg2),
 666          const struct brw_reg *dst,
 667          GLuint mask,
 668          const struct brw_reg *arg0,
 669          const struct brw_reg *arg1,
 670          const struct brw_reg *arg2)
 671 {
 672     struct brw_compile *p = &c->func;
 673     struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
 674     int i, j;
 675     int mark = mark_tmps(c);
 676
 677     for (j = 0; j < 4; j++) {
 678         tmp_arg0[j] = arg0[j];
 679         tmp_arg1[j] = arg1[j];
 680         tmp_arg2[j] = arg2[j];
 681     }
 682
 683     for (i = 0; i < 4; i++) {
 684         if (mask & (1<<i)) {
 685             for (j = 0; j < 4; j++) {
 686                 if (arg0[j].file == dst[i].file &&
 687                     dst[i].nr == arg0[j].nr) {
 688                     tmp_arg0[j] = alloc_tmp(c);
 689                     brw_MOV(p, tmp_arg0[j], arg0[j]);
 690                 }
 691                 if (arg1[j].file == dst[i].file &&
 692                     dst[i].nr == arg1[j].nr) {
 693                     tmp_arg1[j] = alloc_tmp(c);
 694                     brw_MOV(p, tmp_arg1[j], arg1[j]);
 695                 }
 696                 if (arg2[j].file == dst[i].file &&
 697                     dst[i].nr == arg2[j].nr) {
 698                     tmp_arg2[j] = alloc_tmp(c);
 699                     brw_MOV(p, tmp_arg2[j], arg2[j]);
 700                 }
 701             }
 702         }
 703     }
 704
 705     func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
 706
 707     release_tmps(c, mark);
 708 }
 709
 710 static void emit_pixel_xy(struct brw_wm_compile *c,
 711                           const struct prog_instruction *inst)
 712 {
 713     struct brw_reg r1 = brw_vec1_grf(1, 0);
 714     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 715
 716     struct brw_reg dst0, dst1;
 717     struct brw_compile *p = &c->func;
 718     GLuint mask = inst->DstReg.WriteMask;
 719
 720     dst0 = get_dst_reg(c, inst, 0);
 721     dst1 = get_dst_reg(c, inst, 1);
 722     /* Calculate pixel centers by adding 1 or 0 to each of the
 723      * micro-tile coordinates passed in r1.
 724      */
 725     if (mask & WRITEMASK_X) {
 726         brw_ADD(p,
 727                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 728                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 729                 brw_imm_v(0x10101010));
 730     }
 731
 732     if (mask & WRITEMASK_Y) {
 733         brw_ADD(p,
 734                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 735                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 736                 brw_imm_v(0x11001100));
 737     }
 738 }
 739
 740 static void emit_delta_xy(struct brw_wm_compile *c,
 741                           const struct prog_instruction *inst)
 742 {
 743     struct brw_reg r1 = brw_vec1_grf(1, 0);
 744     struct brw_reg dst0, dst1, src0, src1;
 745     struct brw_compile *p = &c->func;
 746     GLuint mask = inst->DstReg.WriteMask;
 747
 748     dst0 = get_dst_reg(c, inst, 0);
 749     dst1 = get_dst_reg(c, inst, 1);
 750     src0 = get_src_reg(c, inst, 0, 0);
 751     src1 = get_src_reg(c, inst, 0, 1);
 752     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 753      * centers.
 754      */
 755     if (mask & WRITEMASK_X) {
 756         brw_ADD(p,
 757                 dst0,
 758                 retype(src0, BRW_REGISTER_TYPE_UW),
 759                 negate(r1));
 760     }
 761
 762     if (mask & WRITEMASK_Y) {
 763         brw_ADD(p,
 764                 dst1,
 765                 retype(src1, BRW_REGISTER_TYPE_UW),
 766                 negate(suboffset(r1,1)));
 767
 768     }
 769 }
 770
 771 static void fire_fb_write( struct brw_wm_compile *c,
 772                            GLuint base_reg,
 773                            GLuint nr,
 774                            GLuint target,
 775                            GLuint eot)
 776 {
 777     struct brw_compile *p = &c->func;
 778     /* Pass through control information:
 779      */
 780     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 781     {
 782         brw_push_insn_state(p);
 783         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 784         brw_MOV(p,
 785                 brw_message_reg(base_reg + 1),
 786                 brw_vec8_grf(1, 0));
 787         brw_pop_insn_state(p);
 788     }
 789     /* Send framebuffer write message: */
 790     brw_fb_WRITE(p,
 791             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 792             base_reg,
 793             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 794             target,
 795             nr,
 796             0,
 797             eot);
 798 }
 799
 800 static void emit_fb_write(struct brw_wm_compile *c,
 801                           const struct prog_instruction *inst)
 802 {
 803     struct brw_compile *p = &c->func;
 804     int nr = 2;
 805     int channel;
 806     GLuint target, eot;
 807     struct brw_reg src0;
 808
 809     /* Reserve a space for AA - may not be needed:
 810      */
 811     if (c->key.aa_dest_stencil_reg)
 812         nr += 1;
 813
 814     brw_push_insn_state(p);
 815     for (channel = 0; channel < 4; channel++) {
 816         src0 = get_src_reg(c,  inst, 0, channel);
 817         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 818         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 819         brw_MOV(p, brw_message_reg(nr + channel), src0);
 820     }
 821     /* skip over the regs populated above: */
 822     nr += 8;
 823     brw_pop_insn_state(p);
 824
 825     if (c->key.source_depth_to_render_target) {
 826        if (c->key.computes_depth) {
 827           src0 = get_src_reg(c, inst, 2, 2);
 828           brw_MOV(p, brw_message_reg(nr), src0);
 829        }
 830        else {
 831           src0 = get_src_reg(c, inst, 1, 1);
 832           brw_MOV(p, brw_message_reg(nr), src0);
 833        }
 834
 835        nr += 2;
 836     }
 837
 838     if (c->key.dest_depth_reg) {
 839         const GLuint comp = c->key.dest_depth_reg / 2;
 840         const GLuint off = c->key.dest_depth_reg % 2;
 841
 842         if (off != 0) {
 843             /* XXX this code needs review/testing */
 844             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 845             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 846
 847             brw_push_insn_state(p);
 848             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 849
 850             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 851             /* 2nd half? */
 852             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 853             brw_pop_insn_state(p);
 854         }
 855         else
 856         {
 857             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 858             brw_MOV(p, brw_message_reg(nr), src);
 859         }
 860         nr += 2;
 861    }
 862
 863     target = INST_AUX_GET_TARGET(inst->Aux);
 864     eot = inst->Aux & INST_AUX_EOT;
 865     fire_fb_write(c, 0, nr, target, eot);
 866 }
 867
 868 static void emit_pixel_w( struct brw_wm_compile *c,
 869                           const struct prog_instruction *inst)
 870 {
 871     struct brw_compile *p = &c->func;
 872     GLuint mask = inst->DstReg.WriteMask;
 873     if (mask & WRITEMASK_W) {
 874         struct brw_reg dst, src0, delta0, delta1;
 875         struct brw_reg interp3;
 876
 877         dst = get_dst_reg(c, inst, 3);
 878         src0 = get_src_reg(c, inst, 0, 0);
 879         delta0 = get_src_reg(c, inst, 1, 0);
 880         delta1 = get_src_reg(c, inst, 1, 1);
 881
 882         interp3 = brw_vec1_grf(src0.nr+1, 4);
 883         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 884          * result straight into a message reg.
 885          */
 886         brw_LINE(p, brw_null_reg(), interp3, delta0);
 887         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 888
 889         /* Calc w */
 890         brw_math_16( p, dst,
 891                 BRW_MATH_FUNCTION_INV,
 892                 BRW_MATH_SATURATE_NONE,
 893                 2, brw_null_reg(),
 894                 BRW_MATH_PRECISION_FULL);
 895     }
 896 }
 897
 898 static void emit_linterp(struct brw_wm_compile *c,
 899                          const struct prog_instruction *inst)
 900 {
 901     struct brw_compile *p = &c->func;
 902     GLuint mask = inst->DstReg.WriteMask;
 903     struct brw_reg interp[4];
 904     struct brw_reg dst, delta0, delta1;
 905     struct brw_reg src0;
 906     GLuint nr, i;
 907
 908     src0 = get_src_reg(c, inst, 0, 0);
 909     delta0 = get_src_reg(c, inst, 1, 0);
 910     delta1 = get_src_reg(c, inst, 1, 1);
 911     nr = src0.nr;
 912
 913     interp[0] = brw_vec1_grf(nr, 0);
 914     interp[1] = brw_vec1_grf(nr, 4);
 915     interp[2] = brw_vec1_grf(nr+1, 0);
 916     interp[3] = brw_vec1_grf(nr+1, 4);
 917
 918     for(i = 0; i < 4; i++ ) {
 919         if (mask & (1<<i)) {
 920             dst = get_dst_reg(c, inst, i);
 921             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 922             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 923         }
 924     }
 925 }
 926
 927 static void emit_cinterp(struct brw_wm_compile *c,
 928                          const struct prog_instruction *inst)
 929 {
 930     struct brw_compile *p = &c->func;
 931     GLuint mask = inst->DstReg.WriteMask;
 932
 933     struct brw_reg interp[4];
 934     struct brw_reg dst, src0;
 935     GLuint nr, i;
 936
 937     src0 = get_src_reg(c, inst, 0, 0);
 938     nr = src0.nr;
 939
 940     interp[0] = brw_vec1_grf(nr, 0);
 941     interp[1] = brw_vec1_grf(nr, 4);
 942     interp[2] = brw_vec1_grf(nr+1, 0);
 943     interp[3] = brw_vec1_grf(nr+1, 4);
 944
 945     for(i = 0; i < 4; i++ ) {
 946         if (mask & (1<<i)) {
 947             dst = get_dst_reg(c, inst, i);
 948             brw_MOV(p, dst, suboffset(interp[i],3));
 949         }
 950     }
 951 }
 952
 953 static void emit_pinterp(struct brw_wm_compile *c,
 954                          const struct prog_instruction *inst)
 955 {
 956     struct brw_compile *p = &c->func;
 957     GLuint mask = inst->DstReg.WriteMask;
 958
 959     struct brw_reg interp[4];
 960     struct brw_reg dst, delta0, delta1;
 961     struct brw_reg src0, w;
 962     GLuint nr, i;
 963
 964     src0 = get_src_reg(c, inst, 0, 0);
 965     delta0 = get_src_reg(c, inst, 1, 0);
 966     delta1 = get_src_reg(c, inst, 1, 1);
 967     w = get_src_reg(c, inst, 2, 3);
 968     nr = src0.nr;
 969
 970     interp[0] = brw_vec1_grf(nr, 0);
 971     interp[1] = brw_vec1_grf(nr, 4);
 972     interp[2] = brw_vec1_grf(nr+1, 0);
 973     interp[3] = brw_vec1_grf(nr+1, 4);
 974
 975     for(i = 0; i < 4; i++ ) {
 976         if (mask & (1<<i)) {
 977             dst = get_dst_reg(c, inst, i);
 978             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 979             brw_MAC(p, dst, suboffset(interp[i],1),
 980                     delta1);
 981             brw_MUL(p, dst, dst, w);
 982         }
 983     }
 984 }
 985
 986 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 987 static void emit_frontfacing(struct brw_wm_compile *c,
 988                              const struct prog_instruction *inst)
 989 {
 990     struct brw_compile *p = &c->func;
 991     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 992     struct brw_reg dst;
 993     GLuint mask = inst->DstReg.WriteMask;
 994     int i;
 995
 996     for (i = 0; i < 4; i++) {
 997         if (mask & (1<<i)) {
 998             dst = get_dst_reg(c, inst, i);
 999             brw_MOV(p, dst, brw_imm_f(0.0));
1000         }
1001     }
1002
1003     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1004      * us front face
1005      */
1006     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
1007     for (i = 0; i < 4; i++) {
1008         if (mask & (1<<i)) {
1009             dst = get_dst_reg(c, inst, i);
1010             brw_MOV(p, dst, brw_imm_f(1.0));
1011         }
1012     }
1013     brw_set_predicate_control_flag_value(p, 0xff);
1014 }
1015
1016 static void emit_xpd(struct brw_wm_compile *c,
1017                      const struct prog_instruction *inst)
1018 {
1019     int i;
1020     struct brw_compile *p = &c->func;
1021     GLuint mask = inst->DstReg.WriteMask;
1022     for (i = 0; i < 4; i++) {
1023         GLuint i2 = (i+2)%3;
1024         GLuint i1 = (i+1)%3;
1025         if (mask & (1<<i)) {
1026             struct brw_reg src0, src1, dst;
1027             dst = get_dst_reg(c, inst, i);
1028             src0 = negate(get_src_reg(c, inst, 0, i2));
1029             src1 = get_src_reg_imm(c, inst, 1, i1);
1030             brw_MUL(p, brw_null_reg(), src0, src1);
1031             src0 = get_src_reg(c, inst, 0, i1);
1032             src1 = get_src_reg_imm(c, inst, 1, i2);
1033             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1034             brw_MAC(p, dst, src0, src1);
1035             brw_set_saturate(p, 0);
1036         }
1037     }
1038     brw_set_saturate(p, 0);
1039 }
1040
1041 /**
1042  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1043  * Note that the result of the function is smeared across the dest
1044  * register's X, Y, Z and W channels (subject to writemasking of course).
1045  */
1046 static void emit_math1(struct brw_wm_compile *c,
1047                        const struct prog_instruction *inst, GLuint func)
1048 {
1049     struct brw_compile *p = &c->func;
1050     struct brw_reg src0, dst;
1051     GLuint mask = inst->DstReg.WriteMask;
1052     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1053
1054     if (!(mask & WRITEMASK_XYZW))
1055         return;
1056
1057     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1058
1059     /* Get first component of source register */
1060     dst = get_dst_reg(c, inst, dst_chan);
1061     src0 = get_src_reg(c, inst, 0, 0);
1062
1063     brw_MOV(p, brw_message_reg(2), src0);
1064     brw_math(p,
1065              dst,
1066              func,
1067              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1068              2,
1069              brw_null_reg(),
1070              BRW_MATH_DATA_VECTOR,
1071              BRW_MATH_PRECISION_FULL);
1072 }
1073
1074 static void emit_rcp(struct brw_wm_compile *c,
1075                      const struct prog_instruction *inst)
1076 {
1077     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1078 }
1079
1080 static void emit_rsq(struct brw_wm_compile *c,
1081                      const struct prog_instruction *inst)
1082 {
1083     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1084 }
1085
1086 static void emit_sin(struct brw_wm_compile *c,
1087                      const struct prog_instruction *inst)
1088 {
1089     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1090 }
1091
1092 static void emit_cos(struct brw_wm_compile *c,
1093                      const struct prog_instruction *inst)
1094 {
1095     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1096 }
1097
1098 static void emit_ex2(struct brw_wm_compile *c,
1099                      const struct prog_instruction *inst)
1100 {
1101     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1102 }
1103
1104 static void emit_lg2(struct brw_wm_compile *c,
1105                      const struct prog_instruction *inst)
1106 {
1107     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1108 }
1109
1110 static void emit_arl(struct brw_wm_compile *c,
1111                      const struct prog_instruction *inst)
1112 {
1113     struct brw_compile *p = &c->func;
1114     struct brw_reg src0, addr_reg;
1115     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1116     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1117                            BRW_ARF_ADDRESS, 0);
1118     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1119     brw_MOV(p, addr_reg, src0);
1120     brw_set_saturate(p, 0);
1121 }
1122
1123
1124 static void emit_min_max(struct brw_wm_compile *c,
1125                          const struct prog_instruction *inst)
1126 {
1127     struct brw_compile *p = &c->func;
1128     const GLuint mask = inst->DstReg.WriteMask;
1129     const int mark = mark_tmps(c);
1130     int i;
1131     brw_push_insn_state(p);
1132     for (i = 0; i < 4; i++) {
1133         if (mask & (1<<i)) {
1134             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1135             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1136             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1137             struct brw_reg dst;
1138             /* if dst==src0 or dst==src1 we need to use a temp reg */
1139             GLboolean use_temp = brw_same_reg(dst, src0) ||
1140                                  brw_same_reg(dst, src1);
1141             if (use_temp)
1142                dst = alloc_tmp(c);
1143             else
1144                dst = real_dst;
1145
1146             /*
1147             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1148                    dst.nr, src0.nr, src1.nr);
1149             */
1150             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1151             brw_MOV(p, dst, src0);
1152             brw_set_saturate(p, 0);
1153
1154             if (inst->Opcode == OPCODE_MIN)
1155                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1156             else
1157                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1158
1159             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1160             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1161             brw_MOV(p, dst, src1);
1162             brw_set_saturate(p, 0);
1163             brw_set_predicate_control_flag_value(p, 0xff);
1164             if (use_temp)
1165                brw_MOV(p, real_dst, dst);
1166         }
1167     }
1168     brw_pop_insn_state(p);
1169     release_tmps(c, mark);
1170 }
1171
1172 static void emit_pow(struct brw_wm_compile *c,
1173                      const struct prog_instruction *inst)
1174 {
1175     struct brw_compile *p = &c->func;
1176     struct brw_reg dst, src0, src1;
1177     GLuint mask = inst->DstReg.WriteMask;
1178     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1179
1180     if (!(mask & WRITEMASK_XYZW))
1181         return;
1182
1183     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1184
1185     dst = get_dst_reg(c, inst, dst_chan);
1186     src0 = get_src_reg_imm(c, inst, 0, 0);
1187     src1 = get_src_reg_imm(c, inst, 1, 0);
1188
1189     brw_MOV(p, brw_message_reg(2), src0);
1190     brw_MOV(p, brw_message_reg(3), src1);
1191
1192     brw_math(p,
1193             dst,
1194             BRW_MATH_FUNCTION_POW,
1195             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1196             2,
1197             brw_null_reg(),
1198             BRW_MATH_DATA_VECTOR,
1199             BRW_MATH_PRECISION_FULL);
1200 }
1201
1202 /**
1203  * For GLSL shaders, this KIL will be unconditional.
1204  * It may be contained inside an IF/ENDIF structure of course.
1205  */
1206 static void emit_kil(struct brw_wm_compile *c)
1207 {
1208     struct brw_compile *p = &c->func;
1209     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1210     brw_push_insn_state(p);
1211     brw_set_mask_control(p, BRW_MASK_DISABLE);
1212     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1213     brw_AND(p, depth, c->emit_mask_reg, depth);
1214     brw_pop_insn_state(p);
1215 }
1216
1217 static INLINE struct brw_reg high_words( struct brw_reg reg )
1218 {
1219     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1220                    0, 8, 2 );
1221 }
1222
1223 static INLINE struct brw_reg low_words( struct brw_reg reg )
1224 {
1225     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1226 }
1227
1228 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1229 {
1230     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1231 }
1232
1233 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1234 {
1235     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1236                    0, 16, 2 );
1237 }
1238
1239 /* One-, two- and three-dimensional Perlin noise, similar to the description
1240    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1241 static void noise1_sub( struct brw_wm_compile *c ) {
1242
1243     struct brw_compile *p = &c->func;
1244     struct brw_reg param,
1245         x0, x1, /* gradients at each end */
1246         t, tmp[ 2 ], /* float temporaries */
1247         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1248     int i;
1249     int mark = mark_tmps( c );
1250
1251     x0 = alloc_tmp( c );
1252     x1 = alloc_tmp( c );
1253     t = alloc_tmp( c );
1254     tmp[ 0 ] = alloc_tmp( c );
1255     tmp[ 1 ] = alloc_tmp( c );
1256     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1257     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1258     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1259     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1260     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1261
1262     param = lookup_tmp( c, mark - 2 );
1263
1264     brw_set_access_mode( p, BRW_ALIGN_1 );
1265
1266     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1267
1268     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1269        be hashed.  Also compute the remainder (offset within the unit
1270        length), interleaved to reduce register dependency penalties. */
1271     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1272     brw_FRC( p, param, param );
1273     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1274     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1275     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1276
1277     /* We're now ready to perform the hashing.  The two hashes are
1278        interleaved for performance.  The hash function used is
1279        designed to rapidly achieve avalanche and require only 32x16
1280        bit multiplication, and 16-bit swizzles (which we get for
1281        free).  We can't use immediate operands in the multiplies,
1282        because immediates are permitted only in src1 and the 16-bit
1283        factor is permitted only in src0. */
1284     for( i = 0; i < 2; i++ )
1285         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1286     for( i = 0; i < 2; i++ )
1287        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1288                 high_words( itmp[ i ] ) );
1289     for( i = 0; i < 2; i++ )
1290         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1291     for( i = 0; i < 2; i++ )
1292        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1293                 high_words( itmp[ i ] ) );
1294     for( i = 0; i < 2; i++ )
1295         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1296     for( i = 0; i < 2; i++ )
1297        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1298                 high_words( itmp[ i ] ) );
1299
1300     /* Now we want to initialise the two gradients based on the
1301        hashes.  Format conversion from signed integer to float leaves
1302        everything scaled too high by a factor of pow( 2, 31 ), but
1303        we correct for that right at the end. */
1304     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1305     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1306     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1307
1308     brw_MUL( p, x0, x0, param );
1309     brw_MUL( p, x1, x1, t );
1310
1311     /* We interpolate between the gradients using the polynomial
1312        6t^5 - 15t^4 + 10t^3 (Perlin). */
1313     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1314     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1315     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1316     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1317     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1318     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1319                                            pipeline */
1320     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1321     brw_MUL( p, param, tmp[ 0 ], param );
1322     brw_MUL( p, x1, x1, param );
1323     brw_ADD( p, x0, x0, x1 );
1324     /* scale by pow( 2, -30 ), to compensate for the format conversion
1325        above and an extra factor of 2 so that a single gradient covers
1326        the [-1,1] range */
1327     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1328
1329     release_tmps( c, mark );
1330 }
1331
1332 static void emit_noise1( struct brw_wm_compile *c,
1333                          const struct prog_instruction *inst )
1334 {
1335     struct brw_compile *p = &c->func;
1336     struct brw_reg src, param, dst;
1337     GLuint mask = inst->DstReg.WriteMask;
1338     int i;
1339     int mark = mark_tmps( c );
1340
1341     assert( mark == 0 );
1342
1343     src = get_src_reg( c, inst, 0, 0 );
1344
1345     param = alloc_tmp( c );
1346
1347     brw_MOV( p, param, src );
1348
1349     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1350
1351     /* Fill in the result: */
1352     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1353     for (i = 0 ; i < 4; i++) {
1354         if (mask & (1<<i)) {
1355             dst = get_dst_reg(c, inst, i);
1356             brw_MOV( p, dst, param );
1357         }
1358     }
1359     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1360         brw_set_saturate( p, 0 );
1361
1362     release_tmps( c, mark );
1363 }
1364
1365 static void noise2_sub( struct brw_wm_compile *c ) {
1366
1367     struct brw_compile *p = &c->func;
1368     struct brw_reg param0, param1,
1369         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1370         t, tmp[ 4 ], /* float temporaries */
1371         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1372     int i;
1373     int mark = mark_tmps( c );
1374
1375     x0y0 = alloc_tmp( c );
1376     x0y1 = alloc_tmp( c );
1377     x1y0 = alloc_tmp( c );
1378     x1y1 = alloc_tmp( c );
1379     t = alloc_tmp( c );
1380     for( i = 0; i < 4; i++ ) {
1381         tmp[ i ] = alloc_tmp( c );
1382         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1383     }
1384     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1385     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1386     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1387
1388     param0 = lookup_tmp( c, mark - 3 );
1389     param1 = lookup_tmp( c, mark - 2 );
1390
1391     brw_set_access_mode( p, BRW_ALIGN_1 );
1392
1393     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1394        be hashed.  Also compute the remainders (offsets within the unit
1395        square), interleaved to reduce register dependency penalties. */
1396     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1397     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1398     brw_FRC( p, param0, param0 );
1399     brw_FRC( p, param1, param1 );
1400     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1401     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1402              low_words( itmp[ 1 ] ) );
1403     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1404     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1405     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1406     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1407     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1408
1409     /* We're now ready to perform the hashing.  The four hashes are
1410        interleaved for performance.  The hash function used is
1411        designed to rapidly achieve avalanche and require only 32x16
1412        bit multiplication, and 16-bit swizzles (which we get for
1413        free).  We can't use immediate operands in the multiplies,
1414        because immediates are permitted only in src1 and the 16-bit
1415        factor is permitted only in src0. */
1416     for( i = 0; i < 4; i++ )
1417         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1418     for( i = 0; i < 4; i++ )
1419         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1420                  high_words( itmp[ i ] ) );
1421     for( i = 0; i < 4; i++ )
1422         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1423     for( i = 0; i < 4; i++ )
1424         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1425                  high_words( itmp[ i ] ) );
1426     for( i = 0; i < 4; i++ )
1427         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1428     for( i = 0; i < 4; i++ )
1429         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1430                  high_words( itmp[ i ] ) );
1431
1432     /* Now we want to initialise the four gradients based on the
1433        hashes.  Format conversion from signed integer to float leaves
1434        everything scaled too high by a factor of pow( 2, 15 ), but
1435        we correct for that right at the end. */
1436     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1437     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1438     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1439     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1440     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1441
1442     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1443     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1444     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1445     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1446
1447     brw_MUL( p, x1y0, x1y0, t );
1448     brw_MUL( p, x1y1, x1y1, t );
1449     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1450     brw_MUL( p, x0y0, x0y0, param0 );
1451     brw_MUL( p, x0y1, x0y1, param0 );
1452
1453     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1454     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1455     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1456     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1457
1458     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1459     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1460     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1461     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1462
1463     /* We interpolate between the gradients using the polynomial
1464        6t^5 - 15t^4 + 10t^3 (Perlin). */
1465     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1466     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1467     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1468     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1469     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1470     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1471     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1472                                                  pipeline */
1473     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1474     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1475     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1476     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1477     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1478                                                  pipeline */
1479     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1480     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1481     brw_MUL( p, param0, tmp[ 0 ], param0 );
1482     brw_MUL( p, param1, tmp[ 1 ], param1 );
1483
1484     /* Here we interpolate in the y dimension... */
1485     brw_MUL( p, x0y1, x0y1, param1 );
1486     brw_MUL( p, x1y1, x1y1, param1 );
1487     brw_ADD( p, x0y0, x0y0, x0y1 );
1488     brw_ADD( p, x1y0, x1y0, x1y1 );
1489
1490     /* And now in x.  There are horrible register dependencies here,
1491        but we have nothing else to do. */
1492     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1493     brw_MUL( p, x1y0, x1y0, param0 );
1494     brw_ADD( p, x0y0, x0y0, x1y0 );
1495
1496     /* scale by pow( 2, -15 ), as described above */
1497     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1498
1499     release_tmps( c, mark );
1500 }
1501
1502 static void emit_noise2( struct brw_wm_compile *c,
1503                          const struct prog_instruction *inst )
1504 {
1505     struct brw_compile *p = &c->func;
1506     struct brw_reg src0, src1, param0, param1, dst;
1507     GLuint mask = inst->DstReg.WriteMask;
1508     int i;
1509     int mark = mark_tmps( c );
1510
1511     assert( mark == 0 );
1512
1513     src0 = get_src_reg( c, inst, 0, 0 );
1514     src1 = get_src_reg( c, inst, 0, 1 );
1515
1516     param0 = alloc_tmp( c );
1517     param1 = alloc_tmp( c );
1518
1519     brw_MOV( p, param0, src0 );
1520     brw_MOV( p, param1, src1 );
1521
1522     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1523
1524     /* Fill in the result: */
1525     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1526     for (i = 0 ; i < 4; i++) {
1527         if (mask & (1<<i)) {
1528             dst = get_dst_reg(c, inst, i);
1529             brw_MOV( p, dst, param0 );
1530         }
1531     }
1532     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1533         brw_set_saturate( p, 0 );
1534
1535     release_tmps( c, mark );
1536 }
1537
1538 /**
1539  * The three-dimensional case is much like the one- and two- versions above,
1540  * but since the number of corners is rapidly growing we now pack 16 16-bit
1541  * hashes into each register to extract more parallelism from the EUs.
1542  */
1543 static void noise3_sub( struct brw_wm_compile *c ) {
1544
1545     struct brw_compile *p = &c->func;
1546     struct brw_reg param0, param1, param2,
1547         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1548         xi, yi, zi, /* interpolation coefficients */
1549         t, tmp[ 8 ], /* float temporaries */
1550         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1551         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1552     int i;
1553     int mark = mark_tmps( c );
1554
1555     x0y0 = alloc_tmp( c );
1556     x0y1 = alloc_tmp( c );
1557     x1y0 = alloc_tmp( c );
1558     x1y1 = alloc_tmp( c );
1559     xi = alloc_tmp( c );
1560     yi = alloc_tmp( c );
1561     zi = alloc_tmp( c );
1562     t = alloc_tmp( c );
1563     for( i = 0; i < 8; i++ ) {
1564         tmp[ i ] = alloc_tmp( c );
1565         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1566         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1567     }
1568
1569     param0 = lookup_tmp( c, mark - 4 );
1570     param1 = lookup_tmp( c, mark - 3 );
1571     param2 = lookup_tmp( c, mark - 2 );
1572
1573     brw_set_access_mode( p, BRW_ALIGN_1 );
1574
1575     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1576        be hashed.  Also compute the remainders (offsets within the unit
1577        cube), interleaved to reduce register dependency penalties. */
1578     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1579     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1580     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1581     brw_FRC( p, param0, param0 );
1582     brw_FRC( p, param1, param1 );
1583     brw_FRC( p, param2, param2 );
1584     /* Since we now have only 16 bits of precision in the hash, we must
1585        be more careful about thorough mixing to maintain entropy as we
1586        squash the input vector into a small scalar. */
1587     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1588     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1589     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1590              brw_imm_uw( 0x9B93 ) );
1591     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1592              brw_imm_uw( 0xBC8F ) );
1593
1594     /* Temporarily disable the execution mask while we work with ExecSize=16
1595        channels (the mask is set for ExecSize=8 and is probably incorrect).
1596        Although this might cause execution of unwanted channels, the code
1597        writes only to temporary registers and has no side effects, so
1598        disabling the mask is harmless. */
1599     brw_push_insn_state( p );
1600     brw_set_mask_control( p, BRW_MASK_DISABLE );
1601     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1602     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1603     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1604
1605     /* We're now ready to perform the hashing.  The eight hashes are
1606        interleaved for performance.  The hash function used is
1607        designed to rapidly achieve avalanche and require only 16x16
1608        bit multiplication, and 8-bit swizzles (which we get for
1609        free). */
1610     for( i = 0; i < 4; i++ )
1611         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1612     for( i = 0; i < 4; i++ )
1613         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1614                  odd_bytes( wtmp[ i ] ) );
1615     for( i = 0; i < 4; i++ )
1616         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1617     for( i = 0; i < 4; i++ )
1618         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1619                  odd_bytes( wtmp[ i ] ) );
1620     brw_pop_insn_state( p );
1621
1622     /* Now we want to initialise the four rear gradients based on the
1623        hashes.  Format conversion from signed integer to float leaves
1624        everything scaled too high by a factor of pow( 2, 15 ), but
1625        we correct for that right at the end. */
1626     /* x component */
1627     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1628     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1629     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1630     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1631     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1632
1633     brw_push_insn_state( p );
1634     brw_set_mask_control( p, BRW_MASK_DISABLE );
1635     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1636     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1637     brw_pop_insn_state( p );
1638
1639     brw_MUL( p, x1y0, x1y0, t );
1640     brw_MUL( p, x1y1, x1y1, t );
1641     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1642     brw_MUL( p, x0y0, x0y0, param0 );
1643     brw_MUL( p, x0y1, x0y1, param0 );
1644
1645     /* y component */
1646     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1647     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1648     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1649     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1650
1651     brw_push_insn_state( p );
1652     brw_set_mask_control( p, BRW_MASK_DISABLE );
1653     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1654     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1655     brw_pop_insn_state( p );
1656
1657     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1658     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1659     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1660     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1661     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1662
1663     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1664     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1665     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1666     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1667
1668     /* z component */
1669     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1670     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1671     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1672     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1673
1674     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1675     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1676     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1677     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1678
1679     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1680     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1681     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1682     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1683
1684     /* We interpolate between the gradients using the polynomial
1685        6t^5 - 15t^4 + 10t^3 (Perlin). */
1686     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1687     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1688     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1689     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1690     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1691     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1692     brw_MUL( p, xi, xi, param0 );
1693     brw_MUL( p, yi, yi, param1 );
1694     brw_MUL( p, zi, zi, param2 );
1695     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1696     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1697     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1698     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1699     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1700     brw_MUL( p, xi, xi, param0 );
1701     brw_MUL( p, yi, yi, param1 );
1702     brw_MUL( p, zi, zi, param2 );
1703     brw_MUL( p, xi, xi, param0 );
1704     brw_MUL( p, yi, yi, param1 );
1705     brw_MUL( p, zi, zi, param2 );
1706     brw_MUL( p, xi, xi, param0 );
1707     brw_MUL( p, yi, yi, param1 );
1708     brw_MUL( p, zi, zi, param2 );
1709
1710     /* Here we interpolate in the y dimension... */
1711     brw_MUL( p, x0y1, x0y1, yi );
1712     brw_MUL( p, x1y1, x1y1, yi );
1713     brw_ADD( p, x0y0, x0y0, x0y1 );
1714     brw_ADD( p, x1y0, x1y0, x1y1 );
1715
1716     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
1717     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1718     brw_MUL( p, x1y0, x1y0, xi );
1719     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1720
1721     /* Now do the same thing for the front four gradients... */
1722     /* x component */
1723     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1724     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1725     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1726     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1727
1728     brw_push_insn_state( p );
1729     brw_set_mask_control( p, BRW_MASK_DISABLE );
1730     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1731     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1732     brw_pop_insn_state( p );
1733
1734     brw_MUL( p, x1y0, x1y0, t );
1735     brw_MUL( p, x1y1, x1y1, t );
1736     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1737     brw_MUL( p, x0y0, x0y0, param0 );
1738     brw_MUL( p, x0y1, x0y1, param0 );
1739
1740     /* y component */
1741     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1742     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1743     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1744     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1745
1746     brw_push_insn_state( p );
1747     brw_set_mask_control( p, BRW_MASK_DISABLE );
1748     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1749     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1750     brw_pop_insn_state( p );
1751
1752     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1753     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1754     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1755     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1756     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1757
1758     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1759     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1760     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1761     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1762
1763     /* z component */
1764     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1765     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1766     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1767     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1768
1769     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1770     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1771     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1772     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1773
1774     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1775     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1776     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1777     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1778
1779     /* The interpolation coefficients are still around from last time, so
1780        again interpolate in the y dimension... */
1781     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1782     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1783     brw_MUL( p, x0y1, x0y1, yi );
1784     brw_MUL( p, x1y1, x1y1, yi );
1785     brw_ADD( p, x0y0, x0y0, x0y1 );
1786     brw_ADD( p, x1y0, x1y0, x1y1 );
1787
1788     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
1789        time put the front face in tmp[ 1 ] and we're nearly there... */
1790     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1791     brw_MUL( p, x1y0, x1y0, xi );
1792     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1793
1794     /* The final interpolation, in the z dimension: */
1795     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1796     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1797     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1798
1799     /* scale by pow( 2, -15 ), as described above */
1800     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1801
1802     release_tmps( c, mark );
1803 }
1804
1805 static void emit_noise3( struct brw_wm_compile *c,
1806                          const struct prog_instruction *inst )
1807 {
1808     struct brw_compile *p = &c->func;
1809     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1810     GLuint mask = inst->DstReg.WriteMask;
1811     int i;
1812     int mark = mark_tmps( c );
1813
1814     assert( mark == 0 );
1815
1816     src0 = get_src_reg( c, inst, 0, 0 );
1817     src1 = get_src_reg( c, inst, 0, 1 );
1818     src2 = get_src_reg( c, inst, 0, 2 );
1819
1820     param0 = alloc_tmp( c );
1821     param1 = alloc_tmp( c );
1822     param2 = alloc_tmp( c );
1823
1824     brw_MOV( p, param0, src0 );
1825     brw_MOV( p, param1, src1 );
1826     brw_MOV( p, param2, src2 );
1827
1828     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1829
1830     /* Fill in the result: */
1831     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1832     for (i = 0 ; i < 4; i++) {
1833         if (mask & (1<<i)) {
1834             dst = get_dst_reg(c, inst, i);
1835             brw_MOV( p, dst, param0 );
1836         }
1837     }
1838     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1839         brw_set_saturate( p, 0 );
1840
1841     release_tmps( c, mark );
1842 }
1843
1844 /**
1845  * For the four-dimensional case, the little micro-optimisation benefits
1846  * we obtain by unrolling all the loops aren't worth the massive bloat it
1847  * now causes.  Instead, we loop twice around performing a similar operation
1848  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1849  * code to glue it all together.
1850  */
1851 static void noise4_sub( struct brw_wm_compile *c )
1852 {
1853     struct brw_compile *p = &c->func;
1854     struct brw_reg param[ 4 ],
1855         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1856         w0, /* noise for the w=0 cube */
1857         floors[ 2 ], /* integer coordinates of base corner of hypercube */
1858         interp[ 4 ], /* interpolation coefficients */
1859         t, tmp[ 8 ], /* float temporaries */
1860         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1861         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1862     int i, j;
1863     int mark = mark_tmps( c );
1864     GLuint loop, origin;
1865
1866     x0y0 = alloc_tmp( c );
1867     x0y1 = alloc_tmp( c );
1868     x1y0 = alloc_tmp( c );
1869     x1y1 = alloc_tmp( c );
1870     t = alloc_tmp( c );
1871     w0 = alloc_tmp( c );
1872     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1873     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1874
1875     for( i = 0; i < 4; i++ ) {
1876         param[ i ] = lookup_tmp( c, mark - 5 + i );
1877         interp[ i ] = alloc_tmp( c );
1878     }
1879
1880     for( i = 0; i < 8; i++ ) {
1881         tmp[ i ] = alloc_tmp( c );
1882         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1883         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1884     }
1885
1886     brw_set_access_mode( p, BRW_ALIGN_1 );
1887
1888     /* We only want 16 bits of precision from the integral part of each
1889        co-ordinate, but unfortunately the RNDD semantics would saturate
1890        at 16 bits if we performed the operation directly to a 16-bit
1891        destination.  Therefore, we round to 32-bit temporaries where
1892        appropriate, and then store only the lower 16 bits. */
1893     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1894     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1895     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1896     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1897     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1898     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1899
1900     /* Modify the flag register here, because the side effect is useful
1901        later (see below).  We know for certain that all flags will be
1902        cleared, since the FRC instruction cannot possibly generate
1903        negative results.  Even for exceptional inputs (infinities, denormals,
1904        NaNs), the architecture guarantees that the L conditional is false. */
1905     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1906     brw_FRC( p, param[ 0 ], param[ 0 ] );
1907     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1908     for( i = 1; i < 4; i++ )
1909         brw_FRC( p, param[ i ], param[ i ] );
1910
1911     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1912        of all. */
1913     for( i = 0; i < 4; i++ )
1914         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1915     for( i = 0; i < 4; i++ )
1916         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1917     for( i = 0; i < 4; i++ )
1918         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1919     for( i = 0; i < 4; i++ )
1920         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1921     for( j = 0; j < 3; j++ )
1922         for( i = 0; i < 4; i++ )
1923             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1924
1925     /* Mark the current address, as it will be a jump destination.  The
1926        following code will be executed twice: first, with the flag
1927        register clear indicating the w=0 case, and second with flags
1928        set for w=1. */
1929     loop = p->nr_insn;
1930
1931     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1932        be hashed.  Since we have only 16 bits of precision in the hash, we
1933        must be careful about thorough mixing to maintain entropy as we
1934        squash the input vector into a small scalar. */
1935     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1936              brw_imm_uw( 0xBC8F ) );
1937     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1938              brw_imm_uw( 0xD0BD ) );
1939     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1940              brw_imm_uw( 0x9B93 ) );
1941     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1942              brw_imm_uw( 0xA359 ) );
1943     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1944              brw_imm_uw( 0xBC8F ) );
1945
1946     /* Temporarily disable the execution mask while we work with ExecSize=16
1947        channels (the mask is set for ExecSize=8 and is probably incorrect).
1948        Although this might cause execution of unwanted channels, the code
1949        writes only to temporary registers and has no side effects, so
1950        disabling the mask is harmless. */
1951     brw_push_insn_state( p );
1952     brw_set_mask_control( p, BRW_MASK_DISABLE );
1953     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1954     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1955     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1956
1957     /* We're now ready to perform the hashing.  The eight hashes are
1958        interleaved for performance.  The hash function used is
1959        designed to rapidly achieve avalanche and require only 16x16
1960        bit multiplication, and 8-bit swizzles (which we get for
1961        free). */
1962     for( i = 0; i < 4; i++ )
1963         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1964     for( i = 0; i < 4; i++ )
1965         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1966                  odd_bytes( wtmp[ i ] ) );
1967     for( i = 0; i < 4; i++ )
1968         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1969     for( i = 0; i < 4; i++ )
1970         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1971                  odd_bytes( wtmp[ i ] ) );
1972     brw_pop_insn_state( p );
1973
1974     /* Now we want to initialise the four rear gradients based on the
1975        hashes.  Format conversion from signed integer to float leaves
1976        everything scaled too high by a factor of pow( 2, 15 ), but
1977        we correct for that right at the end. */
1978     /* x component */
1979     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1980     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1981     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1982     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1983     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1984
1985     brw_push_insn_state( p );
1986     brw_set_mask_control( p, BRW_MASK_DISABLE );
1987     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1988     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1989     brw_pop_insn_state( p );
1990
1991     brw_MUL( p, x1y0, x1y0, t );
1992     brw_MUL( p, x1y1, x1y1, t );
1993     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1994     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1995     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1996
1997     /* y component */
1998     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1999     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2000     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2001     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2002
2003     brw_push_insn_state( p );
2004     brw_set_mask_control( p, BRW_MASK_DISABLE );
2005     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2006     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2007     brw_pop_insn_state( p );
2008
2009     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2010     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2011     /* prepare t for the w component (used below): w the first time through
2012        the loop; w - 1 the second time) */
2013     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2014     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2015     p->current->header.predicate_inverse = 1;
2016     brw_MOV( p, t, param[ 3 ] );
2017     p->current->header.predicate_inverse = 0;
2018     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2019     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2020     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2021
2022     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2023     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2024     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2025     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2026
2027     /* z component */
2028     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2029     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2030     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2031     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2032
2033     brw_push_insn_state( p );
2034     brw_set_mask_control( p, BRW_MASK_DISABLE );
2035     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2036     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2037     brw_pop_insn_state( p );
2038
2039     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2040     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2041     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2042     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2043
2044     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2045     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2046     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2047     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2048
2049     /* w component */
2050     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2051     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2052     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2053     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2054
2055     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2056     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2057     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2058     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2059     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2060
2061     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2062     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2063     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2064     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2065
2066     /* Here we interpolate in the y dimension... */
2067     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2068     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2069     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2070     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2071     brw_ADD( p, x0y0, x0y0, x0y1 );
2072     brw_ADD( p, x1y0, x1y0, x1y1 );
2073
2074     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2075     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2076     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2077     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2078
2079     /* Now do the same thing for the front four gradients... */
2080     /* x component */
2081     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2082     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2083     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2084     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2085
2086     brw_push_insn_state( p );
2087     brw_set_mask_control( p, BRW_MASK_DISABLE );
2088     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2089     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2090     brw_pop_insn_state( p );
2091
2092     brw_MUL( p, x1y0, x1y0, t );
2093     brw_MUL( p, x1y1, x1y1, t );
2094     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2095     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2096     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2097
2098     /* y component */
2099     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2100     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2101     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2102     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2103
2104     brw_push_insn_state( p );
2105     brw_set_mask_control( p, BRW_MASK_DISABLE );
2106     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2107     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2108     brw_pop_insn_state( p );
2109
2110     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2111     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2112     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2113     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2114     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2115
2116     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2117     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2118     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2119     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2120
2121     /* z component */
2122     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2123     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2124     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2125     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2126
2127     brw_push_insn_state( p );
2128     brw_set_mask_control( p, BRW_MASK_DISABLE );
2129     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2130     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2131     brw_pop_insn_state( p );
2132
2133     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2134     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2135     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2136     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2137     /* prepare t for the w component (used below): w the first time through
2138        the loop; w - 1 the second time) */
2139     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2140     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2141     p->current->header.predicate_inverse = 1;
2142     brw_MOV( p, t, param[ 3 ] );
2143     p->current->header.predicate_inverse = 0;
2144     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2145
2146     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2147     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2148     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2149     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2150
2151     /* w component */
2152     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2153     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2154     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2155     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2156
2157     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2158     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2159     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2160     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2161
2162     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2163     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2164     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2165     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2166
2167     /* Interpolate in the y dimension: */
2168     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2169     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2170     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2171     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2172     brw_ADD( p, x0y0, x0y0, x0y1 );
2173     brw_ADD( p, x1y0, x1y0, x1y1 );
2174
2175     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2176        time put the front face in tmp[ 1 ] and we're nearly there... */
2177     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2178     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2179     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2180
2181     /* Another interpolation, in the z dimension: */
2182     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2183     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2184     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2185
2186     /* Exit the loop if we've computed both cubes... */
2187     origin = p->nr_insn;
2188     brw_push_insn_state( p );
2189     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2190     brw_set_mask_control( p, BRW_MASK_DISABLE );
2191     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2192     brw_pop_insn_state( p );
2193
2194     /* Save the result for the w=0 case, and increment the w coordinate: */
2195     brw_MOV( p, w0, tmp[ 0 ] );
2196     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2197              brw_imm_uw( 1 ) );
2198
2199     /* Loop around for the other cube.  Explicitly set the flag register
2200        (unfortunately we must spend an extra instruction to do this: we
2201        can't rely on a side effect of the previous MOV or ADD because
2202        conditional modifiers which are normally true might be false in
2203        exceptional circumstances, e.g. given a NaN input; the add to
2204        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2205     brw_push_insn_state( p );
2206     brw_set_mask_control( p, BRW_MASK_DISABLE );
2207     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2208     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2209              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2210     brw_pop_insn_state( p );
2211
2212     /* Patch the previous conditional branch now that we know the
2213        destination address. */
2214     brw_set_src1( p->store + origin,
2215                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2216
2217     /* The very last interpolation. */
2218     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2219     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2220     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2221
2222     /* scale by pow( 2, -15 ), as described above */
2223     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2224
2225     release_tmps( c, mark );
2226 }
2227
2228 static void emit_noise4( struct brw_wm_compile *c,
2229                          const struct prog_instruction *inst )
2230 {
2231     struct brw_compile *p = &c->func;
2232     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2233     GLuint mask = inst->DstReg.WriteMask;
2234     int i;
2235     int mark = mark_tmps( c );
2236
2237     assert( mark == 0 );
2238
2239     src0 = get_src_reg( c, inst, 0, 0 );
2240     src1 = get_src_reg( c, inst, 0, 1 );
2241     src2 = get_src_reg( c, inst, 0, 2 );
2242     src3 = get_src_reg( c, inst, 0, 3 );
2243
2244     param0 = alloc_tmp( c );
2245     param1 = alloc_tmp( c );
2246     param2 = alloc_tmp( c );
2247     param3 = alloc_tmp( c );
2248
2249     brw_MOV( p, param0, src0 );
2250     brw_MOV( p, param1, src1 );
2251     brw_MOV( p, param2, src2 );
2252     brw_MOV( p, param3, src3 );
2253
2254     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2255
2256     /* Fill in the result: */
2257     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2258     for (i = 0 ; i < 4; i++) {
2259         if (mask & (1<<i)) {
2260             dst = get_dst_reg(c, inst, i);
2261             brw_MOV( p, dst, param0 );
2262         }
2263     }
2264     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2265         brw_set_saturate( p, 0 );
2266
2267     release_tmps( c, mark );
2268 }
2269
2270 static void emit_wpos_xy(struct brw_wm_compile *c,
2271                          const struct prog_instruction *inst)
2272 {
2273     struct brw_compile *p = &c->func;
2274     GLuint mask = inst->DstReg.WriteMask;
2275     struct brw_reg src0[2], dst[2];
2276
2277     dst[0] = get_dst_reg(c, inst, 0);
2278     dst[1] = get_dst_reg(c, inst, 1);
2279
2280     src0[0] = get_src_reg(c, inst, 0, 0);
2281     src0[1] = get_src_reg(c, inst, 0, 1);
2282
2283     /* Calculate the pixel offset from window bottom left into destination
2284      * X and Y channels.
2285      */
2286     if (mask & WRITEMASK_X) {
2287         /* X' = X - origin_x */
2288         brw_ADD(p,
2289                 dst[0],
2290                 retype(src0[0], BRW_REGISTER_TYPE_W),
2291                 brw_imm_d(0 - c->key.origin_x));
2292     }
2293
2294     if (mask & WRITEMASK_Y) {
2295         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2296         brw_ADD(p,
2297                 dst[1],
2298                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2299                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2300     }
2301 }
2302
2303 /* TODO
2304    BIAS on SIMD8 not working yet...
2305  */
2306 static void emit_txb(struct brw_wm_compile *c,
2307                      const struct prog_instruction *inst)
2308 {
2309     struct brw_compile *p = &c->func;
2310     struct brw_reg dst[4], src[4], payload_reg;
2311     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2312     const GLuint unit = inst->TexSrcUnit;
2313     GLuint i;
2314     GLuint msg_type;
2315
2316     assert(unit < BRW_MAX_TEX_UNIT);
2317
2318     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2319
2320     for (i = 0; i < 4; i++)
2321         dst[i] = get_dst_reg(c, inst, i);
2322     for (i = 0; i < 4; i++)
2323         src[i] = get_src_reg(c, inst, 0, i);
2324
2325     switch (inst->TexSrcTarget) {
2326         case TEXTURE_1D_INDEX:
2327             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2328             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2329             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2330             break;
2331         case TEXTURE_2D_INDEX:
2332         case TEXTURE_RECT_INDEX:
2333             brw_MOV(p, brw_message_reg(2), src[0]);
2334             brw_MOV(p, brw_message_reg(3), src[1]);
2335             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2336             break;
2337         case TEXTURE_3D_INDEX:
2338         case TEXTURE_CUBE_INDEX:
2339             brw_MOV(p, brw_message_reg(2), src[0]);
2340             brw_MOV(p, brw_message_reg(3), src[1]);
2341             brw_MOV(p, brw_message_reg(4), src[2]);
2342             break;
2343         default:
2344             /* invalid target */
2345             abort();
2346     }
2347     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2348     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2349
2350     if (BRW_IS_IGDNG(p->brw)) {
2351         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2352     } else {
2353         /* Does it work well on SIMD8? */
2354         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2355     }
2356
2357     brw_SAMPLE(p,
2358                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2359                1,                                           /* msg_reg_nr */
2360                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2361                SURF_INDEX_TEXTURE(unit),
2362                unit,                                        /* sampler */
2363                inst->DstReg.WriteMask,                      /* writemask */
2364                msg_type,                                    /* msg_type */
2365                4,                                           /* response_length */
2366                4,                                           /* msg_length */
2367                0,                                           /* eot */
2368                1,
2369                BRW_SAMPLER_SIMD_MODE_SIMD8);
2370 }
2371
2372
2373 static void emit_tex(struct brw_wm_compile *c,
2374                      const struct prog_instruction *inst)
2375 {
2376     struct brw_compile *p = &c->func;
2377     struct brw_reg dst[4], src[4], payload_reg;
2378     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2379     const GLuint unit = inst->TexSrcUnit;
2380     GLuint msg_len;
2381     GLuint i, nr;
2382     GLuint emit;
2383     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2384     GLuint msg_type;
2385
2386     assert(unit < BRW_MAX_TEX_UNIT);
2387
2388     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2389
2390     for (i = 0; i < 4; i++)
2391         dst[i] = get_dst_reg(c, inst, i);
2392     for (i = 0; i < 4; i++)
2393         src[i] = get_src_reg(c, inst, 0, i);
2394
2395     switch (inst->TexSrcTarget) {
2396         case TEXTURE_1D_INDEX:
2397             emit = WRITEMASK_X;
2398             nr = 1;
2399             break;
2400         case TEXTURE_2D_INDEX:
2401         case TEXTURE_RECT_INDEX:
2402             emit = WRITEMASK_XY;
2403             nr = 2;
2404             break;
2405         case TEXTURE_3D_INDEX:
2406         case TEXTURE_CUBE_INDEX:
2407             emit = WRITEMASK_XYZ;
2408             nr = 3;
2409             break;
2410         default:
2411            /* invalid target */
2412            abort();
2413     }
2414     msg_len = 1;
2415
2416     /* move/load S, T, R coords */
2417     for (i = 0; i < nr; i++) {
2418         static const GLuint swz[4] = {0,1,2,2};
2419         if (emit & (1<<i))
2420             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2421         else
2422             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2423         msg_len += 1;
2424     }
2425
2426     if (shadow) {
2427        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2428        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2429     }
2430
2431     if (BRW_IS_IGDNG(p->brw)) {
2432         if (shadow)
2433             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2434         else
2435             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2436     } else {
2437         /* Does it work for shadow on SIMD8 ? */
2438         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2439     }
2440
2441     brw_SAMPLE(p,
2442                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2443                1,                                          /* msg_reg_nr */
2444                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2445                SURF_INDEX_TEXTURE(unit),
2446                unit,                                       /* sampler */
2447                inst->DstReg.WriteMask,                     /* writemask */
2448                msg_type,                                   /* msg_type */
2449                4,                                          /* response_length */
2450                shadow ? 6 : 4,                             /* msg_length */
2451                0,                                          /* eot */
2452                1,
2453                BRW_SAMPLER_SIMD_MODE_SIMD8);
2454
2455     if (shadow)
2456         brw_MOV(p, dst[3], brw_imm_f(1.0));
2457 }
2458
2459
2460 /**
2461  * Resolve subroutine calls after code emit is done.
2462  */
2463 static void post_wm_emit( struct brw_wm_compile *c )
2464 {
2465     brw_resolve_cals(&c->func);
2466 }
2467
2468 static void
2469 get_argument_regs(struct brw_wm_compile *c,
2470                   const struct prog_instruction *inst,
2471                   int index,
2472                   struct brw_reg *regs,
2473                   int mask)
2474 {
2475     int i;
2476
2477     for (i = 0; i < 4; i++) {
2478         if (mask & (1 << i))
2479             regs[i] = get_src_reg(c, inst, index, i);
2480     }
2481 }
2482
2483 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2484 {
2485 #define MAX_IF_DEPTH 32
2486 #define MAX_LOOP_DEPTH 32
2487     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2488     GLuint i, if_depth = 0, loop_depth = 0;
2489     struct brw_compile *p = &c->func;
2490     struct brw_indirect stack_index = brw_indirect(0, 0);
2491
2492     c->out_of_regs = GL_FALSE;
2493
2494     prealloc_reg(c);
2495     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2496     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2497
2498     for (i = 0; i < c->nr_fp_insns; i++) {
2499         const struct prog_instruction *inst = &c->prog_instructions[i];
2500         int dst_flags;
2501         struct brw_reg args[3][4], dst[4];
2502         int j;
2503
2504         c->cur_inst = i;
2505
2506 #if 0
2507         _mesa_printf("Inst %d: ", i);
2508         _mesa_print_instruction(inst);
2509 #endif
2510
2511         /* fetch any constants that this instruction needs */
2512         if (c->fp->use_const_buffer)
2513            fetch_constants(c, inst);
2514
2515         if (inst->Opcode != OPCODE_ARL) {
2516            for (j = 0; j < 4; j++) {
2517               if (inst->DstReg.WriteMask & (1 << j))
2518                  dst[j] = get_dst_reg(c, inst, j);
2519               else
2520                  dst[j] = brw_null_reg();
2521            }
2522         }
2523         for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2524             get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2525
2526         dst_flags = inst->DstReg.WriteMask;
2527         if (inst->SaturateMode == SATURATE_ZERO_ONE)
2528             dst_flags |= SATURATE;
2529
2530         if (inst->CondUpdate)
2531             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2532         else
2533             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2534
2535         dst_flags = inst->DstReg.WriteMask;
2536         if (inst->SaturateMode == SATURATE_ZERO_ONE)
2537             dst_flags |= SATURATE;
2538
2539         switch (inst->Opcode) {
2540             case WM_PIXELXY:
2541                 emit_pixel_xy(c, inst);
2542                 break;
2543             case WM_DELTAXY:
2544                 emit_delta_xy(c, inst);
2545                 break;
2546             case WM_PIXELW:
2547                 emit_pixel_w(c, inst);
2548                 break;
2549             case WM_LINTERP:
2550                 emit_linterp(c, inst);
2551                 break;
2552             case WM_PINTERP:
2553                 emit_pinterp(c, inst);
2554                 break;
2555             case WM_CINTERP:
2556                 emit_cinterp(c, inst);
2557                 break;
2558             case WM_WPOSXY:
2559                 emit_wpos_xy(c, inst);
2560                 break;
2561             case WM_FB_WRITE:
2562                 emit_fb_write(c, inst);
2563                 break;
2564             case WM_FRONTFACING:
2565                 emit_frontfacing(c, inst);
2566                 break;
2567             case OPCODE_ADD:
2568                 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2569                 break;
2570             case OPCODE_ARL:
2571                 emit_arl(c, inst);
2572                 break;
2573             case OPCODE_FRC:
2574                 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2575                 break;
2576             case OPCODE_FLR:
2577                 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2578                 break;
2579             case OPCODE_LRP:
2580                 unalias3(c, emit_lrp,
2581                          dst, dst_flags, args[0], args[1], args[2]);
2582                 break;
2583             case OPCODE_TRUNC:
2584                 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2585                 break;
2586             case OPCODE_MOV:
2587             case OPCODE_SWZ:
2588                 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2589                 break;
2590             case OPCODE_DP3:
2591                 emit_dp3(p, dst, dst_flags, args[0], args[1]);
2592                 break;
2593             case OPCODE_DP4:
2594                 emit_dp4(p, dst, dst_flags, args[0], args[1]);
2595                 break;
2596             case OPCODE_XPD:
2597                 emit_xpd(c, inst);
2598                 break;
2599             case OPCODE_DPH:
2600                 emit_dph(p, dst, dst_flags, args[0], args[1]);
2601                 break;
2602             case OPCODE_RCP:
2603                 emit_rcp(c, inst);
2604                 break;
2605             case OPCODE_RSQ:
2606                 emit_rsq(c, inst);
2607                 break;
2608             case OPCODE_SIN:
2609                 emit_sin(c, inst);
2610                 break;
2611             case OPCODE_COS:
2612                 emit_cos(c, inst);
2613                 break;
2614             case OPCODE_EX2:
2615                 emit_ex2(c, inst);
2616                 break;
2617             case OPCODE_LG2:
2618                 emit_lg2(c, inst);
2619                 break;
2620             case OPCODE_MIN:
2621             case OPCODE_MAX:
2622                 emit_min_max(c, inst);
2623                 break;
2624             case OPCODE_DDX:
2625             case OPCODE_DDY:
2626                 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2627                           args[0]);
2628                 break;
2629             case OPCODE_SLT:
2630                 emit_sop(p, dst, dst_flags,
2631                          BRW_CONDITIONAL_L, args[0], args[1]);
2632                 break;
2633             case OPCODE_SLE:
2634                 emit_sop(p, dst, dst_flags,
2635                          BRW_CONDITIONAL_LE, args[0], args[1]);
2636                 break;
2637             case OPCODE_SGT:
2638                 emit_sop(p, dst, dst_flags,
2639                          BRW_CONDITIONAL_G, args[0], args[1]);
2640                 break;
2641             case OPCODE_SGE:
2642                 emit_sop(p, dst, dst_flags,
2643                          BRW_CONDITIONAL_GE, args[0], args[1]);
2644                 break;
2645             case OPCODE_SEQ:
2646                 emit_sop(p, dst, dst_flags,
2647                          BRW_CONDITIONAL_EQ, args[0], args[1]);
2648                 break;
2649             case OPCODE_SNE:
2650                 emit_sop(p, dst, dst_flags,
2651                          BRW_CONDITIONAL_NEQ, args[0], args[1]);
2652                 break;
2653             case OPCODE_MUL:
2654                 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2655                 break;
2656             case OPCODE_POW:
2657                 emit_pow(c, inst);
2658                 break;
2659             case OPCODE_MAD:
2660                 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2661                 break;
2662             case OPCODE_NOISE1:
2663                 emit_noise1(c, inst);
2664                 break;
2665             case OPCODE_NOISE2:
2666                 emit_noise2(c, inst);
2667                 break;
2668             case OPCODE_NOISE3:
2669                 emit_noise3(c, inst);
2670                 break;
2671             case OPCODE_NOISE4:
2672                 emit_noise4(c, inst);
2673                 break;
2674             case OPCODE_TEX:
2675                 emit_tex(c, inst);
2676                 break;
2677             case OPCODE_TXB:
2678                 emit_txb(c, inst);
2679                 break;
2680             case OPCODE_KIL_NV:
2681                 emit_kil(c);
2682                 break;
2683             case OPCODE_IF:
2684                 assert(if_depth < MAX_IF_DEPTH);
2685                 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2686                 break;
2687             case OPCODE_ELSE:
2688                 if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
2689                 break;
2690             case OPCODE_ENDIF:
2691                 assert(if_depth > 0);
2692                 brw_ENDIF(p, if_inst[--if_depth]);
2693                 break;
2694             case OPCODE_BGNSUB:
2695                 brw_save_label(p, inst->Comment, p->nr_insn);
2696                 break;
2697             case OPCODE_ENDSUB:
2698                 /* no-op */
2699                 break;
2700             case OPCODE_CAL:
2701                 brw_push_insn_state(p);
2702                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2703                 brw_set_access_mode(p, BRW_ALIGN_1);
2704                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2705                 brw_set_access_mode(p, BRW_ALIGN_16);
2706                 brw_ADD(p, get_addr_reg(stack_index),
2707                          get_addr_reg(stack_index), brw_imm_d(4));
2708                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2709                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2710                 brw_pop_insn_state(p);
2711                 break;
2712
2713             case OPCODE_RET:
2714                 brw_push_insn_state(p);
2715                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2716                 brw_ADD(p, get_addr_reg(stack_index),
2717                         get_addr_reg(stack_index), brw_imm_d(-4));
2718                 brw_set_access_mode(p, BRW_ALIGN_1);
2719                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2720                 brw_set_access_mode(p, BRW_ALIGN_16);
2721                 brw_pop_insn_state(p);
2722
2723                 break;
2724             case OPCODE_BGNLOOP:
2725                 /* XXX may need to invalidate the current_constant regs */
2726                 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2727                 break;
2728             case OPCODE_BRK:
2729                 brw_BREAK(p);
2730                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2731                 break;
2732             case OPCODE_CONT:
2733                 brw_CONT(p);
2734                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2735                 break;
2736             case OPCODE_ENDLOOP:
2737                {
2738                   struct brw_instruction *inst0, *inst1;
2739                   GLuint br = 1;
2740
2741                   if (BRW_IS_IGDNG(brw))
2742                      br = 2;
2743
2744                   loop_depth--;
2745                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2746                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
2747                   while (inst0 > loop_inst[loop_depth]) {
2748                      inst0--;
2749                      if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2750                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2751                         inst0->bits3.if_else.pop_count = 0;
2752                      }
2753                      else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2754                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2755                         inst0->bits3.if_else.pop_count = 0;
2756                      }
2757                   }
2758                }
2759                break;
2760             default:
2761                 _mesa_printf("unsupported IR in fragment shader %d\n",
2762                         inst->Opcode);
2763         }
2764
2765         if (inst->CondUpdate)
2766             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2767         else
2768             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2769     }
2770     post_wm_emit(c);
2771
2772     if (INTEL_DEBUG & DEBUG_WM) {
2773       _mesa_printf("wm-native:\n");
2774       for (i = 0; i < p->nr_insn; i++)
2775          brw_disasm(stderr, &p->store[i]);
2776       _mesa_printf("\n");
2777     }
2778 }
2779
2780 /**
2781  * Do GPU code generation for shaders that use GLSL features such as
2782  * flow control.  Other shaders will be compiled with the
2783  */
2784 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2785 {
2786     if (INTEL_DEBUG & DEBUG_WM) {
2787         _mesa_printf("brw_wm_glsl_emit:\n");
2788     }
2789
2790     /* initial instruction translation/simplification */
2791     brw_wm_pass_fp(c);
2792
2793     /* actual code generation */
2794     brw_wm_emit_glsl(brw, c);
2795
2796     if (INTEL_DEBUG & DEBUG_WM) {
2797         brw_wm_print_program(c, "brw_wm_glsl_emit done");
2798     }
2799
2800     c->prog_data.total_grf = num_grf_used(c);
2801     c->prog_data.total_scratch = 0;
2802 }