src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
  14                                   const struct prog_instruction *inst,
  15                                   GLuint component);
  16
  17 /**
  18  * Determine if the given fragment program uses GLSL features such
  19  * as flow conditionals, loops, subroutines.
  20  * Some GLSL shaders may use these features, others might not.
  21  */
  22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  23 {
  24     int i;
  25
  26     for (i = 0; i < fp->Base.NumInstructions; i++) {
  27         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  28         switch (inst->Opcode) {
  29             case OPCODE_ARL:
  30             case OPCODE_IF:
  31             case OPCODE_ENDIF:
  32             case OPCODE_CAL:
  33             case OPCODE_BRK:
  34             case OPCODE_RET:
  35             case OPCODE_NOISE1:
  36             case OPCODE_NOISE2:
  37             case OPCODE_NOISE3:
  38             case OPCODE_NOISE4:
  39             case OPCODE_BGNLOOP:
  40                 return GL_TRUE;
  41             default:
  42                 break;
  43         }
  44     }
  45     return GL_FALSE;
  46 }
  47
  48
  49
  50 static void
  51 reclaim_temps(struct brw_wm_compile *c);
  52
  53
  54 /** Mark GRF register as used. */
  55 static void
  56 prealloc_grf(struct brw_wm_compile *c, int r)
  57 {
  58    c->used_grf[r] = GL_TRUE;
  59 }
  60
  61
  62 /** Mark given GRF register as not in use. */
  63 static void
  64 release_grf(struct brw_wm_compile *c, int r)
  65 {
  66    /*assert(c->used_grf[r]);*/
  67    c->used_grf[r] = GL_FALSE;
  68    c->first_free_grf = MIN2(c->first_free_grf, r);
  69 }
  70
  71
  72 /** Return index of a free GRF, mark it as used. */
  73 static int
  74 alloc_grf(struct brw_wm_compile *c)
  75 {
  76    GLuint r;
  77    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  78       if (!c->used_grf[r]) {
  79          c->used_grf[r] = GL_TRUE;
  80          c->first_free_grf = r + 1;  /* a guess */
  81          return r;
  82       }
  83    }
  84
  85    /* no free temps, try to reclaim some */
  86    reclaim_temps(c);
  87    c->first_free_grf = 0;
  88
  89    /* try alloc again */
  90    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  91       if (!c->used_grf[r]) {
  92          c->used_grf[r] = GL_TRUE;
  93          c->first_free_grf = r + 1;  /* a guess */
  94          return r;
  95       }
  96    }
  97
  98    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  99       assert(c->used_grf[r]);
 100    }
 101
 102    /* really, no free GRF regs found */
 103    if (!c->out_of_regs) {
 104       /* print warning once per compilation */
 105       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 106       c->out_of_regs = GL_TRUE;
 107    }
 108
 109    return -1;
 110 }
 111
 112
 113 /** Return number of GRF registers used */
 114 static int
 115 num_grf_used(const struct brw_wm_compile *c)
 116 {
 117    int r;
 118    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 119       if (c->used_grf[r])
 120          return r + 1;
 121    return 0;
 122 }
 123
 124
 125
 126 /**
 127  * Record the mapping of a Mesa register to a hardware register.
 128  */
 129 static void set_reg(struct brw_wm_compile *c, int file, int index,
 130         int component, struct brw_reg reg)
 131 {
 132     c->wm_regs[file][index][component].reg = reg;
 133     c->wm_regs[file][index][component].inited = GL_TRUE;
 134 }
 135
 136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 137 {
 138     struct brw_reg reg;
 139
 140     /* if we need to allocate another temp, grow the tmp_regs[] array */
 141     if (c->tmp_index == c->tmp_max) {
 142        int r = alloc_grf(c);
 143        if (r < 0) {
 144           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 145           r = 50; /* XXX random register! */
 146        }
 147        c->tmp_regs[ c->tmp_max++ ] = r;
 148     }
 149
 150     /* form the GRF register */
 151     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 152     /*printf("alloc_temp %d\n", reg.nr);*/
 153     assert(reg.nr < BRW_WM_MAX_GRF);
 154     return reg;
 155
 156 }
 157
 158 /**
 159  * Save current temp register info.
 160  * There must be a matching call to release_tmps().
 161  */
 162 static int mark_tmps(struct brw_wm_compile *c)
 163 {
 164     return c->tmp_index;
 165 }
 166
 167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 168 {
 169     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 170 }
 171
 172 static void release_tmps(struct brw_wm_compile *c, int mark)
 173 {
 174     c->tmp_index = mark;
 175 }
 176
 177 /**
 178  * Convert Mesa src register to brw register.
 179  *
 180  * Since we're running in SOA mode each Mesa register corresponds to four
 181  * hardware registers.  We allocate the hardware registers as needed here.
 182  *
 183  * \param file  register file, one of PROGRAM_x
 184  * \param index  register number
 185  * \param component  src component (X=0, Y=1, Z=2, W=3)
 186  * \param nr  not used?!?
 187  * \param neg  negate value?
 188  * \param abs  take absolute value?
 189  */
 190 static struct brw_reg
 191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 192         int nr, GLuint neg, GLuint abs)
 193 {
 194     struct brw_reg reg;
 195     switch (file) {
 196         case PROGRAM_STATE_VAR:
 197         case PROGRAM_CONSTANT:
 198         case PROGRAM_UNIFORM:
 199             file = PROGRAM_STATE_VAR;
 200             break;
 201         case PROGRAM_UNDEFINED:
 202             return brw_null_reg();
 203         case PROGRAM_TEMPORARY:
 204         case PROGRAM_INPUT:
 205         case PROGRAM_OUTPUT:
 206         case PROGRAM_PAYLOAD:
 207             break;
 208         default:
 209             _mesa_problem(NULL, "Unexpected file in get_reg()");
 210             return brw_null_reg();
 211     }
 212
 213     assert(index < 256);
 214     assert(component < 4);
 215
 216     /* see if we've already allocated a HW register for this Mesa register */
 217     if (c->wm_regs[file][index][component].inited) {
 218        /* yes, re-use */
 219        reg = c->wm_regs[file][index][component].reg;
 220     }
 221     else {
 222         /* no, allocate new register */
 223        int grf = alloc_grf(c);
 224        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 225        if (grf < 0) {
 226           /* totally out of temps */
 227           grf = 51; /* XXX random register! */
 228        }
 229
 230        reg = brw_vec8_grf(grf, 0);
 231        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 232
 233        set_reg(c, file, index, component, reg);
 234     }
 235
 236     if (neg & (1 << component)) {
 237         reg = negate(reg);
 238     }
 239     if (abs)
 240         reg = brw_abs(reg);
 241     return reg;
 242 }
 243
 244
 245
 246 /**
 247  * This is called if we run out of GRF registers.  Examine the live intervals
 248  * of temp regs in the program and free those which won't be used again.
 249  */
 250 static void
 251 reclaim_temps(struct brw_wm_compile *c)
 252 {
 253    GLint intBegin[MAX_PROGRAM_TEMPS];
 254    GLint intEnd[MAX_PROGRAM_TEMPS];
 255    int index;
 256
 257    /*printf("Reclaim temps:\n");*/
 258
 259    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 260                              intBegin, intEnd);
 261
 262    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 263       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 264          /* program temp[i] can be freed */
 265          int component;
 266          /*printf("  temp[%d] is dead\n", index);*/
 267          for (component = 0; component < 4; component++) {
 268             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 269                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 270                release_grf(c, r);
 271                /*
 272                printf("  Reclaim temp %d, reg %d at inst %d\n",
 273                       index, r, c->cur_inst);
 274                */
 275                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 276             }
 277          }
 278       }
 279    }
 280 }
 281
 282
 283
 284
 285 /**
 286  * Preallocate registers.  This sets up the Mesa to hardware register
 287  * mapping for certain registers, such as constants (uniforms/state vars)
 288  * and shader inputs.
 289  */
 290 static void prealloc_reg(struct brw_wm_compile *c)
 291 {
 292     int i, j;
 293     struct brw_reg reg;
 294     int urb_read_length = 0;
 295     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
 296     GLuint reg_index = 0;
 297
 298     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 299     c->first_free_grf = 0;
 300
 301     for (i = 0; i < 4; i++) {
 302         if (i < c->key.nr_depth_regs)
 303             reg = brw_vec8_grf(i * 2, 0);
 304         else
 305             reg = brw_vec8_grf(0, 0);
 306         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 307     }
 308     reg_index += 2 * c->key.nr_depth_regs;
 309
 310     /* constants */
 311     {
 312         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 313         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 314
 315         /* use a real constant buffer, or just use a section of the GRF? */
 316         /* XXX this heuristic may need adjustment... */
 317         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 318            c->fp->use_const_buffer = GL_TRUE;
 319         else
 320            c->fp->use_const_buffer = GL_FALSE;
 321         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 322
 323         if (c->fp->use_const_buffer) {
 324            /* We'll use a real constant buffer and fetch constants from
 325             * it with a dataport read message.
 326             */
 327
 328            /* number of float constants in CURBE */
 329            c->prog_data.nr_params = 0;
 330         }
 331         else {
 332            const struct gl_program_parameter_list *plist =
 333               c->fp->program.Base.Parameters;
 334            int index = 0;
 335
 336            /* number of float constants in CURBE */
 337            c->prog_data.nr_params = 4 * nr_params;
 338
 339            /* loop over program constants (float[4]) */
 340            for (i = 0; i < nr_params; i++) {
 341               /* loop over XYZW channels */
 342               for (j = 0; j < 4; j++, index++) {
 343                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 344                  /* Save pointer to parameter/constant value.
 345                   * Constants will be copied in prepare_constant_buffer()
 346                   */
 347                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 348                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 349               }
 350            }
 351            /* number of constant regs used (each reg is float[8]) */
 352            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 353            reg_index += c->nr_creg;
 354         }
 355     }
 356
 357     /* fragment shader inputs */
 358     for (i = 0; i < VERT_RESULT_MAX; i++) {
 359        int fp_input;
 360
 361        if (i >= VERT_RESULT_VAR0)
 362           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 363        else if (i <= VERT_RESULT_TEX7)
 364           fp_input = i;
 365        else
 366           fp_input = -1;
 367
 368        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 369           urb_read_length = reg_index;
 370           reg = brw_vec8_grf(reg_index, 0);
 371           for (j = 0; j < 4; j++)
 372              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 373        }
 374        if (c->key.vp_outputs_written & (1 << i)) {
 375           reg_index += 2;
 376        }
 377     }
 378
 379     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 380     c->prog_data.urb_read_length = urb_read_length;
 381     c->prog_data.curb_read_length = c->nr_creg;
 382     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 383     reg_index++;
 384     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 385     reg_index += 2;
 386
 387     /* mark GRF regs [0..reg_index-1] as in-use */
 388     for (i = 0; i < reg_index; i++)
 389        prealloc_grf(c, i);
 390
 391     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 392     prealloc_grf(c, 126);
 393     prealloc_grf(c, 127);
 394
 395     for (i = 0; i < c->nr_fp_insns; i++) {
 396         const struct prog_instruction *inst = &c->prog_instructions[i];
 397         struct brw_reg dst[4];
 398
 399         switch (inst->Opcode) {
 400         case OPCODE_TEX:
 401         case OPCODE_TXB:
 402             /* Allocate the channels of texture results contiguously,
 403              * since they are written out that way by the sampler unit.
 404              */
 405             for (j = 0; j < 4; j++) {
 406                 dst[j] = get_dst_reg(c, inst, j);
 407                 if (j != 0)
 408                     assert(dst[j].nr == dst[j - 1].nr + 1);
 409             }
 410             break;
 411         default:
 412             break;
 413         }
 414     }
 415
 416     /* An instruction may reference up to three constants.
 417      * They'll be found in these registers.
 418      * XXX alloc these on demand!
 419      */
 420     if (c->fp->use_const_buffer) {
 421        for (i = 0; i < 3; i++) {
 422           c->current_const[i].index = -1;
 423           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 424        }
 425     }
 426 #if 0
 427     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 428     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 429 #endif
 430 }
 431
 432
 433 /**
 434  * Check if any of the instruction's src registers are constants, uniforms,
 435  * or statevars.  If so, fetch any constants that we don't already have in
 436  * the three GRF slots.
 437  */
 438 static void fetch_constants(struct brw_wm_compile *c,
 439                             const struct prog_instruction *inst)
 440 {
 441    struct brw_compile *p = &c->func;
 442    GLuint i;
 443
 444    /* loop over instruction src regs */
 445    for (i = 0; i < 3; i++) {
 446       const struct prog_src_register *src = &inst->SrcReg[i];
 447       if (src->File == PROGRAM_STATE_VAR ||
 448           src->File == PROGRAM_CONSTANT ||
 449           src->File == PROGRAM_UNIFORM) {
 450          c->current_const[i].index = src->Index;
 451
 452 #if 0
 453          printf("  fetch const[%d] for arg %d into reg %d\n",
 454                 src->Index, i, c->current_const[i].reg.nr);
 455 #endif
 456
 457          /* need to fetch the constant now */
 458          brw_dp_READ_4(p,
 459                        c->current_const[i].reg,  /* writeback dest */
 460                        src->RelAddr,             /* relative indexing? */
 461                        16 * src->Index,          /* byte offset */
 462                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 463                        );
 464       }
 465    }
 466 }
 467
 468
 469 /**
 470  * Convert Mesa dst register to brw register.
 471  */
 472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 473                                   const struct prog_instruction *inst,
 474                                   GLuint component)
 475 {
 476     const int nr = 1;
 477     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 478             0, 0);
 479 }
 480
 481
 482 static struct brw_reg
 483 get_src_reg_const(struct brw_wm_compile *c,
 484                   const struct prog_instruction *inst,
 485                   GLuint srcRegIndex, GLuint component)
 486 {
 487    /* We should have already fetched the constant from the constant
 488     * buffer in fetch_constants().  Now we just have to return a
 489     * register description that extracts the needed component and
 490     * smears it across all eight vector components.
 491     */
 492    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 493    struct brw_reg const_reg;
 494
 495    assert(component < 4);
 496    assert(srcRegIndex < 3);
 497    assert(c->current_const[srcRegIndex].index != -1);
 498    const_reg = c->current_const[srcRegIndex].reg;
 499
 500    /* extract desired float from the const_reg, and smear */
 501    const_reg = stride(const_reg, 0, 1, 0);
 502    const_reg.subnr = component * 4;
 503
 504    if (src->Negate & (1 << component))
 505       const_reg = negate(const_reg);
 506    if (src->Abs)
 507       const_reg = brw_abs(const_reg);
 508
 509 #if 0
 510    printf("  form const[%d].%d for arg %d, reg %d\n",
 511           c->current_const[srcRegIndex].index,
 512           component,
 513           srcRegIndex,
 514           const_reg.nr);
 515 #endif
 516
 517    return const_reg;
 518 }
 519
 520
 521 /**
 522  * Convert Mesa src register to brw register.
 523  */
 524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 525                                   const struct prog_instruction *inst,
 526                                   GLuint srcRegIndex, GLuint channel)
 527 {
 528     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 529     const GLuint nr = 1;
 530     const GLuint component = GET_SWZ(src->Swizzle, channel);
 531
 532     /* Extended swizzle terms */
 533     if (component == SWIZZLE_ZERO) {
 534        return brw_imm_f(0.0F);
 535     }
 536     else if (component == SWIZZLE_ONE) {
 537        return brw_imm_f(1.0F);
 538     }
 539
 540     if (c->fp->use_const_buffer &&
 541         (src->File == PROGRAM_STATE_VAR ||
 542          src->File == PROGRAM_CONSTANT ||
 543          src->File == PROGRAM_UNIFORM)) {
 544        return get_src_reg_const(c, inst, srcRegIndex, component);
 545     }
 546     else {
 547        /* other type of source register */
 548        return get_reg(c, src->File, src->Index, component, nr,
 549                       src->Negate, src->Abs);
 550     }
 551 }
 552
 553
 554 /**
 555  * Same as \sa get_src_reg() but if the register is a literal, emit
 556  * a brw_reg encoding the literal.
 557  * Note that a brw instruction only allows one src operand to be a literal.
 558  * For instructions with more than one operand, only the second can be a
 559  * literal.  This means that we treat some literals as constants/uniforms
 560  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 561  *
 562  */
 563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 564                                       const struct prog_instruction *inst,
 565                                       GLuint srcRegIndex, GLuint channel)
 566 {
 567     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 568     if (src->File == PROGRAM_CONSTANT) {
 569        /* a literal */
 570        const int component = GET_SWZ(src->Swizzle, channel);
 571        const GLfloat *param =
 572           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 573        GLfloat value = param[component];
 574        if (src->Negate & (1 << channel))
 575           value = -value;
 576        if (src->Abs)
 577           value = FABSF(value);
 578 #if 0
 579        printf("  form immed value %f for chan %d\n", value, channel);
 580 #endif
 581        return brw_imm_f(value);
 582     }
 583     else {
 584        return get_src_reg(c, inst, srcRegIndex, channel);
 585     }
 586 }
 587
 588
 589 /**
 590  * Subroutines are minimal support for resusable instruction sequences.
 591  * They are implemented as simply as possible to minimise overhead: there
 592  * is no explicit support for communication between the caller and callee
 593  * other than saving the return address in a temporary register, nor is
 594  * there any automatic local storage.  This implies that great care is
 595  * required before attempting reentrancy or any kind of nested
 596  * subroutine invocations.
 597  */
 598 static void invoke_subroutine( struct brw_wm_compile *c,
 599                                enum _subroutine subroutine,
 600                                void (*emit)( struct brw_wm_compile * ) )
 601 {
 602     struct brw_compile *p = &c->func;
 603
 604     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 605
 606     if( c->subroutines[ subroutine ] ) {
 607         /* subroutine previously emitted: reuse existing instructions */
 608
 609         int mark = mark_tmps( c );
 610         struct brw_reg return_address = retype( alloc_tmp( c ),
 611                                                 BRW_REGISTER_TYPE_UD );
 612         int here = p->nr_insn;
 613
 614         brw_push_insn_state(p);
 615         brw_set_mask_control(p, BRW_MASK_DISABLE);
 616         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 617
 618         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 619                  brw_imm_d( ( c->subroutines[ subroutine ] -
 620                               here - 1 ) << 4 ) );
 621         brw_pop_insn_state(p);
 622
 623         release_tmps( c, mark );
 624     } else {
 625         /* previously unused subroutine: emit, and mark for later reuse */
 626
 627         int mark = mark_tmps( c );
 628         struct brw_reg return_address = retype( alloc_tmp( c ),
 629                                                 BRW_REGISTER_TYPE_UD );
 630         struct brw_instruction *calc;
 631         int base = p->nr_insn;
 632
 633         brw_push_insn_state(p);
 634         brw_set_mask_control(p, BRW_MASK_DISABLE);
 635         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 636         brw_pop_insn_state(p);
 637
 638         c->subroutines[ subroutine ] = p->nr_insn;
 639
 640         emit( c );
 641
 642         brw_push_insn_state(p);
 643         brw_set_mask_control(p, BRW_MASK_DISABLE);
 644         brw_MOV( p, brw_ip_reg(), return_address );
 645         brw_pop_insn_state(p);
 646
 647         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 648
 649         release_tmps( c, mark );
 650     }
 651 }
 652
 653 static void emit_trunc( struct brw_wm_compile *c,
 654                         const struct prog_instruction *inst)
 655 {
 656     int i;
 657     struct brw_compile *p = &c->func;
 658     GLuint mask = inst->DstReg.WriteMask;
 659     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 660     for (i = 0; i < 4; i++) {
 661         if (mask & (1<<i)) {
 662             struct brw_reg src, dst;
 663             dst = get_dst_reg(c, inst, i);
 664             src = get_src_reg(c, inst, 0, i);
 665             brw_RNDZ(p, dst, src);
 666         }
 667     }
 668     brw_set_saturate(p, 0);
 669 }
 670
 671 static void emit_pixel_xy(struct brw_wm_compile *c,
 672                           const struct prog_instruction *inst)
 673 {
 674     struct brw_reg r1 = brw_vec1_grf(1, 0);
 675     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 676
 677     struct brw_reg dst0, dst1;
 678     struct brw_compile *p = &c->func;
 679     GLuint mask = inst->DstReg.WriteMask;
 680
 681     dst0 = get_dst_reg(c, inst, 0);
 682     dst1 = get_dst_reg(c, inst, 1);
 683     /* Calculate pixel centers by adding 1 or 0 to each of the
 684      * micro-tile coordinates passed in r1.
 685      */
 686     if (mask & WRITEMASK_X) {
 687         brw_ADD(p,
 688                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 689                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 690                 brw_imm_v(0x10101010));
 691     }
 692
 693     if (mask & WRITEMASK_Y) {
 694         brw_ADD(p,
 695                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 696                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 697                 brw_imm_v(0x11001100));
 698     }
 699 }
 700
 701 static void emit_delta_xy(struct brw_wm_compile *c,
 702                           const struct prog_instruction *inst)
 703 {
 704     struct brw_reg r1 = brw_vec1_grf(1, 0);
 705     struct brw_reg dst0, dst1, src0, src1;
 706     struct brw_compile *p = &c->func;
 707     GLuint mask = inst->DstReg.WriteMask;
 708
 709     dst0 = get_dst_reg(c, inst, 0);
 710     dst1 = get_dst_reg(c, inst, 1);
 711     src0 = get_src_reg(c, inst, 0, 0);
 712     src1 = get_src_reg(c, inst, 0, 1);
 713     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 714      * centers.
 715      */
 716     if (mask & WRITEMASK_X) {
 717         brw_ADD(p,
 718                 dst0,
 719                 retype(src0, BRW_REGISTER_TYPE_UW),
 720                 negate(r1));
 721     }
 722
 723     if (mask & WRITEMASK_Y) {
 724         brw_ADD(p,
 725                 dst1,
 726                 retype(src1, BRW_REGISTER_TYPE_UW),
 727                 negate(suboffset(r1,1)));
 728
 729     }
 730 }
 731
 732 static void fire_fb_write( struct brw_wm_compile *c,
 733                            GLuint base_reg,
 734                            GLuint nr,
 735                            GLuint target,
 736                            GLuint eot)
 737 {
 738     struct brw_compile *p = &c->func;
 739     /* Pass through control information:
 740      */
 741     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 742     {
 743         brw_push_insn_state(p);
 744         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 745         brw_MOV(p,
 746                 brw_message_reg(base_reg + 1),
 747                 brw_vec8_grf(1, 0));
 748         brw_pop_insn_state(p);
 749     }
 750     /* Send framebuffer write message: */
 751     brw_fb_WRITE(p,
 752             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 753             base_reg,
 754             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 755             target,
 756             nr,
 757             0,
 758             eot);
 759 }
 760
 761 static void emit_fb_write(struct brw_wm_compile *c,
 762                           const struct prog_instruction *inst)
 763 {
 764     struct brw_compile *p = &c->func;
 765     int nr = 2;
 766     int channel;
 767     GLuint target, eot;
 768     struct brw_reg src0;
 769
 770     /* Reserve a space for AA - may not be needed:
 771      */
 772     if (c->key.aa_dest_stencil_reg)
 773         nr += 1;
 774
 775     brw_push_insn_state(p);
 776     for (channel = 0; channel < 4; channel++) {
 777         src0 = get_src_reg(c,  inst, 0, channel);
 778         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 779         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 780         brw_MOV(p, brw_message_reg(nr + channel), src0);
 781     }
 782     /* skip over the regs populated above: */
 783     nr += 8;
 784     brw_pop_insn_state(p);
 785
 786     if (c->key.source_depth_to_render_target) {
 787        if (c->key.computes_depth) {
 788           src0 = get_src_reg(c, inst, 2, 2);
 789           brw_MOV(p, brw_message_reg(nr), src0);
 790        }
 791        else {
 792           src0 = get_src_reg(c, inst, 1, 1);
 793           brw_MOV(p, brw_message_reg(nr), src0);
 794        }
 795
 796        nr += 2;
 797     }
 798
 799     if (c->key.dest_depth_reg) {
 800         const GLuint comp = c->key.dest_depth_reg / 2;
 801         const GLuint off = c->key.dest_depth_reg % 2;
 802
 803         if (off != 0) {
 804             /* XXX this code needs review/testing */
 805             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 806             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 807
 808             brw_push_insn_state(p);
 809             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 810
 811             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 812             /* 2nd half? */
 813             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 814             brw_pop_insn_state(p);
 815         }
 816         else
 817         {
 818             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 819             brw_MOV(p, brw_message_reg(nr), src);
 820         }
 821         nr += 2;
 822    }
 823
 824     target = INST_AUX_GET_TARGET(inst->Aux);
 825     eot = inst->Aux & INST_AUX_EOT;
 826     fire_fb_write(c, 0, nr, target, eot);
 827 }
 828
 829 static void emit_pixel_w( struct brw_wm_compile *c,
 830                           const struct prog_instruction *inst)
 831 {
 832     struct brw_compile *p = &c->func;
 833     GLuint mask = inst->DstReg.WriteMask;
 834     if (mask & WRITEMASK_W) {
 835         struct brw_reg dst, src0, delta0, delta1;
 836         struct brw_reg interp3;
 837
 838         dst = get_dst_reg(c, inst, 3);
 839         src0 = get_src_reg(c, inst, 0, 0);
 840         delta0 = get_src_reg(c, inst, 1, 0);
 841         delta1 = get_src_reg(c, inst, 1, 1);
 842
 843         interp3 = brw_vec1_grf(src0.nr+1, 4);
 844         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 845          * result straight into a message reg.
 846          */
 847         brw_LINE(p, brw_null_reg(), interp3, delta0);
 848         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 849
 850         /* Calc w */
 851         brw_math_16( p, dst,
 852                 BRW_MATH_FUNCTION_INV,
 853                 BRW_MATH_SATURATE_NONE,
 854                 2, brw_null_reg(),
 855                 BRW_MATH_PRECISION_FULL);
 856     }
 857 }
 858
 859 static void emit_linterp(struct brw_wm_compile *c,
 860                          const struct prog_instruction *inst)
 861 {
 862     struct brw_compile *p = &c->func;
 863     GLuint mask = inst->DstReg.WriteMask;
 864     struct brw_reg interp[4];
 865     struct brw_reg dst, delta0, delta1;
 866     struct brw_reg src0;
 867     GLuint nr, i;
 868
 869     src0 = get_src_reg(c, inst, 0, 0);
 870     delta0 = get_src_reg(c, inst, 1, 0);
 871     delta1 = get_src_reg(c, inst, 1, 1);
 872     nr = src0.nr;
 873
 874     interp[0] = brw_vec1_grf(nr, 0);
 875     interp[1] = brw_vec1_grf(nr, 4);
 876     interp[2] = brw_vec1_grf(nr+1, 0);
 877     interp[3] = brw_vec1_grf(nr+1, 4);
 878
 879     for(i = 0; i < 4; i++ ) {
 880         if (mask & (1<<i)) {
 881             dst = get_dst_reg(c, inst, i);
 882             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 883             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 884         }
 885     }
 886 }
 887
 888 static void emit_cinterp(struct brw_wm_compile *c,
 889                          const struct prog_instruction *inst)
 890 {
 891     struct brw_compile *p = &c->func;
 892     GLuint mask = inst->DstReg.WriteMask;
 893
 894     struct brw_reg interp[4];
 895     struct brw_reg dst, src0;
 896     GLuint nr, i;
 897
 898     src0 = get_src_reg(c, inst, 0, 0);
 899     nr = src0.nr;
 900
 901     interp[0] = brw_vec1_grf(nr, 0);
 902     interp[1] = brw_vec1_grf(nr, 4);
 903     interp[2] = brw_vec1_grf(nr+1, 0);
 904     interp[3] = brw_vec1_grf(nr+1, 4);
 905
 906     for(i = 0; i < 4; i++ ) {
 907         if (mask & (1<<i)) {
 908             dst = get_dst_reg(c, inst, i);
 909             brw_MOV(p, dst, suboffset(interp[i],3));
 910         }
 911     }
 912 }
 913
 914 static void emit_pinterp(struct brw_wm_compile *c,
 915                          const struct prog_instruction *inst)
 916 {
 917     struct brw_compile *p = &c->func;
 918     GLuint mask = inst->DstReg.WriteMask;
 919
 920     struct brw_reg interp[4];
 921     struct brw_reg dst, delta0, delta1;
 922     struct brw_reg src0, w;
 923     GLuint nr, i;
 924
 925     src0 = get_src_reg(c, inst, 0, 0);
 926     delta0 = get_src_reg(c, inst, 1, 0);
 927     delta1 = get_src_reg(c, inst, 1, 1);
 928     w = get_src_reg(c, inst, 2, 3);
 929     nr = src0.nr;
 930
 931     interp[0] = brw_vec1_grf(nr, 0);
 932     interp[1] = brw_vec1_grf(nr, 4);
 933     interp[2] = brw_vec1_grf(nr+1, 0);
 934     interp[3] = brw_vec1_grf(nr+1, 4);
 935
 936     for(i = 0; i < 4; i++ ) {
 937         if (mask & (1<<i)) {
 938             dst = get_dst_reg(c, inst, i);
 939             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 940             brw_MAC(p, dst, suboffset(interp[i],1),
 941                     delta1);
 942             brw_MUL(p, dst, dst, w);
 943         }
 944     }
 945 }
 946
 947 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 948 static void emit_frontfacing(struct brw_wm_compile *c,
 949                              const struct prog_instruction *inst)
 950 {
 951     struct brw_compile *p = &c->func;
 952     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 953     struct brw_reg dst;
 954     GLuint mask = inst->DstReg.WriteMask;
 955     int i;
 956
 957     for (i = 0; i < 4; i++) {
 958         if (mask & (1<<i)) {
 959             dst = get_dst_reg(c, inst, i);
 960             brw_MOV(p, dst, brw_imm_f(0.0));
 961         }
 962     }
 963
 964     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 965      * us front face
 966      */
 967     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 968     for (i = 0; i < 4; i++) {
 969         if (mask & (1<<i)) {
 970             dst = get_dst_reg(c, inst, i);
 971             brw_MOV(p, dst, brw_imm_f(1.0));
 972         }
 973     }
 974     brw_set_predicate_control_flag_value(p, 0xff);
 975 }
 976
 977 static void emit_xpd(struct brw_wm_compile *c,
 978                      const struct prog_instruction *inst)
 979 {
 980     int i;
 981     struct brw_compile *p = &c->func;
 982     GLuint mask = inst->DstReg.WriteMask;
 983     for (i = 0; i < 4; i++) {
 984         GLuint i2 = (i+2)%3;
 985         GLuint i1 = (i+1)%3;
 986         if (mask & (1<<i)) {
 987             struct brw_reg src0, src1, dst;
 988             dst = get_dst_reg(c, inst, i);
 989             src0 = negate(get_src_reg(c, inst, 0, i2));
 990             src1 = get_src_reg_imm(c, inst, 1, i1);
 991             brw_MUL(p, brw_null_reg(), src0, src1);
 992             src0 = get_src_reg(c, inst, 0, i1);
 993             src1 = get_src_reg_imm(c, inst, 1, i2);
 994             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 995             brw_MAC(p, dst, src0, src1);
 996             brw_set_saturate(p, 0);
 997         }
 998     }
 999     brw_set_saturate(p, 0);
1000 }
1001
1002 static void emit_dp3(struct brw_wm_compile *c,
1003                      const struct prog_instruction *inst)
1004 {
1005     struct brw_reg src0[3], src1[3], dst;
1006     int i;
1007     struct brw_compile *p = &c->func;
1008     GLuint mask = inst->DstReg.WriteMask;
1009     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1010
1011     if (!(mask & WRITEMASK_XYZW))
1012         return;
1013
1014     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1015
1016     for (i = 0; i < 3; i++) {
1017         src0[i] = get_src_reg(c, inst, 0, i);
1018         src1[i] = get_src_reg_imm(c, inst, 1, i);
1019     }
1020
1021     dst = get_dst_reg(c, inst, dst_chan);
1022     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1023     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1024     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1025     brw_MAC(p, dst, src0[2], src1[2]);
1026     brw_set_saturate(p, 0);
1027 }
1028
1029 static void emit_dp4(struct brw_wm_compile *c,
1030                      const struct prog_instruction *inst)
1031 {
1032     struct brw_reg src0[4], src1[4], dst;
1033     int i;
1034     struct brw_compile *p = &c->func;
1035     GLuint mask = inst->DstReg.WriteMask;
1036     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1037
1038     if (!(mask & WRITEMASK_XYZW))
1039         return;
1040
1041     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1042
1043     for (i = 0; i < 4; i++) {
1044         src0[i] = get_src_reg(c, inst, 0, i);
1045         src1[i] = get_src_reg_imm(c, inst, 1, i);
1046     }
1047     dst = get_dst_reg(c, inst, dst_chan);
1048     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1049     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1050     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1051     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1052     brw_MAC(p, dst, src0[3], src1[3]);
1053     brw_set_saturate(p, 0);
1054 }
1055
1056 static void emit_dph(struct brw_wm_compile *c,
1057                      const struct prog_instruction *inst)
1058 {
1059     struct brw_reg src0[4], src1[4], dst;
1060     int i;
1061     struct brw_compile *p = &c->func;
1062     GLuint mask = inst->DstReg.WriteMask;
1063     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1064
1065     if (!(mask & WRITEMASK_XYZW))
1066         return;
1067
1068     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1069
1070     for (i = 0; i < 4; i++) {
1071         src0[i] = get_src_reg(c, inst, 0, i);
1072         src1[i] = get_src_reg_imm(c, inst, 1, i);
1073     }
1074     dst = get_dst_reg(c, inst, dst_chan);
1075     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1076     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1077     brw_MAC(p, dst, src0[2], src1[2]);
1078     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1079     brw_ADD(p, dst, dst, src1[3]);
1080     brw_set_saturate(p, 0);
1081 }
1082
1083 /**
1084  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1085  * Note that the result of the function is smeared across the dest
1086  * register's X, Y, Z and W channels (subject to writemasking of course).
1087  */
1088 static void emit_math1(struct brw_wm_compile *c,
1089                        const struct prog_instruction *inst, GLuint func)
1090 {
1091     struct brw_compile *p = &c->func;
1092     struct brw_reg src0, dst;
1093     GLuint mask = inst->DstReg.WriteMask;
1094     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1095
1096     if (!(mask & WRITEMASK_XYZW))
1097         return;
1098
1099     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1100
1101     /* Get first component of source register */
1102     dst = get_dst_reg(c, inst, dst_chan);
1103     src0 = get_src_reg(c, inst, 0, 0);
1104
1105     brw_MOV(p, brw_message_reg(2), src0);
1106     brw_math(p,
1107              dst,
1108              func,
1109              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1110              2,
1111              brw_null_reg(),
1112              BRW_MATH_DATA_VECTOR,
1113              BRW_MATH_PRECISION_FULL);
1114 }
1115
1116 static void emit_rcp(struct brw_wm_compile *c,
1117                      const struct prog_instruction *inst)
1118 {
1119     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1120 }
1121
1122 static void emit_rsq(struct brw_wm_compile *c,
1123                      const struct prog_instruction *inst)
1124 {
1125     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1126 }
1127
1128 static void emit_sin(struct brw_wm_compile *c,
1129                      const struct prog_instruction *inst)
1130 {
1131     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1132 }
1133
1134 static void emit_cos(struct brw_wm_compile *c,
1135                      const struct prog_instruction *inst)
1136 {
1137     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1138 }
1139
1140 static void emit_ex2(struct brw_wm_compile *c,
1141                      const struct prog_instruction *inst)
1142 {
1143     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1144 }
1145
1146 static void emit_lg2(struct brw_wm_compile *c,
1147                      const struct prog_instruction *inst)
1148 {
1149     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1150 }
1151
1152 static void emit_arl(struct brw_wm_compile *c,
1153                      const struct prog_instruction *inst)
1154 {
1155     struct brw_compile *p = &c->func;
1156     struct brw_reg src0, addr_reg;
1157     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1158     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1159                            BRW_ARF_ADDRESS, 0);
1160     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1161     brw_MOV(p, addr_reg, src0);
1162     brw_set_saturate(p, 0);
1163 }
1164
1165
1166 static void emit_min_max(struct brw_wm_compile *c,
1167                          const struct prog_instruction *inst)
1168 {
1169     struct brw_compile *p = &c->func;
1170     const GLuint mask = inst->DstReg.WriteMask;
1171     const int mark = mark_tmps(c);
1172     int i;
1173     brw_push_insn_state(p);
1174     for (i = 0; i < 4; i++) {
1175         if (mask & (1<<i)) {
1176             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1177             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1178             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1179             struct brw_reg dst;
1180             /* if dst==src0 or dst==src1 we need to use a temp reg */
1181             GLboolean use_temp = brw_same_reg(dst, src0) ||
1182                                  brw_same_reg(dst, src1);
1183             if (use_temp)
1184                dst = alloc_tmp(c);
1185             else
1186                dst = real_dst;
1187
1188             /*
1189             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1190                    dst.nr, src0.nr, src1.nr);
1191             */
1192             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1193             brw_MOV(p, dst, src0);
1194             brw_set_saturate(p, 0);
1195
1196             if (inst->Opcode == OPCODE_MIN)
1197                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1198             else
1199                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1200
1201             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1202             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1203             brw_MOV(p, dst, src1);
1204             brw_set_saturate(p, 0);
1205             brw_set_predicate_control_flag_value(p, 0xff);
1206             if (use_temp)
1207                brw_MOV(p, real_dst, dst);
1208         }
1209     }
1210     brw_pop_insn_state(p);
1211     release_tmps(c, mark);
1212 }
1213
1214 static void emit_pow(struct brw_wm_compile *c,
1215                      const struct prog_instruction *inst)
1216 {
1217     struct brw_compile *p = &c->func;
1218     struct brw_reg dst, src0, src1;
1219     GLuint mask = inst->DstReg.WriteMask;
1220     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1221
1222     if (!(mask & WRITEMASK_XYZW))
1223         return;
1224
1225     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1226
1227     dst = get_dst_reg(c, inst, dst_chan);
1228     src0 = get_src_reg_imm(c, inst, 0, 0);
1229     src1 = get_src_reg_imm(c, inst, 1, 0);
1230
1231     brw_MOV(p, brw_message_reg(2), src0);
1232     brw_MOV(p, brw_message_reg(3), src1);
1233
1234     brw_math(p,
1235             dst,
1236             BRW_MATH_FUNCTION_POW,
1237             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1238             2,
1239             brw_null_reg(),
1240             BRW_MATH_DATA_VECTOR,
1241             BRW_MATH_PRECISION_FULL);
1242 }
1243
1244 static void emit_lrp(struct brw_wm_compile *c,
1245                      const struct prog_instruction *inst)
1246 {
1247     struct brw_compile *p = &c->func;
1248     GLuint mask = inst->DstReg.WriteMask;
1249     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1250     int i;
1251     int mark = mark_tmps(c);
1252     for (i = 0; i < 4; i++) {
1253         if (mask & (1<<i)) {
1254             dst = get_dst_reg(c, inst, i);
1255             src0 = get_src_reg(c, inst, 0, i);
1256
1257             src1 = get_src_reg_imm(c, inst, 1, i);
1258
1259             if (src1.nr == dst.nr) {
1260                 tmp1 = alloc_tmp(c);
1261                 brw_MOV(p, tmp1, src1);
1262             } else
1263                 tmp1 = src1;
1264
1265             src2 = get_src_reg(c, inst, 2, i);
1266             if (src2.nr == dst.nr) {
1267                 tmp2 = alloc_tmp(c);
1268                 brw_MOV(p, tmp2, src2);
1269             } else
1270                 tmp2 = src2;
1271
1272             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1273             brw_MUL(p, brw_null_reg(), dst, tmp2);
1274             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1275             brw_MAC(p, dst, src0, tmp1);
1276             brw_set_saturate(p, 0);
1277         }
1278         release_tmps(c, mark);
1279     }
1280 }
1281
1282 /**
1283  * For GLSL shaders, this KIL will be unconditional.
1284  * It may be contained inside an IF/ENDIF structure of course.
1285  */
1286 static void emit_kil(struct brw_wm_compile *c)
1287 {
1288     struct brw_compile *p = &c->func;
1289     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1290     brw_push_insn_state(p);
1291     brw_set_mask_control(p, BRW_MASK_DISABLE);
1292     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1293     brw_AND(p, depth, c->emit_mask_reg, depth);
1294     brw_pop_insn_state(p);
1295 }
1296
1297 static void emit_mad(struct brw_wm_compile *c,
1298                      const struct prog_instruction *inst)
1299 {
1300     struct brw_compile *p = &c->func;
1301     GLuint mask = inst->DstReg.WriteMask;
1302     struct brw_reg dst, src0, src1, src2;
1303     int i;
1304
1305     for (i = 0; i < 4; i++) {
1306         if (mask & (1<<i)) {
1307             dst = get_dst_reg(c, inst, i);
1308             src0 = get_src_reg(c, inst, 0, i);
1309             src1 = get_src_reg_imm(c, inst, 1, i);
1310             src2 = get_src_reg_imm(c, inst, 2, i);
1311             brw_MUL(p, dst, src0, src1);
1312
1313             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1314             brw_ADD(p, dst, dst, src2);
1315             brw_set_saturate(p, 0);
1316         }
1317     }
1318 }
1319
1320 static void emit_sop(struct brw_wm_compile *c,
1321                      const struct prog_instruction *inst, GLuint cond)
1322 {
1323     struct brw_compile *p = &c->func;
1324     GLuint mask = inst->DstReg.WriteMask;
1325     struct brw_reg dst, src0, src1;
1326     int i;
1327
1328     for (i = 0; i < 4; i++) {
1329         if (mask & (1<<i)) {
1330             dst = get_dst_reg(c, inst, i);
1331             src0 = get_src_reg(c, inst, 0, i);
1332             src1 = get_src_reg_imm(c, inst, 1, i);
1333             brw_push_insn_state(p);
1334             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1335             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1336             brw_MOV(p, dst, brw_imm_f(0.0));
1337             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1338             brw_MOV(p, dst, brw_imm_f(1.0));
1339             brw_pop_insn_state(p);
1340         }
1341     }
1342 }
1343
1344 static void emit_slt(struct brw_wm_compile *c,
1345                      const struct prog_instruction *inst)
1346 {
1347     emit_sop(c, inst, BRW_CONDITIONAL_L);
1348 }
1349
1350 static void emit_sle(struct brw_wm_compile *c,
1351                      const struct prog_instruction *inst)
1352 {
1353     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1354 }
1355
1356 static void emit_sgt(struct brw_wm_compile *c,
1357                      const struct prog_instruction *inst)
1358 {
1359     emit_sop(c, inst, BRW_CONDITIONAL_G);
1360 }
1361
1362 static void emit_sge(struct brw_wm_compile *c,
1363                      const struct prog_instruction *inst)
1364 {
1365     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1366 }
1367
1368 static void emit_seq(struct brw_wm_compile *c,
1369                      const struct prog_instruction *inst)
1370 {
1371     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1372 }
1373
1374 static void emit_sne(struct brw_wm_compile *c,
1375                      const struct prog_instruction *inst)
1376 {
1377     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1378 }
1379
1380 static INLINE struct brw_reg high_words( struct brw_reg reg )
1381 {
1382     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1383                    0, 8, 2 );
1384 }
1385
1386 static INLINE struct brw_reg low_words( struct brw_reg reg )
1387 {
1388     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1389 }
1390
1391 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1392 {
1393     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1394 }
1395
1396 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1397 {
1398     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1399                    0, 16, 2 );
1400 }
1401
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Emit the body of the one-dimensional noise subroutine.
 *
 * The argument is read from the temporary at index (mark - 2), where mark
 * is the temporary mark on entry, and the result is written back to that
 * same register.  Five temporaries are allocated for the duration of the
 * body and released at the end.
 *
 * NOTE(review): the (mark - 2) slot is presumably the parameter temp set up
 * by emit_noise1(), with invoke_subroutine() taking the slot after it --
 * confirm against invoke_subroutine().
 */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
        x0, x1, /* gradients at each end */
        t, tmp[ 2 ], /* float temporaries */
        itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    /* itmp[ 0..1 ] alias tmp[ 0..1 ]; itmp[ 2..4 ] alias x0, x1 and t, so
       the hash constants held there are dead once the gradients are
       written. */
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* The argument lives in a temporary allocated before this routine's
       mark. */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds follow, one per constant: multiply, then XOR the high
       word into the low word. */
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    /* Each gradient is multiplied by the offset from its own end point:
       param for x0, param - 1 for x1. */
    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    /* result = x0 + ( x1 - x0 ) * weight */
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
1494
1495 static void emit_noise1( struct brw_wm_compile *c,
1496                          const struct prog_instruction *inst )
1497 {
1498     struct brw_compile *p = &c->func;
1499     struct brw_reg src, param, dst;
1500     GLuint mask = inst->DstReg.WriteMask;
1501     int i;
1502     int mark = mark_tmps( c );
1503
1504     assert( mark == 0 );
1505
1506     src = get_src_reg( c, inst, 0, 0 );
1507
1508     param = alloc_tmp( c );
1509
1510     brw_MOV( p, param, src );
1511
1512     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1513
1514     /* Fill in the result: */
1515     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1516     for (i = 0 ; i < 4; i++) {
1517         if (mask & (1<<i)) {
1518             dst = get_dst_reg(c, inst, i);
1519             brw_MOV( p, dst, param );
1520         }
1521     }
1522     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1523         brw_set_saturate( p, 0 );
1524
1525     release_tmps( c, mark );
1526 }
1527
/**
 * Emit the body of the two-dimensional noise subroutine.
 *
 * The two arguments are read from the temporaries at indices (mark - 3) and
 * (mark - 2), where mark is the temporary mark on entry; the result is
 * written back to the first of them (param0).  Nine temporaries are
 * allocated for the duration of the body and released at the end.
 *
 * NOTE(review): the (mark - 3) / (mark - 2) slots are presumably the
 * parameter temps set up by emit_noise2(), with invoke_subroutine() taking
 * the slot after them -- confirm against invoke_subroutine().
 */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
        x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
        t, tmp[ 4 ], /* float temporaries */
        itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    /* itmp[ 4..6 ] alias three of the gradient registers, so the hash
       constants held there are dead once the gradients are written. */
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* The arguments live in temporaries allocated before this routine's
       mark. */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    /* Fold the low word of the second coordinate into the high word of
       itmp[ 0 ], so both coordinates share one register. */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    /* Derive the other three corners by incrementing the high and/or low
       word of the packed value. */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds follow, one per constant: multiply, then XOR the high
       word into the low word. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    /* The low word of each hash becomes one gradient component (kept in the
       corner registers)... */
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    /* ...and the high word becomes the other (kept in tmp[]). */
    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    /* First components times param0 (x0 corners) or param0 - 1 (x1
       corners); t is then reused for param1 - 1. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* Second components times param1 (y0 corners) or param1 - 1 (y1
       corners). */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    /* Sum the two products for each corner. */
    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    /* param0 and param1 now become the interpolation weights. */
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1664
1665 static void emit_noise2( struct brw_wm_compile *c,
1666                          const struct prog_instruction *inst )
1667 {
1668     struct brw_compile *p = &c->func;
1669     struct brw_reg src0, src1, param0, param1, dst;
1670     GLuint mask = inst->DstReg.WriteMask;
1671     int i;
1672     int mark = mark_tmps( c );
1673
1674     assert( mark == 0 );
1675
1676     src0 = get_src_reg( c, inst, 0, 0 );
1677     src1 = get_src_reg( c, inst, 0, 1 );
1678
1679     param0 = alloc_tmp( c );
1680     param1 = alloc_tmp( c );
1681
1682     brw_MOV( p, param0, src0 );
1683     brw_MOV( p, param1, src1 );
1684
1685     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1686
1687     /* Fill in the result: */
1688     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1689     for (i = 0 ; i < 4; i++) {
1690         if (mask & (1<<i)) {
1691             dst = get_dst_reg(c, inst, i);
1692             brw_MOV( p, dst, param0 );
1693         }
1694     }
1695     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1696         brw_set_saturate( p, 0 );
1697
1698     release_tmps( c, mark );
1699 }
1700
1701 /**
1702  * The three-dimensional case is much like the one- and two- versions above,
1703  * but since the number of corners is rapidly growing we now pack 16 16-bit
1704  * hashes into each register to extract more parallelism from the EUs.
      *
      * This routine emits the body of the 3-D noise subroutine through c->func.
      * The three coordinates are fetched with lookup_tmp() from slots
      * mark - 4 .. mark - 2 and are overwritten in place with their fractional
      * parts; the scaled noise value is finally written to the first of them
      * (param0).  The mark taken on entry is handed to release_tmps() on exit.
1705  */
1706 static void noise3_sub( struct brw_wm_compile *c ) {
1707
1708     struct brw_compile *p = &c->func;
1709     struct brw_reg param0, param1, param2,
1710         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1711         xi, yi, zi, /* interpolation coefficients */
1712         t, tmp[ 8 ], /* float temporaries */
1713         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1714         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1715     int i;
1716     int mark = mark_tmps( c );
1717
1718     x0y0 = alloc_tmp( c );
1719     x0y1 = alloc_tmp( c );
1720     x1y0 = alloc_tmp( c );
1721     x1y1 = alloc_tmp( c );
1722     xi = alloc_tmp( c );
1723     yi = alloc_tmp( c );
1724     zi = alloc_tmp( c );
1725     t = alloc_tmp( c );
         /* itmp[ i ] and wtmp[ i ] do not get registers of their own: they are
            a UD-typed view and a brw_uw16_grf() view of the register that
            backs tmp[ i ]. */
1726     for( i = 0; i < 8; i++ ) {
1727         tmp[ i ] = alloc_tmp( c );
1728         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1729         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1730     }
1731
         /* The inputs are fetched by index relative to this routine's own
            mark.  NOTE(review): presumably these are the registers that
            emit_noise3() fills in before invoke_subroutine(); slot mark - 1
            is not read here -- confirm what it holds. */
1732     param0 = lookup_tmp( c, mark - 4 );
1733     param1 = lookup_tmp( c, mark - 3 );
1734     param2 = lookup_tmp( c, mark - 2 );
1735
1736     brw_set_access_mode( p, BRW_ALIGN_1 );
1737
1738     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1739        be hashed.  Also compute the remainders (offsets within the unit
1740        cube), interleaved to reduce register dependency penalties. */
1741     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1742     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1743     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1744     brw_FRC( p, param0, param0 );
1745     brw_FRC( p, param1, param1 );
1746     brw_FRC( p, param2, param2 );
         /* From here on param0..param2 hold the fractional offsets, not the
            original coordinates; every later use relies on that. */
1747     /* Since we now have only 16 bits of precision in the hash, we must
1748        be more careful about thorough mixing to maintain entropy as we
1749        squash the input vector into a small scalar. */
         /* 0xBC8F, 0xD0BD and 0x9B93 are the multipliers applied to the x, y
            and z floors respectively.  The MUL and the first MAC target the
            null register and only the last MAC has a real destination.
            NOTE(review): presumably the partial sums are carried in the
            accumulator -- confirm against the EU documentation. */
1750     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1751     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1752     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1753              brw_imm_uw( 0x9B93 ) );
         /* Adding the x multiplier once more yields the mixed value for the
            neighbouring corner at x + 1; it is stored in the high words. */
1754     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1755              brw_imm_uw( 0xBC8F ) );
1756
1757     /* Temporarily disable the execution mask while we work with ExecSize=16
1758        channels (the mask is set for ExecSize=8 and is probably incorrect).
1759        Although this might cause execution of unwanted channels, the code
1760        writes only to temporary registers and has no side effects, so
1761        disabling the mask is harmless. */
1762     brw_push_insn_state( p );
1763     brw_set_mask_control( p, BRW_MASK_DISABLE );
         /* Derive the remaining corners by adding the matching multipliers:
            wtmp[ 1 ] is y + 1, wtmp[ 2 ] is z + 1 and wtmp[ 3 ] is y + 1 and
            z + 1.  The adds act on the whole word view, so the x and x + 1
            values kept in the low and high words are both updated. */
1764     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1765     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1766     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1767
1768     /* We're now ready to perform the hashing.  The eight hashes are
1769        interleaved for performance.  The hash function used is
1770        designed to rapidly achieve avalanche and require only 16x16
1771        bit multiplication, and 8-bit swizzles (which we get for
1772        free). */
1773     for( i = 0; i < 4; i++ )
1774         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1775     for( i = 0; i < 4; i++ )
1776         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1777                  odd_bytes( wtmp[ i ] ) );
1778     for( i = 0; i < 4; i++ )
1779         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1780     for( i = 0; i < 4; i++ )
1781         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1782                  odd_bytes( wtmp[ i ] ) );
1783     brw_pop_insn_state( p );
1784
1785     /* Now we want to initialise the four rear gradients based on the
1786        hashes.  Format conversion from signed integer to float leaves
1787        everything scaled too high by a factor of pow( 2, 15 ), but
1788        we correct for that right at the end. */
1789     /* x component */
         /* t = x - 1, the x offset as seen from the corners at x + 1.  Low
            words feed the x corners and high words the x + 1 corners;
            tmp[ 0 ] is the y row and tmp[ 1 ] the y + 1 row. */
1790     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1791     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1792     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1793     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1794     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1795
         /* Shift the hash words left by five bits so that the conversions for
            the next component read a different part of each hash. */
1796     brw_push_insn_state( p );
1797     brw_set_mask_control( p, BRW_MASK_DISABLE );
1798     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1799     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1800     brw_pop_insn_state( p );
1801
1802     brw_MUL( p, x1y0, x1y0, t );
1803     brw_MUL( p, x1y1, x1y1, t );
         /* t = y - 1, consumed by the y component below. */
1804     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1805     brw_MUL( p, x0y0, x0y0, param0 );
1806     brw_MUL( p, x0y1, x0y1, param0 );
1807
1808     /* y component */
1809     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1810     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1811     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1812     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1813
1814     brw_push_insn_state( p );
1815     brw_set_mask_control( p, BRW_MASK_DISABLE );
1816     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1817     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1818     brw_pop_insn_state( p );
1819
1820     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1821     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* t = x - 1 again; it is not read until the x component of the
            front face further down. */
1822     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1823     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1824     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1825
1826     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1827     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1828     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1829     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1830
1831     /* z component */
         /* tmp[ 0 ] and tmp[ 1 ] never had the z multiplier added, so these
            four corners are weighted by the z offset itself (param2). */
1832     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1833     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1834     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1835     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1836
1837     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1838     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1839     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1840     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1841
1842     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1843     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1844     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1845     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1846
1847     /* We interpolate between the gradients using the polynomial
1848        6t^5 - 15t^4 + 10t^3 (Perlin). */
         /* Evaluated in Horner form: ( ( 6t - 15 ) * t + 10 ) * t * t * t. */
1849     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1850     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1851     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1852     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1853     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1854     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1855     brw_MUL( p, xi, xi, param0 );
1856     brw_MUL( p, yi, yi, param1 );
1857     brw_MUL( p, zi, zi, param2 );
1858     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1859     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1860     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
         /* The next two subtractions (y + 1 row minus y row) are the first
            step of the y interpolation below, issued early between the
            dependent polynomial steps. */
1861     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1862     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1863     brw_MUL( p, xi, xi, param0 );
1864     brw_MUL( p, yi, yi, param1 );
1865     brw_MUL( p, zi, zi, param2 );
1866     brw_MUL( p, xi, xi, param0 );
1867     brw_MUL( p, yi, yi, param1 );
1868     brw_MUL( p, zi, zi, param2 );
1869     brw_MUL( p, xi, xi, param0 );
1870     brw_MUL( p, yi, yi, param1 );
1871     brw_MUL( p, zi, zi, param2 );
1872
1873     /* Here we interpolate in the y dimension... */
1874     brw_MUL( p, x0y1, x0y1, yi );
1875     brw_MUL( p, x1y1, x1y1, yi );
1876     brw_ADD( p, x0y0, x0y0, x0y1 );
1877     brw_ADD( p, x1y0, x1y0, x1y1 );
1878
1879     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
         /* tmp[ 0 ] can be reused: its hash words were last read by the z
            component above. */
1880     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1881     brw_MUL( p, x1y0, x1y0, xi );
1882     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1883
1884     /* Now do the same thing for the front four gradients... */
1885     /* x component */
         /* tmp[ 2 ] and tmp[ 3 ] hold the hashes that had the z multiplier
            added; t still holds x - 1 from the rear y component. */
1886     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1887     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1888     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1889     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1890
1891     brw_push_insn_state( p );
1892     brw_set_mask_control( p, BRW_MASK_DISABLE );
1893     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1894     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1895     brw_pop_insn_state( p );
1896
1897     brw_MUL( p, x1y0, x1y0, t );
1898     brw_MUL( p, x1y1, x1y1, t );
         /* t = y - 1, consumed by the y component below. */
1899     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1900     brw_MUL( p, x0y0, x0y0, param0 );
1901     brw_MUL( p, x0y1, x0y1, param0 );
1902
1903     /* y component */
1904     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1905     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1906     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1907     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1908
1909     brw_push_insn_state( p );
1910     brw_set_mask_control( p, BRW_MASK_DISABLE );
1911     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1912     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1913     brw_pop_insn_state( p );
1914
1915     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1916     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* t = z - 1: unlike the rear face, all four z products below are
            weighted by z - 1 instead of by param2. */
1917     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1918     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1919     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1920
1921     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1922     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1923     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1924     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1925
1926     /* z component */
1927     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1928     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1929     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1930     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1931
1932     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1933     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1934     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1935     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1936
1937     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1938     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1939     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1940     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1941
1942     /* The interpolation coefficients are still around from last time, so
1943        again interpolate in the y dimension... */
1944     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1945     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1946     brw_MUL( p, x0y1, x0y1, yi );
1947     brw_MUL( p, x1y1, x1y1, yi );
1948     brw_ADD( p, x0y0, x0y0, x0y1 );
1949     brw_ADD( p, x1y0, x1y0, x1y1 );
1950
1951     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
1952        time put the front face in tmp[ 1 ] and we're nearly there... */
1953     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1954     brw_MUL( p, x1y0, x1y0, xi );
1955     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1956
1957     /* The final interpolation, in the z dimension: */
         /* tmp[ 0 ] = rear + ( front - rear ) * zi */
1958     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1959     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1960     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1961
1962     /* scale by pow( 2, -15 ), as described above */
         /* The scaled result overwrites param0 (slot mark - 4). */
1963     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1964
1965     release_tmps( c, mark );
1966 }
1967
1968 /** Emit code for a NOISE3 instruction (three-coordinate noise). */
1969 static void emit_noise3( struct brw_wm_compile *c,
1970                          const struct prog_instruction *inst )
1971 {
1972     struct brw_compile *p = &c->func;
1973     const GLuint writemask = inst->DstReg.WriteMask;
1974     const int saturate = ( inst->SaturateMode == SATURATE_ZERO_ONE );
1975     struct brw_reg coord[ 3 ], param[ 3 ];
1976     int mark = mark_tmps( c );
1977     int n;
1978
1979     /* NOTE(review): presumably the argument temporaries must be the first
1980        ones allocated so noise3_sub() can find them by position; verify. */
1981     assert( mark == 0 );
1982
1983     /* Look up all sources, then allocate all arguments, then copy. */
1984     for( n = 0; n < 3; n++ )
1985         coord[ n ] = get_src_reg( c, inst, 0, n );
1986     for( n = 0; n < 3; n++ )
1987         param[ n ] = alloc_tmp( c );
1988     for( n = 0; n < 3; n++ )
1989         brw_MOV( p, param[ n ], coord[ n ] );
1990
1991     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1992
1993     /* The result is read back from the first argument register and
1994        copied, with optional saturation, to each enabled channel. */
1995     brw_set_saturate( p, saturate );
1996     for( n = 0; n < 4; n++ ) {
1997         if( !( writemask & ( 1 << n ) ) )
1998             continue;
1999         brw_MOV( p, get_dst_reg( c, inst, n ), param[ 0 ] );
2000     }
2001     if( saturate )
2002         brw_set_saturate( p, 0 );
2003
2004     release_tmps( c, mark );
2005 }
2006
2007 /**
2008  * For the four-dimensional case, the little micro-optimisation benefits
2009  * we obtain by unrolling all the loops aren't worth the massive bloat it
2010  * now causes.  Instead, we loop twice around performing a similar operation
2011  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2012  * code to glue it all together.
      *
      * This routine emits the body of the 4-D noise subroutine through c->func.
      * The four coordinates are fetched with lookup_tmp() from slots
      * mark - 5 .. mark - 2 and are replaced by their fractional parts; the
      * scaled result is written to param[ 0 ].  The section between the
      * instruction index saved in `loop' and the backward jump near the end is
      * emitted once and, according to the comments in the body, executed
      * twice, with the flag register telling the two passes apart.
2013  */
2014 static void noise4_sub( struct brw_wm_compile *c )
2015 {
2016     struct brw_compile *p = &c->func;
2017     struct brw_reg param[ 4 ],
2018         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2019         w0, /* noise for the w=0 cube */
2020         floors[ 2 ], /* integer coordinates of base corner of hypercube */
2021         interp[ 4 ], /* interpolation coefficients */
2022         t, tmp[ 8 ], /* float temporaries */
2023         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2024         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2025     int i, j;
2026     int mark = mark_tmps( c );
         /* Instruction indices: start of the repeated section, and the
            conditional exit branch that is patched afterwards. */
2027     GLuint loop, origin;
2028
2029     x0y0 = alloc_tmp( c );
2030     x0y1 = alloc_tmp( c );
2031     x1y0 = alloc_tmp( c );
2032     x1y1 = alloc_tmp( c );
2033     t = alloc_tmp( c );
2034     w0 = alloc_tmp( c );
2035     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2036     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2037
         /* param[ i ] is looked up relative to this routine's own mark;
            interp[ i ] is a freshly allocated temporary.  NOTE(review):
            presumably the param registers are the ones emit_noise4() fills
            in before invoke_subroutine() -- confirm. */
2038     for( i = 0; i < 4; i++ ) {
2039         param[ i ] = lookup_tmp( c, mark - 5 + i );
2040         interp[ i ] = alloc_tmp( c );
2041     }
2042
         /* itmp[ i ] and wtmp[ i ] are a UD-typed view and a brw_uw16_grf()
            view of the register that backs tmp[ i ]. */
2043     for( i = 0; i < 8; i++ ) {
2044         tmp[ i ] = alloc_tmp( c );
2045         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2046         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2047     }
2048
2049     brw_set_access_mode( p, BRW_ALIGN_1 );
2050
2051     /* We only want 16 bits of precision from the integral part of each
2052        co-ordinate, but unfortunately the RNDD semantics would saturate
2053        at 16 bits if we performed the operation directly to a 16-bit
2054        destination.  Therefore, we round to 32-bit temporaries where
2055        appropriate, and then store only the lower 16 bits. */
         /* After these six instructions floors[ 0 ] packs floor( x ) in its
            low words and floor( y ) in its high words, and floors[ 1 ] packs
            floor( z ) and floor( w ) the same way. */
2056     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2057     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2058     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2059     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2060     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2061     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2062
2063     /* Modify the flag register here, because the side effect is useful
2064        later (see below).  We know for certain that all flags will be
2065        cleared, since the FRC instruction cannot possibly generate
2066        negative results.  Even for exceptional inputs (infinities, denormals,
2067        NaNs), the architecture guarantees that the L conditional is false. */
         /* NOTE(review): predicate control is explicitly reset to NONE right
            after the first FRC, presumably because emitting an instruction
            with a conditional modifier leaves predication switched on --
            confirm in the instruction emitter. */
2068     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2069     brw_FRC( p, param[ 0 ], param[ 0 ] );
2070     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2071     for( i = 1; i < 4; i++ )
2072         brw_FRC( p, param[ i ], param[ i ] );
2073
2074     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2075        of all. */
         /* Evaluated in Horner form: ( ( 6t - 15 ) * t + 10 ) * t * t * t. */
2076     for( i = 0; i < 4; i++ )
2077         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2078     for( i = 0; i < 4; i++ )
2079         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2080     for( i = 0; i < 4; i++ )
2081         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2082     for( i = 0; i < 4; i++ )
2083         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2084     for( j = 0; j < 3; j++ )
2085         for( i = 0; i < 4; i++ )
2086             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2087
2088     /* Mark the current address, as it will be a jump destination.  The
2089        following code will be executed twice: first, with the flag
2090        register clear indicating the w=0 case, and second with flags
2091        set for w=1. */
2092     loop = p->nr_insn;
2093
2094     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2095        be hashed.  Since we have only 16 bits of precision in the hash, we
2096        must be careful about thorough mixing to maintain entropy as we
2097        squash the input vector into a small scalar. */
         /* 0xBC8F, 0xD0BD, 0x9B93 and 0xA359 are the multipliers applied to
            the x, y, z and w floors respectively.  Only the last MAC has a
            real destination.  NOTE(review): presumably the partial sums are
            carried in the accumulator -- confirm against the EU
            documentation. */
2098     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2099              brw_imm_uw( 0xBC8F ) );
2100     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2101              brw_imm_uw( 0xD0BD ) );
2102     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2103              brw_imm_uw( 0x9B93 ) );
2104     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2105              brw_imm_uw( 0xA359 ) );
         /* Adding the x multiplier once more yields the mixed value for the
            neighbouring corner at x + 1; it is stored in the high words. */
2106     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2107              brw_imm_uw( 0xBC8F ) );
2108
2109     /* Temporarily disable the execution mask while we work with ExecSize=16
2110        channels (the mask is set for ExecSize=8 and is probably incorrect).
2111        Although this might cause execution of unwanted channels, the code
2112        writes only to temporary registers and has no side effects, so
2113        disabling the mask is harmless. */
2114     brw_push_insn_state( p );
2115     brw_set_mask_control( p, BRW_MASK_DISABLE );
         /* wtmp[ 1 ] is the y + 1 variant, wtmp[ 2 ] the z + 1 variant and
            wtmp[ 3 ] the y + 1, z + 1 variant, each obtained by adding the
            matching multiplier. */
2116     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2117     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2118     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2119
2120     /* We're now ready to perform the hashing.  The eight hashes are
2121        interleaved for performance.  The hash function used is
2122        designed to rapidly achieve avalanche and require only 16x16
2123        bit multiplication, and 8-bit swizzles (which we get for
2124        free). */
2125     for( i = 0; i < 4; i++ )
2126         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2127     for( i = 0; i < 4; i++ )
2128         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2129                  odd_bytes( wtmp[ i ] ) );
2130     for( i = 0; i < 4; i++ )
2131         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2132     for( i = 0; i < 4; i++ )
2133         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2134                  odd_bytes( wtmp[ i ] ) );
2135     brw_pop_insn_state( p );
2136
2137     /* Now we want to initialise the four rear gradients based on the
2138        hashes.  Format conversion from signed integer to float leaves
2139        everything scaled too high by a factor of pow( 2, 15 ), but
2140        we correct for that right at the end. */
2141     /* x component */
         /* t = x - 1, the x offset as seen from the corners at x + 1.  Low
            words feed the x corners and high words the x + 1 corners. */
2142     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2143     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2144     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2145     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2146     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2147
         /* Shift the hash words left by four bits so that the conversions
            for the next component read a different part of each hash. */
2148     brw_push_insn_state( p );
2149     brw_set_mask_control( p, BRW_MASK_DISABLE );
2150     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2151     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2152     brw_pop_insn_state( p );
2153
2154     brw_MUL( p, x1y0, x1y0, t );
2155     brw_MUL( p, x1y1, x1y1, t );
         /* t = y - 1, consumed by the y component below. */
2156     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2157     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2158     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2159
2160     /* y component */
2161     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2162     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2163     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2164     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2165
2166     brw_push_insn_state( p );
2167     brw_set_mask_control( p, BRW_MASK_DISABLE );
2168     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2169     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2170     brw_pop_insn_state( p );
2171
2172     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2173     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2174     /* prepare t for the w component (used below): w the first time through
2175        the loop; w - 1 the second time) */
         /* The ADD is emitted with normal predication and the MOV with the
            predicate inverted (set directly in p->current->header and
            cleared again straight afterwards), so the MOV takes effect
            while the flags are clear and the ADD once they are set. */
2176     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2177     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2178     p->current->header.predicate_inverse = 1;
2179     brw_MOV( p, t, param[ 3 ] );
2180     p->current->header.predicate_inverse = 0;
2181     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2182     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2183     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2184
2185     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2186     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2187     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2188     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2189
2190     /* z component */
         /* tmp[ 0 ] and tmp[ 1 ] never had the z multiplier added, so these
            four corners are weighted by the z offset itself (param[ 2 ]). */
2191     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2192     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2193     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2194     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2195
2196     brw_push_insn_state( p );
2197     brw_set_mask_control( p, BRW_MASK_DISABLE );
2198     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2199     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2200     brw_pop_insn_state( p );
2201
2202     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2203     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2204     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2205     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2206
2207     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2208     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2209     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2210     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2211
2212     /* w component */
         /* t holds the pass-dependent w weight prepared in the y component
            above. */
2213     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2214     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2215     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2216     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2217
2218     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2219     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2220     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2221     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* t = x - 1 again; it is next read by the x component of the front
            face below. */
2222     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2223
2224     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2225     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2226     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2227     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2228
2229     /* Here we interpolate in the y dimension... */
2230     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2231     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2232     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2233     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2234     brw_ADD( p, x0y0, x0y0, x0y1 );
2235     brw_ADD( p, x1y0, x1y0, x1y1 );
2236
2237     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
         /* tmp[ 0 ] can be reused: its hash words were last read by the w
            component above. */
2238     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2239     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2240     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2241
2242     /* Now do the same thing for the front four gradients... */
2243     /* x component */
         /* tmp[ 2 ] and tmp[ 3 ] hold the hashes that had the z multiplier
            added; t still holds x - 1 from the rear w component. */
2244     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2245     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2246     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2247     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2248
2249     brw_push_insn_state( p );
2250     brw_set_mask_control( p, BRW_MASK_DISABLE );
2251     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2252     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2253     brw_pop_insn_state( p );
2254
2255     brw_MUL( p, x1y0, x1y0, t );
2256     brw_MUL( p, x1y1, x1y1, t );
         /* t = y - 1, consumed by the y component below. */
2257     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2258     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2259     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2260
2261     /* y component */
2262     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2263     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2264     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2265     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2266
2267     brw_push_insn_state( p );
2268     brw_set_mask_control( p, BRW_MASK_DISABLE );
2269     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2270     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2271     brw_pop_insn_state( p );
2272
2273     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2274     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
         /* t = z - 1: unlike the rear face, all four z products below are
            weighted by z - 1 instead of by param[ 2 ]. */
2275     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2276     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2277     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2278
2279     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2280     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2281     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2282     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2283
2284     /* z component */
2285     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2286     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2287     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2288     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2289
2290     brw_push_insn_state( p );
2291     brw_set_mask_control( p, BRW_MASK_DISABLE );
2292     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2293     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2294     brw_pop_insn_state( p );
2295
2296     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2297     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2298     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2299     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2300     /* prepare t for the w component (used below): w the first time through
2301        the loop; w - 1 the second time) */
2302     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2303     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2304     p->current->header.predicate_inverse = 1;
2305     brw_MOV( p, t, param[ 3 ] );
2306     p->current->header.predicate_inverse = 0;
2307     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2308
2309     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2310     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2311     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2312     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2313
2314     /* w component */
2315     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2316     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2317     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2318     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2319
2320     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2321     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2322     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2323     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2324
2325     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2326     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2327     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2328     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2329
2330     /* Interpolate in the y dimension: */
2331     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2332     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2333     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2334     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2335     brw_ADD( p, x0y0, x0y0, x0y1 );
2336     brw_ADD( p, x1y0, x1y0, x1y1 );
2337
2338     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2339        time put the front face in tmp[ 1 ] and we're nearly there... */
2340     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2341     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2342     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2343
2344     /* Another interpolation, in the z dimension: */
         /* tmp[ 0 ] = rear + ( front - rear ) * interp[ 2 ]; this is the
            noise value for the cube handled by the current pass. */
2345     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2346     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2347     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2348
2349     /* Exit the loop if we've computed both cubes... */
         /* A predicated add to the IP register is emitted with a zero
            displacement as a placeholder; origin remembers its index so
            that the brw_set_src1() call further down can fill in the real
            displacement once the exit point is known. */
2350     origin = p->nr_insn;
2351     brw_push_insn_state( p );
2352     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2353     brw_set_mask_control( p, BRW_MASK_DISABLE );
2354     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2355     brw_pop_insn_state( p );
2356
2357     /* Save the result for the w=0 case, and increment the w coordinate: */
         /* The high words of floors[ 1 ] are where floor( w ) was stored
            before the loop. */
2358     brw_MOV( p, w0, tmp[ 0 ] );
2359     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2360              brw_imm_uw( 1 ) );
2361
2362     /* Loop around for the other cube.  Explicitly set the flag register
2363        (unfortunately we must spend an extra instruction to do this: we
2364        can't rely on a side effect of the previous MOV or ADD because
2365        conditional modifiers which are normally true might be false in
2366        exceptional circumstances, e.g. given a NaN input; the add to
2367        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
         /* NOTE(review): loop is a GLuint, so loop - p->nr_insn is computed
            in unsigned arithmetic and wraps; it only becomes the intended
            negative displacement if brw_imm_d() takes a signed argument --
            confirm.  The << 4 presumably scales an instruction count to a
            byte displacement (16 bytes per instruction) -- confirm. */
2368     brw_push_insn_state( p );
2369     brw_set_mask_control( p, BRW_MASK_DISABLE );
2370     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2371     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2372              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2373     brw_pop_insn_state( p );
2374
2375     /* Patch the previous conditional branch now that we know the
2376        destination address. */
         /* The displacement is counted from the branch instruction itself
            (index origin) to the next instruction to be emitted. */
2377     brw_set_src1( p->store + origin,
2378                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2379
2380     /* The very last interpolation. */
         /* tmp[ 0 ] = w0 + ( tmp[ 0 ] - w0 ) * interp[ 3 ], where tmp[ 0 ]
            holds the result of the second pass on entry. */
2381     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2382     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2383     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2384
2385     /* scale by pow( 2, -15 ), as described above */
         /* The scaled result overwrites param[ 0 ] (slot mark - 5). */
2386     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2387
2388     release_tmps( c, mark );
2389 }
2390
2391 /** Emit code for a NOISE4 instruction: stage the four source channels,
2392  *  invoke the noise4 subroutine and copy its result to the destination. */
2393 static void emit_noise4( struct brw_wm_compile *c,
2394                          const struct prog_instruction *inst )
2395 {
2396     struct brw_compile *p = &c->func;
2397     const GLuint writemask = inst->DstReg.WriteMask;
2398     const int saturate = ( inst->SaturateMode == SATURATE_ZERO_ONE );
2399     struct brw_reg coord[ 4 ], param[ 4 ];
2400     int mark = mark_tmps( c );
2401     int n;
2402
2403     /* NOTE(review): presumably the argument temporaries must be the
2404        first ones allocated so that noise4_sub() can find them by
2405        position; verify against lookup_tmp(). */
2406     assert( mark == 0 );
2407
2408     /* Look up every source channel before allocating anything, then
2409        allocate all four argument registers, then copy the values in. */
2410     for( n = 0; n < 4; n++ )
2411         coord[ n ] = get_src_reg( c, inst, 0, n );
2412     for( n = 0; n < 4; n++ )
2413         param[ n ] = alloc_tmp( c );
2414     for( n = 0; n < 4; n++ )
2415         brw_MOV( p, param[ n ], coord[ n ] );
2416
2417     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2418
2419     /* The result is read back from the first argument register and
2420        copied, with optional saturation, to each enabled channel. */
2421     brw_set_saturate( p, saturate );
2422     for( n = 0; n < 4; n++ ) {
2423         if( !( writemask & ( 1 << n ) ) )
2424             continue;
2425         brw_MOV( p, get_dst_reg( c, inst, n ), param[ 0 ] );
2426     }
2427     if( saturate )
2428         brw_set_saturate( p, 0 );
2429
2430     release_tmps( c, mark );
2431 }
2432
2433 /** Emit the window-relative X/Y position for the enabled channels. */
2434 static void emit_wpos_xy(struct brw_wm_compile *c,
2435                          const struct prog_instruction *inst)
2436 {
2437     struct brw_compile *p = &c->func;
2438     const GLuint writemask = inst->DstReg.WriteMask;
2439     struct brw_reg out_x, out_y, in_x, in_y;
2440
2441     /* Both destinations and then both sources are resolved up front,
2442      * whether or not the corresponding channel is actually written.
2443      */
2444     out_x = get_dst_reg(c, inst, 0);
2445     out_y = get_dst_reg(c, inst, 1);
2446     in_x = get_src_reg(c, inst, 0, 0);
2447     in_y = get_src_reg(c, inst, 0, 1);
2448
2449     if (writemask & WRITEMASK_X) {
2450         struct brw_reg x_words = retype(in_x, BRW_REGISTER_TYPE_W);
2451
2452         /* X' = X - origin_x */
2453         brw_ADD(p, out_x, x_words, brw_imm_d(0 - c->key.origin_x));
2454     }
2455
2456     if (writemask & WRITEMASK_Y) {
2457         struct brw_reg y_words = retype(in_y, BRW_REGISTER_TYPE_W);
2458
2459         /* Y' = (origin_y + drawable_height - 1) - Y: the source is
2460          * negated and the constant part is added as an immediate. */
2461         brw_ADD(p, out_y, negate(y_words),
2462                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2463     }
2464 }
2465
2466 /* TODO
2467    BIAS on SIMD8 not working yet...
2468  */
2469 static void emit_txb(struct brw_wm_compile *c,
2470                      const struct prog_instruction *inst)
2471 {
2472     struct brw_compile *p = &c->func;
2473     struct brw_reg dst[4], src[4], payload_reg;
2474     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2475     const GLuint unit = inst->TexSrcUnit;
2476     GLuint i;
2477     GLuint msg_type;
2478
2479     assert(unit < BRW_MAX_TEX_UNIT);
2480
2481     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2482
2483     for (i = 0; i < 4; i++)
2484         dst[i] = get_dst_reg(c, inst, i);
2485     for (i = 0; i < 4; i++)
2486         src[i] = get_src_reg(c, inst, 0, i);
2487
2488     switch (inst->TexSrcTarget) {
2489         case TEXTURE_1D_INDEX:
2490             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2491             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2492             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2493             break;
2494         case TEXTURE_2D_INDEX:
2495         case TEXTURE_RECT_INDEX:
2496             brw_MOV(p, brw_message_reg(2), src[0]);
2497             brw_MOV(p, brw_message_reg(3), src[1]);
2498             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2499             break;
2500         case TEXTURE_3D_INDEX:
2501         case TEXTURE_CUBE_INDEX:
2502             brw_MOV(p, brw_message_reg(2), src[0]);
2503             brw_MOV(p, brw_message_reg(3), src[1]);
2504             brw_MOV(p, brw_message_reg(4), src[2]);
2505             break;
2506         default:
2507             /* invalid target */
2508             abort();
2509     }
2510     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2511     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2512
2513     if (BRW_IS_IGDNG(p->brw)) {
2514         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2515     } else {
2516         /* Does it work well on SIMD8? */
2517         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2518     }
2519
2520     brw_SAMPLE(p,
2521                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2522                1,                                           /* msg_reg_nr */
2523                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2524                SURF_INDEX_TEXTURE(unit),
2525                unit,                                        /* sampler */
2526                inst->DstReg.WriteMask,                      /* writemask */
2527                msg_type,                                    /* msg_type */
2528                4,                                           /* response_length */
2529                4,                                           /* msg_length */
2530                0,                                           /* eot */
2531                1,
2532                BRW_SAMPLER_SIMD_MODE_SIMD8);
2533 }
2534
2535
2536 static void emit_tex(struct brw_wm_compile *c,
2537                      const struct prog_instruction *inst)
2538 {
2539     struct brw_compile *p = &c->func;
2540     struct brw_reg dst[4], src[4], payload_reg;
2541     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2542     const GLuint unit = inst->TexSrcUnit;
2543     GLuint msg_len;
2544     GLuint i, nr;
2545     GLuint emit;
2546     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2547     GLuint msg_type;
2548
2549     assert(unit < BRW_MAX_TEX_UNIT);
2550
2551     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2552
2553     for (i = 0; i < 4; i++)
2554         dst[i] = get_dst_reg(c, inst, i);
2555     for (i = 0; i < 4; i++)
2556         src[i] = get_src_reg(c, inst, 0, i);
2557
2558     switch (inst->TexSrcTarget) {
2559         case TEXTURE_1D_INDEX:
2560             emit = WRITEMASK_X;
2561             nr = 1;
2562             break;
2563         case TEXTURE_2D_INDEX:
2564         case TEXTURE_RECT_INDEX:
2565             emit = WRITEMASK_XY;
2566             nr = 2;
2567             break;
2568         case TEXTURE_3D_INDEX:
2569         case TEXTURE_CUBE_INDEX:
2570             emit = WRITEMASK_XYZ;
2571             nr = 3;
2572             break;
2573         default:
2574            /* invalid target */
2575            abort();
2576     }
2577     msg_len = 1;
2578
2579     /* move/load S, T, R coords */
2580     for (i = 0; i < nr; i++) {
2581         static const GLuint swz[4] = {0,1,2,2};
2582         if (emit & (1<<i))
2583             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2584         else
2585             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2586         msg_len += 1;
2587     }
2588
2589     if (shadow) {
2590        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2591        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2592     }
2593
2594     if (BRW_IS_IGDNG(p->brw)) {
2595         if (shadow)
2596             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2597         else
2598             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2599     } else {
2600         /* Does it work for shadow on SIMD8 ? */
2601         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2602     }
2603
2604     brw_SAMPLE(p,
2605                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2606                1,                                          /* msg_reg_nr */
2607                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2608                SURF_INDEX_TEXTURE(unit),
2609                unit,                                       /* sampler */
2610                inst->DstReg.WriteMask,                     /* writemask */
2611                msg_type,                                   /* msg_type */
2612                4,                                          /* response_length */
2613                shadow ? 6 : 4,                             /* msg_length */
2614                0,                                          /* eot */
2615                1,
2616                BRW_SAMPLER_SIMD_MODE_SIMD8);
2617
2618     if (shadow)
2619         brw_MOV(p, dst[3], brw_imm_f(1.0));
2620 }
2621
2622
2623 /**
2624  * Resolve subroutine calls after code emit is done.
2625  */
2626 static void post_wm_emit( struct brw_wm_compile *c )
2627 {
2628     brw_resolve_cals(&c->func);
2629 }
2630
2631 static void
2632 get_argument_regs(struct brw_wm_compile *c,
2633                   const struct prog_instruction *inst,
2634                   int index,
2635                   struct brw_reg *regs,
2636                   int mask)
2637 {
2638     int i;
2639
2640     for (i = 0; i < 4; i++) {
2641         if (mask & (1 << i))
2642             regs[i] = get_src_reg(c, inst, index, i);
2643     }
2644 }
2645
2646 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2647 {
2648 #define MAX_IF_DEPTH 32
2649 #define MAX_LOOP_DEPTH 32
2650     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2651     GLuint i, if_depth = 0, loop_depth = 0;
2652     struct brw_compile *p = &c->func;
2653     struct brw_indirect stack_index = brw_indirect(0, 0);
2654
2655     c->out_of_regs = GL_FALSE;
2656
2657     prealloc_reg(c);
2658     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2659     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2660
2661     for (i = 0; i < c->nr_fp_insns; i++) {
2662         const struct prog_instruction *inst = &c->prog_instructions[i];
2663         int dst_flags;
2664         struct brw_reg args[3][4], dst[4];
2665         int j;
2666
2667         c->cur_inst = i;
2668
2669 #if 0
2670         _mesa_printf("Inst %d: ", i);
2671         _mesa_print_instruction(inst);
2672 #endif
2673
2674         /* fetch any constants that this instruction needs */
2675         if (c->fp->use_const_buffer)
2676            fetch_constants(c, inst);
2677
2678         if (inst->Opcode != OPCODE_ARL) {
2679            for (j = 0; j < 4; j++) {
2680               if (inst->DstReg.WriteMask & (1 << j))
2681                  dst[j] = get_dst_reg(c, inst, j);
2682               else
2683                  dst[j] = brw_null_reg();
2684            }
2685         }
2686         for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2687             get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2688
2689         dst_flags = inst->DstReg.WriteMask;
2690         if (inst->SaturateMode == SATURATE_ZERO_ONE)
2691             dst_flags |= SATURATE;
2692
2693         if (inst->CondUpdate)
2694             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2695         else
2696             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2697
2698         dst_flags = inst->DstReg.WriteMask;
2699         if (inst->SaturateMode == SATURATE_ZERO_ONE)
2700             dst_flags |= SATURATE;
2701
2702         switch (inst->Opcode) {
2703             case WM_PIXELXY:
2704                 emit_pixel_xy(c, inst);
2705                 break;
2706             case WM_DELTAXY:
2707                 emit_delta_xy(c, inst);
2708                 break;
2709             case WM_PIXELW:
2710                 emit_pixel_w(c, inst);
2711                 break;
2712             case WM_LINTERP:
2713                 emit_linterp(c, inst);
2714                 break;
2715             case WM_PINTERP:
2716                 emit_pinterp(c, inst);
2717                 break;
2718             case WM_CINTERP:
2719                 emit_cinterp(c, inst);
2720                 break;
2721             case WM_WPOSXY:
2722                 emit_wpos_xy(c, inst);
2723                 break;
2724             case WM_FB_WRITE:
2725                 emit_fb_write(c, inst);
2726                 break;
2727             case WM_FRONTFACING:
2728                 emit_frontfacing(c, inst);
2729                 break;
2730             case OPCODE_ADD:
2731                 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2732                 break;
2733             case OPCODE_ARL:
2734                 emit_arl(c, inst);
2735                 break;
2736             case OPCODE_FRC:
2737                 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2738                 break;
2739             case OPCODE_FLR:
2740                 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2741                 break;
2742             case OPCODE_LRP:
2743                 emit_lrp(c, inst);
2744                 break;
2745             case OPCODE_TRUNC:
2746                 emit_trunc(c, inst);
2747                 break;
2748             case OPCODE_MOV:
2749             case OPCODE_SWZ:
2750                 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2751                 break;
2752             case OPCODE_DP3:
2753                 emit_dp3(c, inst);
2754                 break;
2755             case OPCODE_DP4:
2756                 emit_dp4(c, inst);
2757                 break;
2758             case OPCODE_XPD:
2759                 emit_xpd(c, inst);
2760                 break;
2761             case OPCODE_DPH:
2762                 emit_dph(c, inst);
2763                 break;
2764             case OPCODE_RCP:
2765                 emit_rcp(c, inst);
2766                 break;
2767             case OPCODE_RSQ:
2768                 emit_rsq(c, inst);
2769                 break;
2770             case OPCODE_SIN:
2771                 emit_sin(c, inst);
2772                 break;
2773             case OPCODE_COS:
2774                 emit_cos(c, inst);
2775                 break;
2776             case OPCODE_EX2:
2777                 emit_ex2(c, inst);
2778                 break;
2779             case OPCODE_LG2:
2780                 emit_lg2(c, inst);
2781                 break;
2782             case OPCODE_MIN:
2783             case OPCODE_MAX:
2784                 emit_min_max(c, inst);
2785                 break;
2786             case OPCODE_DDX:
2787             case OPCODE_DDY:
2788                 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2789                           args[0]);
2790                 break;
2791             case OPCODE_SLT:
2792                 emit_slt(c, inst);
2793                 break;
2794             case OPCODE_SLE:
2795                 emit_sle(c, inst);
2796                 break;
2797             case OPCODE_SGT:
2798                 emit_sgt(c, inst);
2799                 break;
2800             case OPCODE_SGE:
2801                 emit_sge(c, inst);
2802                 break;
2803             case OPCODE_SEQ:
2804                 emit_seq(c, inst);
2805                 break;
2806             case OPCODE_SNE:
2807                 emit_sne(c, inst);
2808                 break;
2809             case OPCODE_MUL:
2810                 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2811                 break;
2812             case OPCODE_POW:
2813                 emit_pow(c, inst);
2814                 break;
2815             case OPCODE_MAD:
2816                 emit_mad(c, inst);
2817                 break;
2818             case OPCODE_NOISE1:
2819                 emit_noise1(c, inst);
2820                 break;
2821             case OPCODE_NOISE2:
2822                 emit_noise2(c, inst);
2823                 break;
2824             case OPCODE_NOISE3:
2825                 emit_noise3(c, inst);
2826                 break;
2827             case OPCODE_NOISE4:
2828                 emit_noise4(c, inst);
2829                 break;
2830             case OPCODE_TEX:
2831                 emit_tex(c, inst);
2832                 break;
2833             case OPCODE_TXB:
2834                 emit_txb(c, inst);
2835                 break;
2836             case OPCODE_KIL_NV:
2837                 emit_kil(c);
2838                 break;
2839             case OPCODE_IF:
2840                 assert(if_depth < MAX_IF_DEPTH);
2841                 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2842                 break;
2843             case OPCODE_ELSE:
2844                 if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
2845                 break;
2846             case OPCODE_ENDIF:
2847                 assert(if_depth > 0);
2848                 brw_ENDIF(p, if_inst[--if_depth]);
2849                 break;
2850             case OPCODE_BGNSUB:
2851                 brw_save_label(p, inst->Comment, p->nr_insn);
2852                 break;
2853             case OPCODE_ENDSUB:
2854                 /* no-op */
2855                 break;
2856             case OPCODE_CAL:
2857                 brw_push_insn_state(p);
2858                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2859                 brw_set_access_mode(p, BRW_ALIGN_1);
2860                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2861                 brw_set_access_mode(p, BRW_ALIGN_16);
2862                 brw_ADD(p, get_addr_reg(stack_index),
2863                          get_addr_reg(stack_index), brw_imm_d(4));
2864                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2865                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2866                 brw_pop_insn_state(p);
2867                 break;
2868
2869             case OPCODE_RET:
2870                 brw_push_insn_state(p);
2871                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2872                 brw_ADD(p, get_addr_reg(stack_index),
2873                         get_addr_reg(stack_index), brw_imm_d(-4));
2874                 brw_set_access_mode(p, BRW_ALIGN_1);
2875                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2876                 brw_set_access_mode(p, BRW_ALIGN_16);
2877                 brw_pop_insn_state(p);
2878
2879                 break;
2880             case OPCODE_BGNLOOP:
2881                 /* XXX may need to invalidate the current_constant regs */
2882                 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2883                 break;
2884             case OPCODE_BRK:
2885                 brw_BREAK(p);
2886                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2887                 break;
2888             case OPCODE_CONT:
2889                 brw_CONT(p);
2890                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2891                 break;
2892             case OPCODE_ENDLOOP:
2893                {
2894                   struct brw_instruction *inst0, *inst1;
2895                   GLuint br = 1;
2896
2897                   if (BRW_IS_IGDNG(brw))
2898                      br = 2;
2899
2900                   loop_depth--;
2901                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2902                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
2903                   while (inst0 > loop_inst[loop_depth]) {
2904                      inst0--;
2905                      if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2906                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2907                         inst0->bits3.if_else.pop_count = 0;
2908                      }
2909                      else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2910                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2911                         inst0->bits3.if_else.pop_count = 0;
2912                      }
2913                   }
2914                }
2915                break;
2916             default:
2917                 _mesa_printf("unsupported IR in fragment shader %d\n",
2918                         inst->Opcode);
2919         }
2920
2921         if (inst->CondUpdate)
2922             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2923         else
2924             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2925     }
2926     post_wm_emit(c);
2927
2928     if (INTEL_DEBUG & DEBUG_WM) {
2929       _mesa_printf("wm-native:\n");
2930       for (i = 0; i < p->nr_insn; i++)
2931          brw_disasm(stderr, &p->store[i]);
2932       _mesa_printf("\n");
2933     }
2934 }
2935
2936 /**
2937  * Do GPU code generation for shaders that use GLSL features such as
2938  * flow control.  Other shaders will be compiled with the
2939  */
2940 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2941 {
2942     if (INTEL_DEBUG & DEBUG_WM) {
2943         _mesa_printf("brw_wm_glsl_emit:\n");
2944     }
2945
2946     /* initial instruction translation/simplification */
2947     brw_wm_pass_fp(c);
2948
2949     /* actual code generation */
2950     brw_wm_emit_glsl(brw, c);
2951
2952     if (INTEL_DEBUG & DEBUG_WM) {
2953         brw_wm_print_program(c, "brw_wm_glsl_emit done");
2954     }
2955
2956     c->prog_data.total_grf = num_grf_used(c);
2957     c->prog_data.total_scratch = 0;
2958 }