src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13
  14 /**
  15  * Determine if the given fragment program uses GLSL features such
  16  * as flow conditionals, loops, subroutines.
  17  * Some GLSL shaders may use these features, others might not.
  18  */
  19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  20 {
  21     int i;
  22     for (i = 0; i < fp->Base.NumInstructions; i++) {
  23         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  24         switch (inst->Opcode) {
  25             case OPCODE_IF:
  26             case OPCODE_TRUNC:
  27             case OPCODE_ENDIF:
  28             case OPCODE_CAL:
  29             case OPCODE_BRK:
  30             case OPCODE_RET:
  31             case OPCODE_DDX:
  32             case OPCODE_DDY:
  33             case OPCODE_NOISE1:
  34             case OPCODE_NOISE2:
  35             case OPCODE_NOISE3:
  36             case OPCODE_NOISE4:
  37             case OPCODE_BGNLOOP:
  38                 return GL_TRUE;
  39             default:
  40                 break;
  41         }
  42     }
  43     return GL_FALSE;
  44 }
  45
  46
  47
  48 static void
  49 reclaim_temps(struct brw_wm_compile *c);
  50
  51
  52 /** Mark GRF register as used. */
  53 static void
  54 prealloc_grf(struct brw_wm_compile *c, int r)
  55 {
  56    c->used_grf[r] = GL_TRUE;
  57 }
  58
  59
  60 /** Mark given GRF register as not in use. */
  61 static void
  62 release_grf(struct brw_wm_compile *c, int r)
  63 {
  64    /*assert(c->used_grf[r]);*/
  65    c->used_grf[r] = GL_FALSE;
  66    c->first_free_grf = MIN2(c->first_free_grf, r);
  67 }
  68
  69
  70 /** Return index of a free GRF, mark it as used. */
  71 static int
  72 alloc_grf(struct brw_wm_compile *c)
  73 {
  74    GLuint r;
  75    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  76       if (!c->used_grf[r]) {
  77          c->used_grf[r] = GL_TRUE;
  78          c->first_free_grf = r + 1;  /* a guess */
  79          return r;
  80       }
  81    }
  82
  83    /* no free temps, try to reclaim some */
  84    reclaim_temps(c);
  85    c->first_free_grf = 0;
  86
  87    /* try alloc again */
  88    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  89       if (!c->used_grf[r]) {
  90          c->used_grf[r] = GL_TRUE;
  91          c->first_free_grf = r + 1;  /* a guess */
  92          return r;
  93       }
  94    }
  95
  96    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  97       assert(c->used_grf[r]);
  98    }
  99    /*printf("Really out of temp regs!\n");*/
 100    return 60;
 101 }
 102
 103
 104 /** Return number of GRF registers used */
 105 static int
 106 num_grf_used(const struct brw_wm_compile *c)
 107 {
 108    int r;
 109    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 110       if (c->used_grf[r])
 111          return r + 1;
 112    return 0;
 113 }
 114
 115
 116
 117 /**
 118  * Record the mapping of a Mesa register to a hardware register.
 119  */
 120 static void set_reg(struct brw_wm_compile *c, int file, int index,
 121         int component, struct brw_reg reg)
 122 {
 123     c->wm_regs[file][index][component].reg = reg;
 124     c->wm_regs[file][index][component].inited = GL_TRUE;
 125 }
 126
 127 /**
 128  * Examine instruction's write mask to find index of first component
 129  * enabled for writing.
 130  */
 131 static int get_scalar_dst_index(const struct prog_instruction *inst)
 132 {
 133     int i;
 134     for (i = 0; i < 4; i++)
 135         if (inst->DstReg.WriteMask & (1<<i))
 136             break;
 137     return i;
 138 }
 139
 140 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 141 {
 142     struct brw_reg reg;
 143
 144     /* if we need to allocate another temp, grow the tmp_regs[] array */
 145     if (c->tmp_index == c->tmp_max) {
 146        c->tmp_regs[ c->tmp_max++ ] = alloc_grf(c);
 147     }
 148
 149     /* form the GRF register */
 150     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 151     /*printf("alloc_temp %d\n", reg.nr);*/
 152     assert(reg.nr < BRW_WM_MAX_GRF);
 153     return reg;
 154
 155 }
 156
 157 /**
 158  * Save current temp register info.
 159  * There must be a matching call to release_tmps().
 160  */
 161 static int mark_tmps(struct brw_wm_compile *c)
 162 {
 163     return c->tmp_index;
 164 }
 165
 166 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 167 {
 168     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 169 }
 170
 171 static void release_tmps(struct brw_wm_compile *c, int mark)
 172 {
 173     c->tmp_index = mark;
 174 }
 175
 176 /**
 177  * Convert Mesa src register to brw register.
 178  *
 179  * Since we're running in SOA mode each Mesa register corresponds to four
 180  * hardware registers.  We allocate the hardware registers as needed here.
 181  *
 182  * \param file  register file, one of PROGRAM_x
 183  * \param index  register number
 184  * \param component  src component (X=0, Y=1, Z=2, W=3)
 185  * \param nr  not used?!?
 186  * \param neg  negate value?
 187  * \param abs  take absolute value?
 188  */
 189 static struct brw_reg
 190 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 191         int nr, GLuint neg, GLuint abs)
 192 {
 193     struct brw_reg reg;
 194     switch (file) {
 195         case PROGRAM_STATE_VAR:
 196         case PROGRAM_CONSTANT:
 197         case PROGRAM_UNIFORM:
 198             file = PROGRAM_STATE_VAR;
 199             break;
 200         case PROGRAM_UNDEFINED:
 201             return brw_null_reg();
 202         case PROGRAM_TEMPORARY:
 203         case PROGRAM_INPUT:
 204         case PROGRAM_OUTPUT:
 205         case PROGRAM_PAYLOAD:
 206             break;
 207         default:
 208             _mesa_problem(NULL, "Unexpected file in get_reg()");
 209             return brw_null_reg();
 210     }
 211
 212     assert(index < 256);
 213     /* see if we've already allocated a HW register for this Mesa register */
 214     if (c->wm_regs[file][index][component].inited) {
 215        /* yes, re-use */
 216        reg = c->wm_regs[file][index][component].reg;
 217     }
 218     else {
 219         /* no, allocate new register */
 220        int grf = alloc_grf(c);
 221        if (grf < 0) {
 222           /* totally out of temps */
 223           grf = 70; /* XXX !!!! */
 224        }
 225
 226        reg = brw_vec8_grf(grf, 0);
 227        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 228
 229        set_reg(c, file, index, component, reg);
 230     }
 231
 232     if (neg & (1 << component)) {
 233         reg = negate(reg);
 234     }
 235     if (abs)
 236         reg = brw_abs(reg);
 237     return reg;
 238 }
 239
 240
 241
 242 /**
 243  * This is called if we run out of GRF registers.  Examine the live intervals
 244  * of temp regs in the program and free those which won't be used again.
 245  */
 246 static void
 247 reclaim_temps(struct brw_wm_compile *c)
 248 {
 249    GLint intBegin[MAX_PROGRAM_TEMPS];
 250    GLint intEnd[MAX_PROGRAM_TEMPS];
 251    int index;
 252
 253    /*printf("Reclaim temps:\n");*/
 254
 255    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 256                              intBegin, intEnd);
 257
 258    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 259       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 260          /* program temp[i] can be freed */
 261          int component;
 262          /*printf("  temp[%d] is dead\n", index);*/
 263          for (component = 0; component < 4; component++) {
 264             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 265                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 266                release_grf(c, r);
 267                /*
 268                printf("  Reclaim temp %d, reg %d at inst %d\n",
 269                       index, r, c->cur_inst);
 270                */
 271                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 272             }
 273          }
 274       }
 275    }
 276 }
 277
 278
 279
 280
 281 /**
 282  * Preallocate registers.  This sets up the Mesa to hardware register
 283  * mapping for certain registers, such as constants (uniforms/state vars)
 284  * and shader inputs.
 285  */
 286 static void prealloc_reg(struct brw_wm_compile *c)
 287 {
 288     int i, j;
 289     struct brw_reg reg;
 290     int nr_interp_regs = 0;
 291     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 292     GLuint reg_index = 0;
 293
 294     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 295     c->first_free_grf = 0;
 296
 297     for (i = 0; i < 4; i++) {
 298         if (i < c->key.nr_depth_regs)
 299             reg = brw_vec8_grf(i * 2, 0);
 300         else
 301             reg = brw_vec8_grf(0, 0);
 302         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 303     }
 304     reg_index += 2 * c->key.nr_depth_regs;
 305
 306     /* constants */
 307     {
 308         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 309         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 310
 311         /* use a real constant buffer, or just use a section of the GRF? */
 312         /* XXX this heuristic may need adjustment... */
 313         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 314            c->fp->use_const_buffer = GL_TRUE;
 315         else
 316            c->fp->use_const_buffer = GL_FALSE;
 317         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 318
 319         if (c->fp->use_const_buffer) {
 320            /* We'll use a real constant buffer and fetch constants from
 321             * it with a dataport read message.
 322             */
 323
 324            /* number of float constants in CURBE */
 325            c->prog_data.nr_params = 0;
 326         }
 327         else {
 328            const struct gl_program_parameter_list *plist =
 329               c->fp->program.Base.Parameters;
 330            int index = 0;
 331
 332            /* number of float constants in CURBE */
 333            c->prog_data.nr_params = 4 * nr_params;
 334
 335            /* loop over program constants (float[4]) */
 336            for (i = 0; i < nr_params; i++) {
 337               /* loop over XYZW channels */
 338               for (j = 0; j < 4; j++, index++) {
 339                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 340                  /* Save pointer to parameter/constant value.
 341                   * Constants will be copied in prepare_constant_buffer()
 342                   */
 343                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 344                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 345               }
 346            }
 347            /* number of constant regs used (each reg is float[8]) */
 348            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 349            reg_index += c->nr_creg;
 350         }
 351     }
 352
 353     /* fragment shader inputs */
 354     for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
 355         if (inputs & (1<<i)) {
 356             nr_interp_regs++;
 357             reg = brw_vec8_grf(reg_index, 0);
 358             for (j = 0; j < 4; j++)
 359                 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
 360             reg_index += 2;
 361         }
 362     }
 363
 364     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 365     c->prog_data.urb_read_length = nr_interp_regs * 2;
 366     c->prog_data.curb_read_length = c->nr_creg;
 367     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 368     reg_index++;
 369     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 370     reg_index += 2;
 371
 372     /* mark GRF regs [0..reg_index-1] as in-use */
 373     for (i = 0; i < reg_index; i++)
 374        prealloc_grf(c, i);
 375
 376     /* An instruction may reference up to three constants.
 377      * They'll be found in these registers.
 378      * XXX alloc these on demand!
 379      */
 380     if (c->fp->use_const_buffer) {
 381        for (i = 0; i < 3; i++) {
 382           c->current_const[i].index = -1;
 383           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 384        }
 385     }
 386 #if 0
 387     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 388     printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
 389 #endif
 390 }
 391
 392
 393 /**
 394  * Check if any of the instruction's src registers are constants, uniforms,
 395  * or statevars.  If so, fetch any constants that we don't already have in
 396  * the three GRF slots.
 397  */
 398 static void fetch_constants(struct brw_wm_compile *c,
 399                             const struct prog_instruction *inst)
 400 {
 401    struct brw_compile *p = &c->func;
 402    GLuint i;
 403
 404    /* loop over instruction src regs */
 405    for (i = 0; i < 3; i++) {
 406       const struct prog_src_register *src = &inst->SrcReg[i];
 407       if (src->File == PROGRAM_STATE_VAR ||
 408           src->File == PROGRAM_CONSTANT ||
 409           src->File == PROGRAM_UNIFORM) {
 410          if (c->current_const[i].index != src->Index) {
 411             c->current_const[i].index = src->Index;
 412
 413 #if 0
 414             printf("  fetch const[%d] for arg %d into reg %d\n",
 415                    src->Index, i, c->current_const[i].reg.nr);
 416 #endif
 417
 418             /* need to fetch the constant now */
 419             brw_dp_READ_4(p,
 420                           c->current_const[i].reg,  /* writeback dest */
 421                           1,                        /* msg_reg */
 422                           src->RelAddr,             /* relative indexing? */
 423                           16 * src->Index,          /* byte offset */
 424                           SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 425                           );
 426          }
 427       }
 428    }
 429 }
 430
 431
 432 /**
 433  * Convert Mesa dst register to brw register.
 434  */
 435 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 436                                   const struct prog_instruction *inst,
 437                                   GLuint component)
 438 {
 439     const int nr = 1;
 440     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 441             0, 0);
 442 }
 443
 444
 445 static struct brw_reg
 446 get_src_reg_const(struct brw_wm_compile *c,
 447                   const struct prog_instruction *inst,
 448                   GLuint srcRegIndex, GLuint component)
 449 {
 450    /* We should have already fetched the constant from the constant
 451     * buffer in fetch_constants().  Now we just have to return a
 452     * register description that extracts the needed component and
 453     * smears it across all eight vector components.
 454     */
 455    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 456    struct brw_reg const_reg;
 457
 458    assert(component < 4);
 459    assert(srcRegIndex < 3);
 460    assert(c->current_const[srcRegIndex].index != -1);
 461    const_reg = c->current_const[srcRegIndex].reg;
 462
 463    /* extract desired float from the const_reg, and smear */
 464    const_reg = stride(const_reg, 0, 1, 0);
 465    const_reg.subnr = component * 4;
 466
 467    if (src->Negate & (1 << component))
 468       const_reg = negate(const_reg);
 469    if (src->Abs)
 470       const_reg = brw_abs(const_reg);
 471
 472 #if 0
 473    printf("  form const[%d].%d for arg %d, reg %d\n",
 474           c->current_const[srcRegIndex].index,
 475           component,
 476           srcRegIndex,
 477           const_reg.nr);
 478 #endif
 479
 480    return const_reg;
 481 }
 482
 483
 484 /**
 485  * Convert Mesa src register to brw register.
 486  */
 487 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 488                                   const struct prog_instruction *inst,
 489                                   GLuint srcRegIndex, GLuint channel)
 490 {
 491     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 492     const GLuint nr = 1;
 493     const GLuint component = GET_SWZ(src->Swizzle, channel);
 494
 495     if (c->fp->use_const_buffer &&
 496         (src->File == PROGRAM_STATE_VAR ||
 497          src->File == PROGRAM_CONSTANT ||
 498          src->File == PROGRAM_UNIFORM)) {
 499        return get_src_reg_const(c, inst, srcRegIndex, component);
 500     }
 501     else {
 502        /* other type of source register */
 503        return get_reg(c, src->File, src->Index, component, nr,
 504                       src->Negate, src->Abs);
 505     }
 506 }
 507
 508
 509 /**
 510  * Same as \sa get_src_reg() but if the register is a literal, emit
 511  * a brw_reg encoding the literal.
 512  * Note that a brw instruction only allows one src operand to be a literal.
 513  * For instructions with more than one operand, only the second can be a
 514  * literal.  This means that we treat some literals as constants/uniforms
 515  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 516  *
 517  */
 518 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 519                                       const struct prog_instruction *inst,
 520                                       GLuint srcRegIndex, GLuint channel)
 521 {
 522     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 523     if (src->File == PROGRAM_CONSTANT) {
 524        /* a literal */
 525        const int component = GET_SWZ(src->Swizzle, channel);
 526        const GLfloat *param =
 527           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 528        GLfloat value = param[component];
 529        if (src->Negate & (1 << channel))
 530           value = -value;
 531        if (src->Abs)
 532           value = FABSF(value);
 533 #if 0
 534        printf("  form immed value %f for chan %d\n", value, channel);
 535 #endif
 536        return brw_imm_f(value);
 537     }
 538     else {
 539        return get_src_reg(c, inst, srcRegIndex, channel);
 540     }
 541 }
 542
 543
 544 /**
 545  * Subroutines are minimal support for resusable instruction sequences.
 546  * They are implemented as simply as possible to minimise overhead: there
 547  * is no explicit support for communication between the caller and callee
 548  * other than saving the return address in a temporary register, nor is
 549  * there any automatic local storage.  This implies that great care is
 550  * required before attempting reentrancy or any kind of nested
 551  * subroutine invocations.
 552  */
 553 static void invoke_subroutine( struct brw_wm_compile *c,
 554                                enum _subroutine subroutine,
 555                                void (*emit)( struct brw_wm_compile * ) )
 556 {
 557     struct brw_compile *p = &c->func;
 558
 559     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 560
 561     if( c->subroutines[ subroutine ] ) {
 562         /* subroutine previously emitted: reuse existing instructions */
 563
 564         int mark = mark_tmps( c );
 565         struct brw_reg return_address = retype( alloc_tmp( c ),
 566                                                 BRW_REGISTER_TYPE_UD );
 567         int here = p->nr_insn;
 568
 569         brw_push_insn_state(p);
 570         brw_set_mask_control(p, BRW_MASK_DISABLE);
 571         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 572
 573         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 574                  brw_imm_d( ( c->subroutines[ subroutine ] -
 575                               here - 1 ) << 4 ) );
 576         brw_pop_insn_state(p);
 577
 578         release_tmps( c, mark );
 579     } else {
 580         /* previously unused subroutine: emit, and mark for later reuse */
 581
 582         int mark = mark_tmps( c );
 583         struct brw_reg return_address = retype( alloc_tmp( c ),
 584                                                 BRW_REGISTER_TYPE_UD );
 585         struct brw_instruction *calc;
 586         int base = p->nr_insn;
 587
 588         brw_push_insn_state(p);
 589         brw_set_mask_control(p, BRW_MASK_DISABLE);
 590         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 591         brw_pop_insn_state(p);
 592
 593         c->subroutines[ subroutine ] = p->nr_insn;
 594
 595         emit( c );
 596
 597         brw_push_insn_state(p);
 598         brw_set_mask_control(p, BRW_MASK_DISABLE);
 599         brw_MOV( p, brw_ip_reg(), return_address );
 600         brw_pop_insn_state(p);
 601
 602         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 603
 604         release_tmps( c, mark );
 605     }
 606 }
 607
 608 static void emit_abs( struct brw_wm_compile *c,
 609                       const struct prog_instruction *inst)
 610 {
 611     int i;
 612     struct brw_compile *p = &c->func;
 613     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 614     for (i = 0; i < 4; i++) {
 615         if (inst->DstReg.WriteMask & (1<<i)) {
 616             struct brw_reg src, dst;
 617             dst = get_dst_reg(c, inst, i);
 618             src = get_src_reg(c, inst, 0, i);
 619             brw_MOV(p, dst, brw_abs(src));
 620         }
 621     }
 622     brw_set_saturate(p, 0);
 623 }
 624
 625 static void emit_trunc( struct brw_wm_compile *c,
 626                         const struct prog_instruction *inst)
 627 {
 628     int i;
 629     struct brw_compile *p = &c->func;
 630     GLuint mask = inst->DstReg.WriteMask;
 631     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 632     for (i = 0; i < 4; i++) {
 633         if (mask & (1<<i)) {
 634             struct brw_reg src, dst;
 635             dst = get_dst_reg(c, inst, i);
 636             src = get_src_reg(c, inst, 0, i);
 637             brw_RNDZ(p, dst, src);
 638         }
 639     }
 640     brw_set_saturate(p, 0);
 641 }
 642
 643 static void emit_mov( struct brw_wm_compile *c,
 644                       const struct prog_instruction *inst)
 645 {
 646     int i;
 647     struct brw_compile *p = &c->func;
 648     GLuint mask = inst->DstReg.WriteMask;
 649     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 650     for (i = 0; i < 4; i++) {
 651         if (mask & (1<<i)) {
 652             struct brw_reg src, dst;
 653             dst = get_dst_reg(c, inst, i);
 654             /* XXX some moves from immediate value don't work reliably!!! */
 655             /*src = get_src_reg_imm(c, inst, 0, i);*/
 656             src = get_src_reg(c, inst, 0, i);
 657             brw_MOV(p, dst, src);
 658         }
 659     }
 660     brw_set_saturate(p, 0);
 661 }
 662
 663 static void emit_pixel_xy(struct brw_wm_compile *c,
 664                           const struct prog_instruction *inst)
 665 {
 666     struct brw_reg r1 = brw_vec1_grf(1, 0);
 667     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 668
 669     struct brw_reg dst0, dst1;
 670     struct brw_compile *p = &c->func;
 671     GLuint mask = inst->DstReg.WriteMask;
 672
 673     dst0 = get_dst_reg(c, inst, 0);
 674     dst1 = get_dst_reg(c, inst, 1);
 675     /* Calculate pixel centers by adding 1 or 0 to each of the
 676      * micro-tile coordinates passed in r1.
 677      */
 678     if (mask & WRITEMASK_X) {
 679         brw_ADD(p,
 680                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 681                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 682                 brw_imm_v(0x10101010));
 683     }
 684
 685     if (mask & WRITEMASK_Y) {
 686         brw_ADD(p,
 687                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 688                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 689                 brw_imm_v(0x11001100));
 690     }
 691 }
 692
 693 static void emit_delta_xy(struct brw_wm_compile *c,
 694                           const struct prog_instruction *inst)
 695 {
 696     struct brw_reg r1 = brw_vec1_grf(1, 0);
 697     struct brw_reg dst0, dst1, src0, src1;
 698     struct brw_compile *p = &c->func;
 699     GLuint mask = inst->DstReg.WriteMask;
 700
 701     dst0 = get_dst_reg(c, inst, 0);
 702     dst1 = get_dst_reg(c, inst, 1);
 703     src0 = get_src_reg(c, inst, 0, 0);
 704     src1 = get_src_reg(c, inst, 0, 1);
 705     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 706      * centers.
 707      */
 708     if (mask & WRITEMASK_X) {
 709         brw_ADD(p,
 710                 dst0,
 711                 retype(src0, BRW_REGISTER_TYPE_UW),
 712                 negate(r1));
 713     }
 714
 715     if (mask & WRITEMASK_Y) {
 716         brw_ADD(p,
 717                 dst1,
 718                 retype(src1, BRW_REGISTER_TYPE_UW),
 719                 negate(suboffset(r1,1)));
 720
 721     }
 722 }
 723
 724 static void fire_fb_write( struct brw_wm_compile *c,
 725                            GLuint base_reg,
 726                            GLuint nr,
 727                            GLuint target,
 728                            GLuint eot)
 729 {
 730     struct brw_compile *p = &c->func;
 731     /* Pass through control information:
 732      */
 733     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 734     {
 735         brw_push_insn_state(p);
 736         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 737         brw_MOV(p,
 738                 brw_message_reg(base_reg + 1),
 739                 brw_vec8_grf(1, 0));
 740         brw_pop_insn_state(p);
 741     }
 742     /* Send framebuffer write message: */
 743     brw_fb_WRITE(p,
 744             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 745             base_reg,
 746             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 747             target,
 748             nr,
 749             0,
 750             eot);
 751 }
 752
 753 static void emit_fb_write(struct brw_wm_compile *c,
 754                           const struct prog_instruction *inst)
 755 {
 756     struct brw_compile *p = &c->func;
 757     int nr = 2;
 758     int channel;
 759     GLuint target, eot;
 760     struct brw_reg src0;
 761
 762     /* Reserve a space for AA - may not be needed:
 763      */
 764     if (c->key.aa_dest_stencil_reg)
 765         nr += 1;
 766
 767     brw_push_insn_state(p);
 768     for (channel = 0; channel < 4; channel++) {
 769         src0 = get_src_reg(c,  inst, 0, channel);
 770         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 771         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 772         brw_MOV(p, brw_message_reg(nr + channel), src0);
 773     }
 774     /* skip over the regs populated above: */
 775     nr += 8;
 776     brw_pop_insn_state(p);
 777
 778     if (c->key.source_depth_to_render_target) {
 779        if (c->key.computes_depth) {
 780           src0 = get_src_reg(c, inst, 2, 2);
 781           brw_MOV(p, brw_message_reg(nr), src0);
 782        }
 783        else {
 784           src0 = get_src_reg(c, inst, 1, 1);
 785           brw_MOV(p, brw_message_reg(nr), src0);
 786        }
 787
 788        nr += 2;
 789     }
 790
 791     if (c->key.dest_depth_reg) {
 792         GLuint comp = c->key.dest_depth_reg / 2;
 793         GLuint off = c->key.dest_depth_reg % 2;
 794
 795         assert(comp == 1);
 796         assert(off == 0);
 797 #if 0
 798         /* XXX do we need this code?   comp always 1, off always 0, it seems */
 799         if (off != 0) {
 800             brw_push_insn_state(p);
 801             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 802
 803             brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
 804             /* 2nd half? */
 805             brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
 806             brw_pop_insn_state(p);
 807         }
 808         else
 809 #endif
 810         {
 811            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 812            brw_MOV(p, brw_message_reg(nr), src);
 813         }
 814         nr += 2;
 815    }
 816
 817     target = inst->Aux >> 1;
 818     eot = inst->Aux & 1;
 819     fire_fb_write(c, 0, nr, target, eot);
 820 }
 821
 822 static void emit_pixel_w( struct brw_wm_compile *c,
 823                           const struct prog_instruction *inst)
 824 {
 825     struct brw_compile *p = &c->func;
 826     GLuint mask = inst->DstReg.WriteMask;
 827     if (mask & WRITEMASK_W) {
 828         struct brw_reg dst, src0, delta0, delta1;
 829         struct brw_reg interp3;
 830
 831         dst = get_dst_reg(c, inst, 3);
 832         src0 = get_src_reg(c, inst, 0, 0);
 833         delta0 = get_src_reg(c, inst, 1, 0);
 834         delta1 = get_src_reg(c, inst, 1, 1);
 835
 836         interp3 = brw_vec1_grf(src0.nr+1, 4);
 837         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 838          * result straight into a message reg.
 839          */
 840         brw_LINE(p, brw_null_reg(), interp3, delta0);
 841         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 842
 843         /* Calc w */
 844         brw_math_16( p, dst,
 845                 BRW_MATH_FUNCTION_INV,
 846                 BRW_MATH_SATURATE_NONE,
 847                 2, brw_null_reg(),
 848                 BRW_MATH_PRECISION_FULL);
 849     }
 850 }
 851
 852 static void emit_linterp(struct brw_wm_compile *c,
 853                          const struct prog_instruction *inst)
 854 {
 855     struct brw_compile *p = &c->func;
 856     GLuint mask = inst->DstReg.WriteMask;
 857     struct brw_reg interp[4];
 858     struct brw_reg dst, delta0, delta1;
 859     struct brw_reg src0;
 860     GLuint nr, i;
 861
 862     src0 = get_src_reg(c, inst, 0, 0);
 863     delta0 = get_src_reg(c, inst, 1, 0);
 864     delta1 = get_src_reg(c, inst, 1, 1);
 865     nr = src0.nr;
 866
 867     interp[0] = brw_vec1_grf(nr, 0);
 868     interp[1] = brw_vec1_grf(nr, 4);
 869     interp[2] = brw_vec1_grf(nr+1, 0);
 870     interp[3] = brw_vec1_grf(nr+1, 4);
 871
 872     for(i = 0; i < 4; i++ ) {
 873         if (mask & (1<<i)) {
 874             dst = get_dst_reg(c, inst, i);
 875             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 876             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 877         }
 878     }
 879 }
 880
 881 static void emit_cinterp(struct brw_wm_compile *c,
 882                          const struct prog_instruction *inst)
 883 {
 884     struct brw_compile *p = &c->func;
 885     GLuint mask = inst->DstReg.WriteMask;
 886
 887     struct brw_reg interp[4];
 888     struct brw_reg dst, src0;
 889     GLuint nr, i;
 890
 891     src0 = get_src_reg(c, inst, 0, 0);
 892     nr = src0.nr;
 893
 894     interp[0] = brw_vec1_grf(nr, 0);
 895     interp[1] = brw_vec1_grf(nr, 4);
 896     interp[2] = brw_vec1_grf(nr+1, 0);
 897     interp[3] = brw_vec1_grf(nr+1, 4);
 898
 899     for(i = 0; i < 4; i++ ) {
 900         if (mask & (1<<i)) {
 901             dst = get_dst_reg(c, inst, i);
 902             brw_MOV(p, dst, suboffset(interp[i],3));
 903         }
 904     }
 905 }
 906
 907 static void emit_pinterp(struct brw_wm_compile *c,
 908                          const struct prog_instruction *inst)
 909 {
 910     struct brw_compile *p = &c->func;
 911     GLuint mask = inst->DstReg.WriteMask;
 912
 913     struct brw_reg interp[4];
 914     struct brw_reg dst, delta0, delta1;
 915     struct brw_reg src0, w;
 916     GLuint nr, i;
 917
 918     src0 = get_src_reg(c, inst, 0, 0);
 919     delta0 = get_src_reg(c, inst, 1, 0);
 920     delta1 = get_src_reg(c, inst, 1, 1);
 921     w = get_src_reg(c, inst, 2, 3);
 922     nr = src0.nr;
 923
 924     interp[0] = brw_vec1_grf(nr, 0);
 925     interp[1] = brw_vec1_grf(nr, 4);
 926     interp[2] = brw_vec1_grf(nr+1, 0);
 927     interp[3] = brw_vec1_grf(nr+1, 4);
 928
 929     for(i = 0; i < 4; i++ ) {
 930         if (mask & (1<<i)) {
 931             dst = get_dst_reg(c, inst, i);
 932             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 933             brw_MAC(p, dst, suboffset(interp[i],1),
 934                     delta1);
 935             brw_MUL(p, dst, dst, w);
 936         }
 937     }
 938 }
 939
 940 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 941 static void emit_frontfacing(struct brw_wm_compile *c,
 942                              const struct prog_instruction *inst)
 943 {
 944     struct brw_compile *p = &c->func;
 945     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 946     struct brw_reg dst;
 947     GLuint mask = inst->DstReg.WriteMask;
 948     int i;
 949
 950     for (i = 0; i < 4; i++) {
 951         if (mask & (1<<i)) {
 952             dst = get_dst_reg(c, inst, i);
 953             brw_MOV(p, dst, brw_imm_f(0.0));
 954         }
 955     }
 956
 957     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 958      * us front face
 959      */
 960     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 961     for (i = 0; i < 4; i++) {
 962         if (mask & (1<<i)) {
 963             dst = get_dst_reg(c, inst, i);
 964             brw_MOV(p, dst, brw_imm_f(1.0));
 965         }
 966     }
 967     brw_set_predicate_control_flag_value(p, 0xff);
 968 }
 969
 970 static void emit_xpd(struct brw_wm_compile *c,
 971                      const struct prog_instruction *inst)
 972 {
 973     int i;
 974     struct brw_compile *p = &c->func;
 975     GLuint mask = inst->DstReg.WriteMask;
 976     for (i = 0; i < 4; i++) {
 977         GLuint i2 = (i+2)%3;
 978         GLuint i1 = (i+1)%3;
 979         if (mask & (1<<i)) {
 980             struct brw_reg src0, src1, dst;
 981             dst = get_dst_reg(c, inst, i);
 982             src0 = negate(get_src_reg(c, inst, 0, i2));
 983             src1 = get_src_reg_imm(c, inst, 1, i1);
 984             brw_MUL(p, brw_null_reg(), src0, src1);
 985             src0 = get_src_reg(c, inst, 0, i1);
 986             src1 = get_src_reg_imm(c, inst, 1, i2);
 987             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 988             brw_MAC(p, dst, src0, src1);
 989             brw_set_saturate(p, 0);
 990         }
 991     }
 992     brw_set_saturate(p, 0);
 993 }
 994
 995 static void emit_dp3(struct brw_wm_compile *c,
 996                      const struct prog_instruction *inst)
 997 {
 998     struct brw_reg src0[3], src1[3], dst;
 999     int i;
1000     struct brw_compile *p = &c->func;
1001     for (i = 0; i < 3; i++) {
1002         src0[i] = get_src_reg(c, inst, 0, i);
1003         src1[i] = get_src_reg_imm(c, inst, 1, i);
1004     }
1005
1006     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1007     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1008     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1009     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1010     brw_MAC(p, dst, src0[2], src1[2]);
1011     brw_set_saturate(p, 0);
1012 }
1013
1014 static void emit_dp4(struct brw_wm_compile *c,
1015                      const struct prog_instruction *inst)
1016 {
1017     struct brw_reg src0[4], src1[4], dst;
1018     int i;
1019     struct brw_compile *p = &c->func;
1020     for (i = 0; i < 4; i++) {
1021         src0[i] = get_src_reg(c, inst, 0, i);
1022         src1[i] = get_src_reg_imm(c, inst, 1, i);
1023     }
1024     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1025     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1026     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1027     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1028     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1029     brw_MAC(p, dst, src0[3], src1[3]);
1030     brw_set_saturate(p, 0);
1031 }
1032
1033 static void emit_dph(struct brw_wm_compile *c,
1034                      const struct prog_instruction *inst)
1035 {
1036     struct brw_reg src0[4], src1[4], dst;
1037     int i;
1038     struct brw_compile *p = &c->func;
1039     for (i = 0; i < 4; i++) {
1040         src0[i] = get_src_reg(c, inst, 0, i);
1041         src1[i] = get_src_reg_imm(c, inst, 1, i);
1042     }
1043     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1044     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1045     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1046     brw_MAC(p, dst, src0[2], src1[2]);
1047     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1048     brw_ADD(p, dst, dst, src1[3]);
1049     brw_set_saturate(p, 0);
1050 }
1051
1052 /**
1053  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1054  * Note that the result of the function is smeared across the dest
1055  * register's X, Y, Z and W channels (subject to writemasking of course).
1056  */
1057 static void emit_math1(struct brw_wm_compile *c,
1058                        const struct prog_instruction *inst, GLuint func)
1059 {
1060     struct brw_compile *p = &c->func;
1061     struct brw_reg src0, dst, tmp;
1062     const int mark = mark_tmps( c );
1063     int i;
1064
1065     tmp = alloc_tmp(c);
1066
1067     /* Get first component of source register */
1068     src0 = get_src_reg(c, inst, 0, 0);
1069
1070     /* tmp = func(src0) */
1071     brw_MOV(p, brw_message_reg(2), src0);
1072     brw_math(p,
1073              tmp,
1074              func,
1075              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1076              2,
1077              brw_null_reg(),
1078              BRW_MATH_DATA_VECTOR,
1079              BRW_MATH_PRECISION_FULL);
1080
1081     /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1082
1083     /* replicate tmp value across enabled dest channels */
1084     for (i = 0; i < 4; i++) {
1085        if (inst->DstReg.WriteMask & (1 << i)) {
1086           dst = get_dst_reg(c, inst, i);
1087           brw_MOV(p, dst, tmp);
1088        }
1089     }
1090
1091     release_tmps(c, mark);
1092 }
1093
1094 static void emit_rcp(struct brw_wm_compile *c,
1095                      const struct prog_instruction *inst)
1096 {
1097     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1098 }
1099
1100 static void emit_rsq(struct brw_wm_compile *c,
1101                      const struct prog_instruction *inst)
1102 {
1103     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1104 }
1105
1106 static void emit_sin(struct brw_wm_compile *c,
1107                      const struct prog_instruction *inst)
1108 {
1109     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1110 }
1111
1112 static void emit_cos(struct brw_wm_compile *c,
1113                      const struct prog_instruction *inst)
1114 {
1115     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1116 }
1117
1118 static void emit_ex2(struct brw_wm_compile *c,
1119                      const struct prog_instruction *inst)
1120 {
1121     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1122 }
1123
1124 static void emit_lg2(struct brw_wm_compile *c,
1125                      const struct prog_instruction *inst)
1126 {
1127     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1128 }
1129
1130 static void emit_add(struct brw_wm_compile *c,
1131                      const struct prog_instruction *inst)
1132 {
1133     struct brw_compile *p = &c->func;
1134     struct brw_reg src0, src1, dst;
1135     GLuint mask = inst->DstReg.WriteMask;
1136     int i;
1137     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1138     for (i = 0 ; i < 4; i++) {
1139         if (mask & (1<<i)) {
1140             dst = get_dst_reg(c, inst, i);
1141             src0 = get_src_reg(c, inst, 0, i);
1142             src1 = get_src_reg_imm(c, inst, 1, i);
1143             brw_ADD(p, dst, src0, src1);
1144         }
1145     }
1146     brw_set_saturate(p, 0);
1147 }
1148
1149 static void emit_arl(struct brw_wm_compile *c,
1150                      const struct prog_instruction *inst)
1151 {
1152     struct brw_compile *p = &c->func;
1153     struct brw_reg src0, addr_reg;
1154     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1155     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1156                            BRW_ARF_ADDRESS, 0);
1157     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1158     brw_MOV(p, addr_reg, src0);
1159     brw_set_saturate(p, 0);
1160 }
1161
1162 static void emit_sub(struct brw_wm_compile *c,
1163                      const struct prog_instruction *inst)
1164 {
1165     struct brw_compile *p = &c->func;
1166     struct brw_reg src0, src1, dst;
1167     GLuint mask = inst->DstReg.WriteMask;
1168     int i;
1169     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1170     for (i = 0 ; i < 4; i++) {
1171         if (mask & (1<<i)) {
1172             dst = get_dst_reg(c, inst, i);
1173             src0 = get_src_reg(c, inst, 0, i);
1174             src1 = get_src_reg_imm(c, inst, 1, i);
1175             brw_ADD(p, dst, src0, negate(src1));
1176         }
1177     }
1178     brw_set_saturate(p, 0);
1179 }
1180
1181 static void emit_mul(struct brw_wm_compile *c,
1182                      const struct prog_instruction *inst)
1183 {
1184     struct brw_compile *p = &c->func;
1185     struct brw_reg src0, src1, dst;
1186     GLuint mask = inst->DstReg.WriteMask;
1187     int i;
1188     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1189     for (i = 0 ; i < 4; i++) {
1190         if (mask & (1<<i)) {
1191             dst = get_dst_reg(c, inst, i);
1192             src0 = get_src_reg(c, inst, 0, i);
1193             src1 = get_src_reg_imm(c, inst, 1, i);
1194             brw_MUL(p, dst, src0, src1);
1195         }
1196     }
1197     brw_set_saturate(p, 0);
1198 }
1199
1200 static void emit_frc(struct brw_wm_compile *c,
1201                      const struct prog_instruction *inst)
1202 {
1203     struct brw_compile *p = &c->func;
1204     struct brw_reg src0, dst;
1205     GLuint mask = inst->DstReg.WriteMask;
1206     int i;
1207     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1208     for (i = 0 ; i < 4; i++) {
1209         if (mask & (1<<i)) {
1210             dst = get_dst_reg(c, inst, i);
1211             src0 = get_src_reg_imm(c, inst, 0, i);
1212             brw_FRC(p, dst, src0);
1213         }
1214     }
1215     if (inst->SaturateMode != SATURATE_OFF)
1216         brw_set_saturate(p, 0);
1217 }
1218
1219 static void emit_flr(struct brw_wm_compile *c,
1220                      const struct prog_instruction *inst)
1221 {
1222     struct brw_compile *p = &c->func;
1223     struct brw_reg src0, dst;
1224     GLuint mask = inst->DstReg.WriteMask;
1225     int i;
1226     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1227     for (i = 0 ; i < 4; i++) {
1228         if (mask & (1<<i)) {
1229             dst = get_dst_reg(c, inst, i);
1230             src0 = get_src_reg_imm(c, inst, 0, i);
1231             brw_RNDD(p, dst, src0);
1232         }
1233     }
1234     brw_set_saturate(p, 0);
1235 }
1236
1237
1238 static void emit_min_max(struct brw_wm_compile *c,
1239                          const struct prog_instruction *inst)
1240 {
1241     struct brw_compile *p = &c->func;
1242     const GLuint mask = inst->DstReg.WriteMask;
1243     const int mark = mark_tmps(c);
1244     int i;
1245     brw_push_insn_state(p);
1246     for (i = 0; i < 4; i++) {
1247         if (mask & (1<<i)) {
1248             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1249             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1250             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1251             struct brw_reg dst;
1252             /* if dst==src0 or dst==src1 we need to use a temp reg */
1253             GLboolean use_temp = brw_same_reg(dst, src0) ||
1254                                  brw_same_reg(dst, src1);
1255             if (use_temp)
1256                dst = alloc_tmp(c);
1257             else
1258                dst = real_dst;
1259
1260             /*
1261             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1262                    dst.nr, src0.nr, src1.nr);
1263             */
1264             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1265             brw_MOV(p, dst, src0);
1266             brw_set_saturate(p, 0);
1267
1268             if (inst->Opcode == OPCODE_MIN)
1269                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1270             else
1271                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1272
1273             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1274             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1275             brw_MOV(p, dst, src1);
1276             brw_set_saturate(p, 0);
1277             brw_set_predicate_control_flag_value(p, 0xff);
1278             if (use_temp)
1279                brw_MOV(p, real_dst, dst);
1280         }
1281     }
1282     brw_pop_insn_state(p);
1283     release_tmps(c, mark);
1284 }
1285
1286 static void emit_pow(struct brw_wm_compile *c,
1287                      const struct prog_instruction *inst)
1288 {
1289     struct brw_compile *p = &c->func;
1290     struct brw_reg dst, src0, src1;
1291     dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1292     src0 = get_src_reg_imm(c, inst, 0, 0);
1293     src1 = get_src_reg_imm(c, inst, 1, 0);
1294
1295     brw_MOV(p, brw_message_reg(2), src0);
1296     brw_MOV(p, brw_message_reg(3), src1);
1297
1298     brw_math(p,
1299             dst,
1300             BRW_MATH_FUNCTION_POW,
1301             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1302             2,
1303             brw_null_reg(),
1304             BRW_MATH_DATA_VECTOR,
1305             BRW_MATH_PRECISION_FULL);
1306 }
1307
1308 static void emit_lrp(struct brw_wm_compile *c,
1309                      const struct prog_instruction *inst)
1310 {
1311     struct brw_compile *p = &c->func;
1312     GLuint mask = inst->DstReg.WriteMask;
1313     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1314     int i;
1315     int mark = mark_tmps(c);
1316     for (i = 0; i < 4; i++) {
1317         if (mask & (1<<i)) {
1318             dst = get_dst_reg(c, inst, i);
1319             src0 = get_src_reg(c, inst, 0, i);
1320
1321             src1 = get_src_reg_imm(c, inst, 1, i);
1322
1323             if (src1.nr == dst.nr) {
1324                 tmp1 = alloc_tmp(c);
1325                 brw_MOV(p, tmp1, src1);
1326             } else
1327                 tmp1 = src1;
1328
1329             src2 = get_src_reg(c, inst, 2, i);
1330             if (src2.nr == dst.nr) {
1331                 tmp2 = alloc_tmp(c);
1332                 brw_MOV(p, tmp2, src2);
1333             } else
1334                 tmp2 = src2;
1335
1336             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1337             brw_MUL(p, brw_null_reg(), dst, tmp2);
1338             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1339             brw_MAC(p, dst, src0, tmp1);
1340             brw_set_saturate(p, 0);
1341         }
1342         release_tmps(c, mark);
1343     }
1344 }
1345
1346 /**
1347  * For GLSL shaders, this KIL will be unconditional.
1348  * It may be contained inside an IF/ENDIF structure of course.
1349  */
1350 static void emit_kil(struct brw_wm_compile *c)
1351 {
1352     struct brw_compile *p = &c->func;
1353     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1354     brw_push_insn_state(p);
1355     brw_set_mask_control(p, BRW_MASK_DISABLE);
1356     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1357     brw_AND(p, depth, c->emit_mask_reg, depth);
1358     brw_pop_insn_state(p);
1359 }
1360
1361 static void emit_mad(struct brw_wm_compile *c,
1362                      const struct prog_instruction *inst)
1363 {
1364     struct brw_compile *p = &c->func;
1365     GLuint mask = inst->DstReg.WriteMask;
1366     struct brw_reg dst, src0, src1, src2;
1367     int i;
1368
1369     for (i = 0; i < 4; i++) {
1370         if (mask & (1<<i)) {
1371             dst = get_dst_reg(c, inst, i);
1372             src0 = get_src_reg(c, inst, 0, i);
1373             src1 = get_src_reg_imm(c, inst, 1, i);
1374             src2 = get_src_reg_imm(c, inst, 2, i);
1375             brw_MUL(p, dst, src0, src1);
1376
1377             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1378             brw_ADD(p, dst, dst, src2);
1379             brw_set_saturate(p, 0);
1380         }
1381     }
1382 }
1383
1384 static void emit_sop(struct brw_wm_compile *c,
1385                      const struct prog_instruction *inst, GLuint cond)
1386 {
1387     struct brw_compile *p = &c->func;
1388     GLuint mask = inst->DstReg.WriteMask;
1389     struct brw_reg dst, src0, src1;
1390     int i;
1391
1392     for (i = 0; i < 4; i++) {
1393         if (mask & (1<<i)) {
1394             dst = get_dst_reg(c, inst, i);
1395             src0 = get_src_reg(c, inst, 0, i);
1396             src1 = get_src_reg_imm(c, inst, 1, i);
1397             brw_push_insn_state(p);
1398             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1399             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1400             brw_MOV(p, dst, brw_imm_f(0.0));
1401             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1402             brw_MOV(p, dst, brw_imm_f(1.0));
1403             brw_pop_insn_state(p);
1404         }
1405     }
1406 }
1407
1408 static void emit_slt(struct brw_wm_compile *c,
1409                      const struct prog_instruction *inst)
1410 {
1411     emit_sop(c, inst, BRW_CONDITIONAL_L);
1412 }
1413
1414 static void emit_sle(struct brw_wm_compile *c,
1415                      const struct prog_instruction *inst)
1416 {
1417     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1418 }
1419
1420 static void emit_sgt(struct brw_wm_compile *c,
1421                      const struct prog_instruction *inst)
1422 {
1423     emit_sop(c, inst, BRW_CONDITIONAL_G);
1424 }
1425
1426 static void emit_sge(struct brw_wm_compile *c,
1427                      const struct prog_instruction *inst)
1428 {
1429     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1430 }
1431
1432 static void emit_seq(struct brw_wm_compile *c,
1433                      const struct prog_instruction *inst)
1434 {
1435     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1436 }
1437
1438 static void emit_sne(struct brw_wm_compile *c,
1439                      const struct prog_instruction *inst)
1440 {
1441     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1442 }
1443
1444 static void emit_ddx(struct brw_wm_compile *c,
1445                      const struct prog_instruction *inst)
1446 {
1447     struct brw_compile *p = &c->func;
1448     GLuint mask = inst->DstReg.WriteMask;
1449     struct brw_reg interp[4];
1450     struct brw_reg dst;
1451     struct brw_reg src0, w;
1452     GLuint nr, i;
1453     src0 = get_src_reg(c, inst, 0, 0);
1454     w = get_src_reg(c, inst, 1, 3);
1455     nr = src0.nr;
1456     interp[0] = brw_vec1_grf(nr, 0);
1457     interp[1] = brw_vec1_grf(nr, 4);
1458     interp[2] = brw_vec1_grf(nr+1, 0);
1459     interp[3] = brw_vec1_grf(nr+1, 4);
1460     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1461     for(i = 0; i < 4; i++ ) {
1462         if (mask & (1<<i)) {
1463             dst = get_dst_reg(c, inst, i);
1464             brw_MOV(p, dst, interp[i]);
1465             brw_MUL(p, dst, dst, w);
1466         }
1467     }
1468     brw_set_saturate(p, 0);
1469 }
1470
1471 static void emit_ddy(struct brw_wm_compile *c,
1472                      const struct prog_instruction *inst)
1473 {
1474     struct brw_compile *p = &c->func;
1475     GLuint mask = inst->DstReg.WriteMask;
1476     struct brw_reg interp[4];
1477     struct brw_reg dst;
1478     struct brw_reg src0, w;
1479     GLuint nr, i;
1480
1481     src0 = get_src_reg(c, inst, 0, 0);
1482     nr = src0.nr;
1483     w = get_src_reg(c, inst, 1, 3);
1484     interp[0] = brw_vec1_grf(nr, 0);
1485     interp[1] = brw_vec1_grf(nr, 4);
1486     interp[2] = brw_vec1_grf(nr+1, 0);
1487     interp[3] = brw_vec1_grf(nr+1, 4);
1488     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1489     for(i = 0; i < 4; i++ ) {
1490         if (mask & (1<<i)) {
1491             dst = get_dst_reg(c, inst, i);
1492             brw_MOV(p, dst, suboffset(interp[i], 1));
1493             brw_MUL(p, dst, dst, w);
1494         }
1495     }
1496     brw_set_saturate(p, 0);
1497 }
1498
1499 static INLINE struct brw_reg high_words( struct brw_reg reg )
1500 {
1501     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1502                    0, 8, 2 );
1503 }
1504
1505 static INLINE struct brw_reg low_words( struct brw_reg reg )
1506 {
1507     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1508 }
1509
1510 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1511 {
1512     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1513 }
1514
1515 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1516 {
1517     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1518                    0, 16, 2 );
1519 }
1520
1521 /* One-, two- and three-dimensional Perlin noise, similar to the description
1522    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1523 static void noise1_sub( struct brw_wm_compile *c ) {
1524
1525     struct brw_compile *p = &c->func;
1526     struct brw_reg param,
1527         x0, x1, /* gradients at each end */
1528         t, tmp[ 2 ], /* float temporaries */
1529         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1530     int i;
1531     int mark = mark_tmps( c );
1532
1533     x0 = alloc_tmp( c );
1534     x1 = alloc_tmp( c );
1535     t = alloc_tmp( c );
1536     tmp[ 0 ] = alloc_tmp( c );
1537     tmp[ 1 ] = alloc_tmp( c );
1538     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1539     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1540     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1541     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1542     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1543
1544     param = lookup_tmp( c, mark - 2 );
1545
1546     brw_set_access_mode( p, BRW_ALIGN_1 );
1547
1548     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1549
1550     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1551        be hashed.  Also compute the remainder (offset within the unit
1552        length), interleaved to reduce register dependency penalties. */
1553     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1554     brw_FRC( p, param, param );
1555     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1556     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1557     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1558
1559     /* We're now ready to perform the hashing.  The two hashes are
1560        interleaved for performance.  The hash function used is
1561        designed to rapidly achieve avalanche and require only 32x16
1562        bit multiplication, and 16-bit swizzles (which we get for
1563        free).  We can't use immediate operands in the multiplies,
1564        because immediates are permitted only in src1 and the 16-bit
1565        factor is permitted only in src0. */
1566     for( i = 0; i < 2; i++ )
1567         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1568     for( i = 0; i < 2; i++ )
1569        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1570                 high_words( itmp[ i ] ) );
1571     for( i = 0; i < 2; i++ )
1572         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1573     for( i = 0; i < 2; i++ )
1574        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1575                 high_words( itmp[ i ] ) );
1576     for( i = 0; i < 2; i++ )
1577         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1578     for( i = 0; i < 2; i++ )
1579        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1580                 high_words( itmp[ i ] ) );
1581
1582     /* Now we want to initialise the two gradients based on the
1583        hashes.  Format conversion from signed integer to float leaves
1584        everything scaled too high by a factor of pow( 2, 31 ), but
1585        we correct for that right at the end. */
1586     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1587     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1588     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1589
1590     brw_MUL( p, x0, x0, param );
1591     brw_MUL( p, x1, x1, t );
1592
1593     /* We interpolate between the gradients using the polynomial
1594        6t^5 - 15t^4 + 10t^3 (Perlin). */
1595     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1596     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1597     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1598     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1599     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1600     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1601                                            pipeline */
1602     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1603     brw_MUL( p, param, tmp[ 0 ], param );
1604     brw_MUL( p, x1, x1, param );
1605     brw_ADD( p, x0, x0, x1 );
1606     /* scale by pow( 2, -30 ), to compensate for the format conversion
1607        above and an extra factor of 2 so that a single gradient covers
1608        the [-1,1] range */
1609     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1610
1611     release_tmps( c, mark );
1612 }
1613
1614 static void emit_noise1( struct brw_wm_compile *c,
1615                          const struct prog_instruction *inst )
1616 {
1617     struct brw_compile *p = &c->func;
1618     struct brw_reg src, param, dst;
1619     GLuint mask = inst->DstReg.WriteMask;
1620     int i;
1621     int mark = mark_tmps( c );
1622
1623     assert( mark == 0 );
1624
1625     src = get_src_reg( c, inst, 0, 0 );
1626
1627     param = alloc_tmp( c );
1628
1629     brw_MOV( p, param, src );
1630
1631     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1632
1633     /* Fill in the result: */
1634     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1635     for (i = 0 ; i < 4; i++) {
1636         if (mask & (1<<i)) {
1637             dst = get_dst_reg(c, inst, i);
1638             brw_MOV( p, dst, param );
1639         }
1640     }
1641     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1642         brw_set_saturate( p, 0 );
1643
1644     release_tmps( c, mark );
1645 }
1646
1647 static void noise2_sub( struct brw_wm_compile *c ) {
1648
1649     struct brw_compile *p = &c->func;
1650     struct brw_reg param0, param1,
1651         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1652         t, tmp[ 4 ], /* float temporaries */
1653         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1654     int i;
1655     int mark = mark_tmps( c );
1656
1657     x0y0 = alloc_tmp( c );
1658     x0y1 = alloc_tmp( c );
1659     x1y0 = alloc_tmp( c );
1660     x1y1 = alloc_tmp( c );
1661     t = alloc_tmp( c );
1662     for( i = 0; i < 4; i++ ) {
1663         tmp[ i ] = alloc_tmp( c );
1664         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1665     }
1666     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1667     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1668     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1669
1670     param0 = lookup_tmp( c, mark - 3 );
1671     param1 = lookup_tmp( c, mark - 2 );
1672
1673     brw_set_access_mode( p, BRW_ALIGN_1 );
1674
1675     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1676        be hashed.  Also compute the remainders (offsets within the unit
1677        square), interleaved to reduce register dependency penalties. */
1678     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1679     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1680     brw_FRC( p, param0, param0 );
1681     brw_FRC( p, param1, param1 );
1682     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1683     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1684              low_words( itmp[ 1 ] ) );
1685     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1686     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1687     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1688     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1689     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1690
1691     /* We're now ready to perform the hashing.  The four hashes are
1692        interleaved for performance.  The hash function used is
1693        designed to rapidly achieve avalanche and require only 32x16
1694        bit multiplication, and 16-bit swizzles (which we get for
1695        free).  We can't use immediate operands in the multiplies,
1696        because immediates are permitted only in src1 and the 16-bit
1697        factor is permitted only in src0. */
1698     for( i = 0; i < 4; i++ )
1699         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1700     for( i = 0; i < 4; i++ )
1701         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1702                  high_words( itmp[ i ] ) );
1703     for( i = 0; i < 4; i++ )
1704         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1705     for( i = 0; i < 4; i++ )
1706         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1707                  high_words( itmp[ i ] ) );
1708     for( i = 0; i < 4; i++ )
1709         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1710     for( i = 0; i < 4; i++ )
1711         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1712                  high_words( itmp[ i ] ) );
1713
1714     /* Now we want to initialise the four gradients based on the
1715        hashes.  Format conversion from signed integer to float leaves
1716        everything scaled too high by a factor of pow( 2, 15 ), but
1717        we correct for that right at the end. */
1718     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1719     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1720     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1721     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1722     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1723
1724     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1725     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1726     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1727     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1728
1729     brw_MUL( p, x1y0, x1y0, t );
1730     brw_MUL( p, x1y1, x1y1, t );
1731     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1732     brw_MUL( p, x0y0, x0y0, param0 );
1733     brw_MUL( p, x0y1, x0y1, param0 );
1734
1735     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1736     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1737     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1738     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1739
1740     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1741     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1742     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1743     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1744
1745     /* We interpolate between the gradients using the polynomial
1746        6t^5 - 15t^4 + 10t^3 (Perlin). */
1747     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1748     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1749     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1750     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1751     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1752     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1753     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1754                                                  pipeline */
1755     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1756     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1757     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1758     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1759     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1760                                                  pipeline */
1761     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1762     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1763     brw_MUL( p, param0, tmp[ 0 ], param0 );
1764     brw_MUL( p, param1, tmp[ 1 ], param1 );
1765
1766     /* Here we interpolate in the y dimension... */
1767     brw_MUL( p, x0y1, x0y1, param1 );
1768     brw_MUL( p, x1y1, x1y1, param1 );
1769     brw_ADD( p, x0y0, x0y0, x0y1 );
1770     brw_ADD( p, x1y0, x1y0, x1y1 );
1771
1772     /* And now in x.  There are horrible register dependencies here,
1773        but we have nothing else to do. */
1774     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1775     brw_MUL( p, x1y0, x1y0, param0 );
1776     brw_ADD( p, x0y0, x0y0, x1y0 );
1777
1778     /* scale by pow( 2, -15 ), as described above */
1779     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1780
1781     release_tmps( c, mark );
1782 }
1783
1784 static void emit_noise2( struct brw_wm_compile *c,
1785                          const struct prog_instruction *inst )
1786 {
1787     struct brw_compile *p = &c->func;
1788     struct brw_reg src0, src1, param0, param1, dst;
1789     GLuint mask = inst->DstReg.WriteMask;
1790     int i;
1791     int mark = mark_tmps( c );
1792
1793     assert( mark == 0 );
1794
1795     src0 = get_src_reg( c, inst, 0, 0 );
1796     src1 = get_src_reg( c, inst, 0, 1 );
1797
1798     param0 = alloc_tmp( c );
1799     param1 = alloc_tmp( c );
1800
1801     brw_MOV( p, param0, src0 );
1802     brw_MOV( p, param1, src1 );
1803
1804     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1805
1806     /* Fill in the result: */
1807     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1808     for (i = 0 ; i < 4; i++) {
1809         if (mask & (1<<i)) {
1810             dst = get_dst_reg(c, inst, i);
1811             brw_MOV( p, dst, param0 );
1812         }
1813     }
1814     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1815         brw_set_saturate( p, 0 );
1816
1817     release_tmps( c, mark );
1818 }
1819
1820 /**
1821  * The three-dimensional case is much like the one- and two- versions above,
1822  * but since the number of corners is rapidly growing we now pack 16 16-bit
1823  * hashes into each register to extract more parallelism from the EUs.
 *
 * Emits the body of the 3-D noise subroutine into c->func.  The three
 * coordinates are read from existing temporaries, found with lookup_tmp()
 * at mark - 4, mark - 3 and mark - 2 (presumably the ones emit_noise3()
 * allocates before calling invoke_subroutine() -- TODO confirm the slot
 * layout).  They are overwritten with their fractional parts, and the
 * final noise value is stored in the first of them (param0).
 * release_tmps( c, mark ) is called at the end for the temporaries
 * allocated here.
1824  */
1825 static void noise3_sub( struct brw_wm_compile *c ) {
1826
1827     struct brw_compile *p = &c->func;
1828     struct brw_reg param0, param1, param2,
1829         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1830         xi, yi, zi, /* interpolation coefficients */
1831         t, tmp[ 8 ], /* float temporaries */
1832         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1833         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1834     int i;
1835     int mark = mark_tmps( c );
1836
1837     x0y0 = alloc_tmp( c );
1838     x0y1 = alloc_tmp( c );
1839     x1y0 = alloc_tmp( c );
1840     x1y1 = alloc_tmp( c );
1841     xi = alloc_tmp( c );
1842     yi = alloc_tmp( c );
1843     zi = alloc_tmp( c );
1844     t = alloc_tmp( c );
    /* itmp[ i ] and wtmp[ i ] are built from tmp[ i ] itself (a retype and
       a 16-wide word view of register number tmp[ i ].nr), so all three
       names refer to the same register. */
1845     for( i = 0; i < 8; i++ ) {
1846         tmp[ i ] = alloc_tmp( c );
1847         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1848         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1849     }
1850
    /* The arguments are looked up at indices below our own mark
       (presumably temporaries set up by the caller; the slot at mark - 1
       is not used here -- TODO confirm its purpose). */
1851     param0 = lookup_tmp( c, mark - 4 );
1852     param1 = lookup_tmp( c, mark - 3 );
1853     param2 = lookup_tmp( c, mark - 2 );
1854
1855     brw_set_access_mode( p, BRW_ALIGN_1 );
1856
1857     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1858        be hashed.  Also compute the remainders (offsets within the unit
1859        cube), interleaved to reduce register dependency penalties. */
1860     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1861     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1862     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1863     brw_FRC( p, param0, param0 );
1864     brw_FRC( p, param1, param1 );
1865     brw_FRC( p, param2, param2 );
1866     /* Since we now have only 16 bits of precision in the hash, we must
1867        be more careful about thorough mixing to maintain entropy as we
1868        squash the input vector into a small scalar. */
1869     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1870     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1871     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1872              brw_imm_uw( 0x9B93 ) );
    /* 0xBC8F is the multiplier applied to the x coordinate above, so adding
       it once more gives the hash input of the neighbouring corner at
       x + 1; that goes into the high words. */
1873     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1874              brw_imm_uw( 0xBC8F ) );
1875
1876     /* Temporarily disable the execution mask while we work with ExecSize=16
1877        channels (the mask is set for ExecSize=8 and is probably incorrect).
1878        Although this might cause execution of unwanted channels, the code
1879        writes only to temporary registers and has no side effects, so
1880        disabling the mask is harmless. */
1881     brw_push_insn_state( p );
1882     brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Likewise 0xD0BD and 0x9B93 are the y and z multipliers used above:
       wtmp[ 1 ] is wtmp[ 0 ] moved to y + 1, while wtmp[ 2 ] and wtmp[ 3 ]
       are wtmp[ 0 ] and wtmp[ 1 ] moved to z + 1. */
1883     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1884     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1885     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1886
1887     /* We're now ready to perform the hashing.  The eight hashes are
1888        interleaved for performance.  The hash function used is
1889        designed to rapidly achieve avalanche and require only 16x16
1890        bit multiplication, and 8-bit swizzles (which we get for
1891        free). */
1892     for( i = 0; i < 4; i++ )
1893         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1894     for( i = 0; i < 4; i++ )
1895         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1896                  odd_bytes( wtmp[ i ] ) );
1897     for( i = 0; i < 4; i++ )
1898         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1899     for( i = 0; i < 4; i++ )
1900         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1901                  odd_bytes( wtmp[ i ] ) );
1902     brw_pop_insn_state( p );
1903
1904     /* Now we want to initialise the four rear gradients based on the
1905        hashes.  Format conversion from signed integer to float leaves
1906        everything scaled too high by a factor of pow( 2, 15 ), but
1907        we correct for that right at the end. */
1908     /* x component */
1909     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); /* t = x - 1 */
1910     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1911     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1912     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1913     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1914
    /* Shift the hashes left so that the next component read from them is
       built from different bits of each 16-bit hash. */
1915     brw_push_insn_state( p );
1916     brw_set_mask_control( p, BRW_MASK_DISABLE );
1917     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1918     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1919     brw_pop_insn_state( p );
1920
1921     brw_MUL( p, x1y0, x1y0, t );
1922     brw_MUL( p, x1y1, x1y1, t );
1923     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = y - 1, used below */
1924     brw_MUL( p, x0y0, x0y0, param0 );
1925     brw_MUL( p, x0y1, x0y1, param0 );
1926
1927     /* y component */
1928     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1929     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1930     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1931     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1932
1933     brw_push_insn_state( p );
1934     brw_set_mask_control( p, BRW_MASK_DISABLE );
1935     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1936     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1937     brw_pop_insn_state( p );
1938
1939     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1940     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = x - 1 again.  It is not read in the rest of the rear-face code;
       its next use is the x component of the front four gradients. */
1941     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1942     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1943     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1944
1945     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1946     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1947     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1948     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1949
1950     /* z component */
1951     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1952     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1953     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1954     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1955
1956     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1957     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1958     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1959     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1960
1961     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1962     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1963     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1964     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1965
1966     /* We interpolate between the gradients using the polynomial
1967        6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated per axis as ( ( 6t - 15 )t + 10 ) * t * t * t. */
1968     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1969     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1970     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1971     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1972     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1973     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1974     brw_MUL( p, xi, xi, param0 );
1975     brw_MUL( p, yi, yi, param1 );
1976     brw_MUL( p, zi, zi, param2 );
1977     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1978     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1979     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    /* These two subtractions are the first step of the y interpolation
       further down; they are merely issued early. */
1980     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1981     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1982     brw_MUL( p, xi, xi, param0 );
1983     brw_MUL( p, yi, yi, param1 );
1984     brw_MUL( p, zi, zi, param2 );
1985     brw_MUL( p, xi, xi, param0 );
1986     brw_MUL( p, yi, yi, param1 );
1987     brw_MUL( p, zi, zi, param2 );
1988     brw_MUL( p, xi, xi, param0 );
1989     brw_MUL( p, yi, yi, param1 );
1990     brw_MUL( p, zi, zi, param2 );
1991
1992     /* Here we interpolate in the y dimension... */
1993     brw_MUL( p, x0y1, x0y1, yi );
1994     brw_MUL( p, x1y1, x1y1, yi );
1995     brw_ADD( p, x0y0, x0y0, x0y1 );
1996     brw_ADD( p, x1y0, x1y0, x1y1 );
1997
1998     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    /* (tmp[ 0 ] held hashes until now; the front face below reads only
       tmp[ 2 ] and tmp[ 3 ], so overwriting it here is safe.) */
1999     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2000     brw_MUL( p, x1y0, x1y0, xi );
2001     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2002
2003     /* Now do the same thing for the front four gradients... */
2004     /* x component */
2005     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2006     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2007     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2008     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2009
2010     brw_push_insn_state( p );
2011     brw_set_mask_control( p, BRW_MASK_DISABLE );
2012     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2013     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2014     brw_pop_insn_state( p );
2015
    /* t still holds x - 1 from the rear-face y component above. */
2016     brw_MUL( p, x1y0, x1y0, t );
2017     brw_MUL( p, x1y1, x1y1, t );
2018     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = y - 1 */
2019     brw_MUL( p, x0y0, x0y0, param0 );
2020     brw_MUL( p, x0y1, x0y1, param0 );
2021
2022     /* y component */
2023     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2024     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2025     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2026     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2027
2028     brw_push_insn_state( p );
2029     brw_set_mask_control( p, BRW_MASK_DISABLE );
2030     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2031     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2032     brw_pop_insn_state( p );
2033
2034     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2035     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = z - 1: unlike the rear face, which multiplied by param2 itself,
       all four front corners use the z offset minus one. */
2036     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2037     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2038     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2039
2040     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2041     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2042     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2043     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2044
2045     /* z component */
2046     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2047     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2048     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2049     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2050
2051     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2052     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2053     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2054     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2055
2056     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2057     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2058     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2059     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2060
2061     /* The interpolation coefficients are still around from last time, so
2062        again interpolate in the y dimension... */
2063     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2064     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2065     brw_MUL( p, x0y1, x0y1, yi );
2066     brw_MUL( p, x1y1, x1y1, yi );
2067     brw_ADD( p, x0y0, x0y0, x0y1 );
2068     brw_ADD( p, x1y0, x1y0, x1y1 );
2069
2070     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2071        time put the front face in tmp[ 1 ] and we're nearly there... */
2072     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2073     brw_MUL( p, x1y0, x1y0, xi );
2074     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2075
2076     /* The final interpolation, in the z dimension: */
2077     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2078     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2079     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2080
2081     /* scale by pow( 2, -15 ), as described above */
    /* The scaled value is written to param0, which is where emit_noise3()
       reads the result after the subroutine call. */
2082     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2083
2084     release_tmps( c, mark );
2085 }
2086
2087 static void emit_noise3( struct brw_wm_compile *c,
2088                          const struct prog_instruction *inst )
2089 {
2090     struct brw_compile *p = &c->func;
2091     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2092     GLuint mask = inst->DstReg.WriteMask;
2093     int i;
2094     int mark = mark_tmps( c );
2095
2096     assert( mark == 0 );
2097
2098     src0 = get_src_reg( c, inst, 0, 0 );
2099     src1 = get_src_reg( c, inst, 0, 1 );
2100     src2 = get_src_reg( c, inst, 0, 2 );
2101
2102     param0 = alloc_tmp( c );
2103     param1 = alloc_tmp( c );
2104     param2 = alloc_tmp( c );
2105
2106     brw_MOV( p, param0, src0 );
2107     brw_MOV( p, param1, src1 );
2108     brw_MOV( p, param2, src2 );
2109
2110     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2111
2112     /* Fill in the result: */
2113     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2114     for (i = 0 ; i < 4; i++) {
2115         if (mask & (1<<i)) {
2116             dst = get_dst_reg(c, inst, i);
2117             brw_MOV( p, dst, param0 );
2118         }
2119     }
2120     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2121         brw_set_saturate( p, 0 );
2122
2123     release_tmps( c, mark );
2124 }
2125
2126 /**
2127  * For the four-dimensional case, the little micro-optimisation benefits
2128  * we obtain by unrolling all the loops aren't worth the massive bloat it
2129  * now causes.  Instead, we loop twice around performing a similar operation
2130  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2131  * code to glue it all together.
 *
 * Emits the body of the 4-D noise subroutine into c->func.  The four
 * coordinates are read from the temporaries at mark - 5 .. mark - 2
 * (looked up with lookup_tmp(); presumably allocated by emit_noise4() --
 * TODO confirm).  They are replaced by their fractional parts, and the
 * final noise value is written to param[ 0 ].  The two-pass loop is not a
 * structured loop: it is built from explicit additions to the IP register,
 * and the flag register tells the first pass from the second.
2132  */
2133 static void noise4_sub( struct brw_wm_compile *c )
2134 {
2135     struct brw_compile *p = &c->func;
2136     struct brw_reg param[ 4 ],
2137         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2138         w0, /* noise for the w=0 cube */
2139         floors[ 2 ], /* integer coordinates of base corner of hypercube */
2140         interp[ 4 ], /* interpolation coefficients */
2141         t, tmp[ 8 ], /* float temporaries */
2142         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2143         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2144     int i, j;
2145     int mark = mark_tmps( c );
    /* Instruction indices (values of p->nr_insn): the top of the loop and
       the conditional exit branch that is patched afterwards. */
2146     GLuint loop, origin;
2147
2148     x0y0 = alloc_tmp( c );
2149     x0y1 = alloc_tmp( c );
2150     x1y0 = alloc_tmp( c );
2151     x1y1 = alloc_tmp( c );
2152     t = alloc_tmp( c );
2153     w0 = alloc_tmp( c );
2154     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2155     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2156
2157     for( i = 0; i < 4; i++ ) {
2158         param[ i ] = lookup_tmp( c, mark - 5 + i );
2159         interp[ i ] = alloc_tmp( c );
2160     }
2161
    /* itmp[ i ] and wtmp[ i ] are built from tmp[ i ] itself (a retype and
       a 16-wide word view of register number tmp[ i ].nr), so all three
       names refer to the same register. */
2162     for( i = 0; i < 8; i++ ) {
2163         tmp[ i ] = alloc_tmp( c );
2164         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2165         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2166     }
2167
2168     brw_set_access_mode( p, BRW_ALIGN_1 );
2169
2170     /* We only want 16 bits of precision from the integral part of each
2171        co-ordinate, but unfortunately the RNDD semantics would saturate
2172        at 16 bits if we performed the operation directly to a 16-bit
2173        destination.  Therefore, we round to 32-bit temporaries where
2174        appropriate, and then store only the lower 16 bits. */
2175     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2176     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2177     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2178     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2179     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2180     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
    /* Packing after the two MOVs: floors[ 0 ] has the rounded-down x in
       its low words and y in its high words; floors[ 1 ] has z in its low
       words and w in its high words. */
2181
2182     /* Modify the flag register here, because the side effect is useful
2183        later (see below).  We know for certain that all flags will be
2184        cleared, since the FRC instruction cannot possibly generate
2185        negative results.  Even for exceptional inputs (infinities, denormals,
2186        NaNs), the architecture guarantees that the L conditional is false. */
2187     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2188     brw_FRC( p, param[ 0 ], param[ 0 ] );
    /* NOTE(review): presumably needed because emitting an instruction with
       a conditional modifier leaves predication switched on for whatever
       follows -- confirm in the brw_eu emitter. */
2189     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2190     for( i = 1; i < 4; i++ )
2191         brw_FRC( p, param[ i ], param[ i ] );
2192
2193     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2194        of all. */
    /* Evaluated per axis as ( ( 6t - 15 )t + 10 ) * t * t * t. */
2195     for( i = 0; i < 4; i++ )
2196         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2197     for( i = 0; i < 4; i++ )
2198         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2199     for( i = 0; i < 4; i++ )
2200         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2201     for( i = 0; i < 4; i++ )
2202         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2203     for( j = 0; j < 3; j++ )
2204         for( i = 0; i < 4; i++ )
2205             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2206
2207     /* Mark the current address, as it will be a jump destination.  The
2208        following code will be executed twice: first, with the flag
2209        register clear indicating the w=0 case, and second with flags
2210        set for w=1. */
2211     loop = p->nr_insn;
2212
2213     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2214        be hashed.  Since we have only 16 bits of precision in the hash, we
2215        must be careful about thorough mixing to maintain entropy as we
2216        squash the input vector into a small scalar. */
2217     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2218              brw_imm_uw( 0xBC8F ) );
2219     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2220              brw_imm_uw( 0xD0BD ) );
2221     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2222              brw_imm_uw( 0x9B93 ) );
2223     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2224              brw_imm_uw( 0xA359 ) );
    /* 0xBC8F is the multiplier applied to the x coordinate above, so adding
       it once more gives the hash input of the neighbouring corner at
       x + 1; that goes into the high words. */
2225     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2226              brw_imm_uw( 0xBC8F ) );
2227
2228     /* Temporarily disable the execution mask while we work with ExecSize=16
2229        channels (the mask is set for ExecSize=8 and is probably incorrect).
2230        Although this might cause execution of unwanted channels, the code
2231        writes only to temporary registers and has no side effects, so
2232        disabling the mask is harmless. */
2233     brw_push_insn_state( p );
2234     brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Likewise 0xD0BD and 0x9B93 are the y and z multipliers used above:
       wtmp[ 1 ] is wtmp[ 0 ] moved to y + 1, while wtmp[ 2 ] and wtmp[ 3 ]
       are wtmp[ 0 ] and wtmp[ 1 ] moved to z + 1. */
2235     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2236     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2237     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2238
2239     /* We're now ready to perform the hashing.  The eight hashes are
2240        interleaved for performance.  The hash function used is
2241        designed to rapidly achieve avalanche and require only 16x16
2242        bit multiplication, and 8-bit swizzles (which we get for
2243        free). */
2244     for( i = 0; i < 4; i++ )
2245         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2246     for( i = 0; i < 4; i++ )
2247         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2248                  odd_bytes( wtmp[ i ] ) );
2249     for( i = 0; i < 4; i++ )
2250         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2251     for( i = 0; i < 4; i++ )
2252         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2253                  odd_bytes( wtmp[ i ] ) );
2254     brw_pop_insn_state( p );
2255
2256     /* Now we want to initialise the four rear gradients based on the
2257        hashes.  Format conversion from signed integer to float leaves
2258        everything scaled too high by a factor of pow( 2, 15 ), but
2259        we correct for that right at the end. */
2260     /* x component */
2261     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); /* t = x - 1 */
2262     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2263     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2264     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2265     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2266
    /* Shift the hashes left so that the next component read from them is
       built from different bits of each 16-bit hash (4 bits per step
       here, for four components). */
2267     brw_push_insn_state( p );
2268     brw_set_mask_control( p, BRW_MASK_DISABLE );
2269     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2270     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2271     brw_pop_insn_state( p );
2272
2273     brw_MUL( p, x1y0, x1y0, t );
2274     brw_MUL( p, x1y1, x1y1, t );
2275     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); /* t = y - 1 */
2276     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2277     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2278
2279     /* y component */
2280     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2281     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2282     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2283     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2284
2285     brw_push_insn_state( p );
2286     brw_set_mask_control( p, BRW_MASK_DISABLE );
2287     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2288     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2289     brw_pop_insn_state( p );
2290
2291     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2292     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2293     /* prepare t for the w component (used below): w the first time through
2294        the loop; w - 1 the second time) */
    /* A predicated pair selects the value: the ADD is emitted with normal
       predication and the MOV with the predicate inverted, so which one
       takes effect depends on the flag register. */
2295     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2296     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2297     p->current->header.predicate_inverse = 1;
2298     brw_MOV( p, t, param[ 3 ] );
2299     p->current->header.predicate_inverse = 0;
2300     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2301     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2302     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2303
2304     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2305     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2306     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2307     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2308
2309     /* z component */
2310     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2311     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2312     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2313     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2314
2315     brw_push_insn_state( p );
2316     brw_set_mask_control( p, BRW_MASK_DISABLE );
2317     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2318     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2319     brw_pop_insn_state( p );
2320
2321     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2322     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2323     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2324     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2325
2326     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2327     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2328     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2329     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2330
2331     /* w component */
2332     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2333     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2334     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2335     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2336
2337     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2338     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2339     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2340     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = x - 1 again.  It is not read in the rest of the rear-face code;
       its next use is the x component of the front four gradients. */
2341     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2342
2343     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2344     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2345     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2346     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2347
2348     /* Here we interpolate in the y dimension... */
2349     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2350     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2351     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2352     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2353     brw_ADD( p, x0y0, x0y0, x0y1 );
2354     brw_ADD( p, x1y0, x1y0, x1y1 );
2355
2356     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    /* (tmp[ 0 ] held hashes until now; the front face below reads only
       tmp[ 2 ] and tmp[ 3 ], and the hashes are rebuilt from floors[] at
       the top of the loop on the second pass.) */
2357     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2358     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2359     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2360
2361     /* Now do the same thing for the front four gradients... */
2362     /* x component */
2363     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2364     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2365     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2366     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2367
2368     brw_push_insn_state( p );
2369     brw_set_mask_control( p, BRW_MASK_DISABLE );
2370     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2371     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2372     brw_pop_insn_state( p );
2373
    /* t still holds x - 1 from the end of the rear-face w component. */
2374     brw_MUL( p, x1y0, x1y0, t );
2375     brw_MUL( p, x1y1, x1y1, t );
2376     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); /* t = y - 1 */
2377     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2378     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2379
2380     /* y component */
2381     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2382     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2383     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2384     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2385
2386     brw_push_insn_state( p );
2387     brw_set_mask_control( p, BRW_MASK_DISABLE );
2388     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2389     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2390     brw_pop_insn_state( p );
2391
2392     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2393     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = z - 1: unlike the rear face, which multiplied by param[ 2 ]
       itself, all four front corners use the z offset minus one. */
2394     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2395     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2396     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2397
2398     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2399     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2400     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2401     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2402
2403     /* z component */
2404     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2405     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2406     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2407     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2408
2409     brw_push_insn_state( p );
2410     brw_set_mask_control( p, BRW_MASK_DISABLE );
2411     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2412     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2413     brw_pop_insn_state( p );
2414
2415     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2416     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2417     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2418     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2419     /* prepare t for the w component (used below): w the first time through
2420        the loop; w - 1 the second time) */
2421     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2422     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2423     p->current->header.predicate_inverse = 1;
2424     brw_MOV( p, t, param[ 3 ] );
2425     p->current->header.predicate_inverse = 0;
2426     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2427
2428     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2429     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2430     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2431     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2432
2433     /* w component */
2434     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2435     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2436     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2437     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2438
2439     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2440     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2441     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2442     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2443
2444     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2445     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2446     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2447     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2448
2449     /* Interpolate in the y dimension: */
2450     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2451     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2452     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2453     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2454     brw_ADD( p, x0y0, x0y0, x0y1 );
2455     brw_ADD( p, x1y0, x1y0, x1y1 );
2456
2457     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2458        time put the front face in tmp[ 1 ] and we're nearly there... */
2459     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2460     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2461     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2462
2463     /* Another interpolation, in the z dimension: */
2464     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2465     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2466     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2467
2468     /* Exit the loop if we've computed both cubes... */
    /* The branch is a predicated add to the IP register.  Its zero
       immediate is only a placeholder: the real distance is filled in by
       the brw_set_src1() call further down, once the exit address is
       known. */
2469     origin = p->nr_insn;
2470     brw_push_insn_state( p );
2471     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2472     brw_set_mask_control( p, BRW_MASK_DISABLE );
2473     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2474     brw_pop_insn_state( p );
2475
2476     /* Save the result for the w=0 case, and increment the w coordinate: */
    /* (the high words of floors[ 1 ] hold the rounded-down w, see above) */
2477     brw_MOV( p, w0, tmp[ 0 ] );
2478     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2479              brw_imm_uw( 1 ) );
2480
2481     /* Loop around for the other cube.  Explicitly set the flag register
2482        (unfortunately we must spend an extra instruction to do this: we
2483        can't rely on a side effect of the previous MOV or ADD because
2484        conditional modifiers which are normally true might be false in
2485        exceptional circumstances, e.g. given a NaN input; the add to
2486        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
    /* NOTE(review): the << 4 here and in the patch below presumably turns
       a distance counted in instructions into a byte offset (16 bytes per
       instruction) -- confirm.  loop and p->nr_insn are unsigned, so the
       backward distance relies on wrap-around before it is passed to
       brw_imm_d(). */
2487     brw_push_insn_state( p );
2488     brw_set_mask_control( p, BRW_MASK_DISABLE );
2489     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2490     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2491              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2492     brw_pop_insn_state( p );
2493
2494     /* Patch the previous conditional branch now that we know the
2495        destination address. */
2496     brw_set_src1( p->store + origin,
2497                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2498
2499     /* The very last interpolation. */
    /* In w: tmp[ 0 ] is the second-pass (w=1) result and w0 the saved
       first-pass (w=0) result, blended with interp[ 3 ]. */
2500     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2501     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2502     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2503
2504     /* scale by pow( 2, -15 ), as described above */
    /* The scaled value is written to param[ 0 ], i.e. it replaces the
       first argument. */
2505     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2506
2507     release_tmps( c, mark );
2508 }
2509
2510 static void emit_noise4( struct brw_wm_compile *c,
2511                          const struct prog_instruction *inst )
2512 {
2513     struct brw_compile *p = &c->func;
2514     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2515     GLuint mask = inst->DstReg.WriteMask;
2516     int i;
2517     int mark = mark_tmps( c );
2518
2519     assert( mark == 0 );
2520
2521     src0 = get_src_reg( c, inst, 0, 0 );
2522     src1 = get_src_reg( c, inst, 0, 1 );
2523     src2 = get_src_reg( c, inst, 0, 2 );
2524     src3 = get_src_reg( c, inst, 0, 3 );
2525
2526     param0 = alloc_tmp( c );
2527     param1 = alloc_tmp( c );
2528     param2 = alloc_tmp( c );
2529     param3 = alloc_tmp( c );
2530
2531     brw_MOV( p, param0, src0 );
2532     brw_MOV( p, param1, src1 );
2533     brw_MOV( p, param2, src2 );
2534     brw_MOV( p, param3, src3 );
2535
2536     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2537
2538     /* Fill in the result: */
2539     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2540     for (i = 0 ; i < 4; i++) {
2541         if (mask & (1<<i)) {
2542             dst = get_dst_reg(c, inst, i);
2543             brw_MOV( p, dst, param0 );
2544         }
2545     }
2546     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2547         brw_set_saturate( p, 0 );
2548
2549     release_tmps( c, mark );
2550 }
2551
2552 static void emit_wpos_xy(struct brw_wm_compile *c,
2553                          const struct prog_instruction *inst)
2554 {
2555     struct brw_compile *p = &c->func;
2556     GLuint mask = inst->DstReg.WriteMask;
2557     struct brw_reg src0[2], dst[2];
2558
2559     dst[0] = get_dst_reg(c, inst, 0);
2560     dst[1] = get_dst_reg(c, inst, 1);
2561
2562     src0[0] = get_src_reg(c, inst, 0, 0);
2563     src0[1] = get_src_reg(c, inst, 0, 1);
2564
2565     /* Calculate the pixel offset from window bottom left into destination
2566      * X and Y channels.
2567      */
2568     if (mask & WRITEMASK_X) {
2569         /* X' = X - origin_x */
2570         brw_ADD(p,
2571                 dst[0],
2572                 retype(src0[0], BRW_REGISTER_TYPE_W),
2573                 brw_imm_d(0 - c->key.origin_x));
2574     }
2575
2576     if (mask & WRITEMASK_Y) {
2577         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2578         brw_ADD(p,
2579                 dst[1],
2580                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2581                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2582     }
2583 }
2584
2585 /* TODO
2586    BIAS on SIMD8 not working yet...
2587  */
2588 static void emit_txb(struct brw_wm_compile *c,
2589                      const struct prog_instruction *inst)
2590 {
2591     struct brw_compile *p = &c->func;
2592     struct brw_reg dst[4], src[4], payload_reg;
2593     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2594     GLuint i;
2595
2596     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2597
2598     for (i = 0; i < 4; i++)
2599         dst[i] = get_dst_reg(c, inst, i);
2600     for (i = 0; i < 4; i++)
2601         src[i] = get_src_reg(c, inst, 0, i);
2602
2603     switch (inst->TexSrcTarget) {
2604         case TEXTURE_1D_INDEX:
2605             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2606             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2607             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2608             break;
2609         case TEXTURE_2D_INDEX:
2610         case TEXTURE_RECT_INDEX:
2611             brw_MOV(p, brw_message_reg(2), src[0]);
2612             brw_MOV(p, brw_message_reg(3), src[1]);
2613             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2614             break;
2615         default:
2616             brw_MOV(p, brw_message_reg(2), src[0]);
2617             brw_MOV(p, brw_message_reg(3), src[1]);
2618             brw_MOV(p, brw_message_reg(4), src[2]);
2619             break;
2620     }
2621     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2622     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2623     brw_SAMPLE(p,
2624                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2625                1,                                           /* msg_reg_nr */
2626                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2627                SURF_INDEX_TEXTURE(unit),
2628                unit,                                        /* sampler */
2629                inst->DstReg.WriteMask,                      /* writemask */
2630                BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
2631                4,                                           /* response_length */
2632                4,                                           /* msg_length */
2633                0);                                          /* eot */
2634 }
2635
2636
2637 static void emit_tex(struct brw_wm_compile *c,
2638                      const struct prog_instruction *inst)
2639 {
2640     struct brw_compile *p = &c->func;
2641     struct brw_reg dst[4], src[4], payload_reg;
2642     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2643     GLuint msg_len;
2644     GLuint i, nr;
2645     GLuint emit;
2646     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2647
2648     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2649
2650     for (i = 0; i < 4; i++)
2651         dst[i] = get_dst_reg(c, inst, i);
2652     for (i = 0; i < 4; i++)
2653         src[i] = get_src_reg(c, inst, 0, i);
2654
2655     switch (inst->TexSrcTarget) {
2656         case TEXTURE_1D_INDEX:
2657             emit = WRITEMASK_X;
2658             nr = 1;
2659             break;
2660         case TEXTURE_2D_INDEX:
2661         case TEXTURE_RECT_INDEX:
2662             emit = WRITEMASK_XY;
2663             nr = 2;
2664             break;
2665         default:
2666             emit = WRITEMASK_XYZ;
2667             nr = 3;
2668             break;
2669     }
2670     msg_len = 1;
2671
2672     /* move/load S, T, R coords */
2673     for (i = 0; i < nr; i++) {
2674         static const GLuint swz[4] = {0,1,2,2};
2675         if (emit & (1<<i))
2676             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2677         else
2678             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2679         msg_len += 1;
2680     }
2681
2682     if (shadow) {
2683        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2684        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2685     }
2686
2687     brw_SAMPLE(p,
2688                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2689                1,                                          /* msg_reg_nr */
2690                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2691                SURF_INDEX_TEXTURE(unit),
2692                unit,                                       /* sampler */
2693                inst->DstReg.WriteMask,                     /* writemask */
2694                BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
2695                4,                                          /* response_length */
2696                shadow ? 6 : 4,                             /* msg_length */
2697                0);                                         /* eot */
2698
2699     if (shadow)
2700         brw_MOV(p, dst[3], brw_imm_f(1.0));
2701 }
2702
2703
2704 /**
2705  * Resolve subroutine calls after code emit is done.
2706  */
2707 static void post_wm_emit( struct brw_wm_compile *c )
2708 {
2709     brw_resolve_cals(&c->func);
2710 }
2711
2712 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2713 {
2714 #define MAX_IFSN 32
2715 #define MAX_LOOP_DEPTH 32
2716     struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2717     struct brw_instruction *inst0, *inst1;
2718     int i, if_insn = 0, loop_insn = 0;
2719     struct brw_compile *p = &c->func;
2720     struct brw_indirect stack_index = brw_indirect(0, 0);
2721
2722     prealloc_reg(c);
2723     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2724     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2725
2726     for (i = 0; i < c->nr_fp_insns; i++) {
2727         const struct prog_instruction *inst = &c->prog_instructions[i];
2728
2729         c->cur_inst = i;
2730
2731 #if 0
2732         _mesa_printf("Inst %d: ", i);
2733         _mesa_print_instruction(inst);
2734 #endif
2735
2736         /* fetch any constants that this instruction needs */
2737         if (c->fp->use_const_buffer)
2738            fetch_constants(c, inst);
2739
2740         if (inst->CondUpdate)
2741             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2742         else
2743             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2744
2745         switch (inst->Opcode) {
2746             case WM_PIXELXY:
2747                 emit_pixel_xy(c, inst);
2748                 break;
2749             case WM_DELTAXY:
2750                 emit_delta_xy(c, inst);
2751                 break;
2752             case WM_PIXELW:
2753                 emit_pixel_w(c, inst);
2754                 break;
2755             case WM_LINTERP:
2756                 emit_linterp(c, inst);
2757                 break;
2758             case WM_PINTERP:
2759                 emit_pinterp(c, inst);
2760                 break;
2761             case WM_CINTERP:
2762                 emit_cinterp(c, inst);
2763                 break;
2764             case WM_WPOSXY:
2765                 emit_wpos_xy(c, inst);
2766                 break;
2767             case WM_FB_WRITE:
2768                 emit_fb_write(c, inst);
2769                 break;
2770             case WM_FRONTFACING:
2771                 emit_frontfacing(c, inst);
2772                 break;
2773             case OPCODE_ABS:
2774                 emit_abs(c, inst);
2775                 break;
2776             case OPCODE_ADD:
2777                 emit_add(c, inst);
2778                 break;
2779             case OPCODE_ARL:
2780                 emit_arl(c, inst);
2781                 break;
2782             case OPCODE_SUB:
2783                 emit_sub(c, inst);
2784                 break;
2785             case OPCODE_FRC:
2786                 emit_frc(c, inst);
2787                 break;
2788             case OPCODE_FLR:
2789                 emit_flr(c, inst);
2790                 break;
2791             case OPCODE_LRP:
2792                 emit_lrp(c, inst);
2793                 break;
2794             case OPCODE_TRUNC:
2795                 emit_trunc(c, inst);
2796                 break;
2797             case OPCODE_MOV:
2798                 emit_mov(c, inst);
2799                 break;
2800             case OPCODE_DP3:
2801                 emit_dp3(c, inst);
2802                 break;
2803             case OPCODE_DP4:
2804                 emit_dp4(c, inst);
2805                 break;
2806             case OPCODE_XPD:
2807                 emit_xpd(c, inst);
2808                 break;
2809             case OPCODE_DPH:
2810                 emit_dph(c, inst);
2811                 break;
2812             case OPCODE_RCP:
2813                 emit_rcp(c, inst);
2814                 break;
2815             case OPCODE_RSQ:
2816                 emit_rsq(c, inst);
2817                 break;
2818             case OPCODE_SIN:
2819                 emit_sin(c, inst);
2820                 break;
2821             case OPCODE_COS:
2822                 emit_cos(c, inst);
2823                 break;
2824             case OPCODE_EX2:
2825                 emit_ex2(c, inst);
2826                 break;
2827             case OPCODE_LG2:
2828                 emit_lg2(c, inst);
2829                 break;
2830             case OPCODE_MIN:
2831             case OPCODE_MAX:
2832                 emit_min_max(c, inst);
2833                 break;
2834             case OPCODE_DDX:
2835                 emit_ddx(c, inst);
2836                 break;
2837             case OPCODE_DDY:
2838                 emit_ddy(c, inst);
2839                 break;
2840             case OPCODE_SLT:
2841                 emit_slt(c, inst);
2842                 break;
2843             case OPCODE_SLE:
2844                 emit_sle(c, inst);
2845                 break;
2846             case OPCODE_SGT:
2847                 emit_sgt(c, inst);
2848                 break;
2849             case OPCODE_SGE:
2850                 emit_sge(c, inst);
2851                 break;
2852             case OPCODE_SEQ:
2853                 emit_seq(c, inst);
2854                 break;
2855             case OPCODE_SNE:
2856                 emit_sne(c, inst);
2857                 break;
2858             case OPCODE_MUL:
2859                 emit_mul(c, inst);
2860                 break;
2861             case OPCODE_POW:
2862                 emit_pow(c, inst);
2863                 break;
2864             case OPCODE_MAD:
2865                 emit_mad(c, inst);
2866                 break;
2867             case OPCODE_NOISE1:
2868                 emit_noise1(c, inst);
2869                 break;
2870             case OPCODE_NOISE2:
2871                 emit_noise2(c, inst);
2872                 break;
2873             case OPCODE_NOISE3:
2874                 emit_noise3(c, inst);
2875                 break;
2876             case OPCODE_NOISE4:
2877                 emit_noise4(c, inst);
2878                 break;
2879             case OPCODE_TEX:
2880                 emit_tex(c, inst);
2881                 break;
2882             case OPCODE_TXB:
2883                 emit_txb(c, inst);
2884                 break;
2885             case OPCODE_KIL_NV:
2886                 emit_kil(c);
2887                 break;
2888             case OPCODE_IF:
2889                 assert(if_insn < MAX_IFSN);
2890                 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2891                 break;
2892             case OPCODE_ELSE:
2893                 if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
2894                 break;
2895             case OPCODE_ENDIF:
2896                 assert(if_insn > 0);
2897                 brw_ENDIF(p, if_inst[--if_insn]);
2898                 break;
2899             case OPCODE_BGNSUB:
2900                 brw_save_label(p, inst->Comment, p->nr_insn);
2901                 break;
2902             case OPCODE_ENDSUB:
2903                 /* no-op */
2904                 break;
2905             case OPCODE_CAL:
2906                 brw_push_insn_state(p);
2907                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2908                 brw_set_access_mode(p, BRW_ALIGN_1);
2909                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2910                 brw_set_access_mode(p, BRW_ALIGN_16);
2911                 brw_ADD(p, get_addr_reg(stack_index),
2912                          get_addr_reg(stack_index), brw_imm_d(4));
2913                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2914                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2915                 brw_pop_insn_state(p);
2916                 break;
2917
2918             case OPCODE_RET:
2919                 brw_push_insn_state(p);
2920                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2921                 brw_ADD(p, get_addr_reg(stack_index),
2922                         get_addr_reg(stack_index), brw_imm_d(-4));
2923                 brw_set_access_mode(p, BRW_ALIGN_1);
2924                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2925                 brw_set_access_mode(p, BRW_ALIGN_16);
2926                 brw_pop_insn_state(p);
2927
2928                 break;
2929             case OPCODE_BGNLOOP:
2930                 /* XXX may need to invalidate the current_constant regs */
2931                 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2932                 break;
2933             case OPCODE_BRK:
2934                 brw_BREAK(p);
2935                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2936                 break;
2937             case OPCODE_CONT:
2938                 brw_CONT(p);
2939                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2940                 break;
2941             case OPCODE_ENDLOOP:
2942                 loop_insn--;
2943                 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2944                 /* patch all the BREAK instructions from
2945                    last BEGINLOOP */
2946                 while (inst0 > loop_inst[loop_insn]) {
2947                     inst0--;
2948                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2949                         inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2950                         inst0->bits3.if_else.pop_count = 0;
2951                     } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2952                         inst0->bits3.if_else.jump_count = inst1 - inst0;
2953                         inst0->bits3.if_else.pop_count = 0;
2954                     }
2955                 }
2956                 break;
2957             default:
2958                 _mesa_printf("unsupported IR in fragment shader %d\n",
2959                         inst->Opcode);
2960         }
2961
2962         if (inst->CondUpdate)
2963             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2964         else
2965             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2966     }
2967     post_wm_emit(c);
2968 }
2969
2970
2971 /**
2972  * Do GPU code generation for shaders that use GLSL features such as
2973  * flow control.  Other shaders will be compiled with the
2974  */
2975 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2976 {
2977     if (INTEL_DEBUG & DEBUG_WM) {
2978         _mesa_printf("brw_wm_glsl_emit:\n");
2979     }
2980
2981     /* initial instruction translation/simplification */
2982     brw_wm_pass_fp(c);
2983
2984     /* actual code generation */
2985     brw_wm_emit_glsl(brw, c);
2986
2987     if (INTEL_DEBUG & DEBUG_WM) {
2988         brw_wm_print_program(c, "brw_wm_glsl_emit done");
2989     }
2990
2991     c->prog_data.total_grf = num_grf_used(c);
2992     c->prog_data.total_scratch = 0;
2993 }