src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
/* Forward declaration: get_dst_reg() is defined further down in this file
 * but is already called from prealloc_reg().
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint component);
  16
  17 /**
  18  * Determine if the given fragment program uses GLSL features such
  19  * as flow conditionals, loops, subroutines.
  20  * Some GLSL shaders may use these features, others might not.
  21  */
  22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  23 {
  24     int i;
  25
  26     for (i = 0; i < fp->Base.NumInstructions; i++) {
  27         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  28         switch (inst->Opcode) {
  29             case OPCODE_ARL:
  30             case OPCODE_IF:
  31             case OPCODE_ENDIF:
  32             case OPCODE_CAL:
  33             case OPCODE_BRK:
  34             case OPCODE_RET:
  35             case OPCODE_NOISE1:
  36             case OPCODE_NOISE2:
  37             case OPCODE_NOISE3:
  38             case OPCODE_NOISE4:
  39             case OPCODE_BGNLOOP:
  40                 return GL_TRUE;
  41             default:
  42                 break;
  43         }
  44     }
  45     return GL_FALSE;
  46 }
  47
  48
  49
/* Forward declaration: alloc_grf() calls reclaim_temps() when it runs out
 * of free registers, but the definition appears later in this file.
 */
static void
reclaim_temps(struct brw_wm_compile *c);
  52
  53
  54 /** Mark GRF register as used. */
  55 static void
  56 prealloc_grf(struct brw_wm_compile *c, int r)
  57 {
  58    c->used_grf[r] = GL_TRUE;
  59 }
  60
  61
  62 /** Mark given GRF register as not in use. */
  63 static void
  64 release_grf(struct brw_wm_compile *c, int r)
  65 {
  66    /*assert(c->used_grf[r]);*/
  67    c->used_grf[r] = GL_FALSE;
  68    c->first_free_grf = MIN2(c->first_free_grf, r);
  69 }
  70
  71
  72 /** Return index of a free GRF, mark it as used. */
  73 static int
  74 alloc_grf(struct brw_wm_compile *c)
  75 {
  76    GLuint r;
  77    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  78       if (!c->used_grf[r]) {
  79          c->used_grf[r] = GL_TRUE;
  80          c->first_free_grf = r + 1;  /* a guess */
  81          return r;
  82       }
  83    }
  84
  85    /* no free temps, try to reclaim some */
  86    reclaim_temps(c);
  87    c->first_free_grf = 0;
  88
  89    /* try alloc again */
  90    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  91       if (!c->used_grf[r]) {
  92          c->used_grf[r] = GL_TRUE;
  93          c->first_free_grf = r + 1;  /* a guess */
  94          return r;
  95       }
  96    }
  97
  98    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
  99       assert(c->used_grf[r]);
 100    }
 101
 102    /* really, no free GRF regs found */
 103    if (!c->out_of_regs) {
 104       /* print warning once per compilation */
 105       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 106       c->out_of_regs = GL_TRUE;
 107    }
 108
 109    return -1;
 110 }
 111
 112
 113 /** Return number of GRF registers used */
 114 static int
 115 num_grf_used(const struct brw_wm_compile *c)
 116 {
 117    int r;
 118    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 119       if (c->used_grf[r])
 120          return r + 1;
 121    return 0;
 122 }
 123
 124
 125
 126 /**
 127  * Record the mapping of a Mesa register to a hardware register.
 128  */
 129 static void set_reg(struct brw_wm_compile *c, int file, int index,
 130         int component, struct brw_reg reg)
 131 {
 132     c->wm_regs[file][index][component].reg = reg;
 133     c->wm_regs[file][index][component].inited = GL_TRUE;
 134 }
 135
 136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 137 {
 138     struct brw_reg reg;
 139
 140     /* if we need to allocate another temp, grow the tmp_regs[] array */
 141     if (c->tmp_index == c->tmp_max) {
 142        int r = alloc_grf(c);
 143        if (r < 0) {
 144           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 145           r = 50; /* XXX random register! */
 146        }
 147        c->tmp_regs[ c->tmp_max++ ] = r;
 148     }
 149
 150     /* form the GRF register */
 151     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 152     /*printf("alloc_temp %d\n", reg.nr);*/
 153     assert(reg.nr < BRW_WM_MAX_GRF);
 154     return reg;
 155
 156 }
 157
 158 /**
 159  * Save current temp register info.
 160  * There must be a matching call to release_tmps().
 161  */
 162 static int mark_tmps(struct brw_wm_compile *c)
 163 {
 164     return c->tmp_index;
 165 }
 166
 167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 168 {
 169     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 170 }
 171
 172 static void release_tmps(struct brw_wm_compile *c, int mark)
 173 {
 174     c->tmp_index = mark;
 175 }
 176
 177 /**
 178  * Convert Mesa src register to brw register.
 179  *
 180  * Since we're running in SOA mode each Mesa register corresponds to four
 181  * hardware registers.  We allocate the hardware registers as needed here.
 182  *
 183  * \param file  register file, one of PROGRAM_x
 184  * \param index  register number
 185  * \param component  src component (X=0, Y=1, Z=2, W=3)
 186  * \param nr  not used?!?
 187  * \param neg  negate value?
 188  * \param abs  take absolute value?
 189  */
 190 static struct brw_reg
 191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 192         int nr, GLuint neg, GLuint abs)
 193 {
 194     struct brw_reg reg;
 195     switch (file) {
 196         case PROGRAM_STATE_VAR:
 197         case PROGRAM_CONSTANT:
 198         case PROGRAM_UNIFORM:
 199             file = PROGRAM_STATE_VAR;
 200             break;
 201         case PROGRAM_UNDEFINED:
 202             return brw_null_reg();
 203         case PROGRAM_TEMPORARY:
 204         case PROGRAM_INPUT:
 205         case PROGRAM_OUTPUT:
 206         case PROGRAM_PAYLOAD:
 207             break;
 208         default:
 209             _mesa_problem(NULL, "Unexpected file in get_reg()");
 210             return brw_null_reg();
 211     }
 212
 213     assert(index < 256);
 214     assert(component < 4);
 215
 216     /* see if we've already allocated a HW register for this Mesa register */
 217     if (c->wm_regs[file][index][component].inited) {
 218        /* yes, re-use */
 219        reg = c->wm_regs[file][index][component].reg;
 220     }
 221     else {
 222         /* no, allocate new register */
 223        int grf = alloc_grf(c);
 224        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 225        if (grf < 0) {
 226           /* totally out of temps */
 227           grf = 51; /* XXX random register! */
 228        }
 229
 230        reg = brw_vec8_grf(grf, 0);
 231        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 232
 233        set_reg(c, file, index, component, reg);
 234     }
 235
 236     if (neg & (1 << component)) {
 237         reg = negate(reg);
 238     }
 239     if (abs)
 240         reg = brw_abs(reg);
 241     return reg;
 242 }
 243
 244
 245
 246 /**
 247  * This is called if we run out of GRF registers.  Examine the live intervals
 248  * of temp regs in the program and free those which won't be used again.
 249  */
 250 static void
 251 reclaim_temps(struct brw_wm_compile *c)
 252 {
 253    GLint intBegin[MAX_PROGRAM_TEMPS];
 254    GLint intEnd[MAX_PROGRAM_TEMPS];
 255    int index;
 256
 257    /*printf("Reclaim temps:\n");*/
 258
 259    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 260                              intBegin, intEnd);
 261
 262    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 263       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 264          /* program temp[i] can be freed */
 265          int component;
 266          /*printf("  temp[%d] is dead\n", index);*/
 267          for (component = 0; component < 4; component++) {
 268             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 269                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 270                release_grf(c, r);
 271                /*
 272                printf("  Reclaim temp %d, reg %d at inst %d\n",
 273                       index, r, c->cur_inst);
 274                */
 275                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 276             }
 277          }
 278       }
 279    }
 280 }
 281
 282
 283
 284
 285 /**
 286  * Preallocate registers.  This sets up the Mesa to hardware register
 287  * mapping for certain registers, such as constants (uniforms/state vars)
 288  * and shader inputs.
 289  */
 290 static void prealloc_reg(struct brw_wm_compile *c)
 291 {
 292     int i, j;
 293     struct brw_reg reg;
 294     int urb_read_length = 0;
 295     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
 296     GLuint reg_index = 0;
 297
 298     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 299     c->first_free_grf = 0;
 300
 301     for (i = 0; i < 4; i++) {
 302         if (i < c->key.nr_depth_regs)
 303             reg = brw_vec8_grf(i * 2, 0);
 304         else
 305             reg = brw_vec8_grf(0, 0);
 306         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 307     }
 308     reg_index += 2 * c->key.nr_depth_regs;
 309
 310     /* constants */
 311     {
 312         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 313         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 314
 315         /* use a real constant buffer, or just use a section of the GRF? */
 316         /* XXX this heuristic may need adjustment... */
 317         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 318            c->fp->use_const_buffer = GL_TRUE;
 319         else
 320            c->fp->use_const_buffer = GL_FALSE;
 321         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 322
 323         if (c->fp->use_const_buffer) {
 324            /* We'll use a real constant buffer and fetch constants from
 325             * it with a dataport read message.
 326             */
 327
 328            /* number of float constants in CURBE */
 329            c->prog_data.nr_params = 0;
 330         }
 331         else {
 332            const struct gl_program_parameter_list *plist =
 333               c->fp->program.Base.Parameters;
 334            int index = 0;
 335
 336            /* number of float constants in CURBE */
 337            c->prog_data.nr_params = 4 * nr_params;
 338
 339            /* loop over program constants (float[4]) */
 340            for (i = 0; i < nr_params; i++) {
 341               /* loop over XYZW channels */
 342               for (j = 0; j < 4; j++, index++) {
 343                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 344                  /* Save pointer to parameter/constant value.
 345                   * Constants will be copied in prepare_constant_buffer()
 346                   */
 347                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 348                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 349               }
 350            }
 351            /* number of constant regs used (each reg is float[8]) */
 352            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 353            reg_index += c->nr_creg;
 354         }
 355     }
 356
 357     /* fragment shader inputs */
 358     for (i = 0; i < VERT_RESULT_MAX; i++) {
 359        int fp_input;
 360
 361        if (i >= VERT_RESULT_VAR0)
 362           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 363        else if (i <= VERT_RESULT_TEX7)
 364           fp_input = i;
 365        else
 366           fp_input = -1;
 367
 368        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 369           urb_read_length = reg_index;
 370           reg = brw_vec8_grf(reg_index, 0);
 371           for (j = 0; j < 4; j++)
 372              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 373        }
 374        if (c->key.vp_outputs_written & (1 << i)) {
 375           reg_index += 2;
 376        }
 377     }
 378
 379     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 380     c->prog_data.urb_read_length = urb_read_length;
 381     c->prog_data.curb_read_length = c->nr_creg;
 382     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 383     reg_index++;
 384     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 385     reg_index += 2;
 386
 387     /* mark GRF regs [0..reg_index-1] as in-use */
 388     for (i = 0; i < reg_index; i++)
 389        prealloc_grf(c, i);
 390
 391     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 392     prealloc_grf(c, 126);
 393     prealloc_grf(c, 127);
 394
 395     for (i = 0; i < c->nr_fp_insns; i++) {
 396         const struct prog_instruction *inst = &c->prog_instructions[i];
 397         struct brw_reg dst[4];
 398
 399         switch (inst->Opcode) {
 400         case OPCODE_TEX:
 401         case OPCODE_TXB:
 402             /* Allocate the channels of texture results contiguously,
 403              * since they are written out that way by the sampler unit.
 404              */
 405             for (j = 0; j < 4; j++) {
 406                 dst[j] = get_dst_reg(c, inst, j);
 407                 if (j != 0)
 408                     assert(dst[j].nr == dst[j - 1].nr + 1);
 409             }
 410             break;
 411         default:
 412             break;
 413         }
 414     }
 415
 416     /* An instruction may reference up to three constants.
 417      * They'll be found in these registers.
 418      * XXX alloc these on demand!
 419      */
 420     if (c->fp->use_const_buffer) {
 421        for (i = 0; i < 3; i++) {
 422           c->current_const[i].index = -1;
 423           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 424        }
 425     }
 426 #if 0
 427     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 428     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 429 #endif
 430 }
 431
 432
 433 /**
 434  * Check if any of the instruction's src registers are constants, uniforms,
 435  * or statevars.  If so, fetch any constants that we don't already have in
 436  * the three GRF slots.
 437  */
 438 static void fetch_constants(struct brw_wm_compile *c,
 439                             const struct prog_instruction *inst)
 440 {
 441    struct brw_compile *p = &c->func;
 442    GLuint i;
 443
 444    /* loop over instruction src regs */
 445    for (i = 0; i < 3; i++) {
 446       const struct prog_src_register *src = &inst->SrcReg[i];
 447       if (src->File == PROGRAM_STATE_VAR ||
 448           src->File == PROGRAM_CONSTANT ||
 449           src->File == PROGRAM_UNIFORM) {
 450          c->current_const[i].index = src->Index;
 451
 452 #if 0
 453          printf("  fetch const[%d] for arg %d into reg %d\n",
 454                 src->Index, i, c->current_const[i].reg.nr);
 455 #endif
 456
 457          /* need to fetch the constant now */
 458          brw_dp_READ_4(p,
 459                        c->current_const[i].reg,  /* writeback dest */
 460                        src->RelAddr,             /* relative indexing? */
 461                        16 * src->Index,          /* byte offset */
 462                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 463                        );
 464       }
 465    }
 466 }
 467
 468
 469 /**
 470  * Convert Mesa dst register to brw register.
 471  */
 472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 473                                   const struct prog_instruction *inst,
 474                                   GLuint component)
 475 {
 476     const int nr = 1;
 477     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 478             0, 0);
 479 }
 480
 481
 482 static struct brw_reg
 483 get_src_reg_const(struct brw_wm_compile *c,
 484                   const struct prog_instruction *inst,
 485                   GLuint srcRegIndex, GLuint component)
 486 {
 487    /* We should have already fetched the constant from the constant
 488     * buffer in fetch_constants().  Now we just have to return a
 489     * register description that extracts the needed component and
 490     * smears it across all eight vector components.
 491     */
 492    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 493    struct brw_reg const_reg;
 494
 495    assert(component < 4);
 496    assert(srcRegIndex < 3);
 497    assert(c->current_const[srcRegIndex].index != -1);
 498    const_reg = c->current_const[srcRegIndex].reg;
 499
 500    /* extract desired float from the const_reg, and smear */
 501    const_reg = stride(const_reg, 0, 1, 0);
 502    const_reg.subnr = component * 4;
 503
 504    if (src->Negate & (1 << component))
 505       const_reg = negate(const_reg);
 506    if (src->Abs)
 507       const_reg = brw_abs(const_reg);
 508
 509 #if 0
 510    printf("  form const[%d].%d for arg %d, reg %d\n",
 511           c->current_const[srcRegIndex].index,
 512           component,
 513           srcRegIndex,
 514           const_reg.nr);
 515 #endif
 516
 517    return const_reg;
 518 }
 519
 520
 521 /**
 522  * Convert Mesa src register to brw register.
 523  */
 524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 525                                   const struct prog_instruction *inst,
 526                                   GLuint srcRegIndex, GLuint channel)
 527 {
 528     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 529     const GLuint nr = 1;
 530     const GLuint component = GET_SWZ(src->Swizzle, channel);
 531
 532     /* Extended swizzle terms */
 533     if (component == SWIZZLE_ZERO) {
 534        return brw_imm_f(0.0F);
 535     }
 536     else if (component == SWIZZLE_ONE) {
 537        return brw_imm_f(1.0F);
 538     }
 539
 540     if (c->fp->use_const_buffer &&
 541         (src->File == PROGRAM_STATE_VAR ||
 542          src->File == PROGRAM_CONSTANT ||
 543          src->File == PROGRAM_UNIFORM)) {
 544        return get_src_reg_const(c, inst, srcRegIndex, component);
 545     }
 546     else {
 547        /* other type of source register */
 548        return get_reg(c, src->File, src->Index, component, nr,
 549                       src->Negate, src->Abs);
 550     }
 551 }
 552
 553
 554 /**
 555  * Same as \sa get_src_reg() but if the register is a literal, emit
 556  * a brw_reg encoding the literal.
 557  * Note that a brw instruction only allows one src operand to be a literal.
 558  * For instructions with more than one operand, only the second can be a
 559  * literal.  This means that we treat some literals as constants/uniforms
 560  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 561  *
 562  */
 563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 564                                       const struct prog_instruction *inst,
 565                                       GLuint srcRegIndex, GLuint channel)
 566 {
 567     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 568     if (src->File == PROGRAM_CONSTANT) {
 569        /* a literal */
 570        const int component = GET_SWZ(src->Swizzle, channel);
 571        const GLfloat *param =
 572           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 573        GLfloat value = param[component];
 574        if (src->Negate & (1 << channel))
 575           value = -value;
 576        if (src->Abs)
 577           value = FABSF(value);
 578 #if 0
 579        printf("  form immed value %f for chan %d\n", value, channel);
 580 #endif
 581        return brw_imm_f(value);
 582     }
 583     else {
 584        return get_src_reg(c, inst, srcRegIndex, channel);
 585     }
 586 }
 587
 588
/**
 * Subroutines are minimal support for reusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * \param c           compile state
 * \param subroutine  which subroutine to invoke; indexes c->subroutines[]
 * \param emit        callback that emits the subroutine body; only called
 *                    the first time the subroutine is invoked
 *
 * c->subroutines[subroutine] holds the instruction index of the body once
 * it has been emitted; zero means "not emitted yet".
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );

    if( c->subroutines[ subroutine ] ) {
        /* subroutine previously emitted: reuse existing instructions */

        int mark = mark_tmps( c );
        struct brw_reg return_address = retype( alloc_tmp( c ),
                                                BRW_REGISTER_TYPE_UD );
        /* index of the first instruction emitted below */
        int here = p->nr_insn;

        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        /* return address = IP + 2 instructions, i.e. just past the jump
         * below (instruction counts are scaled by << 4 to form IP offsets;
         * presumably 16 bytes per instruction -- confirm against the ISA).
         */
        brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

        /* relative jump from the second ADD to the stored subroutine start */
        brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
                 brw_imm_d( ( c->subroutines[ subroutine ] -
                              here - 1 ) << 4 ) );
        brw_pop_insn_state(p);

        release_tmps( c, mark );
    } else {
        /* previously unused subroutine: emit, and mark for later reuse */

        int mark = mark_tmps( c );
        struct brw_reg return_address = retype( alloc_tmp( c ),
                                                BRW_REGISTER_TYPE_UD );
        struct brw_instruction *calc;
        /* index of the return-address ADD emitted below */
        int base = p->nr_insn;

        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        /* placeholder offset of 0; patched below once the length of the
         * inlined body is known
         */
        calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
        brw_pop_insn_state(p);

        /* remember where the body starts so later invocations can jump here */
        c->subroutines[ subroutine ] = p->nr_insn;

        emit( c );

        /* return: jump to the address saved by the caller */
        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        brw_MOV( p, brw_ip_reg(), return_address );
        brw_pop_insn_state(p);

        /* patch the placeholder: the return address becomes the first
         * instruction after the MOV above, so this first, inline invocation
         * simply continues with the code that follows
         */
        brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

        release_tmps( c, mark );
    }
}
 652
 653 static void emit_trunc( struct brw_wm_compile *c,
 654                         const struct prog_instruction *inst)
 655 {
 656     int i;
 657     struct brw_compile *p = &c->func;
 658     GLuint mask = inst->DstReg.WriteMask;
 659     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 660     for (i = 0; i < 4; i++) {
 661         if (mask & (1<<i)) {
 662             struct brw_reg src, dst;
 663             dst = get_dst_reg(c, inst, i);
 664             src = get_src_reg(c, inst, 0, i);
 665             brw_RNDZ(p, dst, src);
 666         }
 667     }
 668     brw_set_saturate(p, 0);
 669 }
 670
 671 static void emit_mov( struct brw_wm_compile *c,
 672                       const struct prog_instruction *inst)
 673 {
 674     int i;
 675     struct brw_compile *p = &c->func;
 676     GLuint mask = inst->DstReg.WriteMask;
 677     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 678     for (i = 0; i < 4; i++) {
 679         if (mask & (1<<i)) {
 680             struct brw_reg src, dst;
 681             dst = get_dst_reg(c, inst, i);
 682             /* XXX some moves from immediate value don't work reliably!!! */
 683             /*src = get_src_reg_imm(c, inst, 0, i);*/
 684             src = get_src_reg(c, inst, 0, i);
 685             brw_MOV(p, dst, src);
 686         }
 687     }
 688     brw_set_saturate(p, 0);
 689 }
 690
 691 static void emit_pixel_xy(struct brw_wm_compile *c,
 692                           const struct prog_instruction *inst)
 693 {
 694     struct brw_reg r1 = brw_vec1_grf(1, 0);
 695     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 696
 697     struct brw_reg dst0, dst1;
 698     struct brw_compile *p = &c->func;
 699     GLuint mask = inst->DstReg.WriteMask;
 700
 701     dst0 = get_dst_reg(c, inst, 0);
 702     dst1 = get_dst_reg(c, inst, 1);
 703     /* Calculate pixel centers by adding 1 or 0 to each of the
 704      * micro-tile coordinates passed in r1.
 705      */
 706     if (mask & WRITEMASK_X) {
 707         brw_ADD(p,
 708                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 709                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 710                 brw_imm_v(0x10101010));
 711     }
 712
 713     if (mask & WRITEMASK_Y) {
 714         brw_ADD(p,
 715                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 716                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 717                 brw_imm_v(0x11001100));
 718     }
 719 }
 720
 721 static void emit_delta_xy(struct brw_wm_compile *c,
 722                           const struct prog_instruction *inst)
 723 {
 724     struct brw_reg r1 = brw_vec1_grf(1, 0);
 725     struct brw_reg dst0, dst1, src0, src1;
 726     struct brw_compile *p = &c->func;
 727     GLuint mask = inst->DstReg.WriteMask;
 728
 729     dst0 = get_dst_reg(c, inst, 0);
 730     dst1 = get_dst_reg(c, inst, 1);
 731     src0 = get_src_reg(c, inst, 0, 0);
 732     src1 = get_src_reg(c, inst, 0, 1);
 733     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 734      * centers.
 735      */
 736     if (mask & WRITEMASK_X) {
 737         brw_ADD(p,
 738                 dst0,
 739                 retype(src0, BRW_REGISTER_TYPE_UW),
 740                 negate(r1));
 741     }
 742
 743     if (mask & WRITEMASK_Y) {
 744         brw_ADD(p,
 745                 dst1,
 746                 retype(src1, BRW_REGISTER_TYPE_UW),
 747                 negate(suboffset(r1,1)));
 748
 749     }
 750 }
 751
 752 static void fire_fb_write( struct brw_wm_compile *c,
 753                            GLuint base_reg,
 754                            GLuint nr,
 755                            GLuint target,
 756                            GLuint eot)
 757 {
 758     struct brw_compile *p = &c->func;
 759     /* Pass through control information:
 760      */
 761     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 762     {
 763         brw_push_insn_state(p);
 764         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 765         brw_MOV(p,
 766                 brw_message_reg(base_reg + 1),
 767                 brw_vec8_grf(1, 0));
 768         brw_pop_insn_state(p);
 769     }
 770     /* Send framebuffer write message: */
 771     brw_fb_WRITE(p,
 772             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 773             base_reg,
 774             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 775             target,
 776             nr,
 777             0,
 778             eot);
 779 }
 780
 781 static void emit_fb_write(struct brw_wm_compile *c,
 782                           const struct prog_instruction *inst)
 783 {
 784     struct brw_compile *p = &c->func;
 785     int nr = 2;
 786     int channel;
 787     GLuint target, eot;
 788     struct brw_reg src0;
 789
 790     /* Reserve a space for AA - may not be needed:
 791      */
 792     if (c->key.aa_dest_stencil_reg)
 793         nr += 1;
 794
 795     brw_push_insn_state(p);
 796     for (channel = 0; channel < 4; channel++) {
 797         src0 = get_src_reg(c,  inst, 0, channel);
 798         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 799         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 800         brw_MOV(p, brw_message_reg(nr + channel), src0);
 801     }
 802     /* skip over the regs populated above: */
 803     nr += 8;
 804     brw_pop_insn_state(p);
 805
 806     if (c->key.source_depth_to_render_target) {
 807        if (c->key.computes_depth) {
 808           src0 = get_src_reg(c, inst, 2, 2);
 809           brw_MOV(p, brw_message_reg(nr), src0);
 810        }
 811        else {
 812           src0 = get_src_reg(c, inst, 1, 1);
 813           brw_MOV(p, brw_message_reg(nr), src0);
 814        }
 815
 816        nr += 2;
 817     }
 818
 819     if (c->key.dest_depth_reg) {
 820         const GLuint comp = c->key.dest_depth_reg / 2;
 821         const GLuint off = c->key.dest_depth_reg % 2;
 822
 823         if (off != 0) {
 824             /* XXX this code needs review/testing */
 825             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 826             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 827
 828             brw_push_insn_state(p);
 829             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 830
 831             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 832             /* 2nd half? */
 833             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 834             brw_pop_insn_state(p);
 835         }
 836         else
 837         {
 838             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 839             brw_MOV(p, brw_message_reg(nr), src);
 840         }
 841         nr += 2;
 842    }
 843
 844     target = inst->Aux >> 1;
 845     eot = inst->Aux & 1;
 846     fire_fb_write(c, 0, nr, target, eot);
 847 }
 848
 849 static void emit_pixel_w( struct brw_wm_compile *c,
 850                           const struct prog_instruction *inst)
 851 {
 852     struct brw_compile *p = &c->func;
 853     GLuint mask = inst->DstReg.WriteMask;
 854     if (mask & WRITEMASK_W) {
 855         struct brw_reg dst, src0, delta0, delta1;
 856         struct brw_reg interp3;
 857
 858         dst = get_dst_reg(c, inst, 3);
 859         src0 = get_src_reg(c, inst, 0, 0);
 860         delta0 = get_src_reg(c, inst, 1, 0);
 861         delta1 = get_src_reg(c, inst, 1, 1);
 862
 863         interp3 = brw_vec1_grf(src0.nr+1, 4);
 864         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 865          * result straight into a message reg.
 866          */
 867         brw_LINE(p, brw_null_reg(), interp3, delta0);
 868         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 869
 870         /* Calc w */
 871         brw_math_16( p, dst,
 872                 BRW_MATH_FUNCTION_INV,
 873                 BRW_MATH_SATURATE_NONE,
 874                 2, brw_null_reg(),
 875                 BRW_MATH_PRECISION_FULL);
 876     }
 877 }
 878
 879 static void emit_linterp(struct brw_wm_compile *c,
 880                          const struct prog_instruction *inst)
 881 {
 882     struct brw_compile *p = &c->func;
 883     GLuint mask = inst->DstReg.WriteMask;
 884     struct brw_reg interp[4];
 885     struct brw_reg dst, delta0, delta1;
 886     struct brw_reg src0;
 887     GLuint nr, i;
 888
 889     src0 = get_src_reg(c, inst, 0, 0);
 890     delta0 = get_src_reg(c, inst, 1, 0);
 891     delta1 = get_src_reg(c, inst, 1, 1);
 892     nr = src0.nr;
 893
 894     interp[0] = brw_vec1_grf(nr, 0);
 895     interp[1] = brw_vec1_grf(nr, 4);
 896     interp[2] = brw_vec1_grf(nr+1, 0);
 897     interp[3] = brw_vec1_grf(nr+1, 4);
 898
 899     for(i = 0; i < 4; i++ ) {
 900         if (mask & (1<<i)) {
 901             dst = get_dst_reg(c, inst, i);
 902             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 903             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 904         }
 905     }
 906 }
 907
 908 static void emit_cinterp(struct brw_wm_compile *c,
 909                          const struct prog_instruction *inst)
 910 {
 911     struct brw_compile *p = &c->func;
 912     GLuint mask = inst->DstReg.WriteMask;
 913
 914     struct brw_reg interp[4];
 915     struct brw_reg dst, src0;
 916     GLuint nr, i;
 917
 918     src0 = get_src_reg(c, inst, 0, 0);
 919     nr = src0.nr;
 920
 921     interp[0] = brw_vec1_grf(nr, 0);
 922     interp[1] = brw_vec1_grf(nr, 4);
 923     interp[2] = brw_vec1_grf(nr+1, 0);
 924     interp[3] = brw_vec1_grf(nr+1, 4);
 925
 926     for(i = 0; i < 4; i++ ) {
 927         if (mask & (1<<i)) {
 928             dst = get_dst_reg(c, inst, i);
 929             brw_MOV(p, dst, suboffset(interp[i],3));
 930         }
 931     }
 932 }
 933
 934 static void emit_pinterp(struct brw_wm_compile *c,
 935                          const struct prog_instruction *inst)
 936 {
 937     struct brw_compile *p = &c->func;
 938     GLuint mask = inst->DstReg.WriteMask;
 939
 940     struct brw_reg interp[4];
 941     struct brw_reg dst, delta0, delta1;
 942     struct brw_reg src0, w;
 943     GLuint nr, i;
 944
 945     src0 = get_src_reg(c, inst, 0, 0);
 946     delta0 = get_src_reg(c, inst, 1, 0);
 947     delta1 = get_src_reg(c, inst, 1, 1);
 948     w = get_src_reg(c, inst, 2, 3);
 949     nr = src0.nr;
 950
 951     interp[0] = brw_vec1_grf(nr, 0);
 952     interp[1] = brw_vec1_grf(nr, 4);
 953     interp[2] = brw_vec1_grf(nr+1, 0);
 954     interp[3] = brw_vec1_grf(nr+1, 4);
 955
 956     for(i = 0; i < 4; i++ ) {
 957         if (mask & (1<<i)) {
 958             dst = get_dst_reg(c, inst, i);
 959             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 960             brw_MAC(p, dst, suboffset(interp[i],1),
 961                     delta1);
 962             brw_MUL(p, dst, dst, w);
 963         }
 964     }
 965 }
 966
 967 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 968 static void emit_frontfacing(struct brw_wm_compile *c,
 969                              const struct prog_instruction *inst)
 970 {
 971     struct brw_compile *p = &c->func;
 972     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 973     struct brw_reg dst;
 974     GLuint mask = inst->DstReg.WriteMask;
 975     int i;
 976
 977     for (i = 0; i < 4; i++) {
 978         if (mask & (1<<i)) {
 979             dst = get_dst_reg(c, inst, i);
 980             brw_MOV(p, dst, brw_imm_f(0.0));
 981         }
 982     }
 983
 984     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 985      * us front face
 986      */
 987     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 988     for (i = 0; i < 4; i++) {
 989         if (mask & (1<<i)) {
 990             dst = get_dst_reg(c, inst, i);
 991             brw_MOV(p, dst, brw_imm_f(1.0));
 992         }
 993     }
 994     brw_set_predicate_control_flag_value(p, 0xff);
 995 }
 996
 997 static void emit_xpd(struct brw_wm_compile *c,
 998                      const struct prog_instruction *inst)
 999 {
1000     int i;
1001     struct brw_compile *p = &c->func;
1002     GLuint mask = inst->DstReg.WriteMask;
1003     for (i = 0; i < 4; i++) {
1004         GLuint i2 = (i+2)%3;
1005         GLuint i1 = (i+1)%3;
1006         if (mask & (1<<i)) {
1007             struct brw_reg src0, src1, dst;
1008             dst = get_dst_reg(c, inst, i);
1009             src0 = negate(get_src_reg(c, inst, 0, i2));
1010             src1 = get_src_reg_imm(c, inst, 1, i1);
1011             brw_MUL(p, brw_null_reg(), src0, src1);
1012             src0 = get_src_reg(c, inst, 0, i1);
1013             src1 = get_src_reg_imm(c, inst, 1, i2);
1014             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1015             brw_MAC(p, dst, src0, src1);
1016             brw_set_saturate(p, 0);
1017         }
1018     }
1019     brw_set_saturate(p, 0);
1020 }
1021
1022 static void emit_dp3(struct brw_wm_compile *c,
1023                      const struct prog_instruction *inst)
1024 {
1025     struct brw_reg src0[3], src1[3], dst;
1026     int i;
1027     struct brw_compile *p = &c->func;
1028     GLuint mask = inst->DstReg.WriteMask;
1029     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1030
1031     if (!(mask & WRITEMASK_XYZW))
1032         return;
1033
1034     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1035
1036     for (i = 0; i < 3; i++) {
1037         src0[i] = get_src_reg(c, inst, 0, i);
1038         src1[i] = get_src_reg_imm(c, inst, 1, i);
1039     }
1040
1041     dst = get_dst_reg(c, inst, dst_chan);
1042     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1043     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1044     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1045     brw_MAC(p, dst, src0[2], src1[2]);
1046     brw_set_saturate(p, 0);
1047 }
1048
1049 static void emit_dp4(struct brw_wm_compile *c,
1050                      const struct prog_instruction *inst)
1051 {
1052     struct brw_reg src0[4], src1[4], dst;
1053     int i;
1054     struct brw_compile *p = &c->func;
1055     GLuint mask = inst->DstReg.WriteMask;
1056     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1057
1058     if (!(mask & WRITEMASK_XYZW))
1059         return;
1060
1061     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1062
1063     for (i = 0; i < 4; i++) {
1064         src0[i] = get_src_reg(c, inst, 0, i);
1065         src1[i] = get_src_reg_imm(c, inst, 1, i);
1066     }
1067     dst = get_dst_reg(c, inst, dst_chan);
1068     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1069     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1070     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1071     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1072     brw_MAC(p, dst, src0[3], src1[3]);
1073     brw_set_saturate(p, 0);
1074 }
1075
1076 static void emit_dph(struct brw_wm_compile *c,
1077                      const struct prog_instruction *inst)
1078 {
1079     struct brw_reg src0[4], src1[4], dst;
1080     int i;
1081     struct brw_compile *p = &c->func;
1082     GLuint mask = inst->DstReg.WriteMask;
1083     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1084
1085     if (!(mask & WRITEMASK_XYZW))
1086         return;
1087
1088     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1089
1090     for (i = 0; i < 4; i++) {
1091         src0[i] = get_src_reg(c, inst, 0, i);
1092         src1[i] = get_src_reg_imm(c, inst, 1, i);
1093     }
1094     dst = get_dst_reg(c, inst, dst_chan);
1095     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1096     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1097     brw_MAC(p, dst, src0[2], src1[2]);
1098     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1099     brw_ADD(p, dst, dst, src1[3]);
1100     brw_set_saturate(p, 0);
1101 }
1102
1103 /**
1104  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1105  * Note that the result of the function is smeared across the dest
1106  * register's X, Y, Z and W channels (subject to writemasking of course).
1107  */
1108 static void emit_math1(struct brw_wm_compile *c,
1109                        const struct prog_instruction *inst, GLuint func)
1110 {
1111     struct brw_compile *p = &c->func;
1112     struct brw_reg src0, dst;
1113     GLuint mask = inst->DstReg.WriteMask;
1114     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1115
1116     if (!(mask & WRITEMASK_XYZW))
1117         return;
1118
1119     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1120
1121     /* Get first component of source register */
1122     dst = get_dst_reg(c, inst, dst_chan);
1123     src0 = get_src_reg(c, inst, 0, 0);
1124
1125     brw_MOV(p, brw_message_reg(2), src0);
1126     brw_math(p,
1127              dst,
1128              func,
1129              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1130              2,
1131              brw_null_reg(),
1132              BRW_MATH_DATA_VECTOR,
1133              BRW_MATH_PRECISION_FULL);
1134 }
1135
1136 static void emit_rcp(struct brw_wm_compile *c,
1137                      const struct prog_instruction *inst)
1138 {
1139     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1140 }
1141
1142 static void emit_rsq(struct brw_wm_compile *c,
1143                      const struct prog_instruction *inst)
1144 {
1145     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1146 }
1147
1148 static void emit_sin(struct brw_wm_compile *c,
1149                      const struct prog_instruction *inst)
1150 {
1151     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1152 }
1153
1154 static void emit_cos(struct brw_wm_compile *c,
1155                      const struct prog_instruction *inst)
1156 {
1157     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1158 }
1159
1160 static void emit_ex2(struct brw_wm_compile *c,
1161                      const struct prog_instruction *inst)
1162 {
1163     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1164 }
1165
1166 static void emit_lg2(struct brw_wm_compile *c,
1167                      const struct prog_instruction *inst)
1168 {
1169     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1170 }
1171
1172 static void emit_add(struct brw_wm_compile *c,
1173                      const struct prog_instruction *inst)
1174 {
1175     struct brw_compile *p = &c->func;
1176     struct brw_reg src0, src1, dst;
1177     GLuint mask = inst->DstReg.WriteMask;
1178     int i;
1179     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1180     for (i = 0 ; i < 4; i++) {
1181         if (mask & (1<<i)) {
1182             dst = get_dst_reg(c, inst, i);
1183             src0 = get_src_reg(c, inst, 0, i);
1184             src1 = get_src_reg_imm(c, inst, 1, i);
1185             brw_ADD(p, dst, src0, src1);
1186         }
1187     }
1188     brw_set_saturate(p, 0);
1189 }
1190
1191 static void emit_arl(struct brw_wm_compile *c,
1192                      const struct prog_instruction *inst)
1193 {
1194     struct brw_compile *p = &c->func;
1195     struct brw_reg src0, addr_reg;
1196     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1197     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1198                            BRW_ARF_ADDRESS, 0);
1199     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1200     brw_MOV(p, addr_reg, src0);
1201     brw_set_saturate(p, 0);
1202 }
1203
1204
1205 static void emit_mul(struct brw_wm_compile *c,
1206                      const struct prog_instruction *inst)
1207 {
1208     struct brw_compile *p = &c->func;
1209     struct brw_reg src0, src1, dst;
1210     GLuint mask = inst->DstReg.WriteMask;
1211     int i;
1212     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1213     for (i = 0 ; i < 4; i++) {
1214         if (mask & (1<<i)) {
1215             dst = get_dst_reg(c, inst, i);
1216             src0 = get_src_reg(c, inst, 0, i);
1217             src1 = get_src_reg_imm(c, inst, 1, i);
1218             brw_MUL(p, dst, src0, src1);
1219         }
1220     }
1221     brw_set_saturate(p, 0);
1222 }
1223
1224 static void emit_frc(struct brw_wm_compile *c,
1225                      const struct prog_instruction *inst)
1226 {
1227     struct brw_compile *p = &c->func;
1228     struct brw_reg src0, dst;
1229     GLuint mask = inst->DstReg.WriteMask;
1230     int i;
1231     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1232     for (i = 0 ; i < 4; i++) {
1233         if (mask & (1<<i)) {
1234             dst = get_dst_reg(c, inst, i);
1235             src0 = get_src_reg_imm(c, inst, 0, i);
1236             brw_FRC(p, dst, src0);
1237         }
1238     }
1239     if (inst->SaturateMode != SATURATE_OFF)
1240         brw_set_saturate(p, 0);
1241 }
1242
1243 static void emit_flr(struct brw_wm_compile *c,
1244                      const struct prog_instruction *inst)
1245 {
1246     struct brw_compile *p = &c->func;
1247     struct brw_reg src0, dst;
1248     GLuint mask = inst->DstReg.WriteMask;
1249     int i;
1250     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1251     for (i = 0 ; i < 4; i++) {
1252         if (mask & (1<<i)) {
1253             dst = get_dst_reg(c, inst, i);
1254             src0 = get_src_reg_imm(c, inst, 0, i);
1255             brw_RNDD(p, dst, src0);
1256         }
1257     }
1258     brw_set_saturate(p, 0);
1259 }
1260
1261
1262 static void emit_min_max(struct brw_wm_compile *c,
1263                          const struct prog_instruction *inst)
1264 {
1265     struct brw_compile *p = &c->func;
1266     const GLuint mask = inst->DstReg.WriteMask;
1267     const int mark = mark_tmps(c);
1268     int i;
1269     brw_push_insn_state(p);
1270     for (i = 0; i < 4; i++) {
1271         if (mask & (1<<i)) {
1272             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1273             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1274             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1275             struct brw_reg dst;
1276             /* if dst==src0 or dst==src1 we need to use a temp reg */
1277             GLboolean use_temp = brw_same_reg(dst, src0) ||
1278                                  brw_same_reg(dst, src1);
1279             if (use_temp)
1280                dst = alloc_tmp(c);
1281             else
1282                dst = real_dst;
1283
1284             /*
1285             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1286                    dst.nr, src0.nr, src1.nr);
1287             */
1288             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1289             brw_MOV(p, dst, src0);
1290             brw_set_saturate(p, 0);
1291
1292             if (inst->Opcode == OPCODE_MIN)
1293                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1294             else
1295                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1296
1297             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1298             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1299             brw_MOV(p, dst, src1);
1300             brw_set_saturate(p, 0);
1301             brw_set_predicate_control_flag_value(p, 0xff);
1302             if (use_temp)
1303                brw_MOV(p, real_dst, dst);
1304         }
1305     }
1306     brw_pop_insn_state(p);
1307     release_tmps(c, mark);
1308 }
1309
1310 static void emit_pow(struct brw_wm_compile *c,
1311                      const struct prog_instruction *inst)
1312 {
1313     struct brw_compile *p = &c->func;
1314     struct brw_reg dst, src0, src1;
1315     GLuint mask = inst->DstReg.WriteMask;
1316     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1317
1318     if (!(mask & WRITEMASK_XYZW))
1319         return;
1320
1321     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1322
1323     dst = get_dst_reg(c, inst, dst_chan);
1324     src0 = get_src_reg_imm(c, inst, 0, 0);
1325     src1 = get_src_reg_imm(c, inst, 1, 0);
1326
1327     brw_MOV(p, brw_message_reg(2), src0);
1328     brw_MOV(p, brw_message_reg(3), src1);
1329
1330     brw_math(p,
1331             dst,
1332             BRW_MATH_FUNCTION_POW,
1333             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1334             2,
1335             brw_null_reg(),
1336             BRW_MATH_DATA_VECTOR,
1337             BRW_MATH_PRECISION_FULL);
1338 }
1339
1340 static void emit_lrp(struct brw_wm_compile *c,
1341                      const struct prog_instruction *inst)
1342 {
1343     struct brw_compile *p = &c->func;
1344     GLuint mask = inst->DstReg.WriteMask;
1345     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1346     int i;
1347     int mark = mark_tmps(c);
1348     for (i = 0; i < 4; i++) {
1349         if (mask & (1<<i)) {
1350             dst = get_dst_reg(c, inst, i);
1351             src0 = get_src_reg(c, inst, 0, i);
1352
1353             src1 = get_src_reg_imm(c, inst, 1, i);
1354
1355             if (src1.nr == dst.nr) {
1356                 tmp1 = alloc_tmp(c);
1357                 brw_MOV(p, tmp1, src1);
1358             } else
1359                 tmp1 = src1;
1360
1361             src2 = get_src_reg(c, inst, 2, i);
1362             if (src2.nr == dst.nr) {
1363                 tmp2 = alloc_tmp(c);
1364                 brw_MOV(p, tmp2, src2);
1365             } else
1366                 tmp2 = src2;
1367
1368             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1369             brw_MUL(p, brw_null_reg(), dst, tmp2);
1370             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1371             brw_MAC(p, dst, src0, tmp1);
1372             brw_set_saturate(p, 0);
1373         }
1374         release_tmps(c, mark);
1375     }
1376 }
1377
1378 /**
1379  * For GLSL shaders, this KIL will be unconditional.
1380  * It may be contained inside an IF/ENDIF structure of course.
1381  */
1382 static void emit_kil(struct brw_wm_compile *c)
1383 {
1384     struct brw_compile *p = &c->func;
1385     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1386     brw_push_insn_state(p);
1387     brw_set_mask_control(p, BRW_MASK_DISABLE);
1388     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1389     brw_AND(p, depth, c->emit_mask_reg, depth);
1390     brw_pop_insn_state(p);
1391 }
1392
1393 static void emit_mad(struct brw_wm_compile *c,
1394                      const struct prog_instruction *inst)
1395 {
1396     struct brw_compile *p = &c->func;
1397     GLuint mask = inst->DstReg.WriteMask;
1398     struct brw_reg dst, src0, src1, src2;
1399     int i;
1400
1401     for (i = 0; i < 4; i++) {
1402         if (mask & (1<<i)) {
1403             dst = get_dst_reg(c, inst, i);
1404             src0 = get_src_reg(c, inst, 0, i);
1405             src1 = get_src_reg_imm(c, inst, 1, i);
1406             src2 = get_src_reg_imm(c, inst, 2, i);
1407             brw_MUL(p, dst, src0, src1);
1408
1409             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1410             brw_ADD(p, dst, dst, src2);
1411             brw_set_saturate(p, 0);
1412         }
1413     }
1414 }
1415
1416 static void emit_sop(struct brw_wm_compile *c,
1417                      const struct prog_instruction *inst, GLuint cond)
1418 {
1419     struct brw_compile *p = &c->func;
1420     GLuint mask = inst->DstReg.WriteMask;
1421     struct brw_reg dst, src0, src1;
1422     int i;
1423
1424     for (i = 0; i < 4; i++) {
1425         if (mask & (1<<i)) {
1426             dst = get_dst_reg(c, inst, i);
1427             src0 = get_src_reg(c, inst, 0, i);
1428             src1 = get_src_reg_imm(c, inst, 1, i);
1429             brw_push_insn_state(p);
1430             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1431             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1432             brw_MOV(p, dst, brw_imm_f(0.0));
1433             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1434             brw_MOV(p, dst, brw_imm_f(1.0));
1435             brw_pop_insn_state(p);
1436         }
1437     }
1438 }
1439
1440 static void emit_slt(struct brw_wm_compile *c,
1441                      const struct prog_instruction *inst)
1442 {
1443     emit_sop(c, inst, BRW_CONDITIONAL_L);
1444 }
1445
1446 static void emit_sle(struct brw_wm_compile *c,
1447                      const struct prog_instruction *inst)
1448 {
1449     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1450 }
1451
1452 static void emit_sgt(struct brw_wm_compile *c,
1453                      const struct prog_instruction *inst)
1454 {
1455     emit_sop(c, inst, BRW_CONDITIONAL_G);
1456 }
1457
1458 static void emit_sge(struct brw_wm_compile *c,
1459                      const struct prog_instruction *inst)
1460 {
1461     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1462 }
1463
1464 static void emit_seq(struct brw_wm_compile *c,
1465                      const struct prog_instruction *inst)
1466 {
1467     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1468 }
1469
1470 static void emit_sne(struct brw_wm_compile *c,
1471                      const struct prog_instruction *inst)
1472 {
1473     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1474 }
1475
1476 static INLINE struct brw_reg high_words( struct brw_reg reg )
1477 {
1478     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1479                    0, 8, 2 );
1480 }
1481
1482 static INLINE struct brw_reg low_words( struct brw_reg reg )
1483 {
1484     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1485 }
1486
1487 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1488 {
1489     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1490 }
1491
1492 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1493 {
1494     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1495                    0, 16, 2 );
1496 }
1497
1498 /* One-, two- and three-dimensional Perlin noise, similar to the description
1499    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1500 static void noise1_sub( struct brw_wm_compile *c ) {
1501
1502     struct brw_compile *p = &c->func;
1503     struct brw_reg param,
1504         x0, x1, /* gradients at each end */
1505         t, tmp[ 2 ], /* float temporaries */
1506         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1507     int i;
1508     int mark = mark_tmps( c );
1509
1510     x0 = alloc_tmp( c );
1511     x1 = alloc_tmp( c );
1512     t = alloc_tmp( c );
1513     tmp[ 0 ] = alloc_tmp( c );
1514     tmp[ 1 ] = alloc_tmp( c );
1515     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1516     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1517     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1518     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1519     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1520
1521     param = lookup_tmp( c, mark - 2 );
1522
1523     brw_set_access_mode( p, BRW_ALIGN_1 );
1524
1525     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1526
1527     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1528        be hashed.  Also compute the remainder (offset within the unit
1529        length), interleaved to reduce register dependency penalties. */
1530     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1531     brw_FRC( p, param, param );
1532     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1533     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1534     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1535
1536     /* We're now ready to perform the hashing.  The two hashes are
1537        interleaved for performance.  The hash function used is
1538        designed to rapidly achieve avalanche and require only 32x16
1539        bit multiplication, and 16-bit swizzles (which we get for
1540        free).  We can't use immediate operands in the multiplies,
1541        because immediates are permitted only in src1 and the 16-bit
1542        factor is permitted only in src0. */
1543     for( i = 0; i < 2; i++ )
1544         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1545     for( i = 0; i < 2; i++ )
1546        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1547                 high_words( itmp[ i ] ) );
1548     for( i = 0; i < 2; i++ )
1549         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1550     for( i = 0; i < 2; i++ )
1551        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1552                 high_words( itmp[ i ] ) );
1553     for( i = 0; i < 2; i++ )
1554         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1555     for( i = 0; i < 2; i++ )
1556        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1557                 high_words( itmp[ i ] ) );
1558
1559     /* Now we want to initialise the two gradients based on the
1560        hashes.  Format conversion from signed integer to float leaves
1561        everything scaled too high by a factor of pow( 2, 31 ), but
1562        we correct for that right at the end. */
1563     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1564     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1565     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1566
1567     brw_MUL( p, x0, x0, param );
1568     brw_MUL( p, x1, x1, t );
1569
1570     /* We interpolate between the gradients using the polynomial
1571        6t^5 - 15t^4 + 10t^3 (Perlin). */
1572     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1573     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1574     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1575     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1576     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1577     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1578                                            pipeline */
1579     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1580     brw_MUL( p, param, tmp[ 0 ], param );
1581     brw_MUL( p, x1, x1, param );
1582     brw_ADD( p, x0, x0, x1 );
1583     /* scale by pow( 2, -30 ), to compensate for the format conversion
1584        above and an extra factor of 2 so that a single gradient covers
1585        the [-1,1] range */
1586     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1587
1588     release_tmps( c, mark );
1589 }
1590
1591 static void emit_noise1( struct brw_wm_compile *c,
1592                          const struct prog_instruction *inst )
1593 {
1594     struct brw_compile *p = &c->func;
1595     struct brw_reg src, param, dst;
1596     GLuint mask = inst->DstReg.WriteMask;
1597     int i;
1598     int mark = mark_tmps( c );
1599
1600     assert( mark == 0 );
1601
1602     src = get_src_reg( c, inst, 0, 0 );
1603
1604     param = alloc_tmp( c );
1605
1606     brw_MOV( p, param, src );
1607
1608     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1609
1610     /* Fill in the result: */
1611     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1612     for (i = 0 ; i < 4; i++) {
1613         if (mask & (1<<i)) {
1614             dst = get_dst_reg(c, inst, i);
1615             brw_MOV( p, dst, param );
1616         }
1617     }
1618     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1619         brw_set_saturate( p, 0 );
1620
1621     release_tmps( c, mark );
1622 }
1623
1624 static void noise2_sub( struct brw_wm_compile *c ) {
1625
1626     struct brw_compile *p = &c->func;
1627     struct brw_reg param0, param1,
1628         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1629         t, tmp[ 4 ], /* float temporaries */
1630         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1631     int i;
1632     int mark = mark_tmps( c );
1633
1634     x0y0 = alloc_tmp( c );
1635     x0y1 = alloc_tmp( c );
1636     x1y0 = alloc_tmp( c );
1637     x1y1 = alloc_tmp( c );
1638     t = alloc_tmp( c );
1639     for( i = 0; i < 4; i++ ) {
1640         tmp[ i ] = alloc_tmp( c );
1641         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1642     }
1643     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1644     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1645     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1646
1647     param0 = lookup_tmp( c, mark - 3 );
1648     param1 = lookup_tmp( c, mark - 2 );
1649
1650     brw_set_access_mode( p, BRW_ALIGN_1 );
1651
1652     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1653        be hashed.  Also compute the remainders (offsets within the unit
1654        square), interleaved to reduce register dependency penalties. */
1655     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1656     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1657     brw_FRC( p, param0, param0 );
1658     brw_FRC( p, param1, param1 );
1659     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1660     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1661              low_words( itmp[ 1 ] ) );
1662     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1663     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1664     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1665     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1666     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1667
1668     /* We're now ready to perform the hashing.  The four hashes are
1669        interleaved for performance.  The hash function used is
1670        designed to rapidly achieve avalanche and require only 32x16
1671        bit multiplication, and 16-bit swizzles (which we get for
1672        free).  We can't use immediate operands in the multiplies,
1673        because immediates are permitted only in src1 and the 16-bit
1674        factor is permitted only in src0. */
1675     for( i = 0; i < 4; i++ )
1676         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1677     for( i = 0; i < 4; i++ )
1678         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1679                  high_words( itmp[ i ] ) );
1680     for( i = 0; i < 4; i++ )
1681         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1682     for( i = 0; i < 4; i++ )
1683         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1684                  high_words( itmp[ i ] ) );
1685     for( i = 0; i < 4; i++ )
1686         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1687     for( i = 0; i < 4; i++ )
1688         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1689                  high_words( itmp[ i ] ) );
1690
1691     /* Now we want to initialise the four gradients based on the
1692        hashes.  Format conversion from signed integer to float leaves
1693        everything scaled too high by a factor of pow( 2, 15 ), but
1694        we correct for that right at the end. */
1695     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1696     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1697     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1698     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1699     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1700
1701     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1702     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1703     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1704     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1705
1706     brw_MUL( p, x1y0, x1y0, t );
1707     brw_MUL( p, x1y1, x1y1, t );
1708     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1709     brw_MUL( p, x0y0, x0y0, param0 );
1710     brw_MUL( p, x0y1, x0y1, param0 );
1711
1712     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1713     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1714     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1715     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1716
1717     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1718     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1719     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1720     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1721
1722     /* We interpolate between the gradients using the polynomial
1723        6t^5 - 15t^4 + 10t^3 (Perlin). */
1724     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1725     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1726     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1727     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1728     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1729     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1730     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1731                                                  pipeline */
1732     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1733     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1734     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1735     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1736     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1737                                                  pipeline */
1738     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1739     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1740     brw_MUL( p, param0, tmp[ 0 ], param0 );
1741     brw_MUL( p, param1, tmp[ 1 ], param1 );
1742
1743     /* Here we interpolate in the y dimension... */
1744     brw_MUL( p, x0y1, x0y1, param1 );
1745     brw_MUL( p, x1y1, x1y1, param1 );
1746     brw_ADD( p, x0y0, x0y0, x0y1 );
1747     brw_ADD( p, x1y0, x1y0, x1y1 );
1748
1749     /* And now in x.  There are horrible register dependencies here,
1750        but we have nothing else to do. */
1751     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1752     brw_MUL( p, x1y0, x1y0, param0 );
1753     brw_ADD( p, x0y0, x0y0, x1y0 );
1754
1755     /* scale by pow( 2, -15 ), as described above */
1756     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1757
1758     release_tmps( c, mark );
1759 }
1760
1761 static void emit_noise2( struct brw_wm_compile *c,
1762                          const struct prog_instruction *inst )
1763 {
1764     struct brw_compile *p = &c->func;
1765     struct brw_reg src0, src1, param0, param1, dst;
1766     GLuint mask = inst->DstReg.WriteMask;
1767     int i;
1768     int mark = mark_tmps( c );
1769
1770     assert( mark == 0 );
1771
1772     src0 = get_src_reg( c, inst, 0, 0 );
1773     src1 = get_src_reg( c, inst, 0, 1 );
1774
1775     param0 = alloc_tmp( c );
1776     param1 = alloc_tmp( c );
1777
1778     brw_MOV( p, param0, src0 );
1779     brw_MOV( p, param1, src1 );
1780
1781     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1782
1783     /* Fill in the result: */
1784     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1785     for (i = 0 ; i < 4; i++) {
1786         if (mask & (1<<i)) {
1787             dst = get_dst_reg(c, inst, i);
1788             brw_MOV( p, dst, param0 );
1789         }
1790     }
1791     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1792         brw_set_saturate( p, 0 );
1793
1794     release_tmps( c, mark );
1795 }
1796
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * Emits the body of the three-dimensional noise subroutine.
 *
 * Arguments are not passed explicitly: the three coordinates are read from
 * the temporaries at slots mark-4, mark-3 and mark-2, where mark is the
 * temporary mark on entry (presumably slot mark-1 is reserved by
 * invoke_subroutine -- confirm).  The result overwrites the first of those
 * temporaries (param0); the other two are left holding the fractional parts
 * of their inputs.
 *
 * The access mode is switched to BRW_ALIGN_1 and is not changed back within
 * this function.  All temporaries allocated here are released before
 * returning.
 *
 * \param c  the WM compile whose instruction stream (c->func) is appended to
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        xi, yi, zi, /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    /* Each tmp[] register gets two extra views of the same GRF: itmp[] as
       unsigned dwords and wtmp[] as a 16-wide row of unsigned words. */
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The caller's argument temporaries, addressed relative to our mark. */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    /* From here on param0..param2 hold only the fractional offsets. */
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    /* The MUL/MAC chain targets the null register until its last step
       (presumably summing in the accumulator -- confirm), so only the final
       MAC stores the combined x*0xBC8F + y*0xD0BD + z*0x9B93 seed. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
             brw_imm_uw( 0x9B93 ) );
    /* Adding the x multiplier once more yields the seed for the x+1
       neighbour, stored beside the base seed in the high words. */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Derive the remaining corner seeds by adding the y and z multipliers:
       wtmp[1] = y+1, wtmp[2] = z+1, wtmp[3] = y+1 and z+1 (each register
       carrying both the x and x+1 variants). */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    /* t = x - 1: the x offset as seen from the x+1 corners. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hashes left so that the next component's MOVs pick up a
       different set of bits. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    /* t = y - 1, for the y+1 corners in the y component below. */
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    /* Expose the next group of hash bits for the z component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t is not needed again for the rear face, so reload it now with
       x - 1 ready for the front face's x component further down. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* All four rear corners share the same z offset (param2). */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated as ( ( 6t - 15 )t + 10 ) * t * t * t, with the x, y and z
       chains interleaved. */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    /* These two subtractions form the y-direction differences consumed by
       the y interpolation below. */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    /* tmp[ 0 ]'s hash bits have all been consumed by now, so the register
       is free to hold the rear-face value. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    /* (t already holds x - 1, set up during the rear y component.) */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    /* t = y - 1 */
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = z - 1: every front corner sits at z+1. */
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    /* tmp[ 0 ] = rear + ( front - rear ) * zi */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    /* The result goes into param0, the caller's first argument temporary. */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
2063
2064 static void emit_noise3( struct brw_wm_compile *c,
2065                          const struct prog_instruction *inst )
2066 {
2067     struct brw_compile *p = &c->func;
2068     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2069     GLuint mask = inst->DstReg.WriteMask;
2070     int i;
2071     int mark = mark_tmps( c );
2072
2073     assert( mark == 0 );
2074
2075     src0 = get_src_reg( c, inst, 0, 0 );
2076     src1 = get_src_reg( c, inst, 0, 1 );
2077     src2 = get_src_reg( c, inst, 0, 2 );
2078
2079     param0 = alloc_tmp( c );
2080     param1 = alloc_tmp( c );
2081     param2 = alloc_tmp( c );
2082
2083     brw_MOV( p, param0, src0 );
2084     brw_MOV( p, param1, src1 );
2085     brw_MOV( p, param2, src2 );
2086
2087     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2088
2089     /* Fill in the result: */
2090     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2091     for (i = 0 ; i < 4; i++) {
2092         if (mask & (1<<i)) {
2093             dst = get_dst_reg(c, inst, i);
2094             brw_MOV( p, dst, param0 );
2095         }
2096     }
2097     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2098         brw_set_saturate( p, 0 );
2099
2100     release_tmps( c, mark );
2101 }
2102
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * Emits the body of the four-dimensional noise subroutine.
 *
 * Arguments are not passed explicitly: the four coordinates are read from
 * the temporaries at slots mark-5 .. mark-2, where mark is the temporary
 * mark on entry (presumably slot mark-1 is reserved by invoke_subroutine --
 * confirm).  The result overwrites the first of those temporaries
 * (param[ 0 ]); the other three are left holding the fractional parts of
 * their inputs.
 *
 * The emitted code contains a hand-built two-pass loop: the flag register
 * tells the passes apart (clear for the first pass, set for the second) and
 * the branches are plain ADDs to the instruction pointer.  The access mode
 * is switched to BRW_ALIGN_1 and is not changed back within this function.
 * All temporaries allocated here are released before returning.
 *
 * \param c  the WM compile whose instruction stream (c->func) is appended to
 */
static void noise4_sub( struct brw_wm_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg param[ 4 ],
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        w0, /* noise for the w=0 cube */
        floors[ 2 ], /* integer coordinates of base corner of hypercube */
        interp[ 4 ], /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i, j;
    int mark = mark_tmps( c );
    /* Instruction indices: start of the two-pass loop, and the exit branch
       that is patched once its target is known. */
    GLuint loop, origin;

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    w0 = alloc_tmp( c );
    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

    /* param[] are the caller's argument temporaries, addressed relative to
       our mark; interp[] are new temporaries. */
    for( i = 0; i < 4; i++ ) {
        param[ i ] = lookup_tmp( c, mark - 5 + i );
        interp[ i ] = alloc_tmp( c );
    }

    /* Each tmp[] register gets two extra views of the same GRF: itmp[] as
       unsigned dwords and wtmp[] as a 16-wide row of unsigned words. */
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* We only want 16 bits of precision from the integral part of each
       co-ordinate, but unfortunately the RNDD semantics would saturate
       at 16 bits if we performed the operation directly to a 16-bit
       destination.  Therefore, we round to 32-bit temporaries where
       appropriate, and then store only the lower 16 bits. */
    /* Resulting packing: floors[ 0 ] = { low: x, high: y },
       floors[ 1 ] = { low: z, high: w }. */
    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

    /* Modify the flag register here, because the side effect is useful
       later (see below).  We know for certain that all flags will be
       cleared, since the FRC instruction cannot possibly generate
       negative results.  Even for exceptional inputs (infinities, denormals,
       NaNs), the architecture guarantees that the L conditional is false. */
    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
    brw_FRC( p, param[ 0 ], param[ 0 ] );
    /* NOTE(review): presumably emitting an instruction with a conditional
       modifier leaves predication enabled in the default instruction state,
       hence this explicit reset -- confirm against the emitter. */
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    for( i = 1; i < 4; i++ )
        brw_FRC( p, param[ i ], param[ i ] );

    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
       of all. */
    /* Evaluated as ( ( 6t - 15 )t + 10 ) * t * t * t for each of the four
       coordinates. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
    for( i = 0; i < 4; i++ )
        brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
    for( i = 0; i < 4; i++ )
        brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
    for( j = 0; j < 3; j++ )
        for( i = 0; i < 4; i++ )
            brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

    /* Mark the current address, as it will be a jump destination.  The
       following code will be executed twice: first, with the flag
       register clear indicating the w=0 case, and second with flags
       set for w=1. */
    loop = p->nr_insn;

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Since we have only 16 bits of precision in the hash, we
       must be careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    /* The MUL/MAC chain targets the null register until its last step
       (presumably summing in the accumulator -- confirm), so only the final
       MAC stores the combined x*0xBC8F + y*0xD0BD + z*0x9B93 + w*0xA359
       seed. */
    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
             brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
             brw_imm_uw( 0x9B93 ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
             brw_imm_uw( 0xA359 ) );
    /* Adding the x multiplier once more yields the seed for the x+1
       neighbour, stored beside the base seed in the high words. */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Derive the remaining corner seeds of this cube by adding the y and z
       multipliers: wtmp[1] = y+1, wtmp[2] = z+1, wtmp[3] = y+1 and z+1. */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    /* t = x - 1: the x offset as seen from the x+1 corners. */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hashes left so that the next component's MOVs pick up a
       different set of bits (4 bits per step here, for four components). */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    /* t = y - 1 */
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    /* A predicated pair selects between the two: the ADD takes effect where
       the flag is set, the MOV (inverted predicate) where it is clear. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    /* All four rear corners share the same z offset (param[ 2 ]). */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* t holds the pass-dependent w offset selected above. */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Reload t with x - 1 ready for the front face's x component below. */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Here we interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    /* tmp[ 0 ]'s hash bits have all been consumed by now, so the register
       is free to hold the rear-face value. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    /* t = y - 1 */
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t = z - 1: every front corner sits at z+1. */
    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    /* Same predicated ADD / inverse-predicated MOV selection as in the rear
       face above. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Interpolate in the y dimension: */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* Another interpolation, in the z dimension: */
    /* tmp[ 0 ] = rear + ( front - rear ) * interp[ 2 ]: the noise value for
       the cube handled by this pass. */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* Exit the loop if we've computed both cubes... */
    /* This is a predicated add to the instruction pointer, emitted with a
       placeholder offset of 0; origin remembers its index so the real
       offset can be patched in below. */
    origin = p->nr_insn;
    brw_push_insn_state( p );
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
    brw_pop_insn_state( p );

    /* Save the result for the w=0 case, and increment the w coordinate: */
    /* (the high words of floors[ 1 ] hold the integer w coordinate) */
    brw_MOV( p, w0, tmp[ 0 ] );
    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
             brw_imm_uw( 1 ) );

    /* Loop around for the other cube.  Explicitly set the flag register
       (unfortunately we must spend an extra instruction to do this: we
       can't rely on a side effect of the previous MOV or ADD because
       conditional modifiers which are normally true might be false in
       exceptional circumstances, e.g. given a NaN input; the add to
       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
    /* The jump distance is an instruction count shifted left by 4
       (presumably 16 bytes per instruction -- confirm).
       NOTE(review): loop is a GLuint, so the backward offset presumably
       relies on unsigned wraparound being reinterpreted as a negative
       immediate -- confirm the type of p->nr_insn. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
             brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
    brw_pop_insn_state( p );

    /* Patch the previous conditional branch now that we know the
       destination address. */
    brw_set_src1( p->store + origin,
                  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

    /* The very last interpolation. */
    /* tmp[ 0 ] (the w=1 cube) and w0 (the w=0 cube) are blended with
       interp[ 3 ]: tmp[ 0 ] = w0 + ( tmp[ 0 ] - w0 ) * interp[ 3 ]. */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

    /* scale by pow( 2, -15 ), as described above */
    /* The result goes into param[ 0 ], the caller's first argument
       temporary. */
    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
2486
2487 static void emit_noise4( struct brw_wm_compile *c,
2488                          const struct prog_instruction *inst )
2489 {
2490     struct brw_compile *p = &c->func;
2491     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2492     GLuint mask = inst->DstReg.WriteMask;
2493     int i;
2494     int mark = mark_tmps( c );
2495
2496     assert( mark == 0 );
2497
2498     src0 = get_src_reg( c, inst, 0, 0 );
2499     src1 = get_src_reg( c, inst, 0, 1 );
2500     src2 = get_src_reg( c, inst, 0, 2 );
2501     src3 = get_src_reg( c, inst, 0, 3 );
2502
2503     param0 = alloc_tmp( c );
2504     param1 = alloc_tmp( c );
2505     param2 = alloc_tmp( c );
2506     param3 = alloc_tmp( c );
2507
2508     brw_MOV( p, param0, src0 );
2509     brw_MOV( p, param1, src1 );
2510     brw_MOV( p, param2, src2 );
2511     brw_MOV( p, param3, src3 );
2512
2513     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2514
2515     /* Fill in the result: */
2516     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2517     for (i = 0 ; i < 4; i++) {
2518         if (mask & (1<<i)) {
2519             dst = get_dst_reg(c, inst, i);
2520             brw_MOV( p, dst, param0 );
2521         }
2522     }
2523     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2524         brw_set_saturate( p, 0 );
2525
2526     release_tmps( c, mark );
2527 }
2528
2529 static void emit_wpos_xy(struct brw_wm_compile *c,
2530                          const struct prog_instruction *inst)
2531 {
2532     struct brw_compile *p = &c->func;
2533     GLuint mask = inst->DstReg.WriteMask;
2534     struct brw_reg src0[2], dst[2];
2535
2536     dst[0] = get_dst_reg(c, inst, 0);
2537     dst[1] = get_dst_reg(c, inst, 1);
2538
2539     src0[0] = get_src_reg(c, inst, 0, 0);
2540     src0[1] = get_src_reg(c, inst, 0, 1);
2541
2542     /* Calculate the pixel offset from window bottom left into destination
2543      * X and Y channels.
2544      */
2545     if (mask & WRITEMASK_X) {
2546         /* X' = X - origin_x */
2547         brw_ADD(p,
2548                 dst[0],
2549                 retype(src0[0], BRW_REGISTER_TYPE_W),
2550                 brw_imm_d(0 - c->key.origin_x));
2551     }
2552
2553     if (mask & WRITEMASK_Y) {
2554         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2555         brw_ADD(p,
2556                 dst[1],
2557                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2558                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2559     }
2560 }
2561
2562 /* TODO
2563    BIAS on SIMD8 not working yet...
2564  */
2565 static void emit_txb(struct brw_wm_compile *c,
2566                      const struct prog_instruction *inst)
2567 {
2568     struct brw_compile *p = &c->func;
2569     struct brw_reg dst[4], src[4], payload_reg;
2570     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2571     const GLuint unit = inst->TexSrcUnit;
2572     GLuint i;
2573     GLuint msg_type;
2574
2575     assert(unit < BRW_MAX_TEX_UNIT);
2576
2577     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2578
2579     for (i = 0; i < 4; i++)
2580         dst[i] = get_dst_reg(c, inst, i);
2581     for (i = 0; i < 4; i++)
2582         src[i] = get_src_reg(c, inst, 0, i);
2583
2584     switch (inst->TexSrcTarget) {
2585         case TEXTURE_1D_INDEX:
2586             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2587             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2588             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2589             break;
2590         case TEXTURE_2D_INDEX:
2591         case TEXTURE_RECT_INDEX:
2592             brw_MOV(p, brw_message_reg(2), src[0]);
2593             brw_MOV(p, brw_message_reg(3), src[1]);
2594             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2595             break;
2596         case TEXTURE_3D_INDEX:
2597         case TEXTURE_CUBE_INDEX:
2598             brw_MOV(p, brw_message_reg(2), src[0]);
2599             brw_MOV(p, brw_message_reg(3), src[1]);
2600             brw_MOV(p, brw_message_reg(4), src[2]);
2601             break;
2602         default:
2603             /* invalid target */
2604             abort();
2605     }
2606     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2607     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2608
2609     if (BRW_IS_IGDNG(p->brw)) {
2610         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2611     } else {
2612         /* Does it work well on SIMD8? */
2613         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2614     }
2615
2616     brw_SAMPLE(p,
2617                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2618                1,                                           /* msg_reg_nr */
2619                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2620                SURF_INDEX_TEXTURE(unit),
2621                unit,                                        /* sampler */
2622                inst->DstReg.WriteMask,                      /* writemask */
2623                msg_type,                                    /* msg_type */
2624                4,                                           /* response_length */
2625                4,                                           /* msg_length */
2626                0,                                           /* eot */
2627                1,
2628                BRW_SAMPLER_SIMD_MODE_SIMD8);
2629 }
2630
2631
2632 static void emit_tex(struct brw_wm_compile *c,
2633                      const struct prog_instruction *inst)
2634 {
2635     struct brw_compile *p = &c->func;
2636     struct brw_reg dst[4], src[4], payload_reg;
2637     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2638     const GLuint unit = inst->TexSrcUnit;
2639     GLuint msg_len;
2640     GLuint i, nr;
2641     GLuint emit;
2642     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2643     GLuint msg_type;
2644
2645     assert(unit < BRW_MAX_TEX_UNIT);
2646
2647     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2648
2649     for (i = 0; i < 4; i++)
2650         dst[i] = get_dst_reg(c, inst, i);
2651     for (i = 0; i < 4; i++)
2652         src[i] = get_src_reg(c, inst, 0, i);
2653
2654     switch (inst->TexSrcTarget) {
2655         case TEXTURE_1D_INDEX:
2656             emit = WRITEMASK_X;
2657             nr = 1;
2658             break;
2659         case TEXTURE_2D_INDEX:
2660         case TEXTURE_RECT_INDEX:
2661             emit = WRITEMASK_XY;
2662             nr = 2;
2663             break;
2664         case TEXTURE_3D_INDEX:
2665         case TEXTURE_CUBE_INDEX:
2666             emit = WRITEMASK_XYZ;
2667             nr = 3;
2668             break;
2669         default:
2670            /* invalid target */
2671            abort();
2672     }
2673     msg_len = 1;
2674
2675     /* move/load S, T, R coords */
2676     for (i = 0; i < nr; i++) {
2677         static const GLuint swz[4] = {0,1,2,2};
2678         if (emit & (1<<i))
2679             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2680         else
2681             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2682         msg_len += 1;
2683     }
2684
2685     if (shadow) {
2686        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2687        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2688     }
2689
2690     if (BRW_IS_IGDNG(p->brw)) {
2691         if (shadow)
2692             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2693         else
2694             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2695     } else {
2696         /* Does it work for shadow on SIMD8 ? */
2697         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2698     }
2699
2700     brw_SAMPLE(p,
2701                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2702                1,                                          /* msg_reg_nr */
2703                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2704                SURF_INDEX_TEXTURE(unit),
2705                unit,                                       /* sampler */
2706                inst->DstReg.WriteMask,                     /* writemask */
2707                msg_type,                                   /* msg_type */
2708                4,                                          /* response_length */
2709                shadow ? 6 : 4,                             /* msg_length */
2710                0,                                          /* eot */
2711                1,
2712                BRW_SAMPLER_SIMD_MODE_SIMD8);
2713
2714     if (shadow)
2715         brw_MOV(p, dst[3], brw_imm_f(1.0));
2716 }
2717
2718
2719 /**
2720  * Resolve subroutine calls after code emit is done.
2721  */
2722 static void post_wm_emit( struct brw_wm_compile *c )
2723 {
2724     brw_resolve_cals(&c->func);
2725 }
2726
2727 static void
2728 get_argument_regs(struct brw_wm_compile *c,
2729                   const struct prog_instruction *inst,
2730                   int index,
2731                   struct brw_reg *regs,
2732                   int mask)
2733 {
2734     int i;
2735
2736     for (i = 0; i < 4; i++) {
2737         if (mask & (1 << i))
2738             regs[i] = get_src_reg(c, inst, index, i);
2739     }
2740 }
2741
2742 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2743 {
2744 #define MAX_IF_DEPTH 32
2745 #define MAX_LOOP_DEPTH 32
2746     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2747     GLuint i, if_depth = 0, loop_depth = 0;
2748     struct brw_compile *p = &c->func;
2749     struct brw_indirect stack_index = brw_indirect(0, 0);
2750
2751     c->out_of_regs = GL_FALSE;
2752
2753     prealloc_reg(c);
2754     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2755     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2756
2757     for (i = 0; i < c->nr_fp_insns; i++) {
2758         const struct prog_instruction *inst = &c->prog_instructions[i];
2759         int dst_flags;
2760         struct brw_reg args[3][4], dst[4];
2761         int j;
2762
2763         c->cur_inst = i;
2764
2765 #if 0
2766         _mesa_printf("Inst %d: ", i);
2767         _mesa_print_instruction(inst);
2768 #endif
2769
2770         /* fetch any constants that this instruction needs */
2771         if (c->fp->use_const_buffer)
2772            fetch_constants(c, inst);
2773
2774         if (inst->CondUpdate)
2775             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2776         else
2777             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2778
2779         dst_flags = inst->DstReg.WriteMask;
2780         if (inst->SaturateMode == SATURATE_ZERO_ONE)
2781             dst_flags |= SATURATE;
2782
2783         switch (inst->Opcode) {
2784             case WM_PIXELXY:
2785                 emit_pixel_xy(c, inst);
2786                 break;
2787             case WM_DELTAXY:
2788                 emit_delta_xy(c, inst);
2789                 break;
2790             case WM_PIXELW:
2791                 emit_pixel_w(c, inst);
2792                 break;
2793             case WM_LINTERP:
2794                 emit_linterp(c, inst);
2795                 break;
2796             case WM_PINTERP:
2797                 emit_pinterp(c, inst);
2798                 break;
2799             case WM_CINTERP:
2800                 emit_cinterp(c, inst);
2801                 break;
2802             case WM_WPOSXY:
2803                 emit_wpos_xy(c, inst);
2804                 break;
2805             case WM_FB_WRITE:
2806                 emit_fb_write(c, inst);
2807                 break;
2808             case WM_FRONTFACING:
2809                 emit_frontfacing(c, inst);
2810                 break;
2811             case OPCODE_ADD:
2812                 emit_add(c, inst);
2813                 break;
2814             case OPCODE_ARL:
2815                 emit_arl(c, inst);
2816                 break;
2817             case OPCODE_FRC:
2818                 emit_frc(c, inst);
2819                 break;
2820             case OPCODE_FLR:
2821                 emit_flr(c, inst);
2822                 break;
2823             case OPCODE_LRP:
2824                 emit_lrp(c, inst);
2825                 break;
2826             case OPCODE_TRUNC:
2827                 emit_trunc(c, inst);
2828                 break;
2829             case OPCODE_MOV:
2830             case OPCODE_SWZ:
2831                 emit_mov(c, inst);
2832                 break;
2833             case OPCODE_DP3:
2834                 emit_dp3(c, inst);
2835                 break;
2836             case OPCODE_DP4:
2837                 emit_dp4(c, inst);
2838                 break;
2839             case OPCODE_XPD:
2840                 emit_xpd(c, inst);
2841                 break;
2842             case OPCODE_DPH:
2843                 emit_dph(c, inst);
2844                 break;
2845             case OPCODE_RCP:
2846                 emit_rcp(c, inst);
2847                 break;
2848             case OPCODE_RSQ:
2849                 emit_rsq(c, inst);
2850                 break;
2851             case OPCODE_SIN:
2852                 emit_sin(c, inst);
2853                 break;
2854             case OPCODE_COS:
2855                 emit_cos(c, inst);
2856                 break;
2857             case OPCODE_EX2:
2858                 emit_ex2(c, inst);
2859                 break;
2860             case OPCODE_LG2:
2861                 emit_lg2(c, inst);
2862                 break;
2863             case OPCODE_MIN:
2864             case OPCODE_MAX:
2865                 emit_min_max(c, inst);
2866                 break;
2867             case OPCODE_DDX:
2868             case OPCODE_DDY:
2869                 for (j = 0; j < 4; j++) {
2870                     if (inst->DstReg.WriteMask & (1 << j))
2871                         dst[j] = get_dst_reg(c, inst, j);
2872                     else
2873                         dst[j] = brw_null_reg();
2874                 }
2875                 get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
2876                 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2877                           args[0]);
2878                 break;
2879             case OPCODE_SLT:
2880                 emit_slt(c, inst);
2881                 break;
2882             case OPCODE_SLE:
2883                 emit_sle(c, inst);
2884                 break;
2885             case OPCODE_SGT:
2886                 emit_sgt(c, inst);
2887                 break;
2888             case OPCODE_SGE:
2889                 emit_sge(c, inst);
2890                 break;
2891             case OPCODE_SEQ:
2892                 emit_seq(c, inst);
2893                 break;
2894             case OPCODE_SNE:
2895                 emit_sne(c, inst);
2896                 break;
2897             case OPCODE_MUL:
2898                 emit_mul(c, inst);
2899                 break;
2900             case OPCODE_POW:
2901                 emit_pow(c, inst);
2902                 break;
2903             case OPCODE_MAD:
2904                 emit_mad(c, inst);
2905                 break;
2906             case OPCODE_NOISE1:
2907                 emit_noise1(c, inst);
2908                 break;
2909             case OPCODE_NOISE2:
2910                 emit_noise2(c, inst);
2911                 break;
2912             case OPCODE_NOISE3:
2913                 emit_noise3(c, inst);
2914                 break;
2915             case OPCODE_NOISE4:
2916                 emit_noise4(c, inst);
2917                 break;
2918             case OPCODE_TEX:
2919                 emit_tex(c, inst);
2920                 break;
2921             case OPCODE_TXB:
2922                 emit_txb(c, inst);
2923                 break;
2924             case OPCODE_KIL_NV:
2925                 emit_kil(c);
2926                 break;
2927             case OPCODE_IF:
2928                 assert(if_depth < MAX_IF_DEPTH);
2929                 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2930                 break;
2931             case OPCODE_ELSE:
2932                 if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
2933                 break;
2934             case OPCODE_ENDIF:
2935                 assert(if_depth > 0);
2936                 brw_ENDIF(p, if_inst[--if_depth]);
2937                 break;
2938             case OPCODE_BGNSUB:
2939                 brw_save_label(p, inst->Comment, p->nr_insn);
2940                 break;
2941             case OPCODE_ENDSUB:
2942                 /* no-op */
2943                 break;
2944             case OPCODE_CAL:
2945                 brw_push_insn_state(p);
2946                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2947                 brw_set_access_mode(p, BRW_ALIGN_1);
2948                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2949                 brw_set_access_mode(p, BRW_ALIGN_16);
2950                 brw_ADD(p, get_addr_reg(stack_index),
2951                          get_addr_reg(stack_index), brw_imm_d(4));
2952                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2953                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2954                 brw_pop_insn_state(p);
2955                 break;
2956
2957             case OPCODE_RET:
2958                 brw_push_insn_state(p);
2959                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2960                 brw_ADD(p, get_addr_reg(stack_index),
2961                         get_addr_reg(stack_index), brw_imm_d(-4));
2962                 brw_set_access_mode(p, BRW_ALIGN_1);
2963                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2964                 brw_set_access_mode(p, BRW_ALIGN_16);
2965                 brw_pop_insn_state(p);
2966
2967                 break;
2968             case OPCODE_BGNLOOP:
2969                 /* XXX may need to invalidate the current_constant regs */
2970                 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2971                 break;
2972             case OPCODE_BRK:
2973                 brw_BREAK(p);
2974                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2975                 break;
2976             case OPCODE_CONT:
2977                 brw_CONT(p);
2978                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2979                 break;
2980             case OPCODE_ENDLOOP:
2981                {
2982                   struct brw_instruction *inst0, *inst1;
2983                   GLuint br = 1;
2984
2985                   if (BRW_IS_IGDNG(brw))
2986                      br = 2;
2987
2988                   loop_depth--;
2989                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2990                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
2991                   while (inst0 > loop_inst[loop_depth]) {
2992                      inst0--;
2993                      if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2994                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2995                         inst0->bits3.if_else.pop_count = 0;
2996                      }
2997                      else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2998                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2999                         inst0->bits3.if_else.pop_count = 0;
3000                      }
3001                   }
3002                }
3003                break;
3004             default:
3005                 _mesa_printf("unsupported IR in fragment shader %d\n",
3006                         inst->Opcode);
3007         }
3008
3009         if (inst->CondUpdate)
3010             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3011         else
3012             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3013     }
3014     post_wm_emit(c);
3015
3016     if (INTEL_DEBUG & DEBUG_WM) {
3017       _mesa_printf("wm-native:\n");
3018       for (i = 0; i < p->nr_insn; i++)
3019          brw_disasm(stderr, &p->store[i]);
3020       _mesa_printf("\n");
3021     }
3022 }
3023
3024 /**
3025  * Do GPU code generation for shaders that use GLSL features such as
3026  * flow control.  Other shaders will be compiled with the
3027  */
3028 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3029 {
3030     if (INTEL_DEBUG & DEBUG_WM) {
3031         _mesa_printf("brw_wm_glsl_emit:\n");
3032     }
3033
3034     /* initial instruction translation/simplification */
3035     brw_wm_pass_fp(c);
3036
3037     /* actual code generation */
3038     brw_wm_emit_glsl(brw, c);
3039
3040     if (INTEL_DEBUG & DEBUG_WM) {
3041         brw_wm_print_program(c, "brw_wm_glsl_emit done");
3042     }
3043
3044     c->prog_data.total_grf = num_grf_used(c);
3045     c->prog_data.total_scratch = 0;
3046 }