src/mesa/drivers/dri/i965/brw_wm_glsl.c

   1 #include "main/macros.h"
   2 #include "shader/prog_parameter.h"
   3 #include "shader/prog_print.h"
   4 #include "shader/prog_optimize.h"
   5 #include "brw_context.h"
   6 #include "brw_eu.h"
   7 #include "brw_wm.h"
   8
   9 enum _subroutine {
  10     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
  11 };
  12
  13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
  14                                   const struct prog_instruction *inst,
  15                                   GLuint component);
  16
  17 /**
  18  * Determine if the given fragment program uses GLSL features such
  19  * as flow conditionals, loops, subroutines.
  20  * Some GLSL shaders may use these features, others might not.
  21  */
  22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
  23 {
  24     int i;
  25     for (i = 0; i < fp->Base.NumInstructions; i++) {
  26         const struct prog_instruction *inst = &fp->Base.Instructions[i];
  27         switch (inst->Opcode) {
  28             case OPCODE_ARL:
  29             case OPCODE_IF:
  30             case OPCODE_ENDIF:
  31             case OPCODE_CAL:
  32             case OPCODE_BRK:
  33             case OPCODE_RET:
  34             case OPCODE_DDX:
  35             case OPCODE_DDY:
  36             case OPCODE_NOISE1:
  37             case OPCODE_NOISE2:
  38             case OPCODE_NOISE3:
  39             case OPCODE_NOISE4:
  40             case OPCODE_BGNLOOP:
  41                 return GL_TRUE;
  42             default:
  43                 break;
  44         }
  45     }
  46     return GL_FALSE;
  47 }
  48
  49
  50
  51 static void
  52 reclaim_temps(struct brw_wm_compile *c);
  53
  54
  55 /** Mark GRF register as used. */
  56 static void
  57 prealloc_grf(struct brw_wm_compile *c, int r)
  58 {
  59    c->used_grf[r] = GL_TRUE;
  60 }
  61
  62
  63 /** Mark given GRF register as not in use. */
  64 static void
  65 release_grf(struct brw_wm_compile *c, int r)
  66 {
  67    /*assert(c->used_grf[r]);*/
  68    c->used_grf[r] = GL_FALSE;
  69    c->first_free_grf = MIN2(c->first_free_grf, r);
  70 }
  71
  72
  73 /** Return index of a free GRF, mark it as used. */
  74 static int
  75 alloc_grf(struct brw_wm_compile *c)
  76 {
  77    GLuint r;
  78    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  79       if (!c->used_grf[r]) {
  80          c->used_grf[r] = GL_TRUE;
  81          c->first_free_grf = r + 1;  /* a guess */
  82          return r;
  83       }
  84    }
  85
  86    /* no free temps, try to reclaim some */
  87    reclaim_temps(c);
  88    c->first_free_grf = 0;
  89
  90    /* try alloc again */
  91    for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
  92       if (!c->used_grf[r]) {
  93          c->used_grf[r] = GL_TRUE;
  94          c->first_free_grf = r + 1;  /* a guess */
  95          return r;
  96       }
  97    }
  98
  99    for (r = 0; r < BRW_WM_MAX_GRF; r++) {
 100       assert(c->used_grf[r]);
 101    }
 102
 103    /* really, no free GRF regs found */
 104    if (!c->out_of_regs) {
 105       /* print warning once per compilation */
 106       _mesa_warning(NULL, "i965: ran out of registers for fragment program");
 107       c->out_of_regs = GL_TRUE;
 108    }
 109
 110    return -1;
 111 }
 112
 113
 114 /** Return number of GRF registers used */
 115 static int
 116 num_grf_used(const struct brw_wm_compile *c)
 117 {
 118    int r;
 119    for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
 120       if (c->used_grf[r])
 121          return r + 1;
 122    return 0;
 123 }
 124
 125
 126
 127 /**
 128  * Record the mapping of a Mesa register to a hardware register.
 129  */
 130 static void set_reg(struct brw_wm_compile *c, int file, int index,
 131         int component, struct brw_reg reg)
 132 {
 133     c->wm_regs[file][index][component].reg = reg;
 134     c->wm_regs[file][index][component].inited = GL_TRUE;
 135 }
 136
 137 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 138 {
 139     struct brw_reg reg;
 140
 141     /* if we need to allocate another temp, grow the tmp_regs[] array */
 142     if (c->tmp_index == c->tmp_max) {
 143        int r = alloc_grf(c);
 144        if (r < 0) {
 145           /*printf("Out of temps in %s\n", __FUNCTION__);*/
 146           r = 50; /* XXX random register! */
 147        }
 148        c->tmp_regs[ c->tmp_max++ ] = r;
 149     }
 150
 151     /* form the GRF register */
 152     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
 153     /*printf("alloc_temp %d\n", reg.nr);*/
 154     assert(reg.nr < BRW_WM_MAX_GRF);
 155     return reg;
 156
 157 }
 158
 159 /**
 160  * Save current temp register info.
 161  * There must be a matching call to release_tmps().
 162  */
 163 static int mark_tmps(struct brw_wm_compile *c)
 164 {
 165     return c->tmp_index;
 166 }
 167
 168 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
 169 {
 170     return brw_vec8_grf( c->tmp_regs[ index ], 0 );
 171 }
 172
 173 static void release_tmps(struct brw_wm_compile *c, int mark)
 174 {
 175     c->tmp_index = mark;
 176 }
 177
 178 /**
 179  * Convert Mesa src register to brw register.
 180  *
 181  * Since we're running in SOA mode each Mesa register corresponds to four
 182  * hardware registers.  We allocate the hardware registers as needed here.
 183  *
 184  * \param file  register file, one of PROGRAM_x
 185  * \param index  register number
 186  * \param component  src component (X=0, Y=1, Z=2, W=3)
 187  * \param nr  not used?!?
 188  * \param neg  negate value?
 189  * \param abs  take absolute value?
 190  */
 191 static struct brw_reg
 192 get_reg(struct brw_wm_compile *c, int file, int index, int component,
 193         int nr, GLuint neg, GLuint abs)
 194 {
 195     struct brw_reg reg;
 196     switch (file) {
 197         case PROGRAM_STATE_VAR:
 198         case PROGRAM_CONSTANT:
 199         case PROGRAM_UNIFORM:
 200             file = PROGRAM_STATE_VAR;
 201             break;
 202         case PROGRAM_UNDEFINED:
 203             return brw_null_reg();
 204         case PROGRAM_TEMPORARY:
 205         case PROGRAM_INPUT:
 206         case PROGRAM_OUTPUT:
 207         case PROGRAM_PAYLOAD:
 208             break;
 209         default:
 210             _mesa_problem(NULL, "Unexpected file in get_reg()");
 211             return brw_null_reg();
 212     }
 213
 214     assert(index < 256);
 215     assert(component < 4);
 216
 217     /* see if we've already allocated a HW register for this Mesa register */
 218     if (c->wm_regs[file][index][component].inited) {
 219        /* yes, re-use */
 220        reg = c->wm_regs[file][index][component].reg;
 221     }
 222     else {
 223         /* no, allocate new register */
 224        int grf = alloc_grf(c);
 225        /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
 226        if (grf < 0) {
 227           /* totally out of temps */
 228           grf = 51; /* XXX random register! */
 229        }
 230
 231        reg = brw_vec8_grf(grf, 0);
 232        /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 233
 234        set_reg(c, file, index, component, reg);
 235     }
 236
 237     if (neg & (1 << component)) {
 238         reg = negate(reg);
 239     }
 240     if (abs)
 241         reg = brw_abs(reg);
 242     return reg;
 243 }
 244
 245
 246
 247 /**
 248  * This is called if we run out of GRF registers.  Examine the live intervals
 249  * of temp regs in the program and free those which won't be used again.
 250  */
 251 static void
 252 reclaim_temps(struct brw_wm_compile *c)
 253 {
 254    GLint intBegin[MAX_PROGRAM_TEMPS];
 255    GLint intEnd[MAX_PROGRAM_TEMPS];
 256    int index;
 257
 258    /*printf("Reclaim temps:\n");*/
 259
 260    _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
 261                              intBegin, intEnd);
 262
 263    for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
 264       if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
 265          /* program temp[i] can be freed */
 266          int component;
 267          /*printf("  temp[%d] is dead\n", index);*/
 268          for (component = 0; component < 4; component++) {
 269             if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
 270                int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
 271                release_grf(c, r);
 272                /*
 273                printf("  Reclaim temp %d, reg %d at inst %d\n",
 274                       index, r, c->cur_inst);
 275                */
 276                c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
 277             }
 278          }
 279       }
 280    }
 281 }
 282
 283
 284
 285
 286 /**
 287  * Preallocate registers.  This sets up the Mesa to hardware register
 288  * mapping for certain registers, such as constants (uniforms/state vars)
 289  * and shader inputs.
 290  */
 291 static void prealloc_reg(struct brw_wm_compile *c)
 292 {
 293     int i, j;
 294     struct brw_reg reg;
 295     int urb_read_length = 0;
 296     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 297     GLuint reg_index = 0;
 298
 299     memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
 300     c->first_free_grf = 0;
 301
 302     for (i = 0; i < 4; i++) {
 303         if (i < c->key.nr_depth_regs)
 304             reg = brw_vec8_grf(i * 2, 0);
 305         else
 306             reg = brw_vec8_grf(0, 0);
 307         set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
 308     }
 309     reg_index += 2 * c->key.nr_depth_regs;
 310
 311     /* constants */
 312     {
 313         const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
 314         const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 315
 316         /* use a real constant buffer, or just use a section of the GRF? */
 317         /* XXX this heuristic may need adjustment... */
 318         if ((nr_params + nr_temps) * 4 + reg_index > 80)
 319            c->fp->use_const_buffer = GL_TRUE;
 320         else
 321            c->fp->use_const_buffer = GL_FALSE;
 322         /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 323
 324         if (c->fp->use_const_buffer) {
 325            /* We'll use a real constant buffer and fetch constants from
 326             * it with a dataport read message.
 327             */
 328
 329            /* number of float constants in CURBE */
 330            c->prog_data.nr_params = 0;
 331         }
 332         else {
 333            const struct gl_program_parameter_list *plist =
 334               c->fp->program.Base.Parameters;
 335            int index = 0;
 336
 337            /* number of float constants in CURBE */
 338            c->prog_data.nr_params = 4 * nr_params;
 339
 340            /* loop over program constants (float[4]) */
 341            for (i = 0; i < nr_params; i++) {
 342               /* loop over XYZW channels */
 343               for (j = 0; j < 4; j++, index++) {
 344                  reg = brw_vec1_grf(reg_index + index / 8, index % 8);
 345                  /* Save pointer to parameter/constant value.
 346                   * Constants will be copied in prepare_constant_buffer()
 347                   */
 348                  c->prog_data.param[index] = &plist->ParameterValues[i][j];
 349                  set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
 350               }
 351            }
 352            /* number of constant regs used (each reg is float[8]) */
 353            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
 354            reg_index += c->nr_creg;
 355         }
 356     }
 357
 358     /* fragment shader inputs */
 359     for (i = 0; i < VERT_RESULT_MAX; i++) {
 360        int fp_input;
 361
 362        if (i >= VERT_RESULT_VAR0)
 363           fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
 364        else if (i <= VERT_RESULT_TEX7)
 365           fp_input = i;
 366        else
 367           fp_input = -1;
 368
 369        if (fp_input >= 0 && inputs & (1 << fp_input)) {
 370           urb_read_length = reg_index;
 371           reg = brw_vec8_grf(reg_index, 0);
 372           for (j = 0; j < 4; j++)
 373              set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
 374        }
 375        if (c->key.vp_outputs_written & (1 << i)) {
 376           reg_index += 2;
 377        }
 378     }
 379
 380     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 381     c->prog_data.urb_read_length = urb_read_length;
 382     c->prog_data.curb_read_length = c->nr_creg;
 383     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 384     reg_index++;
 385     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
 386     reg_index += 2;
 387
 388     /* mark GRF regs [0..reg_index-1] as in-use */
 389     for (i = 0; i < reg_index; i++)
 390        prealloc_grf(c, i);
 391
 392     /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
 393     prealloc_grf(c, 126);
 394     prealloc_grf(c, 127);
 395
 396     for (i = 0; i < c->nr_fp_insns; i++) {
 397         const struct prog_instruction *inst = &c->prog_instructions[i];
 398         struct brw_reg dst[4];
 399
 400         switch (inst->Opcode) {
 401         case OPCODE_TEX:
 402         case OPCODE_TXB:
 403             /* Allocate the channels of texture results contiguously,
 404              * since they are written out that way by the sampler unit.
 405              */
 406             for (j = 0; j < 4; j++) {
 407                 dst[j] = get_dst_reg(c, inst, j);
 408                 if (j != 0)
 409                     assert(dst[j].nr == dst[j - 1].nr + 1);
 410             }
 411             break;
 412         default:
 413             break;
 414         }
 415     }
 416
 417     /* An instruction may reference up to three constants.
 418      * They'll be found in these registers.
 419      * XXX alloc these on demand!
 420      */
 421     if (c->fp->use_const_buffer) {
 422        for (i = 0; i < 3; i++) {
 423           c->current_const[i].index = -1;
 424           c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
 425        }
 426     }
 427 #if 0
 428     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
 429     printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 430 #endif
 431 }
 432
 433
 434 /**
 435  * Check if any of the instruction's src registers are constants, uniforms,
 436  * or statevars.  If so, fetch any constants that we don't already have in
 437  * the three GRF slots.
 438  */
 439 static void fetch_constants(struct brw_wm_compile *c,
 440                             const struct prog_instruction *inst)
 441 {
 442    struct brw_compile *p = &c->func;
 443    GLuint i;
 444
 445    /* loop over instruction src regs */
 446    for (i = 0; i < 3; i++) {
 447       const struct prog_src_register *src = &inst->SrcReg[i];
 448       if (src->File == PROGRAM_STATE_VAR ||
 449           src->File == PROGRAM_CONSTANT ||
 450           src->File == PROGRAM_UNIFORM) {
 451          c->current_const[i].index = src->Index;
 452
 453 #if 0
 454          printf("  fetch const[%d] for arg %d into reg %d\n",
 455                 src->Index, i, c->current_const[i].reg.nr);
 456 #endif
 457
 458          /* need to fetch the constant now */
 459          brw_dp_READ_4(p,
 460                        c->current_const[i].reg,  /* writeback dest */
 461                        src->RelAddr,             /* relative indexing? */
 462                        16 * src->Index,          /* byte offset */
 463                        SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
 464                        );
 465       }
 466    }
 467 }
 468
 469
 470 /**
 471  * Convert Mesa dst register to brw register.
 472  */
 473 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
 474                                   const struct prog_instruction *inst,
 475                                   GLuint component)
 476 {
 477     const int nr = 1;
 478     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 479             0, 0);
 480 }
 481
 482
 483 static struct brw_reg
 484 get_src_reg_const(struct brw_wm_compile *c,
 485                   const struct prog_instruction *inst,
 486                   GLuint srcRegIndex, GLuint component)
 487 {
 488    /* We should have already fetched the constant from the constant
 489     * buffer in fetch_constants().  Now we just have to return a
 490     * register description that extracts the needed component and
 491     * smears it across all eight vector components.
 492     */
 493    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 494    struct brw_reg const_reg;
 495
 496    assert(component < 4);
 497    assert(srcRegIndex < 3);
 498    assert(c->current_const[srcRegIndex].index != -1);
 499    const_reg = c->current_const[srcRegIndex].reg;
 500
 501    /* extract desired float from the const_reg, and smear */
 502    const_reg = stride(const_reg, 0, 1, 0);
 503    const_reg.subnr = component * 4;
 504
 505    if (src->Negate & (1 << component))
 506       const_reg = negate(const_reg);
 507    if (src->Abs)
 508       const_reg = brw_abs(const_reg);
 509
 510 #if 0
 511    printf("  form const[%d].%d for arg %d, reg %d\n",
 512           c->current_const[srcRegIndex].index,
 513           component,
 514           srcRegIndex,
 515           const_reg.nr);
 516 #endif
 517
 518    return const_reg;
 519 }
 520
 521
 522 /**
 523  * Convert Mesa src register to brw register.
 524  */
 525 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
 526                                   const struct prog_instruction *inst,
 527                                   GLuint srcRegIndex, GLuint channel)
 528 {
 529     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 530     const GLuint nr = 1;
 531     const GLuint component = GET_SWZ(src->Swizzle, channel);
 532
 533     /* Extended swizzle terms */
 534     if (component == SWIZZLE_ZERO) {
 535        return brw_imm_f(0.0F);
 536     }
 537     else if (component == SWIZZLE_ONE) {
 538        return brw_imm_f(1.0F);
 539     }
 540
 541     if (c->fp->use_const_buffer &&
 542         (src->File == PROGRAM_STATE_VAR ||
 543          src->File == PROGRAM_CONSTANT ||
 544          src->File == PROGRAM_UNIFORM)) {
 545        return get_src_reg_const(c, inst, srcRegIndex, component);
 546     }
 547     else {
 548        /* other type of source register */
 549        return get_reg(c, src->File, src->Index, component, nr,
 550                       src->Negate, src->Abs);
 551     }
 552 }
 553
 554
 555 /**
 556  * Same as \sa get_src_reg() but if the register is a literal, emit
 557  * a brw_reg encoding the literal.
 558  * Note that a brw instruction only allows one src operand to be a literal.
 559  * For instructions with more than one operand, only the second can be a
 560  * literal.  This means that we treat some literals as constants/uniforms
 561  * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
 562  *
 563  */
 564 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
 565                                       const struct prog_instruction *inst,
 566                                       GLuint srcRegIndex, GLuint channel)
 567 {
 568     const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
 569     if (src->File == PROGRAM_CONSTANT) {
 570        /* a literal */
 571        const int component = GET_SWZ(src->Swizzle, channel);
 572        const GLfloat *param =
 573           c->fp->program.Base.Parameters->ParameterValues[src->Index];
 574        GLfloat value = param[component];
 575        if (src->Negate & (1 << channel))
 576           value = -value;
 577        if (src->Abs)
 578           value = FABSF(value);
 579 #if 0
 580        printf("  form immed value %f for chan %d\n", value, channel);
 581 #endif
 582        return brw_imm_f(value);
 583     }
 584     else {
 585        return get_src_reg(c, inst, srcRegIndex, channel);
 586     }
 587 }
 588
 589
 590 /**
 591  * Subroutines are minimal support for resusable instruction sequences.
 592  * They are implemented as simply as possible to minimise overhead: there
 593  * is no explicit support for communication between the caller and callee
 594  * other than saving the return address in a temporary register, nor is
 595  * there any automatic local storage.  This implies that great care is
 596  * required before attempting reentrancy or any kind of nested
 597  * subroutine invocations.
 598  */
 599 static void invoke_subroutine( struct brw_wm_compile *c,
 600                                enum _subroutine subroutine,
 601                                void (*emit)( struct brw_wm_compile * ) )
 602 {
 603     struct brw_compile *p = &c->func;
 604
 605     assert( subroutine < BRW_WM_MAX_SUBROUTINE );
 606
 607     if( c->subroutines[ subroutine ] ) {
 608         /* subroutine previously emitted: reuse existing instructions */
 609
 610         int mark = mark_tmps( c );
 611         struct brw_reg return_address = retype( alloc_tmp( c ),
 612                                                 BRW_REGISTER_TYPE_UD );
 613         int here = p->nr_insn;
 614
 615         brw_push_insn_state(p);
 616         brw_set_mask_control(p, BRW_MASK_DISABLE);
 617         brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
 618
 619         brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
 620                  brw_imm_d( ( c->subroutines[ subroutine ] -
 621                               here - 1 ) << 4 ) );
 622         brw_pop_insn_state(p);
 623
 624         release_tmps( c, mark );
 625     } else {
 626         /* previously unused subroutine: emit, and mark for later reuse */
 627
 628         int mark = mark_tmps( c );
 629         struct brw_reg return_address = retype( alloc_tmp( c ),
 630                                                 BRW_REGISTER_TYPE_UD );
 631         struct brw_instruction *calc;
 632         int base = p->nr_insn;
 633
 634         brw_push_insn_state(p);
 635         brw_set_mask_control(p, BRW_MASK_DISABLE);
 636         calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
 637         brw_pop_insn_state(p);
 638
 639         c->subroutines[ subroutine ] = p->nr_insn;
 640
 641         emit( c );
 642
 643         brw_push_insn_state(p);
 644         brw_set_mask_control(p, BRW_MASK_DISABLE);
 645         brw_MOV( p, brw_ip_reg(), return_address );
 646         brw_pop_insn_state(p);
 647
 648         brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
 649
 650         release_tmps( c, mark );
 651     }
 652 }
 653
 654 static void emit_trunc( struct brw_wm_compile *c,
 655                         const struct prog_instruction *inst)
 656 {
 657     int i;
 658     struct brw_compile *p = &c->func;
 659     GLuint mask = inst->DstReg.WriteMask;
 660     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 661     for (i = 0; i < 4; i++) {
 662         if (mask & (1<<i)) {
 663             struct brw_reg src, dst;
 664             dst = get_dst_reg(c, inst, i);
 665             src = get_src_reg(c, inst, 0, i);
 666             brw_RNDZ(p, dst, src);
 667         }
 668     }
 669     brw_set_saturate(p, 0);
 670 }
 671
 672 static void emit_mov( struct brw_wm_compile *c,
 673                       const struct prog_instruction *inst)
 674 {
 675     int i;
 676     struct brw_compile *p = &c->func;
 677     GLuint mask = inst->DstReg.WriteMask;
 678     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 679     for (i = 0; i < 4; i++) {
 680         if (mask & (1<<i)) {
 681             struct brw_reg src, dst;
 682             dst = get_dst_reg(c, inst, i);
 683             /* XXX some moves from immediate value don't work reliably!!! */
 684             /*src = get_src_reg_imm(c, inst, 0, i);*/
 685             src = get_src_reg(c, inst, 0, i);
 686             brw_MOV(p, dst, src);
 687         }
 688     }
 689     brw_set_saturate(p, 0);
 690 }
 691
 692 static void emit_pixel_xy(struct brw_wm_compile *c,
 693                           const struct prog_instruction *inst)
 694 {
 695     struct brw_reg r1 = brw_vec1_grf(1, 0);
 696     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
 697
 698     struct brw_reg dst0, dst1;
 699     struct brw_compile *p = &c->func;
 700     GLuint mask = inst->DstReg.WriteMask;
 701
 702     dst0 = get_dst_reg(c, inst, 0);
 703     dst1 = get_dst_reg(c, inst, 1);
 704     /* Calculate pixel centers by adding 1 or 0 to each of the
 705      * micro-tile coordinates passed in r1.
 706      */
 707     if (mask & WRITEMASK_X) {
 708         brw_ADD(p,
 709                 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
 710                 stride(suboffset(r1_uw, 4), 2, 4, 0),
 711                 brw_imm_v(0x10101010));
 712     }
 713
 714     if (mask & WRITEMASK_Y) {
 715         brw_ADD(p,
 716                 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
 717                 stride(suboffset(r1_uw, 5), 2, 4, 0),
 718                 brw_imm_v(0x11001100));
 719     }
 720 }
 721
 722 static void emit_delta_xy(struct brw_wm_compile *c,
 723                           const struct prog_instruction *inst)
 724 {
 725     struct brw_reg r1 = brw_vec1_grf(1, 0);
 726     struct brw_reg dst0, dst1, src0, src1;
 727     struct brw_compile *p = &c->func;
 728     GLuint mask = inst->DstReg.WriteMask;
 729
 730     dst0 = get_dst_reg(c, inst, 0);
 731     dst1 = get_dst_reg(c, inst, 1);
 732     src0 = get_src_reg(c, inst, 0, 0);
 733     src1 = get_src_reg(c, inst, 0, 1);
 734     /* Calc delta X,Y by subtracting origin in r1 from the pixel
 735      * centers.
 736      */
 737     if (mask & WRITEMASK_X) {
 738         brw_ADD(p,
 739                 dst0,
 740                 retype(src0, BRW_REGISTER_TYPE_UW),
 741                 negate(r1));
 742     }
 743
 744     if (mask & WRITEMASK_Y) {
 745         brw_ADD(p,
 746                 dst1,
 747                 retype(src1, BRW_REGISTER_TYPE_UW),
 748                 negate(suboffset(r1,1)));
 749
 750     }
 751 }
 752
 753 static void fire_fb_write( struct brw_wm_compile *c,
 754                            GLuint base_reg,
 755                            GLuint nr,
 756                            GLuint target,
 757                            GLuint eot)
 758 {
 759     struct brw_compile *p = &c->func;
 760     /* Pass through control information:
 761      */
 762     /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
 763     {
 764         brw_push_insn_state(p);
 765         brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
 766         brw_MOV(p,
 767                 brw_message_reg(base_reg + 1),
 768                 brw_vec8_grf(1, 0));
 769         brw_pop_insn_state(p);
 770     }
 771     /* Send framebuffer write message: */
 772     brw_fb_WRITE(p,
 773             retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 774             base_reg,
 775             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 776             target,
 777             nr,
 778             0,
 779             eot);
 780 }
 781
 782 static void emit_fb_write(struct brw_wm_compile *c,
 783                           const struct prog_instruction *inst)
 784 {
 785     struct brw_compile *p = &c->func;
 786     int nr = 2;
 787     int channel;
 788     GLuint target, eot;
 789     struct brw_reg src0;
 790
 791     /* Reserve a space for AA - may not be needed:
 792      */
 793     if (c->key.aa_dest_stencil_reg)
 794         nr += 1;
 795
 796     brw_push_insn_state(p);
 797     for (channel = 0; channel < 4; channel++) {
 798         src0 = get_src_reg(c,  inst, 0, channel);
 799         /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 800         /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
 801         brw_MOV(p, brw_message_reg(nr + channel), src0);
 802     }
 803     /* skip over the regs populated above: */
 804     nr += 8;
 805     brw_pop_insn_state(p);
 806
 807     if (c->key.source_depth_to_render_target) {
 808        if (c->key.computes_depth) {
 809           src0 = get_src_reg(c, inst, 2, 2);
 810           brw_MOV(p, brw_message_reg(nr), src0);
 811        }
 812        else {
 813           src0 = get_src_reg(c, inst, 1, 1);
 814           brw_MOV(p, brw_message_reg(nr), src0);
 815        }
 816
 817        nr += 2;
 818     }
 819
 820     if (c->key.dest_depth_reg) {
 821         const GLuint comp = c->key.dest_depth_reg / 2;
 822         const GLuint off = c->key.dest_depth_reg % 2;
 823
 824         if (off != 0) {
 825             /* XXX this code needs review/testing */
 826             struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
 827             struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 828
 829             brw_push_insn_state(p);
 830             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 831
 832             brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
 833             /* 2nd half? */
 834             brw_MOV(p, brw_message_reg(nr+1), arg1_1);
 835             brw_pop_insn_state(p);
 836         }
 837         else
 838         {
 839             struct brw_reg src =  get_src_reg(c, inst, 1, 1);
 840             brw_MOV(p, brw_message_reg(nr), src);
 841         }
 842         nr += 2;
 843    }
 844
 845     target = inst->Aux >> 1;
 846     eot = inst->Aux & 1;
 847     fire_fb_write(c, 0, nr, target, eot);
 848 }
 849
 850 static void emit_pixel_w( struct brw_wm_compile *c,
 851                           const struct prog_instruction *inst)
 852 {
 853     struct brw_compile *p = &c->func;
 854     GLuint mask = inst->DstReg.WriteMask;
 855     if (mask & WRITEMASK_W) {
 856         struct brw_reg dst, src0, delta0, delta1;
 857         struct brw_reg interp3;
 858
 859         dst = get_dst_reg(c, inst, 3);
 860         src0 = get_src_reg(c, inst, 0, 0);
 861         delta0 = get_src_reg(c, inst, 1, 0);
 862         delta1 = get_src_reg(c, inst, 1, 1);
 863
 864         interp3 = brw_vec1_grf(src0.nr+1, 4);
 865         /* Calc 1/w - just linterp wpos[3] optimized by putting the
 866          * result straight into a message reg.
 867          */
 868         brw_LINE(p, brw_null_reg(), interp3, delta0);
 869         brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
 870
 871         /* Calc w */
 872         brw_math_16( p, dst,
 873                 BRW_MATH_FUNCTION_INV,
 874                 BRW_MATH_SATURATE_NONE,
 875                 2, brw_null_reg(),
 876                 BRW_MATH_PRECISION_FULL);
 877     }
 878 }
 879
 880 static void emit_linterp(struct brw_wm_compile *c,
 881                          const struct prog_instruction *inst)
 882 {
 883     struct brw_compile *p = &c->func;
 884     GLuint mask = inst->DstReg.WriteMask;
 885     struct brw_reg interp[4];
 886     struct brw_reg dst, delta0, delta1;
 887     struct brw_reg src0;
 888     GLuint nr, i;
 889
 890     src0 = get_src_reg(c, inst, 0, 0);
 891     delta0 = get_src_reg(c, inst, 1, 0);
 892     delta1 = get_src_reg(c, inst, 1, 1);
 893     nr = src0.nr;
 894
 895     interp[0] = brw_vec1_grf(nr, 0);
 896     interp[1] = brw_vec1_grf(nr, 4);
 897     interp[2] = brw_vec1_grf(nr+1, 0);
 898     interp[3] = brw_vec1_grf(nr+1, 4);
 899
 900     for(i = 0; i < 4; i++ ) {
 901         if (mask & (1<<i)) {
 902             dst = get_dst_reg(c, inst, i);
 903             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 904             brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 905         }
 906     }
 907 }
 908
 909 static void emit_cinterp(struct brw_wm_compile *c,
 910                          const struct prog_instruction *inst)
 911 {
 912     struct brw_compile *p = &c->func;
 913     GLuint mask = inst->DstReg.WriteMask;
 914
 915     struct brw_reg interp[4];
 916     struct brw_reg dst, src0;
 917     GLuint nr, i;
 918
 919     src0 = get_src_reg(c, inst, 0, 0);
 920     nr = src0.nr;
 921
 922     interp[0] = brw_vec1_grf(nr, 0);
 923     interp[1] = brw_vec1_grf(nr, 4);
 924     interp[2] = brw_vec1_grf(nr+1, 0);
 925     interp[3] = brw_vec1_grf(nr+1, 4);
 926
 927     for(i = 0; i < 4; i++ ) {
 928         if (mask & (1<<i)) {
 929             dst = get_dst_reg(c, inst, i);
 930             brw_MOV(p, dst, suboffset(interp[i],3));
 931         }
 932     }
 933 }
 934
 935 static void emit_pinterp(struct brw_wm_compile *c,
 936                          const struct prog_instruction *inst)
 937 {
 938     struct brw_compile *p = &c->func;
 939     GLuint mask = inst->DstReg.WriteMask;
 940
 941     struct brw_reg interp[4];
 942     struct brw_reg dst, delta0, delta1;
 943     struct brw_reg src0, w;
 944     GLuint nr, i;
 945
 946     src0 = get_src_reg(c, inst, 0, 0);
 947     delta0 = get_src_reg(c, inst, 1, 0);
 948     delta1 = get_src_reg(c, inst, 1, 1);
 949     w = get_src_reg(c, inst, 2, 3);
 950     nr = src0.nr;
 951
 952     interp[0] = brw_vec1_grf(nr, 0);
 953     interp[1] = brw_vec1_grf(nr, 4);
 954     interp[2] = brw_vec1_grf(nr+1, 0);
 955     interp[3] = brw_vec1_grf(nr+1, 4);
 956
 957     for(i = 0; i < 4; i++ ) {
 958         if (mask & (1<<i)) {
 959             dst = get_dst_reg(c, inst, i);
 960             brw_LINE(p, brw_null_reg(), interp[i], delta0);
 961             brw_MAC(p, dst, suboffset(interp[i],1),
 962                     delta1);
 963             brw_MUL(p, dst, dst, w);
 964         }
 965     }
 966 }
 967
 968 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 969 static void emit_frontfacing(struct brw_wm_compile *c,
 970                              const struct prog_instruction *inst)
 971 {
 972     struct brw_compile *p = &c->func;
 973     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 974     struct brw_reg dst;
 975     GLuint mask = inst->DstReg.WriteMask;
 976     int i;
 977
 978     for (i = 0; i < 4; i++) {
 979         if (mask & (1<<i)) {
 980             dst = get_dst_reg(c, inst, i);
 981             brw_MOV(p, dst, brw_imm_f(0.0));
 982         }
 983     }
 984
 985     /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 986      * us front face
 987      */
 988     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
 989     for (i = 0; i < 4; i++) {
 990         if (mask & (1<<i)) {
 991             dst = get_dst_reg(c, inst, i);
 992             brw_MOV(p, dst, brw_imm_f(1.0));
 993         }
 994     }
 995     brw_set_predicate_control_flag_value(p, 0xff);
 996 }
 997
 998 static void emit_xpd(struct brw_wm_compile *c,
 999                      const struct prog_instruction *inst)
1000 {
1001     int i;
1002     struct brw_compile *p = &c->func;
1003     GLuint mask = inst->DstReg.WriteMask;
1004     for (i = 0; i < 4; i++) {
1005         GLuint i2 = (i+2)%3;
1006         GLuint i1 = (i+1)%3;
1007         if (mask & (1<<i)) {
1008             struct brw_reg src0, src1, dst;
1009             dst = get_dst_reg(c, inst, i);
1010             src0 = negate(get_src_reg(c, inst, 0, i2));
1011             src1 = get_src_reg_imm(c, inst, 1, i1);
1012             brw_MUL(p, brw_null_reg(), src0, src1);
1013             src0 = get_src_reg(c, inst, 0, i1);
1014             src1 = get_src_reg_imm(c, inst, 1, i2);
1015             brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1016             brw_MAC(p, dst, src0, src1);
1017             brw_set_saturate(p, 0);
1018         }
1019     }
1020     brw_set_saturate(p, 0);
1021 }
1022
1023 static void emit_dp3(struct brw_wm_compile *c,
1024                      const struct prog_instruction *inst)
1025 {
1026     struct brw_reg src0[3], src1[3], dst;
1027     int i;
1028     struct brw_compile *p = &c->func;
1029     GLuint mask = inst->DstReg.WriteMask;
1030     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1031
1032     if (!(mask & WRITEMASK_XYZW))
1033         return;
1034
1035     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1036
1037     for (i = 0; i < 3; i++) {
1038         src0[i] = get_src_reg(c, inst, 0, i);
1039         src1[i] = get_src_reg_imm(c, inst, 1, i);
1040     }
1041
1042     dst = get_dst_reg(c, inst, dst_chan);
1043     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1044     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1045     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1046     brw_MAC(p, dst, src0[2], src1[2]);
1047     brw_set_saturate(p, 0);
1048 }
1049
1050 static void emit_dp4(struct brw_wm_compile *c,
1051                      const struct prog_instruction *inst)
1052 {
1053     struct brw_reg src0[4], src1[4], dst;
1054     int i;
1055     struct brw_compile *p = &c->func;
1056     GLuint mask = inst->DstReg.WriteMask;
1057     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1058
1059     if (!(mask & WRITEMASK_XYZW))
1060         return;
1061
1062     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1063
1064     for (i = 0; i < 4; i++) {
1065         src0[i] = get_src_reg(c, inst, 0, i);
1066         src1[i] = get_src_reg_imm(c, inst, 1, i);
1067     }
1068     dst = get_dst_reg(c, inst, dst_chan);
1069     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1070     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1071     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1072     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1073     brw_MAC(p, dst, src0[3], src1[3]);
1074     brw_set_saturate(p, 0);
1075 }
1076
1077 static void emit_dph(struct brw_wm_compile *c,
1078                      const struct prog_instruction *inst)
1079 {
1080     struct brw_reg src0[4], src1[4], dst;
1081     int i;
1082     struct brw_compile *p = &c->func;
1083     GLuint mask = inst->DstReg.WriteMask;
1084     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1085
1086     if (!(mask & WRITEMASK_XYZW))
1087         return;
1088
1089     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1090
1091     for (i = 0; i < 4; i++) {
1092         src0[i] = get_src_reg(c, inst, 0, i);
1093         src1[i] = get_src_reg_imm(c, inst, 1, i);
1094     }
1095     dst = get_dst_reg(c, inst, dst_chan);
1096     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1097     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1098     brw_MAC(p, dst, src0[2], src1[2]);
1099     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1100     brw_ADD(p, dst, dst, src1[3]);
1101     brw_set_saturate(p, 0);
1102 }
1103
1104 /**
1105  * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1106  * Note that the result of the function is smeared across the dest
1107  * register's X, Y, Z and W channels (subject to writemasking of course).
1108  */
1109 static void emit_math1(struct brw_wm_compile *c,
1110                        const struct prog_instruction *inst, GLuint func)
1111 {
1112     struct brw_compile *p = &c->func;
1113     struct brw_reg src0, dst;
1114     GLuint mask = inst->DstReg.WriteMask;
1115     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1116
1117     if (!(mask & WRITEMASK_XYZW))
1118         return;
1119
1120     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1121
1122     /* Get first component of source register */
1123     dst = get_dst_reg(c, inst, dst_chan);
1124     src0 = get_src_reg(c, inst, 0, 0);
1125
1126     brw_MOV(p, brw_message_reg(2), src0);
1127     brw_math(p,
1128              dst,
1129              func,
1130              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1131              2,
1132              brw_null_reg(),
1133              BRW_MATH_DATA_VECTOR,
1134              BRW_MATH_PRECISION_FULL);
1135 }
1136
1137 static void emit_rcp(struct brw_wm_compile *c,
1138                      const struct prog_instruction *inst)
1139 {
1140     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1141 }
1142
1143 static void emit_rsq(struct brw_wm_compile *c,
1144                      const struct prog_instruction *inst)
1145 {
1146     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1147 }
1148
1149 static void emit_sin(struct brw_wm_compile *c,
1150                      const struct prog_instruction *inst)
1151 {
1152     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1153 }
1154
1155 static void emit_cos(struct brw_wm_compile *c,
1156                      const struct prog_instruction *inst)
1157 {
1158     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1159 }
1160
1161 static void emit_ex2(struct brw_wm_compile *c,
1162                      const struct prog_instruction *inst)
1163 {
1164     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1165 }
1166
1167 static void emit_lg2(struct brw_wm_compile *c,
1168                      const struct prog_instruction *inst)
1169 {
1170     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1171 }
1172
1173 static void emit_add(struct brw_wm_compile *c,
1174                      const struct prog_instruction *inst)
1175 {
1176     struct brw_compile *p = &c->func;
1177     struct brw_reg src0, src1, dst;
1178     GLuint mask = inst->DstReg.WriteMask;
1179     int i;
1180     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1181     for (i = 0 ; i < 4; i++) {
1182         if (mask & (1<<i)) {
1183             dst = get_dst_reg(c, inst, i);
1184             src0 = get_src_reg(c, inst, 0, i);
1185             src1 = get_src_reg_imm(c, inst, 1, i);
1186             brw_ADD(p, dst, src0, src1);
1187         }
1188     }
1189     brw_set_saturate(p, 0);
1190 }
1191
1192 static void emit_arl(struct brw_wm_compile *c,
1193                      const struct prog_instruction *inst)
1194 {
1195     struct brw_compile *p = &c->func;
1196     struct brw_reg src0, addr_reg;
1197     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1198     addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1199                            BRW_ARF_ADDRESS, 0);
1200     src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1201     brw_MOV(p, addr_reg, src0);
1202     brw_set_saturate(p, 0);
1203 }
1204
1205
1206 static void emit_mul(struct brw_wm_compile *c,
1207                      const struct prog_instruction *inst)
1208 {
1209     struct brw_compile *p = &c->func;
1210     struct brw_reg src0, src1, dst;
1211     GLuint mask = inst->DstReg.WriteMask;
1212     int i;
1213     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1214     for (i = 0 ; i < 4; i++) {
1215         if (mask & (1<<i)) {
1216             dst = get_dst_reg(c, inst, i);
1217             src0 = get_src_reg(c, inst, 0, i);
1218             src1 = get_src_reg_imm(c, inst, 1, i);
1219             brw_MUL(p, dst, src0, src1);
1220         }
1221     }
1222     brw_set_saturate(p, 0);
1223 }
1224
1225 static void emit_frc(struct brw_wm_compile *c,
1226                      const struct prog_instruction *inst)
1227 {
1228     struct brw_compile *p = &c->func;
1229     struct brw_reg src0, dst;
1230     GLuint mask = inst->DstReg.WriteMask;
1231     int i;
1232     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1233     for (i = 0 ; i < 4; i++) {
1234         if (mask & (1<<i)) {
1235             dst = get_dst_reg(c, inst, i);
1236             src0 = get_src_reg_imm(c, inst, 0, i);
1237             brw_FRC(p, dst, src0);
1238         }
1239     }
1240     if (inst->SaturateMode != SATURATE_OFF)
1241         brw_set_saturate(p, 0);
1242 }
1243
1244 static void emit_flr(struct brw_wm_compile *c,
1245                      const struct prog_instruction *inst)
1246 {
1247     struct brw_compile *p = &c->func;
1248     struct brw_reg src0, dst;
1249     GLuint mask = inst->DstReg.WriteMask;
1250     int i;
1251     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1252     for (i = 0 ; i < 4; i++) {
1253         if (mask & (1<<i)) {
1254             dst = get_dst_reg(c, inst, i);
1255             src0 = get_src_reg_imm(c, inst, 0, i);
1256             brw_RNDD(p, dst, src0);
1257         }
1258     }
1259     brw_set_saturate(p, 0);
1260 }
1261
1262
1263 static void emit_min_max(struct brw_wm_compile *c,
1264                          const struct prog_instruction *inst)
1265 {
1266     struct brw_compile *p = &c->func;
1267     const GLuint mask = inst->DstReg.WriteMask;
1268     const int mark = mark_tmps(c);
1269     int i;
1270     brw_push_insn_state(p);
1271     for (i = 0; i < 4; i++) {
1272         if (mask & (1<<i)) {
1273             struct brw_reg real_dst = get_dst_reg(c, inst, i);
1274             struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1275             struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1276             struct brw_reg dst;
1277             /* if dst==src0 or dst==src1 we need to use a temp reg */
1278             GLboolean use_temp = brw_same_reg(dst, src0) ||
1279                                  brw_same_reg(dst, src1);
1280             if (use_temp)
1281                dst = alloc_tmp(c);
1282             else
1283                dst = real_dst;
1284
1285             /*
1286             printf("  Min/max: dst %d  src0 %d  src1 %d\n",
1287                    dst.nr, src0.nr, src1.nr);
1288             */
1289             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1290             brw_MOV(p, dst, src0);
1291             brw_set_saturate(p, 0);
1292
1293             if (inst->Opcode == OPCODE_MIN)
1294                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1295             else
1296                brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1297
1298             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1299             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1300             brw_MOV(p, dst, src1);
1301             brw_set_saturate(p, 0);
1302             brw_set_predicate_control_flag_value(p, 0xff);
1303             if (use_temp)
1304                brw_MOV(p, real_dst, dst);
1305         }
1306     }
1307     brw_pop_insn_state(p);
1308     release_tmps(c, mark);
1309 }
1310
1311 static void emit_pow(struct brw_wm_compile *c,
1312                      const struct prog_instruction *inst)
1313 {
1314     struct brw_compile *p = &c->func;
1315     struct brw_reg dst, src0, src1;
1316     GLuint mask = inst->DstReg.WriteMask;
1317     int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1318
1319     if (!(mask & WRITEMASK_XYZW))
1320         return;
1321
1322     assert(is_power_of_two(mask & WRITEMASK_XYZW));
1323
1324     dst = get_dst_reg(c, inst, dst_chan);
1325     src0 = get_src_reg_imm(c, inst, 0, 0);
1326     src1 = get_src_reg_imm(c, inst, 1, 0);
1327
1328     brw_MOV(p, brw_message_reg(2), src0);
1329     brw_MOV(p, brw_message_reg(3), src1);
1330
1331     brw_math(p,
1332             dst,
1333             BRW_MATH_FUNCTION_POW,
1334             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1335             2,
1336             brw_null_reg(),
1337             BRW_MATH_DATA_VECTOR,
1338             BRW_MATH_PRECISION_FULL);
1339 }
1340
1341 static void emit_lrp(struct brw_wm_compile *c,
1342                      const struct prog_instruction *inst)
1343 {
1344     struct brw_compile *p = &c->func;
1345     GLuint mask = inst->DstReg.WriteMask;
1346     struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1347     int i;
1348     int mark = mark_tmps(c);
1349     for (i = 0; i < 4; i++) {
1350         if (mask & (1<<i)) {
1351             dst = get_dst_reg(c, inst, i);
1352             src0 = get_src_reg(c, inst, 0, i);
1353
1354             src1 = get_src_reg_imm(c, inst, 1, i);
1355
1356             if (src1.nr == dst.nr) {
1357                 tmp1 = alloc_tmp(c);
1358                 brw_MOV(p, tmp1, src1);
1359             } else
1360                 tmp1 = src1;
1361
1362             src2 = get_src_reg(c, inst, 2, i);
1363             if (src2.nr == dst.nr) {
1364                 tmp2 = alloc_tmp(c);
1365                 brw_MOV(p, tmp2, src2);
1366             } else
1367                 tmp2 = src2;
1368
1369             brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1370             brw_MUL(p, brw_null_reg(), dst, tmp2);
1371             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1372             brw_MAC(p, dst, src0, tmp1);
1373             brw_set_saturate(p, 0);
1374         }
1375         release_tmps(c, mark);
1376     }
1377 }
1378
1379 /**
1380  * For GLSL shaders, this KIL will be unconditional.
1381  * It may be contained inside an IF/ENDIF structure of course.
1382  */
1383 static void emit_kil(struct brw_wm_compile *c)
1384 {
1385     struct brw_compile *p = &c->func;
1386     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1387     brw_push_insn_state(p);
1388     brw_set_mask_control(p, BRW_MASK_DISABLE);
1389     brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1390     brw_AND(p, depth, c->emit_mask_reg, depth);
1391     brw_pop_insn_state(p);
1392 }
1393
1394 static void emit_mad(struct brw_wm_compile *c,
1395                      const struct prog_instruction *inst)
1396 {
1397     struct brw_compile *p = &c->func;
1398     GLuint mask = inst->DstReg.WriteMask;
1399     struct brw_reg dst, src0, src1, src2;
1400     int i;
1401
1402     for (i = 0; i < 4; i++) {
1403         if (mask & (1<<i)) {
1404             dst = get_dst_reg(c, inst, i);
1405             src0 = get_src_reg(c, inst, 0, i);
1406             src1 = get_src_reg_imm(c, inst, 1, i);
1407             src2 = get_src_reg_imm(c, inst, 2, i);
1408             brw_MUL(p, dst, src0, src1);
1409
1410             brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1411             brw_ADD(p, dst, dst, src2);
1412             brw_set_saturate(p, 0);
1413         }
1414     }
1415 }
1416
1417 static void emit_sop(struct brw_wm_compile *c,
1418                      const struct prog_instruction *inst, GLuint cond)
1419 {
1420     struct brw_compile *p = &c->func;
1421     GLuint mask = inst->DstReg.WriteMask;
1422     struct brw_reg dst, src0, src1;
1423     int i;
1424
1425     for (i = 0; i < 4; i++) {
1426         if (mask & (1<<i)) {
1427             dst = get_dst_reg(c, inst, i);
1428             src0 = get_src_reg(c, inst, 0, i);
1429             src1 = get_src_reg_imm(c, inst, 1, i);
1430             brw_push_insn_state(p);
1431             brw_CMP(p, brw_null_reg(), cond, src0, src1);
1432             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1433             brw_MOV(p, dst, brw_imm_f(0.0));
1434             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1435             brw_MOV(p, dst, brw_imm_f(1.0));
1436             brw_pop_insn_state(p);
1437         }
1438     }
1439 }
1440
1441 static void emit_slt(struct brw_wm_compile *c,
1442                      const struct prog_instruction *inst)
1443 {
1444     emit_sop(c, inst, BRW_CONDITIONAL_L);
1445 }
1446
1447 static void emit_sle(struct brw_wm_compile *c,
1448                      const struct prog_instruction *inst)
1449 {
1450     emit_sop(c, inst, BRW_CONDITIONAL_LE);
1451 }
1452
1453 static void emit_sgt(struct brw_wm_compile *c,
1454                      const struct prog_instruction *inst)
1455 {
1456     emit_sop(c, inst, BRW_CONDITIONAL_G);
1457 }
1458
1459 static void emit_sge(struct brw_wm_compile *c,
1460                      const struct prog_instruction *inst)
1461 {
1462     emit_sop(c, inst, BRW_CONDITIONAL_GE);
1463 }
1464
1465 static void emit_seq(struct brw_wm_compile *c,
1466                      const struct prog_instruction *inst)
1467 {
1468     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1469 }
1470
1471 static void emit_sne(struct brw_wm_compile *c,
1472                      const struct prog_instruction *inst)
1473 {
1474     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1475 }
1476
1477 static void emit_ddx(struct brw_wm_compile *c,
1478                      const struct prog_instruction *inst)
1479 {
1480     struct brw_compile *p = &c->func;
1481     GLuint mask = inst->DstReg.WriteMask;
1482     struct brw_reg interp[4];
1483     struct brw_reg dst;
1484     struct brw_reg src0, w;
1485     GLuint nr, i;
1486     src0 = get_src_reg(c, inst, 0, 0);
1487     w = get_src_reg(c, inst, 1, 3);
1488     nr = src0.nr;
1489     interp[0] = brw_vec1_grf(nr, 0);
1490     interp[1] = brw_vec1_grf(nr, 4);
1491     interp[2] = brw_vec1_grf(nr+1, 0);
1492     interp[3] = brw_vec1_grf(nr+1, 4);
1493     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1494     for(i = 0; i < 4; i++ ) {
1495         if (mask & (1<<i)) {
1496             dst = get_dst_reg(c, inst, i);
1497             brw_MOV(p, dst, interp[i]);
1498             brw_MUL(p, dst, dst, w);
1499         }
1500     }
1501     brw_set_saturate(p, 0);
1502 }
1503
1504 static void emit_ddy(struct brw_wm_compile *c,
1505                      const struct prog_instruction *inst)
1506 {
1507     struct brw_compile *p = &c->func;
1508     GLuint mask = inst->DstReg.WriteMask;
1509     struct brw_reg interp[4];
1510     struct brw_reg dst;
1511     struct brw_reg src0, w;
1512     GLuint nr, i;
1513
1514     src0 = get_src_reg(c, inst, 0, 0);
1515     nr = src0.nr;
1516     w = get_src_reg(c, inst, 1, 3);
1517     interp[0] = brw_vec1_grf(nr, 0);
1518     interp[1] = brw_vec1_grf(nr, 4);
1519     interp[2] = brw_vec1_grf(nr+1, 0);
1520     interp[3] = brw_vec1_grf(nr+1, 4);
1521     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1522     for(i = 0; i < 4; i++ ) {
1523         if (mask & (1<<i)) {
1524             dst = get_dst_reg(c, inst, i);
1525             brw_MOV(p, dst, suboffset(interp[i], 1));
1526             brw_MUL(p, dst, dst, w);
1527         }
1528     }
1529     brw_set_saturate(p, 0);
1530 }
1531
1532 static INLINE struct brw_reg high_words( struct brw_reg reg )
1533 {
1534     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1535                    0, 8, 2 );
1536 }
1537
1538 static INLINE struct brw_reg low_words( struct brw_reg reg )
1539 {
1540     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1541 }
1542
1543 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1544 {
1545     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1546 }
1547
1548 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1549 {
1550     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1551                    0, 16, 2 );
1552 }
1553
1554 /* One-, two- and three-dimensional Perlin noise, similar to the description
1555    in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1556 static void noise1_sub( struct brw_wm_compile *c ) {
1557
1558     struct brw_compile *p = &c->func;
1559     struct brw_reg param,
1560         x0, x1, /* gradients at each end */
1561         t, tmp[ 2 ], /* float temporaries */
1562         itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1563     int i;
1564     int mark = mark_tmps( c );
1565
1566     x0 = alloc_tmp( c );
1567     x1 = alloc_tmp( c );
1568     t = alloc_tmp( c );
1569     tmp[ 0 ] = alloc_tmp( c );
1570     tmp[ 1 ] = alloc_tmp( c );
1571     itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1572     itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1573     itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1574     itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1575     itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1576
1577     param = lookup_tmp( c, mark - 2 );
1578
1579     brw_set_access_mode( p, BRW_ALIGN_1 );
1580
1581     brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1582
1583     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1584        be hashed.  Also compute the remainder (offset within the unit
1585        length), interleaved to reduce register dependency penalties. */
1586     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1587     brw_FRC( p, param, param );
1588     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1589     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1590     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1591
1592     /* We're now ready to perform the hashing.  The two hashes are
1593        interleaved for performance.  The hash function used is
1594        designed to rapidly achieve avalanche and require only 32x16
1595        bit multiplication, and 16-bit swizzles (which we get for
1596        free).  We can't use immediate operands in the multiplies,
1597        because immediates are permitted only in src1 and the 16-bit
1598        factor is permitted only in src0. */
1599     for( i = 0; i < 2; i++ )
1600         brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1601     for( i = 0; i < 2; i++ )
1602        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1603                 high_words( itmp[ i ] ) );
1604     for( i = 0; i < 2; i++ )
1605         brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1606     for( i = 0; i < 2; i++ )
1607        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1608                 high_words( itmp[ i ] ) );
1609     for( i = 0; i < 2; i++ )
1610         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1611     for( i = 0; i < 2; i++ )
1612        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1613                 high_words( itmp[ i ] ) );
1614
1615     /* Now we want to initialise the two gradients based on the
1616        hashes.  Format conversion from signed integer to float leaves
1617        everything scaled too high by a factor of pow( 2, 31 ), but
1618        we correct for that right at the end. */
1619     brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1620     brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1621     brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1622
1623     brw_MUL( p, x0, x0, param );
1624     brw_MUL( p, x1, x1, t );
1625
1626     /* We interpolate between the gradients using the polynomial
1627        6t^5 - 15t^4 + 10t^3 (Perlin). */
1628     brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1629     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1630     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1631     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1632     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1633     brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1634                                            pipeline */
1635     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1636     brw_MUL( p, param, tmp[ 0 ], param );
1637     brw_MUL( p, x1, x1, param );
1638     brw_ADD( p, x0, x0, x1 );
1639     /* scale by pow( 2, -30 ), to compensate for the format conversion
1640        above and an extra factor of 2 so that a single gradient covers
1641        the [-1,1] range */
1642     brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1643
1644     release_tmps( c, mark );
1645 }
1646
1647 static void emit_noise1( struct brw_wm_compile *c,
1648                          const struct prog_instruction *inst )
1649 {
1650     struct brw_compile *p = &c->func;
1651     struct brw_reg src, param, dst;
1652     GLuint mask = inst->DstReg.WriteMask;
1653     int i;
1654     int mark = mark_tmps( c );
1655
1656     assert( mark == 0 );
1657
1658     src = get_src_reg( c, inst, 0, 0 );
1659
1660     param = alloc_tmp( c );
1661
1662     brw_MOV( p, param, src );
1663
1664     invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1665
1666     /* Fill in the result: */
1667     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1668     for (i = 0 ; i < 4; i++) {
1669         if (mask & (1<<i)) {
1670             dst = get_dst_reg(c, inst, i);
1671             brw_MOV( p, dst, param );
1672         }
1673     }
1674     if( inst->SaturateMode == SATURATE_ZERO_ONE )
1675         brw_set_saturate( p, 0 );
1676
1677     release_tmps( c, mark );
1678 }
1679
1680 static void noise2_sub( struct brw_wm_compile *c ) {
1681
1682     struct brw_compile *p = &c->func;
1683     struct brw_reg param0, param1,
1684         x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1685         t, tmp[ 4 ], /* float temporaries */
1686         itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1687     int i;
1688     int mark = mark_tmps( c );
1689
1690     x0y0 = alloc_tmp( c );
1691     x0y1 = alloc_tmp( c );
1692     x1y0 = alloc_tmp( c );
1693     x1y1 = alloc_tmp( c );
1694     t = alloc_tmp( c );
1695     for( i = 0; i < 4; i++ ) {
1696         tmp[ i ] = alloc_tmp( c );
1697         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1698     }
1699     itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1700     itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1701     itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1702
1703     param0 = lookup_tmp( c, mark - 3 );
1704     param1 = lookup_tmp( c, mark - 2 );
1705
1706     brw_set_access_mode( p, BRW_ALIGN_1 );
1707
1708     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1709        be hashed.  Also compute the remainders (offsets within the unit
1710        square), interleaved to reduce register dependency penalties. */
1711     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1712     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1713     brw_FRC( p, param0, param0 );
1714     brw_FRC( p, param1, param1 );
1715     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1716     brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1717              low_words( itmp[ 1 ] ) );
1718     brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1719     brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1720     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1721     brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1722     brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1723
1724     /* We're now ready to perform the hashing.  The four hashes are
1725        interleaved for performance.  The hash function used is
1726        designed to rapidly achieve avalanche and require only 32x16
1727        bit multiplication, and 16-bit swizzles (which we get for
1728        free).  We can't use immediate operands in the multiplies,
1729        because immediates are permitted only in src1 and the 16-bit
1730        factor is permitted only in src0. */
1731     for( i = 0; i < 4; i++ )
1732         brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1733     for( i = 0; i < 4; i++ )
1734         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1735                  high_words( itmp[ i ] ) );
1736     for( i = 0; i < 4; i++ )
1737         brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1738     for( i = 0; i < 4; i++ )
1739         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1740                  high_words( itmp[ i ] ) );
1741     for( i = 0; i < 4; i++ )
1742         brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1743     for( i = 0; i < 4; i++ )
1744         brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1745                  high_words( itmp[ i ] ) );
1746
1747     /* Now we want to initialise the four gradients based on the
1748        hashes.  Format conversion from signed integer to float leaves
1749        everything scaled too high by a factor of pow( 2, 15 ), but
1750        we correct for that right at the end. */
1751     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1752     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1753     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1754     brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1755     brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1756
1757     brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1758     brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1759     brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1760     brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1761
1762     brw_MUL( p, x1y0, x1y0, t );
1763     brw_MUL( p, x1y1, x1y1, t );
1764     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1765     brw_MUL( p, x0y0, x0y0, param0 );
1766     brw_MUL( p, x0y1, x0y1, param0 );
1767
1768     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1769     brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1770     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1771     brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1772
1773     brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1774     brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1775     brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1776     brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1777
1778     /* We interpolate between the gradients using the polynomial
1779        6t^5 - 15t^4 + 10t^3 (Perlin). */
1780     brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1781     brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1782     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1783     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1784     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1785     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1786     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1787                                                  pipeline */
1788     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1789     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1790     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1791     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1792     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1793                                                  pipeline */
1794     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1795     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1796     brw_MUL( p, param0, tmp[ 0 ], param0 );
1797     brw_MUL( p, param1, tmp[ 1 ], param1 );
1798
1799     /* Here we interpolate in the y dimension... */
1800     brw_MUL( p, x0y1, x0y1, param1 );
1801     brw_MUL( p, x1y1, x1y1, param1 );
1802     brw_ADD( p, x0y0, x0y0, x0y1 );
1803     brw_ADD( p, x1y0, x1y0, x1y1 );
1804
1805     /* And now in x.  There are horrible register dependencies here,
1806        but we have nothing else to do. */
1807     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1808     brw_MUL( p, x1y0, x1y0, param0 );
1809     brw_ADD( p, x0y0, x0y0, x1y0 );
1810
1811     /* scale by pow( 2, -15 ), as described above */
1812     brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1813
1814     release_tmps( c, mark );
1815 }
1816
1817 static void emit_noise2( struct brw_wm_compile *c,
1818                          const struct prog_instruction *inst )
1819 {
1820     /* Generates code for a two-argument noise instruction: components 0
1821        and 1 of source operand 0 are copied into two fresh temporaries,
1822        the noise2 subroutine is invoked, and the first of those
1823        temporaries is then copied to each write-enabled channel. */
1824     struct brw_compile *p = &c->func;
1825     const int want_sat = ( inst->SaturateMode == SATURATE_ZERO_ONE );
1826     struct brw_reg arg[ 2 ], staged[ 2 ];
1827     int k;
1828     int mark = mark_tmps( c );
1829
1830     assert( mark == 0 );
1831
1832     /* Fetch both sources first, then allocate, then copy. */
1833     for( k = 0; k < 2; k++ )
1834         arg[ k ] = get_src_reg( c, inst, 0, k );
1835     for( k = 0; k < 2; k++ )
1836         staged[ k ] = alloc_tmp( c );
1837     for( k = 0; k < 2; k++ )
1838         brw_MOV( p, staged[ k ], arg[ k ] );
1839
1840     invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1841
1842     /* Write staged[ 0 ] to every enabled channel, saturating if asked. */
1843     brw_set_saturate( p, want_sat );
1844     for( k = 0; k < 4; k++ ) {
1845         if( inst->DstReg.WriteMask & ( 1 << k ) )
1846             brw_MOV( p, get_dst_reg( c, inst, k ), staged[ 0 ] );
1847     }
1848     if( want_sat )
1849         brw_set_saturate( p, 0 );
1850     release_tmps( c, mark );
1851 }
1852
1853 /**
1854  * Emit the body of the 3D noise subroutine.  This case is much like the one-
1855  * and two-dimensional versions above, but since the number of corners is rapidly
1856  * growing we now pack 16 16-bit hashes into each register to extract more parallelism from the EUs.
1857  */
1858 static void noise3_sub( struct brw_wm_compile *c ) {
1859     /* Inputs: the temporaries at mark-4, mark-3 and mark-2; the result is written over param0. */
1860     struct brw_compile *p = &c->func;
1861     struct brw_reg param0, param1, param2,
1862         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1863         xi, yi, zi, /* interpolation coefficients */
1864         t, tmp[ 8 ], /* float temporaries */
1865         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1866         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1867     int i;
1868     int mark = mark_tmps( c );
1869     /* Allocate all scratch registers up front; release_tmps( c, mark ) is called at the end. */
1870     x0y0 = alloc_tmp( c );
1871     x0y1 = alloc_tmp( c );
1872     x1y0 = alloc_tmp( c );
1873     x1y1 = alloc_tmp( c );
1874     xi = alloc_tmp( c );
1875     yi = alloc_tmp( c );
1876     zi = alloc_tmp( c );
1877     t = alloc_tmp( c );
1878     for( i = 0; i < 8; i++ ) {
1879         tmp[ i ] = alloc_tmp( c );
1880         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); /* same register, viewed as UD */
1881         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); /* same register number, 16 unsigned words */
1882     }
1883     /* Presumably the three argument temporaries set up by emit_noise3 -- TODO confirm the offsets. */
1884     param0 = lookup_tmp( c, mark - 4 );
1885     param1 = lookup_tmp( c, mark - 3 );
1886     param2 = lookup_tmp( c, mark - 2 );
1887
1888     brw_set_access_mode( p, BRW_ALIGN_1 );
1889
1890     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1891        be hashed.  Also compute the remainders (offsets within the unit
1892        cube), interleaved to reduce register dependency penalties. */
1893     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); /* integral parts, as D */
1894     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1895     brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1896     brw_FRC( p, param0, param0 ); /* from here on param0..2 hold the remainders */
1897     brw_FRC( p, param1, param1 );
1898     brw_FRC( p, param2, param2 );
1899     /* Since we now have only 16 bits of precision in the hash, we must
1900        be more careful about thorough mixing to maintain entropy as we
1901        squash the input vector into a small scalar. */
1902     brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1903     brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1904     brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1905              brw_imm_uw( 0x9B93 ) );
1906     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1907              brw_imm_uw( 0xBC8F ) );
1908     /* itmp[ 0 ]: low word = mixed seed; high word = that seed plus the multiplier used for param0. */
1909     /* Temporarily disable the execution mask while we work with ExecSize=16
1910        channels (the mask is set for ExecSize=8 and is probably incorrect).
1911        Although this might cause execution of unwanted channels, the code
1912        writes only to temporary registers and has no side effects, so
1913        disabling the mask is harmless. */
1914     brw_push_insn_state( p );
1915     brw_set_mask_control( p, BRW_MASK_DISABLE );
1916     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); /* + multiplier used for param1 */
1917     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); /* + multiplier used for param2 */
1918     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); /* + both of the above */
1919
1920     /* We're now ready to perform the hashing.  The eight hashes are
1921        interleaved for performance.  The hash function used is
1922        designed to rapidly achieve avalanche and require only 16x16
1923        bit multiplication, and 8-bit swizzles (which we get for
1924        free). */
1925     for( i = 0; i < 4; i++ )
1926         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1927     for( i = 0; i < 4; i++ )
1928         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1929                  odd_bytes( wtmp[ i ] ) );
1930     for( i = 0; i < 4; i++ )
1931         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1932     for( i = 0; i < 4; i++ )
1933         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1934                  odd_bytes( wtmp[ i ] ) );
1935     brw_pop_insn_state( p ); /* matches the push before the mask was disabled */
1936
1937     /* Now we want to initialise the four rear gradients based on the
1938        hashes.  Format conversion from signed integer to float leaves
1939        everything scaled too high by a factor of pow( 2, 15 ), but
1940        we correct for that right at the end. */
1941     /* x component (rear face uses the hashes in tmp[ 0 ] and tmp[ 1 ]) */
1942     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); /* t = param0 - 1, multiplied into x1y0/x1y1 below */
1943     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1944     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1945     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1946     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1947     /* Shift each packed hash left by 5 so the y-component MOVs below read a different value. */
1948     brw_push_insn_state( p );
1949     brw_set_mask_control( p, BRW_MASK_DISABLE );
1950     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1951     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1952     brw_pop_insn_state( p );
1953
1954     brw_MUL( p, x1y0, x1y0, t );
1955     brw_MUL( p, x1y1, x1y1, t );
1956     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = param1 - 1 for the y component */
1957     brw_MUL( p, x0y0, x0y0, param0 );
1958     brw_MUL( p, x0y1, x0y1, param0 );
1959
1960     /* y component */
1961     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); /* added to x0y1 below */
1962     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); /* added to x1y1 below */
1963     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); /* added to x0y0 below */
1964     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); /* added to x1y0 below */
1965     /* Shift the hashes again, this time ahead of the z-component MOVs. */
1966     brw_push_insn_state( p );
1967     brw_set_mask_control( p, BRW_MASK_DISABLE );
1968     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1969     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1970     brw_pop_insn_state( p );
1971
1972     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1973     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1974     brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); /* t = param0 - 1 again; next read by the front-face x component */
1975     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1976     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1977
1978     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1979     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1980     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1981     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1982
1983     /* z component (all four rear gradients are weighted by param2 itself) */
1984     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1985     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1986     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1987     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1988
1989     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1990     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1991     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1992     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1993
1994     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1995     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1996     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1997     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1998
1999     /* We interpolate between the gradients using the polynomial
2000        6t^5 - 15t^4 + 10t^3 (Perlin). */
2001     brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) ); /* 6p */
2002     brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2003     brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2004     brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) ); /* 6p - 15 */
2005     brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2006     brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2007     brw_MUL( p, xi, xi, param0 ); /* 6p^2 - 15p */
2008     brw_MUL( p, yi, yi, param1 );
2009     brw_MUL( p, zi, zi, param2 );
2010     brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) ); /* 6p^2 - 15p + 10 */
2011     brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2012     brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2013     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work: difference used by the y interpolation */
2014     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work: difference used by the y interpolation */
2015     brw_MUL( p, xi, xi, param0 ); /* three more multiplies by p give 6p^5 - 15p^4 + 10p^3 */
2016     brw_MUL( p, yi, yi, param1 );
2017     brw_MUL( p, zi, zi, param2 );
2018     brw_MUL( p, xi, xi, param0 );
2019     brw_MUL( p, yi, yi, param1 );
2020     brw_MUL( p, zi, zi, param2 );
2021     brw_MUL( p, xi, xi, param0 );
2022     brw_MUL( p, yi, yi, param1 );
2023     brw_MUL( p, zi, zi, param2 );
2024
2025     /* Here we interpolate in the y dimension... */
2026     brw_MUL( p, x0y1, x0y1, yi );
2027     brw_MUL( p, x1y1, x1y1, yi );
2028     brw_ADD( p, x0y0, x0y0, x0y1 );
2029     brw_ADD( p, x1y0, x1y0, x1y1 );
2030
2031     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2032     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2033     brw_MUL( p, x1y0, x1y0, xi );
2034     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); /* overwrites the hash in tmp[ 0 ], which is not read again */
2035
2036     /* Now do the same thing for the front four gradients... */
2037     /* x component (front face uses the hashes in tmp[ 2 ] and tmp[ 3 ]) */
2038     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2039     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2040     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2041     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2042
2043     brw_push_insn_state( p );
2044     brw_set_mask_control( p, BRW_MASK_DISABLE );
2045     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2046     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2047     brw_pop_insn_state( p );
2048
2049     brw_MUL( p, x1y0, x1y0, t ); /* t is still param0 - 1, set after the rear y component */
2050     brw_MUL( p, x1y1, x1y1, t );
2051     brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* t = param1 - 1 for the y component */
2052     brw_MUL( p, x0y0, x0y0, param0 );
2053     brw_MUL( p, x0y1, x0y1, param0 );
2054
2055     /* y component */
2056     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2057     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2058     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2059     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2060
2061     brw_push_insn_state( p );
2062     brw_set_mask_control( p, BRW_MASK_DISABLE );
2063     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2064     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2065     brw_pop_insn_state( p );
2066
2067     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2068     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2069     brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); /* t = param2 - 1, the front-face z weight */
2070     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2071     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2072
2073     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2074     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2075     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2076     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2077
2078     /* z component (front gradients are weighted by t = param2 - 1) */
2079     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2080     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2081     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2082     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2083
2084     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2085     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2086     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2087     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2088
2089     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2090     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2091     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2092     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2093
2094     /* The interpolation coefficients are still around from last time, so
2095        again interpolate in the y dimension... */
2096     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2097     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2098     brw_MUL( p, x0y1, x0y1, yi );
2099     brw_MUL( p, x1y1, x1y1, yi );
2100     brw_ADD( p, x0y0, x0y0, x0y1 );
2101     brw_ADD( p, x1y0, x1y0, x1y1 );
2102
2103     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2104        time put the front face in tmp[ 1 ] and we're nearly there... */
2105     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2106     brw_MUL( p, x1y0, x1y0, xi );
2107     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2108
2109     /* The final interpolation, in the z dimension: */
2110     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2111     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2112     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2113
2114     /* scale by pow( 2, -15 ), as described above */
2115     brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); /* the result is returned in param0 */
2116
2117     release_tmps( c, mark ); /* presumably frees the temporaries allocated since mark_tmps() */
2118 }
2119
2120 static void emit_noise3( struct brw_wm_compile *c,
2121                          const struct prog_instruction *inst )
2122 {
2123     /* Generates code for a three-argument noise instruction: components
2124        0, 1 and 2 of source operand 0 are copied into three fresh
2125        temporaries, the noise3 subroutine is invoked, and the first of
2126        those temporaries (which noise3_sub overwrites with its result)
2127        is then copied to each write-enabled destination channel. */
2128     struct brw_compile *p = &c->func;
2129     const int want_sat = ( inst->SaturateMode == SATURATE_ZERO_ONE );
2130     struct brw_reg arg[ 3 ], staged[ 3 ];
2131     int k;
2132     int mark = mark_tmps( c );
2133
2134     /* noise3_sub looks its arguments up relative to the temporary mark. */
2135     assert( mark == 0 );
2136
2137     /* Fetch all sources first, then allocate, then copy. */
2138     for( k = 0; k < 3; k++ )
2139         arg[ k ] = get_src_reg( c, inst, 0, k );
2140     for( k = 0; k < 3; k++ )
2141         staged[ k ] = alloc_tmp( c );
2142     for( k = 0; k < 3; k++ )
2143         brw_MOV( p, staged[ k ], arg[ k ] );
2144
2145     invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2146
2147     /* Write staged[ 0 ] to every enabled channel, saturating if asked. */
2148     brw_set_saturate( p, want_sat );
2149     for( k = 0; k < 4; k++ ) {
2150         if( inst->DstReg.WriteMask & ( 1 << k ) )
2151             brw_MOV( p, get_dst_reg( c, inst, k ), staged[ 0 ] );
2152     }
2153     if( want_sat )
2154         brw_set_saturate( p, 0 );
2155
2156     release_tmps( c, mark );
2157 }
2158
2159 /**
2160  * For the four-dimensional case, the little micro-optimisation benefits
2161  * we obtain by unrolling all the loops aren't worth the massive bloat it
2162  * now causes.  Instead, we loop twice around performing a similar operation
2163  * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2164  * code to glue it all together.
2165  */
2166 static void noise4_sub( struct brw_wm_compile *c )
2167 {
2168     struct brw_compile *p = &c->func;
2169     struct brw_reg param[ 4 ],
2170         x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2171         w0, /* noise for the w=0 cube */
2172         floors[ 2 ], /* integer coordinates of base corner of hypercube */
2173         interp[ 4 ], /* interpolation coefficients */
2174         t, tmp[ 8 ], /* float temporaries */
2175         itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2176         wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2177     int i, j;
2178     int mark = mark_tmps( c );
2179     GLuint loop, origin;
2180
2181     x0y0 = alloc_tmp( c );
2182     x0y1 = alloc_tmp( c );
2183     x1y0 = alloc_tmp( c );
2184     x1y1 = alloc_tmp( c );
2185     t = alloc_tmp( c );
2186     w0 = alloc_tmp( c );
2187     floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2188     floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2189
2190     for( i = 0; i < 4; i++ ) {
2191         param[ i ] = lookup_tmp( c, mark - 5 + i );
2192         interp[ i ] = alloc_tmp( c );
2193     }
2194
2195     for( i = 0; i < 8; i++ ) {
2196         tmp[ i ] = alloc_tmp( c );
2197         itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2198         wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2199     }
2200
2201     brw_set_access_mode( p, BRW_ALIGN_1 );
2202
2203     /* We only want 16 bits of precision from the integral part of each
2204        co-ordinate, but unfortunately the RNDD semantics would saturate
2205        at 16 bits if we performed the operation directly to a 16-bit
2206        destination.  Therefore, we round to 32-bit temporaries where
2207        appropriate, and then store only the lower 16 bits. */
2208     brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2209     brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2210     brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2211     brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2212     brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2213     brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2214
2215     /* Modify the flag register here, because the side effect is useful
2216        later (see below).  We know for certain that all flags will be
2217        cleared, since the FRC instruction cannot possibly generate
2218        negative results.  Even for exceptional inputs (infinities, denormals,
2219        NaNs), the architecture guarantees that the L conditional is false. */
2220     brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2221     brw_FRC( p, param[ 0 ], param[ 0 ] );
2222     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2223     for( i = 1; i < 4; i++ )
2224         brw_FRC( p, param[ i ], param[ i ] );
2225
2226     /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2227        of all. */
2228     for( i = 0; i < 4; i++ )
2229         brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2230     for( i = 0; i < 4; i++ )
2231         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2232     for( i = 0; i < 4; i++ )
2233         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2234     for( i = 0; i < 4; i++ )
2235         brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2236     for( j = 0; j < 3; j++ )
2237         for( i = 0; i < 4; i++ )
2238             brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2239
2240     /* Mark the current address, as it will be a jump destination.  The
2241        following code will be executed twice: first, with the flag
2242        register clear indicating the w=0 case, and second with flags
2243        set for w=1. */
2244     loop = p->nr_insn;
2245
2246     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2247        be hashed.  Since we have only 16 bits of precision in the hash, we
2248        must be careful about thorough mixing to maintain entropy as we
2249        squash the input vector into a small scalar. */
2250     brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2251              brw_imm_uw( 0xBC8F ) );
2252     brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2253              brw_imm_uw( 0xD0BD ) );
2254     brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2255              brw_imm_uw( 0x9B93 ) );
2256     brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2257              brw_imm_uw( 0xA359 ) );
2258     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2259              brw_imm_uw( 0xBC8F ) );
2260
2261     /* Temporarily disable the execution mask while we work with ExecSize=16
2262        channels (the mask is set for ExecSize=8 and is probably incorrect).
2263        Although this might cause execution of unwanted channels, the code
2264        writes only to temporary registers and has no side effects, so
2265        disabling the mask is harmless. */
2266     brw_push_insn_state( p );
2267     brw_set_mask_control( p, BRW_MASK_DISABLE );
2268     brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2269     brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2270     brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2271
2272     /* We're now ready to perform the hashing.  The eight hashes are
2273        interleaved for performance.  The hash function used is
2274        designed to rapidly achieve avalanche and require only 16x16
2275        bit multiplication, and 8-bit swizzles (which we get for
2276        free). */
2277     for( i = 0; i < 4; i++ )
2278         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2279     for( i = 0; i < 4; i++ )
2280         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2281                  odd_bytes( wtmp[ i ] ) );
2282     for( i = 0; i < 4; i++ )
2283         brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2284     for( i = 0; i < 4; i++ )
2285         brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2286                  odd_bytes( wtmp[ i ] ) );
2287     brw_pop_insn_state( p );
2288
2289     /* Now we want to initialise the four rear gradients based on the
2290        hashes.  Format conversion from signed integer to float leaves
2291        everything scaled too high by a factor of pow( 2, 15 ), but
2292        we correct for that right at the end. */
2293     /* x component */
2294     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2295     brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2296     brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2297     brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2298     brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2299
2300     brw_push_insn_state( p );
2301     brw_set_mask_control( p, BRW_MASK_DISABLE );
2302     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2303     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2304     brw_pop_insn_state( p );
2305
2306     brw_MUL( p, x1y0, x1y0, t );
2307     brw_MUL( p, x1y1, x1y1, t );
2308     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2309     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2310     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2311
2312     /* y component */
2313     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2314     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2315     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2316     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2317
2318     brw_push_insn_state( p );
2319     brw_set_mask_control( p, BRW_MASK_DISABLE );
2320     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2321     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2322     brw_pop_insn_state( p );
2323
2324     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2325     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2326     /* prepare t for the w component (used below): w the first time through
2327        the loop; w - 1 the second time) */
2328     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2329     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2330     p->current->header.predicate_inverse = 1;
2331     brw_MOV( p, t, param[ 3 ] );
2332     p->current->header.predicate_inverse = 0;
2333     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2334     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2335     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2336
2337     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2338     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2339     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2340     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2341
2342     /* z component */
2343     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2344     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2345     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2346     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2347
2348     brw_push_insn_state( p );
2349     brw_set_mask_control( p, BRW_MASK_DISABLE );
2350     brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2351     brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2352     brw_pop_insn_state( p );
2353
2354     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2355     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2356     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2357     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2358
2359     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2360     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2361     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2362     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2363
2364     /* w component */
2365     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2366     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2367     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2368     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2369
2370     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2371     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2372     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2373     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2374     brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2375
2376     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2377     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2378     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2379     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2380
2381     /* Here we interpolate in the y dimension... */
2382     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2383     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2384     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2385     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2386     brw_ADD( p, x0y0, x0y0, x0y1 );
2387     brw_ADD( p, x1y0, x1y0, x1y1 );
2388
2389     /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
2390     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2391     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2392     brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2393
2394     /* Now do the same thing for the front four gradients... */
2395     /* x component */
2396     brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2397     brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2398     brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2399     brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2400
2401     brw_push_insn_state( p );
2402     brw_set_mask_control( p, BRW_MASK_DISABLE );
2403     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2404     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2405     brw_pop_insn_state( p );
2406
2407     brw_MUL( p, x1y0, x1y0, t );
2408     brw_MUL( p, x1y1, x1y1, t );
2409     brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2410     brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2411     brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2412
2413     /* y component */
2414     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2415     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2416     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2417     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2418
2419     brw_push_insn_state( p );
2420     brw_set_mask_control( p, BRW_MASK_DISABLE );
2421     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2422     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2423     brw_pop_insn_state( p );
2424
2425     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2426     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2427     brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2428     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2429     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2430
2431     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2432     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2433     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2434     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2435
2436     /* z component */
2437     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2438     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2439     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2440     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2441
2442     brw_push_insn_state( p );
2443     brw_set_mask_control( p, BRW_MASK_DISABLE );
2444     brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2445     brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2446     brw_pop_insn_state( p );
2447
2448     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2449     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2450     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2451     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2452     /* prepare t for the w component (used below): w the first time through
2453        the loop; w - 1 the second time) */
2454     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2455     brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2456     p->current->header.predicate_inverse = 1;
2457     brw_MOV( p, t, param[ 3 ] );
2458     p->current->header.predicate_inverse = 0;
2459     brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2460
2461     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2462     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2463     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2464     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2465
2466     /* w component */
2467     brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2468     brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2469     brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2470     brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2471
2472     brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2473     brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2474     brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2475     brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2476
2477     brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2478     brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2479     brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2480     brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2481
2482     /* Interpolate in the y dimension: */
2483     brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2484     brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2485     brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2486     brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2487     brw_ADD( p, x0y0, x0y0, x0y1 );
2488     brw_ADD( p, x1y0, x1y0, x1y1 );
2489
2490     /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
2491        time put the front face in tmp[ 1 ] and we're nearly there... */
2492     brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2493     brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2494     brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2495
2496     /* Another interpolation, in the z dimension: */
2497     brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2498     brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2499     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2500
2501     /* Exit the loop if we've computed both cubes... */
2502     origin = p->nr_insn;
2503     brw_push_insn_state( p );
2504     brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2505     brw_set_mask_control( p, BRW_MASK_DISABLE );
2506     brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2507     brw_pop_insn_state( p );
2508
2509     /* Save the result for the w=0 case, and increment the w coordinate: */
2510     brw_MOV( p, w0, tmp[ 0 ] );
2511     brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2512              brw_imm_uw( 1 ) );
2513
2514     /* Loop around for the other cube.  Explicitly set the flag register
2515        (unfortunately we must spend an extra instruction to do this: we
2516        can't rely on a side effect of the previous MOV or ADD because
2517        conditional modifiers which are normally true might be false in
2518        exceptional circumstances, e.g. given a NaN input; the add to
2519        brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2520     brw_push_insn_state( p );
2521     brw_set_mask_control( p, BRW_MASK_DISABLE );
2522     brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2523     brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2524              brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2525     brw_pop_insn_state( p );
2526
2527     /* Patch the previous conditional branch now that we know the
2528        destination address. */
2529     brw_set_src1( p->store + origin,
2530                   brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2531
2532     /* The very last interpolation. */
2533     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2534     brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2535     brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2536
2537     /* scale by pow( 2, -15 ), as described above */
2538     brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2539
2540     release_tmps( c, mark );
2541 }
2542
2543 static void emit_noise4( struct brw_wm_compile *c,
2544                          const struct prog_instruction *inst )
2545 {
2546     struct brw_compile *p = &c->func;
2547     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2548     GLuint mask = inst->DstReg.WriteMask;
2549     int i;
2550     int mark = mark_tmps( c );
2551
2552     assert( mark == 0 );
2553
2554     src0 = get_src_reg( c, inst, 0, 0 );
2555     src1 = get_src_reg( c, inst, 0, 1 );
2556     src2 = get_src_reg( c, inst, 0, 2 );
2557     src3 = get_src_reg( c, inst, 0, 3 );
2558
2559     param0 = alloc_tmp( c );
2560     param1 = alloc_tmp( c );
2561     param2 = alloc_tmp( c );
2562     param3 = alloc_tmp( c );
2563
2564     brw_MOV( p, param0, src0 );
2565     brw_MOV( p, param1, src1 );
2566     brw_MOV( p, param2, src2 );
2567     brw_MOV( p, param3, src3 );
2568
2569     invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2570
2571     /* Fill in the result: */
2572     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2573     for (i = 0 ; i < 4; i++) {
2574         if (mask & (1<<i)) {
2575             dst = get_dst_reg(c, inst, i);
2576             brw_MOV( p, dst, param0 );
2577         }
2578     }
2579     if( inst->SaturateMode == SATURATE_ZERO_ONE )
2580         brw_set_saturate( p, 0 );
2581
2582     release_tmps( c, mark );
2583 }
2584
2585 static void emit_wpos_xy(struct brw_wm_compile *c,
2586                          const struct prog_instruction *inst)
2587 {
2588     struct brw_compile *p = &c->func;
2589     GLuint mask = inst->DstReg.WriteMask;
2590     struct brw_reg src0[2], dst[2];
2591
2592     dst[0] = get_dst_reg(c, inst, 0);
2593     dst[1] = get_dst_reg(c, inst, 1);
2594
2595     src0[0] = get_src_reg(c, inst, 0, 0);
2596     src0[1] = get_src_reg(c, inst, 0, 1);
2597
2598     /* Calculate the pixel offset from window bottom left into destination
2599      * X and Y channels.
2600      */
2601     if (mask & WRITEMASK_X) {
2602         /* X' = X - origin_x */
2603         brw_ADD(p,
2604                 dst[0],
2605                 retype(src0[0], BRW_REGISTER_TYPE_W),
2606                 brw_imm_d(0 - c->key.origin_x));
2607     }
2608
2609     if (mask & WRITEMASK_Y) {
2610         /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2611         brw_ADD(p,
2612                 dst[1],
2613                 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2614                 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2615     }
2616 }
2617
2618 /* TODO
2619    BIAS on SIMD8 not working yet...
2620  */
2621 static void emit_txb(struct brw_wm_compile *c,
2622                      const struct prog_instruction *inst)
2623 {
2624     struct brw_compile *p = &c->func;
2625     struct brw_reg dst[4], src[4], payload_reg;
2626     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2627     const GLuint unit = inst->TexSrcUnit;
2628     GLuint i;
2629     GLuint msg_type;
2630
2631     assert(unit < BRW_MAX_TEX_UNIT);
2632
2633     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2634
2635     for (i = 0; i < 4; i++)
2636         dst[i] = get_dst_reg(c, inst, i);
2637     for (i = 0; i < 4; i++)
2638         src[i] = get_src_reg(c, inst, 0, i);
2639
2640     switch (inst->TexSrcTarget) {
2641         case TEXTURE_1D_INDEX:
2642             brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
2643             brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
2644             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
2645             break;
2646         case TEXTURE_2D_INDEX:
2647         case TEXTURE_RECT_INDEX:
2648             brw_MOV(p, brw_message_reg(2), src[0]);
2649             brw_MOV(p, brw_message_reg(3), src[1]);
2650             brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2651             break;
2652         case TEXTURE_3D_INDEX:
2653         case TEXTURE_CUBE_INDEX:
2654             brw_MOV(p, brw_message_reg(2), src[0]);
2655             brw_MOV(p, brw_message_reg(3), src[1]);
2656             brw_MOV(p, brw_message_reg(4), src[2]);
2657             break;
2658         default:
2659             /* invalid target */
2660             abort();
2661     }
2662     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
2663     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
2664
2665     if (BRW_IS_IGDNG(p->brw)) {
2666         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2667     } else {
2668         /* Does it work well on SIMD8? */
2669         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2670     }
2671
2672     brw_SAMPLE(p,
2673                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
2674                1,                                           /* msg_reg_nr */
2675                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
2676                SURF_INDEX_TEXTURE(unit),
2677                unit,                                        /* sampler */
2678                inst->DstReg.WriteMask,                      /* writemask */
2679                msg_type,                                    /* msg_type */
2680                4,                                           /* response_length */
2681                4,                                           /* msg_length */
2682                0,                                           /* eot */
2683                1,
2684                BRW_SAMPLER_SIMD_MODE_SIMD8);
2685 }
2686
2687
2688 static void emit_tex(struct brw_wm_compile *c,
2689                      const struct prog_instruction *inst)
2690 {
2691     struct brw_compile *p = &c->func;
2692     struct brw_reg dst[4], src[4], payload_reg;
2693     /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2694     const GLuint unit = inst->TexSrcUnit;
2695     GLuint msg_len;
2696     GLuint i, nr;
2697     GLuint emit;
2698     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2699     GLuint msg_type;
2700
2701     assert(unit < BRW_MAX_TEX_UNIT);
2702
2703     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2704
2705     for (i = 0; i < 4; i++)
2706         dst[i] = get_dst_reg(c, inst, i);
2707     for (i = 0; i < 4; i++)
2708         src[i] = get_src_reg(c, inst, 0, i);
2709
2710     switch (inst->TexSrcTarget) {
2711         case TEXTURE_1D_INDEX:
2712             emit = WRITEMASK_X;
2713             nr = 1;
2714             break;
2715         case TEXTURE_2D_INDEX:
2716         case TEXTURE_RECT_INDEX:
2717             emit = WRITEMASK_XY;
2718             nr = 2;
2719             break;
2720         case TEXTURE_3D_INDEX:
2721         case TEXTURE_CUBE_INDEX:
2722             emit = WRITEMASK_XYZ;
2723             nr = 3;
2724             break;
2725         default:
2726            /* invalid target */
2727            abort();
2728     }
2729     msg_len = 1;
2730
2731     /* move/load S, T, R coords */
2732     for (i = 0; i < nr; i++) {
2733         static const GLuint swz[4] = {0,1,2,2};
2734         if (emit & (1<<i))
2735             brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2736         else
2737             brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2738         msg_len += 1;
2739     }
2740
2741     if (shadow) {
2742        brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
2743        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
2744     }
2745
2746     if (BRW_IS_IGDNG(p->brw)) {
2747         if (shadow)
2748             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2749         else
2750             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2751     } else {
2752         /* Does it work for shadow on SIMD8 ? */
2753         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2754     }
2755
2756     brw_SAMPLE(p,
2757                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2758                1,                                          /* msg_reg_nr */
2759                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
2760                SURF_INDEX_TEXTURE(unit),
2761                unit,                                       /* sampler */
2762                inst->DstReg.WriteMask,                     /* writemask */
2763                msg_type,                                   /* msg_type */
2764                4,                                          /* response_length */
2765                shadow ? 6 : 4,                             /* msg_length */
2766                0,                                          /* eot */
2767                1,
2768                BRW_SAMPLER_SIMD_MODE_SIMD8);
2769
2770     if (shadow)
2771         brw_MOV(p, dst[3], brw_imm_f(1.0));
2772 }
2773
2774
2775 /**
2776  * Resolve subroutine calls after code emit is done.
2777  */
2778 static void post_wm_emit( struct brw_wm_compile *c )
2779 {
2780     brw_resolve_cals(&c->func);
2781 }
2782
2783 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2784 {
2785 #define MAX_IF_DEPTH 32
2786 #define MAX_LOOP_DEPTH 32
2787     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2788     GLuint i, if_depth = 0, loop_depth = 0;
2789     struct brw_compile *p = &c->func;
2790     struct brw_indirect stack_index = brw_indirect(0, 0);
2791
2792     c->out_of_regs = GL_FALSE;
2793
2794     prealloc_reg(c);
2795     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2796     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2797
2798     for (i = 0; i < c->nr_fp_insns; i++) {
2799         const struct prog_instruction *inst = &c->prog_instructions[i];
2800
2801         c->cur_inst = i;
2802
2803 #if 0
2804         _mesa_printf("Inst %d: ", i);
2805         _mesa_print_instruction(inst);
2806 #endif
2807
2808         /* fetch any constants that this instruction needs */
2809         if (c->fp->use_const_buffer)
2810            fetch_constants(c, inst);
2811
2812         if (inst->CondUpdate)
2813             brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2814         else
2815             brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2816
2817         switch (inst->Opcode) {
2818             case WM_PIXELXY:
2819                 emit_pixel_xy(c, inst);
2820                 break;
2821             case WM_DELTAXY:
2822                 emit_delta_xy(c, inst);
2823                 break;
2824             case WM_PIXELW:
2825                 emit_pixel_w(c, inst);
2826                 break;
2827             case WM_LINTERP:
2828                 emit_linterp(c, inst);
2829                 break;
2830             case WM_PINTERP:
2831                 emit_pinterp(c, inst);
2832                 break;
2833             case WM_CINTERP:
2834                 emit_cinterp(c, inst);
2835                 break;
2836             case WM_WPOSXY:
2837                 emit_wpos_xy(c, inst);
2838                 break;
2839             case WM_FB_WRITE:
2840                 emit_fb_write(c, inst);
2841                 break;
2842             case WM_FRONTFACING:
2843                 emit_frontfacing(c, inst);
2844                 break;
2845             case OPCODE_ADD:
2846                 emit_add(c, inst);
2847                 break;
2848             case OPCODE_ARL:
2849                 emit_arl(c, inst);
2850                 break;
2851             case OPCODE_FRC:
2852                 emit_frc(c, inst);
2853                 break;
2854             case OPCODE_FLR:
2855                 emit_flr(c, inst);
2856                 break;
2857             case OPCODE_LRP:
2858                 emit_lrp(c, inst);
2859                 break;
2860             case OPCODE_TRUNC:
2861                 emit_trunc(c, inst);
2862                 break;
2863             case OPCODE_MOV:
2864             case OPCODE_SWZ:
2865                 emit_mov(c, inst);
2866                 break;
2867             case OPCODE_DP3:
2868                 emit_dp3(c, inst);
2869                 break;
2870             case OPCODE_DP4:
2871                 emit_dp4(c, inst);
2872                 break;
2873             case OPCODE_XPD:
2874                 emit_xpd(c, inst);
2875                 break;
2876             case OPCODE_DPH:
2877                 emit_dph(c, inst);
2878                 break;
2879             case OPCODE_RCP:
2880                 emit_rcp(c, inst);
2881                 break;
2882             case OPCODE_RSQ:
2883                 emit_rsq(c, inst);
2884                 break;
2885             case OPCODE_SIN:
2886                 emit_sin(c, inst);
2887                 break;
2888             case OPCODE_COS:
2889                 emit_cos(c, inst);
2890                 break;
2891             case OPCODE_EX2:
2892                 emit_ex2(c, inst);
2893                 break;
2894             case OPCODE_LG2:
2895                 emit_lg2(c, inst);
2896                 break;
2897             case OPCODE_MIN:
2898             case OPCODE_MAX:
2899                 emit_min_max(c, inst);
2900                 break;
2901             case OPCODE_DDX:
2902                 emit_ddx(c, inst);
2903                 break;
2904             case OPCODE_DDY:
2905                 emit_ddy(c, inst);
2906                 break;
2907             case OPCODE_SLT:
2908                 emit_slt(c, inst);
2909                 break;
2910             case OPCODE_SLE:
2911                 emit_sle(c, inst);
2912                 break;
2913             case OPCODE_SGT:
2914                 emit_sgt(c, inst);
2915                 break;
2916             case OPCODE_SGE:
2917                 emit_sge(c, inst);
2918                 break;
2919             case OPCODE_SEQ:
2920                 emit_seq(c, inst);
2921                 break;
2922             case OPCODE_SNE:
2923                 emit_sne(c, inst);
2924                 break;
2925             case OPCODE_MUL:
2926                 emit_mul(c, inst);
2927                 break;
2928             case OPCODE_POW:
2929                 emit_pow(c, inst);
2930                 break;
2931             case OPCODE_MAD:
2932                 emit_mad(c, inst);
2933                 break;
2934             case OPCODE_NOISE1:
2935                 emit_noise1(c, inst);
2936                 break;
2937             case OPCODE_NOISE2:
2938                 emit_noise2(c, inst);
2939                 break;
2940             case OPCODE_NOISE3:
2941                 emit_noise3(c, inst);
2942                 break;
2943             case OPCODE_NOISE4:
2944                 emit_noise4(c, inst);
2945                 break;
2946             case OPCODE_TEX:
2947                 emit_tex(c, inst);
2948                 break;
2949             case OPCODE_TXB:
2950                 emit_txb(c, inst);
2951                 break;
2952             case OPCODE_KIL_NV:
2953                 emit_kil(c);
2954                 break;
2955             case OPCODE_IF:
2956                 assert(if_depth < MAX_IF_DEPTH);
2957                 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2958                 break;
2959             case OPCODE_ELSE:
2960                 if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
2961                 break;
2962             case OPCODE_ENDIF:
2963                 assert(if_depth > 0);
2964                 brw_ENDIF(p, if_inst[--if_depth]);
2965                 break;
2966             case OPCODE_BGNSUB:
2967                 brw_save_label(p, inst->Comment, p->nr_insn);
2968                 break;
2969             case OPCODE_ENDSUB:
2970                 /* no-op */
2971                 break;
2972             case OPCODE_CAL:
2973                 brw_push_insn_state(p);
2974                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2975                 brw_set_access_mode(p, BRW_ALIGN_1);
2976                 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2977                 brw_set_access_mode(p, BRW_ALIGN_16);
2978                 brw_ADD(p, get_addr_reg(stack_index),
2979                          get_addr_reg(stack_index), brw_imm_d(4));
2980                 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2981                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2982                 brw_pop_insn_state(p);
2983                 break;
2984
2985             case OPCODE_RET:
2986                 brw_push_insn_state(p);
2987                 brw_set_mask_control(p, BRW_MASK_DISABLE);
2988                 brw_ADD(p, get_addr_reg(stack_index),
2989                         get_addr_reg(stack_index), brw_imm_d(-4));
2990                 brw_set_access_mode(p, BRW_ALIGN_1);
2991                 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2992                 brw_set_access_mode(p, BRW_ALIGN_16);
2993                 brw_pop_insn_state(p);
2994
2995                 break;
2996             case OPCODE_BGNLOOP:
2997                 /* XXX may need to invalidate the current_constant regs */
2998                 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2999                 break;
3000             case OPCODE_BRK:
3001                 brw_BREAK(p);
3002                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3003                 break;
3004             case OPCODE_CONT:
3005                 brw_CONT(p);
3006                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3007                 break;
3008             case OPCODE_ENDLOOP:
3009                {
3010                   struct brw_instruction *inst0, *inst1;
3011                   GLuint br = 1;
3012
3013                   if (BRW_IS_IGDNG(brw))
3014                      br = 2;
3015
3016                   loop_depth--;
3017                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
3018                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
3019                   while (inst0 > loop_inst[loop_depth]) {
3020                      inst0--;
3021                      if (inst0->header.opcode == BRW_OPCODE_BREAK) {
3022                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3023                         inst0->bits3.if_else.pop_count = 0;
3024                      }
3025                      else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3026                         inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3027                         inst0->bits3.if_else.pop_count = 0;
3028                      }
3029                   }
3030                }
3031                break;
3032             default:
3033                 _mesa_printf("unsupported IR in fragment shader %d\n",
3034                         inst->Opcode);
3035         }
3036
3037         if (inst->CondUpdate)
3038             brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3039         else
3040             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3041     }
3042     post_wm_emit(c);
3043
3044     if (INTEL_DEBUG & DEBUG_WM) {
3045       _mesa_printf("wm-native:\n");
3046       for (i = 0; i < p->nr_insn; i++)
3047          brw_disasm(stderr, &p->store[i]);
3048       _mesa_printf("\n");
3049     }
3050 }
3051
3052 /**
3053  * Do GPU code generation for shaders that use GLSL features such as
3054  * flow control.  Other shaders will be compiled with the
3055  */
3056 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3057 {
3058     if (INTEL_DEBUG & DEBUG_WM) {
3059         _mesa_printf("brw_wm_glsl_emit:\n");
3060     }
3061
3062     /* initial instruction translation/simplification */
3063     brw_wm_pass_fp(c);
3064
3065     /* actual code generation */
3066     brw_wm_emit_glsl(brw, c);
3067
3068     if (INTEL_DEBUG & DEBUG_WM) {
3069         brw_wm_print_program(c, "brw_wm_glsl_emit done");
3070     }
3071
3072     c->prog_data.total_grf = num_grf_used(c);
3073     c->prog_data.total_scratch = 0;
3074 }