src/mesa/drivers/dri/i965/brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
168 {
169 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
170 }
171
172 static void release_tmps(struct brw_wm_compile *c, int mark)
173 {
174 c->tmp_index = mark;
175 }
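/*
 * Typical usage of the temporary-register helpers above, as seen throughout
 * this file:
 *
 *    int mark = mark_tmps(c);
 *    struct brw_reg tmp = alloc_tmp(c);
 *    ... emit code using tmp ...
 *    release_tmps(c, mark);
 *
 * release_tmps() simply rewinds the allocation index, so every mark_tmps()
 * must be paired with a matching release_tmps().
 */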
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used by this function
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
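            /* (4 * nr_params is rounded up to a multiple of 16 floats here,
             * apparently because the CURBE is handled in two-register,
             * 16-float chunks.)
             */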
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & (1 << i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553 /**
554 * Subroutines are minimal support for reusable instruction sequences.
555 * They are implemented as simply as possible to minimise overhead: there
556 * is no explicit support for communication between the caller and callee
557 * other than saving the return address in a temporary register, nor is
558 * there any automatic local storage. This implies that great care is
559 * required before attempting reentrancy or any kind of nested
560 * subroutine invocations.
561 */
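/*
 * Calling convention used by the noise routines below: the caller copies its
 * arguments into freshly allocated temporaries immediately before calling
 * invoke_subroutine(), and the subroutine body retrieves them with
 * lookup_tmp(c, mark - n), counting back past the return-address temporary
 * that invoke_subroutine() itself allocates.  The result is left in the first
 * argument temporary.
 */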
562 static void invoke_subroutine( struct brw_wm_compile *c,
563 enum _subroutine subroutine,
564 void (*emit)( struct brw_wm_compile * ) )
565 {
566 struct brw_compile *p = &c->func;
567
568 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
569
570 if( c->subroutines[ subroutine ] ) {
571 /* subroutine previously emitted: reuse existing instructions */
572
573 int mark = mark_tmps( c );
574 struct brw_reg return_address = retype( alloc_tmp( c ),
575 BRW_REGISTER_TYPE_UD );
576 int here = p->nr_insn;
577
578 brw_push_insn_state(p);
579 brw_set_mask_control(p, BRW_MASK_DISABLE);
580 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
581
582 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
583 brw_imm_d( ( c->subroutines[ subroutine ] -
584 here - 1 ) << 4 ) );
585 brw_pop_insn_state(p);
586
587 release_tmps( c, mark );
588 } else {
589 /* previously unused subroutine: emit, and mark for later reuse */
590
591 int mark = mark_tmps( c );
592 struct brw_reg return_address = retype( alloc_tmp( c ),
593 BRW_REGISTER_TYPE_UD );
594 struct brw_instruction *calc;
595 int base = p->nr_insn;
596
597 brw_push_insn_state(p);
598 brw_set_mask_control(p, BRW_MASK_DISABLE);
599 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
600 brw_pop_insn_state(p);
601
602 c->subroutines[ subroutine ] = p->nr_insn;
603
604 emit( c );
605
606 brw_push_insn_state(p);
607 brw_set_mask_control(p, BRW_MASK_DISABLE);
608 brw_MOV( p, brw_ip_reg(), return_address );
609 brw_pop_insn_state(p);
610
611 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
612
613 release_tmps( c, mark );
614 }
615 }
616
617 /* Workaround for using brw_wm_emit.c's emit functions, which expect
618 * destination regs to be uniquely written. Moves arguments out to
619 * temporaries as necessary for instructions which use their destination as
620 * a temporary.
621 */
622 static void
623 unalias3(struct brw_wm_compile *c,
624 void (*func)(struct brw_compile *c,
625 const struct brw_reg *dst,
626 GLuint mask,
627 const struct brw_reg *arg0,
628 const struct brw_reg *arg1,
629 const struct brw_reg *arg2),
630 const struct brw_reg *dst,
631 GLuint mask,
632 const struct brw_reg *arg0,
633 const struct brw_reg *arg1,
634 const struct brw_reg *arg2)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
638 int i, j;
639 int mark = mark_tmps(c);
640
641 for (j = 0; j < 4; j++) {
642 tmp_arg0[j] = arg0[j];
643 tmp_arg1[j] = arg1[j];
644 tmp_arg2[j] = arg2[j];
645 }
646
647 for (i = 0; i < 4; i++) {
648 if (mask & (1<<i)) {
649 for (j = 0; j < 4; j++) {
650 if (arg0[j].file == dst[i].file &&
651 dst[i].nr == arg0[j].nr) {
652 tmp_arg0[j] = alloc_tmp(c);
653 brw_MOV(p, tmp_arg0[j], arg0[j]);
654 }
655 if (arg1[j].file == dst[i].file &&
656 dst[i].nr == arg1[j].nr) {
657 tmp_arg1[j] = alloc_tmp(c);
658 brw_MOV(p, tmp_arg1[j], arg1[j]);
659 }
660 if (arg2[j].file == dst[i].file &&
661 dst[i].nr == arg2[j].nr) {
662 tmp_arg2[j] = alloc_tmp(c);
663 brw_MOV(p, tmp_arg2[j], arg2[j]);
664 }
665 }
666 }
667 }
668
669 func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
670
671 release_tmps(c, mark);
672 }
673
674 static void fire_fb_write( struct brw_wm_compile *c,
675 GLuint base_reg,
676 GLuint nr,
677 GLuint target,
678 GLuint eot)
679 {
680 struct brw_compile *p = &c->func;
681 /* Pass through control information:
682 */
683 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
684 {
685 brw_push_insn_state(p);
686 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
687 brw_MOV(p,
688 brw_message_reg(base_reg + 1),
689 brw_vec8_grf(1, 0));
690 brw_pop_insn_state(p);
691 }
692 /* Send framebuffer write message: */
693 brw_fb_WRITE(p,
694 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
695 base_reg,
696 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
697 target,
698 nr,
699 0,
700 eot);
701 }
702
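/*
 * Note on the message built by emit_fb_write()/fire_fb_write() below: m1 is
 * the pass-through copy of r1, the four colour channels follow (each
 * compressed SIMD16 MOV fills two message registers, its second half landing
 * four registers later, hence "nr += 8"), and the optional source-depth and
 * destination-depth payloads are appended after that.
 */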
703 static void emit_fb_write(struct brw_wm_compile *c,
704 const struct prog_instruction *inst)
705 {
706 struct brw_compile *p = &c->func;
707 int nr = 2;
708 int channel;
709 GLuint target, eot;
710 struct brw_reg src0;
711
712 /* Reserve space for AA - may not be needed:
713 */
714 if (c->key.aa_dest_stencil_reg)
715 nr += 1;
716
717 brw_push_insn_state(p);
718 for (channel = 0; channel < 4; channel++) {
719 src0 = get_src_reg(c, inst, 0, channel);
720 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
721 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
722 brw_MOV(p, brw_message_reg(nr + channel), src0);
723 }
724 /* skip over the regs populated above: */
725 nr += 8;
726 brw_pop_insn_state(p);
727
728 if (c->key.source_depth_to_render_target) {
729 if (c->key.computes_depth) {
730 src0 = get_src_reg(c, inst, 2, 2);
731 brw_MOV(p, brw_message_reg(nr), src0);
732 }
733 else {
734 src0 = get_src_reg(c, inst, 1, 1);
735 brw_MOV(p, brw_message_reg(nr), src0);
736 }
737
738 nr += 2;
739 }
740
741 if (c->key.dest_depth_reg) {
742 const GLuint comp = c->key.dest_depth_reg / 2;
743 const GLuint off = c->key.dest_depth_reg % 2;
744
745 if (off != 0) {
746 /* XXX this code needs review/testing */
747 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
748 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
749
750 brw_push_insn_state(p);
751 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
752
753 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
754 /* 2nd half? */
755 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
756 brw_pop_insn_state(p);
757 }
758 else
759 {
760 struct brw_reg src = get_src_reg(c, inst, 1, 1);
761 brw_MOV(p, brw_message_reg(nr), src);
762 }
763 nr += 2;
764 }
765
766 target = INST_AUX_GET_TARGET(inst->Aux);
767 eot = inst->Aux & INST_AUX_EOT;
768 fire_fb_write(c, 0, nr, target, eot);
769 }
770
771 static void emit_arl(struct brw_wm_compile *c,
772 const struct prog_instruction *inst)
773 {
774 struct brw_compile *p = &c->func;
775 struct brw_reg src0, addr_reg;
776 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
777 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
778 BRW_ARF_ADDRESS, 0);
779 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
780 brw_MOV(p, addr_reg, src0);
781 brw_set_saturate(p, 0);
782 }
783
784
785 static void emit_min_max(struct brw_wm_compile *c,
786 const struct prog_instruction *inst)
787 {
788 struct brw_compile *p = &c->func;
789 const GLuint mask = inst->DstReg.WriteMask;
790 const int mark = mark_tmps(c);
791 int i;
792 brw_push_insn_state(p);
793 for (i = 0; i < 4; i++) {
794 if (mask & (1<<i)) {
795 struct brw_reg real_dst = get_dst_reg(c, inst, i);
796 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
797 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
798 struct brw_reg dst;
799 /* if real_dst aliases src0 or src1 we need to write to a temp reg */
800 GLboolean use_temp = brw_same_reg(real_dst, src0) ||
801                      brw_same_reg(real_dst, src1);
802 if (use_temp)
803 dst = alloc_tmp(c);
804 else
805 dst = real_dst;
806
807 /*
808 printf(" Min/max: dst %d src0 %d src1 %d\n",
809 dst.nr, src0.nr, src1.nr);
810 */
811 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
812 brw_MOV(p, dst, src0);
813 brw_set_saturate(p, 0);
814
815 if (inst->Opcode == OPCODE_MIN)
816 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
817 else
818 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
819
820 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
821 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
822 brw_MOV(p, dst, src1);
823 brw_set_saturate(p, 0);
824 brw_set_predicate_control_flag_value(p, 0xff);
825 if (use_temp)
826 brw_MOV(p, real_dst, dst);
827 }
828 }
829 brw_pop_insn_state(p);
830 release_tmps(c, mark);
831 }
832
833 /**
834 * For GLSL shaders, this KIL will be unconditional.
835 * It may be contained inside an IF/ENDIF structure of course.
836 */
837 static void emit_kil(struct brw_wm_compile *c)
838 {
839 struct brw_compile *p = &c->func;
840 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
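   /* Note: despite its name, "depth" here appears to be the per-pixel
    * dispatch mask word in the r0 payload header; ANDing ~IMASK into it
    * below is what actually discards the killed channels.
    */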
841 brw_push_insn_state(p);
842 brw_set_mask_control(p, BRW_MASK_DISABLE);
843 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
844 brw_AND(p, depth, c->emit_mask_reg, depth);
845 brw_pop_insn_state(p);
846 }
847
848 static INLINE struct brw_reg high_words( struct brw_reg reg )
849 {
850 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
851 0, 8, 2 );
852 }
853
854 static INLINE struct brw_reg low_words( struct brw_reg reg )
855 {
856 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
857 }
858
859 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
860 {
861 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
862 }
863
864 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
865 {
866 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
867 0, 16, 2 );
868 }
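/*
 * The four helpers above build strided views that select alternate 16-bit
 * words or 8-bit bytes of a register; the noise hashing below uses them to
 * do packed 16-bit arithmetic on the integer temporaries.
 */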
869
870 /* One-, two- and three-dimensional Perlin noise, similar to the description
871 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
872 static void noise1_sub( struct brw_wm_compile *c ) {
873
874 struct brw_compile *p = &c->func;
875 struct brw_reg param,
876 x0, x1, /* gradients at each end */
877 t, tmp[ 2 ], /* float temporaries */
878 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
879 int i;
880 int mark = mark_tmps( c );
881
882 x0 = alloc_tmp( c );
883 x1 = alloc_tmp( c );
884 t = alloc_tmp( c );
885 tmp[ 0 ] = alloc_tmp( c );
886 tmp[ 1 ] = alloc_tmp( c );
887 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
888 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
889 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
890 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
891 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
892
893 param = lookup_tmp( c, mark - 2 );
894
895 brw_set_access_mode( p, BRW_ALIGN_1 );
896
897 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
898
899 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
900 be hashed. Also compute the remainder (offset within the unit
901 length), interleaved to reduce register dependency penalties. */
902 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
903 brw_FRC( p, param, param );
904 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
905 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
906 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
907
908 /* We're now ready to perform the hashing. The two hashes are
909 interleaved for performance. The hash function used is
910 designed to rapidly achieve avalanche and require only 32x16
911 bit multiplication, and 16-bit swizzles (which we get for
912 free). We can't use immediate operands in the multiplies,
913 because immediates are permitted only in src1 and the 16-bit
914 factor is permitted only in src0. */
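   /* In scalar terms, each hash below is three rounds of
    *    h *= K;  h_low ^= h_high;
    * with K = 0xBA97, 0x79D9 and 0xD5B1 in turn (the constants loaded above).
    */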
915 for( i = 0; i < 2; i++ )
916 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
917 for( i = 0; i < 2; i++ )
918 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
919 high_words( itmp[ i ] ) );
920 for( i = 0; i < 2; i++ )
921 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
922 for( i = 0; i < 2; i++ )
923 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
924 high_words( itmp[ i ] ) );
925 for( i = 0; i < 2; i++ )
926 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
927 for( i = 0; i < 2; i++ )
928 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
929 high_words( itmp[ i ] ) );
930
931 /* Now we want to initialise the two gradients based on the
932 hashes. Format conversion from signed integer to float leaves
933 everything scaled too high by a factor of pow( 2, 31 ), but
934 we correct for that right at the end. */
935 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
936 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
937 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
938
939 brw_MUL( p, x0, x0, param );
940 brw_MUL( p, x1, x1, t );
941
942 /* We interpolate between the gradients using the polynomial
943 6t^5 - 15t^4 + 10t^3 (Perlin). */
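   /* The sequence below evaluates it in Horner form:
    *    ((6t - 15)t + 10) * t * t * t  ==  6t^5 - 15t^4 + 10t^3
    * with an unrelated ADD slipped in to hide latency.
    */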
944 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
945 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
946 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
947 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
948 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
949 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
950 pipeline */
951 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
952 brw_MUL( p, param, tmp[ 0 ], param );
953 brw_MUL( p, x1, x1, param );
954 brw_ADD( p, x0, x0, x1 );
955 /* scale by pow( 2, -30 ), to compensate for the format conversion
956 above and an extra factor of 2 so that a single gradient covers
957 the [-1,1] range */
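   /* 0.000000000931322574615478515625 is exactly 2^-30. */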
958 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
959
960 release_tmps( c, mark );
961 }
962
963 static void emit_noise1( struct brw_wm_compile *c,
964 const struct prog_instruction *inst )
965 {
966 struct brw_compile *p = &c->func;
967 struct brw_reg src, param, dst;
968 GLuint mask = inst->DstReg.WriteMask;
969 int i;
970 int mark = mark_tmps( c );
971
972 assert( mark == 0 );
973
974 src = get_src_reg( c, inst, 0, 0 );
975
976 param = alloc_tmp( c );
977
978 brw_MOV( p, param, src );
979
980 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
981
982 /* Fill in the result: */
983 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
984 for (i = 0 ; i < 4; i++) {
985 if (mask & (1<<i)) {
986 dst = get_dst_reg(c, inst, i);
987 brw_MOV( p, dst, param );
988 }
989 }
990 if( inst->SaturateMode == SATURATE_ZERO_ONE )
991 brw_set_saturate( p, 0 );
992
993 release_tmps( c, mark );
994 }
995
996 static void noise2_sub( struct brw_wm_compile *c ) {
997
998 struct brw_compile *p = &c->func;
999 struct brw_reg param0, param1,
1000 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1001 t, tmp[ 4 ], /* float temporaries */
1002 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1003 int i;
1004 int mark = mark_tmps( c );
1005
1006 x0y0 = alloc_tmp( c );
1007 x0y1 = alloc_tmp( c );
1008 x1y0 = alloc_tmp( c );
1009 x1y1 = alloc_tmp( c );
1010 t = alloc_tmp( c );
1011 for( i = 0; i < 4; i++ ) {
1012 tmp[ i ] = alloc_tmp( c );
1013 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1014 }
1015 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1016 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1017 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1018
1019 param0 = lookup_tmp( c, mark - 3 );
1020 param1 = lookup_tmp( c, mark - 2 );
1021
1022 brw_set_access_mode( p, BRW_ALIGN_1 );
1023
1024 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1025 be hashed. Also compute the remainders (offsets within the unit
1026 square), interleaved to reduce register dependency penalties. */
1027 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1028 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1029 brw_FRC( p, param0, param0 );
1030 brw_FRC( p, param1, param1 );
1031 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1032 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1033 low_words( itmp[ 1 ] ) );
1034 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1035 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1036 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1037 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1038 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
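   /* At this point itmp[ 0 ] effectively packs the x lattice coordinate in
    * its low 16-bit words and the y coordinate in its high words, so the
    * additions of 0x1, 0x10000 and 0x10001 produce the (x+1,y), (x,y+1)
    * and (x+1,y+1) corners respectively.
    */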
1039
1040 /* We're now ready to perform the hashing. The four hashes are
1041 interleaved for performance. The hash function used is
1042 designed to rapidly achieve avalanche and require only 32x16
1043 bit multiplication, and 16-bit swizzles (which we get for
1044 free). We can't use immediate operands in the multiplies,
1045 because immediates are permitted only in src1 and the 16-bit
1046 factor is permitted only in src0. */
1047 for( i = 0; i < 4; i++ )
1048 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1049 for( i = 0; i < 4; i++ )
1050 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1051 high_words( itmp[ i ] ) );
1052 for( i = 0; i < 4; i++ )
1053 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1054 for( i = 0; i < 4; i++ )
1055 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1056 high_words( itmp[ i ] ) );
1057 for( i = 0; i < 4; i++ )
1058 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1059 for( i = 0; i < 4; i++ )
1060 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1061 high_words( itmp[ i ] ) );
1062
1063 /* Now we want to initialise the four gradients based on the
1064 hashes. Format conversion from signed integer to float leaves
1065 everything scaled too high by a factor of pow( 2, 15 ), but
1066 we correct for that right at the end. */
1067 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1068 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1069 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1070 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1071 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1072
1073 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1074 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1075 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1076 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1077
1078 brw_MUL( p, x1y0, x1y0, t );
1079 brw_MUL( p, x1y1, x1y1, t );
1080 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1081 brw_MUL( p, x0y0, x0y0, param0 );
1082 brw_MUL( p, x0y1, x0y1, param0 );
1083
1084 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1085 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1086 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1087 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1088
1089 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1090 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1091 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1092 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1093
1094 /* We interpolate between the gradients using the polynomial
1095 6t^5 - 15t^4 + 10t^3 (Perlin). */
1096 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1097 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1098 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1099 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1100 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1101 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1102 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1103 pipeline */
1104 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1105 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1106 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1107 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1108 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1109 pipeline */
1110 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1111 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1112 brw_MUL( p, param0, tmp[ 0 ], param0 );
1113 brw_MUL( p, param1, tmp[ 1 ], param1 );
1114
1115 /* Here we interpolate in the y dimension... */
1116 brw_MUL( p, x0y1, x0y1, param1 );
1117 brw_MUL( p, x1y1, x1y1, param1 );
1118 brw_ADD( p, x0y0, x0y0, x0y1 );
1119 brw_ADD( p, x1y0, x1y0, x1y1 );
1120
1121 /* And now in x. There are horrible register dependencies here,
1122 but we have nothing else to do. */
1123 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1124 brw_MUL( p, x1y0, x1y0, param0 );
1125 brw_ADD( p, x0y0, x0y0, x1y0 );
1126
1127 /* scale by pow( 2, -15 ), as described above */
1128 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1129
1130 release_tmps( c, mark );
1131 }
1132
1133 static void emit_noise2( struct brw_wm_compile *c,
1134 const struct prog_instruction *inst )
1135 {
1136 struct brw_compile *p = &c->func;
1137 struct brw_reg src0, src1, param0, param1, dst;
1138 GLuint mask = inst->DstReg.WriteMask;
1139 int i;
1140 int mark = mark_tmps( c );
1141
1142 assert( mark == 0 );
1143
1144 src0 = get_src_reg( c, inst, 0, 0 );
1145 src1 = get_src_reg( c, inst, 0, 1 );
1146
1147 param0 = alloc_tmp( c );
1148 param1 = alloc_tmp( c );
1149
1150 brw_MOV( p, param0, src0 );
1151 brw_MOV( p, param1, src1 );
1152
1153 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1154
1155 /* Fill in the result: */
1156 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1157 for (i = 0 ; i < 4; i++) {
1158 if (mask & (1<<i)) {
1159 dst = get_dst_reg(c, inst, i);
1160 brw_MOV( p, dst, param0 );
1161 }
1162 }
1163 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1164 brw_set_saturate( p, 0 );
1165
1166 release_tmps( c, mark );
1167 }
1168
1169 /**
1170 * The three-dimensional case is much like the one- and two-dimensional versions above,
1171 * but since the number of corners is rapidly growing we now pack 16 16-bit
1172 * hashes into each register to extract more parallelism from the EUs.
1173 */
1174 static void noise3_sub( struct brw_wm_compile *c ) {
1175
1176 struct brw_compile *p = &c->func;
1177 struct brw_reg param0, param1, param2,
1178 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1179 xi, yi, zi, /* interpolation coefficients */
1180 t, tmp[ 8 ], /* float temporaries */
1181 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1182 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1183 int i;
1184 int mark = mark_tmps( c );
1185
1186 x0y0 = alloc_tmp( c );
1187 x0y1 = alloc_tmp( c );
1188 x1y0 = alloc_tmp( c );
1189 x1y1 = alloc_tmp( c );
1190 xi = alloc_tmp( c );
1191 yi = alloc_tmp( c );
1192 zi = alloc_tmp( c );
1193 t = alloc_tmp( c );
1194 for( i = 0; i < 8; i++ ) {
1195 tmp[ i ] = alloc_tmp( c );
1196 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1197 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1198 }
1199
1200 param0 = lookup_tmp( c, mark - 4 );
1201 param1 = lookup_tmp( c, mark - 3 );
1202 param2 = lookup_tmp( c, mark - 2 );
1203
1204 brw_set_access_mode( p, BRW_ALIGN_1 );
1205
1206 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1207 be hashed. Also compute the remainders (offsets within the unit
1208 cube), interleaved to reduce register dependency penalties. */
1209 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1210 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1211 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1212 brw_FRC( p, param0, param0 );
1213 brw_FRC( p, param1, param1 );
1214 brw_FRC( p, param2, param2 );
1215 /* Since we now have only 16 bits of precision in the hash, we must
1216 be more careful about thorough mixing to maintain entropy as we
1217 squash the input vector into a small scalar. */
1218 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1219 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1220 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1221 brw_imm_uw( 0x9B93 ) );
1222 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1223 brw_imm_uw( 0xBC8F ) );
1224
1225 /* Temporarily disable the execution mask while we work with ExecSize=16
1226 channels (the mask is set for ExecSize=8 and is probably incorrect).
1227 Although this might cause execution of unwanted channels, the code
1228 writes only to temporary registers and has no side effects, so
1229 disabling the mask is harmless. */
1230 brw_push_insn_state( p );
1231 brw_set_mask_control( p, BRW_MASK_DISABLE );
1232 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1233 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1234 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
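   /* wtmp[ 0 ] now holds the (x,y,z) hash seed in its low words and the
    * (x+1,y,z) seed in its high words; the +0xD0BD and +0x9B93 offsets are
    * the y and z hash increments, so wtmp[ 0..3 ] together cover all eight
    * cube corners.
    */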
1235
1236 /* We're now ready to perform the hashing. The eight hashes are
1237 interleaved for performance. The hash function used is
1238 designed to rapidly achieve avalanche and require only 16x16
1239 bit multiplication, and 8-bit swizzles (which we get for
1240 free). */
1241 for( i = 0; i < 4; i++ )
1242 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1243 for( i = 0; i < 4; i++ )
1244 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1245 odd_bytes( wtmp[ i ] ) );
1246 for( i = 0; i < 4; i++ )
1247 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1248 for( i = 0; i < 4; i++ )
1249 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1250 odd_bytes( wtmp[ i ] ) );
1251 brw_pop_insn_state( p );
1252
1253 /* Now we want to initialise the four rear gradients based on the
1254 hashes. Format conversion from signed integer to float leaves
1255 everything scaled too high by a factor of pow( 2, 15 ), but
1256 we correct for that right at the end. */
1257 /* x component */
1258 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1259 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1260 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1261 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1262 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1263
1264 brw_push_insn_state( p );
1265 brw_set_mask_control( p, BRW_MASK_DISABLE );
1266 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1267 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1268 brw_pop_insn_state( p );
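   /* Each SHL by 5 moves a fresh window of the hash bits into view, so the
    * y and z gradient components read from these registers below come from
    * different bits than the x components extracted above.
    */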
1269
1270 brw_MUL( p, x1y0, x1y0, t );
1271 brw_MUL( p, x1y1, x1y1, t );
1272 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1273 brw_MUL( p, x0y0, x0y0, param0 );
1274 brw_MUL( p, x0y1, x0y1, param0 );
1275
1276 /* y component */
1277 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1278 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1279 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1280 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1281
1282 brw_push_insn_state( p );
1283 brw_set_mask_control( p, BRW_MASK_DISABLE );
1284 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1285 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1286 brw_pop_insn_state( p );
1287
1288 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1289 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1290 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1291 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1292 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1293
1294 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1295 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1296 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1297 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1298
1299 /* z component */
1300 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1301 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1302 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1303 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1304
1305 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1306 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1307 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1308 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1309
1310 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1311 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1312 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1313 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1314
1315 /* We interpolate between the gradients using the polynomial
1316 6t^5 - 15t^4 + 10t^3 (Perlin). */
1317 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1318 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1319 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1320 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1321 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1322 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1323 brw_MUL( p, xi, xi, param0 );
1324 brw_MUL( p, yi, yi, param1 );
1325 brw_MUL( p, zi, zi, param2 );
1326 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1327 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1328 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1329 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1330 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1331 brw_MUL( p, xi, xi, param0 );
1332 brw_MUL( p, yi, yi, param1 );
1333 brw_MUL( p, zi, zi, param2 );
1334 brw_MUL( p, xi, xi, param0 );
1335 brw_MUL( p, yi, yi, param1 );
1336 brw_MUL( p, zi, zi, param2 );
1337 brw_MUL( p, xi, xi, param0 );
1338 brw_MUL( p, yi, yi, param1 );
1339 brw_MUL( p, zi, zi, param2 );
1340
1341 /* Here we interpolate in the y dimension... */
1342 brw_MUL( p, x0y1, x0y1, yi );
1343 brw_MUL( p, x1y1, x1y1, yi );
1344 brw_ADD( p, x0y0, x0y0, x0y1 );
1345 brw_ADD( p, x1y0, x1y0, x1y1 );
1346
1347 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1348 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1349 brw_MUL( p, x1y0, x1y0, xi );
1350 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1351
1352 /* Now do the same thing for the front four gradients... */
1353 /* x component */
1354 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1355 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1356 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1357 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1358
1359 brw_push_insn_state( p );
1360 brw_set_mask_control( p, BRW_MASK_DISABLE );
1361 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1362 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1363 brw_pop_insn_state( p );
1364
1365 brw_MUL( p, x1y0, x1y0, t );
1366 brw_MUL( p, x1y1, x1y1, t );
1367 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1368 brw_MUL( p, x0y0, x0y0, param0 );
1369 brw_MUL( p, x0y1, x0y1, param0 );
1370
1371 /* y component */
1372 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1373 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1374 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1375 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1376
1377 brw_push_insn_state( p );
1378 brw_set_mask_control( p, BRW_MASK_DISABLE );
1379 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1380 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1381 brw_pop_insn_state( p );
1382
1383 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1384 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1385 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1386 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1387 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1388
1389 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1390 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1391 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1392 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1393
1394 /* z component */
1395 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1396 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1397 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1398 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1399
1400 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1401 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1402 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1403 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1404
1405 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1406 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1407 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1408 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1409
1410 /* The interpolation coefficients are still around from last time, so
1411 again interpolate in the y dimension... */
1412 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1413 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1414 brw_MUL( p, x0y1, x0y1, yi );
1415 brw_MUL( p, x1y1, x1y1, yi );
1416 brw_ADD( p, x0y0, x0y0, x0y1 );
1417 brw_ADD( p, x1y0, x1y0, x1y1 );
1418
1419 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1420 time put the front face in tmp[ 1 ] and we're nearly there... */
1421 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1422 brw_MUL( p, x1y0, x1y0, xi );
1423 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1424
1425 /* The final interpolation, in the z dimension: */
1426 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1427 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1428 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1429
1430 /* scale by pow( 2, -15 ), as described above */
1431 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1432
1433 release_tmps( c, mark );
1434 }
1435
1436 static void emit_noise3( struct brw_wm_compile *c,
1437 const struct prog_instruction *inst )
1438 {
1439 struct brw_compile *p = &c->func;
1440 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1441 GLuint mask = inst->DstReg.WriteMask;
1442 int i;
1443 int mark = mark_tmps( c );
1444
1445 assert( mark == 0 );
1446
1447 src0 = get_src_reg( c, inst, 0, 0 );
1448 src1 = get_src_reg( c, inst, 0, 1 );
1449 src2 = get_src_reg( c, inst, 0, 2 );
1450
1451 param0 = alloc_tmp( c );
1452 param1 = alloc_tmp( c );
1453 param2 = alloc_tmp( c );
1454
1455 brw_MOV( p, param0, src0 );
1456 brw_MOV( p, param1, src1 );
1457 brw_MOV( p, param2, src2 );
1458
1459 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1460
1461 /* Fill in the result: */
1462 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1463 for (i = 0 ; i < 4; i++) {
1464 if (mask & (1<<i)) {
1465 dst = get_dst_reg(c, inst, i);
1466 brw_MOV( p, dst, param0 );
1467 }
1468 }
1469 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1470 brw_set_saturate( p, 0 );
1471
1472 release_tmps( c, mark );
1473 }
1474
1475 /**
1476 * For the four-dimensional case, the little micro-optimisation benefits
1477 * we obtain by unrolling all the loops aren't worth the massive bloat it
1478 * now causes. Instead, we loop twice around performing a similar operation
1479 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1480 * code to glue it all together.
1481 */
1482 static void noise4_sub( struct brw_wm_compile *c )
1483 {
1484 struct brw_compile *p = &c->func;
1485 struct brw_reg param[ 4 ],
1486 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1487 w0, /* noise for the w=0 cube */
1488 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1489 interp[ 4 ], /* interpolation coefficients */
1490 t, tmp[ 8 ], /* float temporaries */
1491 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1492 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1493 int i, j;
1494 int mark = mark_tmps( c );
1495 GLuint loop, origin;
1496
1497 x0y0 = alloc_tmp( c );
1498 x0y1 = alloc_tmp( c );
1499 x1y0 = alloc_tmp( c );
1500 x1y1 = alloc_tmp( c );
1501 t = alloc_tmp( c );
1502 w0 = alloc_tmp( c );
1503 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1504 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1505
1506 for( i = 0; i < 4; i++ ) {
1507 param[ i ] = lookup_tmp( c, mark - 5 + i );
1508 interp[ i ] = alloc_tmp( c );
1509 }
1510
1511 for( i = 0; i < 8; i++ ) {
1512 tmp[ i ] = alloc_tmp( c );
1513 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1514 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1515 }
1516
1517 brw_set_access_mode( p, BRW_ALIGN_1 );
1518
1519 /* We only want 16 bits of precision from the integral part of each
1520 co-ordinate, but unfortunately the RNDD semantics would saturate
1521 at 16 bits if we performed the operation directly to a 16-bit
1522 destination. Therefore, we round to 32-bit temporaries where
1523 appropriate, and then store only the lower 16 bits. */
1524 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1525 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1526 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1527 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1528 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1529 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
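   /* floors[ 0 ] now packs the low 16 bits of floor(x) and floor(y) into its
    * low and high words; floors[ 1 ] does the same for floor(z) and floor(w).
    */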
1530
1531 /* Modify the flag register here, because the side effect is useful
1532 later (see below). We know for certain that all flags will be
1533 cleared, since the FRC instruction cannot possibly generate
1534 negative results. Even for exceptional inputs (infinities, denormals,
1535 NaNs), the architecture guarantees that the L conditional is false. */
1536 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1537 brw_FRC( p, param[ 0 ], param[ 0 ] );
1538 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1539 for( i = 1; i < 4; i++ )
1540 brw_FRC( p, param[ i ], param[ i ] );
1541
1542 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1543 of all. */
1544 for( i = 0; i < 4; i++ )
1545 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1546 for( i = 0; i < 4; i++ )
1547 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1548 for( i = 0; i < 4; i++ )
1549 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1550 for( i = 0; i < 4; i++ )
1551 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1552 for( j = 0; j < 3; j++ )
1553 for( i = 0; i < 4; i++ )
1554 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1555
1556 /* Mark the current address, as it will be a jump destination. The
1557 following code will be executed twice: first, with the flag
1558 register clear indicating the w=0 case, and second with flags
1559 set for w=1. */
1560 loop = p->nr_insn;
1561
1562 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1563 be hashed. Since we have only 16 bits of precision in the hash, we
1564 must be careful about thorough mixing to maintain entropy as we
1565 squash the input vector into a small scalar. */
1566 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1567 brw_imm_uw( 0xBC8F ) );
1568 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1569 brw_imm_uw( 0xD0BD ) );
1570 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1571 brw_imm_uw( 0x9B93 ) );
1572 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1573 brw_imm_uw( 0xA359 ) );
1574 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1575 brw_imm_uw( 0xBC8F ) );
1576
1577 /* Temporarily disable the execution mask while we work with ExecSize=16
1578 channels (the mask is set for ExecSize=8 and is probably incorrect).
1579 Although this might cause execution of unwanted channels, the code
1580 writes only to temporary registers and has no side effects, so
1581 disabling the mask is harmless. */
1582 brw_push_insn_state( p );
1583 brw_set_mask_control( p, BRW_MASK_DISABLE );
1584 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1585 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1586 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1587
1588 /* We're now ready to perform the hashing. The eight hashes are
1589 interleaved for performance. The hash function used is
1590 designed to rapidly achieve avalanche and require only 16x16
1591 bit multiplication, and 8-bit swizzles (which we get for
1592 free). */
1593 for( i = 0; i < 4; i++ )
1594 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1595 for( i = 0; i < 4; i++ )
1596 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1597 odd_bytes( wtmp[ i ] ) );
1598 for( i = 0; i < 4; i++ )
1599 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1600 for( i = 0; i < 4; i++ )
1601 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1602 odd_bytes( wtmp[ i ] ) );
1603 brw_pop_insn_state( p );
1604
1605 /* Now we want to initialise the four rear gradients based on the
1606 hashes. Format conversion from signed integer to float leaves
1607 everything scaled too high by a factor of pow( 2, 15 ), but
1608 we correct for that right at the end. */
1609 /* x component */
1610 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1611 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1612 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1613 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1614 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1615
1616 brw_push_insn_state( p );
1617 brw_set_mask_control( p, BRW_MASK_DISABLE );
1618 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1619 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1620 brw_pop_insn_state( p );
1621
1622 brw_MUL( p, x1y0, x1y0, t );
1623 brw_MUL( p, x1y1, x1y1, t );
1624 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1625 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1626 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1627
1628 /* y component */
1629 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1630 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1631 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1632 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1633
1634 brw_push_insn_state( p );
1635 brw_set_mask_control( p, BRW_MASK_DISABLE );
1636 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1637 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1638 brw_pop_insn_state( p );
1639
1640 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1641 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1642 /* prepare t for the w component (used below): w the first time through
1643 the loop, w - 1 the second time. */
1644 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1645 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1646 p->current->header.predicate_inverse = 1;
1647 brw_MOV( p, t, param[ 3 ] );
1648 p->current->header.predicate_inverse = 0;
1649 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1650 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1651 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1652
1653 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1654 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1655 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1656 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1657
1658 /* z component */
1659 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1660 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1661 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1662 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1663
1664 brw_push_insn_state( p );
1665 brw_set_mask_control( p, BRW_MASK_DISABLE );
1666 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1667 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1668 brw_pop_insn_state( p );
1669
1670 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1671 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1672 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1673 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1674
1675 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1676 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1677 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1678 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1679
1680 /* w component */
1681 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1682 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1683 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1684 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1685
1686 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1687 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1688 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1689 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1690 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1691
1692 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1693 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1694 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1695 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1696
1697 /* Here we interpolate in the y dimension... */
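   /* (a plain lerp: x0y0 + ( x0y1 - x0y0 ) * interp[ 1 ], and likewise for
      the x1 column) */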
1698 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1699 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1700 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1701 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1702 brw_ADD( p, x0y0, x0y0, x0y1 );
1703 brw_ADD( p, x1y0, x1y0, x1y1 );
1704
1705 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1706 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1707 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1708 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1709
1710 /* Now do the same thing for the front four gradients... */
1711 /* x component */
1712 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1713 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1714 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1715 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1716
1717 brw_push_insn_state( p );
1718 brw_set_mask_control( p, BRW_MASK_DISABLE );
1719 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1720 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1721 brw_pop_insn_state( p );
1722
1723 brw_MUL( p, x1y0, x1y0, t );
1724 brw_MUL( p, x1y1, x1y1, t );
1725 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1726 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1727 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1728
1729 /* y component */
1730 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1731 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1732 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1733 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1734
1735 brw_push_insn_state( p );
1736 brw_set_mask_control( p, BRW_MASK_DISABLE );
1737 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1738 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1739 brw_pop_insn_state( p );
1740
1741 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1742 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1743 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1744 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1745 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1746
1747 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1748 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1749 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1750 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1751
1752 /* z component */
1753 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1754 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1755 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1756 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1757
1758 brw_push_insn_state( p );
1759 brw_set_mask_control( p, BRW_MASK_DISABLE );
1760 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1761 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1762 brw_pop_insn_state( p );
1763
1764 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1765 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1766 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1767 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1768    /* prepare t for the w component (used below: w the first time through
1769       the loop; w - 1 the second time). */
1770 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1771 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1772 p->current->header.predicate_inverse = 1;
1773 brw_MOV( p, t, param[ 3 ] );
1774 p->current->header.predicate_inverse = 0;
1775 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1776
1777 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1778 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1779 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1780 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1781
1782 /* w component */
1783 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1784 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1785 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1786 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1787
1788 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1789 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1790 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1791 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1792
1793 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1794 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1795 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1796 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1797
1798 /* Interpolate in the y dimension: */
1799 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1800 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1801 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1802 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1803 brw_ADD( p, x0y0, x0y0, x0y1 );
1804 brw_ADD( p, x1y0, x1y0, x1y1 );
1805
1806 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1807 time put the front face in tmp[ 1 ] and we're nearly there... */
1808 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1809 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1810 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1811
1812 /* Another interpolation, in the z dimension: */
1813 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1814 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
1815 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1816
1817 /* Exit the loop if we've computed both cubes... */
1818 origin = p->nr_insn;
1819 brw_push_insn_state( p );
1820 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1821 brw_set_mask_control( p, BRW_MASK_DISABLE );
1822 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
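   /* The zero immediate above is only a placeholder jump distance; it is
      patched with brw_set_src1() below, once the end of the loop is known
      (distances are expressed in 16-byte instruction units, hence the << 4
      used there). */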
1823 brw_pop_insn_state( p );
1824
1825 /* Save the result for the w=0 case, and increment the w coordinate: */
1826 brw_MOV( p, w0, tmp[ 0 ] );
1827 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
1828 brw_imm_uw( 1 ) );
1829
1830 /* Loop around for the other cube. Explicitly set the flag register
1831 (unfortunately we must spend an extra instruction to do this: we
1832 can't rely on a side effect of the previous MOV or ADD because
1833 conditional modifiers which are normally true might be false in
1834 exceptional circumstances, e.g. given a NaN input; the add to
1835 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
1836 brw_push_insn_state( p );
1837 brw_set_mask_control( p, BRW_MASK_DISABLE );
1838 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
1839 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
1840 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
1841 brw_pop_insn_state( p );
1842
1843 /* Patch the previous conditional branch now that we know the
1844 destination address. */
1845 brw_set_src1( p->store + origin,
1846 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
1847
1848 /* The very last interpolation. */
1849 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
1850 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
1851 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
1852
1853 /* scale by pow( 2, -15 ), as described above */
1854 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1855
1856 release_tmps( c, mark );
1857 }
1858
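/**
 * Emit code for a 4D noise instruction: copy the four source components
 * into temporaries, invoke the shared noise4 subroutine, and write the
 * scalar result (left in the first temporary) to every enabled channel
 * of the destination.
 */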
1859 static void emit_noise4( struct brw_wm_compile *c,
1860 const struct prog_instruction *inst )
1861 {
1862 struct brw_compile *p = &c->func;
1863 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1864 GLuint mask = inst->DstReg.WriteMask;
1865 int i;
1866 int mark = mark_tmps( c );
1867
1868 assert( mark == 0 );
1869
1870 src0 = get_src_reg( c, inst, 0, 0 );
1871 src1 = get_src_reg( c, inst, 0, 1 );
1872 src2 = get_src_reg( c, inst, 0, 2 );
1873 src3 = get_src_reg( c, inst, 0, 3 );
1874
1875 param0 = alloc_tmp( c );
1876 param1 = alloc_tmp( c );
1877 param2 = alloc_tmp( c );
1878 param3 = alloc_tmp( c );
1879
1880 brw_MOV( p, param0, src0 );
1881 brw_MOV( p, param1, src1 );
1882 brw_MOV( p, param2, src2 );
1883 brw_MOV( p, param3, src3 );
1884
1885 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1886
1887 /* Fill in the result: */
1888 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1889 for (i = 0 ; i < 4; i++) {
1890 if (mask & (1<<i)) {
1891 dst = get_dst_reg(c, inst, i);
1892 brw_MOV( p, dst, param0 );
1893 }
1894 }
1895 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1896 brw_set_saturate( p, 0 );
1897
1898 release_tmps( c, mark );
1899 }
1900
1901
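/**
 * Emit code for OPCODE_TXB (texture sample with LOD bias): build a SIMD8
 * sampler message with the s/t/r coordinates in m2-m4, the bias in m5 and
 * an (apparently unused) ref value in m6, then send it with brw_SAMPLE.
 */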
1902 /* TODO
1903 BIAS on SIMD8 not working yet...
1904 */
1905 static void emit_txb(struct brw_wm_compile *c,
1906 const struct prog_instruction *inst)
1907 {
1908 struct brw_compile *p = &c->func;
1909 struct brw_reg dst[4], src[4], payload_reg;
1910 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1911 const GLuint unit = inst->TexSrcUnit;
1912 GLuint i;
1913 GLuint msg_type;
1914
1915 assert(unit < BRW_MAX_TEX_UNIT);
1916
1917 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
1918
1919 for (i = 0; i < 4; i++)
1920 dst[i] = get_dst_reg(c, inst, i);
1921 for (i = 0; i < 4; i++)
1922 src[i] = get_src_reg(c, inst, 0, i);
1923
1924 switch (inst->TexSrcTarget) {
1925 case TEXTURE_1D_INDEX:
1926 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
1927 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
1928 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
1929 break;
1930 case TEXTURE_2D_INDEX:
1931 case TEXTURE_RECT_INDEX:
1932 brw_MOV(p, brw_message_reg(2), src[0]);
1933 brw_MOV(p, brw_message_reg(3), src[1]);
1934 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
1935 break;
1936 case TEXTURE_3D_INDEX:
1937 case TEXTURE_CUBE_INDEX:
1938 brw_MOV(p, brw_message_reg(2), src[0]);
1939 brw_MOV(p, brw_message_reg(3), src[1]);
1940 brw_MOV(p, brw_message_reg(4), src[2]);
1941 break;
1942 default:
1943 /* invalid target */
1944 abort();
1945 }
1946 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
1947 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
1948
1949 if (BRW_IS_IGDNG(p->brw)) {
1950 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
1951 } else {
1952 /* Does it work well on SIMD8? */
1953 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1954 }
1955
1956 brw_SAMPLE(p,
1957 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
1958 1, /* msg_reg_nr */
1959 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
1960 SURF_INDEX_TEXTURE(unit),
1961 unit, /* sampler */
1962 inst->DstReg.WriteMask, /* writemask */
1963 msg_type, /* msg_type */
1964 4, /* response_length */
1965 4, /* msg_length */
1966 0, /* eot */
1967 1,
1968 BRW_SAMPLER_SIMD_MODE_SIMD8);
1969 }
1970
1971
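/**
 * Emit code for OPCODE_TEX: load the coordinate components required by the
 * texture target into consecutive message registers (zero-filling the unused
 * ones), append the lod/ref values when shadow sampling, and issue a SIMD8
 * sampler message.  For shadow targets the W result is forced to 1.0.
 */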
1972 static void emit_tex(struct brw_wm_compile *c,
1973 const struct prog_instruction *inst)
1974 {
1975 struct brw_compile *p = &c->func;
1976 struct brw_reg dst[4], src[4], payload_reg;
1977 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1978 const GLuint unit = inst->TexSrcUnit;
1979 GLuint msg_len;
1980 GLuint i, nr;
1981 GLuint emit;
1982 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
1983 GLuint msg_type;
1984
1985 assert(unit < BRW_MAX_TEX_UNIT);
1986
1987 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
1988
1989 for (i = 0; i < 4; i++)
1990 dst[i] = get_dst_reg(c, inst, i);
1991 for (i = 0; i < 4; i++)
1992 src[i] = get_src_reg(c, inst, 0, i);
1993
1994 switch (inst->TexSrcTarget) {
1995 case TEXTURE_1D_INDEX:
1996 emit = WRITEMASK_X;
1997 nr = 1;
1998 break;
1999 case TEXTURE_2D_INDEX:
2000 case TEXTURE_RECT_INDEX:
2001 emit = WRITEMASK_XY;
2002 nr = 2;
2003 break;
2004 case TEXTURE_3D_INDEX:
2005 case TEXTURE_CUBE_INDEX:
2006 emit = WRITEMASK_XYZ;
2007 nr = 3;
2008 break;
2009 default:
2010 /* invalid target */
2011 abort();
2012 }
2013 msg_len = 1;
2014
2015 /* move/load S, T, R coords */
2016 for (i = 0; i < nr; i++) {
2017 static const GLuint swz[4] = {0,1,2,2};
2018 if (emit & (1<<i))
2019 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2020 else
2021 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2022 msg_len += 1;
2023 }
2024
2025 if (shadow) {
2026 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2027 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2028 }
2029
2030 if (BRW_IS_IGDNG(p->brw)) {
2031 if (shadow)
2032 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2033 else
2034 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2035 } else {
2036       /* Does it work for shadow on SIMD8? */
2037 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2038 }
2039
2040 brw_SAMPLE(p,
2041 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2042 1, /* msg_reg_nr */
2043 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2044 SURF_INDEX_TEXTURE(unit),
2045 unit, /* sampler */
2046 inst->DstReg.WriteMask, /* writemask */
2047 msg_type, /* msg_type */
2048 4, /* response_length */
2049 shadow ? 6 : 4, /* msg_length */
2050 0, /* eot */
2051 1,
2052 BRW_SAMPLER_SIMD_MODE_SIMD8);
2053
2054 if (shadow)
2055 brw_MOV(p, dst[3], brw_imm_f(1.0));
2056 }
2057
2058
2059 /**
2060 * Resolve subroutine calls after code emit is done.
2061 */
2062 static void post_wm_emit( struct brw_wm_compile *c )
2063 {
2064 brw_resolve_cals(&c->func);
2065 }
2066
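/**
 * Fetch the hardware registers for the masked components of source operand
 * 'index' of the given Mesa instruction.
 */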
2067 static void
2068 get_argument_regs(struct brw_wm_compile *c,
2069 const struct prog_instruction *inst,
2070 int index,
2071 struct brw_reg *regs,
2072 int mask)
2073 {
2074 int i;
2075
2076 for (i = 0; i < 4; i++) {
2077 if (mask & (1 << i))
2078 regs[i] = get_src_reg(c, inst, index, i);
2079 }
2080 }
2081
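/**
 * Main code generation loop for the GLSL path: walk the instruction list
 * produced by brw_wm_pass_fp() and emit native code for each opcode,
 * including structured flow control (IF/ELSE/ENDIF, loops) and the software
 * call/return stack used for subroutines.
 */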
2082 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2083 {
2084 #define MAX_IF_DEPTH 32
2085 #define MAX_LOOP_DEPTH 32
2086 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2087 GLuint i, if_depth = 0, loop_depth = 0;
2088 struct brw_compile *p = &c->func;
2089 struct brw_indirect stack_index = brw_indirect(0, 0);
2090
2091 c->out_of_regs = GL_FALSE;
2092
2093 prealloc_reg(c);
2094 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2095 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2096
2097 for (i = 0; i < c->nr_fp_insns; i++) {
2098 const struct prog_instruction *inst = &c->prog_instructions[i];
2099 int dst_flags;
2100 struct brw_reg args[3][4], dst[4];
2101 int j;
2102
2103 c->cur_inst = i;
2104
2105 #if 0
2106 _mesa_printf("Inst %d: ", i);
2107 _mesa_print_instruction(inst);
2108 #endif
2109
2110 /* fetch any constants that this instruction needs */
2111 if (c->fp->use_const_buffer)
2112 fetch_constants(c, inst);
2113
2114 if (inst->Opcode != OPCODE_ARL) {
2115 for (j = 0; j < 4; j++) {
2116 if (inst->DstReg.WriteMask & (1 << j))
2117 dst[j] = get_dst_reg(c, inst, j);
2118 else
2119 dst[j] = brw_null_reg();
2120 }
2121 }
2122 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2123 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2124
2125 dst_flags = inst->DstReg.WriteMask;
2126 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2127 dst_flags |= SATURATE;
2128
2129 if (inst->CondUpdate)
2130 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2131 else
2132 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2133
2134 dst_flags = inst->DstReg.WriteMask;
2135 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2136 dst_flags |= SATURATE;
2137
2138 switch (inst->Opcode) {
2139 case WM_PIXELXY:
2140 emit_pixel_xy(c, dst, dst_flags);
2141 break;
2142 case WM_DELTAXY:
2143 emit_delta_xy(p, dst, dst_flags, args[0]);
2144 break;
2145 case WM_PIXELW:
2146 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
2147 break;
2148 case WM_LINTERP:
2149 emit_linterp(p, dst, dst_flags, args[0], args[1]);
2150 break;
2151 case WM_PINTERP:
2152 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
2153 break;
2154 case WM_CINTERP:
2155 emit_cinterp(p, dst, dst_flags, args[0]);
2156 break;
2157 case WM_WPOSXY:
2158 emit_wpos_xy(c, dst, dst_flags, args[0]);
2159 break;
2160 case WM_FB_WRITE:
2161 emit_fb_write(c, inst);
2162 break;
2163 case WM_FRONTFACING:
2164 emit_frontfacing(p, dst, dst_flags);
2165 break;
2166 case OPCODE_ADD:
2167 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2168 break;
2169 case OPCODE_ARL:
2170 emit_arl(c, inst);
2171 break;
2172 case OPCODE_FRC:
2173 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2174 break;
2175 case OPCODE_FLR:
2176 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2177 break;
2178 case OPCODE_LRP:
2179 unalias3(c, emit_lrp,
2180 dst, dst_flags, args[0], args[1], args[2]);
2181 break;
2182 case OPCODE_TRUNC:
2183 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2184 break;
2185 case OPCODE_MOV:
2186 case OPCODE_SWZ:
2187 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2188 break;
2189 case OPCODE_DP3:
2190 emit_dp3(p, dst, dst_flags, args[0], args[1]);
2191 break;
2192 case OPCODE_DP4:
2193 emit_dp4(p, dst, dst_flags, args[0], args[1]);
2194 break;
2195 case OPCODE_XPD:
2196 emit_xpd(p, dst, dst_flags, args[0], args[1]);
2197 break;
2198 case OPCODE_DPH:
2199 emit_dph(p, dst, dst_flags, args[0], args[1]);
2200 break;
2201 case OPCODE_RCP:
2202 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
2203 break;
2204 case OPCODE_RSQ:
2205 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
2206 break;
2207 case OPCODE_SIN:
2208 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
2209 break;
2210 case OPCODE_COS:
2211 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
2212 break;
2213 case OPCODE_EX2:
2214 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
2215 break;
2216 case OPCODE_LG2:
2217 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
2218 break;
2219 case OPCODE_MIN:
2220 case OPCODE_MAX:
2221 emit_min_max(c, inst);
2222 break;
2223 case OPCODE_DDX:
2224 case OPCODE_DDY:
2225 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2226 args[0]);
2227 break;
2228 case OPCODE_SLT:
2229 emit_sop(p, dst, dst_flags,
2230 BRW_CONDITIONAL_L, args[0], args[1]);
2231 break;
2232 case OPCODE_SLE:
2233 emit_sop(p, dst, dst_flags,
2234 BRW_CONDITIONAL_LE, args[0], args[1]);
2235 break;
2236 case OPCODE_SGT:
2237 emit_sop(p, dst, dst_flags,
2238 BRW_CONDITIONAL_G, args[0], args[1]);
2239 break;
2240 case OPCODE_SGE:
2241 emit_sop(p, dst, dst_flags,
2242 BRW_CONDITIONAL_GE, args[0], args[1]);
2243 break;
2244 case OPCODE_SEQ:
2245 emit_sop(p, dst, dst_flags,
2246 BRW_CONDITIONAL_EQ, args[0], args[1]);
2247 break;
2248 case OPCODE_SNE:
2249 emit_sop(p, dst, dst_flags,
2250 BRW_CONDITIONAL_NEQ, args[0], args[1]);
2251 break;
2252 case OPCODE_MUL:
2253 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2254 break;
2255 case OPCODE_POW:
2256 emit_math2(c, BRW_MATH_FUNCTION_POW,
2257 dst, dst_flags, args[0], args[1]);
2258 break;
2259 case OPCODE_MAD:
2260 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2261 break;
2262 case OPCODE_NOISE1:
2263 emit_noise1(c, inst);
2264 break;
2265 case OPCODE_NOISE2:
2266 emit_noise2(c, inst);
2267 break;
2268 case OPCODE_NOISE3:
2269 emit_noise3(c, inst);
2270 break;
2271 case OPCODE_NOISE4:
2272 emit_noise4(c, inst);
2273 break;
2274 case OPCODE_TEX:
2275 emit_tex(c, inst);
2276 break;
2277 case OPCODE_TXB:
2278 emit_txb(c, inst);
2279 break;
2280 case OPCODE_KIL_NV:
2281 emit_kil(c);
2282 break;
2283 case OPCODE_IF:
2284 assert(if_depth < MAX_IF_DEPTH);
2285 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2286 break;
2287 case OPCODE_ELSE:
2288 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2289 break;
2290 case OPCODE_ENDIF:
2291 assert(if_depth > 0);
2292 brw_ENDIF(p, if_inst[--if_depth]);
2293 break;
2294 case OPCODE_BGNSUB:
2295 brw_save_label(p, inst->Comment, p->nr_insn);
2296 break;
2297 case OPCODE_ENDSUB:
2298 /* no-op */
2299 break;
2300 case OPCODE_CAL:
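         /* Software call sequence: store the return address (the IP three
          * instructions ahead, i.e. just past the jump emitted below) into
          * the current stack slot, bump the stack pointer by one dword, then
          * emit a placeholder jump that brw_resolve_cals() patches to the
          * subroutine's label after code emission. */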
2301 brw_push_insn_state(p);
2302 brw_set_mask_control(p, BRW_MASK_DISABLE);
2303 brw_set_access_mode(p, BRW_ALIGN_1);
2304 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2305 brw_set_access_mode(p, BRW_ALIGN_16);
2306 brw_ADD(p, get_addr_reg(stack_index),
2307 get_addr_reg(stack_index), brw_imm_d(4));
2308 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2309 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2310 brw_pop_insn_state(p);
2311 break;
2312
2313 case OPCODE_RET:
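         /* Matching return: step the stack pointer back one dword and load
          * the saved return address into the IP register. */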
2314 brw_push_insn_state(p);
2315 brw_set_mask_control(p, BRW_MASK_DISABLE);
2316 brw_ADD(p, get_addr_reg(stack_index),
2317 get_addr_reg(stack_index), brw_imm_d(-4));
2318 brw_set_access_mode(p, BRW_ALIGN_1);
2319 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2320 brw_set_access_mode(p, BRW_ALIGN_16);
2321 brw_pop_insn_state(p);
2322
2323 break;
2324         case OPCODE_BGNLOOP:
2325            /* XXX may need to invalidate the current_constant regs */
                assert(loop_depth < MAX_LOOP_DEPTH);
2326            loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2327 break;
2328 case OPCODE_BRK:
2329 brw_BREAK(p);
2330 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2331 break;
2332 case OPCODE_CONT:
2333 brw_CONT(p);
2334 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2335 break;
2336 case OPCODE_ENDLOOP:
2337 {
2338 struct brw_instruction *inst0, *inst1;
2339 GLuint br = 1;
2340
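         /* On IGDNG (Ironlake) branch offsets appear to be counted in
            finer-grained units, so the jump counts computed below are
            doubled. */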
2341 if (BRW_IS_IGDNG(brw))
2342 br = 2;
2343
2344 loop_depth--;
2345 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2346            /* patch all the BREAK/CONT instructions from the last BGNLOOP */
2347 while (inst0 > loop_inst[loop_depth]) {
2348 inst0--;
2349 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2350 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2351 inst0->bits3.if_else.pop_count = 0;
2352 }
2353 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2354 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2355 inst0->bits3.if_else.pop_count = 0;
2356 }
2357 }
2358 }
2359 break;
2360 default:
2361 _mesa_printf("unsupported IR in fragment shader %d\n",
2362 inst->Opcode);
2363 }
2364
2365 if (inst->CondUpdate)
2366 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2367 else
2368 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2369 }
2370 post_wm_emit(c);
2371
2372 if (INTEL_DEBUG & DEBUG_WM) {
2373 _mesa_printf("wm-native:\n");
2374 for (i = 0; i < p->nr_insn; i++)
2375 brw_disasm(stderr, &p->store[i]);
2376 _mesa_printf("\n");
2377 }
2378 }
2379
2380 /**
2381  * Do GPU code generation for shaders that use GLSL features such as
2382  * flow control.  Other shaders will be compiled with the simpler non-GLSL path.
2383  */
2384 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2385 {
2386 if (INTEL_DEBUG & DEBUG_WM) {
2387 _mesa_printf("brw_wm_glsl_emit:\n");
2388 }
2389
2390 /* initial instruction translation/simplification */
2391 brw_wm_pass_fp(c);
2392
2393 /* actual code generation */
2394 brw_wm_emit_glsl(brw, c);
2395
2396 if (INTEL_DEBUG & DEBUG_WM) {
2397 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2398 }
2399
2400 c->prog_data.total_grf = num_grf_used(c);
2401 c->prog_data.total_scratch = 0;
2402 }